diff --git a/test/instrumentation/documentation/main.go b/test/instrumentation/documentation/main.go index c0d5af8ecb3..62c05dab059 100755 --- a/test/instrumentation/documentation/main.go +++ b/test/instrumentation/documentation/main.go @@ -20,6 +20,7 @@ import ( "bytes" "fmt" "os" + "sort" "strings" "text/template" "time" @@ -37,7 +38,7 @@ var ( const ( templ = `--- -title: Kubernetes Metrics Across Components +title: Kubernetes Metrics content_type: instrumentation --- @@ -55,8 +56,8 @@ These are the metrics which are exported in Kubernetes components (i.e. kube-api Name Stability Level - Type - Help + Type + Help Labels Const Labels @@ -82,6 +83,7 @@ func main() { if err != nil { println("err", err) } + sort.Sort(byFQName(metrics)) t := template.New("t") t, err := t.Parse(templ) if err != nil { @@ -128,3 +130,18 @@ type metric struct { func (m metric) BuildFQName() string { return metrics.BuildFQName(m.Namespace, m.Subsystem, m.Name) } + +type byFQName []metric + +func (ms byFQName) Len() int { return len(ms) } +func (ms byFQName) Less(i, j int) bool { + if ms[i].StabilityLevel < ms[j].StabilityLevel { + return true + } else if ms[i].StabilityLevel > ms[j].StabilityLevel { + return false + } + return ms[i].BuildFQName() < ms[j].BuildFQName() +} +func (ms byFQName) Swap(i, j int) { + ms[i], ms[j] = ms[j], ms[i] +} diff --git a/test/instrumentation/main.go b/test/instrumentation/main.go index af5465e76f8..451023cc21a 100644 --- a/test/instrumentation/main.go +++ b/test/instrumentation/main.go @@ -89,6 +89,12 @@ func main() { if len(stableMetrics) == 0 { os.Exit(0) } + for i, m := range stableMetrics { + if m.StabilityLevel == "" { + m.StabilityLevel = "ALPHA" + } + stableMetrics[i] = m + } sort.Sort(byFQName(stableMetrics)) data, err := yaml.Marshal(stableMetrics) if err != nil { diff --git a/test/instrumentation/stability-utils.sh b/test/instrumentation/stability-utils.sh index a751ff4834b..749d68e37cc 100755 --- a/test/instrumentation/stability-utils.sh +++ 
b/test/instrumentation/stability-utils.sh @@ -115,7 +115,7 @@ kube::update::documentation::list() { exit 1 fi mv -f "$temp_file" "${KUBE_ROOT}/test/instrumentation/testdata/documentation-list.yaml" - echo "${green}Updated golden list of stable metrics.${reset}" + echo "${green}Updated list of metrics for documentation ${reset}" } kube::update::documentation() { diff --git a/test/instrumentation/testdata/documentation-list.yaml b/test/instrumentation/testdata/documentation-list.yaml index 281bd57f74b..4b884cb9521 100644 --- a/test/instrumentation/testdata/documentation-list.yaml +++ b/test/instrumentation/testdata/documentation-list.yaml @@ -12,12 +12,6 @@ certificate is invalid or unused, the value will be +INF. type: Gauge stabilityLevel: ALPHA -- name: endpointslices_changed_per_sync - subsystem: endpoint_slice_controller - help: Number of EndpointSlices changed on each Service sync - type: Histogram - labels: - - topology - name: cronjob_job_creation_skew_duration_seconds subsystem: cronjob_controller help: Time between when a cronjob is scheduled to be run, and when the corresponding @@ -94,6 +88,13 @@ - 8192 - 16384 - 32768 +- name: endpointslices_changed_per_sync + subsystem: endpoint_slice_controller + help: Number of EndpointSlices changed on each Service sync + type: Histogram + stabilityLevel: ALPHA + labels: + - topology - name: num_endpoint_slices subsystem: endpoint_slice_controller help: Number of EndpointSlices @@ -269,22 +270,6 @@ stabilityLevel: ALPHA labels: - code -- name: job_pods_finished_total - subsystem: job_controller - help: The number of finished Pods that are fully tracked - type: Counter - labels: - - completion_mode - - result -- name: terminated_pods_tracking_finalizer_total - subsystem: job_controller - help: |- - `The number of terminated pods (phase=Failed|Succeeded) - that have the finalizer batch.kubernetes.io/job-tracking - The event label can be "add" or "delete".` - type: Counter - labels: - - event - name: 
attachdetach_controller_forced_detaches help: Number of times the A/D Controller performed a forced detach type: Counter @@ -297,6 +282,14 @@ labels: - completion_mode - result +- name: job_pods_finished_total + subsystem: job_controller + help: The number of finished Pods that are fully tracked + type: Counter + stabilityLevel: ALPHA + labels: + - completion_mode + - result - name: job_sync_duration_seconds subsystem: job_controller help: The time it took to sync a job @@ -331,6 +324,16 @@ - action - completion_mode - result +- name: terminated_pods_tracking_finalizer_total + subsystem: job_controller + help: |- + `The number of terminated pods (phase=Failed|Succeeded) + that have the finalizer batch.kubernetes.io/job-tracking + The event label can be "add" or "delete".` + type: Counter + stabilityLevel: ALPHA + labels: + - event - name: evictions_number subsystem: node_collector help: Number of Node evictions that happened since current instance of NodeController diff --git a/test/instrumentation/testdata/documentation.md b/test/instrumentation/testdata/documentation.md index 9134078195b..df7b46f6524 100644 --- a/test/instrumentation/testdata/documentation.md +++ b/test/instrumentation/testdata/documentation.md @@ -1,5 +1,5 @@ --- -title: Kubernetes Metrics Across Components +title: Kubernetes Metrics content_type: instrumentation --- @@ -17,165 +17,18 @@ These are the metrics which are exported in Kubernetes components (i.e. kube-api Name Stability Level - Type - Help + Type + Help Labels Const Labels -etcd_version_infoALPHAGaugeEtcd server's binary version
binary_version
None -kubelet_certificate_manager_client_ttl_secondsALPHAGaugeGauge of the TTL (time-to-live) of the Kubelet's client certificate. The value is in seconds until certificate expiry (negative if already expired). If client certificate is invalid or unused, the value will be +INF.NoneNone -endpoint_slice_controller_endpointslices_changed_per_syncHistogramNumber of EndpointSlices changed on each Service sync
topology
None -cronjob_controller_cronjob_job_creation_skew_duration_secondsALPHAHistogramTime between when a cronjob is scheduled to be run, and when the corresponding job is createdNoneNone -endpoint_slice_controller_changesALPHACounterNumber of EndpointSlice changes
operation
None -endpoint_slice_controller_desired_endpoint_slicesALPHAGaugeNumber of EndpointSlices that would exist with perfect endpoint allocationNoneNone -endpoint_slice_controller_endpoints_added_per_syncALPHAHistogramNumber of endpoints added on each Service syncNoneNone -endpoint_slice_controller_endpoints_desiredALPHAGaugeNumber of endpoints desiredNoneNone -endpoint_slice_controller_endpoints_removed_per_syncALPHAHistogramNumber of endpoints removed on each Service syncNoneNone -endpoint_slice_controller_num_endpoint_slicesALPHAGaugeNumber of EndpointSlicesNoneNone -endpoint_slice_controller_syncsALPHACounterNumber of EndpointSlice syncs
result
None -endpoint_slice_mirroring_controller_addresses_skipped_per_syncALPHAHistogramNumber of addresses skipped on each Endpoints sync due to being invalid or exceeding MaxEndpointsPerSubsetNoneNone -endpoint_slice_mirroring_controller_changesALPHACounterNumber of EndpointSlice changes
operation
None -endpoint_slice_mirroring_controller_desired_endpoint_slicesALPHAGaugeNumber of EndpointSlices that would exist with perfect endpoint allocationNoneNone -endpoint_slice_mirroring_controller_endpoints_added_per_syncALPHAHistogramNumber of endpoints added on each Endpoints syncNoneNone -endpoint_slice_mirroring_controller_endpoints_desiredALPHAGaugeNumber of endpoints desiredNoneNone -endpoint_slice_mirroring_controller_endpoints_removed_per_syncALPHAHistogramNumber of endpoints removed on each Endpoints syncNoneNone -endpoint_slice_mirroring_controller_endpoints_sync_durationALPHAHistogramDuration of syncEndpoints() in secondsNoneNone -endpoint_slice_mirroring_controller_endpoints_updated_per_syncALPHAHistogramNumber of endpoints updated on each Endpoints syncNoneNone -endpoint_slice_mirroring_controller_num_endpoint_slicesALPHAGaugeNumber of EndpointSlicesNoneNone -garbagecollector_controller_resources_sync_error_totalALPHACounterNumber of garbage collector resources sync errorsNoneNone -root_ca_cert_publisher_sync_duration_secondsALPHAHistogramNumber of namespace syncs happened in root ca cert publisher.
code
None -root_ca_cert_publisher_sync_totalALPHACounterNumber of namespace syncs happened in root ca cert publisher.
code
None -job_controller_job_pods_finished_totalCounterThe number of finished Pods that are fully tracked
completion_mode
result
None -job_controller_terminated_pods_tracking_finalizer_totalCounter`The number of terminated pods (phase=Failed|Succeeded), that have the finalizer batch.kubernetes.io/job-tracking, The event label can be "add" or "delete".`
event
None -attachdetach_controller_forced_detachesALPHACounterNumber of times the A/D Controller performed a forced detachNoneNone -job_controller_job_finished_totalALPHACounterThe number of finished job
completion_mode
result
None -job_controller_job_sync_duration_secondsALPHAHistogramThe time it took to sync a job
action
completion_mode
result
None -job_controller_job_sync_totalALPHACounterThe number of job syncs
action
completion_mode
result
None -node_collector_evictions_numberALPHACounterNumber of Node evictions that happened since current instance of NodeController started, This metric is replaced by node_collector_evictions_total.
zone
None -node_collector_unhealthy_nodes_in_zoneALPHAGaugeGauge measuring number of not Ready Nodes per zones.
zone
None -node_collector_zone_healthALPHAGaugeGauge measuring percentage of healthy nodes per zone.
zone
None -node_collector_zone_sizeALPHAGaugeGauge measuring number of registered Nodes per zones.
zone
None -node_ipam_controller_cidrset_allocation_tries_per_requestALPHAHistogramNumber of endpoints added on each Service sync
clusterCIDR
None -node_ipam_controller_cidrset_cidrs_allocations_totalALPHACounterCounter measuring total number of CIDR allocations.
clusterCIDR
None -node_ipam_controller_cidrset_cidrs_releases_totalALPHACounterCounter measuring total number of CIDR releases.
clusterCIDR
None -node_ipam_controller_cidrset_usage_cidrsALPHAGaugeGauge measuring percentage of allocated CIDRs.
clusterCIDR
None -node_ipam_controller_multicidrset_allocation_tries_per_requestALPHAHistogramHistogram measuring CIDR allocation tries per request.
clusterCIDR
None -node_ipam_controller_multicidrset_cidrs_allocations_totalALPHACounterCounter measuring total number of CIDR allocations.
clusterCIDR
None -node_ipam_controller_multicidrset_cidrs_releases_totalALPHACounterCounter measuring total number of CIDR releases.
clusterCIDR
None -node_ipam_controller_multicidrset_usage_cidrsALPHAGaugeGauge measuring percentage of allocated CIDRs.
clusterCIDR
None -replicaset_controller_sorting_deletion_age_ratioALPHAHistogramThe ratio of chosen deleted pod's ages to the current youngest pod's age (at the time). Should be <2.The intent of this metric is to measure the rough efficacy of the LogarithmicScaleDown feature gate's effect onthe sorting (and deletion) of pods when a replicaset scales down. This only considers Ready pods when calculating and reporting.NoneNone -ttl_after_finished_controller_job_deletion_duration_secondsALPHAHistogramThe time it took to delete the job since it became eligible for deletionNoneNone -node_collector_evictions_totalSTABLECounterNumber of Node evictions that happened since current instance of NodeController started.
zone
None -ephemeral_volume_controller_create_failures_totalALPHACounterNumber of PersistenVolumeClaims creation requestsNoneNone -ephemeral_volume_controller_create_totalALPHACounterNumber of PersistenVolumeClaims creation requestsNoneNone -kubelet_certificate_manager_client_expiration_renew_errorsALPHACounterCounter of certificate renewal errors.NoneNone -kubelet_certificate_manager_server_rotation_secondsALPHAHistogramHistogram of the number of seconds the previous certificate lived before being rotated.NoneNone -kubelet_certificate_manager_server_ttl_secondsALPHAGaugeGauge of the shortest TTL (time-to-live) of the Kubelet's serving certificate. The value is in seconds until certificate expiry (negative if already expired). If serving certificate is invalid or unused, the value will be +INF.NoneNone -kubelet_kubelet_credential_provider_plugin_durationALPHAHistogramDuration of execution in seconds for credential provider plugin
plugin_name
None -kubelet_kubelet_credential_provider_plugin_errorsALPHACounterNumber of errors from credential provider plugin
plugin_name
None -kubelet_server_expiration_renew_errorsALPHACounterCounter of certificate renewal errors.NoneNone -volume_operation_total_errorsALPHACounterTotal volume operation errors
operation_name
plugin_name
None -kubelet_cgroup_manager_duration_secondsALPHAHistogramDuration in seconds for cgroup manager operations. Broken down by method.
operation_type
None -kubelet_containers_per_pod_countALPHAHistogramThe number of containers per pod.NoneNone -kubelet_device_plugin_alloc_duration_secondsALPHAHistogramDuration in seconds to serve a device plugin Allocation request. Broken down by resource name.
resource_name
None -kubelet_device_plugin_registration_totalALPHACounterCumulative number of device plugin registrations. Broken down by resource name.
resource_name
None -kubelet_eviction_stats_age_secondsALPHAHistogramTime between when stats are collected, and when pod is evicted based on those stats by eviction signal
eviction_signal
None -kubelet_evictionsALPHACounterCumulative number of pod evictions by eviction signal
eviction_signal
None -kubelet_graceful_shutdown_end_time_secondsALPHAGaugeLast graceful shutdown start time since unix epoch in secondsNoneNone -kubelet_graceful_shutdown_start_time_secondsALPHAGaugeLast graceful shutdown start time since unix epoch in secondsNoneNone -kubelet_http_inflight_requestsALPHAGaugeNumber of the inflight http requests
long_running
method
path
server_type
None -kubelet_http_requests_duration_secondsALPHAHistogramDuration in seconds to serve http requests
long_running
method
path
server_type
None -kubelet_http_requests_totalALPHACounterNumber of the http requests received since the server started
long_running
method
path
server_type
None -kubelet_lifecycle_handler_http_fallbacks_totalALPHACounterThe number of times lifecycle handlers successfully fell back to http from https.NoneNone -kubelet_managed_ephemeral_containersALPHAGaugeCurrent number of ephemeral containers in pods managed by this kubelet. Ephemeral containers will be ignored if disabled by the EphemeralContainers feature gate, and this number will be 0.NoneNone -kubelet_node_nameALPHAGaugeThe node's name. The count is always 1.
node
None -kubelet_pleg_discard_eventsALPHACounterThe number of discard events in PLEG.NoneNone -kubelet_pleg_last_seen_secondsALPHAGaugeTimestamp in seconds when PLEG was last seen active.NoneNone -kubelet_pleg_relist_duration_secondsALPHAHistogramDuration in seconds for relisting pods in PLEG.NoneNone -kubelet_pleg_relist_interval_secondsALPHAHistogramInterval in seconds between relisting in PLEG.NoneNone -kubelet_pod_resources_endpoint_errors_get_allocatableALPHACounterNumber of requests to the PodResource GetAllocatableResources endpoint which returned error. Broken down by server api version.
server_api_version
None -kubelet_pod_resources_endpoint_errors_listALPHACounterNumber of requests to the PodResource List endpoint which returned error. Broken down by server api version.
server_api_version
None -kubelet_pod_resources_endpoint_requests_get_allocatableALPHACounterNumber of requests to the PodResource GetAllocatableResources endpoint. Broken down by server api version.
server_api_version
None -kubelet_pod_resources_endpoint_requests_listALPHACounterNumber of requests to the PodResource List endpoint. Broken down by server api version.
server_api_version
None -kubelet_pod_resources_endpoint_requests_totalALPHACounterCumulative number of requests to the PodResource endpoint. Broken down by server api version.
server_api_version
None -kubelet_pod_start_duration_secondsALPHAHistogramDuration in seconds from kubelet seeing a pod for the first time to the pod starting to runNoneNone -kubelet_pod_status_sync_duration_secondsALPHAHistogramDuration in seconds to sync a pod status update. Measures time from detection of a change to pod status until the API is successfully updated for that pod, even if multiple intevening changes to pod status occur.NoneNone -kubelet_pod_worker_duration_secondsALPHAHistogramDuration in seconds to sync a single pod. Broken down by operation type: create, update, or sync
operation_type
None -kubelet_pod_worker_start_duration_secondsALPHAHistogramDuration in seconds from kubelet seeing a pod to starting a worker.NoneNone -kubelet_preemptionsALPHACounterCumulative number of pod preemptions by preemption resource
preemption_signal
None -kubelet_run_podsandbox_duration_secondsALPHAHistogramDuration in seconds of the run_podsandbox operations. Broken down by RuntimeClass.Handler.
runtime_handler
None -kubelet_run_podsandbox_errors_totalALPHACounterCumulative number of the run_podsandbox operation errors by RuntimeClass.Handler.
runtime_handler
None -kubelet_running_containersALPHAGaugeNumber of containers currently running
container_state
None -kubelet_running_podsALPHAGaugeNumber of pods that have a running pod sandboxNoneNone -kubelet_runtime_operations_duration_secondsALPHAHistogramDuration in seconds of runtime operations. Broken down by operation type.
operation_type
None -kubelet_runtime_operations_errors_totalALPHACounterCumulative number of runtime operation errors by operation type.
operation_type
None -kubelet_runtime_operations_totalALPHACounterCumulative number of runtime operations by operation type.
operation_type
None -kubelet_started_containers_errors_totalALPHACounterCumulative number of errors when starting containers
code
container_type
None -kubelet_started_containers_totalALPHACounterCumulative number of containers started
container_type
None -kubelet_started_host_process_containers_errors_totalALPHACounterCumulative number of errors when starting hostprocess containers. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.
code
container_type
None -kubelet_started_host_process_containers_totalALPHACounterCumulative number of hostprocess containers started. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.
container_type
None -kubelet_started_pods_errors_totalALPHACounterCumulative number of errors when starting podsNoneNone -kubelet_started_pods_totalALPHACounterCumulative number of pods startedNoneNone -kubelet_volume_metric_collection_duration_secondsALPHAHistogramDuration in seconds to calculate volume stats
metric_source
None -prober_probe_duration_secondsALPHAHistogramDuration in seconds for a probe response.
container
namespace
pod
probe_type
None -prober_probe_totalALPHACounterCumulative number of a liveness, readiness or startup probe for a container by result.
container
namespace
pod
pod_uid
probe_type
result
None -apiserver_certificates_registry_csr_honored_duration_totalALPHACounterTotal number of issued CSRs with a requested duration that was honored, sliced by signer (only kubernetes.io signer names are specifically identified)
signerName
None -apiserver_certificates_registry_csr_requested_duration_totalALPHACounterTotal number of issued CSRs with a requested duration, sliced by signer (only kubernetes.io signer names are specifically identified)
signerName
None -kubeproxy_network_programming_duration_secondsALPHAHistogramIn Cluster Network Programming Latency in secondsNoneNone -kubeproxy_sync_proxy_rules_duration_secondsALPHAHistogramSyncProxyRules latency in secondsNoneNone -kubeproxy_sync_proxy_rules_endpoint_changes_pendingALPHAGaugePending proxy rules Endpoint changesNoneNone -kubeproxy_sync_proxy_rules_endpoint_changes_totalALPHACounterCumulative proxy rules Endpoint changesNoneNone -kubeproxy_sync_proxy_rules_iptables_restore_failures_totalALPHACounterCumulative proxy iptables restore failuresNoneNone -kubeproxy_sync_proxy_rules_iptables_totalALPHAGaugeNumber of proxy iptables rules programmed
table
None -kubeproxy_sync_proxy_rules_last_queued_timestamp_secondsALPHAGaugeThe last time a sync of proxy rules was queuedNoneNone -kubeproxy_sync_proxy_rules_last_timestamp_secondsALPHAGaugeThe last time proxy rules were successfully syncedNoneNone -kubeproxy_sync_proxy_rules_no_local_endpoints_totalALPHAGaugeNumber of services with a Local traffic policy and no endpoints
traffic_policy
None -kubeproxy_sync_proxy_rules_service_changes_pendingALPHAGaugePending proxy rules Service changesNoneNone -kubeproxy_sync_proxy_rules_service_changes_totalALPHACounterCumulative proxy rules Service changesNoneNone -volume_manager_selinux_container_errors_totalALPHAGaugeNumber of errors when kubelet cannot compute SELinux context for a container. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of containers.NoneNone -volume_manager_selinux_container_warnings_totalALPHAGaugeNumber of errors when kubelet cannot compute SELinux context for a container that are ignored. They will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone -volume_manager_selinux_pod_context_mismatch_errors_totalALPHAGaugeNumber of errors when a Pod defines different SELinux contexts for its containers that use the same volume. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.NoneNone -volume_manager_selinux_pod_context_mismatch_warnings_totalALPHAGaugeNumber of errors when a Pod defines different SELinux contexts for its containers that use the same volume. They are not errors yet, but they will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone -volume_manager_selinux_volume_context_mismatch_errors_totalALPHAGaugeNumber of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.NoneNone -volume_manager_selinux_volume_context_mismatch_warnings_totalALPHAGaugeNumber of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. 
They are not errors yet, but they will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone -volume_manager_selinux_volumes_admitted_totalALPHAGaugeNumber of volumes whose SELinux context was fine and will be mounted with mount -o context option.NoneNone -kube_apiserver_clusterip_allocator_allocated_ipsALPHAGaugeGauge measuring the number of allocated IPs for Services
cidr
None -kube_apiserver_clusterip_allocator_allocation_errors_totalALPHACounterNumber of errors trying to allocate Cluster IPs
cidr
scope
None -kube_apiserver_clusterip_allocator_allocation_totalALPHACounterNumber of Cluster IPs allocations
cidr
scope
None -kube_apiserver_clusterip_allocator_available_ipsALPHAGaugeGauge measuring the number of available IPs for Services
cidr
None -kube_apiserver_pod_logs_pods_logs_backend_tls_failure_totalALPHACounterTotal number of requests for pods/logs that failed due to kubelet server TLS verificationNoneNone -kube_apiserver_pod_logs_pods_logs_insecure_backend_totalALPHACounterTotal number of requests for pods/logs sliced by usage type: enforce_tls, skip_tls_allowed, skip_tls_denied
usage
None -scheduler_e2e_scheduling_duration_secondsALPHAHistogramE2e scheduling latency in seconds (scheduling algorithm + binding). This metric is replaced by scheduling_attempt_duration_seconds.
profile
result
None -scheduler_goroutinesALPHAGaugeNumber of running goroutines split by the work they do such as binding.
operation
None -scheduler_permit_wait_duration_secondsALPHAHistogramDuration of waiting on permit.
result
None -scheduler_plugin_execution_duration_secondsALPHAHistogramDuration for running a plugin at a specific extension point.
extension_point
plugin
status
None -scheduler_scheduler_cache_sizeALPHAGaugeNumber of nodes, pods, and assumed (bound) pods in the scheduler cache.
type
None -scheduler_scheduler_goroutinesALPHAGaugeNumber of running goroutines split by the work they do such as binding. This metric is replaced by the \"goroutines\" metric.
work
None -scheduler_scheduling_algorithm_duration_secondsALPHAHistogramScheduling algorithm latency in secondsNoneNone -scheduler_unschedulable_podsALPHAGaugeThe number of unschedulable pods broken down by plugin name. A pod will increment the gauge for all plugins that caused it to not schedule and so this metric have meaning only when broken down by plugin.
plugin
profile
None -scheduler_volume_binder_cache_requests_totalALPHACounterTotal number for request volume binding cache
operation
None -scheduler_volume_scheduling_stage_error_totalALPHACounterVolume scheduling stage error count
operation
None -serviceaccount_legacy_tokens_totalALPHACounterCumulative legacy service account tokens usedNoneNone -serviceaccount_stale_tokens_totalALPHACounterCumulative stale projected service account tokens usedNoneNone -serviceaccount_valid_tokens_totalALPHACounterCumulative valid projected service account tokens usedNoneNone -scheduler_framework_extension_point_duration_secondsSTABLEHistogramLatency for running all plugins of a specific extension point.
extension_point
profile
status
None -scheduler_pending_podsSTABLEGaugeNumber of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulablePods.
queue
None -scheduler_pod_scheduling_attemptsSTABLEHistogramNumber of attempts to successfully schedule a pod.NoneNone -scheduler_pod_scheduling_duration_secondsSTABLEHistogramE2e latency for a pod being scheduled which may include multiple scheduling attempts.
attempts
None -scheduler_preemption_attempts_totalSTABLECounterTotal preemption attempts in the cluster till nowNoneNone -scheduler_preemption_victimsSTABLEHistogramNumber of selected preemption victimsNoneNone -scheduler_queue_incoming_pods_totalSTABLECounterNumber of pods added to scheduling queues by event and queue type.
event
queue
None -scheduler_schedule_attempts_totalSTABLECounterNumber of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.
profile
result
None -scheduler_scheduling_attempt_duration_secondsSTABLEHistogramScheduling attempt latency in seconds (scheduling algorithm + binding)
profile
result
None -csi_operations_secondsALPHAHistogramContainer Storage Interface operation duration with gRPC error code status total
driver_name
grpc_status_code
method_name
migrated
None -storage_operation_duration_secondsALPHAHistogramStorage operation duration
migrated
operation_name
status
volume_plugin
None -volume_operation_total_secondsALPHAHistogramStorage operation end to end duration in seconds
operation_name
plugin_name
None -node_authorizer_graph_actions_duration_secondsALPHAHistogramHistogram of duration of graph actions in node authorizer.
operation
None +aggregator_openapi_v2_regeneration_countALPHACounterCounter of OpenAPI v2 spec regeneration count broken down by causing APIService name and reason.
apiservice
reason
None +aggregator_openapi_v2_regeneration_durationALPHAGaugeGauge of OpenAPI v2 spec regeneration duration in seconds.
reason
None +aggregator_unavailable_apiservice_totalALPHACounterCounter of APIServices which are marked as unavailable broken down by APIService name and reason.
name
reason
None apiextensions_openapi_v2_regeneration_countALPHACounterCounter of OpenAPI v2 spec regeneration count broken down by causing CRD name and reason.
crd
reason
None apiextensions_openapi_v3_regeneration_countALPHACounterCounter of OpenAPI v3 spec regeneration count broken down by group, version, causing CRD and reason.
crd
group
reason
version
None -apiserver_crd_webhook_conversion_duration_secondsALPHAHistogramCRD webhook conversion duration in seconds
crd_name
from_version
succeeded
to_version
None apiserver_admission_step_admission_duration_seconds_summaryALPHASummaryAdmission sub-step latency summary in seconds, broken out for each operation and API resource and step type (validate or admit).
operation
rejected
type
None apiserver_admission_webhook_fail_open_countALPHACounterAdmission webhook fail open count, identified by name and broken out for each admission type (validating or mutating).
name
type
None apiserver_admission_webhook_rejection_countALPHACounterAdmission webhook rejection count, identified by name and broken out for each admission type (validating or admit) and operation. Additional labels specify an error type (calling_webhook_error or apiserver_internal_error if an error occurred; no_error otherwise) and optionally a non-zero rejection code if the webhook rejects the request with an HTTP status code (honored by the apiserver when the code is greater or equal to 400). Codes greater than 600 are truncated to 600, to keep the metrics cardinality bounded.
error_type
name
operation
rejection_code
type
None @@ -184,47 +37,22 @@ These are the metrics which are exported in Kubernetes components (i.e. kube-api apiserver_audit_event_totalALPHACounterCounter of audit events generated and sent to the audit backend.NoneNone apiserver_audit_level_totalALPHACounterCounter of policy levels for audit events (1 per request).
level
None apiserver_audit_requests_rejected_totalALPHACounterCounter of apiserver requests rejected due to an error in audit logging backend.NoneNone -apiserver_delegated_authn_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by status code.
code
None -apiserver_delegated_authn_request_totalALPHACounterNumber of HTTP requests partitioned by status code.
code
None -apiserver_admission_controller_admission_duration_secondsSTABLEHistogramAdmission controller latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).
name
operation
rejected
type
None -apiserver_admission_step_admission_duration_secondsSTABLEHistogramAdmission sub-step latency histogram in seconds, broken out for each operation and API resource and step type (validate or admit).
operation
rejected
type
None -apiserver_admission_webhook_admission_duration_secondsSTABLEHistogramAdmission webhook latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).
name
operation
rejected
type
None +apiserver_cache_list_fetched_objects_totalALPHACounterNumber of objects read from watch cache in the course of serving a LIST request
index
resource_prefix
None +apiserver_cache_list_returned_objects_totalALPHACounterNumber of objects returned for a LIST request from watch cache
resource_prefix
None +apiserver_cache_list_totalALPHACounterNumber of LIST requests served from watch cache
index
resource_prefix
None apiserver_cel_compilation_duration_secondsALPHAHistogramNoneNone apiserver_cel_evaluation_duration_secondsALPHAHistogramNoneNone +apiserver_certificates_registry_csr_honored_duration_totalALPHACounterTotal number of issued CSRs with a requested duration that was honored, sliced by signer (only kubernetes.io signer names are specifically identified)
signerName
None +apiserver_certificates_registry_csr_requested_duration_totalALPHACounterTotal number of issued CSRs with a requested duration, sliced by signer (only kubernetes.io signer names are specifically identified)
signerName
None apiserver_client_certificate_expiration_secondsALPHAHistogramDistribution of the remaining lifetime on the certificate used to authenticate a request.NoneNone +apiserver_crd_webhook_conversion_duration_secondsALPHAHistogramCRD webhook conversion duration in seconds
crd_name
from_version
succeeded
to_version
None apiserver_current_inqueue_requestsALPHAGaugeMaximal number of queued requests in this apiserver per request kind in last second.
request_kind
None +apiserver_delegated_authn_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by status code.
code
None +apiserver_delegated_authn_request_totalALPHACounterNumber of HTTP requests partitioned by status code.
code
None apiserver_delegated_authz_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by status code.
code
None apiserver_delegated_authz_request_totalALPHACounterNumber of HTTP requests partitioned by status code.
code
None apiserver_egress_dialer_dial_duration_secondsALPHAHistogramDial latency histogram in seconds, labeled by the protocol (http-connect or grpc), transport (tcp or uds)
protocol
transport
None apiserver_egress_dialer_dial_failure_countALPHACounterDial failure count, labeled by the protocol (http-connect or grpc), transport (tcp or uds), and stage (connect or proxy). The stage indicates at which stage the dial failed
protocol
stage
transport
None -apiserver_request_aborts_totalALPHACounterNumber of requests which apiserver aborted possibly due to a timeout, for each group, version, verb, resource, subresource and scope
group
resource
scope
subresource
verb
version
None -apiserver_request_body_sizesALPHAHistogramApiserver request body sizes broken out by size.
resource
verb
None -apiserver_request_filter_duration_secondsALPHAHistogramRequest filter latency distribution in seconds, for each filter type
filter
None -apiserver_request_post_timeout_totalALPHACounterTracks the activity of the request handlers after the associated requests have been timed out by the apiserver
source
status
None -apiserver_request_slo_duration_secondsALPHAHistogramResponse latency distribution (not counting webhook duration) in seconds for each verb, group, version, resource, subresource, scope and component.
component
group
resource
scope
subresource
verb
version
None -apiserver_request_terminations_totalALPHACounterNumber of requests which apiserver terminated in self-defense.
code
component
group
resource
scope
subresource
verb
version
None -apiserver_request_timestamp_comparison_timeALPHAHistogramTime taken for comparison of old vs new objects in UPDATE or PATCH requests
code_path
None -apiserver_selfrequest_totalALPHACounterCounter of apiserver self-requests broken out for each verb, API resource and subresource.
resource
subresource
verb
None -apiserver_tls_handshake_errors_totalALPHACounterNumber of requests dropped with 'TLS handshake error from' errorNoneNone -apiserver_watch_events_sizesALPHAHistogramWatch event size distribution in bytes
group
kind
version
None -apiserver_watch_events_totalALPHACounterNumber of events sent in watch clients
group
kind
version
None -authenticated_user_requestsALPHACounterCounter of authenticated requests broken out by username.
username
None -authentication_attemptsALPHACounterCounter of authenticated attempts.
result
None -authentication_duration_secondsALPHAHistogramAuthentication duration in seconds broken out by result.
result
None -authentication_token_cache_active_fetch_countALPHAGauge
status
None -authentication_token_cache_fetch_totalALPHACounter
status
None -authentication_token_cache_request_duration_secondsALPHAHistogram
status
None -authentication_token_cache_request_totalALPHACounter
status
None -field_validation_request_duration_secondsALPHAHistogramResponse latency distribution in seconds for each field validation value and whether field validation is enabled or not
enabled
field_validation
None -apiserver_current_inflight_requestsSTABLEGaugeMaximal number of currently used inflight request limit of this apiserver per request kind in last second.
request_kind
None -apiserver_longrunning_requestsSTABLEGaugeGauge of all active long-running apiserver requests broken out by verb, group, version, resource, scope and component. Not all requests are tracked this way.
component
group
resource
scope
subresource
verb
version
None -apiserver_request_duration_secondsSTABLEHistogramResponse latency distribution in seconds for each verb, dry run value, group, version, resource, subresource, scope and component.
component
dry_run
group
resource
scope
subresource
verb
version
None -apiserver_request_totalSTABLECounterCounter of apiserver requests broken out for each verb, dry run value, group, version, resource, scope, component, and HTTP response code.
code
component
dry_run
group
resource
scope
subresource
verb
version
None -apiserver_requested_deprecated_apisSTABLEGaugeGauge of deprecated APIs that have been requested, broken out by API group, version, resource, subresource, and removed_release.
group
removed_release
resource
subresource
version
None -apiserver_response_sizesSTABLEHistogramResponse size distribution in bytes for each group, version, verb, resource, subresource, scope and component.
component
group
resource
scope
subresource
verb
version
None -apiserver_cache_list_fetched_objects_totalALPHACounterNumber of objects read from watch cache in the course of serving a LIST request
index
resource_prefix
None -apiserver_cache_list_returned_objects_totalALPHACounterNumber of objects returned for a LIST request from watch cache
resource_prefix
None -apiserver_cache_list_totalALPHACounterNumber of LIST requests served from watch cache
index
resource_prefix
None apiserver_envelope_encryption_dek_cache_fill_percentALPHAGaugePercent of the cache slots currently occupied by cached DEKs.NoneNone apiserver_envelope_encryption_dek_cache_inter_arrival_time_secondsALPHAHistogramTime (in seconds) of inter arrival of transformation requests.
transformation_type
None apiserver_flowcontrol_current_executing_requestsALPHAGaugeNumber of requests in initial (for a WATCH) or any (for a non-WATCH) execution stage in the API Priority and Fairness subsystem
flow_schema
priority_level
None @@ -249,6 +77,16 @@ These are the metrics which are exported in Kubernetes components (i.e. kube-api apiserver_flowcontrol_watch_count_samplesALPHAHistogramcount of watchers for mutating requests in API Priority and Fairness
flow_schema
priority_level
None apiserver_flowcontrol_work_estimated_seatsALPHAHistogramNumber of estimated seats (maximum of initial and final seats) associated with requests in API Priority and Fairness
flow_schema
priority_level
None apiserver_init_events_totalALPHACounterCounter of init events processed in watch cache broken by resource type.
resource
None +apiserver_kube_aggregator_x509_insecure_sha1_totalALPHACounterCounts the number of requests to servers with insecure SHA1 signatures in their serving certificate OR the number of connection failures due to the insecure SHA1 signatures (either/or, based on the runtime environment)NoneNone +apiserver_kube_aggregator_x509_missing_san_totalALPHACounterCounts the number of requests to servers missing SAN extension in their serving certificate OR the number of connection failures due to the lack of x509 certificate SAN extension missing (either/or, based on the runtime environment)NoneNone +apiserver_request_aborts_totalALPHACounterNumber of requests which apiserver aborted possibly due to a timeout, for each group, version, verb, resource, subresource and scope
group
resource
scope
subresource
verb
version
None +apiserver_request_body_sizesALPHAHistogramApiserver request body sizes broken out by size.
resource
verb
None +apiserver_request_filter_duration_secondsALPHAHistogramRequest filter latency distribution in seconds, for each filter type
filter
None +apiserver_request_post_timeout_totalALPHACounterTracks the activity of the request handlers after the associated requests have been timed out by the apiserver
source
status
None +apiserver_request_slo_duration_secondsALPHAHistogramResponse latency distribution (not counting webhook duration) in seconds for each verb, group, version, resource, subresource, scope and component.
component
group
resource
scope
subresource
verb
version
None +apiserver_request_terminations_totalALPHACounterNumber of requests which apiserver terminated in self-defense.
code
component
group
resource
scope
subresource
verb
version
None +apiserver_request_timestamp_comparison_timeALPHAHistogramTime taken for comparison of old vs new objects in UPDATE or PATCH requests
code_path
None +apiserver_selfrequest_totalALPHACounterCounter of apiserver self-requests broken out for each verb, API resource and subresource.
resource
subresource
verb
None apiserver_storage_data_key_generation_duration_secondsALPHAHistogramLatencies in seconds of data encryption key(DEK) generation operations.NoneNone apiserver_storage_data_key_generation_failures_totalALPHACounterTotal number of failed data encryption key(DEK) generation operations.NoneNone apiserver_storage_envelope_transformation_cache_misses_totalALPHACounterTotal number of cache misses while accessing key decryption key(KEK).NoneNone @@ -259,46 +97,21 @@ These are the metrics which are exported in Kubernetes components (i.e. kube-api apiserver_storage_transformation_duration_secondsALPHAHistogramLatencies in seconds of value transformation operations.
transformation_type
None apiserver_storage_transformation_operations_totalALPHACounterTotal number of transformations.
status
transformation_type
transformer_prefix
None apiserver_terminated_watchers_totalALPHACounterCounter of watchers closed due to unresponsiveness broken by resource type.
resource
None +apiserver_tls_handshake_errors_totalALPHACounterNumber of requests dropped with 'TLS handshake error from' errorNoneNone apiserver_watch_cache_events_dispatched_totalALPHACounterCounter of events dispatched in watch cache broken by resource type.
resource
None apiserver_watch_cache_initializations_totalALPHACounterCounter of watch cache initializations broken by resource type.
resource
None +apiserver_watch_events_sizesALPHAHistogramWatch event size distribution in bytes
group
kind
version
None +apiserver_watch_events_totalALPHACounterNumber of events sent in watch clients
group
kind
version
None apiserver_webhooks_x509_insecure_sha1_totalALPHACounterCounts the number of requests to servers with insecure SHA1 signatures in their serving certificate OR the number of connection failures due to the insecure SHA1 signatures (either/or, based on the runtime environment)NoneNone apiserver_webhooks_x509_missing_san_totalALPHACounterCounts the number of requests to servers missing SAN extension in their serving certificate OR the number of connection failures due to the lack of x509 certificate SAN extension missing (either/or, based on the runtime environment)NoneNone -etcd_bookmark_countsALPHAGaugeNumber of etcd bookmarks (progress notify events) split by kind.
resource
None -etcd_db_total_size_in_bytesALPHAGaugeTotal size of the etcd database file physically allocated in bytes.
endpoint
None -etcd_lease_object_countsALPHAHistogramNumber of objects attached to a single etcd lease.NoneNone -etcd_request_duration_secondsALPHAHistogramEtcd request latency in seconds for each operation and object type.
operation
type
None -watch_cache_capacityALPHAGaugeTotal capacity of watch cache broken by resource type.
resource
None -watch_cache_capacity_decrease_totalALPHACounterTotal number of watch cache capacity decrease events broken by resource type.
resource
None -watch_cache_capacity_increase_totalALPHACounterTotal number of watch cache capacity increase events broken by resource type.
resource
None -apiserver_storage_objectsSTABLEGaugeNumber of stored objects at the time of last check split by kind.
resource
None -service_controller_nodesync_latency_secondsALPHAHistogramA metric measuring the latency for nodesync which updates loadbalancer hosts on cluster node updates.NoneNone -service_controller_update_loadbalancer_host_latency_secondsALPHAHistogramA metric measuring the latency for updating each load balancer hosts.NoneNone -kubernetes_build_infoALPHAGaugeA metric with a constant '1' value labeled by major, minor, git version, git commit, git tree state, build date, Go version, and compiler from which Kubernetes was built, and platform on which it is running.
build_date
compiler
git_commit
git_tree_state
git_version
go_version
major
minor
platform
None -kubernetes_feature_enabledALPHAGaugeThis metric records the data about the stage and enablement of a k8s feature.
name
stage
None -kubernetes_healthcheckALPHAGaugeThis metric records the result of a single healthcheck.
name
type
None -kubernetes_healthchecks_totalALPHACounterThis metric records the results of all healthcheck.
name
status
type
None -leader_election_master_statusALPHAGaugeGauge of if the reporting system is master of the relevant lease, 0 indicates backup, 1 indicates master. 'name' is the string used to identify the lease. Please make sure to group by name.
name
None -rest_client_exec_plugin_call_totalALPHACounterNumber of calls to an exec plugin, partitioned by the type of event encountered (no_error, plugin_execution_error, plugin_not_found_error, client_internal_error) and an optional exit code. The exit code will be set to 0 if and only if the plugin call was successful.
call_status
code
None -rest_client_exec_plugin_certificate_rotation_ageALPHAHistogramHistogram of the number of seconds the last auth exec plugin client certificate lived before being rotated. If auth exec plugin client certificates are unused, histogram will contain no data.NoneNone -rest_client_exec_plugin_ttl_secondsALPHAGaugeGauge of the shortest TTL (time-to-live) of the client certificate(s) managed by the auth exec plugin. The value is in seconds until certificate expiry (negative if already expired). If auth exec plugins are unused or manage no TLS certificates, the value will be +INF.NoneNone -rest_client_rate_limiter_duration_secondsALPHAHistogramClient side rate limiter latency in seconds. Broken down by verb, and host.
host
verb
None -rest_client_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by verb, and host.
host
verb
None -rest_client_request_size_bytesALPHAHistogramRequest size in bytes. Broken down by verb and host.
host
verb
None -rest_client_requests_totalALPHACounterNumber of HTTP requests, partitioned by status code, method, and host.
code
host
method
None -rest_client_response_size_bytesALPHAHistogramResponse size in bytes. Broken down by verb and host.
host
verb
None -running_managed_controllersALPHAGaugeIndicates where instances of a controller are currently running
manager
name
None -workqueue_adds_totalALPHACounterTotal number of adds handled by workqueue
name
None -workqueue_depthALPHAGaugeCurrent depth of workqueue
name
None -workqueue_longest_running_processor_secondsALPHAGaugeHow many seconds has the longest running processor for workqueue been running.
name
None -workqueue_queue_duration_secondsALPHAHistogramHow long in seconds an item stays in workqueue before being requested.
name
None -workqueue_retries_totalALPHACounterTotal number of retries handled by workqueue
name
None -workqueue_unfinished_work_secondsALPHAGaugeHow many seconds of work has done that is in progress and hasn't been observed by work_duration. Large values indicate stuck threads. One can deduce the number of stuck threads by observing the rate at which this increases.
name
None -workqueue_work_duration_secondsALPHAHistogramHow long in seconds processing an item from workqueue takes.
name
None -apiserver_kube_aggregator_x509_insecure_sha1_totalALPHACounterCounts the number of requests to servers with insecure SHA1 signatures in their serving certificate OR the number of connection failures due to the insecure SHA1 signatures (either/or, based on the runtime environment)NoneNone -apiserver_kube_aggregator_x509_missing_san_totalALPHACounterCounts the number of requests to servers missing SAN extension in their serving certificate OR the number of connection failures due to the lack of x509 certificate SAN extension missing (either/or, based on the runtime environment)NoneNone -aggregator_openapi_v2_regeneration_countALPHACounterCounter of OpenAPI v2 spec regeneration count broken down by causing APIService name and reason.
apiservice
reason
None -aggregator_openapi_v2_regeneration_durationALPHAGaugeGauge of OpenAPI v2 spec regeneration duration in seconds.
reason
None -aggregator_unavailable_apiservice_totalALPHACounterCounter of APIServices which are marked as unavailable broken down by APIService name and reason.
name
reason
None +attachdetach_controller_forced_detachesALPHACounterNumber of times the A/D Controller performed a forced detachNoneNone +authenticated_user_requestsALPHACounterCounter of authenticated requests broken out by username.
username
None +authentication_attemptsALPHACounterCounter of authenticated attempts.
result
None +authentication_duration_secondsALPHAHistogramAuthentication duration in seconds broken out by result.
result
None +authentication_token_cache_active_fetch_countALPHAGauge
status
None +authentication_token_cache_fetch_totalALPHACounter
status
None +authentication_token_cache_request_duration_secondsALPHAHistogram
status
None +authentication_token_cache_request_totalALPHACounter
status
None cloudprovider_aws_api_request_duration_secondsALPHAHistogramLatency of AWS API calls
request
None cloudprovider_aws_api_request_errorsALPHACounterAWS API errors
request
None cloudprovider_aws_api_throttled_requests_totalALPHACounterAWS API throttled requests
operation_name
None @@ -314,12 +127,199 @@ These are the metrics which are exported in Kubernetes components (i.e. kube-api cloudprovider_vsphere_api_request_errorsALPHACountervsphere Api errors
request
None cloudprovider_vsphere_operation_duration_secondsALPHAHistogramLatency of vsphere operation call
operation
None cloudprovider_vsphere_operation_errorsALPHACountervsphere operation errors
operation
None +cronjob_controller_cronjob_job_creation_skew_duration_secondsALPHAHistogramTime between when a cronjob is scheduled to be run, and when the corresponding job is createdNoneNone +csi_operations_secondsALPHAHistogramContainer Storage Interface operation duration with gRPC error code status total
driver_name
grpc_status_code
method_name
migrated
None +endpoint_slice_controller_changesALPHACounterNumber of EndpointSlice changes
operation
None +endpoint_slice_controller_desired_endpoint_slicesALPHAGaugeNumber of EndpointSlices that would exist with perfect endpoint allocationNoneNone +endpoint_slice_controller_endpoints_added_per_syncALPHAHistogramNumber of endpoints added on each Service syncNoneNone +endpoint_slice_controller_endpoints_desiredALPHAGaugeNumber of endpoints desiredNoneNone +endpoint_slice_controller_endpoints_removed_per_syncALPHAHistogramNumber of endpoints removed on each Service syncNoneNone +endpoint_slice_controller_endpointslices_changed_per_syncALPHAHistogramNumber of EndpointSlices changed on each Service sync
topology
None +endpoint_slice_controller_num_endpoint_slicesALPHAGaugeNumber of EndpointSlicesNoneNone +endpoint_slice_controller_syncsALPHACounterNumber of EndpointSlice syncs
result
None +endpoint_slice_mirroring_controller_addresses_skipped_per_syncALPHAHistogramNumber of addresses skipped on each Endpoints sync due to being invalid or exceeding MaxEndpointsPerSubsetNoneNone +endpoint_slice_mirroring_controller_changesALPHACounterNumber of EndpointSlice changes
operation
None +endpoint_slice_mirroring_controller_desired_endpoint_slicesALPHAGaugeNumber of EndpointSlices that would exist with perfect endpoint allocationNoneNone +endpoint_slice_mirroring_controller_endpoints_added_per_syncALPHAHistogramNumber of endpoints added on each Endpoints syncNoneNone +endpoint_slice_mirroring_controller_endpoints_desiredALPHAGaugeNumber of endpoints desiredNoneNone +endpoint_slice_mirroring_controller_endpoints_removed_per_syncALPHAHistogramNumber of endpoints removed on each Endpoints syncNoneNone +endpoint_slice_mirroring_controller_endpoints_sync_durationALPHAHistogramDuration of syncEndpoints() in secondsNoneNone +endpoint_slice_mirroring_controller_endpoints_updated_per_syncALPHAHistogramNumber of endpoints updated on each Endpoints syncNoneNone +endpoint_slice_mirroring_controller_num_endpoint_slicesALPHAGaugeNumber of EndpointSlicesNoneNone +ephemeral_volume_controller_create_failures_totalALPHACounterNumber of PersistentVolumeClaims creation requestsNoneNone +ephemeral_volume_controller_create_totalALPHACounterNumber of PersistentVolumeClaims creation requestsNoneNone +etcd_bookmark_countsALPHAGaugeNumber of etcd bookmarks (progress notify events) split by kind.
resource
None +etcd_db_total_size_in_bytesALPHAGaugeTotal size of the etcd database file physically allocated in bytes.
endpoint
None +etcd_lease_object_countsALPHAHistogramNumber of objects attached to a single etcd lease.NoneNone +etcd_request_duration_secondsALPHAHistogramEtcd request latency in seconds for each operation and object type.
operation
type
None +etcd_version_infoALPHAGaugeEtcd server's binary version
binary_version
None +field_validation_request_duration_secondsALPHAHistogramResponse latency distribution in seconds for each field validation value and whether field validation is enabled or not
enabled
field_validation
None +garbagecollector_controller_resources_sync_error_totalALPHACounterNumber of garbage collector resources sync errorsNoneNone get_token_countALPHACounterCounter of total Token() requests to the alternate token sourceNoneNone get_token_fail_countALPHACounterCounter of failed Token() requests to the alternate token sourceNoneNone +job_controller_job_finished_totalALPHACounterThe number of finished job
completion_mode
result
None +job_controller_job_pods_finished_totalALPHACounterThe number of finished Pods that are fully tracked
completion_mode
result
None +job_controller_job_sync_duration_secondsALPHAHistogramThe time it took to sync a job
action
completion_mode
result
None +job_controller_job_sync_totalALPHACounterThe number of job syncs
action
completion_mode
result
None +job_controller_terminated_pods_tracking_finalizer_totalALPHACounterThe number of terminated pods (phase=Failed|Succeeded) that have the finalizer batch.kubernetes.io/job-tracking. The event label can be "add" or "delete".
event
None +kube_apiserver_clusterip_allocator_allocated_ipsALPHAGaugeGauge measuring the number of allocated IPs for Services
cidr
None +kube_apiserver_clusterip_allocator_allocation_errors_totalALPHACounterNumber of errors trying to allocate Cluster IPs
cidr
scope
None +kube_apiserver_clusterip_allocator_allocation_totalALPHACounterNumber of Cluster IPs allocations
cidr
scope
None +kube_apiserver_clusterip_allocator_available_ipsALPHAGaugeGauge measuring the number of available IPs for Services
cidr
None +kube_apiserver_pod_logs_pods_logs_backend_tls_failure_totalALPHACounterTotal number of requests for pods/logs that failed due to kubelet server TLS verificationNoneNone +kube_apiserver_pod_logs_pods_logs_insecure_backend_totalALPHACounterTotal number of requests for pods/logs sliced by usage type: enforce_tls, skip_tls_allowed, skip_tls_denied
usage
None +kubelet_certificate_manager_client_expiration_renew_errorsALPHACounterCounter of certificate renewal errors.NoneNone +kubelet_certificate_manager_client_ttl_secondsALPHAGaugeGauge of the TTL (time-to-live) of the Kubelet's client certificate. The value is in seconds until certificate expiry (negative if already expired). If client certificate is invalid or unused, the value will be +INF.NoneNone +kubelet_certificate_manager_server_rotation_secondsALPHAHistogramHistogram of the number of seconds the previous certificate lived before being rotated.NoneNone +kubelet_certificate_manager_server_ttl_secondsALPHAGaugeGauge of the shortest TTL (time-to-live) of the Kubelet's serving certificate. The value is in seconds until certificate expiry (negative if already expired). If serving certificate is invalid or unused, the value will be +INF.NoneNone +kubelet_cgroup_manager_duration_secondsALPHAHistogramDuration in seconds for cgroup manager operations. Broken down by method.
operation_type
None +kubelet_containers_per_pod_countALPHAHistogramThe number of containers per pod.NoneNone +kubelet_device_plugin_alloc_duration_secondsALPHAHistogramDuration in seconds to serve a device plugin Allocation request. Broken down by resource name.
resource_name
None +kubelet_device_plugin_registration_totalALPHACounterCumulative number of device plugin registrations. Broken down by resource name.
resource_name
None +kubelet_eviction_stats_age_secondsALPHAHistogramTime between when stats are collected, and when pod is evicted based on those stats by eviction signal
eviction_signal
None +kubelet_evictionsALPHACounterCumulative number of pod evictions by eviction signal
eviction_signal
None +kubelet_graceful_shutdown_end_time_secondsALPHAGaugeLast graceful shutdown end time since unix epoch in secondsNoneNone +kubelet_graceful_shutdown_start_time_secondsALPHAGaugeLast graceful shutdown start time since unix epoch in secondsNoneNone +kubelet_http_inflight_requestsALPHAGaugeNumber of the inflight http requests
long_running
method
path
server_type
None +kubelet_http_requests_duration_secondsALPHAHistogramDuration in seconds to serve http requests
long_running
method
path
server_type
None +kubelet_http_requests_totalALPHACounterNumber of the http requests received since the server started
long_running
method
path
server_type
None +kubelet_kubelet_credential_provider_plugin_durationALPHAHistogramDuration of execution in seconds for credential provider plugin
plugin_name
None +kubelet_kubelet_credential_provider_plugin_errorsALPHACounterNumber of errors from credential provider plugin
plugin_name
None +kubelet_lifecycle_handler_http_fallbacks_totalALPHACounterThe number of times lifecycle handlers successfully fell back to http from https.NoneNone +kubelet_managed_ephemeral_containersALPHAGaugeCurrent number of ephemeral containers in pods managed by this kubelet. Ephemeral containers will be ignored if disabled by the EphemeralContainers feature gate, and this number will be 0.NoneNone +kubelet_node_nameALPHAGaugeThe node's name. The count is always 1.
node
None +kubelet_pleg_discard_eventsALPHACounterThe number of discard events in PLEG.NoneNone +kubelet_pleg_last_seen_secondsALPHAGaugeTimestamp in seconds when PLEG was last seen active.NoneNone +kubelet_pleg_relist_duration_secondsALPHAHistogramDuration in seconds for relisting pods in PLEG.NoneNone +kubelet_pleg_relist_interval_secondsALPHAHistogramInterval in seconds between relisting in PLEG.NoneNone +kubelet_pod_resources_endpoint_errors_get_allocatableALPHACounterNumber of requests to the PodResource GetAllocatableResources endpoint which returned error. Broken down by server api version.
server_api_version
None +kubelet_pod_resources_endpoint_errors_listALPHACounterNumber of requests to the PodResource List endpoint which returned error. Broken down by server api version.
server_api_version
None +kubelet_pod_resources_endpoint_requests_get_allocatableALPHACounterNumber of requests to the PodResource GetAllocatableResources endpoint. Broken down by server api version.
server_api_version
None +kubelet_pod_resources_endpoint_requests_listALPHACounterNumber of requests to the PodResource List endpoint. Broken down by server api version.
server_api_version
None +kubelet_pod_resources_endpoint_requests_totalALPHACounterCumulative number of requests to the PodResource endpoint. Broken down by server api version.
server_api_version
None +kubelet_pod_start_duration_secondsALPHAHistogramDuration in seconds from kubelet seeing a pod for the first time to the pod starting to runNoneNone +kubelet_pod_status_sync_duration_secondsALPHAHistogramDuration in seconds to sync a pod status update. Measures time from detection of a change to pod status until the API is successfully updated for that pod, even if multiple intervening changes to pod status occur.NoneNone +kubelet_pod_worker_duration_secondsALPHAHistogramDuration in seconds to sync a single pod. Broken down by operation type: create, update, or sync
operation_type
None +kubelet_pod_worker_start_duration_secondsALPHAHistogramDuration in seconds from kubelet seeing a pod to starting a worker.NoneNone +kubelet_preemptionsALPHACounterCumulative number of pod preemptions by preemption resource
preemption_signal
None +kubelet_run_podsandbox_duration_secondsALPHAHistogramDuration in seconds of the run_podsandbox operations. Broken down by RuntimeClass.Handler.
runtime_handler
None +kubelet_run_podsandbox_errors_totalALPHACounterCumulative number of the run_podsandbox operation errors by RuntimeClass.Handler.
runtime_handler
None +kubelet_running_containersALPHAGaugeNumber of containers currently running
container_state
None +kubelet_running_podsALPHAGaugeNumber of pods that have a running pod sandboxNoneNone +kubelet_runtime_operations_duration_secondsALPHAHistogramDuration in seconds of runtime operations. Broken down by operation type.
operation_type
None +kubelet_runtime_operations_errors_totalALPHACounterCumulative number of runtime operation errors by operation type.
operation_type
None +kubelet_runtime_operations_totalALPHACounterCumulative number of runtime operations by operation type.
operation_type
None +kubelet_server_expiration_renew_errorsALPHACounterCounter of certificate renewal errors.NoneNone +kubelet_started_containers_errors_totalALPHACounterCumulative number of errors when starting containers
code
container_type
None +kubelet_started_containers_totalALPHACounterCumulative number of containers started
container_type
None +kubelet_started_host_process_containers_errors_totalALPHACounterCumulative number of errors when starting hostprocess containers. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.
code
container_type
None +kubelet_started_host_process_containers_totalALPHACounterCumulative number of hostprocess containers started. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.
container_type
None +kubelet_started_pods_errors_totalALPHACounterCumulative number of errors when starting podsNoneNone +kubelet_started_pods_totalALPHACounterCumulative number of pods startedNoneNone +kubelet_volume_metric_collection_duration_secondsALPHAHistogramDuration in seconds to calculate volume stats
metric_source
None +kubeproxy_network_programming_duration_secondsALPHAHistogramIn Cluster Network Programming Latency in secondsNoneNone +kubeproxy_sync_proxy_rules_duration_secondsALPHAHistogramSyncProxyRules latency in secondsNoneNone +kubeproxy_sync_proxy_rules_endpoint_changes_pendingALPHAGaugePending proxy rules Endpoint changesNoneNone +kubeproxy_sync_proxy_rules_endpoint_changes_totalALPHACounterCumulative proxy rules Endpoint changesNoneNone +kubeproxy_sync_proxy_rules_iptables_restore_failures_totalALPHACounterCumulative proxy iptables restore failuresNoneNone +kubeproxy_sync_proxy_rules_iptables_totalALPHAGaugeNumber of proxy iptables rules programmed
table
None +kubeproxy_sync_proxy_rules_last_queued_timestamp_secondsALPHAGaugeThe last time a sync of proxy rules was queuedNoneNone +kubeproxy_sync_proxy_rules_last_timestamp_secondsALPHAGaugeThe last time proxy rules were successfully syncedNoneNone +kubeproxy_sync_proxy_rules_no_local_endpoints_totalALPHAGaugeNumber of services with a Local traffic policy and no endpoints
traffic_policy
None +kubeproxy_sync_proxy_rules_service_changes_pendingALPHAGaugePending proxy rules Service changesNoneNone +kubeproxy_sync_proxy_rules_service_changes_totalALPHACounterCumulative proxy rules Service changesNoneNone +kubernetes_build_infoALPHAGaugeA metric with a constant '1' value labeled by major, minor, git version, git commit, git tree state, build date, Go version, and compiler from which Kubernetes was built, and platform on which it is running.
build_date
compiler
git_commit
git_tree_state
git_version
go_version
major
minor
platform
None +kubernetes_feature_enabledALPHAGaugeThis metric records the data about the stage and enablement of a k8s feature.
name
stage
None +kubernetes_healthcheckALPHAGaugeThis metric records the result of a single healthcheck.
name
type
None +kubernetes_healthchecks_totalALPHACounterThis metric records the results of all healthchecks.
name
status
type
None +leader_election_master_statusALPHAGaugeGauge of if the reporting system is master of the relevant lease, 0 indicates backup, 1 indicates master. 'name' is the string used to identify the lease. Please make sure to group by name.
name
None +node_authorizer_graph_actions_duration_secondsALPHAHistogramHistogram of duration of graph actions in node authorizer.
operation
None +node_collector_evictions_numberALPHACounterNumber of Node evictions that happened since current instance of NodeController started, This metric is replaced by node_collector_evictions_total.
zone
None +node_collector_unhealthy_nodes_in_zoneALPHAGaugeGauge measuring number of not Ready Nodes per zones.
zone
None +node_collector_zone_healthALPHAGaugeGauge measuring percentage of healthy nodes per zone.
zone
None +node_collector_zone_sizeALPHAGaugeGauge measuring number of registered Nodes per zones.
zone
None +node_ipam_controller_cidrset_allocation_tries_per_requestALPHAHistogramHistogram measuring CIDR allocation tries per request.
clusterCIDR
None +node_ipam_controller_cidrset_cidrs_allocations_totalALPHACounterCounter measuring total number of CIDR allocations.
clusterCIDR
None +node_ipam_controller_cidrset_cidrs_releases_totalALPHACounterCounter measuring total number of CIDR releases.
clusterCIDR
None +node_ipam_controller_cidrset_usage_cidrsALPHAGaugeGauge measuring percentage of allocated CIDRs.
clusterCIDR
None +node_ipam_controller_multicidrset_allocation_tries_per_requestALPHAHistogramHistogram measuring CIDR allocation tries per request.
clusterCIDR
None +node_ipam_controller_multicidrset_cidrs_allocations_totalALPHACounterCounter measuring total number of CIDR allocations.
clusterCIDR
None +node_ipam_controller_multicidrset_cidrs_releases_totalALPHACounterCounter measuring total number of CIDR releases.
clusterCIDR
None +node_ipam_controller_multicidrset_usage_cidrsALPHAGaugeGauge measuring percentage of allocated CIDRs.
clusterCIDR
None number_of_l4_ilbsALPHAGaugeNumber of L4 ILBs
feature
None pod_security_errors_totalALPHACounterNumber of errors preventing normal evaluation. Non-fatal errors may result in the latest restricted profile being used for evaluation.
fatal
request_operation
resource
subresource
None pod_security_evaluations_totalALPHACounterNumber of policy evaluations that occurred, not counting ignored or exempt requests.
decision
mode
policy_level
policy_version
request_operation
resource
subresource
None pod_security_exemptions_totalALPHACounterNumber of exempt requests, not counting ignored or out of scope requests.
request_operation
resource
subresource
None +prober_probe_duration_secondsALPHAHistogramDuration in seconds for a probe response.
container
namespace
pod
probe_type
None +prober_probe_totalALPHACounterCumulative number of a liveness, readiness or startup probe for a container by result.
container
namespace
pod
pod_uid
probe_type
result
None +replicaset_controller_sorting_deletion_age_ratioALPHAHistogramThe ratio of chosen deleted pod's ages to the current youngest pod's age (at the time). Should be <2. The intent of this metric is to measure the rough efficacy of the LogarithmicScaleDown feature gate's effect on the sorting (and deletion) of pods when a replicaset scales down. This only considers Ready pods when calculating and reporting.NoneNone +rest_client_exec_plugin_call_totalALPHACounterNumber of calls to an exec plugin, partitioned by the type of event encountered (no_error, plugin_execution_error, plugin_not_found_error, client_internal_error) and an optional exit code. The exit code will be set to 0 if and only if the plugin call was successful.
call_status
code
None +rest_client_exec_plugin_certificate_rotation_ageALPHAHistogramHistogram of the number of seconds the last auth exec plugin client certificate lived before being rotated. If auth exec plugin client certificates are unused, histogram will contain no data.NoneNone +rest_client_exec_plugin_ttl_secondsALPHAGaugeGauge of the shortest TTL (time-to-live) of the client certificate(s) managed by the auth exec plugin. The value is in seconds until certificate expiry (negative if already expired). If auth exec plugins are unused or manage no TLS certificates, the value will be +INF.NoneNone +rest_client_rate_limiter_duration_secondsALPHAHistogramClient side rate limiter latency in seconds. Broken down by verb, and host.
host
verb
None +rest_client_request_duration_secondsALPHAHistogramRequest latency in seconds. Broken down by verb, and host.
host
verb
None +rest_client_request_size_bytesALPHAHistogramRequest size in bytes. Broken down by verb and host.
host
verb
None +rest_client_requests_totalALPHACounterNumber of HTTP requests, partitioned by status code, method, and host.
code
host
method
None +rest_client_response_size_bytesALPHAHistogramResponse size in bytes. Broken down by verb and host.
host
verb
None +root_ca_cert_publisher_sync_duration_secondsALPHAHistogramDuration in seconds of namespace syncs in root ca cert publisher.
code
None +root_ca_cert_publisher_sync_totalALPHACounterNumber of namespace syncs happened in root ca cert publisher.
code
None +running_managed_controllersALPHAGaugeIndicates where instances of a controller are currently running
manager
name
None +scheduler_e2e_scheduling_duration_secondsALPHAHistogramE2e scheduling latency in seconds (scheduling algorithm + binding). This metric is replaced by scheduling_attempt_duration_seconds.
profile
result
None +scheduler_goroutinesALPHAGaugeNumber of running goroutines split by the work they do such as binding.
operation
None +scheduler_permit_wait_duration_secondsALPHAHistogramDuration of waiting on permit.
result
None +scheduler_plugin_execution_duration_secondsALPHAHistogramDuration for running a plugin at a specific extension point.
extension_point
plugin
status
None +scheduler_scheduler_cache_sizeALPHAGaugeNumber of nodes, pods, and assumed (bound) pods in the scheduler cache.
type
None +scheduler_scheduler_goroutinesALPHAGaugeNumber of running goroutines split by the work they do such as binding. This metric is replaced by the "goroutines" metric.
work
None +scheduler_scheduling_algorithm_duration_secondsALPHAHistogramScheduling algorithm latency in secondsNoneNone +scheduler_unschedulable_podsALPHAGaugeThe number of unschedulable pods broken down by plugin name. A pod will increment the gauge for all plugins that caused it to not schedule and so this metric has meaning only when broken down by plugin.
plugin
profile
None +scheduler_volume_binder_cache_requests_totalALPHACounterTotal number for request volume binding cache
operation
None +scheduler_volume_scheduling_stage_error_totalALPHACounterVolume scheduling stage error count
operation
None +service_controller_nodesync_latency_secondsALPHAHistogramA metric measuring the latency for nodesync which updates loadbalancer hosts on cluster node updates.NoneNone +service_controller_update_loadbalancer_host_latency_secondsALPHAHistogramA metric measuring the latency for updating each load balancer hosts.NoneNone +serviceaccount_legacy_tokens_totalALPHACounterCumulative legacy service account tokens usedNoneNone +serviceaccount_stale_tokens_totalALPHACounterCumulative stale projected service account tokens usedNoneNone +serviceaccount_valid_tokens_totalALPHACounterCumulative valid projected service account tokens usedNoneNone +storage_operation_duration_secondsALPHAHistogramStorage operation duration
migrated
operation_name
status
volume_plugin
None +ttl_after_finished_controller_job_deletion_duration_secondsALPHAHistogramThe time it took to delete the job since it became eligible for deletionNoneNone +volume_manager_selinux_container_errors_totalALPHAGaugeNumber of errors when kubelet cannot compute SELinux context for a container. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of containers.NoneNone +volume_manager_selinux_container_warnings_totalALPHAGaugeNumber of errors when kubelet cannot compute SELinux context for a container that are ignored. They will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone +volume_manager_selinux_pod_context_mismatch_errors_totalALPHAGaugeNumber of errors when a Pod defines different SELinux contexts for its containers that use the same volume. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.NoneNone +volume_manager_selinux_pod_context_mismatch_warnings_totalALPHAGaugeNumber of errors when a Pod defines different SELinux contexts for its containers that use the same volume. They are not errors yet, but they will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone +volume_manager_selinux_volume_context_mismatch_errors_totalALPHAGaugeNumber of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. Kubelet can't start such a Pod then and it will retry, therefore value of this metric may not represent the actual nr. of Pods.NoneNone +volume_manager_selinux_volume_context_mismatch_warnings_totalALPHAGaugeNumber of errors when a Pod uses a volume that is already mounted with a different SELinux context than the Pod needs. 
They are not errors yet, but they will become real errors when SELinuxMountReadWriteOncePod feature is expanded to all volume access modes.NoneNone +volume_manager_selinux_volumes_admitted_totalALPHAGaugeNumber of volumes whose SELinux context was fine and will be mounted with mount -o context option.NoneNone +volume_operation_total_errorsALPHACounterTotal volume operation errors
operation_name
plugin_name
None +volume_operation_total_secondsALPHAHistogramStorage operation end to end duration in seconds
operation_name
plugin_name
None +watch_cache_capacityALPHAGaugeTotal capacity of watch cache broken by resource type.
resource
None +watch_cache_capacity_decrease_totalALPHACounterTotal number of watch cache capacity decrease events broken by resource type.
resource
None +watch_cache_capacity_increase_totalALPHACounterTotal number of watch cache capacity increase events broken by resource type.
resource
None +workqueue_adds_totalALPHACounterTotal number of adds handled by workqueue
name
None +workqueue_depthALPHAGaugeCurrent depth of workqueue
name
None +workqueue_longest_running_processor_secondsALPHAGaugeHow many seconds has the longest running processor for workqueue been running.
name
None +workqueue_queue_duration_secondsALPHAHistogramHow long in seconds an item stays in workqueue before being requested.
name
None +workqueue_retries_totalALPHACounterTotal number of retries handled by workqueue
name
None +workqueue_unfinished_work_secondsALPHAGaugeHow many seconds of work has been done that is in progress and hasn't been observed by work_duration. Large values indicate stuck threads. One can deduce the number of stuck threads by observing the rate at which this increases.
name
None +workqueue_work_duration_secondsALPHAHistogramHow long in seconds processing an item from workqueue takes.
name
None +apiserver_admission_controller_admission_duration_secondsSTABLEHistogramAdmission controller latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).
name
operation
rejected
type
None +apiserver_admission_step_admission_duration_secondsSTABLEHistogramAdmission sub-step latency histogram in seconds, broken out for each operation and API resource and step type (validate or admit).
operation
rejected
type
None +apiserver_admission_webhook_admission_duration_secondsSTABLEHistogramAdmission webhook latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).
name
operation
rejected
type
None +apiserver_current_inflight_requestsSTABLEGaugeMaximal number of currently used inflight request limit of this apiserver per request kind in last second.
request_kind
None +apiserver_longrunning_requestsSTABLEGaugeGauge of all active long-running apiserver requests broken out by verb, group, version, resource, scope and component. Not all requests are tracked this way.
component
group
resource
scope
subresource
verb
version
None +apiserver_request_duration_secondsSTABLEHistogramResponse latency distribution in seconds for each verb, dry run value, group, version, resource, subresource, scope and component.
component
dry_run
group
resource
scope
subresource
verb
version
None +apiserver_request_totalSTABLECounterCounter of apiserver requests broken out for each verb, dry run value, group, version, resource, scope, component, and HTTP response code.
code
component
dry_run
group
resource
scope
subresource
verb
version
None +apiserver_requested_deprecated_apisSTABLEGaugeGauge of deprecated APIs that have been requested, broken out by API group, version, resource, subresource, and removed_release.
group
removed_release
resource
subresource
version
None +apiserver_response_sizesSTABLEHistogramResponse size distribution in bytes for each group, version, verb, resource, subresource, scope and component.
component
group
resource
scope
subresource
verb
version
None +apiserver_storage_objectsSTABLEGaugeNumber of stored objects at the time of last check split by kind.
resource
None +node_collector_evictions_totalSTABLECounterNumber of Node evictions that happened since current instance of NodeController started.
zone
None +scheduler_framework_extension_point_duration_secondsSTABLEHistogramLatency for running all plugins of a specific extension point.
extension_point
profile
status
None +scheduler_pending_podsSTABLEGaugeNumber of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulablePods.
queue
None +scheduler_pod_scheduling_attemptsSTABLEHistogramNumber of attempts to successfully schedule a pod.NoneNone +scheduler_pod_scheduling_duration_secondsSTABLEHistogramE2e latency for a pod being scheduled which may include multiple scheduling attempts.
attempts
None +scheduler_preemption_attempts_totalSTABLECounterTotal preemption attempts in the cluster till nowNoneNone +scheduler_preemption_victimsSTABLEHistogramNumber of selected preemption victimsNoneNone +scheduler_queue_incoming_pods_totalSTABLECounterNumber of pods added to scheduling queues by event and queue type.
event
queue
None +scheduler_schedule_attempts_totalSTABLECounterNumber of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.
profile
result
None +scheduler_scheduling_attempt_duration_secondsSTABLEHistogramScheduling attempt latency in seconds (scheduling algorithm + binding)
profile
result
None