From 9fd99a48f5c685fb76acc148ad771c01f3022c84 Mon Sep 17 00:00:00 2001 From: danielqsj Date: Wed, 2 Jan 2019 10:45:41 +0800 Subject: [PATCH 1/5] Change kubelet metrics to conform to the guideline --- pkg/kubelet/cm/cgroup_manager_linux.go | 9 +- pkg/kubelet/cm/devicemanager/manager.go | 4 +- pkg/kubelet/eviction/eviction_manager.go | 3 +- pkg/kubelet/kubelet.go | 9 +- .../kuberuntime/instrumented_services.go | 5 +- .../kuberuntime/instrumented_services_test.go | 4 +- pkg/kubelet/metrics/metrics.go | 180 +++++++++++++++--- pkg/kubelet/pleg/generic.go | 6 +- test/e2e/framework/metrics_util.go | 6 +- 9 files changed, 181 insertions(+), 45 deletions(-) diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go index 3bb09f0ddda..85d92edd0ea 100644 --- a/pkg/kubelet/cm/cgroup_manager_linux.go +++ b/pkg/kubelet/cm/cgroup_manager_linux.go @@ -284,7 +284,8 @@ func (m *cgroupManagerImpl) Exists(name CgroupName) bool { func (m *cgroupManagerImpl) Destroy(cgroupConfig *CgroupConfig) error { start := time.Now() defer func() { - metrics.CgroupManagerLatency.WithLabelValues("destroy").Observe(metrics.SinceInMicroseconds(start)) + metrics.CgroupManagerLatency.WithLabelValues("destroy").Observe(metrics.SinceInSeconds(start)) + metrics.DeprecatedCgroupManagerLatency.WithLabelValues("destroy").Observe(metrics.SinceInMicroseconds(start)) }() cgroupPaths := m.buildCgroupPaths(cgroupConfig.Name) @@ -411,7 +412,8 @@ func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcont func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error { start := time.Now() defer func() { - metrics.CgroupManagerLatency.WithLabelValues("update").Observe(metrics.SinceInMicroseconds(start)) + metrics.CgroupManagerLatency.WithLabelValues("update").Observe(metrics.SinceInSeconds(start)) + metrics.DeprecatedCgroupManagerLatency.WithLabelValues("update").Observe(metrics.SinceInMicroseconds(start)) }() // Extract the cgroup resource parameters @@ -446,7 
+448,8 @@ func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error { func (m *cgroupManagerImpl) Create(cgroupConfig *CgroupConfig) error { start := time.Now() defer func() { - metrics.CgroupManagerLatency.WithLabelValues("create").Observe(metrics.SinceInMicroseconds(start)) + metrics.CgroupManagerLatency.WithLabelValues("create").Observe(metrics.SinceInSeconds(start)) + metrics.DeprecatedCgroupManagerLatency.WithLabelValues("create").Observe(metrics.SinceInMicroseconds(start)) }() resources := m.toResources(cgroupConfig.ResourceParameters) diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go index dbb2720dfe9..e12df4d43af 100644 --- a/pkg/kubelet/cm/devicemanager/manager.go +++ b/pkg/kubelet/cm/devicemanager/manager.go @@ -354,6 +354,7 @@ func (m *ManagerImpl) Allocate(node *schedulernodeinfo.NodeInfo, attrs *lifecycl func (m *ManagerImpl) Register(ctx context.Context, r *pluginapi.RegisterRequest) (*pluginapi.Empty, error) { klog.Infof("Got registration request from device plugin with resource name %q", r.ResourceName) metrics.DevicePluginRegistrationCount.WithLabelValues(r.ResourceName).Inc() + metrics.DeprecatedDevicePluginRegistrationCount.WithLabelValues(r.ResourceName).Inc() var versionCompatible bool for _, v := range pluginapi.SupportedVersions { if r.Version == v { @@ -696,7 +697,8 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont // in a passed in AllocateRequest pointer, and issues a single Allocate call per pod. 
klog.V(3).Infof("Making allocation request for devices %v for device plugin %s", devs, resource) resp, err := eI.e.allocate(devs) - metrics.DevicePluginAllocationLatency.WithLabelValues(resource).Observe(metrics.SinceInMicroseconds(startRPCTime)) + metrics.DevicePluginAllocationLatency.WithLabelValues(resource).Observe(metrics.SinceInSeconds(startRPCTime)) + metrics.DeprecatedDevicePluginAllocationLatency.WithLabelValues(resource).Observe(metrics.SinceInMicroseconds(startRPCTime)) if err != nil { // In case of allocation failure, we want to restore m.allocatedDevices // to the actual allocated state from m.podDevices. diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index 801fc8106fa..2ecabf8194a 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -361,7 +361,8 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act for _, t := range thresholds { timeObserved := observations[t.Signal].time if !timeObserved.IsZero() { - metrics.EvictionStatsAge.WithLabelValues(string(t.Signal)).Observe(metrics.SinceInMicroseconds(timeObserved.Time)) + metrics.EvictionStatsAge.WithLabelValues(string(t.Signal)).Observe(metrics.SinceInSeconds(timeObserved.Time)) + metrics.DeprecatedEvictionStatsAge.WithLabelValues(string(t.Signal)).Observe(metrics.SinceInMicroseconds(timeObserved.Time)) } } diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index bb00c21a946..d80a0c4a87e 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -1500,7 +1500,8 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error { if !firstSeenTime.IsZero() { // This is the first time we are syncing the pod. Record the latency // since kubelet first saw the pod if firstSeenTime is set. 
- metrics.PodWorkerStartLatency.Observe(metrics.SinceInMicroseconds(firstSeenTime)) + metrics.PodWorkerStartLatency.Observe(metrics.SinceInSeconds(firstSeenTime)) + metrics.DeprecatedPodWorkerStartLatency.Observe(metrics.SinceInMicroseconds(firstSeenTime)) } else { klog.V(3).Infof("First seen time not recorded for pod %q", pod.UID) } @@ -1517,7 +1518,8 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error { existingStatus, ok := kl.statusManager.GetPodStatus(pod.UID) if !ok || existingStatus.Phase == v1.PodPending && apiPodStatus.Phase == v1.PodRunning && !firstSeenTime.IsZero() { - metrics.PodStartLatency.Observe(metrics.SinceInMicroseconds(firstSeenTime)) + metrics.PodStartLatency.Observe(metrics.SinceInSeconds(firstSeenTime)) + metrics.DeprecatedPodStartLatency.Observe(metrics.SinceInMicroseconds(firstSeenTime)) } runnable := kl.canRunPod(pod) @@ -1996,7 +1998,8 @@ func (kl *Kubelet) dispatchWork(pod *v1.Pod, syncType kubetypes.SyncPodType, mir UpdateType: syncType, OnCompleteFunc: func(err error) { if err != nil { - metrics.PodWorkerLatency.WithLabelValues(syncType.String()).Observe(metrics.SinceInMicroseconds(start)) + metrics.PodWorkerLatency.WithLabelValues(syncType.String()).Observe(metrics.SinceInSeconds(start)) + metrics.DeprecatedPodWorkerLatency.WithLabelValues(syncType.String()).Observe(metrics.SinceInMicroseconds(start)) } }, }) diff --git a/pkg/kubelet/kuberuntime/instrumented_services.go b/pkg/kubelet/kuberuntime/instrumented_services.go index dc3c6575f8b..a587cf0eea6 100644 --- a/pkg/kubelet/kuberuntime/instrumented_services.go +++ b/pkg/kubelet/kuberuntime/instrumented_services.go @@ -49,13 +49,16 @@ func newInstrumentedImageManagerService(service internalapi.ImageManagerService) // recordOperation records the duration of the operation. 
func recordOperation(operation string, start time.Time) { metrics.RuntimeOperations.WithLabelValues(operation).Inc() - metrics.RuntimeOperationsLatency.WithLabelValues(operation).Observe(metrics.SinceInMicroseconds(start)) + metrics.DeprecatedRuntimeOperations.WithLabelValues(operation).Inc() + metrics.RuntimeOperationsLatency.WithLabelValues(operation).Observe(metrics.SinceInSeconds(start)) + metrics.DeprecatedRuntimeOperationsLatency.WithLabelValues(operation).Observe(metrics.SinceInMicroseconds(start)) } // recordError records error for metric if an error occurred. func recordError(operation string, err error) { if err != nil { metrics.RuntimeOperationsErrors.WithLabelValues(operation).Inc() + metrics.DeprecatedRuntimeOperationsErrors.WithLabelValues(operation).Inc() } } diff --git a/pkg/kubelet/kuberuntime/instrumented_services_test.go b/pkg/kubelet/kuberuntime/instrumented_services_test.go index 6463e617f06..1a9521a0ea1 100644 --- a/pkg/kubelet/kuberuntime/instrumented_services_test.go +++ b/pkg/kubelet/kuberuntime/instrumented_services_test.go @@ -50,8 +50,8 @@ func TestRecordOperation(t *testing.T) { }() recordOperation("create_container", time.Now()) - runtimeOperationsCounterExpected := "kubelet_runtime_operations{operation_type=\"create_container\"} 1" - runtimeOperationsLatencyExpected := "kubelet_runtime_operations_latency_microseconds_count{operation_type=\"create_container\"} 1" + runtimeOperationsCounterExpected := "kubelet_runtime_operations_total{operation_type=\"create_container\"} 1" + runtimeOperationsLatencyExpected := "kubelet_runtime_operations_latency_seconds_count{operation_type=\"create_container\"} 1" assert.HTTPBodyContains(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { mux.ServeHTTP(w, r) diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go index c1b38795338..29d8c6db83e 100644 --- a/pkg/kubelet/metrics/metrics.go +++ b/pkg/kubelet/metrics/metrics.go @@ -31,30 +31,42 @@ import ( ) const ( - 
KubeletSubsystem = "kubelet" - NodeNameKey = "node_name" - NodeLabelKey = "node" - PodWorkerLatencyKey = "pod_worker_latency_microseconds" - PodStartLatencyKey = "pod_start_latency_microseconds" - CgroupManagerOperationsKey = "cgroup_manager_latency_microseconds" - PodWorkerStartLatencyKey = "pod_worker_start_latency_microseconds" - PLEGRelistLatencyKey = "pleg_relist_latency_microseconds" - PLEGDiscardEventsKey = "pleg_discard_events" - PLEGRelistIntervalKey = "pleg_relist_interval_microseconds" - EvictionStatsAgeKey = "eviction_stats_age_microseconds" - VolumeStatsCapacityBytesKey = "volume_stats_capacity_bytes" - VolumeStatsAvailableBytesKey = "volume_stats_available_bytes" - VolumeStatsUsedBytesKey = "volume_stats_used_bytes" - VolumeStatsInodesKey = "volume_stats_inodes" - VolumeStatsInodesFreeKey = "volume_stats_inodes_free" - VolumeStatsInodesUsedKey = "volume_stats_inodes_used" + KubeletSubsystem = "kubelet" + NodeNameKey = "node_name" + NodeLabelKey = "node" + PodWorkerLatencyKey = "pod_worker_latency_seconds" + PodStartLatencyKey = "pod_start_latency_seconds" + CgroupManagerOperationsKey = "cgroup_manager_latency_seconds" + PodWorkerStartLatencyKey = "pod_worker_start_latency_seconds" + PLEGRelistLatencyKey = "pleg_relist_latency_seconds" + PLEGDiscardEventsKey = "pleg_discard_events" + PLEGRelistIntervalKey = "pleg_relist_interval_seconds" + EvictionStatsAgeKey = "eviction_stats_age_seconds" + DeprecatedPodWorkerLatencyKey = "pod_worker_latency_microseconds" + DeprecatedPodStartLatencyKey = "pod_start_latency_microseconds" + DeprecatedCgroupManagerOperationsKey = "cgroup_manager_latency_microseconds" + DeprecatedPodWorkerStartLatencyKey = "pod_worker_start_latency_microseconds" + DeprecatedPLEGRelistLatencyKey = "pleg_relist_latency_microseconds" + DeprecatedPLEGRelistIntervalKey = "pleg_relist_interval_microseconds" + DeprecatedEvictionStatsAgeKey = "eviction_stats_age_microseconds" + VolumeStatsCapacityBytesKey = "volume_stats_capacity_bytes" + 
VolumeStatsAvailableBytesKey = "volume_stats_available_bytes" + VolumeStatsUsedBytesKey = "volume_stats_used_bytes" + VolumeStatsInodesKey = "volume_stats_inodes" + VolumeStatsInodesFreeKey = "volume_stats_inodes_free" + VolumeStatsInodesUsedKey = "volume_stats_inodes_used" // Metrics keys of remote runtime operations - RuntimeOperationsKey = "runtime_operations" - RuntimeOperationsLatencyKey = "runtime_operations_latency_microseconds" - RuntimeOperationsErrorsKey = "runtime_operations_errors" + RuntimeOperationsKey = "runtime_operations_total" + RuntimeOperationsLatencyKey = "runtime_operations_latency_seconds" + RuntimeOperationsErrorsKey = "runtime_operations_errors_total" + DeprecatedRuntimeOperationsKey = "runtime_operations" + DeprecatedRuntimeOperationsLatencyKey = "runtime_operations_latency_microseconds" + DeprecatedRuntimeOperationsErrorsKey = "runtime_operations_errors" // Metrics keys of device plugin operations - DevicePluginRegistrationCountKey = "device_plugin_registration_count" - DevicePluginAllocationLatencyKey = "device_plugin_alloc_latency_microseconds" + DevicePluginRegistrationCountKey = "device_plugin_registration_total" + DevicePluginAllocationLatencyKey = "device_plugin_alloc_latency_seconds" + DeprecatedDevicePluginRegistrationCountKey = "device_plugin_registration_count" + DeprecatedDevicePluginAllocationLatencyKey = "device_plugin_alloc_latency_microseconds" // Metric keys for node config AssignedConfigKey = "node_config_assigned" @@ -92,7 +104,7 @@ var ( prometheus.SummaryOpts{ Subsystem: KubeletSubsystem, Name: PodWorkerLatencyKey, - Help: "Latency in microseconds to sync a single pod. Broken down by operation type: create, update, or sync", + Help: "Latency in seconds to sync a single pod. 
Broken down by operation type: create, update, or sync", }, []string{"operation_type"}, ) @@ -100,14 +112,14 @@ var ( prometheus.SummaryOpts{ Subsystem: KubeletSubsystem, Name: PodStartLatencyKey, - Help: "Latency in microseconds for a single pod to go from pending to running.", + Help: "Latency in seconds for a single pod to go from pending to running.", }, ) CgroupManagerLatency = prometheus.NewSummaryVec( prometheus.SummaryOpts{ Subsystem: KubeletSubsystem, Name: CgroupManagerOperationsKey, - Help: "Latency in microseconds for cgroup manager operations. Broken down by method.", + Help: "Latency in seconds for cgroup manager operations. Broken down by method.", }, []string{"operation_type"}, ) @@ -115,14 +127,14 @@ var ( prometheus.SummaryOpts{ Subsystem: KubeletSubsystem, Name: PodWorkerStartLatencyKey, - Help: "Latency in microseconds from seeing a pod to starting a worker.", + Help: "Latency in seconds from seeing a pod to starting a worker.", }, ) PLEGRelistLatency = prometheus.NewSummary( prometheus.SummaryOpts{ Subsystem: KubeletSubsystem, Name: PLEGRelistLatencyKey, - Help: "Latency in microseconds for relisting pods in PLEG.", + Help: "Latency in seconds for relisting pods in PLEG.", }, ) PLEGDiscardEvents = prometheus.NewCounterVec( @@ -137,7 +149,7 @@ var ( prometheus.SummaryOpts{ Subsystem: KubeletSubsystem, Name: PLEGRelistIntervalKey, - Help: "Interval in microseconds between relisting in PLEG.", + Help: "Interval in seconds between relisting in PLEG.", }, ) // Metrics of remote runtime operations. @@ -153,7 +165,7 @@ var ( prometheus.SummaryOpts{ Subsystem: KubeletSubsystem, Name: RuntimeOperationsLatencyKey, - Help: "Latency in microseconds of runtime operations. Broken down by operation type.", + Help: "Latency in seconds of runtime operations. 
Broken down by operation type.", }, []string{"operation_type"}, ) @@ -185,6 +197,99 @@ var ( prometheus.SummaryOpts{ Subsystem: KubeletSubsystem, Name: DevicePluginAllocationLatencyKey, + Help: "Latency in seconds to serve a device plugin Allocation request. Broken down by resource name.", + }, + []string{"resource_name"}, + ) + + DeprecatedPodWorkerLatency = prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedPodWorkerLatencyKey, + Help: "Latency in microseconds to sync a single pod. Broken down by operation type: create, update, or sync", + }, + []string{"operation_type"}, + ) + DeprecatedPodStartLatency = prometheus.NewSummary( + prometheus.SummaryOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedPodStartLatencyKey, + Help: "Latency in microseconds for a single pod to go from pending to running.", + }, + ) + DeprecatedCgroupManagerLatency = prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedCgroupManagerOperationsKey, + Help: "Latency in microseconds for cgroup manager operations. 
Broken down by method.", + }, + []string{"operation_type"}, + ) + DeprecatedPodWorkerStartLatency = prometheus.NewSummary( + prometheus.SummaryOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedPodWorkerStartLatencyKey, + Help: "Latency in microseconds from seeing a pod to starting a worker.", + }, + ) + DeprecatedPLEGRelistLatency = prometheus.NewSummary( + prometheus.SummaryOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedPLEGRelistLatencyKey, + Help: "Latency in microseconds for relisting pods in PLEG.", + }, + ) + DeprecatedPLEGRelistInterval = prometheus.NewSummary( + prometheus.SummaryOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedPLEGRelistIntervalKey, + Help: "Interval in microseconds between relisting in PLEG.", + }, + ) + DeprecatedRuntimeOperations = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedRuntimeOperationsKey, + Help: "Cumulative number of runtime operations by operation type.", + }, + []string{"operation_type"}, + ) + DeprecatedRuntimeOperationsLatency = prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedRuntimeOperationsLatencyKey, + Help: "Latency in microseconds of runtime operations. 
Broken down by operation type.", + }, + []string{"operation_type"}, + ) + DeprecatedRuntimeOperationsErrors = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedRuntimeOperationsErrorsKey, + Help: "Cumulative number of runtime operation errors by operation type.", + }, + []string{"operation_type"}, + ) + DeprecatedEvictionStatsAge = prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedEvictionStatsAgeKey, + Help: "Time between when stats are collected, and when pod is evicted based on those stats by eviction signal", + }, + []string{"eviction_signal"}, + ) + DeprecatedDevicePluginRegistrationCount = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedDevicePluginRegistrationCountKey, + Help: "Cumulative number of device plugin registrations. Broken down by resource name.", + }, + []string{"resource_name"}, + ) + DeprecatedDevicePluginAllocationLatency = prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedDevicePluginAllocationLatencyKey, Help: "Latency in microseconds to serve a device plugin Allocation request. 
Broken down by resource name.", }, []string{"resource_name"}, @@ -263,6 +368,18 @@ func Register(containerCache kubecontainer.RuntimeCache, collectors ...prometheu prometheus.MustRegister(EvictionStatsAge) prometheus.MustRegister(DevicePluginRegistrationCount) prometheus.MustRegister(DevicePluginAllocationLatency) + prometheus.MustRegister(DeprecatedPodWorkerLatency) + prometheus.MustRegister(DeprecatedPodStartLatency) + prometheus.MustRegister(DeprecatedCgroupManagerLatency) + prometheus.MustRegister(DeprecatedPodWorkerStartLatency) + prometheus.MustRegister(DeprecatedPLEGRelistLatency) + prometheus.MustRegister(DeprecatedPLEGRelistInterval) + prometheus.MustRegister(DeprecatedRuntimeOperations) + prometheus.MustRegister(DeprecatedRuntimeOperationsLatency) + prometheus.MustRegister(DeprecatedRuntimeOperationsErrors) + prometheus.MustRegister(DeprecatedEvictionStatsAge) + prometheus.MustRegister(DeprecatedDevicePluginRegistrationCount) + prometheus.MustRegister(DeprecatedDevicePluginAllocationLatency) if utilfeature.DefaultFeatureGate.Enabled(features.DynamicKubeletConfig) { prometheus.MustRegister(AssignedConfig) prometheus.MustRegister(ActiveConfig) @@ -280,6 +397,11 @@ func SinceInMicroseconds(start time.Time) float64 { return float64(time.Since(start).Nanoseconds() / time.Microsecond.Nanoseconds()) } +// SinceInSeconds gets the time since the specified start in seconds. 
+func SinceInSeconds(start time.Time) float64 { + return time.Since(start).Seconds() +} + func newPodAndContainerCollector(containerCache kubecontainer.RuntimeCache) *podAndContainerCollector { return &podAndContainerCollector{ containerCache: containerCache, diff --git a/pkg/kubelet/pleg/generic.go b/pkg/kubelet/pleg/generic.go index bce9c7ea20e..7cb162944c7 100644 --- a/pkg/kubelet/pleg/generic.go +++ b/pkg/kubelet/pleg/generic.go @@ -189,12 +189,14 @@ func (g *GenericPLEG) relist() { klog.V(5).Infof("GenericPLEG: Relisting") if lastRelistTime := g.getRelistTime(); !lastRelistTime.IsZero() { - metrics.PLEGRelistInterval.Observe(metrics.SinceInMicroseconds(lastRelistTime)) + metrics.PLEGRelistInterval.Observe(metrics.SinceInSeconds(lastRelistTime)) + metrics.DeprecatedPLEGRelistInterval.Observe(metrics.SinceInMicroseconds(lastRelistTime)) } timestamp := g.clock.Now() defer func() { - metrics.PLEGRelistLatency.Observe(metrics.SinceInMicroseconds(timestamp)) + metrics.PLEGRelistLatency.Observe(metrics.SinceInSeconds(timestamp)) + metrics.DeprecatedPLEGRelistLatency.Observe(metrics.SinceInMicroseconds(timestamp)) }() // Get all the pods. 
diff --git a/test/e2e/framework/metrics_util.go b/test/e2e/framework/metrics_util.go index 106f8015b2f..cf9fffe5da2 100644 --- a/test/e2e/framework/metrics_util.go +++ b/test/e2e/framework/metrics_util.go @@ -168,9 +168,9 @@ var InterestingKubeletMetrics = []string{ "kubelet_docker_errors", "kubelet_docker_operations_latency_seconds", "kubelet_generate_pod_status_latency_microseconds", - "kubelet_pod_start_latency_microseconds", - "kubelet_pod_worker_latency_microseconds", - "kubelet_pod_worker_start_latency_microseconds", + "kubelet_pod_start_latency_seconds", + "kubelet_pod_worker_latency_seconds", + "kubelet_pod_worker_start_latency_seconds", "kubelet_sync_pods_latency_microseconds", } From 0e9515c709c075cad3cc0e1fc1aa60e197e1934d Mon Sep 17 00:00:00 2001 From: danielqsj Date: Wed, 2 Jan 2019 11:04:25 +0800 Subject: [PATCH 2/5] Move kubelet metrics to histogram metrics --- pkg/kubelet/metrics/metrics.go | 40 +++++++++++++++++----------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go index 29d8c6db83e..0c902ec450a 100644 --- a/pkg/kubelet/metrics/metrics.go +++ b/pkg/kubelet/metrics/metrics.go @@ -93,45 +93,45 @@ var ( }, []string{NodeLabelKey}, ) - ContainersPerPodCount = prometheus.NewSummary( - prometheus.SummaryOpts{ + ContainersPerPodCount = prometheus.NewHistogram( + prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, Name: "containers_per_pod_count", Help: "The number of containers per pod.", }, ) - PodWorkerLatency = prometheus.NewSummaryVec( - prometheus.SummaryOpts{ + PodWorkerLatency = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, Name: PodWorkerLatencyKey, Help: "Latency in seconds to sync a single pod. 
Broken down by operation type: create, update, or sync", }, []string{"operation_type"}, ) - PodStartLatency = prometheus.NewSummary( - prometheus.SummaryOpts{ + PodStartLatency = prometheus.NewHistogram( + prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, Name: PodStartLatencyKey, Help: "Latency in seconds for a single pod to go from pending to running.", }, ) - CgroupManagerLatency = prometheus.NewSummaryVec( - prometheus.SummaryOpts{ + CgroupManagerLatency = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, Name: CgroupManagerOperationsKey, Help: "Latency in seconds for cgroup manager operations. Broken down by method.", }, []string{"operation_type"}, ) - PodWorkerStartLatency = prometheus.NewSummary( - prometheus.SummaryOpts{ + PodWorkerStartLatency = prometheus.NewHistogram( + prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, Name: PodWorkerStartLatencyKey, Help: "Latency in seconds from seeing a pod to starting a worker.", }, ) - PLEGRelistLatency = prometheus.NewSummary( - prometheus.SummaryOpts{ + PLEGRelistLatency = prometheus.NewHistogram( + prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, Name: PLEGRelistLatencyKey, Help: "Latency in seconds for relisting pods in PLEG.", @@ -145,8 +145,8 @@ var ( }, []string{}, ) - PLEGRelistInterval = prometheus.NewSummary( - prometheus.SummaryOpts{ + PLEGRelistInterval = prometheus.NewHistogram( + prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, Name: PLEGRelistIntervalKey, Help: "Interval in seconds between relisting in PLEG.", @@ -161,8 +161,8 @@ var ( }, []string{"operation_type"}, ) - RuntimeOperationsLatency = prometheus.NewSummaryVec( - prometheus.SummaryOpts{ + RuntimeOperationsLatency = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, Name: RuntimeOperationsLatencyKey, Help: "Latency in seconds of runtime operations. 
Broken down by operation type.", @@ -177,8 +177,8 @@ var ( }, []string{"operation_type"}, ) - EvictionStatsAge = prometheus.NewSummaryVec( - prometheus.SummaryOpts{ + EvictionStatsAge = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, Name: EvictionStatsAgeKey, Help: "Time between when stats are collected, and when pod is evicted based on those stats by eviction signal", @@ -193,8 +193,8 @@ var ( }, []string{"resource_name"}, ) - DevicePluginAllocationLatency = prometheus.NewSummaryVec( - prometheus.SummaryOpts{ + DevicePluginAllocationLatency = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, Name: DevicePluginAllocationLatencyKey, Help: "Latency in seconds to serve a device plugin Allocation request. Broken down by resource name.", From 4fa0ee7805d3bac4ffee4c51ba5f0f13b00cec71 Mon Sep 17 00:00:00 2001 From: danielqsj Date: Tue, 8 Jan 2019 17:59:57 +0800 Subject: [PATCH 3/5] Mark related kubelet metrics as deprecated --- pkg/kubelet/metrics/metrics.go | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go index 0c902ec450a..24c8ad6c7d6 100644 --- a/pkg/kubelet/metrics/metrics.go +++ b/pkg/kubelet/metrics/metrics.go @@ -206,7 +206,7 @@ var ( prometheus.SummaryOpts{ Subsystem: KubeletSubsystem, Name: DeprecatedPodWorkerLatencyKey, - Help: "Latency in microseconds to sync a single pod. Broken down by operation type: create, update, or sync", + Help: "(Deprecated) Latency in microseconds to sync a single pod. 
Broken down by operation type: create, update, or sync", }, []string{"operation_type"}, ) @@ -214,14 +214,14 @@ var ( prometheus.SummaryOpts{ Subsystem: KubeletSubsystem, Name: DeprecatedPodStartLatencyKey, - Help: "Latency in microseconds for a single pod to go from pending to running.", + Help: "(Deprecated) Latency in microseconds for a single pod to go from pending to running.", }, ) DeprecatedCgroupManagerLatency = prometheus.NewSummaryVec( prometheus.SummaryOpts{ Subsystem: KubeletSubsystem, Name: DeprecatedCgroupManagerOperationsKey, - Help: "Latency in microseconds for cgroup manager operations. Broken down by method.", + Help: "(Deprecated) Latency in microseconds for cgroup manager operations. Broken down by method.", }, []string{"operation_type"}, ) @@ -229,28 +229,28 @@ var ( prometheus.SummaryOpts{ Subsystem: KubeletSubsystem, Name: DeprecatedPodWorkerStartLatencyKey, - Help: "Latency in microseconds from seeing a pod to starting a worker.", + Help: "(Deprecated) Latency in microseconds from seeing a pod to starting a worker.", }, ) DeprecatedPLEGRelistLatency = prometheus.NewSummary( prometheus.SummaryOpts{ Subsystem: KubeletSubsystem, Name: DeprecatedPLEGRelistLatencyKey, - Help: "Latency in microseconds for relisting pods in PLEG.", + Help: "(Deprecated) Latency in microseconds for relisting pods in PLEG.", }, ) DeprecatedPLEGRelistInterval = prometheus.NewSummary( prometheus.SummaryOpts{ Subsystem: KubeletSubsystem, Name: DeprecatedPLEGRelistIntervalKey, - Help: "Interval in microseconds between relisting in PLEG.", + Help: "(Deprecated) Interval in microseconds between relisting in PLEG.", }, ) DeprecatedRuntimeOperations = prometheus.NewCounterVec( prometheus.CounterOpts{ Subsystem: KubeletSubsystem, Name: DeprecatedRuntimeOperationsKey, - Help: "Cumulative number of runtime operations by operation type.", + Help: "(Deprecated) Cumulative number of runtime operations by operation type.", }, []string{"operation_type"}, ) @@ -258,7 +258,7 @@ var ( 
prometheus.SummaryOpts{ Subsystem: KubeletSubsystem, Name: DeprecatedRuntimeOperationsLatencyKey, - Help: "Latency in microseconds of runtime operations. Broken down by operation type.", + Help: "(Deprecated) Latency in microseconds of runtime operations. Broken down by operation type.", }, []string{"operation_type"}, ) @@ -266,7 +266,7 @@ var ( prometheus.CounterOpts{ Subsystem: KubeletSubsystem, Name: DeprecatedRuntimeOperationsErrorsKey, - Help: "Cumulative number of runtime operation errors by operation type.", + Help: "(Deprecated) Cumulative number of runtime operation errors by operation type.", }, []string{"operation_type"}, ) @@ -274,7 +274,7 @@ var ( prometheus.SummaryOpts{ Subsystem: KubeletSubsystem, Name: DeprecatedEvictionStatsAgeKey, - Help: "Time between when stats are collected, and when pod is evicted based on those stats by eviction signal", + Help: "(Deprecated) Time between when stats are collected, and when pod is evicted based on those stats by eviction signal", }, []string{"eviction_signal"}, ) @@ -282,7 +282,7 @@ var ( prometheus.CounterOpts{ Subsystem: KubeletSubsystem, Name: DeprecatedDevicePluginRegistrationCountKey, - Help: "Cumulative number of device plugin registrations. Broken down by resource name.", + Help: "(Deprecated) Cumulative number of device plugin registrations. Broken down by resource name.", }, []string{"resource_name"}, ) @@ -290,7 +290,7 @@ var ( prometheus.SummaryOpts{ Subsystem: KubeletSubsystem, Name: DeprecatedDevicePluginAllocationLatencyKey, - Help: "Latency in microseconds to serve a device plugin Allocation request. Broken down by resource name.", + Help: "(Deprecated) Latency in microseconds to serve a device plugin Allocation request. 
Broken down by resource name.", }, []string{"resource_name"}, ) From 0bfe4c26b1bfac0443f46f176c900a98704a2cbe Mon Sep 17 00:00:00 2001 From: danielqsj Date: Mon, 18 Feb 2019 14:07:30 +0800 Subject: [PATCH 4/5] add default buckets for histogram metrics --- pkg/kubelet/metrics/metrics.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go index 24c8ad6c7d6..34cd7aaf69d 100644 --- a/pkg/kubelet/metrics/metrics.go +++ b/pkg/kubelet/metrics/metrics.go @@ -98,6 +98,7 @@ var ( Subsystem: KubeletSubsystem, Name: "containers_per_pod_count", Help: "The number of containers per pod.", + Buckets: prometheus.DefBuckets, }, ) PodWorkerLatency = prometheus.NewHistogramVec( @@ -105,6 +106,7 @@ var ( Subsystem: KubeletSubsystem, Name: PodWorkerLatencyKey, Help: "Latency in seconds to sync a single pod. Broken down by operation type: create, update, or sync", + Buckets: prometheus.DefBuckets, }, []string{"operation_type"}, ) @@ -113,6 +115,7 @@ var ( Subsystem: KubeletSubsystem, Name: PodStartLatencyKey, Help: "Latency in seconds for a single pod to go from pending to running.", + Buckets: prometheus.DefBuckets, }, ) CgroupManagerLatency = prometheus.NewHistogramVec( @@ -120,6 +123,7 @@ var ( Subsystem: KubeletSubsystem, Name: CgroupManagerOperationsKey, Help: "Latency in seconds for cgroup manager operations. 
Broken down by method.", + Buckets: prometheus.DefBuckets, }, []string{"operation_type"}, ) @@ -128,6 +132,7 @@ var ( Subsystem: KubeletSubsystem, Name: PodWorkerStartLatencyKey, Help: "Latency in seconds from seeing a pod to starting a worker.", + Buckets: prometheus.DefBuckets, }, ) PLEGRelistLatency = prometheus.NewHistogram( @@ -135,6 +140,7 @@ var ( Subsystem: KubeletSubsystem, Name: PLEGRelistLatencyKey, Help: "Latency in seconds for relisting pods in PLEG.", + Buckets: prometheus.DefBuckets, }, ) PLEGDiscardEvents = prometheus.NewCounterVec( @@ -150,6 +156,7 @@ var ( Subsystem: KubeletSubsystem, Name: PLEGRelistIntervalKey, Help: "Interval in seconds between relisting in PLEG.", + Buckets: prometheus.DefBuckets, }, ) // Metrics of remote runtime operations. @@ -166,6 +173,7 @@ var ( Subsystem: KubeletSubsystem, Name: RuntimeOperationsLatencyKey, Help: "Latency in seconds of runtime operations. Broken down by operation type.", + Buckets: prometheus.DefBuckets, }, []string{"operation_type"}, ) @@ -182,6 +190,7 @@ var ( Subsystem: KubeletSubsystem, Name: EvictionStatsAgeKey, Help: "Time between when stats are collected, and when pod is evicted based on those stats by eviction signal", + Buckets: prometheus.DefBuckets, }, []string{"eviction_signal"}, ) @@ -198,6 +207,7 @@ var ( Subsystem: KubeletSubsystem, Name: DevicePluginAllocationLatencyKey, Help: "Latency in seconds to serve a device plugin Allocation request. 
Broken down by resource name.", + Buckets: prometheus.DefBuckets, }, []string{"resource_name"}, ) From 79a3eb816c66d9928e02b8589a05fd0de2b8f378 Mon Sep 17 00:00:00 2001 From: danielqsj Date: Mon, 18 Feb 2019 17:40:04 +0800 Subject: [PATCH 5/5] rename latency to duration in metrics --- pkg/kubelet/cm/cgroup_manager_linux.go | 6 +- pkg/kubelet/cm/devicemanager/manager.go | 2 +- pkg/kubelet/kubelet.go | 6 +- .../kuberuntime/instrumented_services.go | 2 +- .../kuberuntime/instrumented_services_test.go | 6 +- pkg/kubelet/metrics/metrics.go | 68 +++++++++---------- pkg/kubelet/pleg/generic.go | 2 +- test/e2e/framework/kubelet_stats.go | 10 +-- test/e2e/framework/metrics_util.go | 6 +- test/e2e_node/density_test.go | 4 +- test/e2e_node/gpu_device_plugin.go | 2 +- 11 files changed, 57 insertions(+), 57 deletions(-) diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go index 85d92edd0ea..b08bab132d5 100644 --- a/pkg/kubelet/cm/cgroup_manager_linux.go +++ b/pkg/kubelet/cm/cgroup_manager_linux.go @@ -284,7 +284,7 @@ func (m *cgroupManagerImpl) Exists(name CgroupName) bool { func (m *cgroupManagerImpl) Destroy(cgroupConfig *CgroupConfig) error { start := time.Now() defer func() { - metrics.CgroupManagerLatency.WithLabelValues("destroy").Observe(metrics.SinceInSeconds(start)) + metrics.CgroupManagerDuration.WithLabelValues("destroy").Observe(metrics.SinceInSeconds(start)) metrics.DeprecatedCgroupManagerLatency.WithLabelValues("destroy").Observe(metrics.SinceInMicroseconds(start)) }() @@ -412,7 +412,7 @@ func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcont func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error { start := time.Now() defer func() { - metrics.CgroupManagerLatency.WithLabelValues("update").Observe(metrics.SinceInSeconds(start)) + metrics.CgroupManagerDuration.WithLabelValues("update").Observe(metrics.SinceInSeconds(start)) 
metrics.DeprecatedCgroupManagerLatency.WithLabelValues("update").Observe(metrics.SinceInMicroseconds(start)) }() @@ -448,7 +448,7 @@ func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error { func (m *cgroupManagerImpl) Create(cgroupConfig *CgroupConfig) error { start := time.Now() defer func() { - metrics.CgroupManagerLatency.WithLabelValues("create").Observe(metrics.SinceInSeconds(start)) + metrics.CgroupManagerDuration.WithLabelValues("create").Observe(metrics.SinceInSeconds(start)) metrics.DeprecatedCgroupManagerLatency.WithLabelValues("create").Observe(metrics.SinceInMicroseconds(start)) }() diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go index e12df4d43af..355a223a4c6 100644 --- a/pkg/kubelet/cm/devicemanager/manager.go +++ b/pkg/kubelet/cm/devicemanager/manager.go @@ -697,7 +697,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont // in a passed in AllocateRequest pointer, and issues a single Allocate call per pod. klog.V(3).Infof("Making allocation request for devices %v for device plugin %s", devs, resource) resp, err := eI.e.allocate(devs) - metrics.DevicePluginAllocationLatency.WithLabelValues(resource).Observe(metrics.SinceInSeconds(startRPCTime)) + metrics.DevicePluginAllocationDuration.WithLabelValues(resource).Observe(metrics.SinceInSeconds(startRPCTime)) metrics.DeprecatedDevicePluginAllocationLatency.WithLabelValues(resource).Observe(metrics.SinceInMicroseconds(startRPCTime)) if err != nil { // In case of allocation failure, we want to restore m.allocatedDevices diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index d80a0c4a87e..b413304c377 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -1500,7 +1500,7 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error { if !firstSeenTime.IsZero() { // This is the first time we are syncing the pod. Record the latency // since kubelet first saw the pod if firstSeenTime is set. 
- metrics.PodWorkerStartLatency.Observe(metrics.SinceInSeconds(firstSeenTime)) + metrics.PodWorkerStartDuration.Observe(metrics.SinceInSeconds(firstSeenTime)) metrics.DeprecatedPodWorkerStartLatency.Observe(metrics.SinceInMicroseconds(firstSeenTime)) } else { klog.V(3).Infof("First seen time not recorded for pod %q", pod.UID) @@ -1518,7 +1518,7 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error { existingStatus, ok := kl.statusManager.GetPodStatus(pod.UID) if !ok || existingStatus.Phase == v1.PodPending && apiPodStatus.Phase == v1.PodRunning && !firstSeenTime.IsZero() { - metrics.PodStartLatency.Observe(metrics.SinceInSeconds(firstSeenTime)) + metrics.PodStartDuration.Observe(metrics.SinceInSeconds(firstSeenTime)) metrics.DeprecatedPodStartLatency.Observe(metrics.SinceInMicroseconds(firstSeenTime)) } @@ -1998,7 +1998,7 @@ func (kl *Kubelet) dispatchWork(pod *v1.Pod, syncType kubetypes.SyncPodType, mir UpdateType: syncType, OnCompleteFunc: func(err error) { if err != nil { - metrics.PodWorkerLatency.WithLabelValues(syncType.String()).Observe(metrics.SinceInSeconds(start)) + metrics.PodWorkerDuration.WithLabelValues(syncType.String()).Observe(metrics.SinceInSeconds(start)) metrics.DeprecatedPodWorkerLatency.WithLabelValues(syncType.String()).Observe(metrics.SinceInMicroseconds(start)) } }, diff --git a/pkg/kubelet/kuberuntime/instrumented_services.go b/pkg/kubelet/kuberuntime/instrumented_services.go index a587cf0eea6..b96faa0cd82 100644 --- a/pkg/kubelet/kuberuntime/instrumented_services.go +++ b/pkg/kubelet/kuberuntime/instrumented_services.go @@ -50,7 +50,7 @@ func newInstrumentedImageManagerService(service internalapi.ImageManagerService) func recordOperation(operation string, start time.Time) { metrics.RuntimeOperations.WithLabelValues(operation).Inc() metrics.DeprecatedRuntimeOperations.WithLabelValues(operation).Inc() - metrics.RuntimeOperationsLatency.WithLabelValues(operation).Observe(metrics.SinceInSeconds(start)) + 
metrics.RuntimeOperationsDuration.WithLabelValues(operation).Observe(metrics.SinceInSeconds(start)) metrics.DeprecatedRuntimeOperationsLatency.WithLabelValues(operation).Observe(metrics.SinceInMicroseconds(start)) } diff --git a/pkg/kubelet/kuberuntime/instrumented_services_test.go b/pkg/kubelet/kuberuntime/instrumented_services_test.go index 1a9521a0ea1..d29f00fa199 100644 --- a/pkg/kubelet/kuberuntime/instrumented_services_test.go +++ b/pkg/kubelet/kuberuntime/instrumented_services_test.go @@ -30,7 +30,7 @@ import ( func TestRecordOperation(t *testing.T) { prometheus.MustRegister(metrics.RuntimeOperations) - prometheus.MustRegister(metrics.RuntimeOperationsLatency) + prometheus.MustRegister(metrics.RuntimeOperationsDuration) prometheus.MustRegister(metrics.RuntimeOperationsErrors) temporalServer := "127.0.0.1:1234" @@ -51,7 +51,7 @@ func TestRecordOperation(t *testing.T) { recordOperation("create_container", time.Now()) runtimeOperationsCounterExpected := "kubelet_runtime_operations_total{operation_type=\"create_container\"} 1" - runtimeOperationsLatencyExpected := "kubelet_runtime_operations_latency_seconds_count{operation_type=\"create_container\"} 1" + runtimeOperationsDurationExpected := "kubelet_runtime_operations_duration_seconds_count{operation_type=\"create_container\"} 1" assert.HTTPBodyContains(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { mux.ServeHTTP(w, r) @@ -59,7 +59,7 @@ func TestRecordOperation(t *testing.T) { assert.HTTPBodyContains(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { mux.ServeHTTP(w, r) - }), "GET", prometheusURL, nil, runtimeOperationsLatencyExpected) + }), "GET", prometheusURL, nil, runtimeOperationsDurationExpected) } func TestInstrumentedVersion(t *testing.T) { diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go index 34cd7aaf69d..da31c581f8f 100644 --- a/pkg/kubelet/metrics/metrics.go +++ b/pkg/kubelet/metrics/metrics.go @@ -34,11 +34,11 @@ const ( 
KubeletSubsystem = "kubelet" NodeNameKey = "node_name" NodeLabelKey = "node" - PodWorkerLatencyKey = "pod_worker_latency_seconds" - PodStartLatencyKey = "pod_start_latency_seconds" - CgroupManagerOperationsKey = "cgroup_manager_latency_seconds" - PodWorkerStartLatencyKey = "pod_worker_start_latency_seconds" - PLEGRelistLatencyKey = "pleg_relist_latency_seconds" + PodWorkerDurationKey = "pod_worker_duration_seconds" + PodStartDurationKey = "pod_start_duration_seconds" + CgroupManagerOperationsKey = "cgroup_manager_duration_seconds" + PodWorkerStartDurationKey = "pod_worker_start_duration_seconds" + PLEGRelistDurationKey = "pleg_relist_duration_seconds" PLEGDiscardEventsKey = "pleg_discard_events" PLEGRelistIntervalKey = "pleg_relist_interval_seconds" EvictionStatsAgeKey = "eviction_stats_age_seconds" @@ -57,14 +57,14 @@ const ( VolumeStatsInodesUsedKey = "volume_stats_inodes_used" // Metrics keys of remote runtime operations RuntimeOperationsKey = "runtime_operations_total" - RuntimeOperationsLatencyKey = "runtime_operations_latency_seconds" + RuntimeOperationsDurationKey = "runtime_operations_duration_seconds" RuntimeOperationsErrorsKey = "runtime_operations_errors_total" DeprecatedRuntimeOperationsKey = "runtime_operations" DeprecatedRuntimeOperationsLatencyKey = "runtime_operations_latency_microseconds" DeprecatedRuntimeOperationsErrorsKey = "runtime_operations_errors" // Metrics keys of device plugin operations DevicePluginRegistrationCountKey = "device_plugin_registration_total" - DevicePluginAllocationLatencyKey = "device_plugin_alloc_latency_seconds" + DevicePluginAllocationDurationKey = "device_plugin_alloc_duration_seconds" DeprecatedDevicePluginRegistrationCountKey = "device_plugin_registration_count" DeprecatedDevicePluginAllocationLatencyKey = "device_plugin_alloc_latency_microseconds" @@ -101,45 +101,45 @@ var ( Buckets: prometheus.DefBuckets, }, ) - PodWorkerLatency = prometheus.NewHistogramVec( + PodWorkerDuration = prometheus.NewHistogramVec( 
prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, - Name: PodWorkerLatencyKey, - Help: "Latency in seconds to sync a single pod. Broken down by operation type: create, update, or sync", + Name: PodWorkerDurationKey, + Help: "Duration in seconds to sync a single pod. Broken down by operation type: create, update, or sync", Buckets: prometheus.DefBuckets, }, []string{"operation_type"}, ) - PodStartLatency = prometheus.NewHistogram( + PodStartDuration = prometheus.NewHistogram( prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, - Name: PodStartLatencyKey, - Help: "Latency in seconds for a single pod to go from pending to running.", + Name: PodStartDurationKey, + Help: "Duration in seconds for a single pod to go from pending to running.", Buckets: prometheus.DefBuckets, }, ) - CgroupManagerLatency = prometheus.NewHistogramVec( + CgroupManagerDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, Name: CgroupManagerOperationsKey, - Help: "Latency in seconds for cgroup manager operations. Broken down by method.", + Help: "Duration in seconds for cgroup manager operations. 
Broken down by method.", Buckets: prometheus.DefBuckets, }, []string{"operation_type"}, ) - PodWorkerStartLatency = prometheus.NewHistogram( + PodWorkerStartDuration = prometheus.NewHistogram( prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, - Name: PodWorkerStartLatencyKey, - Help: "Latency in seconds from seeing a pod to starting a worker.", + Name: PodWorkerStartDurationKey, + Help: "Duration in seconds from seeing a pod to starting a worker.", Buckets: prometheus.DefBuckets, }, ) - PLEGRelistLatency = prometheus.NewHistogram( + PLEGRelistDuration = prometheus.NewHistogram( prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, - Name: PLEGRelistLatencyKey, - Help: "Latency in seconds for relisting pods in PLEG.", + Name: PLEGRelistDurationKey, + Help: "Duration in seconds for relisting pods in PLEG.", Buckets: prometheus.DefBuckets, }, ) @@ -168,11 +168,11 @@ var ( }, []string{"operation_type"}, ) - RuntimeOperationsLatency = prometheus.NewHistogramVec( + RuntimeOperationsDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, - Name: RuntimeOperationsLatencyKey, - Help: "Latency in seconds of runtime operations. Broken down by operation type.", + Name: RuntimeOperationsDurationKey, + Help: "Duration in seconds of runtime operations. Broken down by operation type.", Buckets: prometheus.DefBuckets, }, []string{"operation_type"}, @@ -202,11 +202,11 @@ var ( }, []string{"resource_name"}, ) - DevicePluginAllocationLatency = prometheus.NewHistogramVec( + DevicePluginAllocationDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, - Name: DevicePluginAllocationLatencyKey, - Help: "Latency in seconds to serve a device plugin Allocation request. Broken down by resource name.", + Name: DevicePluginAllocationDurationKey, + Help: "Duration in seconds to serve a device plugin Allocation request. 
Broken down by resource name.", Buckets: prometheus.DefBuckets, }, []string{"resource_name"}, @@ -363,21 +363,21 @@ func Register(containerCache kubecontainer.RuntimeCache, collectors ...prometheu // Register the metrics. registerMetrics.Do(func() { prometheus.MustRegister(NodeName) - prometheus.MustRegister(PodWorkerLatency) - prometheus.MustRegister(PodStartLatency) - prometheus.MustRegister(CgroupManagerLatency) - prometheus.MustRegister(PodWorkerStartLatency) + prometheus.MustRegister(PodWorkerDuration) + prometheus.MustRegister(PodStartDuration) + prometheus.MustRegister(CgroupManagerDuration) + prometheus.MustRegister(PodWorkerStartDuration) prometheus.MustRegister(ContainersPerPodCount) prometheus.MustRegister(newPodAndContainerCollector(containerCache)) - prometheus.MustRegister(PLEGRelistLatency) + prometheus.MustRegister(PLEGRelistDuration) prometheus.MustRegister(PLEGDiscardEvents) prometheus.MustRegister(PLEGRelistInterval) prometheus.MustRegister(RuntimeOperations) - prometheus.MustRegister(RuntimeOperationsLatency) + prometheus.MustRegister(RuntimeOperationsDuration) prometheus.MustRegister(RuntimeOperationsErrors) prometheus.MustRegister(EvictionStatsAge) prometheus.MustRegister(DevicePluginRegistrationCount) - prometheus.MustRegister(DevicePluginAllocationLatency) + prometheus.MustRegister(DevicePluginAllocationDuration) prometheus.MustRegister(DeprecatedPodWorkerLatency) prometheus.MustRegister(DeprecatedPodStartLatency) prometheus.MustRegister(DeprecatedCgroupManagerLatency) diff --git a/pkg/kubelet/pleg/generic.go b/pkg/kubelet/pleg/generic.go index 7cb162944c7..64bf43a1ef4 100644 --- a/pkg/kubelet/pleg/generic.go +++ b/pkg/kubelet/pleg/generic.go @@ -195,7 +195,7 @@ func (g *GenericPLEG) relist() { timestamp := g.clock.Now() defer func() { - metrics.PLEGRelistLatency.Observe(metrics.SinceInSeconds(timestamp)) + metrics.PLEGRelistDuration.Observe(metrics.SinceInSeconds(timestamp)) 
metrics.DeprecatedPLEGRelistLatency.Observe(metrics.SinceInMicroseconds(timestamp)) }() diff --git a/test/e2e/framework/kubelet_stats.go b/test/e2e/framework/kubelet_stats.go index 3803f088210..9b8f4c9cb72 100644 --- a/test/e2e/framework/kubelet_stats.go +++ b/test/e2e/framework/kubelet_stats.go @@ -102,13 +102,13 @@ func getKubeletMetrics(c clientset.Interface, nodeName string) (metrics.KubeletM // Note that the KubeletMetrics passed in should not contain subsystem prefix. func GetDefaultKubeletLatencyMetrics(ms metrics.KubeletMetrics) KubeletLatencyMetrics { latencyMetricNames := sets.NewString( - kubeletmetrics.PodWorkerLatencyKey, - kubeletmetrics.PodWorkerStartLatencyKey, - kubeletmetrics.PodStartLatencyKey, + kubeletmetrics.PodWorkerDurationKey, + kubeletmetrics.PodWorkerStartDurationKey, + kubeletmetrics.PodStartDurationKey, kubeletmetrics.CgroupManagerOperationsKey, dockermetrics.DockerOperationsLatencyKey, - kubeletmetrics.PodWorkerStartLatencyKey, - kubeletmetrics.PLEGRelistLatencyKey, + kubeletmetrics.PodWorkerStartDurationKey, + kubeletmetrics.PLEGRelistDurationKey, ) return GetKubeletLatencyMetrics(ms, latencyMetricNames) } diff --git a/test/e2e/framework/metrics_util.go b/test/e2e/framework/metrics_util.go index cf9fffe5da2..d098c00e049 100644 --- a/test/e2e/framework/metrics_util.go +++ b/test/e2e/framework/metrics_util.go @@ -168,9 +168,9 @@ var InterestingKubeletMetrics = []string{ "kubelet_docker_errors", "kubelet_docker_operations_latency_seconds", "kubelet_generate_pod_status_latency_microseconds", - "kubelet_pod_start_latency_seconds", - "kubelet_pod_worker_latency_seconds", - "kubelet_pod_worker_start_latency_seconds", + "kubelet_pod_start_duration_seconds", + "kubelet_pod_worker_duration_seconds", + "kubelet_pod_worker_start_duration_seconds", "kubelet_sync_pods_latency_microseconds", } diff --git a/test/e2e_node/density_test.go b/test/e2e_node/density_test.go index f0b943061e6..3a80860a03a 100644 --- a/test/e2e_node/density_test.go +++ 
b/test/e2e_node/density_test.go @@ -459,12 +459,12 @@ func getPodStartLatency(node string) (framework.KubeletLatencyMetrics, error) { for _, samples := range ms { for _, sample := range samples { - if sample.Metric["__name__"] == kubemetrics.KubeletSubsystem+"_"+kubemetrics.PodStartLatencyKey { + if sample.Metric["__name__"] == kubemetrics.KubeletSubsystem+"_"+kubemetrics.PodStartDurationKey { quantile, _ := strconv.ParseFloat(string(sample.Metric["quantile"]), 64) latencyMetrics = append(latencyMetrics, framework.KubeletLatencyMetric{ Quantile: quantile, - Method: kubemetrics.PodStartLatencyKey, + Method: kubemetrics.PodStartDurationKey, Latency: time.Duration(int(sample.Value)) * time.Microsecond}) } } diff --git a/test/e2e_node/gpu_device_plugin.go b/test/e2e_node/gpu_device_plugin.go index 8328324b695..17a91f054e8 100644 --- a/test/e2e_node/gpu_device_plugin.go +++ b/test/e2e_node/gpu_device_plugin.go @@ -156,7 +156,7 @@ func logDevicePluginMetrics() { framework.ExpectNoError(err) for msKey, samples := range ms { switch msKey { - case kubeletmetrics.KubeletSubsystem + "_" + kubeletmetrics.DevicePluginAllocationLatencyKey: + case kubeletmetrics.KubeletSubsystem + "_" + kubeletmetrics.DevicePluginAllocationDurationKey: for _, sample := range samples { latency := sample.Value resource := string(sample.Metric["resource_name"])