diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go index 3bb09f0ddda..b08bab132d5 100644 --- a/pkg/kubelet/cm/cgroup_manager_linux.go +++ b/pkg/kubelet/cm/cgroup_manager_linux.go @@ -284,7 +284,8 @@ func (m *cgroupManagerImpl) Exists(name CgroupName) bool { func (m *cgroupManagerImpl) Destroy(cgroupConfig *CgroupConfig) error { start := time.Now() defer func() { - metrics.CgroupManagerLatency.WithLabelValues("destroy").Observe(metrics.SinceInMicroseconds(start)) + metrics.CgroupManagerDuration.WithLabelValues("destroy").Observe(metrics.SinceInSeconds(start)) + metrics.DeprecatedCgroupManagerLatency.WithLabelValues("destroy").Observe(metrics.SinceInMicroseconds(start)) }() cgroupPaths := m.buildCgroupPaths(cgroupConfig.Name) @@ -411,7 +412,8 @@ func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcont func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error { start := time.Now() defer func() { - metrics.CgroupManagerLatency.WithLabelValues("update").Observe(metrics.SinceInMicroseconds(start)) + metrics.CgroupManagerDuration.WithLabelValues("update").Observe(metrics.SinceInSeconds(start)) + metrics.DeprecatedCgroupManagerLatency.WithLabelValues("update").Observe(metrics.SinceInMicroseconds(start)) }() // Extract the cgroup resource parameters @@ -446,7 +448,8 @@ func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error { func (m *cgroupManagerImpl) Create(cgroupConfig *CgroupConfig) error { start := time.Now() defer func() { - metrics.CgroupManagerLatency.WithLabelValues("create").Observe(metrics.SinceInMicroseconds(start)) + metrics.CgroupManagerDuration.WithLabelValues("create").Observe(metrics.SinceInSeconds(start)) + metrics.DeprecatedCgroupManagerLatency.WithLabelValues("create").Observe(metrics.SinceInMicroseconds(start)) }() resources := m.toResources(cgroupConfig.ResourceParameters) diff --git a/pkg/kubelet/cm/devicemanager/manager.go 
b/pkg/kubelet/cm/devicemanager/manager.go index dbb2720dfe9..355a223a4c6 100644 --- a/pkg/kubelet/cm/devicemanager/manager.go +++ b/pkg/kubelet/cm/devicemanager/manager.go @@ -354,6 +354,7 @@ func (m *ManagerImpl) Allocate(node *schedulernodeinfo.NodeInfo, attrs *lifecycl func (m *ManagerImpl) Register(ctx context.Context, r *pluginapi.RegisterRequest) (*pluginapi.Empty, error) { klog.Infof("Got registration request from device plugin with resource name %q", r.ResourceName) metrics.DevicePluginRegistrationCount.WithLabelValues(r.ResourceName).Inc() + metrics.DeprecatedDevicePluginRegistrationCount.WithLabelValues(r.ResourceName).Inc() var versionCompatible bool for _, v := range pluginapi.SupportedVersions { if r.Version == v { @@ -696,7 +697,8 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont // in a passed in AllocateRequest pointer, and issues a single Allocate call per pod. klog.V(3).Infof("Making allocation request for devices %v for device plugin %s", devs, resource) resp, err := eI.e.allocate(devs) - metrics.DevicePluginAllocationLatency.WithLabelValues(resource).Observe(metrics.SinceInMicroseconds(startRPCTime)) + metrics.DevicePluginAllocationDuration.WithLabelValues(resource).Observe(metrics.SinceInSeconds(startRPCTime)) + metrics.DeprecatedDevicePluginAllocationLatency.WithLabelValues(resource).Observe(metrics.SinceInMicroseconds(startRPCTime)) if err != nil { // In case of allocation failure, we want to restore m.allocatedDevices // to the actual allocated state from m.podDevices. 
diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index 801fc8106fa..2ecabf8194a 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -361,7 +361,8 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act for _, t := range thresholds { timeObserved := observations[t.Signal].time if !timeObserved.IsZero() { - metrics.EvictionStatsAge.WithLabelValues(string(t.Signal)).Observe(metrics.SinceInMicroseconds(timeObserved.Time)) + metrics.EvictionStatsAge.WithLabelValues(string(t.Signal)).Observe(metrics.SinceInSeconds(timeObserved.Time)) + metrics.DeprecatedEvictionStatsAge.WithLabelValues(string(t.Signal)).Observe(metrics.SinceInMicroseconds(timeObserved.Time)) } } diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index bb00c21a946..b413304c377 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -1500,7 +1500,8 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error { if !firstSeenTime.IsZero() { // This is the first time we are syncing the pod. Record the latency // since kubelet first saw the pod if firstSeenTime is set. 
- metrics.PodWorkerStartLatency.Observe(metrics.SinceInMicroseconds(firstSeenTime)) + metrics.PodWorkerStartDuration.Observe(metrics.SinceInSeconds(firstSeenTime)) + metrics.DeprecatedPodWorkerStartLatency.Observe(metrics.SinceInMicroseconds(firstSeenTime)) } else { klog.V(3).Infof("First seen time not recorded for pod %q", pod.UID) } @@ -1517,7 +1518,8 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error { existingStatus, ok := kl.statusManager.GetPodStatus(pod.UID) if !ok || existingStatus.Phase == v1.PodPending && apiPodStatus.Phase == v1.PodRunning && !firstSeenTime.IsZero() { - metrics.PodStartLatency.Observe(metrics.SinceInMicroseconds(firstSeenTime)) + metrics.PodStartDuration.Observe(metrics.SinceInSeconds(firstSeenTime)) + metrics.DeprecatedPodStartLatency.Observe(metrics.SinceInMicroseconds(firstSeenTime)) } runnable := kl.canRunPod(pod) @@ -1996,7 +1998,8 @@ func (kl *Kubelet) dispatchWork(pod *v1.Pod, syncType kubetypes.SyncPodType, mir UpdateType: syncType, OnCompleteFunc: func(err error) { if err != nil { - metrics.PodWorkerLatency.WithLabelValues(syncType.String()).Observe(metrics.SinceInMicroseconds(start)) + metrics.PodWorkerDuration.WithLabelValues(syncType.String()).Observe(metrics.SinceInSeconds(start)) + metrics.DeprecatedPodWorkerLatency.WithLabelValues(syncType.String()).Observe(metrics.SinceInMicroseconds(start)) } }, }) diff --git a/pkg/kubelet/kuberuntime/instrumented_services.go b/pkg/kubelet/kuberuntime/instrumented_services.go index dc3c6575f8b..b96faa0cd82 100644 --- a/pkg/kubelet/kuberuntime/instrumented_services.go +++ b/pkg/kubelet/kuberuntime/instrumented_services.go @@ -49,13 +49,16 @@ func newInstrumentedImageManagerService(service internalapi.ImageManagerService) // recordOperation records the duration of the operation. 
func recordOperation(operation string, start time.Time) { metrics.RuntimeOperations.WithLabelValues(operation).Inc() - metrics.RuntimeOperationsLatency.WithLabelValues(operation).Observe(metrics.SinceInMicroseconds(start)) + metrics.DeprecatedRuntimeOperations.WithLabelValues(operation).Inc() + metrics.RuntimeOperationsDuration.WithLabelValues(operation).Observe(metrics.SinceInSeconds(start)) + metrics.DeprecatedRuntimeOperationsLatency.WithLabelValues(operation).Observe(metrics.SinceInMicroseconds(start)) } // recordError records error for metric if an error occurred. func recordError(operation string, err error) { if err != nil { metrics.RuntimeOperationsErrors.WithLabelValues(operation).Inc() + metrics.DeprecatedRuntimeOperationsErrors.WithLabelValues(operation).Inc() } } diff --git a/pkg/kubelet/kuberuntime/instrumented_services_test.go b/pkg/kubelet/kuberuntime/instrumented_services_test.go index 6463e617f06..d29f00fa199 100644 --- a/pkg/kubelet/kuberuntime/instrumented_services_test.go +++ b/pkg/kubelet/kuberuntime/instrumented_services_test.go @@ -30,7 +30,7 @@ import ( func TestRecordOperation(t *testing.T) { prometheus.MustRegister(metrics.RuntimeOperations) - prometheus.MustRegister(metrics.RuntimeOperationsLatency) + prometheus.MustRegister(metrics.RuntimeOperationsDuration) prometheus.MustRegister(metrics.RuntimeOperationsErrors) temporalServer := "127.0.0.1:1234" @@ -50,8 +50,8 @@ func TestRecordOperation(t *testing.T) { }() recordOperation("create_container", time.Now()) - runtimeOperationsCounterExpected := "kubelet_runtime_operations{operation_type=\"create_container\"} 1" - runtimeOperationsLatencyExpected := "kubelet_runtime_operations_latency_microseconds_count{operation_type=\"create_container\"} 1" + runtimeOperationsCounterExpected := "kubelet_runtime_operations_total{operation_type=\"create_container\"} 1" + runtimeOperationsDurationExpected := "kubelet_runtime_operations_duration_seconds_count{operation_type=\"create_container\"} 1" 
assert.HTTPBodyContains(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { mux.ServeHTTP(w, r) @@ -59,7 +59,7 @@ func TestRecordOperation(t *testing.T) { assert.HTTPBodyContains(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { mux.ServeHTTP(w, r) - }), "GET", prometheusURL, nil, runtimeOperationsLatencyExpected) + }), "GET", prometheusURL, nil, runtimeOperationsDurationExpected) } func TestInstrumentedVersion(t *testing.T) { diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go index c1b38795338..da31c581f8f 100644 --- a/pkg/kubelet/metrics/metrics.go +++ b/pkg/kubelet/metrics/metrics.go @@ -31,30 +31,42 @@ import ( ) const ( - KubeletSubsystem = "kubelet" - NodeNameKey = "node_name" - NodeLabelKey = "node" - PodWorkerLatencyKey = "pod_worker_latency_microseconds" - PodStartLatencyKey = "pod_start_latency_microseconds" - CgroupManagerOperationsKey = "cgroup_manager_latency_microseconds" - PodWorkerStartLatencyKey = "pod_worker_start_latency_microseconds" - PLEGRelistLatencyKey = "pleg_relist_latency_microseconds" - PLEGDiscardEventsKey = "pleg_discard_events" - PLEGRelistIntervalKey = "pleg_relist_interval_microseconds" - EvictionStatsAgeKey = "eviction_stats_age_microseconds" - VolumeStatsCapacityBytesKey = "volume_stats_capacity_bytes" - VolumeStatsAvailableBytesKey = "volume_stats_available_bytes" - VolumeStatsUsedBytesKey = "volume_stats_used_bytes" - VolumeStatsInodesKey = "volume_stats_inodes" - VolumeStatsInodesFreeKey = "volume_stats_inodes_free" - VolumeStatsInodesUsedKey = "volume_stats_inodes_used" + KubeletSubsystem = "kubelet" + NodeNameKey = "node_name" + NodeLabelKey = "node" + PodWorkerDurationKey = "pod_worker_duration_seconds" + PodStartDurationKey = "pod_start_duration_seconds" + CgroupManagerOperationsKey = "cgroup_manager_duration_seconds" + PodWorkerStartDurationKey = "pod_worker_start_duration_seconds" + PLEGRelistDurationKey = "pleg_relist_duration_seconds" + PLEGDiscardEventsKey = 
"pleg_discard_events" + PLEGRelistIntervalKey = "pleg_relist_interval_seconds" + EvictionStatsAgeKey = "eviction_stats_age_seconds" + DeprecatedPodWorkerLatencyKey = "pod_worker_latency_microseconds" + DeprecatedPodStartLatencyKey = "pod_start_latency_microseconds" + DeprecatedCgroupManagerOperationsKey = "cgroup_manager_latency_microseconds" + DeprecatedPodWorkerStartLatencyKey = "pod_worker_start_latency_microseconds" + DeprecatedPLEGRelistLatencyKey = "pleg_relist_latency_microseconds" + DeprecatedPLEGRelistIntervalKey = "pleg_relist_interval_microseconds" + DeprecatedEvictionStatsAgeKey = "eviction_stats_age_microseconds" + VolumeStatsCapacityBytesKey = "volume_stats_capacity_bytes" + VolumeStatsAvailableBytesKey = "volume_stats_available_bytes" + VolumeStatsUsedBytesKey = "volume_stats_used_bytes" + VolumeStatsInodesKey = "volume_stats_inodes" + VolumeStatsInodesFreeKey = "volume_stats_inodes_free" + VolumeStatsInodesUsedKey = "volume_stats_inodes_used" // Metrics keys of remote runtime operations - RuntimeOperationsKey = "runtime_operations" - RuntimeOperationsLatencyKey = "runtime_operations_latency_microseconds" - RuntimeOperationsErrorsKey = "runtime_operations_errors" + RuntimeOperationsKey = "runtime_operations_total" + RuntimeOperationsDurationKey = "runtime_operations_duration_seconds" + RuntimeOperationsErrorsKey = "runtime_operations_errors_total" + DeprecatedRuntimeOperationsKey = "runtime_operations" + DeprecatedRuntimeOperationsLatencyKey = "runtime_operations_latency_microseconds" + DeprecatedRuntimeOperationsErrorsKey = "runtime_operations_errors" // Metrics keys of device plugin operations - DevicePluginRegistrationCountKey = "device_plugin_registration_count" - DevicePluginAllocationLatencyKey = "device_plugin_alloc_latency_microseconds" + DevicePluginRegistrationCountKey = "device_plugin_registration_total" + DevicePluginAllocationDurationKey = "device_plugin_alloc_duration_seconds" + DeprecatedDevicePluginRegistrationCountKey = 
"device_plugin_registration_count" + DeprecatedDevicePluginAllocationLatencyKey = "device_plugin_alloc_latency_microseconds" // Metric keys for node config AssignedConfigKey = "node_config_assigned" @@ -81,48 +93,54 @@ var ( }, []string{NodeLabelKey}, ) - ContainersPerPodCount = prometheus.NewSummary( - prometheus.SummaryOpts{ + ContainersPerPodCount = prometheus.NewHistogram( + prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, Name: "containers_per_pod_count", Help: "The number of containers per pod.", + Buckets: prometheus.DefBuckets, }, ) - PodWorkerLatency = prometheus.NewSummaryVec( - prometheus.SummaryOpts{ + PodWorkerDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, - Name: PodWorkerLatencyKey, - Help: "Latency in microseconds to sync a single pod. Broken down by operation type: create, update, or sync", + Name: PodWorkerDurationKey, + Help: "Duration in seconds to sync a single pod. Broken down by operation type: create, update, or sync", + Buckets: prometheus.DefBuckets, }, []string{"operation_type"}, ) - PodStartLatency = prometheus.NewSummary( - prometheus.SummaryOpts{ + PodStartDuration = prometheus.NewHistogram( + prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, - Name: PodStartLatencyKey, - Help: "Latency in microseconds for a single pod to go from pending to running.", + Name: PodStartDurationKey, + Help: "Duration in seconds for a single pod to go from pending to running.", + Buckets: prometheus.DefBuckets, }, ) - CgroupManagerLatency = prometheus.NewSummaryVec( - prometheus.SummaryOpts{ + CgroupManagerDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, Name: CgroupManagerOperationsKey, - Help: "Latency in microseconds for cgroup manager operations. Broken down by method.", + Help: "Duration in seconds for cgroup manager operations. 
Broken down by method.", + Buckets: prometheus.DefBuckets, }, []string{"operation_type"}, ) - PodWorkerStartLatency = prometheus.NewSummary( - prometheus.SummaryOpts{ + PodWorkerStartDuration = prometheus.NewHistogram( + prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, - Name: PodWorkerStartLatencyKey, - Help: "Latency in microseconds from seeing a pod to starting a worker.", + Name: PodWorkerStartDurationKey, + Help: "Duration in seconds from seeing a pod to starting a worker.", + Buckets: prometheus.DefBuckets, }, ) - PLEGRelistLatency = prometheus.NewSummary( - prometheus.SummaryOpts{ + PLEGRelistDuration = prometheus.NewHistogram( + prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, - Name: PLEGRelistLatencyKey, - Help: "Latency in microseconds for relisting pods in PLEG.", + Name: PLEGRelistDurationKey, + Help: "Duration in seconds for relisting pods in PLEG.", + Buckets: prometheus.DefBuckets, }, ) PLEGDiscardEvents = prometheus.NewCounterVec( @@ -133,11 +151,12 @@ var ( }, []string{}, ) - PLEGRelistInterval = prometheus.NewSummary( - prometheus.SummaryOpts{ + PLEGRelistInterval = prometheus.NewHistogram( + prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, Name: PLEGRelistIntervalKey, - Help: "Interval in microseconds between relisting in PLEG.", + Help: "Interval in seconds between relisting in PLEG.", + Buckets: prometheus.DefBuckets, }, ) // Metrics of remote runtime operations. @@ -149,11 +168,12 @@ var ( }, []string{"operation_type"}, ) - RuntimeOperationsLatency = prometheus.NewSummaryVec( - prometheus.SummaryOpts{ + RuntimeOperationsDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, - Name: RuntimeOperationsLatencyKey, - Help: "Latency in microseconds of runtime operations. Broken down by operation type.", + Name: RuntimeOperationsDurationKey, + Help: "Duration in seconds of runtime operations. 
Broken down by operation type.", + Buckets: prometheus.DefBuckets, }, []string{"operation_type"}, ) @@ -165,11 +185,12 @@ var ( }, []string{"operation_type"}, ) - EvictionStatsAge = prometheus.NewSummaryVec( - prometheus.SummaryOpts{ + EvictionStatsAge = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ Subsystem: KubeletSubsystem, Name: EvictionStatsAgeKey, Help: "Time between when stats are collected, and when pod is evicted based on those stats by eviction signal", + Buckets: prometheus.DefBuckets, }, []string{"eviction_signal"}, ) @@ -181,11 +202,105 @@ var ( }, []string{"resource_name"}, ) - DevicePluginAllocationLatency = prometheus.NewSummaryVec( + DevicePluginAllocationDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: KubeletSubsystem, + Name: DevicePluginAllocationDurationKey, + Help: "Duration in seconds to serve a device plugin Allocation request. Broken down by resource name.", + Buckets: prometheus.DefBuckets, + }, + []string{"resource_name"}, + ) + + DeprecatedPodWorkerLatency = prometheus.NewSummaryVec( prometheus.SummaryOpts{ Subsystem: KubeletSubsystem, - Name: DevicePluginAllocationLatencyKey, - Help: "Latency in microseconds to serve a device plugin Allocation request. Broken down by resource name.", + Name: DeprecatedPodWorkerLatencyKey, + Help: "(Deprecated) Latency in microseconds to sync a single pod. Broken down by operation type: create, update, or sync", + }, + []string{"operation_type"}, + ) + DeprecatedPodStartLatency = prometheus.NewSummary( + prometheus.SummaryOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedPodStartLatencyKey, + Help: "(Deprecated) Latency in microseconds for a single pod to go from pending to running.", + }, + ) + DeprecatedCgroupManagerLatency = prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedCgroupManagerOperationsKey, + Help: "(Deprecated) Latency in microseconds for cgroup manager operations. 
Broken down by method.", + }, + []string{"operation_type"}, + ) + DeprecatedPodWorkerStartLatency = prometheus.NewSummary( + prometheus.SummaryOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedPodWorkerStartLatencyKey, + Help: "(Deprecated) Latency in microseconds from seeing a pod to starting a worker.", + }, + ) + DeprecatedPLEGRelistLatency = prometheus.NewSummary( + prometheus.SummaryOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedPLEGRelistLatencyKey, + Help: "(Deprecated) Latency in microseconds for relisting pods in PLEG.", + }, + ) + DeprecatedPLEGRelistInterval = prometheus.NewSummary( + prometheus.SummaryOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedPLEGRelistIntervalKey, + Help: "(Deprecated) Interval in microseconds between relisting in PLEG.", + }, + ) + DeprecatedRuntimeOperations = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedRuntimeOperationsKey, + Help: "(Deprecated) Cumulative number of runtime operations by operation type.", + }, + []string{"operation_type"}, + ) + DeprecatedRuntimeOperationsLatency = prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedRuntimeOperationsLatencyKey, + Help: "(Deprecated) Latency in microseconds of runtime operations. 
Broken down by operation type.", + }, + []string{"operation_type"}, + ) + DeprecatedRuntimeOperationsErrors = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedRuntimeOperationsErrorsKey, + Help: "(Deprecated) Cumulative number of runtime operation errors by operation type.", + }, + []string{"operation_type"}, + ) + DeprecatedEvictionStatsAge = prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedEvictionStatsAgeKey, + Help: "(Deprecated) Time between when stats are collected, and when pod is evicted based on those stats by eviction signal", + }, + []string{"eviction_signal"}, + ) + DeprecatedDevicePluginRegistrationCount = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedDevicePluginRegistrationCountKey, + Help: "(Deprecated) Cumulative number of device plugin registrations. Broken down by resource name.", + }, + []string{"resource_name"}, + ) + DeprecatedDevicePluginAllocationLatency = prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Subsystem: KubeletSubsystem, + Name: DeprecatedDevicePluginAllocationLatencyKey, + Help: "(Deprecated) Latency in microseconds to serve a device plugin Allocation request. Broken down by resource name.", }, []string{"resource_name"}, ) @@ -248,21 +363,33 @@ func Register(containerCache kubecontainer.RuntimeCache, collectors ...prometheu // Register the metrics. 
registerMetrics.Do(func() { prometheus.MustRegister(NodeName) - prometheus.MustRegister(PodWorkerLatency) - prometheus.MustRegister(PodStartLatency) - prometheus.MustRegister(CgroupManagerLatency) - prometheus.MustRegister(PodWorkerStartLatency) + prometheus.MustRegister(PodWorkerDuration) + prometheus.MustRegister(PodStartDuration) + prometheus.MustRegister(CgroupManagerDuration) + prometheus.MustRegister(PodWorkerStartDuration) prometheus.MustRegister(ContainersPerPodCount) prometheus.MustRegister(newPodAndContainerCollector(containerCache)) - prometheus.MustRegister(PLEGRelistLatency) + prometheus.MustRegister(PLEGRelistDuration) prometheus.MustRegister(PLEGDiscardEvents) prometheus.MustRegister(PLEGRelistInterval) prometheus.MustRegister(RuntimeOperations) - prometheus.MustRegister(RuntimeOperationsLatency) + prometheus.MustRegister(RuntimeOperationsDuration) prometheus.MustRegister(RuntimeOperationsErrors) prometheus.MustRegister(EvictionStatsAge) prometheus.MustRegister(DevicePluginRegistrationCount) - prometheus.MustRegister(DevicePluginAllocationLatency) + prometheus.MustRegister(DevicePluginAllocationDuration) + prometheus.MustRegister(DeprecatedPodWorkerLatency) + prometheus.MustRegister(DeprecatedPodStartLatency) + prometheus.MustRegister(DeprecatedCgroupManagerLatency) + prometheus.MustRegister(DeprecatedPodWorkerStartLatency) + prometheus.MustRegister(DeprecatedPLEGRelistLatency) + prometheus.MustRegister(DeprecatedPLEGRelistInterval) + prometheus.MustRegister(DeprecatedRuntimeOperations) + prometheus.MustRegister(DeprecatedRuntimeOperationsLatency) + prometheus.MustRegister(DeprecatedRuntimeOperationsErrors) + prometheus.MustRegister(DeprecatedEvictionStatsAge) + prometheus.MustRegister(DeprecatedDevicePluginRegistrationCount) + prometheus.MustRegister(DeprecatedDevicePluginAllocationLatency) if utilfeature.DefaultFeatureGate.Enabled(features.DynamicKubeletConfig) { prometheus.MustRegister(AssignedConfig) prometheus.MustRegister(ActiveConfig) @@ 
-280,6 +407,11 @@ func SinceInMicroseconds(start time.Time) float64 { return float64(time.Since(start).Nanoseconds() / time.Microsecond.Nanoseconds()) } +// Gets the time since the specified start in seconds. +func SinceInSeconds(start time.Time) float64 { + return time.Since(start).Seconds() +} + func newPodAndContainerCollector(containerCache kubecontainer.RuntimeCache) *podAndContainerCollector { return &podAndContainerCollector{ containerCache: containerCache, diff --git a/pkg/kubelet/pleg/generic.go b/pkg/kubelet/pleg/generic.go index bce9c7ea20e..64bf43a1ef4 100644 --- a/pkg/kubelet/pleg/generic.go +++ b/pkg/kubelet/pleg/generic.go @@ -189,12 +189,14 @@ func (g *GenericPLEG) relist() { klog.V(5).Infof("GenericPLEG: Relisting") if lastRelistTime := g.getRelistTime(); !lastRelistTime.IsZero() { - metrics.PLEGRelistInterval.Observe(metrics.SinceInMicroseconds(lastRelistTime)) + metrics.PLEGRelistInterval.Observe(metrics.SinceInSeconds(lastRelistTime)) + metrics.DeprecatedPLEGRelistInterval.Observe(metrics.SinceInMicroseconds(lastRelistTime)) } timestamp := g.clock.Now() defer func() { - metrics.PLEGRelistLatency.Observe(metrics.SinceInMicroseconds(timestamp)) + metrics.PLEGRelistDuration.Observe(metrics.SinceInSeconds(timestamp)) + metrics.DeprecatedPLEGRelistLatency.Observe(metrics.SinceInMicroseconds(timestamp)) }() // Get all the pods. diff --git a/test/e2e/framework/kubelet_stats.go b/test/e2e/framework/kubelet_stats.go index 3803f088210..9b8f4c9cb72 100644 --- a/test/e2e/framework/kubelet_stats.go +++ b/test/e2e/framework/kubelet_stats.go @@ -102,13 +102,13 @@ func getKubeletMetrics(c clientset.Interface, nodeName string) (metrics.KubeletM // Note that the KubeletMetrics passed in should not contain subsystem prefix. 
func GetDefaultKubeletLatencyMetrics(ms metrics.KubeletMetrics) KubeletLatencyMetrics { latencyMetricNames := sets.NewString( - kubeletmetrics.PodWorkerLatencyKey, - kubeletmetrics.PodWorkerStartLatencyKey, - kubeletmetrics.PodStartLatencyKey, + kubeletmetrics.PodWorkerDurationKey, + kubeletmetrics.PodWorkerStartDurationKey, + kubeletmetrics.PodStartDurationKey, kubeletmetrics.CgroupManagerOperationsKey, dockermetrics.DockerOperationsLatencyKey, - kubeletmetrics.PodWorkerStartLatencyKey, - kubeletmetrics.PLEGRelistLatencyKey, + kubeletmetrics.PodWorkerStartDurationKey, + kubeletmetrics.PLEGRelistDurationKey, ) return GetKubeletLatencyMetrics(ms, latencyMetricNames) } diff --git a/test/e2e/framework/metrics_util.go b/test/e2e/framework/metrics_util.go index 106f8015b2f..d098c00e049 100644 --- a/test/e2e/framework/metrics_util.go +++ b/test/e2e/framework/metrics_util.go @@ -168,9 +168,9 @@ var InterestingKubeletMetrics = []string{ "kubelet_docker_errors", "kubelet_docker_operations_latency_seconds", "kubelet_generate_pod_status_latency_microseconds", - "kubelet_pod_start_latency_microseconds", - "kubelet_pod_worker_latency_microseconds", - "kubelet_pod_worker_start_latency_microseconds", + "kubelet_pod_start_duration_seconds", + "kubelet_pod_worker_duration_seconds", + "kubelet_pod_worker_start_duration_seconds", "kubelet_sync_pods_latency_microseconds", } diff --git a/test/e2e_node/density_test.go b/test/e2e_node/density_test.go index f0b943061e6..3a80860a03a 100644 --- a/test/e2e_node/density_test.go +++ b/test/e2e_node/density_test.go @@ -459,12 +459,13 @@ func getPodStartLatency(node string) (framework.KubeletLatencyMetrics, error) { for _, samples := range ms { for _, sample := range samples { - if sample.Metric["__name__"] == kubemetrics.KubeletSubsystem+"_"+kubemetrics.PodStartLatencyKey { + // Read the deprecated summary: it still carries the quantile label and microsecond values parsed below; the seconds histogram has neither. + if sample.Metric["__name__"] == kubemetrics.KubeletSubsystem+"_"+kubemetrics.DeprecatedPodStartLatencyKey { quantile, _ := strconv.ParseFloat(string(sample.Metric["quantile"]), 64) 
latencyMetrics = append(latencyMetrics, framework.KubeletLatencyMetric{ Quantile: quantile, - Method: kubemetrics.PodStartLatencyKey, + Method: kubemetrics.DeprecatedPodStartLatencyKey, Latency: time.Duration(int(sample.Value)) * time.Microsecond}) } } } diff --git a/test/e2e_node/gpu_device_plugin.go b/test/e2e_node/gpu_device_plugin.go index 8328324b695..17a91f054e8 100644 --- a/test/e2e_node/gpu_device_plugin.go +++ b/test/e2e_node/gpu_device_plugin.go @@ -156,7 +156,7 @@ func logDevicePluginMetrics() { framework.ExpectNoError(err) for msKey, samples := range ms { switch msKey { - case kubeletmetrics.KubeletSubsystem + "_" + kubeletmetrics.DevicePluginAllocationLatencyKey: + case kubeletmetrics.KubeletSubsystem + "_" + kubeletmetrics.DevicePluginAllocationDurationKey: for _, sample := range samples { latency := sample.Value resource := string(sample.Metric["resource_name"])