From 0ffe89ed0b06a1355afe0a35e495aca3e6545b19 Mon Sep 17 00:00:00 2001 From: Anish Shah Date: Mon, 13 Jul 2020 17:23:57 -0700 Subject: [PATCH] kubelet: add operations count and error count metrics to network plugin manager --- .../dockershim/network/metrics/metrics.go | 28 ++++++++++++++++++- pkg/kubelet/dockershim/network/plugins.go | 18 ++++++++++-- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/pkg/kubelet/dockershim/network/metrics/metrics.go b/pkg/kubelet/dockershim/network/metrics/metrics.go index eaa05f0649d..db086e4b955 100644 --- a/pkg/kubelet/dockershim/network/metrics/metrics.go +++ b/pkg/kubelet/dockershim/network/metrics/metrics.go @@ -28,9 +28,11 @@ import ( const ( // NetworkPluginOperationsKey is the key for operation count metrics. - NetworkPluginOperationsKey = "network_plugin_operations" + NetworkPluginOperationsKey = "network_plugin_operations_total" // NetworkPluginOperationsLatencyKey is the key for the operation latency metrics. NetworkPluginOperationsLatencyKey = "network_plugin_operations_duration_seconds" + // NetworkPluginOperationsErrorsKey is the key for the operations error metrics. + NetworkPluginOperationsErrorsKey = "network_plugin_operations_errors_total" // Keep the "kubelet" subsystem for backward compatibility. kubeletSubsystem = "kubelet" @@ -49,6 +51,28 @@ var ( }, []string{"operation_type"}, ) + + // NetworkPluginOperations collects operation counts by operation type. + NetworkPluginOperations = metrics.NewCounterVec( + &metrics.CounterOpts{ + Subsystem: kubeletSubsystem, + Name: NetworkPluginOperationsKey, + Help: "Cumulative number of network plugin operations by operation type.", + StabilityLevel: metrics.ALPHA, + }, + []string{"operation_type"}, + ) + + // NetworkPluginOperationsErrors collects operation errors by operation type. + NetworkPluginOperationsErrors = metrics.NewCounterVec( + &metrics.CounterOpts{ + Subsystem: kubeletSubsystem, + Name: NetworkPluginOperationsErrorsKey, + Help: "Cumulative number of network plugin operation errors by operation type.", + StabilityLevel: metrics.ALPHA, + }, + []string{"operation_type"}, + ) ) var registerMetrics sync.Once @@ -57,6 +81,8 @@ var registerMetrics sync.Once func Register() { registerMetrics.Do(func() { legacyregistry.MustRegister(NetworkPluginOperationsLatency) + legacyregistry.MustRegister(NetworkPluginOperations) + legacyregistry.MustRegister(NetworkPluginOperationsErrors) }) } diff --git a/pkg/kubelet/dockershim/network/plugins.go b/pkg/kubelet/dockershim/network/plugins.go index 29b8a7e2c46..85b5146019e 100644 --- a/pkg/kubelet/dockershim/network/plugins.go +++ b/pkg/kubelet/dockershim/network/plugins.go @@ -382,17 +382,25 @@ func (pm *PluginManager) podUnlock(fullPodName string) { // recordOperation records operation and duration func recordOperation(operation string, start time.Time) { + metrics.NetworkPluginOperations.WithLabelValues(operation).Inc() metrics.NetworkPluginOperationsLatency.WithLabelValues(operation).Observe(metrics.SinceInSeconds(start)) } +// recordError records errors for metric. +func recordError(operation string) { + metrics.NetworkPluginOperationsErrors.WithLabelValues(operation).Inc() +} + func (pm *PluginManager) GetPodNetworkStatus(podNamespace, podName string, id kubecontainer.ContainerID) (*PodNetworkStatus, error) { - defer recordOperation("get_pod_network_status", time.Now()) + const operation = "get_pod_network_status" + defer recordOperation(operation, time.Now()) fullPodName := kubecontainer.BuildPodFullName(podName, podNamespace) pm.podLock(fullPodName).Lock() defer pm.podUnlock(fullPodName) netStatus, err := pm.plugin.GetPodNetworkStatus(podNamespace, podName, id) if err != nil { + recordError(operation) return nil, fmt.Errorf("networkPlugin %s failed on the status hook for pod %q: %v", pm.plugin.Name(), fullPodName, err) } @@ -400,13 +408,15 @@ func (pm *PluginManager) GetPodNetworkStatus(podNamespace, podName string, id ku } func (pm *PluginManager) SetUpPod(podNamespace, podName string, id kubecontainer.ContainerID, annotations, options map[string]string) error { - defer recordOperation("set_up_pod", time.Now()) + const operation = "set_up_pod" + defer recordOperation(operation, time.Now()) fullPodName := kubecontainer.BuildPodFullName(podName, podNamespace) pm.podLock(fullPodName).Lock() defer pm.podUnlock(fullPodName) klog.V(3).Infof("Calling network plugin %s to set up pod %q", pm.plugin.Name(), fullPodName) if err := pm.plugin.SetUpPod(podNamespace, podName, id, annotations, options); err != nil { + recordError(operation) return fmt.Errorf("networkPlugin %s failed to set up pod %q network: %v", pm.plugin.Name(), fullPodName, err) } @@ -414,13 +424,15 @@ func (pm *PluginManager) SetUpPod(podNamespace, podName string, id kubecontainer } func (pm *PluginManager) TearDownPod(podNamespace, podName string, id kubecontainer.ContainerID) error { - defer recordOperation("tear_down_pod", time.Now()) + const operation = "tear_down_pod" + defer recordOperation(operation, time.Now()) fullPodName := kubecontainer.BuildPodFullName(podName, podNamespace) pm.podLock(fullPodName).Lock() defer pm.podUnlock(fullPodName) klog.V(3).Infof("Calling network plugin %s to tear down pod %q", pm.plugin.Name(), fullPodName) if err := pm.plugin.TearDownPod(podNamespace, podName, id); err != nil { + recordError(operation) return fmt.Errorf("networkPlugin %s failed to teardown pod %q network: %v", pm.plugin.Name(), fullPodName, err) }