Merge pull request #55340 from jiayingz/metrics

Automatic merge from submit-queue (batch tested with PRs 55340, 55329, 56168, 56170, 56105). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>. Adds device plugin allocation latency and registration count metrics. For #53497 **What this PR does / why we need it**: **Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*: Fixes # **Special notes for your reviewer**: **Release note**: ```release-note ```
2025-07-23 03:41:45 +00:00 · 2017-11-21 18:56:29 -08:00 · 2017-11-21 18:56:29 -08:00 · 3bb6eeeb07
commit 3bb6eeeb07
parent 277d866111 048bafdd0b
5 changed files with 61 additions and 0 deletions
--- a/pkg/kubelet/cm/deviceplugin/BUILD
+++ b/pkg/kubelet/cm/deviceplugin/BUILD
@ -23,6 +23,7 @@ go_library(
        "//pkg/kubelet/config:go_default_library",
        "//pkg/kubelet/container:go_default_library",
        "//pkg/kubelet/lifecycle:go_default_library",
+        "//pkg/kubelet/metrics:go_default_library",
        "//plugin/pkg/scheduler/schedulercache:go_default_library",
        "//vendor/github.com/golang/glog:go_default_library",
        "//vendor/golang.org/x/net/context:go_default_library",
--- a/pkg/kubelet/cm/deviceplugin/manager.go
+++ b/pkg/kubelet/cm/deviceplugin/manager.go
@ -24,6 +24,7 @@ import (
 	"os"
 	"path/filepath"
 	"sync"
+	"time"

 	"github.com/golang/glog"
 	"golang.org/x/net/context"
@ -36,6 +37,7 @@ import (
 	pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
 	"k8s.io/kubernetes/pkg/kubelet/config"
 	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
+	"k8s.io/kubernetes/pkg/kubelet/metrics"
 	"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
 )

@ -265,6 +267,7 @@ func (m *ManagerImpl) Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.P
 // Register registers a device plugin.
 func (m *ManagerImpl) Register(ctx context.Context, r *pluginapi.RegisterRequest) (*pluginapi.Empty, error) {
 	glog.Infof("Got registration request from device plugin with resource name %q", r.ResourceName)
+	metrics.DevicePluginRegistrationCount.WithLabelValues(r.ResourceName).Inc()
 	if r.Version != pluginapi.Version {
 		errorString := fmt.Sprintf(errUnsuportedVersion, r.Version, pluginapi.Version)
 		glog.Infof("Bad registration request from device plugin with resource name %q: %v", r.ResourceName, errorString)
@ -548,6 +551,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
 		if allocDevices == nil || len(allocDevices) <= 0 {
 			continue
 		}
+		startRPCTime := time.Now()
 		// devicePluginManager.Allocate involves RPC calls to device plugin, which
 		// could be heavy-weight. Therefore we want to perform this operation outside
 		// mutex lock. Note if Allocate call fails, we may leave container resources
@ -573,6 +577,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
 		devs := allocDevices.UnsortedList()
 		glog.V(3).Infof("Making allocation request for devices %v for device plugin %s", devs, resource)
 		resp, err := e.allocate(devs)
+		metrics.DevicePluginAllocationLatency.WithLabelValues(resource).Observe(metrics.SinceInMicroseconds(startRPCTime))
 		if err != nil {
 			// In case of allocation failure, we want to restore m.allocatedDevices
 			// to the actual allocated state from m.podDevices.
--- a/pkg/kubelet/metrics/metrics.go
+++ b/pkg/kubelet/metrics/metrics.go
@ -44,6 +44,9 @@ const (
 	RuntimeOperationsKey        = "runtime_operations"
 	RuntimeOperationsLatencyKey = "runtime_operations_latency_microseconds"
 	RuntimeOperationsErrorsKey  = "runtime_operations_errors"
+	// Metrics keys of device plugin operations
+	DevicePluginRegistrationCountKey = "device_plugin_registration_count"
+	DevicePluginAllocationLatencyKey = "device_plugin_alloc_latency_microseconds"
 )

 var (
@ -179,6 +182,22 @@ var (
 		},
 		[]string{"namespace", "persistentvolumeclaim"},
 	)
+	DevicePluginRegistrationCount = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: KubeletSubsystem,
+			Name:      DevicePluginRegistrationCountKey,
+			Help:      "Cumulative number of device plugin registrations. Broken down by resource name.",
+		},
+		[]string{"resource_name"},
+	)
+	DevicePluginAllocationLatency = prometheus.NewSummaryVec(
+		prometheus.SummaryOpts{
+			Subsystem: KubeletSubsystem,
+			Name:      DevicePluginAllocationLatencyKey,
+			Help:      "Latency in microseconds to serve a device plugin Allocation request. Broken down by resource name.",
+		},
+		[]string{"resource_name"},
+	)
 )

 var registerMetrics sync.Once
@ -205,6 +224,8 @@ func Register(containerCache kubecontainer.RuntimeCache) {
 		prometheus.MustRegister(VolumeStatsInodes)
 		prometheus.MustRegister(VolumeStatsInodesFree)
 		prometheus.MustRegister(VolumeStatsInodesUsed)
+		prometheus.MustRegister(DevicePluginRegistrationCount)
+		prometheus.MustRegister(DevicePluginAllocationLatency)
 	})
 }

--- a/test/e2e_node/BUILD
+++ b/test/e2e_node/BUILD
@ -47,6 +47,7 @@ go_library(
        "//vendor/github.com/golang/glog:go_default_library",
        "//vendor/github.com/onsi/ginkgo:go_default_library",
        "//vendor/github.com/onsi/gomega:go_default_library",
+        "//vendor/github.com/prometheus/common/model:go_default_library",
        "//vendor/k8s.io/api/core/v1:go_default_library",
        "//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
        "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
--- a/test/e2e_node/gpu_device_plugin.go
+++ b/test/e2e_node/gpu_device_plugin.go
@ -19,6 +19,7 @@ package e2e_node
 import (
 	"os/exec"
 	"regexp"
+	"strconv"
 	"time"

 	"k8s.io/api/core/v1"
@ -27,10 +28,13 @@ import (
 	"k8s.io/apimachinery/pkg/util/uuid"
 	"k8s.io/kubernetes/pkg/features"
 	"k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig"
+	kubeletmetrics "k8s.io/kubernetes/pkg/kubelet/metrics"
 	"k8s.io/kubernetes/test/e2e/framework"
+	"k8s.io/kubernetes/test/e2e/framework/metrics"

 	. "github.com/onsi/ginkgo"
 	. "github.com/onsi/gomega"
+	"github.com/prometheus/common/model"
 )

 const (
@ -121,6 +125,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
 			Expect(devIdRestart1).To(Equal(devId1))
 			count2, devIdRestart2 = getDeviceId(f, p2.Name, p2.Name, count2+2)
 			Expect(devIdRestart2).To(Equal(devId2))
+			logDevicePluginMetrics()

 			// Cleanup
 			f.PodClient().DeleteSync(p1.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
@ -129,6 +134,34 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
 	})
 })

+func logDevicePluginMetrics() {
+	ms, err := metrics.GrabKubeletMetricsWithoutProxy(framework.TestContext.NodeName + ":10255")
+	framework.ExpectNoError(err)
+	for msKey, samples := range ms {
+		switch msKey {
+		case kubeletmetrics.KubeletSubsystem + "_" + kubeletmetrics.DevicePluginAllocationLatencyKey:
+			for _, sample := range samples {
+				latency := sample.Value
+				resource := string(sample.Metric["resource_name"])
+				var quantile float64
+				if val, ok := sample.Metric[model.QuantileLabel]; ok {
+					var err error
+					if quantile, err = strconv.ParseFloat(string(val), 64); err != nil {
+						continue
+					}
+					framework.Logf("Metric: %v ResourceName: %v Quantile: %v Latency: %v", msKey, resource, quantile, latency)
+				}
+			}
+		case kubeletmetrics.KubeletSubsystem + "_" + kubeletmetrics.DevicePluginRegistrationCountKey:
+			for _, sample := range samples {
+				resource := string(sample.Metric["resource_name"])
+				count := sample.Value
+				framework.Logf("Metric: %v ResourceName: %v Count: %v", msKey, resource, count)
+			}
+		}
+	}
+}
+
 func makeCudaPauseImage() *v1.Pod {
 	podName := testPodNamePrefix + string(uuid.NewUUID())