diff --git a/pkg/kubelet/cm/deviceplugin/BUILD b/pkg/kubelet/cm/deviceplugin/BUILD
index 29702ab64c8..9c91c2e9df2 100644
--- a/pkg/kubelet/cm/deviceplugin/BUILD
+++ b/pkg/kubelet/cm/deviceplugin/BUILD
@@ -23,6 +23,7 @@ go_library(
         "//pkg/kubelet/config:go_default_library",
         "//pkg/kubelet/container:go_default_library",
         "//pkg/kubelet/lifecycle:go_default_library",
+        "//pkg/kubelet/metrics:go_default_library",
         "//plugin/pkg/scheduler/schedulercache:go_default_library",
         "//vendor/github.com/golang/glog:go_default_library",
         "//vendor/golang.org/x/net/context:go_default_library",
diff --git a/pkg/kubelet/cm/deviceplugin/manager.go b/pkg/kubelet/cm/deviceplugin/manager.go
index b30e0fc4d5c..6535479fe43 100644
--- a/pkg/kubelet/cm/deviceplugin/manager.go
+++ b/pkg/kubelet/cm/deviceplugin/manager.go
@@ -24,6 +24,7 @@ import (
 	"os"
 	"path/filepath"
 	"sync"
+	"time"
 
 	"github.com/golang/glog"
 	"golang.org/x/net/context"
@@ -36,6 +37,7 @@ import (
 	pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
 	"k8s.io/kubernetes/pkg/kubelet/config"
 	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
+	"k8s.io/kubernetes/pkg/kubelet/metrics"
 	"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
 )
 
@@ -265,6 +267,7 @@ func (m *ManagerImpl) Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.P
 // Register registers a device plugin.
 func (m *ManagerImpl) Register(ctx context.Context, r *pluginapi.RegisterRequest) (*pluginapi.Empty, error) {
 	glog.Infof("Got registration request from device plugin with resource name %q", r.ResourceName)
+	metrics.DevicePluginRegistrationCount.WithLabelValues(r.ResourceName).Inc()
 	if r.Version != pluginapi.Version {
 		errorString := fmt.Sprintf(errUnsuportedVersion, r.Version, pluginapi.Version)
 		glog.Infof("Bad registration request from device plugin with resource name %q: %v", r.ResourceName, errorString)
@@ -548,6 +551,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
 		if allocDevices == nil || len(allocDevices) <= 0 {
 			continue
 		}
+		startRPCTime := time.Now()
 		// devicePluginManager.Allocate involves RPC calls to device plugin, which
 		// could be heavy-weight. Therefore we want to perform this operation outside
 		// mutex lock. Note if Allocate call fails, we may leave container resources
@@ -573,6 +577,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
 		devs := allocDevices.UnsortedList()
 		glog.V(3).Infof("Making allocation request for devices %v for device plugin %s", devs, resource)
 		resp, err := e.allocate(devs)
+		metrics.DevicePluginAllocationLatency.WithLabelValues(resource).Observe(metrics.SinceInMicroseconds(startRPCTime))
 		if err != nil {
 			// In case of allocation failure, we want to restore m.allocatedDevices
 			// to the actual allocated state from m.podDevices.
diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go
index f7399101186..0fc10dc189a 100644
--- a/pkg/kubelet/metrics/metrics.go
+++ b/pkg/kubelet/metrics/metrics.go
@@ -44,6 +44,9 @@ const (
 	RuntimeOperationsKey        = "runtime_operations"
 	RuntimeOperationsLatencyKey = "runtime_operations_latency_microseconds"
 	RuntimeOperationsErrorsKey  = "runtime_operations_errors"
+	// Metrics keys of device plugin operations
+	DevicePluginRegistrationCountKey = "device_plugin_registration_count"
+	DevicePluginAllocationLatencyKey = "device_plugin_alloc_latency_microseconds"
 )
 
 var (
@@ -179,6 +182,22 @@ var (
 		},
 		[]string{"namespace", "persistentvolumeclaim"},
 	)
+	DevicePluginRegistrationCount = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: KubeletSubsystem,
+			Name:      DevicePluginRegistrationCountKey,
+			Help:      "Cumulative number of device plugin registrations. Broken down by resource name.",
+		},
+		[]string{"resource_name"},
+	)
+	DevicePluginAllocationLatency = prometheus.NewSummaryVec(
+		prometheus.SummaryOpts{
+			Subsystem: KubeletSubsystem,
+			Name:      DevicePluginAllocationLatencyKey,
+			Help:      "Latency in microseconds to serve a device plugin Allocation request. Broken down by resource name.",
+		},
+		[]string{"resource_name"},
+	)
 )
 
 var registerMetrics sync.Once
@@ -205,6 +224,8 @@ func Register(containerCache kubecontainer.RuntimeCache) {
 		prometheus.MustRegister(VolumeStatsInodes)
 		prometheus.MustRegister(VolumeStatsInodesFree)
 		prometheus.MustRegister(VolumeStatsInodesUsed)
+		prometheus.MustRegister(DevicePluginRegistrationCount)
+		prometheus.MustRegister(DevicePluginAllocationLatency)
 	})
 }
 
diff --git a/test/e2e_node/BUILD b/test/e2e_node/BUILD
index 8e440ac8a33..35ed8b7a169 100644
--- a/test/e2e_node/BUILD
+++ b/test/e2e_node/BUILD
@@ -47,6 +47,7 @@ go_library(
         "//vendor/github.com/golang/glog:go_default_library",
         "//vendor/github.com/onsi/ginkgo:go_default_library",
         "//vendor/github.com/onsi/gomega:go_default_library",
+        "//vendor/github.com/prometheus/common/model:go_default_library",
         "//vendor/k8s.io/api/core/v1:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
diff --git a/test/e2e_node/gpu_device_plugin.go b/test/e2e_node/gpu_device_plugin.go
index 4a3927218bf..b27c7df0be0 100644
--- a/test/e2e_node/gpu_device_plugin.go
+++ b/test/e2e_node/gpu_device_plugin.go
@@ -19,6 +19,7 @@ package e2e_node
 import (
 	"os/exec"
 	"regexp"
+	"strconv"
 	"time"
 
 	"k8s.io/api/core/v1"
@@ -27,10 +28,13 @@ import (
 	"k8s.io/apimachinery/pkg/util/uuid"
 	"k8s.io/kubernetes/pkg/features"
 	"k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig"
+	kubeletmetrics "k8s.io/kubernetes/pkg/kubelet/metrics"
 	"k8s.io/kubernetes/test/e2e/framework"
+	"k8s.io/kubernetes/test/e2e/framework/metrics"
 
 	. "github.com/onsi/ginkgo"
 	. "github.com/onsi/gomega"
+	"github.com/prometheus/common/model"
 )
 
 const (
@@ -121,6 +125,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
 		Expect(devIdRestart1).To(Equal(devId1))
 		count2, devIdRestart2 = getDeviceId(f, p2.Name, p2.Name, count2+2)
 		Expect(devIdRestart2).To(Equal(devId2))
+		logDevicePluginMetrics()
 
 		// Cleanup
 		f.PodClient().DeleteSync(p1.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
@@ -129,6 +134,34 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
 	})
 })
 
+func logDevicePluginMetrics() {
+	ms, err := metrics.GrabKubeletMetricsWithoutProxy(framework.TestContext.NodeName + ":10255")
+	framework.ExpectNoError(err)
+	for msKey, samples := range ms {
+		switch msKey {
+		case kubeletmetrics.KubeletSubsystem + "_" + kubeletmetrics.DevicePluginAllocationLatencyKey:
+			for _, sample := range samples {
+				latency := sample.Value
+				resource := string(sample.Metric["resource_name"])
+				var quantile float64
+				if val, ok := sample.Metric[model.QuantileLabel]; ok {
+					var err error
+					if quantile, err = strconv.ParseFloat(string(val), 64); err != nil {
+						continue
+					}
+					framework.Logf("Metric: %v ResourceName: %v Quantile: %v Latency: %v", msKey, resource, quantile, latency)
+				}
+			}
+		case kubeletmetrics.KubeletSubsystem + "_" + kubeletmetrics.DevicePluginRegistrationCountKey:
+			for _, sample := range samples {
+				resource := string(sample.Metric["resource_name"])
+				count := sample.Value
+				framework.Logf("Metric: %v ResourceName: %v Count: %v", msKey, resource, count)
+			}
+		}
+	}
+}
+
 func makeCudaPauseImage() *v1.Pod {
 	podName := testPodNamePrefix + string(uuid.NewUUID())