Adds device plugin registration count metric and allocation latency metric.

This commit is contained in:
Jiaying Zhang 2017-11-08 13:12:28 -08:00
parent 5337ff8009
commit 048bafdd0b
5 changed files with 61 additions and 0 deletions

View File

@ -23,6 +23,7 @@ go_library(
"//pkg/kubelet/config:go_default_library", "//pkg/kubelet/config:go_default_library",
"//pkg/kubelet/container:go_default_library", "//pkg/kubelet/container:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library", "//pkg/kubelet/lifecycle:go_default_library",
"//pkg/kubelet/metrics:go_default_library",
"//plugin/pkg/scheduler/schedulercache:go_default_library", "//plugin/pkg/scheduler/schedulercache:go_default_library",
"//vendor/github.com/golang/glog:go_default_library", "//vendor/github.com/golang/glog:go_default_library",
"//vendor/golang.org/x/net/context:go_default_library", "//vendor/golang.org/x/net/context:go_default_library",

View File

@ -24,6 +24,7 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"sync" "sync"
"time"
"github.com/golang/glog" "github.com/golang/glog"
"golang.org/x/net/context" "golang.org/x/net/context"
@ -36,6 +37,7 @@ import (
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha" pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
"k8s.io/kubernetes/pkg/kubelet/config" "k8s.io/kubernetes/pkg/kubelet/config"
"k8s.io/kubernetes/pkg/kubelet/lifecycle" "k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/metrics"
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache" "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
) )
@ -265,6 +267,7 @@ func (m *ManagerImpl) Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.P
// Register registers a device plugin. // Register registers a device plugin.
func (m *ManagerImpl) Register(ctx context.Context, r *pluginapi.RegisterRequest) (*pluginapi.Empty, error) { func (m *ManagerImpl) Register(ctx context.Context, r *pluginapi.RegisterRequest) (*pluginapi.Empty, error) {
glog.Infof("Got registration request from device plugin with resource name %q", r.ResourceName) glog.Infof("Got registration request from device plugin with resource name %q", r.ResourceName)
metrics.DevicePluginRegistrationCount.WithLabelValues(r.ResourceName).Inc()
if r.Version != pluginapi.Version { if r.Version != pluginapi.Version {
errorString := fmt.Sprintf(errUnsuportedVersion, r.Version, pluginapi.Version) errorString := fmt.Sprintf(errUnsuportedVersion, r.Version, pluginapi.Version)
glog.Infof("Bad registration request from device plugin with resource name %q: %v", r.ResourceName, errorString) glog.Infof("Bad registration request from device plugin with resource name %q: %v", r.ResourceName, errorString)
@ -548,6 +551,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
if allocDevices == nil || len(allocDevices) <= 0 { if allocDevices == nil || len(allocDevices) <= 0 {
continue continue
} }
startRPCTime := time.Now()
// devicePluginManager.Allocate involves RPC calls to device plugin, which // devicePluginManager.Allocate involves RPC calls to device plugin, which
// could be heavy-weight. Therefore we want to perform this operation outside // could be heavy-weight. Therefore we want to perform this operation outside
// mutex lock. Note if Allocate call fails, we may leave container resources // mutex lock. Note if Allocate call fails, we may leave container resources
@ -573,6 +577,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
devs := allocDevices.UnsortedList() devs := allocDevices.UnsortedList()
glog.V(3).Infof("Making allocation request for devices %v for device plugin %s", devs, resource) glog.V(3).Infof("Making allocation request for devices %v for device plugin %s", devs, resource)
resp, err := e.allocate(devs) resp, err := e.allocate(devs)
metrics.DevicePluginAllocationLatency.WithLabelValues(resource).Observe(metrics.SinceInMicroseconds(startRPCTime))
if err != nil { if err != nil {
// In case of allocation failure, we want to restore m.allocatedDevices // In case of allocation failure, we want to restore m.allocatedDevices
// to the actual allocated state from m.podDevices. // to the actual allocated state from m.podDevices.

View File

@ -44,6 +44,9 @@ const (
RuntimeOperationsKey = "runtime_operations" RuntimeOperationsKey = "runtime_operations"
RuntimeOperationsLatencyKey = "runtime_operations_latency_microseconds" RuntimeOperationsLatencyKey = "runtime_operations_latency_microseconds"
RuntimeOperationsErrorsKey = "runtime_operations_errors" RuntimeOperationsErrorsKey = "runtime_operations_errors"
// Metrics keys of device plugin operations
DevicePluginRegistrationCountKey = "device_plugin_registration_count"
DevicePluginAllocationLatencyKey = "device_plugin_alloc_latency_microseconds"
) )
var ( var (
@ -179,6 +182,22 @@ var (
}, },
[]string{"namespace", "persistentvolumeclaim"}, []string{"namespace", "persistentvolumeclaim"},
) )
// DevicePluginRegistrationCount is a counter vector tracking the cumulative
// number of device plugin registrations, labeled by "resource_name".
DevicePluginRegistrationCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: KubeletSubsystem,
Name: DevicePluginRegistrationCountKey,
Help: "Cumulative number of device plugin registrations. Broken down by resource name.",
},
// One time series per device plugin resource name.
[]string{"resource_name"},
)
// DevicePluginAllocationLatency is a summary vector of the latency, in
// microseconds, to serve a device plugin Allocation request, labeled by
// "resource_name".
DevicePluginAllocationLatency = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Subsystem: KubeletSubsystem,
Name: DevicePluginAllocationLatencyKey,
Help: "Latency in microseconds to serve a device plugin Allocation request. Broken down by resource name.",
},
// One summary per device plugin resource name.
[]string{"resource_name"},
)
) )
var registerMetrics sync.Once var registerMetrics sync.Once
@ -205,6 +224,8 @@ func Register(containerCache kubecontainer.RuntimeCache) {
prometheus.MustRegister(VolumeStatsInodes) prometheus.MustRegister(VolumeStatsInodes)
prometheus.MustRegister(VolumeStatsInodesFree) prometheus.MustRegister(VolumeStatsInodesFree)
prometheus.MustRegister(VolumeStatsInodesUsed) prometheus.MustRegister(VolumeStatsInodesUsed)
prometheus.MustRegister(DevicePluginRegistrationCount)
prometheus.MustRegister(DevicePluginAllocationLatency)
}) })
} }

View File

@ -47,6 +47,7 @@ go_library(
"//vendor/github.com/golang/glog:go_default_library", "//vendor/github.com/golang/glog:go_default_library",
"//vendor/github.com/onsi/ginkgo:go_default_library", "//vendor/github.com/onsi/ginkgo:go_default_library",
"//vendor/github.com/onsi/gomega:go_default_library", "//vendor/github.com/onsi/gomega:go_default_library",
"//vendor/github.com/prometheus/common/model:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library", "//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library", "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",

View File

@ -19,6 +19,7 @@ package e2e_node
import ( import (
"os/exec" "os/exec"
"regexp" "regexp"
"strconv"
"time" "time"
"k8s.io/api/core/v1" "k8s.io/api/core/v1"
@ -27,10 +28,13 @@ import (
"k8s.io/apimachinery/pkg/util/uuid" "k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig" "k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig"
kubeletmetrics "k8s.io/kubernetes/pkg/kubelet/metrics"
"k8s.io/kubernetes/test/e2e/framework" "k8s.io/kubernetes/test/e2e/framework"
"k8s.io/kubernetes/test/e2e/framework/metrics"
. "github.com/onsi/ginkgo" . "github.com/onsi/ginkgo"
. "github.com/onsi/gomega" . "github.com/onsi/gomega"
"github.com/prometheus/common/model"
) )
const ( const (
@ -121,6 +125,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
Expect(devIdRestart1).To(Equal(devId1)) Expect(devIdRestart1).To(Equal(devId1))
count2, devIdRestart2 = getDeviceId(f, p2.Name, p2.Name, count2+2) count2, devIdRestart2 = getDeviceId(f, p2.Name, p2.Name, count2+2)
Expect(devIdRestart2).To(Equal(devId2)) Expect(devIdRestart2).To(Equal(devId2))
logDevicePluginMetrics()
// Cleanup // Cleanup
f.PodClient().DeleteSync(p1.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout) f.PodClient().DeleteSync(p1.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
@ -129,6 +134,34 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
}) })
}) })
// logDevicePluginMetrics grabs the kubelet metrics of the test node and logs
// the device plugin samples found there: one line per quantile of the
// allocation latency metric and one line per registration count sample.
// A failure to grab the metrics fails the test via framework.ExpectNoError.
func logDevicePluginMetrics() {
	ms, err := metrics.GrabKubeletMetricsWithoutProxy(framework.TestContext.NodeName + ":10255")
	framework.ExpectNoError(err)

	for name, samples := range ms {
		switch name {
		case kubeletmetrics.KubeletSubsystem + "_" + kubeletmetrics.DevicePluginAllocationLatencyKey:
			for _, s := range samples {
				// Only samples that carry a quantile label are logged.
				rawQuantile, found := s.Metric[model.QuantileLabel]
				if !found {
					continue
				}
				// Samples whose quantile label is not a valid float are skipped.
				quantile, parseErr := strconv.ParseFloat(string(rawQuantile), 64)
				if parseErr != nil {
					continue
				}
				resourceName := string(s.Metric["resource_name"])
				framework.Logf("Metric: %v ResourceName: %v Quantile: %v Latency: %v", name, resourceName, quantile, s.Value)
			}
		case kubeletmetrics.KubeletSubsystem + "_" + kubeletmetrics.DevicePluginRegistrationCountKey:
			for _, s := range samples {
				resourceName := string(s.Metric["resource_name"])
				framework.Logf("Metric: %v ResourceName: %v Count: %v", name, resourceName, s.Value)
			}
		}
	}
}
func makeCudaPauseImage() *v1.Pod { func makeCudaPauseImage() *v1.Pod {
podName := testPodNamePrefix + string(uuid.NewUUID()) podName := testPodNamePrefix + string(uuid.NewUUID())