Merge pull request #55340 from jiayingz/metrics
Automatic merge from submit-queue (batch tested with PRs 55340, 55329, 56168, 56170, 56105). If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

Adds device plugin allocation latency metric. For #53497

**What this PR does / why we need it**:

**Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*:
Fixes #

**Special notes for your reviewer**:

**Release note**:
```release-note
```
Commit 3bb6eeeb07
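Before the diff itself, here is a minimal, self-contained sketch of the instrumentation pattern this change applies inside the kubelet: a registration counter and an allocation-latency summary, both labeled by resource name, with the summary observed around the device plugin Allocate RPC. The wrapper names (`sinceInMicroseconds`, `timedAllocate`) and the sample resource name are illustrative stand-ins, not the kubelet's actual API.

```go
// Minimal sketch (not the kubelet's actual code) of the metric pattern this PR adds.
package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

var (
	// Mirrors DevicePluginRegistrationCount from pkg/kubelet/metrics.
	registrationCount = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: "kubelet",
			Name:      "device_plugin_registration_count",
			Help:      "Cumulative number of device plugin registrations. Broken down by resource name.",
		},
		[]string{"resource_name"},
	)
	// Mirrors DevicePluginAllocationLatency from pkg/kubelet/metrics.
	allocationLatency = prometheus.NewSummaryVec(
		prometheus.SummaryOpts{
			Subsystem: "kubelet",
			Name:      "device_plugin_alloc_latency_microseconds",
			Help:      "Latency in microseconds to serve a device plugin Allocation request. Broken down by resource name.",
		},
		[]string{"resource_name"},
	)
)

// sinceInMicroseconds plays the role of metrics.SinceInMicroseconds in the kubelet.
func sinceInMicroseconds(start time.Time) float64 {
	return float64(time.Since(start).Nanoseconds() / int64(time.Microsecond))
}

// timedAllocate is a hypothetical wrapper showing where the PR takes its timestamps:
// right before the device plugin Allocate RPC and right after it returns.
func timedAllocate(resource string, rpc func() error) error {
	start := time.Now()
	err := rpc() // stands in for the gRPC Allocate call to the device plugin
	allocationLatency.WithLabelValues(resource).Observe(sinceInMicroseconds(start))
	return err
}

func main() {
	prometheus.MustRegister(registrationCount, allocationLatency)

	// A plugin registration bumps the counter for its resource name.
	registrationCount.WithLabelValues("nvidia.com/gpu").Inc()

	// A (fake) allocation RPC records its latency in the summary.
	_ = timedAllocate("nvidia.com/gpu", func() error {
		time.Sleep(5 * time.Millisecond)
		return nil
	})
	fmt.Println("metrics recorded; in the kubelet they are exposed on the /metrics endpoint")
}
```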
@@ -23,6 +23,7 @@ go_library(
         "//pkg/kubelet/config:go_default_library",
         "//pkg/kubelet/container:go_default_library",
         "//pkg/kubelet/lifecycle:go_default_library",
+        "//pkg/kubelet/metrics:go_default_library",
         "//plugin/pkg/scheduler/schedulercache:go_default_library",
         "//vendor/github.com/golang/glog:go_default_library",
         "//vendor/golang.org/x/net/context:go_default_library",
@@ -24,6 +24,7 @@ import (
 	"os"
 	"path/filepath"
 	"sync"
+	"time"
 
 	"github.com/golang/glog"
 	"golang.org/x/net/context"
@@ -36,6 +37,7 @@ import (
 	pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
 	"k8s.io/kubernetes/pkg/kubelet/config"
 	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
+	"k8s.io/kubernetes/pkg/kubelet/metrics"
 	"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
 )
 
@@ -265,6 +267,7 @@ func (m *ManagerImpl) Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.P
 // Register registers a device plugin.
 func (m *ManagerImpl) Register(ctx context.Context, r *pluginapi.RegisterRequest) (*pluginapi.Empty, error) {
 	glog.Infof("Got registration request from device plugin with resource name %q", r.ResourceName)
+	metrics.DevicePluginRegistrationCount.WithLabelValues(r.ResourceName).Inc()
 	if r.Version != pluginapi.Version {
 		errorString := fmt.Sprintf(errUnsuportedVersion, r.Version, pluginapi.Version)
 		glog.Infof("Bad registration request from device plugin with resource name %q: %v", r.ResourceName, errorString)
@@ -548,6 +551,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
 		if allocDevices == nil || len(allocDevices) <= 0 {
 			continue
 		}
+		startRPCTime := time.Now()
 		// devicePluginManager.Allocate involves RPC calls to device plugin, which
 		// could be heavy-weight. Therefore we want to perform this operation outside
 		// mutex lock. Note if Allocate call fails, we may leave container resources
@@ -573,6 +577,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
 		devs := allocDevices.UnsortedList()
 		glog.V(3).Infof("Making allocation request for devices %v for device plugin %s", devs, resource)
 		resp, err := e.allocate(devs)
+		metrics.DevicePluginAllocationLatency.WithLabelValues(resource).Observe(metrics.SinceInMicroseconds(startRPCTime))
 		if err != nil {
 			// In case of allocation failure, we want to restore m.allocatedDevices
 			// to the actual allocated state from m.podDevices.
@@ -44,6 +44,9 @@ const (
 	RuntimeOperationsKey = "runtime_operations"
 	RuntimeOperationsLatencyKey = "runtime_operations_latency_microseconds"
 	RuntimeOperationsErrorsKey = "runtime_operations_errors"
+	// Metrics keys of device plugin operations
+	DevicePluginRegistrationCountKey = "device_plugin_registration_count"
+	DevicePluginAllocationLatencyKey = "device_plugin_alloc_latency_microseconds"
 )
 
 var (
@@ -179,6 +182,22 @@ var (
 		},
 		[]string{"namespace", "persistentvolumeclaim"},
 	)
+	DevicePluginRegistrationCount = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: KubeletSubsystem,
+			Name:      DevicePluginRegistrationCountKey,
+			Help:      "Cumulative number of device plugin registrations. Broken down by resource name.",
+		},
+		[]string{"resource_name"},
+	)
+	DevicePluginAllocationLatency = prometheus.NewSummaryVec(
+		prometheus.SummaryOpts{
+			Subsystem: KubeletSubsystem,
+			Name:      DevicePluginAllocationLatencyKey,
+			Help:      "Latency in microseconds to serve a device plugin Allocation request. Broken down by resource name.",
+		},
+		[]string{"resource_name"},
+	)
 )
 
 var registerMetrics sync.Once
@@ -205,6 +224,8 @@ func Register(containerCache kubecontainer.RuntimeCache) {
 		prometheus.MustRegister(VolumeStatsInodes)
 		prometheus.MustRegister(VolumeStatsInodesFree)
 		prometheus.MustRegister(VolumeStatsInodesUsed)
+		prometheus.MustRegister(DevicePluginRegistrationCount)
+		prometheus.MustRegister(DevicePluginAllocationLatency)
 	})
 }
 
@@ -47,6 +47,7 @@ go_library(
         "//vendor/github.com/golang/glog:go_default_library",
         "//vendor/github.com/onsi/ginkgo:go_default_library",
         "//vendor/github.com/onsi/gomega:go_default_library",
+        "//vendor/github.com/prometheus/common/model:go_default_library",
         "//vendor/k8s.io/api/core/v1:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
@@ -19,6 +19,7 @@ package e2e_node
 import (
 	"os/exec"
 	"regexp"
+	"strconv"
 	"time"
 
 	"k8s.io/api/core/v1"
@@ -27,10 +28,13 @@ import (
 	"k8s.io/apimachinery/pkg/util/uuid"
 	"k8s.io/kubernetes/pkg/features"
 	"k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig"
+	kubeletmetrics "k8s.io/kubernetes/pkg/kubelet/metrics"
 	"k8s.io/kubernetes/test/e2e/framework"
+	"k8s.io/kubernetes/test/e2e/framework/metrics"
 
 	. "github.com/onsi/ginkgo"
 	. "github.com/onsi/gomega"
+	"github.com/prometheus/common/model"
 )
 
 const (
@@ -121,6 +125,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
 		Expect(devIdRestart1).To(Equal(devId1))
 		count2, devIdRestart2 = getDeviceId(f, p2.Name, p2.Name, count2+2)
 		Expect(devIdRestart2).To(Equal(devId2))
+		logDevicePluginMetrics()
 
 		// Cleanup
 		f.PodClient().DeleteSync(p1.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
@@ -129,6 +134,34 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
 	})
 })
 
+func logDevicePluginMetrics() {
+	ms, err := metrics.GrabKubeletMetricsWithoutProxy(framework.TestContext.NodeName + ":10255")
+	framework.ExpectNoError(err)
+	for msKey, samples := range ms {
+		switch msKey {
+		case kubeletmetrics.KubeletSubsystem + "_" + kubeletmetrics.DevicePluginAllocationLatencyKey:
+			for _, sample := range samples {
+				latency := sample.Value
+				resource := string(sample.Metric["resource_name"])
+				var quantile float64
+				if val, ok := sample.Metric[model.QuantileLabel]; ok {
+					var err error
+					if quantile, err = strconv.ParseFloat(string(val), 64); err != nil {
+						continue
+					}
+					framework.Logf("Metric: %v ResourceName: %v Quantile: %v Latency: %v", msKey, resource, quantile, latency)
+				}
+			}
+		case kubeletmetrics.KubeletSubsystem + "_" + kubeletmetrics.DevicePluginRegistrationCountKey:
+			for _, sample := range samples {
+				resource := string(sample.Metric["resource_name"])
+				count := sample.Value
+				framework.Logf("Metric: %v ResourceName: %v Count: %v", msKey, resource, count)
+			}
+		}
+	}
+}
+
 func makeCudaPauseImage() *v1.Pod {
 	podName := testPodNamePrefix + string(uuid.NewUUID())
 
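For manual verification on a node, here is a small sketch along the same lines as the e2e helper above: it scrapes the kubelet's read-only metrics endpoint (port 10255, the same port GrabKubeletMetricsWithoutProxy uses) and prints only the two series this change introduces. The address is an assumption; substitute the node's hostname or IP.

```go
// Sketch: dump the device plugin metrics added by this PR from a kubelet's read-only port.
package main

import (
	"bufio"
	"fmt"
	"net/http"
	"os"
	"strings"
)

func main() {
	// Assumed endpoint; replace 127.0.0.1 with the node address under test.
	resp, err := http.Get("http://127.0.0.1:10255/metrics")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	defer resp.Body.Close()

	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		line := scanner.Text()
		// Keep only the series added by this PR (subsystem "kubelet" + the new metric names).
		if strings.Contains(line, "kubelet_device_plugin_registration_count") ||
			strings.Contains(line, "kubelet_device_plugin_alloc_latency_microseconds") {
			fmt.Println(line)
		}
	}
	if err := scanner.Err(); err != nil {
		fmt.Fprintln(os.Stderr, err)
	}
}
```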