memorymanager:metrics: add metrics

As part of the memory manager GA graduation effort, we should add
metrics in order to iprove observability.

The metrics also mentioned in the PR https://github.com/kubernetes/enhancements/pull/4251 (which was not merged yet)

Signed-off-by: Talor Itzhak <titzhak@redhat.com>
This commit is contained in:
Talor Itzhak 2023-11-07 12:40:45 +02:00
parent 246d363ea4
commit ddd60de3f3
2 changed files with 36 additions and 1 deletions

View File

@ -34,6 +34,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
"k8s.io/kubernetes/pkg/kubelet/metrics"
"k8s.io/kubernetes/pkg/kubelet/types"
)
@ -95,7 +96,7 @@ func (p *staticPolicy) Start(s state.State) error {
}
// Allocate call is idempotent
func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) error {
func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) (rerr error) {
// allocate the memory only for guaranteed pods
if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
return nil
@ -103,6 +104,13 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
podUID := string(pod.UID)
klog.InfoS("Allocate", "pod", klog.KObj(pod), "containerName", container.Name)
// container belongs in an exclusively allocated pool
metrics.MemoryManagerPinningRequestTotal.Inc()
defer func() {
if rerr != nil {
metrics.MemoryManagerPinningErrorsTotal.Inc()
}
}()
if blocks := s.GetMemoryBlocks(podUID, container.Name); blocks != nil {
p.updatePodReusableMemory(pod, container, blocks)

View File

@ -108,6 +108,10 @@ const (
CPUManagerPinningRequestsTotalKey = "cpu_manager_pinning_requests_total"
CPUManagerPinningErrorsTotalKey = "cpu_manager_pinning_errors_total"
// Metrics to track the Memory manager behavior
MemoryManagerPinningRequestsTotalKey = "memory_manager_pinning_requests_total"
MemoryManagerPinningErrorsTotalKey = "memory_manager_pinning_errors_total"
// Metrics to track the Topology manager behavior
TopologyManagerAdmissionRequestsTotalKey = "topology_manager_admission_requests_total"
TopologyManagerAdmissionErrorsTotalKey = "topology_manager_admission_errors_total"
@ -719,6 +723,25 @@ var (
},
)
// MemoryManagerPinningRequestTotal tracks the number of times the pod spec required the memory manager to pin memory pages
MemoryManagerPinningRequestTotal = metrics.NewCounter(
&metrics.CounterOpts{
Subsystem: KubeletSubsystem,
Name: MemoryManagerPinningRequestsTotalKey,
Help: "The number of memory pages allocations which required pinning.",
StabilityLevel: metrics.ALPHA,
})
// MemoryManagerPinningErrorsTotal tracks the number of times the pod spec required the memory manager to pin memory pages, but the allocation failed
MemoryManagerPinningErrorsTotal = metrics.NewCounter(
&metrics.CounterOpts{
Subsystem: KubeletSubsystem,
Name: MemoryManagerPinningErrorsTotalKey,
Help: "The number of memory pages allocations which required pinning that failed.",
StabilityLevel: metrics.ALPHA,
},
)
// TopologyManagerAdmissionRequestsTotal tracks the number of times the pod spec will cause the topology manager to admit a pod
TopologyManagerAdmissionRequestsTotal = metrics.NewCounter(
&metrics.CounterOpts{
@ -887,6 +910,10 @@ func Register(collectors ...metrics.StableCollector) {
legacyregistry.MustRegister(RunPodSandboxErrors)
legacyregistry.MustRegister(CPUManagerPinningRequestsTotal)
legacyregistry.MustRegister(CPUManagerPinningErrorsTotal)
if utilfeature.DefaultFeatureGate.Enabled(features.MemoryManager) {
legacyregistry.MustRegister(MemoryManagerPinningRequestTotal)
legacyregistry.MustRegister(MemoryManagerPinningErrorsTotal)
}
legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal)
legacyregistry.MustRegister(TopologyManagerAdmissionErrorsTotal)
legacyregistry.MustRegister(TopologyManagerAdmissionDuration)