From 172c55d310cba28a970e8af02064674b7719ee0d Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Tue, 17 Jan 2023 17:39:21 +0000 Subject: [PATCH] node: topologymgr: add metrics about admission requests and errors Signed-off-by: Swati Sehgal --- .../cm/topologymanager/scope_container.go | 3 +++ pkg/kubelet/cm/topologymanager/scope_pod.go | 3 +++ .../cm/topologymanager/topology_manager.go | 3 +++ pkg/kubelet/metrics/metrics.go | 26 +++++++++++++++++++ 4 files changed, 35 insertions(+) diff --git a/pkg/kubelet/cm/topologymanager/scope_container.go b/pkg/kubelet/cm/topologymanager/scope_container.go index 1e4e2f58fc0..fd90ac549fb 100644 --- a/pkg/kubelet/cm/topologymanager/scope_container.go +++ b/pkg/kubelet/cm/topologymanager/scope_container.go @@ -22,6 +22,7 @@ import ( "k8s.io/kubernetes/pkg/kubelet/cm/admission" "k8s.io/kubernetes/pkg/kubelet/cm/containermap" "k8s.io/kubernetes/pkg/kubelet/lifecycle" + "k8s.io/kubernetes/pkg/kubelet/metrics" ) type containerScope struct { @@ -54,6 +55,7 @@ func (s *containerScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult { klog.InfoS("Best TopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod), "containerName", container.Name) if !admit { + metrics.TopologyManagerAdmissionErrorsTotal.Inc() return admission.GetPodAdmitResult(&TopologyAffinityError{}) } klog.InfoS("Topology Affinity", "bestHint", bestHint, "pod", klog.KObj(pod), "containerName", container.Name) @@ -61,6 +63,7 @@ func (s *containerScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult { err := s.allocateAlignedResources(pod, &container) if err != nil { + metrics.TopologyManagerAdmissionErrorsTotal.Inc() return admission.GetPodAdmitResult(err) } } diff --git a/pkg/kubelet/cm/topologymanager/scope_pod.go b/pkg/kubelet/cm/topologymanager/scope_pod.go index b77682597b8..ffcf7917167 100644 --- a/pkg/kubelet/cm/topologymanager/scope_pod.go +++ b/pkg/kubelet/cm/topologymanager/scope_pod.go @@ -22,6 +22,7 @@ import ( "k8s.io/kubernetes/pkg/kubelet/cm/admission" 
"k8s.io/kubernetes/pkg/kubelet/cm/containermap" "k8s.io/kubernetes/pkg/kubelet/lifecycle" + "k8s.io/kubernetes/pkg/kubelet/metrics" ) type podScope struct { @@ -52,6 +53,7 @@ func (s *podScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult { bestHint, admit := s.calculateAffinity(pod) klog.InfoS("Best TopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod)) if !admit { + metrics.TopologyManagerAdmissionErrorsTotal.Inc() return admission.GetPodAdmitResult(&TopologyAffinityError{}) } @@ -61,6 +63,7 @@ func (s *podScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult { err := s.allocateAlignedResources(pod, &container) if err != nil { + metrics.TopologyManagerAdmissionErrorsTotal.Inc() return admission.GetPodAdmitResult(err) } } diff --git a/pkg/kubelet/cm/topologymanager/topology_manager.go b/pkg/kubelet/cm/topologymanager/topology_manager.go index 16e0aceb46d..8b288aa6dec 100644 --- a/pkg/kubelet/cm/topologymanager/topology_manager.go +++ b/pkg/kubelet/cm/topologymanager/topology_manager.go @@ -24,6 +24,7 @@ import ( "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask" "k8s.io/kubernetes/pkg/kubelet/lifecycle" + "k8s.io/kubernetes/pkg/kubelet/metrics" ) const ( @@ -208,6 +209,8 @@ func (m *manager) RemoveContainer(containerID string) error { func (m *manager) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult { klog.InfoS("Topology Admit Handler") + + metrics.TopologyManagerAdmissionRequestsTotal.Inc() pod := attrs.Pod return m.scope.Admit(pod) diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go index fe8d3c00c0e..4a265be27ed 100644 --- a/pkg/kubelet/metrics/metrics.go +++ b/pkg/kubelet/metrics/metrics.go @@ -91,6 +91,10 @@ const ( CPUManagerPinningRequestsTotalKey = "cpu_manager_pinning_requests_total" CPUManagerPinningErrorsTotalKey = "cpu_manager_pinning_errors_total" + // Metrics to track the Topology manager behavior + TopologyManagerAdmissionRequestsTotalKey = 
"topology_manager_admission_requests_total" + TopologyManagerAdmissionErrorsTotalKey = "topology_manager_admission_errors_total" + // Values used in metric labels Container = "container" InitContainer = "init_container" @@ -549,6 +553,26 @@ var ( StabilityLevel: metrics.ALPHA, }, ) + + // TopologyManagerAdmissionRequestsTotal tracks the number of pod admission requests evaluated by the topology manager + TopologyManagerAdmissionRequestsTotal = metrics.NewCounter( + &metrics.CounterOpts{ + Subsystem: KubeletSubsystem, + Name: TopologyManagerAdmissionRequestsTotalKey, + Help: "The number of admission requests where resources have to be aligned.", + StabilityLevel: metrics.ALPHA, + }, + ) + + // TopologyManagerAdmissionErrorsTotal tracks the number of pod admission requests that the topology manager rejected because resources could not be aligned + TopologyManagerAdmissionErrorsTotal = metrics.NewCounter( + &metrics.CounterOpts{ + Subsystem: KubeletSubsystem, + Name: TopologyManagerAdmissionErrorsTotalKey, + Help: "The number of admission request failures where resources could not be aligned.", + StabilityLevel: metrics.ALPHA, + }, + ) ) var registerMetrics sync.Once @@ -600,6 +624,8 @@ func Register(collectors ...metrics.StableCollector) { legacyregistry.MustRegister(RunPodSandboxErrors) legacyregistry.MustRegister(CPUManagerPinningRequestsTotal) legacyregistry.MustRegister(CPUManagerPinningErrorsTotal) + legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal) + legacyregistry.MustRegister(TopologyManagerAdmissionErrorsTotal) for _, collector := range collectors { legacyregistry.CustomMustRegister(collector)