diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go
index 0dbe757b3c5..56236e95096 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static.go
@@ -325,13 +325,15 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
 	defer func() {
 		if rerr != nil {
 			metrics.CPUManagerPinningErrorsTotal.Inc()
+			if p.options.FullPhysicalCPUsOnly {
+				metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
+			}
 			return
 		}
-		if !p.options.FullPhysicalCPUsOnly {
+		if p.options.FullPhysicalCPUsOnly {
 			// increment only if we know we allocate aligned resources
-			return
+			metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
 		}
-		metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
 	}()
 
 	if p.options.FullPhysicalCPUsOnly {
@@ -752,6 +754,7 @@ func (p *staticPolicy) getAlignedCPUs(numaAffinity bitmask.BitMask, allocatableC
 func (p *staticPolicy) initializeMetrics(s state.State) {
 	metrics.CPUManagerSharedPoolSizeMilliCores.Set(float64(p.GetAvailableCPUs(s).Size() * 1000))
 	metrics.CPUManagerExclusiveCPUsAllocationCount.Set(float64(countExclusiveCPUs(s)))
+	metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Add(0) // ensure the value exists
 }
 
 func (p *staticPolicy) updateMetricsOnAllocate(cset cpuset.CPUSet) {
diff --git a/pkg/kubelet/cm/topologymanager/scope_container.go b/pkg/kubelet/cm/topologymanager/scope_container.go
index 7bdbba68dc2..7c06c090cc6 100644
--- a/pkg/kubelet/cm/topologymanager/scope_container.go
+++ b/pkg/kubelet/cm/topologymanager/scope_container.go
@@ -50,6 +50,9 @@ func (s *containerScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
 		klog.InfoS("Best TopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod), "containerName", container.Name)
 
 		if !admit {
+			if IsAlignmentGuaranteed(s.policy) {
+				metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Inc()
+			}
 			metrics.TopologyManagerAdmissionErrorsTotal.Inc()
 			return admission.GetPodAdmitResult(&TopologyAffinityError{})
 		}
diff --git a/pkg/kubelet/cm/topologymanager/scope_pod.go b/pkg/kubelet/cm/topologymanager/scope_pod.go
index d06d9578256..bcb421d61e4 100644
--- a/pkg/kubelet/cm/topologymanager/scope_pod.go
+++ b/pkg/kubelet/cm/topologymanager/scope_pod.go
@@ -48,6 +48,10 @@ func (s *podScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
 	bestHint, admit := s.calculateAffinity(pod)
 	klog.InfoS("Best TopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod))
 	if !admit {
+		if IsAlignmentGuaranteed(s.policy) {
+			// count the failure only when the policy guarantees aligned resource allocation
+			metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Inc()
+		}
 		metrics.TopologyManagerAdmissionErrorsTotal.Inc()
 		return admission.GetPodAdmitResult(&TopologyAffinityError{})
 	}
diff --git a/pkg/kubelet/cm/topologymanager/topology_manager.go b/pkg/kubelet/cm/topologymanager/topology_manager.go
index d9e244952ec..ccaba099f80 100644
--- a/pkg/kubelet/cm/topologymanager/topology_manager.go
+++ b/pkg/kubelet/cm/topologymanager/topology_manager.go
@@ -188,9 +188,19 @@ func NewManager(topology []cadvisorapi.Node, topologyPolicyName string, topology
 		scope: scope,
 	}
 
+	manager.initializeMetrics()
+
 	return manager, nil
 }
 
+func (m *manager) initializeMetrics() {
+	// ensure the values exist
+	metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Add(0)
+	metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Add(0)
+	metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Add(0)
+	metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Add(0)
+}
+
 func (m *manager) GetAffinity(podUID string, containerName string) TopologyHint {
 	return m.scope.GetAffinity(podUID, containerName)
 }
diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go
index 28078e451ce..f3530391f24 100644
--- a/pkg/kubelet/metrics/metrics.go
+++ b/pkg/kubelet/metrics/metrics.go
@@ -132,6 +132,7 @@ const (
 
 	// Metric for tracking aligment of compute resources
 	ContainerAlignedComputeResourcesNameKey = "container_aligned_compute_resources_count"
+	ContainerAlignedComputeResourcesFailureNameKey = "container_aligned_compute_resources_failure_count"
 	ContainerAlignedComputeResourcesScopeLabelKey = "scope"
 	ContainerAlignedComputeResourcesBoundaryLabelKey = "boundary"
 
@@ -818,7 +819,18 @@ var (
 		},
 		[]string{ContainerAlignedComputeResourcesScopeLabelKey, ContainerAlignedComputeResourcesBoundaryLabelKey},
 	)
-	// MemoryManagerPinningRequestTotal tracks the number of times the pod spec required the memory manager to pin memory pages
+
+	// ContainerAlignedComputeResourcesFailure reports the count of resource allocation attempts that failed to align resources, per alignment boundary
+	ContainerAlignedComputeResourcesFailure = metrics.NewCounterVec(
+		&metrics.CounterOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           ContainerAlignedComputeResourcesFailureNameKey,
+			Help:           "Cumulative number of failures to allocate aligned compute resources to containers by alignment type.",
+			StabilityLevel: metrics.ALPHA,
+		},
+		[]string{ContainerAlignedComputeResourcesScopeLabelKey, ContainerAlignedComputeResourcesBoundaryLabelKey},
+	)
+
 	MemoryManagerPinningRequestTotal = metrics.NewCounter(
 		&metrics.CounterOpts{
 			Subsystem: KubeletSubsystem,
@@ -1079,6 +1091,7 @@ func Register(collectors ...metrics.StableCollector) {
 		legacyregistry.MustRegister(CPUManagerSharedPoolSizeMilliCores)
 		legacyregistry.MustRegister(CPUManagerExclusiveCPUsAllocationCount)
 		legacyregistry.MustRegister(ContainerAlignedComputeResources)
+		legacyregistry.MustRegister(ContainerAlignedComputeResourcesFailure)
 		legacyregistry.MustRegister(MemoryManagerPinningRequestTotal)
 		legacyregistry.MustRegister(MemoryManagerPinningErrorsTotal)
 		legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal)
diff --git a/test/e2e_node/cpu_manager_metrics_test.go b/test/e2e_node/cpu_manager_metrics_test.go
index 7ec1db48451..03afe0c1e79 100644
--- a/test/e2e_node/cpu_manager_metrics_test.go
+++ b/test/e2e_node/cpu_manager_metrics_test.go
@@ -104,6 +104,7 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 		// being [Serial], we can also assume noone else but us is running pods.
 		ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with no pods running")
 
+		idFn := makeCustomPairID("scope", "boundary")
 		matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 			"kubelet_cpu_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(0),
@@ -111,6 +112,9 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 			"kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(0),
 			}),
+			"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+				"container::physical_cpu": timelessSample(0),
+			}),
 		})
 
 		ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@@ -127,6 +131,7 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 		// being [Serial], we can also assume noone else but us is running pods.
 		ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod failed to admit")
 
+		idFn := makeCustomPairID("scope", "boundary")
 		matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 			"kubelet_cpu_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(1),
@@ -134,6 +139,9 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 			"kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(1),
 			}),
+			"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+				"container::physical_cpu": timelessSample(1),
+			}),
 		})
 
 		ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@@ -150,6 +158,7 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 		// being [Serial], we can also assume noone else but us is running pods.
 		ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted")
 
+		idFn := makeCustomPairID("scope", "boundary")
 		matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 			"kubelet_cpu_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(1),
@@ -157,6 +166,9 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 			"kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(0),
 			}),
+			"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+				"container::physical_cpu": timelessSample(0),
+			}),
 		})
 
 		ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@@ -178,6 +190,9 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 			"kubelet_container_aligned_compute_resources_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
 				"container::physical_cpu": timelessSample(1),
 			}),
+			"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+				"container::physical_cpu": timelessSample(0),
+			}),
 		})
 
 		ginkgo.By("Giving the Kubelet time to update the alignment metrics")
diff --git a/test/e2e_node/topology_manager_metrics_test.go b/test/e2e_node/topology_manager_metrics_test.go
index 0fef30e82ec..595d51b16ea 100644
--- a/test/e2e_node/topology_manager_metrics_test.go
+++ b/test/e2e_node/topology_manager_metrics_test.go
@@ -28,7 +28,6 @@ import (
 	v1 "k8s.io/api/core/v1"
 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
 	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
-	"k8s.io/kubernetes/pkg/kubelet/metrics"
 	"k8s.io/kubernetes/test/e2e/feature"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
@@ -84,6 +83,7 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 		// being [Serial], we can also assume noone else but us is running pods.
 		ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with no pods running")
 
+		idFn := makeCustomPairID("scope", "boundary")
 		matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 			"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(0),
@@ -91,6 +91,10 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 			"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(0),
 			}),
+			"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+				"container::numa_node": timelessSample(0),
+				"pod::numa_node": timelessSample(0),
+			}),
 			"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
 				"": timelessSample(0),
 			}),
@@ -110,6 +114,7 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 		// being [Serial], we can also assume noone else but us is running pods.
 		ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with pod failed to admit")
 
+		idFn := makeCustomPairID("scope", "boundary")
 		matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 			"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(1),
@@ -117,6 +122,10 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 			"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(1),
 			}),
+			"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+				"container::numa_node": timelessSample(0),
+				"pod::numa_node": timelessSample(1),
+			}),
 			"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
 				"": checkMetricValueGreaterThan(0),
 			}),
@@ -136,6 +145,7 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 		// being [Serial], we can also assume noone else but us is running pods.
 		ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with pod should be admitted")
 
+		idFn := makeCustomPairID("scope", "boundary")
 		matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 			"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(1),
@@ -143,6 +153,10 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 			"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(0),
 			}),
+			"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+				"container::numa_node": timelessSample(0),
+				"pod::numa_node": timelessSample(0),
+			}),
 			"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
 				"": checkMetricValueGreaterThan(0),
 			}),
@@ -162,9 +176,15 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 		// being [Serial], we can also assume noone else but us is running pods.
 		ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted")
 
+		idFn := makeCustomPairID("scope", "boundary")
 		matchAlignmentMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
-			"kubelet_container_aligned_compute_resources_count": gstruct.MatchAllElements(nodeID, gstruct.Elements{
-				metrics.AlignedNUMANode: timelessSample(1),
+			"kubelet_container_aligned_compute_resources_count": gstruct.MatchAllElements(idFn, gstruct.Elements{
+				"container::numa_node": timelessSample(0),
+				"pod::numa_node": timelessSample(1),
+			}),
+			"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+				"container::numa_node": timelessSample(0),
+				"pod::numa_node": timelessSample(0),
 			}),
 		})