Merge pull request #129950 from ffromani/alignment-error-detail-metrics

node: metrics for alignment failures
Commit 05bfdbc6dd by Kubernetes Prow Robot, 2025-03-12 18:03:46 -07:00 (committed by GitHub)
7 changed files with 75 additions and 7 deletions

View File

@@ -325,13 +325,15 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
 	defer func() {
 		if rerr != nil {
 			metrics.CPUManagerPinningErrorsTotal.Inc()
+			if p.options.FullPhysicalCPUsOnly {
+				metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
+			}
 			return
 		}
-		if !p.options.FullPhysicalCPUsOnly {
+		if p.options.FullPhysicalCPUsOnly {
 			// increment only if we know we allocate aligned resources
-			return
+			metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
 		}
-		metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
 	}()
 	if p.options.FullPhysicalCPUsOnly {
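The deferred closure works because rerr is a named return value, so the closure observes whatever error Allocate ultimately returns. A minimal standalone sketch of the same success/failure accounting pattern (plain integer counters stand in for the kubelet's metrics; names are illustrative):

	package main

	import "fmt"

	// Illustrative stand-ins for the real kubelet counters.
	var pinningErrors, alignedFailures, alignedSuccesses int

	// allocate mirrors the control flow of staticPolicy.Allocate after this
	// change: a deferred closure inspects the named return value rerr and
	// records success or failure exactly once per call.
	func allocate(fullPhysicalCPUsOnly, fail bool) (rerr error) {
		defer func() {
			if rerr != nil {
				pinningErrors++
				if fullPhysicalCPUsOnly {
					alignedFailures++ // alignment was requested but not delivered
				}
				return
			}
			if fullPhysicalCPUsOnly {
				// increment only if we know we allocated aligned resources
				alignedSuccesses++
			}
		}()
		if fail {
			return fmt.Errorf("not enough cpus available")
		}
		return nil
	}

	func main() {
		allocate(true, false)
		allocate(true, true)
		fmt.Println(pinningErrors, alignedFailures, alignedSuccesses) // 1 1 1
	}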
@@ -752,6 +754,7 @@ func (p *staticPolicy) getAlignedCPUs(numaAffinity bitmask.BitMask, allocatableC
 func (p *staticPolicy) initializeMetrics(s state.State) {
 	metrics.CPUManagerSharedPoolSizeMilliCores.Set(float64(p.GetAvailableCPUs(s).Size() * 1000))
 	metrics.CPUManagerExclusiveCPUsAllocationCount.Set(float64(countExclusiveCPUs(s)))
+	metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Add(0) // ensure the value exists
 }

 func (p *staticPolicy) updateMetricsOnAllocate(cset cpuset.CPUSet) {
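The trailing Add(0) is the standard trick for materializing a labeled counter child: a CounterVec exports no series for a label combination until that combination is first touched, so without it a dashboard or alert would see no data at all rather than zero. A sketch of the effect with the upstream client_golang API (the kubelet actually goes through the k8s.io/component-base/metrics wrappers, but the child-creation semantics are the same):

	package main

	import (
		"fmt"

		"github.com/prometheus/client_golang/prometheus"
	)

	func main() {
		reg := prometheus.NewRegistry()
		failures := prometheus.NewCounterVec(
			prometheus.CounterOpts{Name: "aligned_compute_resources_failure_count"},
			[]string{"scope", "boundary"},
		)
		reg.MustRegister(failures)

		// Before the first touch, the vector exports no series at all.
		mfs, _ := reg.Gather()
		fmt.Println(len(mfs)) // 0

		// Touching the child with Add(0) creates the series at value 0.
		failures.WithLabelValues("container", "physical_cpu").Add(0)
		mfs, _ = reg.Gather()
		fmt.Println(len(mfs), len(mfs[0].GetMetric())) // 1 1
	}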

View File

@@ -50,6 +50,9 @@ func (s *containerScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
 		klog.InfoS("Best TopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod), "containerName", container.Name)
 		if !admit {
+			if IsAlignmentGuaranteed(s.policy) {
+				metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Inc()
+			}
 			metrics.TopologyManagerAdmissionErrorsTotal.Inc()
 			return admission.GetPodAdmitResult(&TopologyAffinityError{})
 		}

View File

@@ -48,6 +48,10 @@ func (s *podScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
 	bestHint, admit := s.calculateAffinity(pod)
 	klog.InfoS("Best TopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod))
 	if !admit {
+		if IsAlignmentGuaranteed(s.policy) {
+			// increment only if we know we allocate aligned resources.
+			metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Inc()
+		}
 		metrics.TopologyManagerAdmissionErrorsTotal.Inc()
 		return admission.GetPodAdmitResult(&TopologyAffinityError{})
 	}
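Both scopes gate the new failure counter on IsAlignmentGuaranteed(s.policy), so a rejected admission only counts as an alignment failure when the configured policy actually promises NUMA alignment. A self-contained sketch of that guard, assuming it keys off the single-numa-node policy (the types below are stand-ins; the real Policy lives in the topologymanager package):

	package main

	import "fmt"

	// Minimal stand-ins for the topology manager's Policy plumbing.
	type Policy interface{ Name() string }

	const PolicySingleNumaNode = "single-numa-node"

	type fakePolicy string

	func (p fakePolicy) Name() string { return string(p) }

	// IsAlignmentGuaranteed sketches the guard used above: only the
	// single-numa-node policy promises NUMA alignment, so only there does
	// a rejected admission count as an alignment failure. (Assumed
	// behavior, inferred from how the scopes use the guard.)
	func IsAlignmentGuaranteed(p Policy) bool {
		return p.Name() == PolicySingleNumaNode
	}

	func main() {
		fmt.Println(IsAlignmentGuaranteed(fakePolicy("single-numa-node"))) // true
		fmt.Println(IsAlignmentGuaranteed(fakePolicy("best-effort")))      // false
	}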

View File

@@ -188,9 +188,19 @@ func NewManager(topology []cadvisorapi.Node, topologyPolicyName string, topolog
 		scope: scope,
 	}

+	manager.initializeMetrics()
+
 	return manager, nil
 }

+func (m *manager) initializeMetrics() {
+	// ensure the values exist
+	metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Add(0)
+	metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Add(0)
+	metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Add(0)
+	metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Add(0)
+}
+
 func (m *manager) GetAffinity(podUID string, containerName string) TopologyHint {
 	return m.scope.GetAffinity(podUID, containerName)
 }

View File

@ -132,6 +132,7 @@ const (
// Metric for tracking aligment of compute resources // Metric for tracking aligment of compute resources
ContainerAlignedComputeResourcesNameKey = "container_aligned_compute_resources_count" ContainerAlignedComputeResourcesNameKey = "container_aligned_compute_resources_count"
ContainerAlignedComputeResourcesFailureNameKey = "container_aligned_compute_resources_failure_count"
ContainerAlignedComputeResourcesScopeLabelKey = "scope" ContainerAlignedComputeResourcesScopeLabelKey = "scope"
ContainerAlignedComputeResourcesBoundaryLabelKey = "boundary" ContainerAlignedComputeResourcesBoundaryLabelKey = "boundary"
@@ -818,7 +819,18 @@ var (
 		},
 		[]string{ContainerAlignedComputeResourcesScopeLabelKey, ContainerAlignedComputeResourcesBoundaryLabelKey},
 	)

+	// ContainerAlignedComputeResourcesFailure reports the count of resource allocation attempts that failed to align resources, per alignment boundary.
+	ContainerAlignedComputeResourcesFailure = metrics.NewCounterVec(
+		&metrics.CounterOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           ContainerAlignedComputeResourcesFailureNameKey,
+			Help:           "Cumulative number of failures to allocate aligned compute resources to containers by alignment type.",
+			StabilityLevel: metrics.ALPHA,
+		},
+		[]string{ContainerAlignedComputeResourcesScopeLabelKey, ContainerAlignedComputeResourcesBoundaryLabelKey},
+	)
+
 	// MemoryManagerPinningRequestTotal tracks the number of times the pod spec required the memory manager to pin memory pages
 	MemoryManagerPinningRequestTotal = metrics.NewCounter(
 		&metrics.CounterOpts{
 			Subsystem: KubeletSubsystem,
@@ -1079,6 +1091,7 @@ func Register(collectors ...metrics.StableCollector) {
 		legacyregistry.MustRegister(CPUManagerSharedPoolSizeMilliCores)
 		legacyregistry.MustRegister(CPUManagerExclusiveCPUsAllocationCount)
 		legacyregistry.MustRegister(ContainerAlignedComputeResources)
+		legacyregistry.MustRegister(ContainerAlignedComputeResourcesFailure)
 		legacyregistry.MustRegister(MemoryManagerPinningRequestTotal)
 		legacyregistry.MustRegister(MemoryManagerPinningErrorsTotal)
 		legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal)
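Once registered, a labeled counter declared this way can be asserted in unit tests by reading a single child back. A sketch using client_golang's testutil package; the declaration below is a simplified stand-in for the component-base definition above, not the kubelet's actual wiring:

	package main

	import (
		"fmt"

		"github.com/prometheus/client_golang/prometheus"
		"github.com/prometheus/client_golang/prometheus/testutil"
	)

	func main() {
		// Stand-in for ContainerAlignedComputeResourcesFailure; the real
		// metric is declared through k8s.io/component-base/metrics.
		failures := prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Subsystem: "kubelet",
				Name:      "container_aligned_compute_resources_failure_count",
				Help:      "illustrative copy of the new counter",
			},
			[]string{"scope", "boundary"},
		)

		failures.WithLabelValues("container", "physical_cpu").Inc()

		// testutil.ToFloat64 reads a single labeled child back for assertions.
		v := testutil.ToFloat64(failures.WithLabelValues("container", "physical_cpu"))
		fmt.Println(v) // 1
	}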

View File

@@ -104,6 +104,7 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 			// being [Serial], we can also assume no one else but us is running pods.
 			ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with no pods running")

+			idFn := makeCustomPairID("scope", "boundary")
 			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 				"kubelet_cpu_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
@@ -111,6 +112,9 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 				"kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::physical_cpu": timelessSample(0),
+				}),
 			})

 			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@@ -127,6 +131,7 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 			// being [Serial], we can also assume no one else but us is running pods.
 			ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod failed to admit")

+			idFn := makeCustomPairID("scope", "boundary")
 			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 				"kubelet_cpu_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(1),
@@ -134,6 +139,9 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 				"kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(1),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::physical_cpu": timelessSample(1),
+				}),
 			})

 			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@@ -150,6 +158,7 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 			// being [Serial], we can also assume no one else but us is running pods.
 			ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted")

+			idFn := makeCustomPairID("scope", "boundary")
 			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 				"kubelet_cpu_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(1),
@@ -157,6 +166,9 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 				"kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::physical_cpu": timelessSample(0),
+				}),
 			})

 			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@@ -178,6 +190,9 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 				"kubelet_container_aligned_compute_resources_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
 					"container::physical_cpu": timelessSample(1),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::physical_cpu": timelessSample(0),
+				}),
 			})

 			ginkgo.By("Giving the Kubelet time to update the alignment metrics")

View File

@@ -28,7 +28,6 @@ import (
 	v1 "k8s.io/api/core/v1"
 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
 	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
-	"k8s.io/kubernetes/pkg/kubelet/metrics"
 	"k8s.io/kubernetes/test/e2e/feature"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
@@ -84,6 +83,7 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 			// being [Serial], we can also assume no one else but us is running pods.
 			ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with no pods running")

+			idFn := makeCustomPairID("scope", "boundary")
 			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 				"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
@@ -91,6 +91,10 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 				"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::numa_node": timelessSample(0),
+					"pod::numa_node":       timelessSample(0),
+				}),
 				"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
 					"": timelessSample(0),
 				}),
@@ -110,6 +114,7 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 			// being [Serial], we can also assume no one else but us is running pods.
 			ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with pod failed to admit")

+			idFn := makeCustomPairID("scope", "boundary")
 			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 				"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(1),
@@ -117,6 +122,10 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 				"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(1),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::numa_node": timelessSample(0),
+					"pod::numa_node":       timelessSample(1),
+				}),
 				"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
 					"": checkMetricValueGreaterThan(0),
 				}),
@@ -136,6 +145,7 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 			// being [Serial], we can also assume no one else but us is running pods.
 			ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with pod should be admitted")

+			idFn := makeCustomPairID("scope", "boundary")
 			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 				"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(1),
@@ -143,6 +153,10 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 				"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::numa_node": timelessSample(0),
+					"pod::numa_node":       timelessSample(0),
+				}),
 				"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
 					"": checkMetricValueGreaterThan(0),
 				}),
@@ -162,9 +176,15 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 			// being [Serial], we can also assume no one else but us is running pods.
 			ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted")

+			idFn := makeCustomPairID("scope", "boundary")
 			matchAlignmentMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
-				"kubelet_container_aligned_compute_resources_count": gstruct.MatchAllElements(nodeID, gstruct.Elements{
-					metrics.AlignedNUMANode: timelessSample(1),
+				"kubelet_container_aligned_compute_resources_count": gstruct.MatchAllElements(idFn, gstruct.Elements{
+					"container::numa_node": timelessSample(0),
+					"pod::numa_node":       timelessSample(1),
+				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::numa_node": timelessSample(0),
+					"pod::numa_node":       timelessSample(0),
 				}),
 			})