Merge pull request #129950 from ffromani/alignment-error-detail-metrics

node: metrics for alignment failures

commit 05bfdbc6dd
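Taken together, the hunks below introduce a new kubelet counter vector, kubelet_container_aligned_compute_resources_failure_count, labeled by alignment scope (container or pod) and boundary (physical_cpu or numa_node). It is incremented by the CPU manager static policy when a full-physical-CPU allocation fails, and by the topology manager admission scopes when admission fails under a policy that guarantees NUMA alignment. The condensed Go sketch below is assembled from the diff rather than copied from any single file; treat it as an illustration of the define/register/increment pattern, not the exact upstream layout.

package example

import (
	"k8s.io/component-base/metrics"
	"k8s.io/component-base/metrics/legacyregistry"
)

// Condensed from the diff below; metric name, help text and labels match the
// kubelet metrics package, the variable name here is local to this sketch.
var containerAlignedComputeResourcesFailure = metrics.NewCounterVec(
	&metrics.CounterOpts{
		Subsystem:      "kubelet",
		Name:           "container_aligned_compute_resources_failure_count",
		Help:           "Cumulative number of failures to allocate aligned compute resources to containers by alignment type.",
		StabilityLevel: metrics.ALPHA,
	},
	[]string{"scope", "boundary"}, // e.g. container/pod and physical_cpu/numa_node
)

func init() {
	legacyregistry.MustRegister(containerAlignedComputeResourcesFailure)
}

// recordAlignmentFailure shows how a component accounts a failed allocation.
func recordAlignmentFailure(scope, boundary string) {
	containerAlignedComputeResourcesFailure.WithLabelValues(scope, boundary).Inc()
}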
@@ -325,13 +325,15 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
 	defer func() {
 		if rerr != nil {
 			metrics.CPUManagerPinningErrorsTotal.Inc()
+			if p.options.FullPhysicalCPUsOnly {
+				metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
+			}
 			return
 		}
-		if !p.options.FullPhysicalCPUsOnly {
+		if p.options.FullPhysicalCPUsOnly {
 			// increment only if we know we allocate aligned resources
-			return
+			metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
 		}
-		metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
 	}()
 
 	if p.options.FullPhysicalCPUsOnly {
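The static policy records the outcome in a deferred closure that inspects Allocate's named return value rerr, so the success and failure counters sit next to each other and every exit path is covered. Below is a minimal standalone sketch of that Go idiom, using plain integer stand-ins for the counters rather than the kubelet metrics.

package example

import "fmt"

var successes, failures int // stand-ins for the Prometheus counters in the diff

// allocate mirrors the shape of staticPolicy.Allocate: the deferred closure
// reads the named return value rerr after the function body has decided it.
func allocate(fail bool) (rerr error) {
	defer func() {
		if rerr != nil {
			failures++ // e.g. ContainerAlignedComputeResourcesFailure.WithLabelValues(...).Inc()
			return
		}
		successes++ // e.g. ContainerAlignedComputeResources.WithLabelValues(...).Inc()
	}()

	if fail {
		return fmt.Errorf("not enough free physical CPUs")
	}
	return nil
}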
@@ -752,6 +754,7 @@ func (p *staticPolicy) getAlignedCPUs(numaAffinity bitmask.BitMask, allocatableC
 func (p *staticPolicy) initializeMetrics(s state.State) {
 	metrics.CPUManagerSharedPoolSizeMilliCores.Set(float64(p.GetAvailableCPUs(s).Size() * 1000))
 	metrics.CPUManagerExclusiveCPUsAllocationCount.Set(float64(countExclusiveCPUs(s)))
+	metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Add(0) // ensure the value exists
 }
 
 func (p *staticPolicy) updateMetricsOnAllocate(cset cpuset.CPUSet) {
@@ -50,6 +50,9 @@ func (s *containerScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
 		klog.InfoS("Best TopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod), "containerName", container.Name)
 
 		if !admit {
+			if IsAlignmentGuaranteed(s.policy) {
+				metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Inc()
+			}
 			metrics.TopologyManagerAdmissionErrorsTotal.Inc()
 			return admission.GetPodAdmitResult(&TopologyAffinityError{})
 		}
@@ -48,6 +48,10 @@ func (s *podScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
 	bestHint, admit := s.calculateAffinity(pod)
 	klog.InfoS("Best TopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod))
 	if !admit {
+		if IsAlignmentGuaranteed(s.policy) {
+			// increment only if we know we allocate aligned resources.
+			metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Inc()
+		}
 		metrics.TopologyManagerAdmissionErrorsTotal.Inc()
 		return admission.GetPodAdmitResult(&TopologyAffinityError{})
 	}
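Both admission scopes gate the failure counter on IsAlignmentGuaranteed(s.policy), so the metric only counts rejections under a policy that actually promises NUMA alignment. That helper is not part of this diff; as a hedged assumption, it plausibly reduces to a check on the policy name, roughly as sketched here.

package example

// Policy mirrors only the small slice of the topology manager policy
// interface that the sketch needs; the real interface is larger.
type Policy interface {
	Name() string
}

const PolicySingleNumaNode = "single-numa-node"

// IsAlignmentGuaranteed is a hypothetical reconstruction for illustration:
// only the single-numa-node policy rejects pods it cannot align, so only
// there does an admission failure imply an alignment failure.
func IsAlignmentGuaranteed(p Policy) bool {
	return p.Name() == PolicySingleNumaNode
}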
@@ -188,9 +188,19 @@ func NewManager(topology []cadvisorapi.Node, topologyPolicyName string, topology
 		scope: scope,
 	}
 
+	manager.initializeMetrics()
+
 	return manager, nil
 }
 
+func (m *manager) initializeMetrics() {
+	// ensure the values exist
+	metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Add(0)
+	metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Add(0)
+	metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Add(0)
+	metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Add(0)
+}
+
 func (m *manager) GetAffinity(podUID string, containerName string) TopologyHint {
 	return m.scope.GetAffinity(podUID, containerName)
 }
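initializeMetrics calls WithLabelValues(...).Add(0) for every scope/boundary pair the manager can emit. Without this, a counter series only appears after its first increment, so the e2e checks further down could not tell "zero failures" apart from "metric missing" right after a kubelet restart. A small sketch of the pattern with a throwaway counter vector (assumed names, not the kubelet metrics):

package example

import (
	"k8s.io/component-base/metrics"
	"k8s.io/component-base/metrics/legacyregistry"
)

var exampleFailures = metrics.NewCounterVec(
	&metrics.CounterOpts{
		Subsystem:      "example",
		Name:           "alignment_failure_count",
		Help:           "Illustrative counter; not a real kubelet metric.",
		StabilityLevel: metrics.ALPHA,
	},
	[]string{"scope", "boundary"},
)

func initializeExampleMetrics() {
	legacyregistry.MustRegister(exampleFailures)
	// Touch every label combination we may ever report so each series is
	// exported as 0 immediately, instead of appearing on first increment.
	for _, scope := range []string{"container", "pod"} {
		exampleFailures.WithLabelValues(scope, "numa_node").Add(0)
	}
}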
@@ -132,6 +132,7 @@ const (
 
 	// Metric for tracking alignment of compute resources
 	ContainerAlignedComputeResourcesNameKey          = "container_aligned_compute_resources_count"
+	ContainerAlignedComputeResourcesFailureNameKey   = "container_aligned_compute_resources_failure_count"
 	ContainerAlignedComputeResourcesScopeLabelKey    = "scope"
 	ContainerAlignedComputeResourcesBoundaryLabelKey = "boundary"
 
@@ -818,7 +819,18 @@ var (
 		},
 		[]string{ContainerAlignedComputeResourcesScopeLabelKey, ContainerAlignedComputeResourcesBoundaryLabelKey},
 	)
-	// MemoryManagerPinningRequestTotal tracks the number of times the pod spec required the memory manager to pin memory pages
+
+	// ContainerAlignedComputeResourcesFailure reports the count of resources allocation attempts which failed to align resources, per alignment boundary
+	ContainerAlignedComputeResourcesFailure = metrics.NewCounterVec(
+		&metrics.CounterOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           ContainerAlignedComputeResourcesFailureNameKey,
+			Help:           "Cumulative number of failures to allocate aligned compute resources to containers by alignment type.",
+			StabilityLevel: metrics.ALPHA,
+		},
+		[]string{ContainerAlignedComputeResourcesScopeLabelKey, ContainerAlignedComputeResourcesBoundaryLabelKey},
+	)
+
 	MemoryManagerPinningRequestTotal = metrics.NewCounter(
 		&metrics.CounterOpts{
 			Subsystem: KubeletSubsystem,
@@ -1079,6 +1091,7 @@ func Register(collectors ...metrics.StableCollector) {
 		legacyregistry.MustRegister(CPUManagerSharedPoolSizeMilliCores)
 		legacyregistry.MustRegister(CPUManagerExclusiveCPUsAllocationCount)
 		legacyregistry.MustRegister(ContainerAlignedComputeResources)
+		legacyregistry.MustRegister(ContainerAlignedComputeResourcesFailure)
 		legacyregistry.MustRegister(MemoryManagerPinningRequestTotal)
 		legacyregistry.MustRegister(MemoryManagerPinningErrorsTotal)
 		legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal)
@@ -104,6 +104,7 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 			// being [Serial], we can also assume no one else but us is running pods.
 			ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with no pods running")
 
+			idFn := makeCustomPairID("scope", "boundary")
 			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 				"kubelet_cpu_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
@@ -111,6 +112,9 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 				"kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::physical_cpu": timelessSample(0),
+				}),
 			})
 
 			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
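The assertions key the failure metric by a scope::boundary pair such as "container::physical_cpu". The helpers used here, makeCustomPairID, timelessSample and checkMetricValueGreaterThan, live elsewhere in the node e2e suite and are not part of this diff; the sketch below is only a guess at how the pair identifier could be built from a Prometheus sample's labels, to make the matcher keys readable.

package example

import (
	"fmt"

	"github.com/onsi/gomega/gstruct"
	"github.com/prometheus/common/model"
)

// makeCustomPairID is a hypothetical reconstruction: it maps one metric
// sample to a "<scope>::<boundary>" key, so gstruct.MatchElements can
// address individual label combinations such as "container::physical_cpu".
func makeCustomPairID(pri, sec string) gstruct.Identifier {
	return func(element interface{}) string {
		sample := element.(*model.Sample)
		return fmt.Sprintf("%s::%s",
			sample.Metric[model.LabelName(pri)],
			sample.Metric[model.LabelName(sec)])
	}
}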
@@ -127,6 +131,7 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 			// being [Serial], we can also assume no one else but us is running pods.
 			ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod failed to admit")
 
+			idFn := makeCustomPairID("scope", "boundary")
 			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 				"kubelet_cpu_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(1),
@@ -134,6 +139,9 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 				"kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(1),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::physical_cpu": timelessSample(1),
+				}),
 			})
 
 			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@@ -150,6 +158,7 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 			// being [Serial], we can also assume no one else but us is running pods.
 			ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted")
 
+			idFn := makeCustomPairID("scope", "boundary")
 			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 				"kubelet_cpu_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(1),
@@ -157,6 +166,9 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 				"kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::physical_cpu": timelessSample(0),
+				}),
 			})
 
 			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@@ -178,6 +190,9 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 				"kubelet_container_aligned_compute_resources_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
 					"container::physical_cpu": timelessSample(1),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::physical_cpu": timelessSample(0),
+				}),
 			})
 
 			ginkgo.By("Giving the Kubelet time to update the alignment metrics")
@@ -28,7 +28,6 @@ import (
 	v1 "k8s.io/api/core/v1"
 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
 	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
-	"k8s.io/kubernetes/pkg/kubelet/metrics"
 	"k8s.io/kubernetes/test/e2e/feature"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
@@ -84,6 +83,7 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 			// being [Serial], we can also assume no one else but us is running pods.
 			ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with no pods running")
 
+			idFn := makeCustomPairID("scope", "boundary")
 			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 				"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
@@ -91,6 +91,10 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 				"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::numa_node": timelessSample(0),
+					"pod::numa_node":       timelessSample(0),
+				}),
 				"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
 					"": timelessSample(0),
 				}),
@@ -110,6 +114,7 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 			// being [Serial], we can also assume no one else but us is running pods.
 			ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with pod failed to admit")
 
+			idFn := makeCustomPairID("scope", "boundary")
 			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 				"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(1),
@@ -117,6 +122,10 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 				"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(1),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::numa_node": timelessSample(0),
+					"pod::numa_node":       timelessSample(1),
+				}),
 				"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
 					"": checkMetricValueGreaterThan(0),
 				}),
@@ -136,6 +145,7 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 			// being [Serial], we can also assume no one else but us is running pods.
 			ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with pod should be admitted")
 
+			idFn := makeCustomPairID("scope", "boundary")
 			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 				"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(1),
@@ -143,6 +153,10 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 				"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 					"": timelessSample(0),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::numa_node": timelessSample(0),
+					"pod::numa_node":       timelessSample(0),
+				}),
 				"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
 					"": checkMetricValueGreaterThan(0),
 				}),
@@ -162,9 +176,15 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 			// being [Serial], we can also assume no one else but us is running pods.
 			ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted")
 
+			idFn := makeCustomPairID("scope", "boundary")
 			matchAlignmentMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
-				"kubelet_container_aligned_compute_resources_count": gstruct.MatchAllElements(nodeID, gstruct.Elements{
-					metrics.AlignedNUMANode: timelessSample(1),
+				"kubelet_container_aligned_compute_resources_count": gstruct.MatchAllElements(idFn, gstruct.Elements{
+					"container::numa_node": timelessSample(0),
+					"pod::numa_node":       timelessSample(1),
 				}),
+				"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+					"container::numa_node": timelessSample(0),
+					"pod::numa_node":       timelessSample(0),
+				}),
 			})
 