node: cpumgr: metrics: add metrics for allocation per NUMA

Signed-off-by: Swati Sehgal <swsehgal@redhat.com>
This commit is contained in:
Swati Sehgal 2025-02-21 11:14:55 +00:00
parent 0446f6c146
commit f1031be019
2 changed files with 53 additions and 11 deletions

View File

@ -18,6 +18,7 @@ package cpumanager
import ( import (
"fmt" "fmt"
"strconv"
v1 "k8s.io/api/core/v1" v1 "k8s.io/api/core/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature" utilfeature "k8s.io/apiserver/pkg/util/feature"
@ -389,7 +390,7 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
s.SetCPUSet(string(pod.UID), container.Name, cpuAllocation.CPUs) s.SetCPUSet(string(pod.UID), container.Name, cpuAllocation.CPUs)
p.updateCPUsToReuse(pod, container, cpuAllocation.CPUs) p.updateCPUsToReuse(pod, container, cpuAllocation.CPUs)
p.updateMetricsOnAllocate(cpuAllocation) p.updateMetricsOnAllocate(s, cpuAllocation)
klog.V(4).InfoS("Allocated exclusive CPUs", "pod", klog.KObj(pod), "containerName", container.Name, "cpuset", cpuAllocation.CPUs.String()) klog.V(4).InfoS("Allocated exclusive CPUs", "pod", klog.KObj(pod), "containerName", container.Name, "cpuset", cpuAllocation.CPUs.String())
return nil return nil
@ -416,7 +417,8 @@ func (p *staticPolicy) RemoveContainer(s state.State, podUID string, containerNa
// Mutate the shared pool, adding released cpus. // Mutate the shared pool, adding released cpus.
toRelease = toRelease.Difference(cpusInUse) toRelease = toRelease.Difference(cpusInUse)
s.SetDefaultCPUSet(s.GetDefaultCPUSet().Union(toRelease)) s.SetDefaultCPUSet(s.GetDefaultCPUSet().Union(toRelease))
p.updateMetricsOnRelease(toRelease) p.updateMetricsOnRelease(s, toRelease)
} }
return nil return nil
} }
@ -755,33 +757,60 @@ func (p *staticPolicy) getAlignedCPUs(numaAffinity bitmask.BitMask, allocatableC
func (p *staticPolicy) initializeMetrics(s state.State) { func (p *staticPolicy) initializeMetrics(s state.State) {
metrics.CPUManagerSharedPoolSizeMilliCores.Set(float64(p.GetAvailableCPUs(s).Size() * 1000)) metrics.CPUManagerSharedPoolSizeMilliCores.Set(float64(p.GetAvailableCPUs(s).Size() * 1000))
metrics.CPUManagerExclusiveCPUsAllocationCount.Set(float64(countExclusiveCPUs(s)))
metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Add(0) // ensure the value exists metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Add(0) // ensure the value exists
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Add(0) // ensure the value exists metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Add(0) // ensure the value exists
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedUncoreCache).Add(0) // ensure the value exists metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedUncoreCache).Add(0) // ensure the value exists
totalAssignedCPUs := getTotalAssignedExclusiveCPUs(s)
metrics.CPUManagerExclusiveCPUsAllocationCount.Set(float64(totalAssignedCPUs.Size()))
updateAllocationPerNUMAMetric(p.topology, totalAssignedCPUs)
} }
func (p *staticPolicy) updateMetricsOnAllocate(cpuAlloc topology.Allocation) { func (p *staticPolicy) updateMetricsOnAllocate(s state.State, cpuAlloc topology.Allocation) {
ncpus := cpuAlloc.CPUs.Size() ncpus := cpuAlloc.CPUs.Size()
metrics.CPUManagerExclusiveCPUsAllocationCount.Add(float64(ncpus)) metrics.CPUManagerExclusiveCPUsAllocationCount.Add(float64(ncpus))
metrics.CPUManagerSharedPoolSizeMilliCores.Add(float64(-ncpus * 1000)) metrics.CPUManagerSharedPoolSizeMilliCores.Add(float64(-ncpus * 1000))
if cpuAlloc.Aligned.UncoreCache { if cpuAlloc.Aligned.UncoreCache {
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedUncoreCache).Inc() metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedUncoreCache).Inc()
} }
totalAssignedCPUs := getTotalAssignedExclusiveCPUs(s)
updateAllocationPerNUMAMetric(p.topology, totalAssignedCPUs)
} }
func (p *staticPolicy) updateMetricsOnRelease(cset cpuset.CPUSet) { func (p *staticPolicy) updateMetricsOnRelease(s state.State, cset cpuset.CPUSet) {
ncpus := cset.Size() ncpus := cset.Size()
metrics.CPUManagerExclusiveCPUsAllocationCount.Add(float64(-ncpus)) metrics.CPUManagerExclusiveCPUsAllocationCount.Add(float64(-ncpus))
metrics.CPUManagerSharedPoolSizeMilliCores.Add(float64(ncpus * 1000)) metrics.CPUManagerSharedPoolSizeMilliCores.Add(float64(ncpus * 1000))
totalAssignedCPUs := getTotalAssignedExclusiveCPUs(s)
updateAllocationPerNUMAMetric(p.topology, totalAssignedCPUs.Difference(cset))
} }
func countExclusiveCPUs(s state.State) int { func getTotalAssignedExclusiveCPUs(s state.State) cpuset.CPUSet {
exclusiveCPUs := 0 totalAssignedCPUs := cpuset.New()
for _, cpuAssign := range s.GetCPUAssignments() { for _, assignment := range s.GetCPUAssignments() {
for _, cset := range cpuAssign { for _, cset := range assignment {
exclusiveCPUs += cset.Size() totalAssignedCPUs = totalAssignedCPUs.Union(cset)
} }
}
return totalAssignedCPUs
}
func updateAllocationPerNUMAMetric(topo *topology.CPUTopology, allocatedCPUs cpuset.CPUSet) {
numaCount := make(map[int]int)
// Count CPUs allocated per NUMA node
for _, cpuID := range allocatedCPUs.UnsortedList() {
numaNode, err := topo.CPUNUMANodeID(cpuID)
if err != nil {
//NOTE: We are logging the error but it is highly unlikely to happen as the CPUset
// is already computed, evaluated and there is no room for user tampering.
klog.ErrorS(err, "Unable to determine NUMA node", "cpuID", cpuID)
}
numaCount[numaNode]++
}
// Update metric
for numaNode, count := range numaCount {
metrics.CPUManagerAllocationPerNUMA.WithLabelValues(strconv.Itoa(numaNode)).Set(float64(count))
} }
return exclusiveCPUs
} }

View File

@ -113,6 +113,7 @@ const (
CPUManagerPinningErrorsTotalKey = "cpu_manager_pinning_errors_total" CPUManagerPinningErrorsTotalKey = "cpu_manager_pinning_errors_total"
CPUManagerSharedPoolSizeMilliCoresKey = "cpu_manager_shared_pool_size_millicores" CPUManagerSharedPoolSizeMilliCoresKey = "cpu_manager_shared_pool_size_millicores"
CPUManagerExclusiveCPUsAllocationCountKey = "cpu_manager_exclusive_cpu_allocation_count" CPUManagerExclusiveCPUsAllocationCountKey = "cpu_manager_exclusive_cpu_allocation_count"
CPUManagerAllocationPerNUMAKey = "cpu_manager_allocation_per_numa"
// Metrics to track the Memory manager behavior // Metrics to track the Memory manager behavior
MemoryManagerPinningRequestsTotalKey = "memory_manager_pinning_requests_total" MemoryManagerPinningRequestsTotalKey = "memory_manager_pinning_requests_total"
@ -815,6 +816,17 @@ var (
}, },
) )
// CPUManagerAllocationPerNUMA tracks the count of CPUs allocated per NUMA node
CPUManagerAllocationPerNUMA = metrics.NewGaugeVec(
&metrics.GaugeOpts{
Subsystem: KubeletSubsystem,
Name: CPUManagerAllocationPerNUMAKey,
Help: "Number of CPUs allocated per NUMA node",
StabilityLevel: metrics.ALPHA,
},
[]string{AlignedNUMANode},
)
// ContainerAlignedComputeResources reports the count of resources allocation which granted aligned resources, per alignment boundary // ContainerAlignedComputeResources reports the count of resources allocation which granted aligned resources, per alignment boundary
ContainerAlignedComputeResources = metrics.NewCounterVec( ContainerAlignedComputeResources = metrics.NewCounterVec(
&metrics.CounterOpts{ &metrics.CounterOpts{
@ -1126,6 +1138,7 @@ func Register(collectors ...metrics.StableCollector) {
legacyregistry.MustRegister(CPUManagerPinningErrorsTotal) legacyregistry.MustRegister(CPUManagerPinningErrorsTotal)
legacyregistry.MustRegister(CPUManagerSharedPoolSizeMilliCores) legacyregistry.MustRegister(CPUManagerSharedPoolSizeMilliCores)
legacyregistry.MustRegister(CPUManagerExclusiveCPUsAllocationCount) legacyregistry.MustRegister(CPUManagerExclusiveCPUsAllocationCount)
legacyregistry.MustRegister(CPUManagerAllocationPerNUMA)
legacyregistry.MustRegister(ContainerAlignedComputeResources) legacyregistry.MustRegister(ContainerAlignedComputeResources)
legacyregistry.MustRegister(ContainerAlignedComputeResourcesFailure) legacyregistry.MustRegister(ContainerAlignedComputeResourcesFailure)
legacyregistry.MustRegister(MemoryManagerPinningRequestTotal) legacyregistry.MustRegister(MemoryManagerPinningRequestTotal)