diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go
index a9828474d64..e183d71e4a6 100644
--- a/pkg/kubelet/cm/cpumanager/policy_static.go
+++ b/pkg/kubelet/cm/cpumanager/policy_static.go
@@ -18,6 +18,7 @@ package cpumanager
 
 import (
 	"fmt"
+	"strconv"
 
 	v1 "k8s.io/api/core/v1"
 	utilfeature "k8s.io/apiserver/pkg/util/feature"
@@ -389,7 +390,7 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
 
 		s.SetCPUSet(string(pod.UID), container.Name, cpuAllocation.CPUs)
 		p.updateCPUsToReuse(pod, container, cpuAllocation.CPUs)
-		p.updateMetricsOnAllocate(cpuAllocation)
+		p.updateMetricsOnAllocate(s, cpuAllocation)
 
 		klog.V(4).InfoS("Allocated exclusive CPUs", "pod", klog.KObj(pod), "containerName", container.Name, "cpuset", cpuAllocation.CPUs.String())
 		return nil
@@ -416,7 +417,8 @@ func (p *staticPolicy) RemoveContainer(s state.State, podUID string, containerNa
 		// Mutate the shared pool, adding released cpus.
 		toRelease = toRelease.Difference(cpusInUse)
 		s.SetDefaultCPUSet(s.GetDefaultCPUSet().Union(toRelease))
-		p.updateMetricsOnRelease(toRelease)
+		p.updateMetricsOnRelease(s, toRelease)
+
 	}
 	return nil
 }
@@ -755,33 +757,60 @@ func (p *staticPolicy) getAlignedCPUs(numaAffinity bitmask.BitMask, allocatableC
 
 func (p *staticPolicy) initializeMetrics(s state.State) {
 	metrics.CPUManagerSharedPoolSizeMilliCores.Set(float64(p.GetAvailableCPUs(s).Size() * 1000))
-	metrics.CPUManagerExclusiveCPUsAllocationCount.Set(float64(countExclusiveCPUs(s)))
 	metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Add(0) // ensure the value exists
 	metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Add(0)        // ensure the value exists
 	metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedUncoreCache).Add(0)        // ensure the value exists
+	totalAssignedCPUs := getTotalAssignedExclusiveCPUs(s)
+	metrics.CPUManagerExclusiveCPUsAllocationCount.Set(float64(totalAssignedCPUs.Size()))
+	updateAllocationPerNUMAMetric(p.topology, totalAssignedCPUs)
 }
 
-func (p *staticPolicy) updateMetricsOnAllocate(cpuAlloc topology.Allocation) {
+func (p *staticPolicy) updateMetricsOnAllocate(s state.State, cpuAlloc topology.Allocation) {
 	ncpus := cpuAlloc.CPUs.Size()
 	metrics.CPUManagerExclusiveCPUsAllocationCount.Add(float64(ncpus))
 	metrics.CPUManagerSharedPoolSizeMilliCores.Add(float64(-ncpus * 1000))
 	if cpuAlloc.Aligned.UncoreCache {
 		metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedUncoreCache).Inc()
 	}
+	totalAssignedCPUs := getTotalAssignedExclusiveCPUs(s)
+	updateAllocationPerNUMAMetric(p.topology, totalAssignedCPUs)
 }
 
-func (p *staticPolicy) updateMetricsOnRelease(cset cpuset.CPUSet) {
+func (p *staticPolicy) updateMetricsOnRelease(s state.State, cset cpuset.CPUSet) {
 	ncpus := cset.Size()
 	metrics.CPUManagerExclusiveCPUsAllocationCount.Add(float64(-ncpus))
 	metrics.CPUManagerSharedPoolSizeMilliCores.Add(float64(ncpus * 1000))
+	totalAssignedCPUs := getTotalAssignedExclusiveCPUs(s)
+	updateAllocationPerNUMAMetric(p.topology, totalAssignedCPUs.Difference(cset))
 }
 
-func countExclusiveCPUs(s state.State) int {
-	exclusiveCPUs := 0
-	for _, cpuAssign := range s.GetCPUAssignments() {
-		for _, cset := range cpuAssign {
-			exclusiveCPUs += cset.Size()
+func getTotalAssignedExclusiveCPUs(s state.State) cpuset.CPUSet {
+	totalAssignedCPUs := cpuset.New()
+	for _, assignment := range s.GetCPUAssignments() {
+		for _, cset := range assignment {
+			totalAssignedCPUs = totalAssignedCPUs.Union(cset)
 		}
+
+	}
+	return totalAssignedCPUs
+}
+
+func updateAllocationPerNUMAMetric(topo *topology.CPUTopology, allocatedCPUs cpuset.CPUSet) {
+	numaCount := make(map[int]int)
+
+	// Count CPUs allocated per NUMA node
+	for _, cpuID := range allocatedCPUs.UnsortedList() {
+		numaNode, err := topo.CPUNUMANodeID(cpuID)
+		if err != nil {
+			//NOTE: We are logging the error but it is highly unlikely to happen as the CPUset
+			//      is already computed, evaluated and there is no room for user tampering.
+			klog.ErrorS(err, "Unable to determine NUMA node", "cpuID", cpuID)
+		}
+		numaCount[numaNode]++
+	}
+
+	// Update metric
+	for numaNode, count := range numaCount {
+		metrics.CPUManagerAllocationPerNUMA.WithLabelValues(strconv.Itoa(numaNode)).Set(float64(count))
 	}
-	return exclusiveCPUs
 }
diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go
index 04d62880b0b..7a04387ab6f 100644
--- a/pkg/kubelet/metrics/metrics.go
+++ b/pkg/kubelet/metrics/metrics.go
@@ -113,6 +113,7 @@ const (
 	CPUManagerPinningErrorsTotalKey           = "cpu_manager_pinning_errors_total"
 	CPUManagerSharedPoolSizeMilliCoresKey     = "cpu_manager_shared_pool_size_millicores"
 	CPUManagerExclusiveCPUsAllocationCountKey = "cpu_manager_exclusive_cpu_allocation_count"
+	CPUManagerAllocationPerNUMAKey            = "cpu_manager_allocation_per_numa"
 
 	// Metrics to track the Memory manager behavior
 	MemoryManagerPinningRequestsTotalKey = "memory_manager_pinning_requests_total"
@@ -815,6 +816,17 @@
 		},
 	)
 
+	// CPUManagerAllocationPerNUMA tracks the count of CPUs allocated per NUMA node
+	CPUManagerAllocationPerNUMA = metrics.NewGaugeVec(
+		&metrics.GaugeOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           CPUManagerAllocationPerNUMAKey,
+			Help:           "Number of CPUs allocated per NUMA node",
+			StabilityLevel: metrics.ALPHA,
+		},
+		[]string{AlignedNUMANode},
+	)
+
 	// ContainerAlignedComputeResources reports the count of resources allocation which granted aligned resources, per alignment boundary
 	ContainerAlignedComputeResources = metrics.NewCounterVec(
 		&metrics.CounterOpts{
@@ -1126,6 +1138,7 @@ func Register(collectors ...metrics.StableCollector) {
 		legacyregistry.MustRegister(CPUManagerPinningErrorsTotal)
 		legacyregistry.MustRegister(CPUManagerSharedPoolSizeMilliCores)
 		legacyregistry.MustRegister(CPUManagerExclusiveCPUsAllocationCount)
+		legacyregistry.MustRegister(CPUManagerAllocationPerNUMA)
 		legacyregistry.MustRegister(ContainerAlignedComputeResources)
 		legacyregistry.MustRegister(ContainerAlignedComputeResourcesFailure)
 		legacyregistry.MustRegister(MemoryManagerPinningRequestTotal)
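
Reviewer note: below is a minimal, self-contained sketch (not part of the patch) of what the new `updateAllocationPerNUMAMetric` loop ends up publishing. The `cpuNUMANodeID` closure and the allocated CPU IDs are hypothetical stand-ins for `topology.CPUTopology.CPUNUMANodeID` and for the union of exclusive assignments built by `getTotalAssignedExclusiveCPUs`; the printed metric name assumes `KubeletSubsystem` is `kubelet`, and the `numa_node` label name is an assumption about what the `AlignedNUMANode` constant resolves to.

```go
package main

import (
	"fmt"
	"strconv"
)

func main() {
	// Hypothetical topology: CPUs 0-3 on NUMA node 0, CPUs 4-7 on node 1
	// (stand-in for topo.CPUNUMANodeID in the patch).
	cpuNUMANodeID := func(cpuID int) (int, error) {
		if cpuID < 0 || cpuID > 7 {
			return 0, fmt.Errorf("unknown CPU ID %d", cpuID)
		}
		return cpuID / 4, nil
	}

	// Stand-in for the union of exclusive assignments that
	// getTotalAssignedExclusiveCPUs collects from the checkpointed state.
	allocatedCPUs := []int{0, 1, 5}

	// Mirror of the patch's first loop: count allocated CPUs per NUMA node.
	numaCount := make(map[int]int)
	for _, cpuID := range allocatedCPUs {
		numaNode, err := cpuNUMANodeID(cpuID)
		if err != nil {
			// As in the patch, the error is only logged and the CPU is still
			// counted (under numaNode's zero value); the patch's NOTE argues
			// this path is effectively unreachable for a validated cpuset.
			fmt.Println("unable to determine NUMA node:", err)
		}
		numaCount[numaNode]++
	}

	// Mirror of the patch's second loop, which does:
	//   metrics.CPUManagerAllocationPerNUMA.WithLabelValues(strconv.Itoa(numaNode)).Set(float64(count))
	// Here we just print the samples the gauge vector would expose.
	for numaNode, count := range numaCount {
		fmt.Printf("kubelet_cpu_manager_allocation_per_numa{numa_node=%q} %d\n",
			strconv.Itoa(numaNode), count)
	}
}
```

For the sample state above this prints `kubelet_cpu_manager_allocation_per_numa{numa_node="0"} 2` and `{numa_node="1"} 1`. Like the patch, the sketch recomputes the whole map and sets absolute values rather than incrementing: using a gauge vector with `Set` lets `initializeMetrics` rebuild correct per-NUMA values from state on kubelet restart.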