Merge pull request #127959 from ffromani/fix-smtalign-error-message

node: cpumanager: fix smtalign error message and minor cleanup
This commit is contained in:
Kubernetes Prow Robot 2024-10-11 00:32:20 +01:00 committed by GitHub
commit 3bf17e2340
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 58 additions and 13 deletions

View File

@ -49,10 +49,11 @@ type SMTAlignmentError struct {
RequestedCPUs int
CpusPerCore int
AvailablePhysicalCPUs int
CausedByPhysicalCPUs bool
}
func (e SMTAlignmentError) Error() string {
if e.AvailablePhysicalCPUs > 0 {
if e.CausedByPhysicalCPUs {
return fmt.Sprintf("SMT Alignment Error: not enough free physical CPUs: available physical CPUs = %d, requested CPUs = %d, CPUs per core = %d", e.AvailablePhysicalCPUs, e.RequestedCPUs, e.CpusPerCore)
}
return fmt.Sprintf("SMT Alignment Error: requested %d cpus not multiple cpus per core = %d", e.RequestedCPUs, e.CpusPerCore)
@ -118,6 +119,9 @@ type staticPolicy struct {
cpusToReuse map[string]cpuset.CPUSet
// options allow to fine-tune the behaviour of the policy
options StaticPolicyOptions
// we compute this value multiple times, and it's not supposed to change
// at runtime - the cpumanager can't deal with runtime topology changes anyway.
cpuGroupSize int
}
// Ensure staticPolicy implements Policy interface
@ -136,13 +140,15 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv
return nil, err
}
klog.InfoS("Static policy created with configuration", "options", opts)
cpuGroupSize := topology.CPUsPerCore()
klog.InfoS("Static policy created with configuration", "options", opts, "cpuGroupSize", cpuGroupSize)
policy := &staticPolicy{
topology: topology,
affinity: affinity,
cpusToReuse: make(map[string]cpuset.CPUSet),
options: opts,
topology: topology,
affinity: affinity,
cpusToReuse: make(map[string]cpuset.CPUSet),
options: opts,
cpuGroupSize: cpuGroupSize,
}
allCPUs := topology.CPUDetails.CPUs()
@ -310,8 +316,7 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
}()
if p.options.FullPhysicalCPUsOnly {
CPUsPerCore := p.topology.CPUsPerCore()
if (numCPUs % CPUsPerCore) != 0 {
if (numCPUs % p.cpuGroupSize) != 0 {
// Since CPU Manager has been enabled requesting strict SMT alignment, it means a guaranteed pod can only be admitted
// if the CPU requested is a multiple of the number of virtual cpus per physical cores.
// In case CPU request is not a multiple of the number of virtual cpus per physical cores the Pod will be put
@ -322,8 +327,9 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
// and only in case the request cannot be satisfied on a single socket, CPU allocation is done for a workload to occupy all
// CPUs on a physical core. Allocation of individual threads would never have to occur.
return SMTAlignmentError{
RequestedCPUs: numCPUs,
CpusPerCore: CPUsPerCore,
RequestedCPUs: numCPUs,
CpusPerCore: p.cpuGroupSize,
CausedByPhysicalCPUs: false,
}
}
@ -336,8 +342,9 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
if numCPUs > availablePhysicalCPUs {
return SMTAlignmentError{
RequestedCPUs: numCPUs,
CpusPerCore: CPUsPerCore,
CpusPerCore: p.cpuGroupSize,
AvailablePhysicalCPUs: availablePhysicalCPUs,
CausedByPhysicalCPUs: true,
}
}
}
@ -491,7 +498,7 @@ func (p *staticPolicy) takeByTopology(availableCPUs cpuset.CPUSet, numCPUs int)
if p.options.DistributeCPUsAcrossNUMA {
cpuGroupSize := 1
if p.options.FullPhysicalCPUsOnly {
cpuGroupSize = p.topology.CPUsPerCore()
cpuGroupSize = p.cpuGroupSize
}
return takeByTopologyNUMADistributed(p.topology, availableCPUs, numCPUs, cpuGroupSize, cpuSortingStrategy)
}

View File

@ -509,7 +509,7 @@ func TestStaticPolicyAdd(t *testing.T) {
stAssignments: state.ContainerCPUAssignments{},
stDefaultCPUSet: cpuset.New(0, 2, 3, 4, 5, 7, 8, 9, 10, 11),
pod: makePod("fakePod", "fakeContainerBug113537_1", "10000m", "10000m"),
expErr: SMTAlignmentError{RequestedCPUs: 10, CpusPerCore: 2, AvailablePhysicalCPUs: 8},
expErr: SMTAlignmentError{RequestedCPUs: 10, CpusPerCore: 2, AvailablePhysicalCPUs: 8, CausedByPhysicalCPUs: true},
expCPUAlloc: false,
expCSet: cpuset.New(),
},
@ -1188,6 +1188,44 @@ func TestStaticPolicyOptions(t *testing.T) {
}
}
// TestSMTAlignmentErrorText verifies the user-facing message produced by
// SMTAlignmentError.Error() for each field combination, including the corner
// case of zero available physical CPUs with and without the explicit
// CausedByPhysicalCPUs flag.
func TestSMTAlignmentErrorText(t *testing.T) {
	for _, tc := range []struct {
		name     string
		err      SMTAlignmentError
		expected string
	}{
		{
			name:     "base SMT alignment error",
			err:      SMTAlignmentError{RequestedCPUs: 15, CpusPerCore: 4},
			expected: `SMT Alignment Error: requested 15 cpus not multiple cpus per core = 4`,
		},
		{
			// Note the explicit 0. The intent is to signal the lack of physical CPUs, but
			// in the corner case of no available physical CPUs at all, without the explicit
			// flag we cannot distinguish the case, and before PR#127959 we printed the old message
			name:     "base SMT alignment error, no physical CPUs, missing flag",
			err:      SMTAlignmentError{RequestedCPUs: 4, CpusPerCore: 2, AvailablePhysicalCPUs: 0},
			expected: `SMT Alignment Error: requested 4 cpus not multiple cpus per core = 2`,
		},
		{
			name:     "base SMT alignment error, no physical CPUs, explicit flag",
			err:      SMTAlignmentError{RequestedCPUs: 4, CpusPerCore: 2, AvailablePhysicalCPUs: 0, CausedByPhysicalCPUs: true},
			expected: `SMT Alignment Error: not enough free physical CPUs: available physical CPUs = 0, requested CPUs = 4, CPUs per core = 2`,
		},
	} {
		t.Run(tc.name, func(t *testing.T) {
			if msg := tc.err.Error(); msg != tc.expected {
				t.Errorf("got=%v expected=%v", msg, tc.expected)
			}
		})
	}
}
func newCPUSetPtr(cpus ...int) *cpuset.CPUSet {
ret := cpuset.New(cpus...)
return &ret