cpumanager: smtalign: fix error message

Fix the error message emitted when availablePhysicalCPUs = 0.
Without this change, the logic mistakenly emitted the old,
generic error message, which is confusing for troubleshooting.

Plus, a tiny quality-of-life improvement:
the cpumanager static policy wants to use `cpuGroupSize` multiple times.
The value represents how many vCPUs per pCPU the machine has.
So, let's cache (and log!) the value in the policy data.
We don't support dynamic updates of the HW topology anyway.

Signed-off-by: Francesco Romani <fromani@redhat.com>
commit 838f911dea (parent c5f2fc05ad)
Author: Francesco Romani
Date:   2024-10-09 15:10:53 +02:00
2 changed files with 58 additions and 13 deletions
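The corner case the commit message describes is easiest to see in isolation. Below is a minimal, self-contained Go sketch, not part of the diff: the oldError/newError helpers are hypothetical names introduced only to contrast the pre-fix and post-fix logic of Error() shown in the first hunk.

package main

import "fmt"

type SMTAlignmentError struct {
	RequestedCPUs         int
	CpusPerCore           int
	AvailablePhysicalCPUs int
	CausedByPhysicalCPUs  bool
}

// oldError mirrors the pre-fix logic: it infers the cause from the value,
// so AvailablePhysicalCPUs == 0 falls through to the generic message.
func oldError(e SMTAlignmentError) string {
	if e.AvailablePhysicalCPUs > 0 {
		return fmt.Sprintf("SMT Alignment Error: not enough free physical CPUs: available physical CPUs = %d, requested CPUs = %d, CPUs per core = %d", e.AvailablePhysicalCPUs, e.RequestedCPUs, e.CpusPerCore)
	}
	return fmt.Sprintf("SMT Alignment Error: requested %d cpus not multiple cpus per core = %d", e.RequestedCPUs, e.CpusPerCore)
}

// newError mirrors the post-fix logic: an explicit flag records the cause,
// independent of the AvailablePhysicalCPUs value.
func newError(e SMTAlignmentError) string {
	if e.CausedByPhysicalCPUs {
		return fmt.Sprintf("SMT Alignment Error: not enough free physical CPUs: available physical CPUs = %d, requested CPUs = %d, CPUs per core = %d", e.AvailablePhysicalCPUs, e.RequestedCPUs, e.CpusPerCore)
	}
	return fmt.Sprintf("SMT Alignment Error: requested %d cpus not multiple cpus per core = %d", e.RequestedCPUs, e.CpusPerCore)
}

func main() {
	// Physical-CPU exhaustion with zero free physical CPUs left.
	e := SMTAlignmentError{RequestedCPUs: 4, CpusPerCore: 2, AvailablePhysicalCPUs: 0, CausedByPhysicalCPUs: true}
	fmt.Println("old:", oldError(e)) // generic message: misreports the cause
	fmt.Println("new:", newError(e)) // physical-CPUs message: correct
}

Running this prints the generic alignment message from oldError even though physical-CPU exhaustion was the real cause, while newError reports it correctly.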

pkg/kubelet/cm/cpumanager/policy_static.go

@@ -49,10 +49,11 @@ type SMTAlignmentError struct {
 	RequestedCPUs         int
 	CpusPerCore           int
 	AvailablePhysicalCPUs int
+	CausedByPhysicalCPUs  bool
 }

 func (e SMTAlignmentError) Error() string {
-	if e.AvailablePhysicalCPUs > 0 {
+	if e.CausedByPhysicalCPUs {
 		return fmt.Sprintf("SMT Alignment Error: not enough free physical CPUs: available physical CPUs = %d, requested CPUs = %d, CPUs per core = %d", e.AvailablePhysicalCPUs, e.RequestedCPUs, e.CpusPerCore)
 	}
 	return fmt.Sprintf("SMT Alignment Error: requested %d cpus not multiple cpus per core = %d", e.RequestedCPUs, e.CpusPerCore)
@@ -118,6 +119,9 @@ type staticPolicy struct {
 	cpusToReuse map[string]cpuset.CPUSet
 	// options allow to fine-tune the behaviour of the policy
 	options StaticPolicyOptions
+	// we compute this value multiple times, and it's not supposed to change
+	// at runtime - the cpumanager can't deal with runtime topology changes anyway.
+	cpuGroupSize int
 }

 // Ensure staticPolicy implements Policy interface
@@ -136,13 +140,15 @@ func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reserv
 		return nil, err
 	}

-	klog.InfoS("Static policy created with configuration", "options", opts)
+	cpuGroupSize := topology.CPUsPerCore()
+	klog.InfoS("Static policy created with configuration", "options", opts, "cpuGroupSize", cpuGroupSize)

 	policy := &staticPolicy{
-		topology:    topology,
-		affinity:    affinity,
-		cpusToReuse: make(map[string]cpuset.CPUSet),
-		options:     opts,
+		topology:     topology,
+		affinity:     affinity,
+		cpusToReuse:  make(map[string]cpuset.CPUSet),
+		options:      opts,
+		cpuGroupSize: cpuGroupSize,
 	}

 	allCPUs := topology.CPUDetails.CPUs()
@@ -310,8 +316,7 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
 	}()

 	if p.options.FullPhysicalCPUsOnly {
-		CPUsPerCore := p.topology.CPUsPerCore()
-		if (numCPUs % CPUsPerCore) != 0 {
+		if (numCPUs % p.cpuGroupSize) != 0 {
 			// Since CPU Manager has been enabled requesting strict SMT alignment, it means a guaranteed pod can only be admitted
 			// if the CPU requested is a multiple of the number of virtual cpus per physical cores.
 			// In case CPU request is not a multiple of the number of virtual cpus per physical cores the Pod will be put
@@ -322,8 +327,9 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
 			// and only in case the request cannot be sattisfied on a single socket, CPU allocation is done for a workload to occupy all
 			// CPUs on a physical core. Allocation of individual threads would never have to occur.
 			return SMTAlignmentError{
-				RequestedCPUs: numCPUs,
-				CpusPerCore:   CPUsPerCore,
+				RequestedCPUs:        numCPUs,
+				CpusPerCore:          p.cpuGroupSize,
+				CausedByPhysicalCPUs: false,
 			}
 		}
@@ -336,8 +342,9 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
 		if numCPUs > availablePhysicalCPUs {
 			return SMTAlignmentError{
 				RequestedCPUs:         numCPUs,
-				CpusPerCore:           CPUsPerCore,
+				CpusPerCore:           p.cpuGroupSize,
 				AvailablePhysicalCPUs: availablePhysicalCPUs,
+				CausedByPhysicalCPUs:  true,
 			}
 		}
 	}
@@ -491,7 +498,7 @@ func (p *staticPolicy) takeByTopology(availableCPUs cpuset.CPUSet, numCPUs int)
 	if p.options.DistributeCPUsAcrossNUMA {
 		cpuGroupSize := 1
 		if p.options.FullPhysicalCPUsOnly {
-			cpuGroupSize = p.topology.CPUsPerCore()
+			cpuGroupSize = p.cpuGroupSize
 		}
 		return takeByTopologyNUMADistributed(p.topology, availableCPUs, numCPUs, cpuGroupSize, cpuSortingStrategy)
 	}

pkg/kubelet/cm/cpumanager/policy_static_test.go

@@ -509,7 +509,7 @@ func TestStaticPolicyAdd(t *testing.T) {
 			stAssignments:   state.ContainerCPUAssignments{},
 			stDefaultCPUSet: cpuset.New(0, 2, 3, 4, 5, 7, 8, 9, 10, 11),
 			pod:             makePod("fakePod", "fakeContainerBug113537_1", "10000m", "10000m"),
-			expErr:          SMTAlignmentError{RequestedCPUs: 10, CpusPerCore: 2, AvailablePhysicalCPUs: 8},
+			expErr:          SMTAlignmentError{RequestedCPUs: 10, CpusPerCore: 2, AvailablePhysicalCPUs: 8, CausedByPhysicalCPUs: true},
 			expCPUAlloc:     false,
 			expCSet:         cpuset.New(),
 		},
@@ -1188,6 +1188,44 @@ func TestStaticPolicyOptions(t *testing.T) {
 	}
 }

+func TestSMTAlignmentErrorText(t *testing.T) {
+	type smtErrTestCase struct {
+		name     string
+		err      SMTAlignmentError
+		expected string
+	}
+
+	testCases := []smtErrTestCase{
+		{
+			name:     "base SMT alignment error",
+			err:      SMTAlignmentError{RequestedCPUs: 15, CpusPerCore: 4},
+			expected: `SMT Alignment Error: requested 15 cpus not multiple cpus per core = 4`,
+		},
+		{
+			// Note the explicit 0. The intent is to signal the lack of physical CPUs, but
+			// in the corner case of no available physical CPUs at all, without the explicit
+			// flag we cannot distinguish the case, and before PR#127959 we printed the old message.
+			name:     "base SMT alignment error, no physical CPUs, missing flag",
+			err:      SMTAlignmentError{RequestedCPUs: 4, CpusPerCore: 2, AvailablePhysicalCPUs: 0},
+			expected: `SMT Alignment Error: requested 4 cpus not multiple cpus per core = 2`,
+		},
+		{
+			name:     "base SMT alignment error, no physical CPUs, explicit flag",
+			err:      SMTAlignmentError{RequestedCPUs: 4, CpusPerCore: 2, AvailablePhysicalCPUs: 0, CausedByPhysicalCPUs: true},
+			expected: `SMT Alignment Error: not enough free physical CPUs: available physical CPUs = 0, requested CPUs = 4, CPUs per core = 2`,
+		},
+	}
+
+	for _, testCase := range testCases {
+		t.Run(testCase.name, func(t *testing.T) {
+			got := testCase.err.Error()
+			if got != testCase.expected {
+				t.Errorf("got=%v expected=%v", got, testCase.expected)
+			}
+		})
+	}
+}
+
 func newCPUSetPtr(cpus ...int) *cpuset.CPUSet {
 	ret := cpuset.New(cpus...)
 	return &ret
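To exercise just the new table-driven test locally, something along these lines should work (assuming the upstream Kubernetes repository layout, where the static policy lives under pkg/kubelet/cm/cpumanager):

go test -run TestSMTAlignmentErrorText ./pkg/kubelet/cm/cpumanager/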