diff --git a/pkg/kubelet/cm/cpumanager/cpu_assignment.go b/pkg/kubelet/cm/cpumanager/cpu_assignment.go
index 1a844495450..b23d021cfeb 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_assignment.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_assignment.go
@@ -482,7 +482,13 @@ func takeByTopologyNUMAPacked(topo *topology.CPUTopology, availableCPUs cpuset.C
 // evenly distributed and remainder CPUs are allocated. The subset with the
 // lowest "balance score" will receive the CPUs in order to keep the overall
 // allocation of CPUs as "balanced" as possible.
-func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int) (cpuset.CPUSet, error) {
+//
+// NOTE: This algorithm has been generalized to take an additional
+// 'cpuGroupSize' parameter to ensure that CPUs are always allocated in groups
+// of size 'cpuGroupSize' according to the algorithm described above. This is
+// important, for example, to ensure that all CPUs (i.e. all hyperthreads) from
+// a single core are allocated together.
+func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int, cpuGroupSize int) (cpuset.CPUSet, error) {
 	acc := newCPUAccumulator(topo, availableCPUs, numCPUs)
 	if acc.isSatisfied() {
 		return acc.result, nil
@@ -519,9 +525,20 @@ func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpu
 			return
 		}
 
-		// Check that each NUMA node in this combination can provide
-		// (at least) numCPUs/len(combo) of the total cpus required.
-		distribution := numCPUs / len(combo)
+		// Check that CPUs can be handed out in groups of size
+		// 'cpuGroupSize' across the NUMA nodes in this combo.
+		numCPUGroups := 0
+		for _, numa := range combo {
+			numCPUGroups += (acc.details.CPUsInNUMANodes(numa).Size() / cpuGroupSize)
+		}
+		if (numCPUGroups * cpuGroupSize) < numCPUs {
+			return
+		}
+
+		// Check that each NUMA node in this combination can allocate an
+		// even distribution of CPUs in groups of size 'cpuGroupSize',
+		// modulo some remainder.
+		distribution := (numCPUs / len(combo) / cpuGroupSize) * cpuGroupSize
 		for _, numa := range combo {
			cpus := acc.details.CPUsInNUMANodes(numa)
 			if cpus.Size() < distribution {
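
The rounded-down distribution above is easy to sanity-check by hand. Below is a minimal standalone sketch of the integer arithmetic (values borrowed from the new 26-CPU, cpuGroupSize=2 test case added later in this patch; the sketch is illustrative and not part of the change):

```go
package main

import "fmt"

func main() {
	// 26 CPUs requested across a combo of 2 NUMA nodes, handed out in
	// groups of 2 (i.e. whole cores on a 2-way SMT machine).
	numCPUs, comboSize, cpuGroupSize := 26, 2, 2

	// Integer division rounds the even distribution down to a multiple of
	// cpuGroupSize: (26/2/2)*2 = 12 CPUs per NUMA node.
	distribution := (numCPUs / comboSize / cpuGroupSize) * cpuGroupSize

	// The leftover is also a multiple of cpuGroupSize (26-24 = 2), so the
	// remainder can be handed out as whole groups as well.
	remainder := numCPUs - (distribution * comboSize)

	fmt.Println(distribution, remainder) // Output: 12 2
}
```
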
@@ -529,17 +546,19 @@ func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpu
 				return
 			}
 		}
 
-		// Calculate how many CPUs will be available on each NUMA node
-		// in 'combo' ater allocating an even distribution of CPUs from
-		// them. This will be used to calculate a "balance" score for the
-		// combo to help decide which combo should ultimately be chosen.
+		// Calculate how many CPUs will be available on each NUMA node in
+		// 'combo' after allocating an even distribution of CPU groups of
+		// size 'cpuGroupSize' from them. This will be used in the "balance
+		// score" calculation to help decide if this combo should
+		// ultimately be chosen.
 		availableAfterAllocation := make(mapIntInt, len(combo))
 		for _, numa := range combo {
 			availableAfterAllocation[numa] = acc.details.CPUsInNUMANodes(numa).Size() - distribution
 		}
 		// Check if there are any remaining CPUs to distribute across the
-		// NUMA nodes once CPUs have been evenly distributed.
+		// NUMA nodes once CPUs have been evenly distributed in groups of
+		// size 'cpuGroupSize'.
 		remainder := numCPUs - (distribution * len(combo))
 
 		// Declare a set of local variables to help track the "balance
@@ -558,19 +577,22 @@ func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpu
 
 		// Otherwise, find the best "balance score" when allocating the
 		// remainder CPUs across different subsets of NUMA nodes in 'combo'.
-		acc.iterateCombinations(combo, remainder, func(subset []int) {
+		// These remainder CPUs are handed out in groups of size 'cpuGroupSize'.
+		acc.iterateCombinations(combo, remainder/cpuGroupSize, func(subset []int) {
 			// Make a local copy of 'availableAfterAllocation'.
 			availableAfterAllocation := availableAfterAllocation.Clone()
 
-			// For all NUMA nodes in 'subset', remove 1 more CPU (to account
-			// for any remainder CPUs that will be allocated on them.
+			// For all NUMA nodes in 'subset', remove another
+			// 'cpuGroupSize' number of CPUs (to account for any remainder
+			// CPUs that will be allocated on them).
 			for _, numa := range subset {
-				availableAfterAllocation[numa] -= 1
+				availableAfterAllocation[numa] -= cpuGroupSize
 			}
 
 			// Calculate the "balance score" as the standard deviation of
 			// the number of CPUs available on all NUMA nodes in 'combo'
-			// assuming the remainder CPUs are spread across 'subset'.
+			// after the remainder CPUs have been allocated across 'subset'
+			// in groups of size 'cpuGroupSize'.
 			balance := standardDeviation(availableAfterAllocation.Values())
 			if balance < bestLocalBalance {
 				bestLocalBalance = balance
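
The "balance score" continues to rely on the standardDeviation helper that already lives in cpu_assignment.go. For reference, a plausible shape of that helper, consistent with how it is called above (a sketch, not a verbatim copy of the file):

```go
package cpumanager

import "math"

// mean returns the arithmetic mean of the input values.
func mean(xs []int) float64 {
	var sum float64
	for _, x := range xs {
		sum += float64(x)
	}
	return sum / float64(len(xs))
}

// standardDeviation returns the population standard deviation of the input
// values. A lower value over the per-NUMA free-CPU counts means a more
// balanced allocation.
func standardDeviation(xs []int) float64 {
	m := mean(xs)
	var sum float64
	for _, x := range xs {
		sum += (float64(x) - m) * (float64(x) - m)
	}
	return math.Sqrt(sum / float64(len(xs)))
}
```
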
@@ -596,16 +618,18 @@ func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpu
 	}
 
 	// Otherwise, start allocating CPUs from the NUMA node combination
-	// chosen. First allocate numCPUs / len(bestCombo) CPUs from each node.
-	distribution := numCPUs / len(bestCombo)
+	// chosen. First allocate an even distribution of CPUs in groups of
+	// size 'cpuGroupSize' from 'bestCombo'.
+	distribution := (numCPUs / len(bestCombo) / cpuGroupSize) * cpuGroupSize
 	for _, numa := range bestCombo {
 		cpus, _ := takeByTopologyNUMAPacked(acc.topo, acc.details.CPUsInNUMANodes(numa), distribution)
 		acc.take(cpus)
 	}
 
-	// Then allocate any remaining CPUs from each NUMA node in the remainder set.
+	// Then allocate any remaining CPUs in groups of size 'cpuGroupSize'
+	// from each NUMA node in the remainder set.
 	for _, numa := range bestRemainder {
-		cpus, _ := takeByTopologyNUMAPacked(acc.topo, acc.details.CPUsInNUMANodes(numa), 1)
+		cpus, _ := takeByTopologyNUMAPacked(acc.topo, acc.details.CPUsInNUMANodes(numa), cpuGroupSize)
 		acc.take(cpus)
 	}
 
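
With the allocation loop generalized, a caller that wants to keep hyperthread siblings together would presumably pass the topology's threads-per-core count as 'cpuGroupSize'. A hypothetical call site (takeFullCoresDistributed is an invented name for illustration; topo.CPUsPerCore() is the existing accessor on topology.CPUTopology):

```go
// takeFullCoresDistributed is a hypothetical helper in the same package
// (not part of this patch). It distributes a CPU request across NUMA nodes
// while keeping all hyperthreads of each physical core together by
// allocating in groups of topo.CPUsPerCore() (2 on a typical SMT host).
func takeFullCoresDistributed(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int) (cpuset.CPUSet, error) {
	return takeByTopologyNUMADistributed(topo, availableCPUs, numCPUs, topo.CPUsPerCore())
}
```
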
diff --git a/pkg/kubelet/cm/cpumanager/cpu_assignment_test.go b/pkg/kubelet/cm/cpumanager/cpu_assignment_test.go
index 4432be5aa8c..59059feb0cc 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_assignment_test.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_assignment_test.go
@@ -666,14 +666,39 @@ func TestTakeByTopologyNUMAPacked(t *testing.T) {
 	}
 }
 
-func TestTakeByTopologyNUMADistributed(t *testing.T) {
+type takeByTopologyExtendedTestCase struct {
+	description   string
+	topo          *topology.CPUTopology
+	availableCPUs cpuset.CPUSet
+	numCPUs       int
+	cpuGroupSize  int
+	expErr        string
+	expResult     cpuset.CPUSet
+}
+
+func commonTakeByTopologyExtendedTestCases(t *testing.T) []takeByTopologyExtendedTestCase {
+	var extendedTestCases []takeByTopologyExtendedTestCase
+
 	testCases := commonTakeByTopologyTestCases(t)
-	testCases = append(testCases, []takeByTopologyTestCase{
+	for _, tc := range testCases {
+		extendedTestCases = append(extendedTestCases, takeByTopologyExtendedTestCase{
+			tc.description,
+			tc.topo,
+			tc.availableCPUs,
+			tc.numCPUs,
+			1,
+			tc.expErr,
+			tc.expResult,
+		})
+	}
+
+	extendedTestCases = append(extendedTestCases, []takeByTopologyExtendedTestCase{
 		{
 			"allocate 4 full cores with 2 distributed across each NUMA node",
 			topoDualSocketHT,
 			mustParseCPUSet(t, "0-11"),
 			8,
+			1,
 			"",
 			mustParseCPUSet(t, "0,6,2,8,1,7,3,9"),
 		},
@@ -682,6 +707,7 @@ func TestTakeByTopologyNUMADistributed(t *testing.T) {
 			topoDualSocketMultiNumaPerSocketHT,
 			mustParseCPUSet(t, "0-79"),
 			64,
+			1,
 			"",
 			mustParseCPUSet(t, "0-7,10-17,20-27,30-37,40-47,50-57,60-67,70-77"),
 		},
@@ -690,6 +716,7 @@ func TestTakeByTopologyNUMADistributed(t *testing.T) {
 			topoDualSocketMultiNumaPerSocketHT,
 			mustParseCPUSet(t, "0-79"),
 			48,
+			1,
 			"",
 			mustParseCPUSet(t, "0-7,10-17,20-27,40-47,50-57,60-67"),
 		},
@@ -698,6 +725,7 @@ func TestTakeByTopologyNUMADistributed(t *testing.T) {
 			topoDualSocketMultiNumaPerSocketHT,
 			mustParseCPUSet(t, "2-39,42-79"),
 			48,
+			1,
 			"",
 			mustParseCPUSet(t, "2-9,10-17,20-27,42-49,50-57,60-67"),
 		},
@@ -706,6 +734,7 @@ func TestTakeByTopologyNUMADistributed(t *testing.T) {
 			topoDualSocketMultiNumaPerSocketHT,
 			mustParseCPUSet(t, "3-39,43-79"),
 			48,
+			1,
 			"",
 			mustParseCPUSet(t, "10-17,20-27,30-37,50-57,60-67,70-77"),
 		},
@@ -714,6 +743,7 @@ func TestTakeByTopologyNUMADistributed(t *testing.T) {
 			topoDualSocketMultiNumaPerSocketHT,
 			mustParseCPUSet(t, "0-2,10-12,20-22,30-32,40-41,50-51,60-61,70-71"),
 			16,
+			1,
 			"",
 			mustParseCPUSet(t, "0-1,10-11,20-21,30-31,40-41,50-51,60-61,70-71"),
 		},
@@ -722,38 +752,77 @@ func TestTakeByTopologyNUMADistributed(t *testing.T) {
 			topoDualSocketMultiNumaPerSocketHT,
 			mustParseCPUSet(t, "0-2,10-12,20-22,30-32,40-41,50-51,60-61,70-71"),
 			16,
+			1,
 			"",
 			mustParseCPUSet(t, "0-1,10-11,20-21,30-31,40-41,50-51,60-61,70-71"),
 		},
+	}...)
+
+	return extendedTestCases
+}
+
+func TestTakeByTopologyNUMADistributed(t *testing.T) {
+	testCases := commonTakeByTopologyExtendedTestCases(t)
+	testCases = append(testCases, []takeByTopologyExtendedTestCase{
 		{
 			"allocate 13 full cores distributed across the first 2 NUMA nodes",
 			topoDualSocketMultiNumaPerSocketHT,
 			mustParseCPUSet(t, "0-79"),
 			26,
+			1,
 			"",
 			mustParseCPUSet(t, "0-6,10-16,40-45,50-55"),
 		},
+		{
+			"allocate 13 full cores distributed across the first 2 NUMA nodes (cpuGroupSize 2)",
+			topoDualSocketMultiNumaPerSocketHT,
+			mustParseCPUSet(t, "0-79"),
+			26,
+			2,
+			"",
+			mustParseCPUSet(t, "0-6,10-15,40-46,50-55"),
+		},
 		{
 			"allocate 31 full cores with 15 CPUs distributed across each NUMA node and 1 CPU spilling over to each of NUMA 0, 1",
 			topoDualSocketMultiNumaPerSocketHT,
 			mustParseCPUSet(t, "0-79"),
 			62,
+			1,
 			"",
 			mustParseCPUSet(t, "0-7,10-17,20-27,30-37,40-47,50-57,60-66,70-76"),
 		},
+		{
+			"allocate 31 full cores with 14 CPUs distributed across each NUMA node and 2 CPUs spilling over to each of NUMA 0, 1, 2 (cpuGroupSize 2)",
+			topoDualSocketMultiNumaPerSocketHT,
+			mustParseCPUSet(t, "0-79"),
+			62,
+			2,
+			"",
+			mustParseCPUSet(t, "0-7,10-17,20-27,30-36,40-47,50-57,60-67,70-76"),
+		},
 		{
 			"allocate 31 full cores with 15 CPUs distributed across each NUMA node and 1 CPU spilling over to each of NUMA 2, 3 (to keep balance)",
 			topoDualSocketMultiNumaPerSocketHT,
 			mustParseCPUSet(t, "0-8,10-18,20-39,40-48,50-58,60-79"),
 			62,
+			1,
 			"",
 			mustParseCPUSet(t, "0-7,10-17,20-27,30-37,40-46,50-56,60-67,70-77"),
 		},
+		{
+			"allocate 31 full cores with 14 CPUs distributed across each NUMA node and 2 CPUs spilling over to each of NUMA 0, 2, 3 (to keep balance with cpuGroupSize 2)",
+			topoDualSocketMultiNumaPerSocketHT,
+			mustParseCPUSet(t, "0-8,10-18,20-39,40-48,50-58,60-79"),
+			62,
+			2,
+			"",
+			mustParseCPUSet(t, "0-7,10-16,20-27,30-37,40-47,50-56,60-67,70-77"),
+		},
 	}...)
 
 	for _, tc := range testCases {
 		t.Run(tc.description, func(t *testing.T) {
-			result, err := takeByTopologyNUMADistributed(tc.topo, tc.availableCPUs, tc.numCPUs)
+			result, err := takeByTopologyNUMADistributed(tc.topo, tc.availableCPUs, tc.numCPUs, tc.cpuGroupSize)
 			if tc.expErr != "" && err.Error() != tc.expErr {
 				t.Errorf("expected error to be [%v] but it was [%v]", tc.expErr, err)
 			}
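
The "(to keep balance with cpuGroupSize 2)" case above exercises the balance score directly: after 14 CPUs are taken from each NUMA node, the free counts are {NUMA0: 4, NUMA1: 4, NUMA2: 6, NUMA3: 6}, and the three remainder groups of 2 CPUs go to the 3-node subset with the lowest standard deviation. A standalone sketch of that arithmetic (values taken from the test case; stddev mirrors the assumed helper sketched earlier):

```go
package main

import (
	"fmt"
	"math"
)

// stddev computes the population standard deviation of the free-CPU counts,
// i.e. the "balance score" used to compare candidate remainder placements.
func stddev(xs []int) float64 {
	var sum, sq float64
	for _, x := range xs {
		sum += float64(x)
	}
	m := sum / float64(len(xs))
	for _, x := range xs {
		sq += (float64(x) - m) * (float64(x) - m)
	}
	return math.Sqrt(sq / float64(len(xs)))
}

func main() {
	// Free CPUs per NUMA node start at {4, 4, 6, 6}; each candidate subset
	// of 3 nodes loses one more group of 2 CPUs per node.
	fmt.Println(stddev([]int{2, 2, 4, 6})) // remainder on NUMA {0, 1, 2}: ~1.66
	fmt.Println(stddev([]int{2, 4, 4, 4})) // remainder on NUMA {0, 2, 3}: ~0.87 (chosen)
}
```

This matches the expected cpuset in that test case: NUMA nodes 0, 2, and 3 each give up one extra core's worth of CPUs.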