Mirror of https://github.com/k3s-io/kubernetes.git
Generalize the NUMA distribution algorithm to take cpuGroupSize
This parameter ensures that CPUs are always allocated in groups of size 'cpuGroupSize'. This is important, for example, to ensure that all CPUs (i.e. hyperthreads) from the same core are handed out together.

Signed-off-by: Kevin Klues <kklues@nvidia.com>
This commit is contained in:
  parent 1436e33642
  commit d54445a84d
@@ -482,7 +482,13 @@ func takeByTopologyNUMAPacked(topo *topology.CPUTopology, availableCPUs cpuset.C
 // evenly distributed and remainder CPUs are allocated. The subset with the
 // lowest "balance score" will receive the CPUs in order to keep the overall
 // allocation of CPUs as "balanced" as possible.
-func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int) (cpuset.CPUSet, error) {
+//
+// NOTE: This algorithm has been generalized to take an additional
+// 'cpuGroupSize' parameter to ensure that CPUs are always allocated in groups
+// of size 'cpuGroupSize' according to the algorithm described above. This is
+// important, for example, to ensure that all CPUs (i.e. all hyperthreads) from
+// a single core are allocated together.
+func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpuset.CPUSet, numCPUs int, cpuGroupSize int) (cpuset.CPUSet, error) {
 	acc := newCPUAccumulator(topo, availableCPUs, numCPUs)
 	if acc.isSatisfied() {
 		return acc.result, nil
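With the new signature, a caller chooses the group granularity explicitly: passing 1 reproduces the old per-CPU behavior, while passing the number of hyperthreads per core keeps cores intact. A hypothetical call site (not from this commit; it assumes the CPUsPerCore method on CPUTopology, which the cpumanager topology package provides):

// Keep hyperthread siblings together on an SMT machine.
cpuGroupSize := topo.CPUsPerCore() // e.g. 2 with two-way hyperthreading
cpus, err := takeByTopologyNUMADistributed(topo, availableCPUs, numCPUs, cpuGroupSize)
if err != nil {
	return cpuset.CPUSet{}, err
}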
@@ -519,9 +525,20 @@ func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpu
 			return
 		}
 
-		// Check that each NUMA node in this combination can provide
-		// (at least) numCPUs/len(combo) of the total cpus required.
-		distribution := numCPUs / len(combo)
+		// Check that CPUs can be handed out in groups of size
+		// 'cpuGroupSize' across the NUMA nodes in this combo.
+		numCPUGroups := 0
+		for _, numa := range combo {
+			numCPUGroups += (acc.details.CPUsInNUMANodes(numa).Size() / cpuGroupSize)
+		}
+		if (numCPUGroups * cpuGroupSize) < numCPUs {
+			return
+		}
+
+		// Check that each NUMA node in this combination can allocate an
+		// even distribution of CPUs in groups of size 'cpuGroupSize',
+		// modulo some remainder.
+		distribution := (numCPUs / len(combo) / cpuGroupSize) * cpuGroupSize
 		for _, numa := range combo {
 			cpus := acc.details.CPUsInNUMANodes(numa)
 			if cpus.Size() < distribution {
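The first new check rejects a NUMA node combination whose free CPUs cannot form enough complete groups; note that a partial group on a node (say, 3 free CPUs with groups of 2) contributes only its whole groups. The same test in isolation (a sketch with a hypothetical enoughGroups helper; the real code reads its counts from acc.details):

// enoughGroups reports whether the free CPUs across a set of NUMA nodes
// can supply numCPUs worth of complete cpuGroupSize-sized groups.
func enoughGroups(freeCPUsPerNode []int, numCPUs, cpuGroupSize int) bool {
	numCPUGroups := 0
	for _, free := range freeCPUsPerNode {
		numCPUGroups += free / cpuGroupSize
	}
	return numCPUGroups*cpuGroupSize >= numCPUs
}

// enoughGroups([]int{3, 3}, 6, 2) == false: each node can form only one
// pair, so only 4 CPUs are usable even though 6 are free.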
@@ -529,17 +546,19 @@ func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpu
 			}
 		}
 
-		// Calculate how many CPUs will be available on each NUMA node
-		// in 'combo' ater allocating an even distribution of CPUs from
-		// them. This will be used to calculate a "balance" score for the
-		// combo to help decide which combo should ultimately be chosen.
+		// Calculate how many CPUs will be available on each NUMA node in
+		// 'combo' after allocating an even distribution of CPU groups of
+		// size 'cpuGroupSize' from them. This will be used in the "balance
+		// score" calculation to help decide if this combo should
+		// ultimately be chosen.
 		availableAfterAllocation := make(mapIntInt, len(combo))
 		for _, numa := range combo {
 			availableAfterAllocation[numa] = acc.details.CPUsInNUMANodes(numa).Size() - distribution
 		}
 
 		// Check if there are any remaining CPUs to distribute across the
-		// NUMA nodes once CPUs have been evenly distributed.
+		// NUMA nodes once CPUs have been evenly distributed in groups of
+		// size 'cpuGroupSize'.
 		remainder := numCPUs - (distribution * len(combo))
 
 		// Declare a set of local variables to help track the "balance
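Worked through one of the test cases added below (sketch arithmetic, not commit code): for 62 CPUs over a combo of 4 NUMA nodes with cpuGroupSize 2,

	distribution = (62 / 4 / 2) * 2 = 14
	remainder    = 62 - (14 * 4)   = 6

so the 6 remainder CPUs are handed out as 3 groups of 2, i.e. 2 CPUs spill over onto each of 3 NUMA nodes. That is exactly the expectation in the test "allocate 31 full cores with 14 CPUs distributed across each NUMA node and 2 CPUs spilling over to each of NUMA 0, 1, 2 (cpuGroupSize 2)".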
@@ -558,19 +577,22 @@ func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpu
 
 		// Otherwise, find the best "balance score" when allocating the
 		// remainder CPUs across different subsets of NUMA nodes in 'combo'.
-		acc.iterateCombinations(combo, remainder, func(subset []int) {
+		// These remainder CPUs are handed out in groups of size 'cpuGroupSize'.
+		acc.iterateCombinations(combo, remainder/cpuGroupSize, func(subset []int) {
 			// Make a local copy of 'availableAfterAllocation'.
 			availableAfterAllocation := availableAfterAllocation.Clone()
 
-			// For all NUMA nodes in 'subset', remove 1 more CPU (to account
-			// for any remainder CPUs that will be allocated on them.
+			// For all NUMA nodes in 'subset', remove another
+			// 'cpuGroupSize' number of CPUs (to account for any remainder
+			// CPUs that will be allocated on them).
 			for _, numa := range subset {
-				availableAfterAllocation[numa] -= 1
+				availableAfterAllocation[numa] -= cpuGroupSize
 			}
 
 			// Calculate the "balance score" as the standard deviation of
 			// the number of CPUs available on all NUMA nodes in 'combo'
-			// assuming the remainder CPUs are spread across 'subset'.
+			// after the remainder CPUs have been allocated across 'subset'
+			// in groups of size 'cpuGroupSize'.
 			balance := standardDeviation(availableAfterAllocation.Values())
 			if balance < bestLocalBalance {
 				bestLocalBalance = balance
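The diff takes standardDeviation as given (it already exists in this file). For experimenting with balance scores outside the tree, a self-contained equivalent might look like the sketch below; population standard deviation is assumed, since the in-tree definition is not visible in this hunk:

import "math"

// standardDeviation over the post-allocation free-CPU counts; a lower
// value means the combo leaves the NUMA nodes more evenly loaded.
func standardDeviation(xs []int) float64 {
	var sum float64
	for _, x := range xs {
		sum += float64(x)
	}
	mean := sum / float64(len(xs))
	var variance float64
	for _, x := range xs {
		variance += (float64(x) - mean) * (float64(x) - mean)
	}
	return math.Sqrt(variance / float64(len(xs)))
}

// For example, leaving [2 2 2 2] CPUs free scores 0.0, while [4 2 2 0]
// scores about 1.41, so the evenly spread remainder wins.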
@@ -596,16 +618,18 @@ func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpu
 	}
 
 	// Otherwise, start allocating CPUs from the NUMA node combination
-	// chosen. First allocate numCPUs / len(bestCombo) CPUs from each node.
-	distribution := numCPUs / len(bestCombo)
+	// chosen. First allocate an even distribution of CPUs in groups of
+	// size 'cpuGroupSize' from 'bestCombo'.
+	distribution := (numCPUs / len(bestCombo) / cpuGroupSize) * cpuGroupSize
 	for _, numa := range bestCombo {
 		cpus, _ := takeByTopologyNUMAPacked(acc.topo, acc.details.CPUsInNUMANodes(numa), distribution)
 		acc.take(cpus)
 	}
 
-	// Then allocate any remaining CPUs from each NUMA node in the remainder set.
+	// Then allocate any remaining CPUs in groups of size 'cpuGroupSize'
+	// from each NUMA node in the remainder set.
 	for _, numa := range bestRemainder {
-		cpus, _ := takeByTopologyNUMAPacked(acc.topo, acc.details.CPUsInNUMANodes(numa), 1)
+		cpus, _ := takeByTopologyNUMAPacked(acc.topo, acc.details.CPUsInNUMANodes(numa), cpuGroupSize)
 		acc.take(cpus)
 	}
 
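Both allocation passes now move whole groups: the even pass takes a group-aligned share per node via takeByTopologyNUMAPacked, and the remainder pass takes one more group of 'cpuGroupSize' CPUs from each node in the remainder set. A toy version over plain counters (hypothetical; it ignores real topology and error handling):

// toyDistribute mimics the two passes: an even, group-aligned share for
// every node in the combo, then one extra group for each node chosen to
// absorb the remainder.
func toyDistribute(nodes []int, numCPUs, cpuGroupSize int, remainderSet map[int]bool) map[int]int {
	allocated := make(map[int]int, len(nodes))
	distribution := (numCPUs / len(nodes) / cpuGroupSize) * cpuGroupSize
	for _, n := range nodes {
		allocated[n] = distribution
		if remainderSet[n] {
			allocated[n] += cpuGroupSize
		}
	}
	return allocated
}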
@@ -666,14 +666,39 @@ func TestTakeByTopologyNUMAPacked(t *testing.T) {
 	}
 }
 
-func TestTakeByTopologyNUMADistributed(t *testing.T) {
+type takeByTopologyExtendedTestCase struct {
+	description   string
+	topo          *topology.CPUTopology
+	availableCPUs cpuset.CPUSet
+	numCPUs       int
+	cpuGroupSize  int
+	expErr        string
+	expResult     cpuset.CPUSet
+}
+
+func commonTakeByTopologyExtendedTestCases(t *testing.T) []takeByTopologyExtendedTestCase {
+	var extendedTestCases []takeByTopologyExtendedTestCase
+
 	testCases := commonTakeByTopologyTestCases(t)
-	testCases = append(testCases, []takeByTopologyTestCase{
+	for _, tc := range testCases {
+		extendedTestCases = append(extendedTestCases, takeByTopologyExtendedTestCase{
+			tc.description,
+			tc.topo,
+			tc.availableCPUs,
+			tc.numCPUs,
+			1,
+			tc.expErr,
+			tc.expResult,
+		})
+	}
+
+	extendedTestCases = append(extendedTestCases, []takeByTopologyExtendedTestCase{
 		{
 			"allocate 4 full cores with 2 distributed across each NUMA node",
 			topoDualSocketHT,
 			mustParseCPUSet(t, "0-11"),
 			8,
+			1,
 			"",
 			mustParseCPUSet(t, "0,6,2,8,1,7,3,9"),
 		},
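Every pre-existing test case is replayed with a cpuGroupSize of 1, so the generalized algorithm must reproduce the old results exactly before any of the new group-aware cases apply. Assuming the usual kubernetes source layout for this package, the suite can be exercised with:

go test -run TestTakeByTopologyNUMADistributed ./pkg/kubelet/cm/cpumanager/...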
@@ -682,6 +707,7 @@ func TestTakeByTopologyNUMADistributed(t *testing.T) {
 			topoDualSocketMultiNumaPerSocketHT,
 			mustParseCPUSet(t, "0-79"),
 			64,
+			1,
 			"",
 			mustParseCPUSet(t, "0-7,10-17,20-27,30-37,40-47,50-57,60-67,70-77"),
 		},
@@ -690,6 +716,7 @@ func TestTakeByTopologyNUMADistributed(t *testing.T) {
 			topoDualSocketMultiNumaPerSocketHT,
 			mustParseCPUSet(t, "0-79"),
 			48,
+			1,
 			"",
 			mustParseCPUSet(t, "0-7,10-17,20-27,40-47,50-57,60-67"),
 		},
@@ -698,6 +725,7 @@ func TestTakeByTopologyNUMADistributed(t *testing.T) {
 			topoDualSocketMultiNumaPerSocketHT,
 			mustParseCPUSet(t, "2-39,42-79"),
 			48,
+			1,
 			"",
 			mustParseCPUSet(t, "2-9,10-17,20-27,42-49,50-57,60-67"),
 		},
@@ -706,6 +734,7 @@ func TestTakeByTopologyNUMADistributed(t *testing.T) {
 			topoDualSocketMultiNumaPerSocketHT,
 			mustParseCPUSet(t, "3-39,43-79"),
 			48,
+			1,
 			"",
 			mustParseCPUSet(t, "10-17,20-27,30-37,50-57,60-67,70-77"),
 		},
@@ -714,6 +743,7 @@ func TestTakeByTopologyNUMADistributed(t *testing.T) {
 			topoDualSocketMultiNumaPerSocketHT,
 			mustParseCPUSet(t, "0-2,10-12,20-22,30-32,40-41,50-51,60-61,70-71"),
 			16,
+			1,
 			"",
 			mustParseCPUSet(t, "0-1,10-11,20-21,30-31,40-41,50-51,60-61,70-71"),
 		},
@@ -722,38 +752,77 @@ func TestTakeByTopologyNUMADistributed(t *testing.T) {
 			topoDualSocketMultiNumaPerSocketHT,
 			mustParseCPUSet(t, "0-2,10-12,20-22,30-32,40-41,50-51,60-61,70-71"),
 			16,
+			1,
 			"",
 			mustParseCPUSet(t, "0-1,10-11,20-21,30-31,40-41,50-51,60-61,70-71"),
 		},
+	}...)
+
+	return extendedTestCases
+}
+
+func TestTakeByTopologyNUMADistributed(t *testing.T) {
+	testCases := commonTakeByTopologyExtendedTestCases(t)
+	testCases = append(testCases, []takeByTopologyExtendedTestCase{
 		{
 			"allocate 13 full cores distributed across the first 2 NUMA nodes",
 			topoDualSocketMultiNumaPerSocketHT,
 			mustParseCPUSet(t, "0-79"),
 			26,
+			1,
 			"",
 			mustParseCPUSet(t, "0-6,10-16,40-45,50-55"),
 		},
+		{
+			"allocate 13 full cores distributed across the first 2 NUMA nodes (cpuGroupSize 2)",
+			topoDualSocketMultiNumaPerSocketHT,
+			mustParseCPUSet(t, "0-79"),
+			26,
+			2,
+			"",
+			mustParseCPUSet(t, "0-6,10-15,40-46,50-55"),
+		},
 		{
 			"allocate 31 full cores with 15 CPUs distributed across each NUMA node and 1 CPU spilling over to each of NUMA 0, 1",
 			topoDualSocketMultiNumaPerSocketHT,
 			mustParseCPUSet(t, "0-79"),
 			62,
+			1,
 			"",
 			mustParseCPUSet(t, "0-7,10-17,20-27,30-37,40-47,50-57,60-66,70-76"),
 		},
+		{
+			"allocate 31 full cores with 14 CPUs distributed across each NUMA node and 2 CPUs spilling over to each of NUMA 0, 1, 2 (cpuGroupSize 2)",
+			topoDualSocketMultiNumaPerSocketHT,
+			mustParseCPUSet(t, "0-79"),
+			62,
+			2,
+			"",
+			mustParseCPUSet(t, "0-7,10-17,20-27,30-36,40-47,50-57,60-67,70-76"),
+		},
 		{
 			"allocate 31 full cores with 15 CPUs distributed across each NUMA node and 1 CPU spilling over to each of NUMA 2, 3 (to keep balance)",
 			topoDualSocketMultiNumaPerSocketHT,
 			mustParseCPUSet(t, "0-8,10-18,20-39,40-48,50-58,60-79"),
 			62,
+			1,
 			"",
 			mustParseCPUSet(t, "0-7,10-17,20-27,30-37,40-46,50-56,60-67,70-77"),
 		},
+		{
+			"allocate 31 full cores with 14 CPUs distributed across each NUMA node and 2 CPUs spilling over to each of NUMA 0, 2, 3 (to keep balance with cpuGroupSize 2)",
+			topoDualSocketMultiNumaPerSocketHT,
+			mustParseCPUSet(t, "0-8,10-18,20-39,40-48,50-58,60-79"),
+			62,
+			2,
+			"",
+			mustParseCPUSet(t, "0-7,10-16,20-27,30-37,40-47,50-56,60-67,70-77"),
+		},
 	}...)
 
 	for _, tc := range testCases {
 		t.Run(tc.description, func(t *testing.T) {
-			result, err := takeByTopologyNUMADistributed(tc.topo, tc.availableCPUs, tc.numCPUs)
+			result, err := takeByTopologyNUMADistributed(tc.topo, tc.availableCPUs, tc.numCPUs, tc.cpuGroupSize)
 			if tc.expErr != "" && err.Error() != tc.expErr {
 				t.Errorf("expected error to be [%v] but it was [%v]", tc.expErr, err)
 			}
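As a sanity check of one new expectation: in the topology these tests use, NUMA node 0 holds CPUs 0-9 with hyperthread siblings 40-49, and NUMA node 1 holds 10-19 with siblings 50-59 (inferred from the expected cpusets). For "allocate 13 full cores distributed across the first 2 NUMA nodes (cpuGroupSize 2)", distribution = (26 / 2 / 2) * 2 = 12 and remainder = 2, so one extra hyperthread pair lands on node 0. Node 0 then supplies 14 CPUs (cores 0-6, i.e. CPUs 0-6 and 40-46) and node 1 supplies 12 CPUs (cores 10-15, i.e. CPUs 10-15 and 50-55), matching the expected set "0-6,10-15,40-46,50-55".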