From 031f11513dd5007d1f631b59c5d6eb63a73685bf Mon Sep 17 00:00:00 2001
From: Kevin Klues
Date: Mon, 22 Nov 2021 18:18:26 +0000
Subject: [PATCH] Fix accounting bug in CPUManager distribute NUMA policy

Without this fix, the algorithm may decide to allocate "remainder" CPUs
from a NUMA node that has no more CPUs to allocate. Moreover, it was
only considering allocation of remainder CPUs from NUMA nodes such that
each NUMA node in the remainderSet could only allocate 1 more group of
CPUs (i.e. 'cpuGroupSize' CPUs). With these two issues in play, one
could end up with an accounting error where not enough CPUs were
allocated by the time the algorithm runs to completion.

The updated algorithm will now omit any NUMA nodes that have 0 CPUs
left from the set of NUMA nodes considered for allocating remainder
CPUs. Additionally, we now consider *all* combinations of nodes from
the remainder set of size 1..len(remainderSet). This allows us to find
a better solution if allocating CPUs from a smaller set leads to a more
balanced allocation. Finally, we loop through all NUMA nodes 1-by-1 in
the remainderSet until all remainder CPUs have been accounted for and
allocated. This ensures that we will not hit an accounting error later
on, because we explicitly remove CPUs from the remainder set until there
are none left.

A follow-on commit adds a set of unit tests that will fail before these
changes, but succeed after them.

Signed-off-by: Kevin Klues
---
 pkg/kubelet/cm/cpumanager/cpu_assignment.go | 96 +++++++++++++++------
 1 file changed, 70 insertions(+), 26 deletions(-)

diff --git a/pkg/kubelet/cm/cpumanager/cpu_assignment.go b/pkg/kubelet/cm/cpumanager/cpu_assignment.go
index 578b0415549..01ddf57bb7b 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_assignment.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_assignment.go
@@ -638,9 +638,20 @@ func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpu
 			// size 'cpuGroupSize'.
 			remainder := numCPUs - (distribution * len(combo))
 
+			// Get a list of NUMA nodes to consider pulling the remainder CPUs
+			// from. This list excludes NUMA nodes that don't have at least
+			// 'cpuGroupSize' CPUs available after being allocated
+			// 'distribution' number of CPUs.
+			var remainderCombo []int
+			for _, numa := range combo {
+				if availableAfterAllocation[numa] >= cpuGroupSize {
+					remainderCombo = append(remainderCombo, numa)
+				}
+			}
+
 			// Declare a set of local variables to help track the "balance
-			// scores" calculated when using different subsets of 'combo' to
-			// allocate remainder CPUs from.
+			// scores" calculated when using different subsets of
+			// 'remainderCombo' to allocate remainder CPUs from.
 			var bestLocalBalance float64 = math.MaxFloat64
 			var bestLocalRemainder []int = nil
 
@@ -653,31 +664,54 @@ func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpu
 			}
 
 			// Otherwise, find the best "balance score" when allocating the
-			// remainder CPUs across different subsets of NUMA nodes in 'combo'.
+			// remainder CPUs across different subsets of NUMA nodes in 'remainderCombo'.
 			// These remainder CPUs are handed out in groups of size 'cpuGroupSize'.
-			acc.iterateCombinations(combo, remainder/cpuGroupSize, func(subset []int) LoopControl {
-				// Make a local copy of 'availableAfterAllocation'.
-				availableAfterAllocation := availableAfterAllocation.Clone()
+			// We start from k=len(remainderCombo) and walk down to k=1 so that
+			// we continue to distribute CPUs as much as possible across
+			// multiple NUMA nodes.
+			for k := len(remainderCombo); remainder > 0 && k >= 1; k-- {
+				acc.iterateCombinations(remainderCombo, k, func(subset []int) LoopControl {
+					// Make a local copy of 'remainder'.
+					remainder := remainder
 
-				// For all NUMA nodes in 'subset', remove another
-				// 'cpuGroupSize' number of CPUs (to account for any remainder
-				// CPUs that will be allocated on them).
-				for _, numa := range subset {
-					availableAfterAllocation[numa] -= cpuGroupSize
-				}
+					// Make a local copy of 'availableAfterAllocation'.
+					availableAfterAllocation := availableAfterAllocation.Clone()
 
-				// Calculate the "balance score" as the standard deviation of
-				// the number of CPUs available on all NUMA nodes in the system
-				// after the remainder CPUs have been allocated across 'subset'
-				// in groups of size 'cpuGroupSize'.
-				balance := standardDeviation(availableAfterAllocation.Values())
-				if balance < bestLocalBalance {
-					bestLocalBalance = balance
-					bestLocalRemainder = subset
-				}
+					// If this subset is not capable of allocating all
+					// remainder CPUs, continue to the next one.
+					if sum(availableAfterAllocation.Values(subset...)) < remainder {
+						return Continue
+					}
 
-				return Continue
-			})
+					// For all NUMA nodes in 'subset', walk through them,
+					// removing 'cpuGroupSize' number of CPUs from each
+					// until all remainder CPUs have been accounted for.
+					for remainder > 0 {
+						for _, numa := range subset {
+							if remainder == 0 {
+								break
+							}
+							if availableAfterAllocation[numa] < cpuGroupSize {
+								continue
+							}
+							availableAfterAllocation[numa] -= cpuGroupSize
+							remainder -= cpuGroupSize
+						}
+					}
+
+					// Calculate the "balance score" as the standard deviation
+					// of the number of CPUs available on all NUMA nodes in the
+					// system after the remainder CPUs have been allocated
+					// across 'subset' in groups of size 'cpuGroupSize'.
+					balance := standardDeviation(availableAfterAllocation.Values())
+					if balance < bestLocalBalance {
+						bestLocalBalance = balance
+						bestLocalRemainder = subset
+					}
+
+					return Continue
+				})
+			}
 
 			// If the best "balance score" for this combo is less than the
 			// lowest "balance score" of all previous combos, then update this
@@ -709,9 +743,19 @@ func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpu
 
 	// Then allocate any remaining CPUs in groups of size 'cpuGroupSize'
 	// from each NUMA node in the remainder set.
-	for _, numa := range bestRemainder {
-		cpus, _ := takeByTopologyNUMAPacked(acc.topo, acc.details.CPUsInNUMANodes(numa), cpuGroupSize)
-		acc.take(cpus)
+	remainder := numCPUs - (distribution * len(bestCombo))
+	for remainder > 0 {
+		for _, numa := range bestRemainder {
+			if remainder == 0 {
+				break
+			}
+			if acc.details.CPUsInNUMANodes(numa).Size() < cpuGroupSize {
+				continue
+			}
+			cpus, _ := takeByTopologyNUMAPacked(acc.topo, acc.details.CPUsInNUMANodes(numa), cpuGroupSize)
+			acc.take(cpus)
+			remainder -= cpuGroupSize
+		}
 	}
 
 	// If we haven't allocated all of our CPUs at this point, then something
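
Note for reviewers: the standalone Go sketch below illustrates the idea behind the new remainder accounting outside of the kubelet. It hands out remainder CPUs group-by-group, round-robin across a remainder set that excludes exhausted NUMA nodes, and scores the result with the standard deviation of free CPUs per node. It is an illustration only: the 'free' map, the 'distributeRemainder' helper, and the toy topology are assumptions made for this example, not code from cpu_assignment.go, and the up-front capacity check is a slightly stricter stand-in for the patch's sum(...) guard.

package main

import (
	"fmt"
	"math"
)

// standardDeviation mirrors the "balance score" used by the patch: the
// standard deviation of free CPUs across all NUMA nodes (lower is better).
func standardDeviation(xs []int) float64 {
	var sum float64
	for _, x := range xs {
		sum += float64(x)
	}
	mean := sum / float64(len(xs))
	var variance float64
	for _, x := range xs {
		variance += (float64(x) - mean) * (float64(x) - mean)
	}
	return math.Sqrt(variance / float64(len(xs)))
}

// distributeRemainder is a hypothetical stand-in for the patched inner loop:
// it cycles through 'subset' handing out 'cpuGroupSize' CPUs at a time,
// skipping any node that can no longer give up a full group. It first checks
// that 'subset' can supply enough full groups, so the walk below always
// terminates instead of hitting the accounting error described above.
func distributeRemainder(free map[int]int, subset []int, remainder, cpuGroupSize int) bool {
	groups := 0
	for _, numa := range subset {
		groups += free[numa] / cpuGroupSize
	}
	if groups*cpuGroupSize < remainder {
		return false
	}
	for remainder > 0 {
		for _, numa := range subset {
			if remainder <= 0 {
				break
			}
			if free[numa] < cpuGroupSize {
				continue
			}
			free[numa] -= cpuGroupSize
			remainder -= cpuGroupSize
		}
	}
	return true
}

func main() {
	// Assumed toy state: free CPUs per NUMA node after the even
	// 'distribution' step; node 2 is already exhausted, so (as in the fix)
	// it is excluded from the remainder set.
	free := map[int]int{0: 4, 1: 2, 2: 0}
	remainderSet := []int{0, 1}

	if !distributeRemainder(free, remainderSet, 4, 2) {
		fmt.Println("remainder cannot be satisfied by this subset")
		return
	}
	fmt.Println("free CPUs after remainder allocation:", free)
	fmt.Println("balance score:", standardDeviation([]int{free[0], free[1], free[2]}))
}

The capacity check before the round-robin walk is what keeps the sketch (and, via the sum(...) guard, the patched code) from trying to pull remainder CPUs from a subset that cannot absorb them all.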