From 031f11513dd5007d1f631b59c5d6eb63a73685bf Mon Sep 17 00:00:00 2001
From: Kevin Klues
Date: Mon, 22 Nov 2021 18:18:26 +0000
Subject: [PATCH] Fix accounting bug in CPUManager distribute NUMA policy

Without this fix, the algorithm may decide to allocate "remainder" CPUs
from a NUMA node that has no more CPUs to allocate. Moreover, it was
only considering allocation of remainder CPUs from NUMA nodes such that
each NUMA node in the remainderSet could only allocate 1 more group of
CPUs (i.e. 'cpuGroupSize' CPUs). With these two issues in play, one
could end up with an accounting error where not enough CPUs were
allocated by the time the algorithm runs to completion.

The updated algorithm will now omit any NUMA nodes that have 0 CPUs
left from the set of NUMA nodes considered for allocating remainder
CPUs. Additionally, we now consider *all* combinations of nodes from
the remainder set of size 1..len(remainderSet). This allows us to find
a better solution if allocating CPUs from a smaller set leads to a more
balanced allocation. Finally, we loop through all NUMA nodes 1-by-1 in
the remainderSet until all remainder CPUs have been accounted for and
allocated. This ensures that we will not hit an accounting error later
on, because we explicitly remove CPUs from the remainder set until there
are none left.

A follow-on commit adds a set of unit tests that will fail before these
changes, but succeed after them.

Signed-off-by: Kevin Klues
---
 pkg/kubelet/cm/cpumanager/cpu_assignment.go | 96 +++++++++++++++------
 1 file changed, 70 insertions(+), 26 deletions(-)

diff --git a/pkg/kubelet/cm/cpumanager/cpu_assignment.go b/pkg/kubelet/cm/cpumanager/cpu_assignment.go
index 578b0415549..01ddf57bb7b 100644
--- a/pkg/kubelet/cm/cpumanager/cpu_assignment.go
+++ b/pkg/kubelet/cm/cpumanager/cpu_assignment.go
@@ -638,9 +638,20 @@ func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpu
 			// size 'cpuGroupSize'.
 			remainder := numCPUs - (distribution * len(combo))
 
+			// Get a list of NUMA nodes to consider pulling the remainder CPUs
+			// from. This list excludes NUMA nodes that don't have at least
+			// 'cpuGroupSize' CPUs available after being allocated
+			// 'distribution' number of CPUs.
+			var remainderCombo []int
+			for _, numa := range combo {
+				if availableAfterAllocation[numa] >= cpuGroupSize {
+					remainderCombo = append(remainderCombo, numa)
+				}
+			}
+
 			// Declare a set of local variables to help track the "balance
-			// scores" calculated when using different subsets of 'combo' to
-			// allocate remainder CPUs from.
+			// scores" calculated when using different subsets of
+			// 'remainderCombo' to allocate remainder CPUs from.
 			var bestLocalBalance float64 = math.MaxFloat64
 			var bestLocalRemainder []int = nil
 
@@ -653,31 +664,54 @@ func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpu
 			}
 
 			// Otherwise, find the best "balance score" when allocating the
-			// remainder CPUs across different subsets of NUMA nodes in 'combo'.
+			// remainder CPUs across different subsets of NUMA nodes in 'remainderCombo'.
 			// These remainder CPUs are handed out in groups of size 'cpuGroupSize'.
-			acc.iterateCombinations(combo, remainder/cpuGroupSize, func(subset []int) LoopControl {
-				// Make a local copy of 'availableAfterAllocation'.
-				availableAfterAllocation := availableAfterAllocation.Clone()
+			// We start from k=len(remainderCombo) and walk down to k=1 so that
+			// we continue to distribute CPUs as much as possible across
+			// multiple NUMA nodes.
+			for k := len(remainderCombo); remainder > 0 && k >= 1; k-- {
+				acc.iterateCombinations(remainderCombo, k, func(subset []int) LoopControl {
+					// Make a local copy of 'remainder'.
+					remainder := remainder
 
-				// For all NUMA nodes in 'subset', remove another
-				// 'cpuGroupSize' number of CPUs (to account for any remainder
-				// CPUs that will be allocated on them).
-				for _, numa := range subset {
-					availableAfterAllocation[numa] -= cpuGroupSize
-				}
+					// Make a local copy of 'availableAfterAllocation'.
+					availableAfterAllocation := availableAfterAllocation.Clone()
 
-				// Calculate the "balance score" as the standard deviation of
-				// the number of CPUs available on all NUMA nodes in the system
-				// after the remainder CPUs have been allocated across 'subset'
-				// in groups of size 'cpuGroupSize'.
-				balance := standardDeviation(availableAfterAllocation.Values())
-				if balance < bestLocalBalance {
-					bestLocalBalance = balance
-					bestLocalRemainder = subset
-				}
+					// If this subset is not capable of allocating all
+					// remainder CPUs, continue to the next one.
+					if sum(availableAfterAllocation.Values(subset...)) < remainder {
+						return Continue
+					}
 
-				return Continue
-			})
+					// For all NUMA nodes in 'subset', walk through them,
+					// removing 'cpuGroupSize' number of CPUs from each
+					// until all remainder CPUs have been accounted for.
+					for remainder > 0 {
+						for _, numa := range subset {
+							if remainder == 0 {
+								break
+							}
+							if availableAfterAllocation[numa] < cpuGroupSize {
+								continue
+							}
+							availableAfterAllocation[numa] -= cpuGroupSize
+							remainder -= cpuGroupSize
+						}
+					}
+
+					// Calculate the "balance score" as the standard deviation
+					// of the number of CPUs available on all NUMA nodes in the
+					// system after the remainder CPUs have been allocated
+					// across 'subset' in groups of size 'cpuGroupSize'.
+					balance := standardDeviation(availableAfterAllocation.Values())
+					if balance < bestLocalBalance {
+						bestLocalBalance = balance
+						bestLocalRemainder = subset
+					}
+
+					return Continue
+				})
+			}
 
 			// If the best "balance score" for this combo is less than the
 			// lowest "balance score" of all previous combos, then update this
@@ -709,9 +743,19 @@ func takeByTopologyNUMADistributed(topo *topology.CPUTopology, availableCPUs cpu
 
 	// Then allocate any remaining CPUs in groups of size 'cpuGroupSize'
 	// from each NUMA node in the remainder set.
-	for _, numa := range bestRemainder {
-		cpus, _ := takeByTopologyNUMAPacked(acc.topo, acc.details.CPUsInNUMANodes(numa), cpuGroupSize)
-		acc.take(cpus)
+	remainder := numCPUs - (distribution * len(bestCombo))
+	for remainder > 0 {
+		for _, numa := range bestRemainder {
+			if remainder == 0 {
+				break
+			}
+			if acc.details.CPUsInNUMANodes(numa).Size() < cpuGroupSize {
+				continue
+			}
+			cpus, _ := takeByTopologyNUMAPacked(acc.topo, acc.details.CPUsInNUMANodes(numa), cpuGroupSize)
+			acc.take(cpus)
+			remainder -= cpuGroupSize
+		}
 	}
 
 	// If we haven't allocated all of our CPUs at this point, then something
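
Note for reviewers: the standalone Go sketch below illustrates the idea behind the new remainder accounting outside of the kubelet. It hands out remainder CPUs group-by-group, round-robin across a remainder set that excludes exhausted NUMA nodes, and scores the result with the standard deviation of free CPUs per node. It is an illustration only: the 'free' map, the 'distributeRemainder' helper, and the toy topology are assumptions made for this example, not code from cpu_assignment.go, and the up-front capacity check is a slightly stricter stand-in for the patch's sum(...) guard.

package main

import (
	"fmt"
	"math"
)

// standardDeviation mirrors the "balance score" used by the patch: the
// standard deviation of free CPUs across all NUMA nodes (lower is better).
func standardDeviation(xs []int) float64 {
	var sum float64
	for _, x := range xs {
		sum += float64(x)
	}
	mean := sum / float64(len(xs))
	var variance float64
	for _, x := range xs {
		variance += (float64(x) - mean) * (float64(x) - mean)
	}
	return math.Sqrt(variance / float64(len(xs)))
}

// distributeRemainder is a hypothetical stand-in for the patched inner loop:
// it cycles through 'subset' handing out 'cpuGroupSize' CPUs at a time,
// skipping any node that can no longer give up a full group. It first checks
// that 'subset' can supply enough full groups, so the walk below always
// terminates instead of hitting the accounting error described above.
func distributeRemainder(free map[int]int, subset []int, remainder, cpuGroupSize int) bool {
	groups := 0
	for _, numa := range subset {
		groups += free[numa] / cpuGroupSize
	}
	if groups*cpuGroupSize < remainder {
		return false
	}
	for remainder > 0 {
		for _, numa := range subset {
			if remainder <= 0 {
				break
			}
			if free[numa] < cpuGroupSize {
				continue
			}
			free[numa] -= cpuGroupSize
			remainder -= cpuGroupSize
		}
	}
	return true
}

func main() {
	// Assumed toy state: free CPUs per NUMA node after the even
	// 'distribution' step; node 2 is already exhausted, so (as in the fix)
	// it is excluded from the remainder set.
	free := map[int]int{0: 4, 1: 2, 2: 0}
	remainderSet := []int{0, 1}

	if !distributeRemainder(free, remainderSet, 4, 2) {
		fmt.Println("remainder cannot be satisfied by this subset")
		return
	}
	fmt.Println("free CPUs after remainder allocation:", free)
	fmt.Println("balance score:", standardDeviation([]int{free[0], free[1], free[2]}))
}

The capacity check before the round-robin walk is what keeps the sketch (and, via the sum(...) guard, the patched code) from trying to pull remainder CPUs from a subset that cannot absorb them all.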