From 9ea9798759d60b90e3470041206155fc2fc37091 Mon Sep 17 00:00:00 2001
From: Artyom Lukianov
Date: Tue, 31 Aug 2021 17:46:59 +0300
Subject: [PATCH] kubelet: memory manager: fix preferred topology hints
 calculation

Prevent starting a pod whose resources can be satisfied by a single NUMA
node across multiple NUMA nodes. The code returned before it updated the
minimal number of NUMA nodes that can satisfy the container requests, so
a multi NUMA node hint could wrongly be marked preferred.

Signed-off-by: Artyom Lukianov
---
 pkg/kubelet/cm/memorymanager/policy_static.go | 40 ++++----
 .../cm/memorymanager/policy_static_test.go    | 98 +++++++++++++++++++
 2 files changed, 119 insertions(+), 19 deletions(-)

diff --git a/pkg/kubelet/cm/memorymanager/policy_static.go b/pkg/kubelet/cm/memorymanager/policy_static.go
index d2817de1cab..c7a94df58eb 100644
--- a/pkg/kubelet/cm/memorymanager/policy_static.go
+++ b/pkg/kubelet/cm/memorymanager/policy_static.go
@@ -438,28 +438,10 @@ func (p *staticPolicy) calculateHints(machineState state.NUMANodeMap, pod *v1.Po
         maskBits := mask.GetBits()
         singleNUMAHint := len(maskBits) == 1
 
-        // the node already in group with another node, it can not be used for the single NUMA node allocation
-        if singleNUMAHint && len(machineState[maskBits[0]].Cells) > 1 {
-            return
-        }
-
         totalFreeSize := map[v1.ResourceName]uint64{}
         totalAllocatableSize := map[v1.ResourceName]uint64{}
-        // calculate total free memory for the node mask
+        // calculate total free and allocatable memory for the node mask
         for _, nodeID := range maskBits {
-            // the node already used for the memory allocation
-            if !singleNUMAHint && machineState[nodeID].NumberOfAssignments > 0 {
-                // the node used for the single NUMA memory allocation, it can not be used for the multi NUMA node allocation
-                if len(machineState[nodeID].Cells) == 1 {
-                    return
-                }
-
-                // the node already used with different group of nodes, it can not be use with in the current hint
-                if !areGroupsEqual(machineState[nodeID].Cells, maskBits) {
-                    return
-                }
-            }
-
             for resourceName := range requestedResources {
                 if _, ok := totalFreeSize[resourceName]; !ok {
                     totalFreeSize[resourceName] = 0
@@ -485,6 +467,26 @@ func (p *staticPolicy) calculateHints(machineState state.NUMANodeMap, pod *v1.Po
             minAffinitySize = mask.Count()
         }
 
+        // the node is already in a group with another node; it cannot be used for a single NUMA node allocation
+        if singleNUMAHint && len(machineState[maskBits[0]].Cells) > 1 {
+            return
+        }
+
+        for _, nodeID := range maskBits {
+            // the node is already used for memory allocation
+            if !singleNUMAHint && machineState[nodeID].NumberOfAssignments > 0 {
+                // the node is used for a single NUMA memory allocation; it cannot be used for a multi NUMA node allocation
+                if len(machineState[nodeID].Cells) == 1 {
+                    return
+                }
+
+                // the node is already used with a different group of nodes; it cannot be used within the current hint
+                if !areGroupsEqual(machineState[nodeID].Cells, maskBits) {
+                    return
+                }
+            }
+        }
+
         // verify that for all memory types the node mask has enough free resources
         for resourceName, requestedSize := range requestedResources {
             podReusableMemory := p.getPodReusableMemory(pod, mask, resourceName)
diff --git a/pkg/kubelet/cm/memorymanager/policy_static_test.go b/pkg/kubelet/cm/memorymanager/policy_static_test.go
index d0d93321639..ad320403446 100644
--- a/pkg/kubelet/cm/memorymanager/policy_static_test.go
+++ b/pkg/kubelet/cm/memorymanager/policy_static_test.go
@@ -2872,6 +2872,104 @@ func TestStaticPolicyGetTopologyHints(t *testing.T) {
             },
             expectedTopologyHints: nil,
         },
+        {
+            description: "should not return preferred hints with multiple NUMA nodes for the pod with resources satisfied by a single NUMA node",
+            assignments: state.ContainerMemoryAssignments{
+                "pod1": map[string][]state.Block{
+                    "container1": {
+                        {
+                            NUMAAffinity: []int{0, 1},
+                            Type:         v1.ResourceMemory,
+                            Size:         2 * gb,
+                        },
+                        {
+                            NUMAAffinity: []int{0, 1},
+                            Type:         hugepages2M,
+                            Size:         24 * mb,
+                        },
+                    },
+                },
+            },
+            machineState: state.NUMANodeMap{
+                0: &state.NUMANodeState{
+                    MemoryMap: map[v1.ResourceName]*state.MemoryTable{
+                        v1.ResourceMemory: {
+                            Allocatable:    1536 * mb,
+                            Free:           0,
+                            Reserved:       1536 * mb,
+                            SystemReserved: 512 * mb,
+                            TotalMemSize:   2 * gb,
+                        },
+                        hugepages2M: {
+                            Allocatable:    20 * mb,
+                            Free:           0,
+                            Reserved:       20 * mb,
+                            SystemReserved: 0,
+                            TotalMemSize:   20 * mb,
+                        },
+                    },
+                    Cells:               []int{0, 1},
+                    NumberOfAssignments: 2,
+                },
+                1: &state.NUMANodeState{
+                    MemoryMap: map[v1.ResourceName]*state.MemoryTable{
+                        v1.ResourceMemory: {
+                            Allocatable:    1536 * mb,
+                            Free:           gb,
+                            Reserved:       512 * mb,
+                            SystemReserved: 512 * mb,
+                            TotalMemSize:   2 * gb,
+                        },
+                        hugepages2M: {
+                            Allocatable:    20 * mb,
+                            Free:           16 * mb,
+                            Reserved:       4 * mb,
+                            SystemReserved: 0,
+                            TotalMemSize:   20 * mb,
+                        },
+                    },
+                    Cells:               []int{0, 1},
+                    NumberOfAssignments: 2,
+                },
+            },
+            pod: getPod("pod2",
+                "container2",
+                &v1.ResourceRequirements{
+                    Limits: v1.ResourceList{
+                        v1.ResourceCPU:    resource.MustParse("1000m"),
+                        v1.ResourceMemory: resource.MustParse("1Gi"),
+                        hugepages2M:       resource.MustParse("16Mi"),
+                    },
+                    Requests: v1.ResourceList{
+                        v1.ResourceCPU:    resource.MustParse("1000m"),
+                        v1.ResourceMemory: resource.MustParse("1Gi"),
+                        hugepages2M:       resource.MustParse("16Mi"),
+                    },
+                },
+            ),
+            systemReserved: systemReservedMemory{
+                0: map[v1.ResourceName]uint64{
+                    v1.ResourceMemory: 512 * mb,
+                },
+                1: map[v1.ResourceName]uint64{
+                    v1.ResourceMemory: 512 * mb,
+                },
+            },
+            expectedTopologyHints: map[string][]topologymanager.TopologyHint{
+                string(v1.ResourceMemory): {
+                    {
+                        NUMANodeAffinity: newNUMAAffinity(0, 1),
+                        Preferred:        false,
+                    },
+                },
+                hugepages2M: {
+                    {
+                        NUMANodeAffinity: newNUMAAffinity(0, 1),
+                        Preferred:        false,
+                    },
+                },
+            },
+        },
     }
 
     for _, testCase := range testCases {
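
For illustration, below is a minimal standalone Go sketch of the ordering bug
this patch fixes. It is not the kubelet source; every name in it (hintsFor,
hint, grouped, allocatable) is invented. It condenses calculateHints to the
two steps whose order the patch swaps: updating minAffinitySize, and rejecting
masks that conflict with existing NUMA assignments. With the buggy order,
single-node masks that could satisfy the request are rejected before
minAffinitySize is updated, so the multi NUMA node hint is wrongly marked
preferred; with the fixed order it is not, which is what the new test case
above asserts.

package main

import "fmt"

type hint struct {
	nodes     []int
	preferred bool
}

// hintsFor emits one hint per candidate NUMA mask whose allocatable memory
// can hold the request. A hint is preferred only when its mask is as small
// as the smallest mask that could satisfy the request (minAffinitySize).
func hintsFor(request uint64, allocatable map[int]uint64, grouped map[int]bool, masks [][]int, fixed bool) []hint {
	minAffinitySize := len(allocatable)
	var hints []hint
	for _, mask := range masks {
		var total uint64
		for _, n := range mask {
			total += allocatable[n]
		}
		// A single-NUMA mask is unusable when its node was already bound
		// into a multi-node group by an earlier allocation.
		unusable := len(mask) == 1 && grouped[mask[0]]

		if !fixed && unusable {
			continue // buggy order: skip before updating minAffinitySize
		}
		if total >= request && len(mask) < minAffinitySize {
			minAffinitySize = len(mask)
		}
		if fixed && unusable {
			continue // fixed order: skip only after the update
		}
		if total >= request {
			hints = append(hints, hint{nodes: mask})
		}
	}
	for i := range hints {
		hints[i].preferred = len(hints[i].nodes) == minAffinitySize
	}
	return hints
}

func main() {
	allocatable := map[int]uint64{0: 1024, 1: 1024} // bytes per NUMA node, toy values
	grouped := map[int]bool{0: true, 1: true}       // pod1 already spans nodes 0 and 1
	masks := [][]int{{0}, {1}, {0, 1}}

	// The request (512) fits on a single NUMA node, so the {0,1} hint must
	// never be preferred.
	fmt.Println("buggy:", hintsFor(512, allocatable, grouped, masks, false)) // [{[0 1] true}]
	fmt.Println("fixed:", hintsFor(512, allocatable, grouped, masks, true))  // [{[0 1] false}]
}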