From 37399c8d4fade6a707cb96b5cf098bf517865dfb Mon Sep 17 00:00:00 2001 From: Talor Itzhak Date: Wed, 21 Feb 2024 18:54:11 +0200 Subject: [PATCH 1/2] memorymanager: avoid violating NUMA node memory allocation rule According to https://kubernetes.io/blog/2021/08/11/kubernetes-1-22-feature-memory-manager-moves-to-beta/#single-vs-cross-numa-node-allocation and to the design introduce in the memory manager KEP: https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/1769-memory-manager the NUMA node can not have both single and cross NUMA node allocations. There're cases when the chosen affinity hint does not align with the rule above but the pod get admitted anyway. The implications are incosistent admission/rejection behaviors from memory manager side. In order to fix the issue, we should validate that the affinity hint that has been chosen is not violating the above rule. Signed-off-by: Talor Itzhak --- pkg/kubelet/cm/memorymanager/policy_static.go | 30 +++++ .../cm/memorymanager/policy_static_test.go | 120 ++++++++++++++++++ 2 files changed, 150 insertions(+) diff --git a/pkg/kubelet/cm/memorymanager/policy_static.go b/pkg/kubelet/cm/memorymanager/policy_static.go index 130b62ab637..ca267515cb2 100644 --- a/pkg/kubelet/cm/memorymanager/policy_static.go +++ b/pkg/kubelet/cm/memorymanager/policy_static.go @@ -157,6 +157,13 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai bestHint = extendedHint } + // the best hint might violate the NUMA allocation rule on which + // NUMA node cannot have both single and cross NUMA node allocations + // https://kubernetes.io/blog/2021/08/11/kubernetes-1-22-feature-memory-manager-moves-to-beta/#single-vs-cross-numa-node-allocation + if isAffinityViolatingNUMAAllocations(machineState, bestHint.NUMANodeAffinity) { + return fmt.Errorf("[memorymanager] preferred hint violates NUMA node allocation") + } + var containerBlocks []state.Block maskBits := bestHint.NUMANodeAffinity.GetBits() for resourceName, requestedSize := range requestedResources { @@ -992,3 +999,26 @@ func isNUMAAffinitiesEqual(numaAffinity1, numaAffinity2 []int) bool { return bitMask1.IsEqual(bitMask2) } + +func isAffinityViolatingNUMAAllocations(machineState state.NUMANodeMap, mask bitmask.BitMask) bool { + maskBits := mask.GetBits() + singleNUMAHint := len(maskBits) == 1 + for _, nodeID := range mask.GetBits() { + // the node was never used for the memory allocation + if machineState[nodeID].NumberOfAssignments == 0 { + continue + } + if singleNUMAHint { + continue + } + // the node used for the single NUMA memory allocation, it cannot be used for the multi NUMA node allocation + if len(machineState[nodeID].Cells) == 1 { + return true + } + // the node already used with a different group of nodes, it cannot be used within the current hint + if !areGroupsEqual(machineState[nodeID].Cells, maskBits) { + return true + } + } + return false +} diff --git a/pkg/kubelet/cm/memorymanager/policy_static_test.go b/pkg/kubelet/cm/memorymanager/policy_static_test.go index 7b105591580..f05b2905552 100644 --- a/pkg/kubelet/cm/memorymanager/policy_static_test.go +++ b/pkg/kubelet/cm/memorymanager/policy_static_test.go @@ -1765,6 +1765,126 @@ func TestStaticPolicyAllocate(t *testing.T) { pod: getPod("pod1", "container1", requirementsGuaranteed), topologyHint: &topologymanager.TopologyHint{Preferred: true}, }, + { + description: "should validate NUMA node can not have both single and cross NUMA node memory allocations", + assignments: state.ContainerMemoryAssignments{ + "pod1": map[string][]state.Block{ + "container1": { + { + NUMAAffinity: []int{0}, + Type: v1.ResourceMemory, + Size: 1024 * mb, + }, + }, + }, + }, + expectedAssignments: state.ContainerMemoryAssignments{ + "pod1": map[string][]state.Block{ + "container1": { + { + NUMAAffinity: []int{0}, + Type: v1.ResourceMemory, + Size: 1024 * mb, + }, + }, + }, + }, + machineState: state.NUMANodeMap{ + 0: &state.NUMANodeState{ + MemoryMap: map[v1.ResourceName]*state.MemoryTable{ + v1.ResourceMemory: { + Allocatable: 1536 * mb, + Free: 512 * mb, + Reserved: 1024 * mb, + SystemReserved: 512 * mb, + TotalMemSize: 2176 * mb, + }, + hugepages1Gi: { + Allocatable: gb, + Free: gb, + Reserved: 0, + SystemReserved: 0, + TotalMemSize: gb, + }, + }, + Cells: []int{0}, + NumberOfAssignments: 1, + }, + 1: &state.NUMANodeState{ + MemoryMap: map[v1.ResourceName]*state.MemoryTable{ + v1.ResourceMemory: { + Allocatable: 512 * mb, + Free: 512 * mb, + Reserved: 0, + SystemReserved: 512 * mb, + TotalMemSize: 2176 * mb, + }, + hugepages1Gi: { + Allocatable: gb, + Free: gb, + Reserved: 0, + SystemReserved: 0, + TotalMemSize: gb, + }, + }, + Cells: []int{1}, + NumberOfAssignments: 0, + }, + }, + expectedMachineState: state.NUMANodeMap{ + 0: &state.NUMANodeState{ + MemoryMap: map[v1.ResourceName]*state.MemoryTable{ + v1.ResourceMemory: { + Allocatable: 1536 * mb, + Free: 512 * mb, + Reserved: 1024 * mb, + SystemReserved: 512 * mb, + TotalMemSize: 2176 * mb, + }, + hugepages1Gi: { + Allocatable: gb, + Free: gb, + Reserved: 0, + SystemReserved: 0, + TotalMemSize: gb, + }, + }, + Cells: []int{0}, + NumberOfAssignments: 1, + }, + 1: &state.NUMANodeState{ + MemoryMap: map[v1.ResourceName]*state.MemoryTable{ + v1.ResourceMemory: { + Allocatable: 512 * mb, + Free: 512 * mb, + Reserved: 0, + SystemReserved: 512 * mb, + TotalMemSize: 2176 * mb, + }, + hugepages1Gi: { + Allocatable: gb, + Free: gb, + Reserved: 0, + SystemReserved: 0, + TotalMemSize: gb, + }, + }, + Cells: []int{1}, + NumberOfAssignments: 0, + }, + }, + systemReserved: systemReservedMemory{ + 0: map[v1.ResourceName]uint64{ + v1.ResourceMemory: 512 * mb, + }, + 1: map[v1.ResourceName]uint64{ + v1.ResourceMemory: 512 * mb, + }, + }, + pod: getPod("pod2", "container1", requirementsGuaranteed), + topologyHint: &topologymanager.TopologyHint{NUMANodeAffinity: newNUMAAffinity(0, 1), Preferred: true}, + expectedError: fmt.Errorf("[memorymanager] preferred hint violates NUMA node allocation"), + }, } for _, testCase := range testCases { From c019114e410c8bd9d923e9258d2bc71c26fdf11e Mon Sep 17 00:00:00 2001 From: Talor Itzhak Date: Thu, 11 Jul 2024 17:05:15 +0300 Subject: [PATCH 2/2] memorymanager:unit-tests for isAffinityViolatingNUMAAllocations function Signed-off-by: Talor Itzhak --- .../cm/memorymanager/policy_static_test.go | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/pkg/kubelet/cm/memorymanager/policy_static_test.go b/pkg/kubelet/cm/memorymanager/policy_static_test.go index f05b2905552..b2e030c8d9a 100644 --- a/pkg/kubelet/cm/memorymanager/policy_static_test.go +++ b/pkg/kubelet/cm/memorymanager/policy_static_test.go @@ -3896,3 +3896,79 @@ func Test_getPodRequestedResources(t *testing.T) { }) } } + +func Test_isAffinityViolatingNUMAAllocations(t *testing.T) { + testsCases := []struct { + description string + machineState map[int]*state.NUMANodeState + topologyHint *topologymanager.TopologyHint + isViolationExpected bool + }{ + { + description: "violating NUMA allocations because given affinity asks for NUMA ID 1 which is on different cells group", + machineState: map[int]*state.NUMANodeState{ + 0: { + NumberOfAssignments: 1, + Cells: []int{0, 1}, + }, + 1: { + NumberOfAssignments: 1, + Cells: []int{0, 1}, + }, + 2: { + NumberOfAssignments: 1, + Cells: []int{2}, + }, + 3: { + NumberOfAssignments: 0, + Cells: []int{3}, + }, + }, + topologyHint: &topologymanager.TopologyHint{ + NUMANodeAffinity: newNUMAAffinity(1, 2), + }, + isViolationExpected: true, + }, + { + description: "violating NUMA allocations because given affinity with multiple nodes asks for NUMA ID 1 which is used for a single NUMA node memory allocation", + machineState: map[int]*state.NUMANodeState{ + 0: { + NumberOfAssignments: 0, + Cells: []int{0, 1}, + }, + 1: { + NumberOfAssignments: 1, + Cells: []int{1}, + }, + }, + topologyHint: &topologymanager.TopologyHint{ + NUMANodeAffinity: newNUMAAffinity(0, 1), + }, + isViolationExpected: true, + }, + { + description: "valid affinity, no prior assignments", + machineState: map[int]*state.NUMANodeState{ + 0: { + NumberOfAssignments: 0, + Cells: []int{0}, + }, + 1: { + NumberOfAssignments: 0, + Cells: []int{1}, + }, + }, + topologyHint: &topologymanager.TopologyHint{ + NUMANodeAffinity: newNUMAAffinity(0, 1), + }, + isViolationExpected: false, + }, + } + for _, tc := range testsCases { + t.Run(tc.description, func(t *testing.T) { + if isAffinityViolatingNUMAAllocations(tc.machineState, tc.topologyHint.NUMANodeAffinity) != tc.isViolationExpected { + t.Errorf("isAffinityViolatingNUMAAllocations with affinity %v expected to return %t, got %t", tc.topologyHint.NUMANodeAffinity.GetBits(), tc.isViolationExpected, !tc.isViolationExpected) + } + }) + } +}