diff --git a/pkg/kubelet/cm/memorymanager/policy_static.go b/pkg/kubelet/cm/memorymanager/policy_static.go index 130b62ab637..ca267515cb2 100644 --- a/pkg/kubelet/cm/memorymanager/policy_static.go +++ b/pkg/kubelet/cm/memorymanager/policy_static.go @@ -157,6 +157,13 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai bestHint = extendedHint } + // the best hint might violate the NUMA allocation rule on which + // NUMA node cannot have both single and cross NUMA node allocations + // https://kubernetes.io/blog/2021/08/11/kubernetes-1-22-feature-memory-manager-moves-to-beta/#single-vs-cross-numa-node-allocation + if isAffinityViolatingNUMAAllocations(machineState, bestHint.NUMANodeAffinity) { + return fmt.Errorf("[memorymanager] preferred hint violates NUMA node allocation") + } + var containerBlocks []state.Block maskBits := bestHint.NUMANodeAffinity.GetBits() for resourceName, requestedSize := range requestedResources { @@ -992,3 +999,26 @@ func isNUMAAffinitiesEqual(numaAffinity1, numaAffinity2 []int) bool { return bitMask1.IsEqual(bitMask2) } + +func isAffinityViolatingNUMAAllocations(machineState state.NUMANodeMap, mask bitmask.BitMask) bool { + maskBits := mask.GetBits() + singleNUMAHint := len(maskBits) == 1 + for _, nodeID := range mask.GetBits() { + // the node was never used for the memory allocation + if machineState[nodeID].NumberOfAssignments == 0 { + continue + } + if singleNUMAHint { + continue + } + // the node used for the single NUMA memory allocation, it cannot be used for the multi NUMA node allocation + if len(machineState[nodeID].Cells) == 1 { + return true + } + // the node already used with a different group of nodes, it cannot be used within the current hint + if !areGroupsEqual(machineState[nodeID].Cells, maskBits) { + return true + } + } + return false +} diff --git a/pkg/kubelet/cm/memorymanager/policy_static_test.go b/pkg/kubelet/cm/memorymanager/policy_static_test.go index 7b105591580..f05b2905552 100644 --- a/pkg/kubelet/cm/memorymanager/policy_static_test.go +++ b/pkg/kubelet/cm/memorymanager/policy_static_test.go @@ -1765,6 +1765,126 @@ func TestStaticPolicyAllocate(t *testing.T) { pod: getPod("pod1", "container1", requirementsGuaranteed), topologyHint: &topologymanager.TopologyHint{Preferred: true}, }, + { + description: "should validate NUMA node can not have both single and cross NUMA node memory allocations", + assignments: state.ContainerMemoryAssignments{ + "pod1": map[string][]state.Block{ + "container1": { + { + NUMAAffinity: []int{0}, + Type: v1.ResourceMemory, + Size: 1024 * mb, + }, + }, + }, + }, + expectedAssignments: state.ContainerMemoryAssignments{ + "pod1": map[string][]state.Block{ + "container1": { + { + NUMAAffinity: []int{0}, + Type: v1.ResourceMemory, + Size: 1024 * mb, + }, + }, + }, + }, + machineState: state.NUMANodeMap{ + 0: &state.NUMANodeState{ + MemoryMap: map[v1.ResourceName]*state.MemoryTable{ + v1.ResourceMemory: { + Allocatable: 1536 * mb, + Free: 512 * mb, + Reserved: 1024 * mb, + SystemReserved: 512 * mb, + TotalMemSize: 2176 * mb, + }, + hugepages1Gi: { + Allocatable: gb, + Free: gb, + Reserved: 0, + SystemReserved: 0, + TotalMemSize: gb, + }, + }, + Cells: []int{0}, + NumberOfAssignments: 1, + }, + 1: &state.NUMANodeState{ + MemoryMap: map[v1.ResourceName]*state.MemoryTable{ + v1.ResourceMemory: { + Allocatable: 512 * mb, + Free: 512 * mb, + Reserved: 0, + SystemReserved: 512 * mb, + TotalMemSize: 2176 * mb, + }, + hugepages1Gi: { + Allocatable: gb, + Free: gb, + Reserved: 0, + SystemReserved: 0, + TotalMemSize: gb, + }, + }, + Cells: []int{1}, + NumberOfAssignments: 0, + }, + }, + expectedMachineState: state.NUMANodeMap{ + 0: &state.NUMANodeState{ + MemoryMap: map[v1.ResourceName]*state.MemoryTable{ + v1.ResourceMemory: { + Allocatable: 1536 * mb, + Free: 512 * mb, + Reserved: 1024 * mb, + SystemReserved: 512 * mb, + TotalMemSize: 2176 * mb, + }, + hugepages1Gi: { + Allocatable: gb, + Free: gb, + Reserved: 0, + SystemReserved: 0, + TotalMemSize: gb, + }, + }, + Cells: []int{0}, + NumberOfAssignments: 1, + }, + 1: &state.NUMANodeState{ + MemoryMap: map[v1.ResourceName]*state.MemoryTable{ + v1.ResourceMemory: { + Allocatable: 512 * mb, + Free: 512 * mb, + Reserved: 0, + SystemReserved: 512 * mb, + TotalMemSize: 2176 * mb, + }, + hugepages1Gi: { + Allocatable: gb, + Free: gb, + Reserved: 0, + SystemReserved: 0, + TotalMemSize: gb, + }, + }, + Cells: []int{1}, + NumberOfAssignments: 0, + }, + }, + systemReserved: systemReservedMemory{ + 0: map[v1.ResourceName]uint64{ + v1.ResourceMemory: 512 * mb, + }, + 1: map[v1.ResourceName]uint64{ + v1.ResourceMemory: 512 * mb, + }, + }, + pod: getPod("pod2", "container1", requirementsGuaranteed), + topologyHint: &topologymanager.TopologyHint{NUMANodeAffinity: newNUMAAffinity(0, 1), Preferred: true}, + expectedError: fmt.Errorf("[memorymanager] preferred hint violates NUMA node allocation"), + }, } for _, testCase := range testCases {