From 2ef1f8107682e35403b42db81e89020c8a89d48d Mon Sep 17 00:00:00 2001
From: kikimo
Date: Sun, 16 May 2021 23:30:19 +0800
Subject: [PATCH] Avoid undesirable allocation when device is associated with
 multiple NUMA Nodes

Suppose there are two devices, dev1 and dev2, each associated with NUMA
nodes as below:

dev1: numa1
dev2: numa1, numa2

If we request a device from numa2, filterByAffinity() currently returns
[], [dev1, dev2], [] when the loop over available devices produces the
sequence [dev1, dev2]. That is not desirable: what we truly expect is an
allocation of dev2 from numa2.
---
 pkg/kubelet/cm/devicemanager/manager.go      |  18 +++-
 pkg/kubelet/cm/devicemanager/manager_test.go | 101 +++++++++++++++++++
 2 files changed, 117 insertions(+), 2 deletions(-)

diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go
index 1708176fc9b..c691eaea89b 100644
--- a/pkg/kubelet/cm/devicemanager/manager.go
+++ b/pkg/kubelet/cm/devicemanager/manager.go
@@ -791,9 +791,23 @@ func (m *ManagerImpl) filterByAffinity(podUID, contName, resource string, availa
 		nodes = append(nodes, node)
 	}
 
-	// Sort the list of nodes by how many devices they contain.
+	// Sort the list of nodes: nodes in the affinity set first, then nodes outside
+	// it, then the node without topology info, each ordered by device count.
 	sort.Slice(nodes, func(i, j int) bool {
-		return perNodeDevices[i].Len() < perNodeDevices[j].Len()
+		ni, nj := nodes[i], nodes[j]
+		// 1. node[i] is in the affinity set
+		if hint.NUMANodeAffinity.IsSet(ni) {
+			return !hint.NUMANodeAffinity.IsSet(nj) || perNodeDevices[ni].Len() < perNodeDevices[nj].Len()
+		}
+
+		// 2. node[i] is outside the affinity set
+		if ni != nodeWithoutTopology {
+			return !hint.NUMANodeAffinity.IsSet(nj) &&
+				(nj == nodeWithoutTopology || perNodeDevices[ni].Len() < perNodeDevices[nj].Len())
+		}
+
+		// 3. node[i] is the node without topology info
+		return nj == nodeWithoutTopology && perNodeDevices[ni].Len() < perNodeDevices[nj].Len()
 	})
 
 	// Generate three sorted lists of devices. Devices in the first list come
diff --git a/pkg/kubelet/cm/devicemanager/manager_test.go b/pkg/kubelet/cm/devicemanager/manager_test.go
index 86f39eaa016..c9ff654045a 100644
--- a/pkg/kubelet/cm/devicemanager/manager_test.go
+++ b/pkg/kubelet/cm/devicemanager/manager_test.go
@@ -40,6 +40,7 @@ import (
 	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
 	"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
 	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
+	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
 	"k8s.io/kubernetes/pkg/kubelet/config"
 	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
 	"k8s.io/kubernetes/pkg/kubelet/pluginmanager"
@@ -669,6 +670,106 @@ type TestResource struct {
 	topology bool
 }
 
+func TestFilterByAffinity(t *testing.T) {
+	as := require.New(t)
+	allDevices := ResourceDeviceInstances{
+		"res1": map[string]pluginapi.Device{
+			"dev1": {
+				ID: "dev1",
+				Topology: &pluginapi.TopologyInfo{
+					Nodes: []*pluginapi.NUMANode{
+						{
+							ID: 1,
+						},
+					},
+				},
+			},
+			"dev2": {
+				ID: "dev2",
+				Topology: &pluginapi.TopologyInfo{
+					Nodes: []*pluginapi.NUMANode{
+						{
+							ID: 1,
+						},
+						{
+							ID: 2,
+						},
+					},
+				},
+			},
+			"dev3": {
+				ID: "dev3",
+				Topology: &pluginapi.TopologyInfo{
+					Nodes: []*pluginapi.NUMANode{
+						{
+							ID: 2,
+						},
+					},
+				},
+			},
+			"dev4": {
+				ID: "dev4",
+				Topology: &pluginapi.TopologyInfo{
+					Nodes: []*pluginapi.NUMANode{
+						{
+							ID: 2,
+						},
+					},
+				},
+			},
+		},
+	}
+
+	testManager := ManagerImpl{
+		topologyAffinityStore: newFakeTopologyManager(),
+		allDevices:            allDevices,
+	}
+
+	testCases := []struct {
+		available               sets.String
+		fromAffinityExpected    sets.String
+		notFromAffinityExpected sets.String
+		withoutTopologyExpected sets.String
+	}{
+		{
+			available:               sets.NewString("dev1", "dev2"),
+			fromAffinityExpected:    sets.NewString("dev2"),
+			notFromAffinityExpected: sets.NewString("dev1"),
+			withoutTopologyExpected: sets.NewString(),
+		},
+		{
+			available:               sets.NewString("dev1", "dev2", "dev3", "dev4"),
+			fromAffinityExpected:    sets.NewString("dev2", "dev3", "dev4"),
+			notFromAffinityExpected: sets.NewString("dev1"),
+			withoutTopologyExpected: sets.NewString(),
+		},
+	}
+
+	for _, testCase := range testCases {
+		fromAffinity, notFromAffinity, withoutTopology := testManager.filterByAffinity("", "", "res1", testCase.available)
+		as.Truef(fromAffinity.Equal(testCase.fromAffinityExpected), "expect devices from affinity to be %v but got %v", testCase.fromAffinityExpected, fromAffinity)
+		as.Truef(notFromAffinity.Equal(testCase.notFromAffinityExpected), "expect devices not from affinity to be %v but got %v", testCase.notFromAffinityExpected, notFromAffinity)
+		as.Truef(withoutTopology.Equal(testCase.withoutTopologyExpected), "expect devices without topology to be %v but got %v", testCase.withoutTopologyExpected, withoutTopology)
+	}
+}
+
+type fakeTopologyManager struct {
+	topologymanager.Manager
+}
+
+func newFakeTopologyManager() topologymanager.Manager {
+	return &fakeTopologyManager{}
+}
+
+func (m *fakeTopologyManager) GetAffinity(podUID string, containerName string) topologymanager.TopologyHint {
+	// return a hint with NUMA node 2 set
+	affinity, _ := bitmask.NewBitMask(2)
+	return topologymanager.TopologyHint{
+		NUMANodeAffinity: affinity,
+		Preferred:        true,
+	}
+}
+
 func TestPodContainerDeviceAllocation(t *testing.T) {
 	res1 := TestResource{
 		resourceName: "domain1.com/resource1",
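
A minimal standalone sketch of the ordering rule this patch introduces, not the kubelet code itself: it assumes NUMA node IDs are plain ints and uses ordinary maps in place of the kubelet's bitmask and sets helpers. The names sortNodes, affinity, and perNodeDevices are illustrative stand-ins for the manager's internal state.

package main

import (
	"fmt"
	"sort"
)

// nodeWithoutTopology is a sentinel for devices that report no NUMA topology,
// mirroring the constant used by the device manager.
const nodeWithoutTopology = -1

// sortNodes orders NUMA nodes so that nodes in the affinity set come first,
// then nodes outside it, then the "no topology" node; within each group,
// nodes holding fewer devices come first.
func sortNodes(nodes []int, affinity map[int]bool, perNodeDevices map[int]int) {
	sort.Slice(nodes, func(i, j int) bool {
		ni, nj := nodes[i], nodes[j]
		// 1. ni is in the affinity set: it precedes nj unless nj is also in
		//    the affinity set and holds fewer devices.
		if affinity[ni] {
			return !affinity[nj] || perNodeDevices[ni] < perNodeDevices[nj]
		}
		// 2. ni is a real NUMA node outside the affinity set: it precedes the
		//    "no topology" node and larger non-affinity nodes.
		if ni != nodeWithoutTopology {
			return !affinity[nj] &&
				(nj == nodeWithoutTopology || perNodeDevices[ni] < perNodeDevices[nj])
		}
		// 3. ni is the "no topology" node: it only precedes another
		//    "no topology" entry holding more devices.
		return nj == nodeWithoutTopology && perNodeDevices[ni] < perNodeDevices[nj]
	})
}

func main() {
	// Scenario from the commit message: dev1 sits on NUMA node 1 only, dev2 on
	// nodes 1 and 2, and the container requests affinity with node 2.
	nodes := []int{1, 2}
	affinity := map[int]bool{2: true}
	perNodeDevices := map[int]int{1: 2, 2: 1} // node 1 sees {dev1, dev2}, node 2 sees {dev2}

	sortNodes(nodes, affinity, perNodeDevices)
	fmt.Println(nodes) // [2 1]: node 2 is visited first, so dev2 is preferred
}

With the pre-patch ordering (device count only), node 1 would be visited first and dev1 could be handed out despite the numa2 request; the grouped comparator makes the affinity-set node win even though it holds fewer devices.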