From 2ef1f8107682e35403b42db81e89020c8a89d48d Mon Sep 17 00:00:00 2001 From: kikimo Date: Sun, 16 May 2021 23:30:19 +0800 Subject: [PATCH 1/7] Avoid undesirable allocation when a device is associated with multiple NUMA nodes Suppose there are two devices, dev1 and dev2, each with NUMA nodes associated as below: dev1: numa1 dev2: numa1, numa2 If we request a device from numa2, filterByAffinity() currently returns [], [dev1, dev2], [] when the loop over available devices produces the sequence [dev1, dev2]. That is not desirable, as what we truly expect is an allocation of dev2 from numa2. --- pkg/kubelet/cm/devicemanager/manager.go | 18 +++- pkg/kubelet/cm/devicemanager/manager_test.go | 101 +++++++++++++++++++ 2 files changed, 117 insertions(+), 2 deletions(-) diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go index 1708176fc9b..c691eaea89b 100644 --- a/pkg/kubelet/cm/devicemanager/manager.go +++ b/pkg/kubelet/cm/devicemanager/manager.go @@ -791,9 +791,23 @@ func (m *ManagerImpl) filterByAffinity(podUID, contName, resource string, availa nodes = append(nodes, node) } - // Sort the list of nodes by how many devices they contain. + // Sort the list of nodes by nodes from affinity set, nodes from none-affinity + // set, nodes without topology set and how many devices they contain. sort.Slice(nodes, func(i, j int) bool { - return perNodeDevices[i].Len() < perNodeDevices[j].Len() + ni, nj := nodes[i], nodes[j] + // 1. node[i] from to affninity set + if hint.NUMANodeAffinity.IsSet(ni) { + return !hint.NUMANodeAffinity.IsSet(nj) || perNodeDevices[ni].Len() < perNodeDevices[nj].Len() + } + + // 2. node[i] from none-affinity set + if ni != nodeWithoutTopology { + return !hint.NUMANodeAffinity.IsSet(nj) && + (nj == nodeWithoutTopology || perNodeDevices[ni].Len() < perNodeDevices[nj].Len()) + } + + // 3. node[i] from node without topology set + return nj == nodeWithoutTopology && perNodeDevices[ni].Len() < perNodeDevices[nj].Len() }) // Generate three sorted lists of devices. 
Devices in the first list come diff --git a/pkg/kubelet/cm/devicemanager/manager_test.go b/pkg/kubelet/cm/devicemanager/manager_test.go index 86f39eaa016..c9ff654045a 100644 --- a/pkg/kubelet/cm/devicemanager/manager_test.go +++ b/pkg/kubelet/cm/devicemanager/manager_test.go @@ -40,6 +40,7 @@ import ( "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint" "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask" "k8s.io/kubernetes/pkg/kubelet/config" "k8s.io/kubernetes/pkg/kubelet/lifecycle" "k8s.io/kubernetes/pkg/kubelet/pluginmanager" @@ -669,6 +670,106 @@ type TestResource struct { topology bool } +func TestFilterByAffinity(t *testing.T) { + as := require.New(t) + allDevices := ResourceDeviceInstances{ + "res1": map[string]pluginapi.Device{ + "dev1": { + ID: "dev1", + Topology: &pluginapi.TopologyInfo{ + Nodes: []*pluginapi.NUMANode{ + { + ID: 1, + }, + }, + }, + }, + "dev2": { + ID: "dev2", + Topology: &pluginapi.TopologyInfo{ + Nodes: []*pluginapi.NUMANode{ + { + ID: 1, + }, + { + ID: 2, + }, + }, + }, + }, + "dev3": { + ID: "dev3", + Topology: &pluginapi.TopologyInfo{ + Nodes: []*pluginapi.NUMANode{ + { + ID: 2, + }, + }, + }, + }, + "dev4": { + ID: "dev4", + Topology: &pluginapi.TopologyInfo{ + Nodes: []*pluginapi.NUMANode{ + { + ID: 2, + }, + }, + }, + }, + }, + } + + testManager := ManagerImpl{ + topologyAffinityStore: newFakeTopologyManager(), + allDevices: allDevices, + } + + testCases := []struct { + available sets.String + fromAffinityExpected sets.String + notFromAffinityExpected sets.String + withoutTopologyExpected sets.String + }{ + { + available: sets.NewString("dev1", "dev2"), + fromAffinityExpected: sets.NewString("dev2"), + notFromAffinityExpected: sets.NewString("dev1"), + withoutTopologyExpected: sets.NewString(), + }, + { + available: sets.NewString("dev1", "dev2", "dev3", "dev4"), + fromAffinityExpected: sets.NewString("dev2", "dev3", "dev4"), + notFromAffinityExpected: sets.NewString("dev1"), + withoutTopologyExpected: sets.NewString(), + }, + } + + for _, testCase := range testCases { + fromAffinity, notFromAffinity, withoutTopology := testManager.filterByAffinity("", "", "res1", testCase.available) + as.Truef(fromAffinity.Equal(testCase.fromAffinityExpected), "expect devices from affinity to be %v but got %v", testCase.fromAffinityExpected, fromAffinity) + as.Truef(notFromAffinity.Equal(testCase.notFromAffinityExpected), "expect devices not from affinity to be %v but got %v", testCase.notFromAffinityExpected, notFromAffinity) + as.Truef(withoutTopology.Equal(testCase.withoutTopologyExpected), "expect devices without topology to be %v but got %v", testCase.notFromAffinityExpected, notFromAffinity) + } +} + +type fakeTopologyManager struct { + topologymanager.Manager +} + +func newFakeTopologyManager() topologymanager.Manager { + return &fakeTopologyManager{} +} + +func (m *fakeTopologyManager) GetAffinity(podUID string, containerName string) topologymanager.TopologyHint { + // return hint with numa2 set + affinity, _ := bitmask.NewBitMask(2) + return topologymanager.TopologyHint{ + NUMANodeAffinity: affinity, + Preferred: true, + } +} + func TestPodContainerDeviceAllocation(t *testing.T) { res1 := TestResource{ resourceName: "domain1.com/resource1", From 893ebf3a1c4a9c789caf7364c482c988e6423283 Mon Sep 17 00:00:00 2001 From: kikimo Date: Wed, 19 May 2021 09:33:51 +0800 Subject: [PATCH 2/7] add a reusable fakeTopologyManagerWithHint{} --- 
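Note: the scenario driving this series (patch 1's commit message, and the first case in TestFilterByAffinity above) boils down to a small standalone sketch. The helper splitByAffinity, the sets.Int affinity argument, and the sample data below are hypothetical and for illustration only; the real filterByAffinity additionally orders devices by how many of them each NUMA node holds, which this sketch omits. Only the sets package and the expected three buckets come from the patch.

// Standalone illustration of the expected three-way split: for dev1 on numa1
// and dev2 on numa1+numa2, a hint for numa2 should yield
// fromAffinity=[dev2], notFromAffinity=[dev1], withoutTopology=[].
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/sets"
)

// splitByAffinity partitions devices into (fromAffinity, notFromAffinity,
// withoutTopology) given each device's NUMA nodes and the hinted node set.
// Hypothetical helper, not the kubelet implementation.
func splitByAffinity(devNodes map[string][]int, affinity sets.Int) (sets.String, sets.String, sets.String) {
	fromAffinity, notFromAffinity, withoutTopology := sets.NewString(), sets.NewString(), sets.NewString()
	for dev, numaNodes := range devNodes {
		if len(numaNodes) == 0 {
			// Device reported no topology at all.
			withoutTopology.Insert(dev)
			continue
		}
		matched := false
		for _, n := range numaNodes {
			if affinity.Has(n) {
				matched = true
				break
			}
		}
		if matched {
			fromAffinity.Insert(dev)
		} else {
			notFromAffinity.Insert(dev)
		}
	}
	return fromAffinity, notFromAffinity, withoutTopology
}

func main() {
	devNodes := map[string][]int{
		"dev1": {1},
		"dev2": {1, 2},
	}
	from, notFrom, without := splitByAffinity(devNodes, sets.NewInt(2))
	fmt.Println(from.List(), notFrom.List(), without.List()) // [dev2] [dev1] []
}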
pkg/kubelet/cm/devicemanager/manager_test.go | 54 +++++++++++++++----- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/pkg/kubelet/cm/devicemanager/manager_test.go b/pkg/kubelet/cm/devicemanager/manager_test.go index c9ff654045a..6b46ee25554 100644 --- a/pkg/kubelet/cm/devicemanager/manager_test.go +++ b/pkg/kubelet/cm/devicemanager/manager_test.go @@ -44,6 +44,7 @@ import ( "k8s.io/kubernetes/pkg/kubelet/config" "k8s.io/kubernetes/pkg/kubelet/lifecycle" "k8s.io/kubernetes/pkg/kubelet/pluginmanager" + "k8s.io/kubernetes/pkg/kubelet/util/format" schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" ) @@ -720,8 +721,13 @@ func TestFilterByAffinity(t *testing.T) { }, } + fakeAffinity, _ := bitmask.NewBitMask(2) + fakeHint := topologymanager.TopologyHint{ + NUMANodeAffinity: fakeAffinity, + Preferred: true, + } testManager := ManagerImpl{ - topologyAffinityStore: newFakeTopologyManager(), + topologyAffinityStore: NewFakeTopologyManagerWithHint(t, &fakeHint), allDevices: allDevices, } @@ -753,23 +759,45 @@ func TestFilterByAffinity(t *testing.T) { } } -type fakeTopologyManager struct { - topologymanager.Manager +type fakeTopologyManagerWithHint struct { + t *testing.T + hint *topologymanager.TopologyHint } -func newFakeTopologyManager() topologymanager.Manager { - return &fakeTopologyManager{} -} - -func (m *fakeTopologyManager) GetAffinity(podUID string, containerName string) topologymanager.TopologyHint { - // return hint with numa2 set - affinity, _ := bitmask.NewBitMask(2) - return topologymanager.TopologyHint{ - NUMANodeAffinity: affinity, - Preferred: true, +// NewFakeTopologyManagerWithHint returns an instance of fake topology manager with specified topology hints +func NewFakeTopologyManagerWithHint(t *testing.T, hint *topologymanager.TopologyHint) topologymanager.Manager { + return &fakeTopologyManagerWithHint{ + t: t, + hint: hint, } } +func (m *fakeTopologyManagerWithHint) AddHintProvider(h topologymanager.HintProvider) { + m.t.Logf("[fake topologymanager] AddHintProvider HintProvider: %v", h) +} + +func (m *fakeTopologyManagerWithHint) AddContainer(pod *v1.Pod, containerID string) error { + m.t.Logf("[fake topologymanager] AddContainer pod: %v container id: %v", format.Pod(pod), containerID) + return nil +} + +func (m *fakeTopologyManagerWithHint) RemoveContainer(containerID string) error { + m.t.Logf("[fake topologymanager] RemoveContainer container id: %v", containerID) + return nil +} + +func (m *fakeTopologyManagerWithHint) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult { + m.t.Logf("[fake topologymanager] Topology Admit Handler") + return lifecycle.PodAdmitResult{ + Admit: true, + } +} + +func (m *fakeTopologyManagerWithHint) GetAffinity(podUID string, containerName string) topologymanager.TopologyHint { + m.t.Logf("[fake topologymanager] GetAffinity podUID: %v container name: %v", podUID, containerName) + return *m.hint +} + func TestPodContainerDeviceAllocation(t *testing.T) { res1 := TestResource{ resourceName: "domain1.com/resource1", From 7d30bfecd57c4735e258d3aa2a18c19646da0224 Mon Sep 17 00:00:00 2001 From: kikimo Date: Wed, 19 May 2021 09:57:42 +0800 Subject: [PATCH 3/7] simplify sorting comparator of numa nodes --- pkg/kubelet/cm/devicemanager/manager.go | 36 ++++++++++++++++--------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go index c691eaea89b..06d2726a95e 100644 --- a/pkg/kubelet/cm/devicemanager/manager.go +++ 
b/pkg/kubelet/cm/devicemanager/manager.go @@ -120,6 +120,14 @@ type sourcesReadyStub struct{} // PodReusableDevices is a map by pod name of devices to reuse. type PodReusableDevices map[string]map[string]sets.String +type nodePriority int + +const ( + priorityFromAffinity nodePriority = iota + priorityNotFromAffinity + priorityWithoutTopology +) + func (s *sourcesReadyStub) AddSource(source string) {} func (s *sourcesReadyStub) AllReady() bool { return true } @@ -791,23 +799,27 @@ func (m *ManagerImpl) filterByAffinity(podUID, contName, resource string, availa nodes = append(nodes, node) } + getNodePriority := func(n int) nodePriority { + if hint.NUMANodeAffinity.IsSet(n) { + return priorityFromAffinity + } + + if n != nodeWithoutTopology { + return priorityNotFromAffinity + } + + return priorityWithoutTopology + } + // Sort the list of nodes by nodes from affinity set, nodes from none-affinity // set, nodes without topology set and how many devices they contain. sort.Slice(nodes, func(i, j int) bool { - ni, nj := nodes[i], nodes[j] - // 1. node[i] from to affninity set - if hint.NUMANodeAffinity.IsSet(ni) { - return !hint.NUMANodeAffinity.IsSet(nj) || perNodeDevices[ni].Len() < perNodeDevices[nj].Len() + pi, pj := getNodePriority(nodes[i]), getNodePriority(nodes[j]) + if pi != pj { + return pi < pj } - // 2. node[i] from none-affinity set - if ni != nodeWithoutTopology { - return !hint.NUMANodeAffinity.IsSet(nj) && - (nj == nodeWithoutTopology || perNodeDevices[ni].Len() < perNodeDevices[nj].Len()) - } - - // 3. node[i] from node without topology set - return nj == nodeWithoutTopology && perNodeDevices[ni].Len() < perNodeDevices[nj].Len() + return perNodeDevices[nodes[i]].Len() < perNodeDevices[nodes[j]].Len() }) // Generate three sorted lists of devices. 
Devices in the first list come From 84a4b40526103e019d8751bd034b4ab412cfa145 Mon Sep 17 00:00:00 2001 From: kikimo Date: Wed, 19 May 2021 10:12:12 +0800 Subject: [PATCH 4/7] fix incompatible interface in fakeTopologyManagerWithHint --- pkg/kubelet/cm/devicemanager/manager_test.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pkg/kubelet/cm/devicemanager/manager_test.go b/pkg/kubelet/cm/devicemanager/manager_test.go index 6b46ee25554..61031c728e1 100644 --- a/pkg/kubelet/cm/devicemanager/manager_test.go +++ b/pkg/kubelet/cm/devicemanager/manager_test.go @@ -776,9 +776,8 @@ func (m *fakeTopologyManagerWithHint) AddHintProvider(h topologymanager.HintProv m.t.Logf("[fake topologymanager] AddHintProvider HintProvider: %v", h) } -func (m *fakeTopologyManagerWithHint) AddContainer(pod *v1.Pod, containerID string) error { - m.t.Logf("[fake topologymanager] AddContainer pod: %v container id: %v", format.Pod(pod), containerID) - return nil +func (m *fakeTopologyManagerWithHint) AddContainer(pod *v1.Pod, container *v1.Container, containerID string) { + m.t.Logf("[fake topologymanager] AddContainer pod: %v container name: %v container id: %v", format.Pod(pod), container.Name, containerID) } func (m *fakeTopologyManagerWithHint) RemoveContainer(containerID string) error { From ecfa609b712ae0e442a3d99d020c4deca3badecf Mon Sep 17 00:00:00 2001 From: kikimo Date: Wed, 19 May 2021 21:19:47 +0800 Subject: [PATCH 5/7] simplify sorting comparator of numa nodes --- pkg/kubelet/cm/devicemanager/manager.go | 51 +++++++++++++------------ 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go index 06d2726a95e..dfaad214739 100644 --- a/pkg/kubelet/cm/devicemanager/manager.go +++ b/pkg/kubelet/cm/devicemanager/manager.go @@ -120,14 +120,6 @@ type sourcesReadyStub struct{} // PodReusableDevices is a map by pod name of devices to reuse. type PodReusableDevices map[string]map[string]sets.String -type nodePriority int - -const ( - priorityFromAffinity nodePriority = iota - priorityNotFromAffinity - priorityWithoutTopology -) - func (s *sourcesReadyStub) AddSource(source string) {} func (s *sourcesReadyStub) AllReady() bool { return true } @@ -799,26 +791,35 @@ func (m *ManagerImpl) filterByAffinity(podUID, contName, resource string, availa nodes = append(nodes, node) } - getNodePriority := func(n int) nodePriority { - if hint.NUMANodeAffinity.IsSet(n) { - return priorityFromAffinity - } - - if n != nodeWithoutTopology { - return priorityNotFromAffinity - } - - return priorityWithoutTopology - } - - // Sort the list of nodes by nodes from affinity set, nodes from none-affinity - // set, nodes without topology set and how many devices they contain. 
+ // Sort the list of nodes by: + // 1) Nodes contained in the 'hint's affinity set + // 2) Nodes not contained in the 'hint's affinity set + // 3) The fake NUMANode of -1 (assuming it is included in the list) + // Within each of the groups above, sort the nodes by how many devices they contain sort.Slice(nodes, func(i, j int) bool { - pi, pj := getNodePriority(nodes[i]), getNodePriority(nodes[j]) - if pi != pj { - return pi < pj + // If one or the other of nodes[i] or nodes[j] is in the 'hint's affinity set + if hint.NUMANodeAffinity.IsSet(nodes[i]) && hint.NUMANodeAffinity.IsSet(nodes[j]) { + return perNodeDevices[nodes[i]].Len() < perNodeDevices[nodes[j]].Len() + } + if hint.NUMANodeAffinity.IsSet(nodes[i]) { + return true + } + if hint.NUMANodeAffinity.IsSet(nodes[j]) { + return false } + // If either nodes[i] or nodes[j] are in the 'hint's affinity set (but are not -1) + if nodes[i] != nodeWithoutTopology && nodes[j] != nodeWithoutTopology { + return perNodeDevices[nodes[i]].Len() < perNodeDevices[nodes[j]].Len() + } + if nodes[i] != nodeWithoutTopology { + return true + } + if nodes[j] != nodeWithoutTopology { + return false + } + + // If both nodes[i] and nodes[j] are -1 return perNodeDevices[nodes[i]].Len() < perNodeDevices[nodes[j]].Len() }) From 445b9c07622793bb7cbcad2b9b062182e940eaee Mon Sep 17 00:00:00 2001 From: kikimo Date: Thu, 20 May 2021 08:21:20 +0800 Subject: [PATCH 6/7] minor tweak on numa node sorting algorithm --- pkg/kubelet/cm/devicemanager/manager.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go index dfaad214739..94cf44e8fa7 100644 --- a/pkg/kubelet/cm/devicemanager/manager.go +++ b/pkg/kubelet/cm/devicemanager/manager.go @@ -812,14 +812,15 @@ func (m *ManagerImpl) filterByAffinity(podUID, contName, resource string, availa if nodes[i] != nodeWithoutTopology && nodes[j] != nodeWithoutTopology { return perNodeDevices[nodes[i]].Len() < perNodeDevices[nodes[j]].Len() } - if nodes[i] != nodeWithoutTopology { - return true - } - if nodes[j] != nodeWithoutTopology { + // If one or the other of nodes[i] or nodes[j] is the fake NUMA node -1 (they can't both be) + if nodes[i] == nodeWithoutTopology { return false } + if nodes[j] == nodeWithoutTopology { + return true + } - // If both nodes[i] and nodes[j] are -1 + // Otherwise both nodes[i] and nodes[j] are real NUMA nodes that are not in the 'hint's' affinity list. 
return perNodeDevices[nodes[i]].Len() < perNodeDevices[nodes[j]].Len() }) From c0a7939cbb2a5d4987778e9d478db8f47b164a28 Mon Sep 17 00:00:00 2001 From: kikimo Date: Thu, 20 May 2021 20:31:47 +0800 Subject: [PATCH 7/7] remove redundant test branch in sorting algorithm --- pkg/kubelet/cm/devicemanager/manager.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go index 94cf44e8fa7..884a224895b 100644 --- a/pkg/kubelet/cm/devicemanager/manager.go +++ b/pkg/kubelet/cm/devicemanager/manager.go @@ -808,10 +808,6 @@ func (m *ManagerImpl) filterByAffinity(podUID, contName, resource string, availa return false } - // If either nodes[i] or nodes[j] are in the 'hint's affinity set (but are not -1) - if nodes[i] != nodeWithoutTopology && nodes[j] != nodeWithoutTopology { - return perNodeDevices[nodes[i]].Len() < perNodeDevices[nodes[j]].Len() - } // If one or the other of nodes[i] or nodes[j] is the fake NUMA node -1 (they can't both be) if nodes[i] == nodeWithoutTopology { return false
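Net effect of the series: after patches 5-7, filterByAffinity sorts candidate NUMA nodes in three tiers, first nodes in the hint's affinity set, then the remaining real NUMA nodes, and last the fake node -1 (nodeWithoutTopology), breaking ties within each tier by how many of the requested devices the node holds. The sketch below is a self-contained illustration of that ordering with made-up node IDs and device counts and a stand-in tier helper (functionally the same idea as patch 3's getNodePriority, which the series later replaced with explicit branches); it is not the kubelet code itself.

// Standalone sketch of the three-tier NUMA node ordering produced by the final
// comparator. Node IDs, device counts and the tier helper are illustrative.
package main

import (
	"fmt"
	"sort"
)

const nodeWithoutTopology = -1 // fake node for devices that report no topology

func main() {
	// Hypothetical hint covering NUMA node 2 only.
	affinity := map[int]bool{2: true}
	// Hypothetical count of requested devices held by each node.
	perNodeDevices := map[int]int{1: 3, 2: 1, 3: 2, nodeWithoutTopology: 5}
	nodes := []int{nodeWithoutTopology, 1, 2, 3}

	tier := func(n int) int {
		switch {
		case affinity[n]:
			return 0 // nodes in the hint's affinity set come first
		case n != nodeWithoutTopology:
			return 1 // other real NUMA nodes come next
		default:
			return 2 // the fake -1 node sorts last
		}
	}

	sort.Slice(nodes, func(i, j int) bool {
		if tier(nodes[i]) != tier(nodes[j]) {
			return tier(nodes[i]) < tier(nodes[j])
		}
		// Within a tier, prefer nodes holding fewer of the requested devices.
		return perNodeDevices[nodes[i]] < perNodeDevices[nodes[j]]
	})

	fmt.Println(nodes) // [2 3 1 -1]
}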