From 58ef3f202ab540156bfca4a082237022ab6fffb3 Mon Sep 17 00:00:00 2001 From: PiotrProkop Date: Thu, 27 Oct 2022 10:37:20 +0200 Subject: [PATCH] Improved multi-numa alignment in Topology Manager: add NUMAInfo Signed-off-by: PiotrProkop --- .../cm/topologymanager/bitmask/bitmask.go | 16 +- .../topologymanager/bitmask/bitmask_test.go | 72 +++ pkg/kubelet/cm/topologymanager/numa_info.go | 150 +++++ .../cm/topologymanager/numa_info_test.go | 584 ++++++++++++++++++ 4 files changed, 819 insertions(+), 3 deletions(-) create mode 100644 pkg/kubelet/cm/topologymanager/numa_info.go create mode 100644 pkg/kubelet/cm/topologymanager/numa_info_test.go diff --git a/pkg/kubelet/cm/topologymanager/bitmask/bitmask.go b/pkg/kubelet/cm/topologymanager/bitmask/bitmask.go index fb2043e0182..3d029adb691 100644 --- a/pkg/kubelet/cm/topologymanager/bitmask/bitmask.go +++ b/pkg/kubelet/cm/topologymanager/bitmask/bitmask.go @@ -35,6 +35,8 @@ type BitMask interface { IsSet(bit int) bool AnySet(bits []int) bool IsNarrowerThan(mask BitMask) bool + IsLessThan(mask BitMask) bool + IsGreaterThan(mask BitMask) bool String() string Count() int GetBits() []int @@ -143,13 +145,21 @@ func (s *bitMask) IsEqual(mask BitMask) bool { // lower-numbered bits set wins out. func (s *bitMask) IsNarrowerThan(mask BitMask) bool { if s.Count() == mask.Count() { - if *s < *mask.(*bitMask) { - return true - } + return s.IsLessThan(mask) } return s.Count() < mask.Count() } +// IsLessThan checks which bitmask has more lower-numbered bits set. +func (s *bitMask) IsLessThan(mask BitMask) bool { + return *s < *mask.(*bitMask) +} + +// IsGreaterThan checks which bitmask has more higher-numbered bits set. +func (s *bitMask) IsGreaterThan(mask BitMask) bool { + return *s > *mask.(*bitMask) +} + // String converts mask to string func (s *bitMask) String() string { grouping := 2 diff --git a/pkg/kubelet/cm/topologymanager/bitmask/bitmask_test.go b/pkg/kubelet/cm/topologymanager/bitmask/bitmask_test.go index d4a5f569509..784b28715e7 100644 --- a/pkg/kubelet/cm/topologymanager/bitmask/bitmask_test.go +++ b/pkg/kubelet/cm/topologymanager/bitmask/bitmask_test.go @@ -630,3 +630,75 @@ func TestIterateBitMasks(t *testing.T) { } } } + +func TestIsLessThan(t *testing.T) { + tcases := []struct { + name string + firstMask []int + secondMask []int + expectedFirstLower bool + }{ + { + name: "Check which value is lower of masks with equal bits set 1/1", + firstMask: []int{0}, + secondMask: []int{0}, + expectedFirstLower: false, + }, + { + name: "Check which value is lower of masks with unequal bits set 2/1", + firstMask: []int{1}, + secondMask: []int{0}, + expectedFirstLower: false, + }, + { + name: "Check which value is lower of masks with unequal bits set 1/2", + firstMask: []int{0}, + secondMask: []int{1}, + expectedFirstLower: true, + }, + } + for _, tc := range tcases { + firstMask, _ := NewBitMask(tc.firstMask...) + secondMask, _ := NewBitMask(tc.secondMask...) + expectedFirstLower := firstMask.IsLessThan(secondMask) + if expectedFirstLower != tc.expectedFirstLower { + t.Errorf("Expected value to be %v, got %v", tc.expectedFirstLower, expectedFirstLower) + } + } +} + +func TestIsGreaterThan(t *testing.T) { + tcases := []struct { + name string + firstMask []int + secondMask []int + expectedFirstGreater bool + }{ + { + name: "Check which value is greater of masks with equal bits set 1/1", + firstMask: []int{0}, + secondMask: []int{0}, + expectedFirstGreater: false, + }, + { + name: "Check which value is greater of masks with unequal bits set 2/1", + firstMask: []int{1}, + secondMask: []int{0}, + expectedFirstGreater: true, + }, + { + name: "Check which value is greater of masks with unequal bits set 1/2", + firstMask: []int{0}, + secondMask: []int{1}, + expectedFirstGreater: false, + }, + } + for _, tc := range tcases { + firstMask, _ := NewBitMask(tc.firstMask...) + secondMask, _ := NewBitMask(tc.secondMask...) + expectedFirstGreater := firstMask.IsGreaterThan(secondMask) + if expectedFirstGreater != tc.expectedFirstGreater { + t.Errorf("Expected value to be %v, got %v", tc.expectedFirstGreater, expectedFirstGreater) + } + } +} diff --git a/pkg/kubelet/cm/topologymanager/numa_info.go b/pkg/kubelet/cm/topologymanager/numa_info.go new file mode 100644 index 00000000000..d38f525c845 --- /dev/null +++ b/pkg/kubelet/cm/topologymanager/numa_info.go @@ -0,0 +1,150 @@ +/* +Copyright 2022 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package topologymanager + +import ( + "fmt" + "io/ioutil" + "strconv" + "strings" + + cadvisorapi "github.com/google/cadvisor/info/v1" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask" +) + +const ( + defaultNodeDir = "/sys/devices/system/node/" +) + +type NUMADistances [][]uint64 + +type NUMAInfo struct { + Nodes []int + NUMADistances NUMADistances +} + +type NUMASysFs struct { + nodeDir string +} + +func NewNUMAInfo(topology []cadvisorapi.Node) (*NUMAInfo, error) { + return newNUMAInfo(topology, &NUMASysFs{nodeDir: defaultNodeDir}) +} + +func newNUMAInfo(topology []cadvisorapi.Node, sysFs *NUMASysFs) (*NUMAInfo, error) { + var numaNodes []int + distances := make([][]uint64, len(topology)) + for _, node := range topology { + numaNodes = append(numaNodes, node.Id) + + // Populate the NUMA distances + // For now we need to retrieve this information from sysfs. + // TODO: Update this as follows once a version of cadvisor with this commit is vendored in https://github.com/google/cadvisor/commit/24dd1de08a72cfee661f6178454db995900c0fee + // distances[node.Id] = node.Distances[:] + nodeDistance, err := sysFs.GetDistances(node.Id) + if err != nil { + return nil, fmt.Errorf("error getting NUMA distances from sysfs: %w", err) + } + distances[node.Id] = nodeDistance + } + + numaInfo := &NUMAInfo{ + Nodes: numaNodes, + NUMADistances: distances, + } + + return numaInfo, nil +} + +func (n *NUMAInfo) Narrowest(m1 bitmask.BitMask, m2 bitmask.BitMask) bitmask.BitMask { + if m1.IsNarrowerThan(m2) { + return m1 + } + return m2 +} + +func (n *NUMAInfo) Closest(m1 bitmask.BitMask, m2 bitmask.BitMask) bitmask.BitMask { + // If the length of both bitmasks aren't the same, choose the one that is narrowest. + if m1.Count() != m2.Count() { + return n.Narrowest(m1, m2) + } + + m1Distance := n.NUMADistances.CalculateAverageFor(m1) + m2Distance := n.NUMADistances.CalculateAverageFor(m2) + // If average distance is the same, take bitmask with more lower-number bits set. + if m1Distance == m2Distance { + if m1.IsLessThan(m2) { + return m1 + } + return m2 + } + + // Otherwise, return the bitmask with the shortest average distance between NUMA nodes. + if m1Distance < m2Distance { + return m1 + } + + return m2 +} + +func (n NUMAInfo) DefaultAffinityMask() bitmask.BitMask { + defaultAffinity, _ := bitmask.NewBitMask(n.Nodes...) + return defaultAffinity +} + +func (d NUMADistances) CalculateAverageFor(bm bitmask.BitMask) float64 { + // This should never happen, but just in case make sure we do not divide by zero. + if bm.Count() == 0 { + return 0 + } + + var count float64 = 0 + var sum float64 = 0 + for _, node1 := range bm.GetBits() { + for _, node2 := range bm.GetBits() { + sum += float64(d[node1][node2]) + count++ + } + } + + return sum / count +} + +func (s NUMASysFs) GetDistances(nodeId int) ([]uint64, error) { + distancePath := fmt.Sprintf("%s/node%d/distance", s.nodeDir, nodeId) + distance, err := ioutil.ReadFile(distancePath) + if err != nil { + return nil, fmt.Errorf("problem reading %s: %w", distancePath, err) + } + + rawDistances := strings.TrimSpace(string(distance)) + + return splitDistances(rawDistances) +} + +func splitDistances(rawDistances string) ([]uint64, error) { + distances := []uint64{} + for _, distance := range strings.Split(rawDistances, " ") { + distanceUint, err := strconv.ParseUint(distance, 10, 64) + if err != nil { + return nil, fmt.Errorf("cannot convert %s to int", distance) + } + distances = append(distances, distanceUint) + } + + return distances, nil +} diff --git a/pkg/kubelet/cm/topologymanager/numa_info_test.go b/pkg/kubelet/cm/topologymanager/numa_info_test.go new file mode 100644 index 00000000000..c101094f2bb --- /dev/null +++ b/pkg/kubelet/cm/topologymanager/numa_info_test.go @@ -0,0 +1,584 @@ +/* +Copyright 2022 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package topologymanager + +import ( + "bytes" + "fmt" + "os" + "path/filepath" + "reflect" + "strconv" + "strings" + "testing" + + cadvisorapi "github.com/google/cadvisor/info/v1" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask" +) + +func TestNUMAInfo(t *testing.T) { + tcases := []struct { + name string + topology []cadvisorapi.Node + expectedNUMAInfo *NUMAInfo + expectedErr error + }{ + { + name: "positive test 1 node", + topology: []cadvisorapi.Node{ + { + Id: 0, + }, + }, + expectedNUMAInfo: &NUMAInfo{ + Nodes: []int{0}, + NUMADistances: NUMADistances{ + { + 10, + 11, + 12, + 12, + }, + }, + }, + }, + { + name: "positive test 2 nodes", + topology: []cadvisorapi.Node{ + { + Id: 0, + }, + { + Id: 1, + }, + }, + expectedNUMAInfo: &NUMAInfo{ + Nodes: []int{0, 1}, + NUMADistances: NUMADistances{ + { + 10, + 11, + 12, + 12, + }, + { + 11, + 10, + 12, + 12, + }, + }, + }, + }, + { + name: "positive test 3 nodes", + topology: []cadvisorapi.Node{ + { + Id: 0, + }, + { + Id: 1, + }, + { + Id: 2, + }, + }, + expectedNUMAInfo: &NUMAInfo{ + Nodes: []int{0, 1, 2}, + NUMADistances: NUMADistances{ + { + 10, + 11, + 12, + 12, + }, + { + 11, + 10, + 12, + 12, + }, + { + 12, + 12, + 10, + 11, + }, + }, + }, + }, + { + name: "positive test 4 nodes", + topology: []cadvisorapi.Node{ + { + Id: 0, + }, + { + Id: 1, + }, + { + Id: 2, + }, + { + Id: 3, + }, + }, + expectedNUMAInfo: &NUMAInfo{ + Nodes: []int{0, 1, 2, 3}, + NUMADistances: NUMADistances{ + { + 10, + 11, + 12, + 12, + }, + { + 11, + 10, + 12, + 12, + }, + { + 12, + 12, + 10, + 11, + }, + { + 12, + 12, + 11, + 10, + }, + }, + }, + }, + { + name: "negative test 1 node", + topology: []cadvisorapi.Node{ + { + Id: 9, + }, + }, + expectedNUMAInfo: nil, + expectedErr: fmt.Errorf("no such file or directory"), + }, + } + + nodeDir, err := os.MkdirTemp("", "TestNUMAInfo") + if err != nil { + t.Fatalf("Unable to create temporary directory: %v", err) + } + defer os.RemoveAll(nodeDir) + + numaDistances := map[int]string{ + 0: "10 11 12 12", + 1: "11 10 12 12", + 2: "12 12 10 11", + 3: "12 12 11 10", + } + + for i, distances := range numaDistances { + numaDir := filepath.Join(nodeDir, fmt.Sprintf("node%d", i)) + if err := os.Mkdir(numaDir, 0700); err != nil { + t.Fatalf("Unable to create numaDir %s: %v", numaDir, err) + } + + distanceFile := filepath.Join(numaDir, "distance") + + if err = os.WriteFile(distanceFile, []byte(distances), 0644); err != nil { + t.Fatalf("Unable to create test distanceFile: %v", err) + } + } + + // stub sysFs to read from temp dir + sysFs := &NUMASysFs{nodeDir: nodeDir} + + for _, tcase := range tcases { + topology, err := newNUMAInfo(tcase.topology, sysFs) + if tcase.expectedErr == nil && err != nil { + t.Fatalf("Expected err to equal nil, not %v", err) + } else if tcase.expectedErr != nil && err == nil { + t.Fatalf("Expected err to equal %v, not nil", tcase.expectedErr) + } else if tcase.expectedErr != nil { + if !strings.Contains(err.Error(), tcase.expectedErr.Error()) { + t.Errorf("Unexpected error message. Have: %s wants %s", err.Error(), tcase.expectedErr.Error()) + } + } + + if !reflect.DeepEqual(topology, tcase.expectedNUMAInfo) { + t.Fatalf("Expected topology to equal %v, not %v", tcase.expectedNUMAInfo, topology) + } + + } +} + +func TestCalculateAvgDistanceFor(t *testing.T) { + tcases := []struct { + name string + bm []int + distance [][]uint64 + expectedAvg float64 + }{ + { + name: "1 NUMA node", + bm: []int{ + 0, + }, + distance: [][]uint64{ + { + 10, + }, + }, + expectedAvg: 10, + }, + { + name: "2 NUMA node, 1 set in bitmask", + bm: []int{ + 0, + }, + distance: [][]uint64{ + { + 10, + 11, + }, + { + 11, + 10, + }, + }, + expectedAvg: 10, + }, + { + name: "2 NUMA node, 2 set in bitmask", + bm: []int{ + 0, + 1, + }, + distance: [][]uint64{ + { + 10, + 11, + }, + { + 11, + 10, + }, + }, + expectedAvg: 10.5, + }, + { + name: "4 NUMA node, 2 set in bitmask", + bm: []int{ + 0, + 2, + }, + distance: [][]uint64{ + { + 10, + 11, + 12, + 12, + }, + { + 11, + 10, + 12, + 12, + }, + { + 12, + 12, + 10, + 11, + }, + { + 12, + 12, + 11, + 10, + }, + }, + expectedAvg: 11, + }, + { + name: "4 NUMA node, 3 set in bitmask", + bm: []int{ + 0, + 2, + 3, + }, + distance: [][]uint64{ + { + 10, + 11, + 12, + 12, + }, + { + 11, + 10, + 12, + 12, + }, + { + 12, + 12, + 10, + 11, + }, + { + 12, + 12, + 11, + 10, + }, + }, + expectedAvg: 11.11111111111111, + }, + { + name: "0 NUMA node, 0 set in bitmask", + bm: []int{}, + distance: [][]uint64{}, + expectedAvg: 0, + }, + } + + for _, tcase := range tcases { + bm, err := bitmask.NewBitMask(tcase.bm...) + if err != nil { + t.Errorf("no error expected got %v", err) + } + + numaInfo := NUMAInfo{ + Nodes: tcase.bm, + NUMADistances: tcase.distance, + } + + result := numaInfo.NUMADistances.CalculateAverageFor(bm) + if result != tcase.expectedAvg { + t.Errorf("Expected result to equal %g, not %g", tcase.expectedAvg, result) + } + } + +} + +func TestGetDistances(t *testing.T) { + testCases := []struct { + name string + expectedErr bool + expectedDistances []uint64 + nodeId int + nodeExists bool + }{ + { + name: "reading proper distance file", + expectedErr: false, + expectedDistances: []uint64{10, 11, 12, 13}, + nodeId: 0, + nodeExists: true, + }, + { + name: "no distance file", + expectedErr: true, + expectedDistances: nil, + nodeId: 99, + }, + } + + for _, tcase := range testCases { + t.Run(tcase.name, func(t *testing.T) { + var err error + + nodeDir, err := os.MkdirTemp("", "TestGetDistances") + if err != nil { + t.Fatalf("Unable to create temporary directory: %v", err) + } + + defer os.RemoveAll(nodeDir) + + if tcase.nodeExists { + numaDir := filepath.Join(nodeDir, fmt.Sprintf("node%d", tcase.nodeId)) + if err := os.Mkdir(numaDir, 0700); err != nil { + t.Fatalf("Unable to create numaDir %s: %v", numaDir, err) + } + + distanceFile := filepath.Join(numaDir, "distance") + + var buffer bytes.Buffer + for i, distance := range tcase.expectedDistances { + buffer.WriteString(strconv.Itoa(int(distance))) + if i != len(tcase.expectedDistances)-1 { + buffer.WriteString(" ") + } + } + + if err = os.WriteFile(distanceFile, buffer.Bytes(), 0644); err != nil { + t.Fatalf("Unable to create test distanceFile: %v", err) + } + } + + sysFs := &NUMASysFs{nodeDir: nodeDir} + + distances, err := sysFs.GetDistances(tcase.nodeId) + if !tcase.expectedErr && err != nil { + t.Fatalf("Expected err to equal nil, not %v", err) + } else if tcase.expectedErr && err == nil { + t.Fatalf("Expected err to equal %v, not nil", tcase.expectedErr) + } + + if !tcase.expectedErr && !reflect.DeepEqual(distances, tcase.expectedDistances) { + t.Fatalf("Expected distances to equal %v, not %v", tcase.expectedDistances, distances) + } + }) + } +} + +func TestSplitDistances(t *testing.T) { + tcases := []struct { + description string + rawDistances string + expected []uint64 + expectedErr error + }{ + { + description: "read one distance", + rawDistances: "10", + expected: []uint64{10}, + expectedErr: nil, + }, + { + description: "read two distances", + rawDistances: "10 20", + expected: []uint64{10, 20}, + expectedErr: nil, + }, + { + description: "can't convert negative number to uint64", + rawDistances: "10 -20", + expected: nil, + expectedErr: fmt.Errorf("cannot conver"), + }, + } + + for _, tc := range tcases { + result, err := splitDistances(tc.rawDistances) + + if tc.expectedErr == nil && err != nil { + t.Fatalf("Expected err to equal nil, not %v", err) + } else if tc.expectedErr != nil && err == nil { + t.Fatalf("Expected err to equal %v, not nil", tc.expectedErr) + } else if tc.expectedErr != nil { + if !strings.Contains(err.Error(), tc.expectedErr.Error()) { + t.Errorf("Unexpected error message. Have: %s wants %s", err.Error(), tc.expectedErr.Error()) + } + } + + if !reflect.DeepEqual(tc.expected, result) { + t.Fatalf("Expected distances to equal: %v, got: %v", tc.expected, result) + } + } +} + +func TestClosest(t *testing.T) { + tcases := []struct { + description string + current bitmask.BitMask + candidate bitmask.BitMask + expected string + numaInfo *NUMAInfo + }{ + { + description: "current and candidate length is not the same, current narrower", + current: NewTestBitMask(0), + candidate: NewTestBitMask(0, 2), + expected: "current", + numaInfo: &NUMAInfo{}, + }, + { + description: "current and candidate length is the same, distance is the same, current more lower bits set", + current: NewTestBitMask(0, 1), + candidate: NewTestBitMask(0, 2), + expected: "current", + numaInfo: &NUMAInfo{ + NUMADistances: [][]uint64{ + {10, 10, 10}, + {10, 10, 10}, + {10, 10, 10}, + }, + }, + }, + { + description: "current and candidate length is the same, distance is the same, candidate more lower bits set", + current: NewTestBitMask(0, 3), + candidate: NewTestBitMask(0, 2), + expected: "candidate", + numaInfo: &NUMAInfo{ + NUMADistances: [][]uint64{ + {10, 10, 10, 10}, + {10, 10, 10, 10}, + {10, 10, 10, 10}, + {10, 10, 10, 10}, + }, + }, + }, + { + description: "current and candidate length is the same, candidate average distance is smaller", + current: NewTestBitMask(0, 3), + candidate: NewTestBitMask(0, 1), + expected: "candidate", + numaInfo: &NUMAInfo{ + NUMADistances: [][]uint64{ + {10, 11, 12, 12}, + {11, 10, 12, 12}, + {12, 12, 10, 11}, + {12, 12, 11, 10}, + }, + }, + }, + { + description: "current and candidate length is the same, current average distance is smaller", + current: NewTestBitMask(2, 3), + candidate: NewTestBitMask(0, 3), + expected: "current", + numaInfo: &NUMAInfo{ + NUMADistances: [][]uint64{ + {10, 11, 12, 12}, + {11, 10, 12, 12}, + {12, 12, 10, 11}, + {12, 12, 11, 10}, + }, + }, + }, + } + + for _, tc := range tcases { + t.Run(tc.description, func(t *testing.T) { + + result := tc.numaInfo.Closest(tc.candidate, tc.current) + if result != tc.current && result != tc.candidate { + t.Errorf("Expected result to be either 'current' or 'candidate' hint") + } + if tc.expected == "current" && result != tc.current { + t.Errorf("Expected result to be %v, got %v", tc.current, result) + } + if tc.expected == "candidate" && result != tc.candidate { + t.Errorf("Expected result to be %v, got %v", tc.candidate, result) + } + }) + } +}