diff --git a/cmd/kubelet/app/options/options.go b/cmd/kubelet/app/options/options.go index 719bdd9847f..f184a5c2cf0 100644 --- a/cmd/kubelet/app/options/options.go +++ b/cmd/kubelet/app/options/options.go @@ -519,6 +519,7 @@ func AddKubeletConfigFlags(mainfs *pflag.FlagSet, c *kubeletconfig.KubeletConfig fs.BoolVar(&c.ProtectKernelDefaults, "protect-kernel-defaults", c.ProtectKernelDefaults, "Default kubelet behaviour for kernel tuning. If set, kubelet errors if any of kernel tunables is different than kubelet defaults.") fs.StringVar(&c.ReservedSystemCPUs, "reserved-cpus", c.ReservedSystemCPUs, "A comma-separated list of CPUs or CPU ranges that are reserved for system and kubernetes usage. This specific list will supersede cpu counts in --system-reserved and --kube-reserved.") fs.StringVar(&c.TopologyManagerScope, "topology-manager-scope", c.TopologyManagerScope, "Scope to which topology hints applied. Topology Manager collects hints from Hint Providers and applies them to defined scope to ensure the pod admission. Possible values: 'container', 'pod'.") + fs.Var(cliflag.NewMapStringStringNoSplit(&c.TopologyManagerPolicyOptions), "topology-manager-policy-options", "A set of key=value Topology Manager policy options to use, to fine tune their behaviour. If not supplied, keep the default behaviour.") // Node Allocatable Flags fs.Var(cliflag.NewMapStringString(&c.SystemReserved), "system-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi,ephemeral-storage=1Gi) pairs that describe resources reserved for non-kubernetes components. Currently only cpu, memory and local ephemeral storage for root file system are supported. See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ for more detail. [default=none]") fs.Var(cliflag.NewMapStringString(&c.KubeReserved), "kube-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi,ephemeral-storage=1Gi) pairs that describe resources reserved for kubernetes system components. Currently only cpu, memory and local ephemeral storage for root file system are supported. See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ for more detail. [default=none]") diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go index 3ff0737e91d..c4c42cc8802 100644 --- a/cmd/kubelet/app/server.go +++ b/cmd/kubelet/app/server.go @@ -710,6 +710,16 @@ func run(ctx context.Context, s *options.KubeletServer, kubeDeps *kubelet.Depend s.CPUManagerPolicyOptions, features.CPUManager, features.CPUManagerPolicyOptions) } + var topologyManagerPolicyOptions map[string]string + if utilfeature.DefaultFeatureGate.Enabled(features.TopologyManager) { + if utilfeature.DefaultFeatureGate.Enabled(features.TopologyManagerPolicyOptions) { + topologyManagerPolicyOptions = s.TopologyManagerPolicyOptions + } else if s.TopologyManagerPolicyOptions != nil { + return fmt.Errorf("topology manager policy options %v require feature gates %q, %q enabled", + s.TopologyManagerPolicyOptions, features.TopologyManager, features.TopologyManagerPolicyOptions) + } + } + kubeDeps.ContainerManager, err = cm.NewContainerManager( kubeDeps.Mounter, kubeDeps.CAdvisorInterface, @@ -732,17 +742,18 @@ func run(ctx context.Context, s *options.KubeletServer, kubeDeps *kubelet.Depend ReservedSystemCPUs: reservedSystemCPUs, HardEvictionThresholds: hardEvictionThresholds, }, - QOSReserved: *experimentalQOSReserved, - CPUManagerPolicy: s.CPUManagerPolicy, - CPUManagerPolicyOptions: cpuManagerPolicyOptions, - CPUManagerReconcilePeriod: s.CPUManagerReconcilePeriod.Duration, - ExperimentalMemoryManagerPolicy: s.MemoryManagerPolicy, - ExperimentalMemoryManagerReservedMemory: s.ReservedMemory, - ExperimentalPodPidsLimit: s.PodPidsLimit, - EnforceCPULimits: s.CPUCFSQuota, - CPUCFSQuotaPeriod: s.CPUCFSQuotaPeriod.Duration, - ExperimentalTopologyManagerPolicy: s.TopologyManagerPolicy, - ExperimentalTopologyManagerScope: s.TopologyManagerScope, + QOSReserved: *experimentalQOSReserved, + CPUManagerPolicy: s.CPUManagerPolicy, + CPUManagerPolicyOptions: cpuManagerPolicyOptions, + CPUManagerReconcilePeriod: s.CPUManagerReconcilePeriod.Duration, + ExperimentalMemoryManagerPolicy: s.MemoryManagerPolicy, + ExperimentalMemoryManagerReservedMemory: s.ReservedMemory, + ExperimentalPodPidsLimit: s.PodPidsLimit, + EnforceCPULimits: s.CPUCFSQuota, + CPUCFSQuotaPeriod: s.CPUCFSQuotaPeriod.Duration, + ExperimentalTopologyManagerPolicy: s.TopologyManagerPolicy, + ExperimentalTopologyManagerScope: s.TopologyManagerScope, + ExperimentalTopologyManagerPolicyOptions: topologyManagerPolicyOptions, }, s.FailSwapOn, kubeDeps.Recorder) diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index db394d985a3..a31f50115cb 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -781,6 +781,34 @@ const ( // Enable resource managers to make NUMA aligned decisions TopologyManager featuregate.Feature = "TopologyManager" + // owner: @PiotrProkop + // kep: https://kep.k8s.io/3545 + // alpha: v1.26 + // + // Allow fine-tuning of topology manager policies with alpha options. + // This feature gate: + // - will guard *a group* of topology manager options whose quality level is alpha. + // - will never graduate to beta or stable. + TopologyManagerPolicyAlphaOptions featuregate.Feature = "TopologyManagerPolicyAlphaOptions" + + // owner: @PiotrProkop + // kep: https://kep.k8s.io/3545 + // alpha: v1.26 + // + // Allow fine-tuning of topology manager policies with beta options. + // This feature gate: + // - will guard *a group* of topology manager options whose quality level is beta. + // - is thus *introduced* as beta + // - will never graduate to stable. + TopologyManagerPolicyBetaOptions featuregate.Feature = "TopologyManagerPolicyBetaOptions" + + // owner: @PiotrProkop + // kep: https://kep.k8s.io/3545 + // alpha: v1.26 + // + // Allow the usage of options to fine-tune the topology manager policies. + TopologyManagerPolicyOptions featuregate.Feature = "TopologyManagerPolicyOptions" + // owner: @rata, @giuseppe // kep: https://kep.k8s.io/127 // alpha: v1.25 @@ -1050,6 +1078,12 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS TopologyManager: {Default: true, PreRelease: featuregate.Beta}, + TopologyManagerPolicyAlphaOptions: {Default: false, PreRelease: featuregate.Alpha}, + + TopologyManagerPolicyBetaOptions: {Default: false, PreRelease: featuregate.Beta}, + + TopologyManagerPolicyOptions: {Default: false, PreRelease: featuregate.Alpha}, + VolumeCapacityPriority: {Default: false, PreRelease: featuregate.Alpha}, UserNamespacesStatelessPodsSupport: {Default: false, PreRelease: featuregate.Alpha}, diff --git a/pkg/generated/openapi/zz_generated.openapi.go b/pkg/generated/openapi/zz_generated.openapi.go index 27560d75213..e2ca6f67aac 100644 --- a/pkg/generated/openapi/zz_generated.openapi.go +++ b/pkg/generated/openapi/zz_generated.openapi.go @@ -56453,6 +56453,22 @@ func schema_k8sio_kubelet_config_v1beta1_KubeletConfiguration(ref common.Referen Format: "", }, }, + "topologyManagerPolicyOptions": { + SchemaProps: spec.SchemaProps{ + Description: "TopologyManagerPolicyOptions is a set of key=value which allows to set extra options to fine tune the behaviour of the topology manager policies. Requires both the \"TopologyManager\" and \"TopologyManagerPolicyOptions\" feature gates to be enabled. Default: nil", + Type: []string{"object"}, + AdditionalProperties: &spec.SchemaOrBool{ + Allows: true, + Schema: &spec.Schema{ + SchemaProps: spec.SchemaProps{ + Default: "", + Type: []string{"string"}, + Format: "", + }, + }, + }, + }, + }, "qosReserved": { SchemaProps: spec.SchemaProps{ Description: "qosReserved is a set of resource name to percentage pairs that specify the minimum percentage of a resource reserved for exclusive use by the guaranteed QoS tier. Currently supported resources: \"memory\" Requires the QOSReserved feature gate to be enabled. Default: nil", diff --git a/pkg/kubelet/apis/config/fuzzer/fuzzer.go b/pkg/kubelet/apis/config/fuzzer/fuzzer.go index 643cc1b6947..67aea014442 100644 --- a/pkg/kubelet/apis/config/fuzzer/fuzzer.go +++ b/pkg/kubelet/apis/config/fuzzer/fuzzer.go @@ -76,6 +76,7 @@ func Funcs(codecs runtimeserializer.CodecFactory) []interface{} { obj.NodeStatusMaxImages = 50 obj.TopologyManagerPolicy = kubeletconfig.NoneTopologyManagerPolicy obj.TopologyManagerScope = kubeletconfig.ContainerTopologyManagerScope + obj.TopologyManagerPolicyOptions = make(map[string]string) obj.QOSReserved = map[string]string{ "memory": "50%", } diff --git a/pkg/kubelet/apis/config/helpers_test.go b/pkg/kubelet/apis/config/helpers_test.go index 029eb119a4e..f791a36dc87 100644 --- a/pkg/kubelet/apis/config/helpers_test.go +++ b/pkg/kubelet/apis/config/helpers_test.go @@ -175,6 +175,7 @@ var ( "CPUManagerReconcilePeriod.Duration", "TopologyManagerPolicy", "TopologyManagerScope", + "TopologyManagerPolicyOptions[*]", "QOSReserved[*]", "CgroupDriver", "CgroupRoot", diff --git a/pkg/kubelet/apis/config/types.go b/pkg/kubelet/apis/config/types.go index efc53dc8b97..04432b9fc69 100644 --- a/pkg/kubelet/apis/config/types.go +++ b/pkg/kubelet/apis/config/types.go @@ -241,6 +241,10 @@ type KubeletConfiguration struct { // Default: "container" // +optional TopologyManagerScope string + // TopologyManagerPolicyOptions is a set of key=value which allows to set extra options + // to fine tune the behaviour of the topology manager policies. + // Requires both the "TopologyManager" and "TopologyManagerPolicyOptions" feature gates to be enabled. + TopologyManagerPolicyOptions map[string]string // Map of QoS resource reservation percentages (memory only for now). // Requires the QOSReserved feature gate to be enabled. QOSReserved map[string]string diff --git a/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go b/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go index daa73aa1f8f..3f289820445 100644 --- a/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go +++ b/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go @@ -412,6 +412,7 @@ func autoConvert_v1beta1_KubeletConfiguration_To_config_KubeletConfiguration(in out.MemoryManagerPolicy = in.MemoryManagerPolicy out.TopologyManagerPolicy = in.TopologyManagerPolicy out.TopologyManagerScope = in.TopologyManagerScope + out.TopologyManagerPolicyOptions = *(*map[string]string)(unsafe.Pointer(&in.TopologyManagerPolicyOptions)) out.QOSReserved = *(*map[string]string)(unsafe.Pointer(&in.QOSReserved)) out.RuntimeRequestTimeout = in.RuntimeRequestTimeout out.HairpinMode = in.HairpinMode @@ -592,6 +593,7 @@ func autoConvert_config_KubeletConfiguration_To_v1beta1_KubeletConfiguration(in out.MemoryManagerPolicy = in.MemoryManagerPolicy out.TopologyManagerPolicy = in.TopologyManagerPolicy out.TopologyManagerScope = in.TopologyManagerScope + out.TopologyManagerPolicyOptions = *(*map[string]string)(unsafe.Pointer(&in.TopologyManagerPolicyOptions)) out.QOSReserved = *(*map[string]string)(unsafe.Pointer(&in.QOSReserved)) out.RuntimeRequestTimeout = in.RuntimeRequestTimeout out.HairpinMode = in.HairpinMode diff --git a/pkg/kubelet/apis/config/zz_generated.deepcopy.go b/pkg/kubelet/apis/config/zz_generated.deepcopy.go index 59581e52e42..9436182f325 100644 --- a/pkg/kubelet/apis/config/zz_generated.deepcopy.go +++ b/pkg/kubelet/apis/config/zz_generated.deepcopy.go @@ -211,6 +211,13 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) { } } out.CPUManagerReconcilePeriod = in.CPUManagerReconcilePeriod + if in.TopologyManagerPolicyOptions != nil { + in, out := &in.TopologyManagerPolicyOptions, &out.TopologyManagerPolicyOptions + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } if in.QOSReserved != nil { in, out := &in.QOSReserved, &out.QOSReserved *out = make(map[string]string, len(*in)) diff --git a/pkg/kubelet/cm/container_manager.go b/pkg/kubelet/cm/container_manager.go index 8b94671be7e..f39bb64f66f 100644 --- a/pkg/kubelet/cm/container_manager.go +++ b/pkg/kubelet/cm/container_manager.go @@ -133,17 +133,18 @@ type NodeConfig struct { KubeletRootDir string ProtectKernelDefaults bool NodeAllocatableConfig - QOSReserved map[v1.ResourceName]int64 - CPUManagerPolicy string - CPUManagerPolicyOptions map[string]string - ExperimentalTopologyManagerScope string - CPUManagerReconcilePeriod time.Duration - ExperimentalMemoryManagerPolicy string - ExperimentalMemoryManagerReservedMemory []kubeletconfig.MemoryReservation - ExperimentalPodPidsLimit int64 - EnforceCPULimits bool - CPUCFSQuotaPeriod time.Duration - ExperimentalTopologyManagerPolicy string + QOSReserved map[v1.ResourceName]int64 + CPUManagerPolicy string + CPUManagerPolicyOptions map[string]string + ExperimentalTopologyManagerScope string + CPUManagerReconcilePeriod time.Duration + ExperimentalMemoryManagerPolicy string + ExperimentalMemoryManagerReservedMemory []kubeletconfig.MemoryReservation + ExperimentalPodPidsLimit int64 + EnforceCPULimits bool + CPUCFSQuotaPeriod time.Duration + ExperimentalTopologyManagerPolicy string + ExperimentalTopologyManagerPolicyOptions map[string]string } type NodeAllocatableConfig struct { diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go index bd1e1c225b4..dfbfc2f7325 100644 --- a/pkg/kubelet/cm/container_manager_linux.go +++ b/pkg/kubelet/cm/container_manager_linux.go @@ -289,6 +289,7 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I machineInfo.Topology, nodeConfig.ExperimentalTopologyManagerPolicy, nodeConfig.ExperimentalTopologyManagerScope, + nodeConfig.ExperimentalTopologyManagerPolicyOptions, ) if err != nil { diff --git a/pkg/kubelet/cm/cpumanager/policy_options_test.go b/pkg/kubelet/cm/cpumanager/policy_options_test.go index 8fc3bb8bdc3..60917d06f4f 100644 --- a/pkg/kubelet/cm/cpumanager/policy_options_test.go +++ b/pkg/kubelet/cm/cpumanager/policy_options_test.go @@ -162,7 +162,7 @@ func TestValidateStaticPolicyOptions(t *testing.T) { t.Run(testCase.description, func(t *testing.T) { topoMgrPolicy := topologymanager.NewNonePolicy() if testCase.topoMgrPolicy == topologymanager.PolicySingleNumaNode { - topoMgrPolicy = topologymanager.NewSingleNumaNodePolicy(nil) + topoMgrPolicy, _ = topologymanager.NewSingleNumaNodePolicy(&topologymanager.NUMAInfo{}, map[string]string{}) } topoMgrStore := topologymanager.NewFakeManagerWithPolicy(topoMgrPolicy) diff --git a/pkg/kubelet/cm/topologymanager/bitmask/bitmask.go b/pkg/kubelet/cm/topologymanager/bitmask/bitmask.go index fb2043e0182..3d029adb691 100644 --- a/pkg/kubelet/cm/topologymanager/bitmask/bitmask.go +++ b/pkg/kubelet/cm/topologymanager/bitmask/bitmask.go @@ -35,6 +35,8 @@ type BitMask interface { IsSet(bit int) bool AnySet(bits []int) bool IsNarrowerThan(mask BitMask) bool + IsLessThan(mask BitMask) bool + IsGreaterThan(mask BitMask) bool String() string Count() int GetBits() []int @@ -143,13 +145,21 @@ func (s *bitMask) IsEqual(mask BitMask) bool { // lower-numbered bits set wins out. func (s *bitMask) IsNarrowerThan(mask BitMask) bool { if s.Count() == mask.Count() { - if *s < *mask.(*bitMask) { - return true - } + return s.IsLessThan(mask) } return s.Count() < mask.Count() } +// IsLessThan checks which bitmask has more lower-numbered bits set. +func (s *bitMask) IsLessThan(mask BitMask) bool { + return *s < *mask.(*bitMask) +} + +// IsGreaterThan checks which bitmask has more higher-numbered bits set. +func (s *bitMask) IsGreaterThan(mask BitMask) bool { + return *s > *mask.(*bitMask) +} + // String converts mask to string func (s *bitMask) String() string { grouping := 2 diff --git a/pkg/kubelet/cm/topologymanager/bitmask/bitmask_test.go b/pkg/kubelet/cm/topologymanager/bitmask/bitmask_test.go index d4a5f569509..784b28715e7 100644 --- a/pkg/kubelet/cm/topologymanager/bitmask/bitmask_test.go +++ b/pkg/kubelet/cm/topologymanager/bitmask/bitmask_test.go @@ -630,3 +630,75 @@ func TestIterateBitMasks(t *testing.T) { } } } + +func TestIsLessThan(t *testing.T) { + tcases := []struct { + name string + firstMask []int + secondMask []int + expectedFirstLower bool + }{ + { + name: "Check which value is lower of masks with equal bits set 1/1", + firstMask: []int{0}, + secondMask: []int{0}, + expectedFirstLower: false, + }, + { + name: "Check which value is lower of masks with unequal bits set 2/1", + firstMask: []int{1}, + secondMask: []int{0}, + expectedFirstLower: false, + }, + { + name: "Check which value is lower of masks with unequal bits set 1/2", + firstMask: []int{0}, + secondMask: []int{1}, + expectedFirstLower: true, + }, + } + for _, tc := range tcases { + firstMask, _ := NewBitMask(tc.firstMask...) + secondMask, _ := NewBitMask(tc.secondMask...) + expectedFirstLower := firstMask.IsLessThan(secondMask) + if expectedFirstLower != tc.expectedFirstLower { + t.Errorf("Expected value to be %v, got %v", tc.expectedFirstLower, expectedFirstLower) + } + } +} + +func TestIsGreaterThan(t *testing.T) { + tcases := []struct { + name string + firstMask []int + secondMask []int + expectedFirstGreater bool + }{ + { + name: "Check which value is greater of masks with equal bits set 1/1", + firstMask: []int{0}, + secondMask: []int{0}, + expectedFirstGreater: false, + }, + { + name: "Check which value is greater of masks with unequal bits set 2/1", + firstMask: []int{1}, + secondMask: []int{0}, + expectedFirstGreater: true, + }, + { + name: "Check which value is greater of masks with unequal bits set 1/2", + firstMask: []int{0}, + secondMask: []int{1}, + expectedFirstGreater: false, + }, + } + for _, tc := range tcases { + firstMask, _ := NewBitMask(tc.firstMask...) + secondMask, _ := NewBitMask(tc.secondMask...) + expectedFirstGreater := firstMask.IsGreaterThan(secondMask) + if expectedFirstGreater != tc.expectedFirstGreater { + t.Errorf("Expected value to be %v, got %v", tc.expectedFirstGreater, expectedFirstGreater) + } + } +} diff --git a/pkg/kubelet/cm/topologymanager/numa_info.go b/pkg/kubelet/cm/topologymanager/numa_info.go new file mode 100644 index 00000000000..d38f525c845 --- /dev/null +++ b/pkg/kubelet/cm/topologymanager/numa_info.go @@ -0,0 +1,150 @@ +/* +Copyright 2022 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package topologymanager + +import ( + "fmt" + "io/ioutil" + "strconv" + "strings" + + cadvisorapi "github.com/google/cadvisor/info/v1" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask" +) + +const ( + defaultNodeDir = "/sys/devices/system/node/" +) + +type NUMADistances [][]uint64 + +type NUMAInfo struct { + Nodes []int + NUMADistances NUMADistances +} + +type NUMASysFs struct { + nodeDir string +} + +func NewNUMAInfo(topology []cadvisorapi.Node) (*NUMAInfo, error) { + return newNUMAInfo(topology, &NUMASysFs{nodeDir: defaultNodeDir}) +} + +func newNUMAInfo(topology []cadvisorapi.Node, sysFs *NUMASysFs) (*NUMAInfo, error) { + var numaNodes []int + distances := make([][]uint64, len(topology)) + for _, node := range topology { + numaNodes = append(numaNodes, node.Id) + + // Populate the NUMA distances + // For now we need to retrieve this information from sysfs. + // TODO: Update this as follows once a version of cadvisor with this commit is vendored in https://github.com/google/cadvisor/commit/24dd1de08a72cfee661f6178454db995900c0fee + // distances[node.Id] = node.Distances[:] + nodeDistance, err := sysFs.GetDistances(node.Id) + if err != nil { + return nil, fmt.Errorf("error getting NUMA distances from sysfs: %w", err) + } + distances[node.Id] = nodeDistance + } + + numaInfo := &NUMAInfo{ + Nodes: numaNodes, + NUMADistances: distances, + } + + return numaInfo, nil +} + +func (n *NUMAInfo) Narrowest(m1 bitmask.BitMask, m2 bitmask.BitMask) bitmask.BitMask { + if m1.IsNarrowerThan(m2) { + return m1 + } + return m2 +} + +func (n *NUMAInfo) Closest(m1 bitmask.BitMask, m2 bitmask.BitMask) bitmask.BitMask { + // If the length of both bitmasks aren't the same, choose the one that is narrowest. + if m1.Count() != m2.Count() { + return n.Narrowest(m1, m2) + } + + m1Distance := n.NUMADistances.CalculateAverageFor(m1) + m2Distance := n.NUMADistances.CalculateAverageFor(m2) + // If average distance is the same, take bitmask with more lower-number bits set. + if m1Distance == m2Distance { + if m1.IsLessThan(m2) { + return m1 + } + return m2 + } + + // Otherwise, return the bitmask with the shortest average distance between NUMA nodes. + if m1Distance < m2Distance { + return m1 + } + + return m2 +} + +func (n NUMAInfo) DefaultAffinityMask() bitmask.BitMask { + defaultAffinity, _ := bitmask.NewBitMask(n.Nodes...) + return defaultAffinity +} + +func (d NUMADistances) CalculateAverageFor(bm bitmask.BitMask) float64 { + // This should never happen, but just in case make sure we do not divide by zero. + if bm.Count() == 0 { + return 0 + } + + var count float64 = 0 + var sum float64 = 0 + for _, node1 := range bm.GetBits() { + for _, node2 := range bm.GetBits() { + sum += float64(d[node1][node2]) + count++ + } + } + + return sum / count +} + +func (s NUMASysFs) GetDistances(nodeId int) ([]uint64, error) { + distancePath := fmt.Sprintf("%s/node%d/distance", s.nodeDir, nodeId) + distance, err := ioutil.ReadFile(distancePath) + if err != nil { + return nil, fmt.Errorf("problem reading %s: %w", distancePath, err) + } + + rawDistances := strings.TrimSpace(string(distance)) + + return splitDistances(rawDistances) +} + +func splitDistances(rawDistances string) ([]uint64, error) { + distances := []uint64{} + for _, distance := range strings.Split(rawDistances, " ") { + distanceUint, err := strconv.ParseUint(distance, 10, 64) + if err != nil { + return nil, fmt.Errorf("cannot convert %s to int", distance) + } + distances = append(distances, distanceUint) + } + + return distances, nil +} diff --git a/pkg/kubelet/cm/topologymanager/numa_info_test.go b/pkg/kubelet/cm/topologymanager/numa_info_test.go new file mode 100644 index 00000000000..c101094f2bb --- /dev/null +++ b/pkg/kubelet/cm/topologymanager/numa_info_test.go @@ -0,0 +1,584 @@ +/* +Copyright 2022 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package topologymanager + +import ( + "bytes" + "fmt" + "os" + "path/filepath" + "reflect" + "strconv" + "strings" + "testing" + + cadvisorapi "github.com/google/cadvisor/info/v1" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask" +) + +func TestNUMAInfo(t *testing.T) { + tcases := []struct { + name string + topology []cadvisorapi.Node + expectedNUMAInfo *NUMAInfo + expectedErr error + }{ + { + name: "positive test 1 node", + topology: []cadvisorapi.Node{ + { + Id: 0, + }, + }, + expectedNUMAInfo: &NUMAInfo{ + Nodes: []int{0}, + NUMADistances: NUMADistances{ + { + 10, + 11, + 12, + 12, + }, + }, + }, + }, + { + name: "positive test 2 nodes", + topology: []cadvisorapi.Node{ + { + Id: 0, + }, + { + Id: 1, + }, + }, + expectedNUMAInfo: &NUMAInfo{ + Nodes: []int{0, 1}, + NUMADistances: NUMADistances{ + { + 10, + 11, + 12, + 12, + }, + { + 11, + 10, + 12, + 12, + }, + }, + }, + }, + { + name: "positive test 3 nodes", + topology: []cadvisorapi.Node{ + { + Id: 0, + }, + { + Id: 1, + }, + { + Id: 2, + }, + }, + expectedNUMAInfo: &NUMAInfo{ + Nodes: []int{0, 1, 2}, + NUMADistances: NUMADistances{ + { + 10, + 11, + 12, + 12, + }, + { + 11, + 10, + 12, + 12, + }, + { + 12, + 12, + 10, + 11, + }, + }, + }, + }, + { + name: "positive test 4 nodes", + topology: []cadvisorapi.Node{ + { + Id: 0, + }, + { + Id: 1, + }, + { + Id: 2, + }, + { + Id: 3, + }, + }, + expectedNUMAInfo: &NUMAInfo{ + Nodes: []int{0, 1, 2, 3}, + NUMADistances: NUMADistances{ + { + 10, + 11, + 12, + 12, + }, + { + 11, + 10, + 12, + 12, + }, + { + 12, + 12, + 10, + 11, + }, + { + 12, + 12, + 11, + 10, + }, + }, + }, + }, + { + name: "negative test 1 node", + topology: []cadvisorapi.Node{ + { + Id: 9, + }, + }, + expectedNUMAInfo: nil, + expectedErr: fmt.Errorf("no such file or directory"), + }, + } + + nodeDir, err := os.MkdirTemp("", "TestNUMAInfo") + if err != nil { + t.Fatalf("Unable to create temporary directory: %v", err) + } + defer os.RemoveAll(nodeDir) + + numaDistances := map[int]string{ + 0: "10 11 12 12", + 1: "11 10 12 12", + 2: "12 12 10 11", + 3: "12 12 11 10", + } + + for i, distances := range numaDistances { + numaDir := filepath.Join(nodeDir, fmt.Sprintf("node%d", i)) + if err := os.Mkdir(numaDir, 0700); err != nil { + t.Fatalf("Unable to create numaDir %s: %v", numaDir, err) + } + + distanceFile := filepath.Join(numaDir, "distance") + + if err = os.WriteFile(distanceFile, []byte(distances), 0644); err != nil { + t.Fatalf("Unable to create test distanceFile: %v", err) + } + } + + // stub sysFs to read from temp dir + sysFs := &NUMASysFs{nodeDir: nodeDir} + + for _, tcase := range tcases { + topology, err := newNUMAInfo(tcase.topology, sysFs) + if tcase.expectedErr == nil && err != nil { + t.Fatalf("Expected err to equal nil, not %v", err) + } else if tcase.expectedErr != nil && err == nil { + t.Fatalf("Expected err to equal %v, not nil", tcase.expectedErr) + } else if tcase.expectedErr != nil { + if !strings.Contains(err.Error(), tcase.expectedErr.Error()) { + t.Errorf("Unexpected error message. Have: %s wants %s", err.Error(), tcase.expectedErr.Error()) + } + } + + if !reflect.DeepEqual(topology, tcase.expectedNUMAInfo) { + t.Fatalf("Expected topology to equal %v, not %v", tcase.expectedNUMAInfo, topology) + } + + } +} + +func TestCalculateAvgDistanceFor(t *testing.T) { + tcases := []struct { + name string + bm []int + distance [][]uint64 + expectedAvg float64 + }{ + { + name: "1 NUMA node", + bm: []int{ + 0, + }, + distance: [][]uint64{ + { + 10, + }, + }, + expectedAvg: 10, + }, + { + name: "2 NUMA node, 1 set in bitmask", + bm: []int{ + 0, + }, + distance: [][]uint64{ + { + 10, + 11, + }, + { + 11, + 10, + }, + }, + expectedAvg: 10, + }, + { + name: "2 NUMA node, 2 set in bitmask", + bm: []int{ + 0, + 1, + }, + distance: [][]uint64{ + { + 10, + 11, + }, + { + 11, + 10, + }, + }, + expectedAvg: 10.5, + }, + { + name: "4 NUMA node, 2 set in bitmask", + bm: []int{ + 0, + 2, + }, + distance: [][]uint64{ + { + 10, + 11, + 12, + 12, + }, + { + 11, + 10, + 12, + 12, + }, + { + 12, + 12, + 10, + 11, + }, + { + 12, + 12, + 11, + 10, + }, + }, + expectedAvg: 11, + }, + { + name: "4 NUMA node, 3 set in bitmask", + bm: []int{ + 0, + 2, + 3, + }, + distance: [][]uint64{ + { + 10, + 11, + 12, + 12, + }, + { + 11, + 10, + 12, + 12, + }, + { + 12, + 12, + 10, + 11, + }, + { + 12, + 12, + 11, + 10, + }, + }, + expectedAvg: 11.11111111111111, + }, + { + name: "0 NUMA node, 0 set in bitmask", + bm: []int{}, + distance: [][]uint64{}, + expectedAvg: 0, + }, + } + + for _, tcase := range tcases { + bm, err := bitmask.NewBitMask(tcase.bm...) + if err != nil { + t.Errorf("no error expected got %v", err) + } + + numaInfo := NUMAInfo{ + Nodes: tcase.bm, + NUMADistances: tcase.distance, + } + + result := numaInfo.NUMADistances.CalculateAverageFor(bm) + if result != tcase.expectedAvg { + t.Errorf("Expected result to equal %g, not %g", tcase.expectedAvg, result) + } + } + +} + +func TestGetDistances(t *testing.T) { + testCases := []struct { + name string + expectedErr bool + expectedDistances []uint64 + nodeId int + nodeExists bool + }{ + { + name: "reading proper distance file", + expectedErr: false, + expectedDistances: []uint64{10, 11, 12, 13}, + nodeId: 0, + nodeExists: true, + }, + { + name: "no distance file", + expectedErr: true, + expectedDistances: nil, + nodeId: 99, + }, + } + + for _, tcase := range testCases { + t.Run(tcase.name, func(t *testing.T) { + var err error + + nodeDir, err := os.MkdirTemp("", "TestGetDistances") + if err != nil { + t.Fatalf("Unable to create temporary directory: %v", err) + } + + defer os.RemoveAll(nodeDir) + + if tcase.nodeExists { + numaDir := filepath.Join(nodeDir, fmt.Sprintf("node%d", tcase.nodeId)) + if err := os.Mkdir(numaDir, 0700); err != nil { + t.Fatalf("Unable to create numaDir %s: %v", numaDir, err) + } + + distanceFile := filepath.Join(numaDir, "distance") + + var buffer bytes.Buffer + for i, distance := range tcase.expectedDistances { + buffer.WriteString(strconv.Itoa(int(distance))) + if i != len(tcase.expectedDistances)-1 { + buffer.WriteString(" ") + } + } + + if err = os.WriteFile(distanceFile, buffer.Bytes(), 0644); err != nil { + t.Fatalf("Unable to create test distanceFile: %v", err) + } + } + + sysFs := &NUMASysFs{nodeDir: nodeDir} + + distances, err := sysFs.GetDistances(tcase.nodeId) + if !tcase.expectedErr && err != nil { + t.Fatalf("Expected err to equal nil, not %v", err) + } else if tcase.expectedErr && err == nil { + t.Fatalf("Expected err to equal %v, not nil", tcase.expectedErr) + } + + if !tcase.expectedErr && !reflect.DeepEqual(distances, tcase.expectedDistances) { + t.Fatalf("Expected distances to equal %v, not %v", tcase.expectedDistances, distances) + } + }) + } +} + +func TestSplitDistances(t *testing.T) { + tcases := []struct { + description string + rawDistances string + expected []uint64 + expectedErr error + }{ + { + description: "read one distance", + rawDistances: "10", + expected: []uint64{10}, + expectedErr: nil, + }, + { + description: "read two distances", + rawDistances: "10 20", + expected: []uint64{10, 20}, + expectedErr: nil, + }, + { + description: "can't convert negative number to uint64", + rawDistances: "10 -20", + expected: nil, + expectedErr: fmt.Errorf("cannot conver"), + }, + } + + for _, tc := range tcases { + result, err := splitDistances(tc.rawDistances) + + if tc.expectedErr == nil && err != nil { + t.Fatalf("Expected err to equal nil, not %v", err) + } else if tc.expectedErr != nil && err == nil { + t.Fatalf("Expected err to equal %v, not nil", tc.expectedErr) + } else if tc.expectedErr != nil { + if !strings.Contains(err.Error(), tc.expectedErr.Error()) { + t.Errorf("Unexpected error message. Have: %s wants %s", err.Error(), tc.expectedErr.Error()) + } + } + + if !reflect.DeepEqual(tc.expected, result) { + t.Fatalf("Expected distances to equal: %v, got: %v", tc.expected, result) + } + } +} + +func TestClosest(t *testing.T) { + tcases := []struct { + description string + current bitmask.BitMask + candidate bitmask.BitMask + expected string + numaInfo *NUMAInfo + }{ + { + description: "current and candidate length is not the same, current narrower", + current: NewTestBitMask(0), + candidate: NewTestBitMask(0, 2), + expected: "current", + numaInfo: &NUMAInfo{}, + }, + { + description: "current and candidate length is the same, distance is the same, current more lower bits set", + current: NewTestBitMask(0, 1), + candidate: NewTestBitMask(0, 2), + expected: "current", + numaInfo: &NUMAInfo{ + NUMADistances: [][]uint64{ + {10, 10, 10}, + {10, 10, 10}, + {10, 10, 10}, + }, + }, + }, + { + description: "current and candidate length is the same, distance is the same, candidate more lower bits set", + current: NewTestBitMask(0, 3), + candidate: NewTestBitMask(0, 2), + expected: "candidate", + numaInfo: &NUMAInfo{ + NUMADistances: [][]uint64{ + {10, 10, 10, 10}, + {10, 10, 10, 10}, + {10, 10, 10, 10}, + {10, 10, 10, 10}, + }, + }, + }, + { + description: "current and candidate length is the same, candidate average distance is smaller", + current: NewTestBitMask(0, 3), + candidate: NewTestBitMask(0, 1), + expected: "candidate", + numaInfo: &NUMAInfo{ + NUMADistances: [][]uint64{ + {10, 11, 12, 12}, + {11, 10, 12, 12}, + {12, 12, 10, 11}, + {12, 12, 11, 10}, + }, + }, + }, + { + description: "current and candidate length is the same, current average distance is smaller", + current: NewTestBitMask(2, 3), + candidate: NewTestBitMask(0, 3), + expected: "current", + numaInfo: &NUMAInfo{ + NUMADistances: [][]uint64{ + {10, 11, 12, 12}, + {11, 10, 12, 12}, + {12, 12, 10, 11}, + {12, 12, 11, 10}, + }, + }, + }, + } + + for _, tc := range tcases { + t.Run(tc.description, func(t *testing.T) { + + result := tc.numaInfo.Closest(tc.candidate, tc.current) + if result != tc.current && result != tc.candidate { + t.Errorf("Expected result to be either 'current' or 'candidate' hint") + } + if tc.expected == "current" && result != tc.current { + t.Errorf("Expected result to be %v, got %v", tc.current, result) + } + if tc.expected == "candidate" && result != tc.candidate { + t.Errorf("Expected result to be %v, got %v", tc.candidate, result) + } + }) + } +} diff --git a/pkg/kubelet/cm/topologymanager/policy.go b/pkg/kubelet/cm/topologymanager/policy.go index 38ad9792c5c..09f1c7bc2cf 100644 --- a/pkg/kubelet/cm/topologymanager/policy.go +++ b/pkg/kubelet/cm/topologymanager/policy.go @@ -33,11 +33,10 @@ type Policy interface { // Merge a TopologyHints permutation to a single hint by performing a bitwise-AND // of their affinity masks. The hint shall be preferred if all hits in the permutation // are preferred. -func mergePermutation(numaNodes []int, permutation []TopologyHint) TopologyHint { +func mergePermutation(defaultAffinity bitmask.BitMask, permutation []TopologyHint) TopologyHint { // Get the NUMANodeAffinity from each hint in the permutation and see if any // of them encode unpreferred allocations. preferred := true - defaultAffinity, _ := bitmask.NewBitMask(numaNodes...) var numaAffinities []bitmask.BitMask for _, hint := range permutation { // Only consider hints that have an actual NUMANodeAffinity set. @@ -127,7 +126,50 @@ func maxOfMinAffinityCounts(filteredHints [][]TopologyHint) int { return maxOfMinCount } -func compareHints(bestNonPreferredAffinityCount int, current *TopologyHint, candidate *TopologyHint) *TopologyHint { +type HintMerger struct { + NUMAInfo *NUMAInfo + Hints [][]TopologyHint + // Set bestNonPreferredAffinityCount to help decide which affinity mask is + // preferred amongst all non-preferred hints. We calculate this value as + // the maximum of the minimum affinity counts supplied for any given hint + // provider. In other words, prefer a hint that has an affinity mask that + // includes all of the NUMA nodes from the provider that requires the most + // NUMA nodes to satisfy its allocation. + BestNonPreferredAffinityCount int + CompareNUMAAffinityMasks func(candidate *TopologyHint, current *TopologyHint) (best *TopologyHint) +} + +func NewHintMerger(numaInfo *NUMAInfo, hints [][]TopologyHint, policyName string, opts PolicyOptions) HintMerger { + compareNumaAffinityMasks := func(current, candidate *TopologyHint) *TopologyHint { + // If current and candidate bitmasks are the same, prefer current hint. + if candidate.NUMANodeAffinity.IsEqual(current.NUMANodeAffinity) { + return current + } + + // Otherwise compare the hints, based on the policy options provided + var best bitmask.BitMask + if (policyName != PolicySingleNumaNode) && opts.PreferClosestNUMA { + best = numaInfo.Closest(current.NUMANodeAffinity, candidate.NUMANodeAffinity) + } else { + best = numaInfo.Narrowest(current.NUMANodeAffinity, candidate.NUMANodeAffinity) + } + if best.IsEqual(current.NUMANodeAffinity) { + return current + } + return candidate + } + + merger := HintMerger{ + NUMAInfo: numaInfo, + Hints: hints, + BestNonPreferredAffinityCount: maxOfMinAffinityCounts(hints), + CompareNUMAAffinityMasks: compareNumaAffinityMasks, + } + + return merger +} + +func (m HintMerger) compare(current *TopologyHint, candidate *TopologyHint) *TopologyHint { // Only consider candidates that result in a NUMANodeAffinity > 0 to // replace the current bestHint. if candidate.NUMANodeAffinity.Count() == 0 { @@ -146,20 +188,18 @@ func compareHints(bestNonPreferredAffinityCount int, current *TopologyHint, cand } // If the current bestHint is preferred and the candidate hint is - // non-preferred, never update the bestHint, regardless of the - // candidate hint's narowness. + // non-preferred, never update the bestHint, regardless of how + // the candidate hint's affinity mask compares to the current + // hint's affinity mask. if current.Preferred && !candidate.Preferred { return current } // If the current bestHint and the candidate hint are both preferred, - // then only consider candidate hints that have a narrower - // NUMANodeAffinity than the NUMANodeAffinity in the current bestHint. + // then only consider fitter NUMANodeAffinity if current.Preferred && candidate.Preferred { - if candidate.NUMANodeAffinity.IsNarrowerThan(current.NUMANodeAffinity) { - return candidate - } - return current + return m.CompareNUMAAffinityMasks(current, candidate) + } // The only case left is if the current best bestHint and the candidate @@ -173,13 +213,13 @@ func compareHints(bestNonPreferredAffinityCount int, current *TopologyHint, cand // 3. current.NUMANodeAffinity.Count() < bestNonPreferredAffinityCount // // For case (1), the current bestHint is larger than the - // bestNonPreferredAffinityCount, so updating to any narrower mergeHint + // bestNonPreferredAffinityCount, so updating to fitter mergeHint // is preferred over staying where we are. // // For case (2), the current bestHint is equal to the // bestNonPreferredAffinityCount, so we would like to stick with what // we have *unless* the candidate hint is also equal to - // bestNonPreferredAffinityCount and it is narrower. + // bestNonPreferredAffinityCount and it is fitter. // // For case (3), the current bestHint is less than // bestNonPreferredAffinityCount, so we would like to creep back up to @@ -216,33 +256,28 @@ func compareHints(bestNonPreferredAffinityCount int, current *TopologyHint, cand // the bestNonPreferredAffinityCount. // // Finally, for case (3cc), we know that the current bestHint and the - // candidate hint are equal, so we simply choose the narrower of the 2. + // candidate hint are equal, so we simply choose the fitter of the 2. // Case 1 - if current.NUMANodeAffinity.Count() > bestNonPreferredAffinityCount { - if candidate.NUMANodeAffinity.IsNarrowerThan(current.NUMANodeAffinity) { - return candidate - } - return current + if current.NUMANodeAffinity.Count() > m.BestNonPreferredAffinityCount { + return m.CompareNUMAAffinityMasks(current, candidate) } // Case 2 - if current.NUMANodeAffinity.Count() == bestNonPreferredAffinityCount { - if candidate.NUMANodeAffinity.Count() != bestNonPreferredAffinityCount { + if current.NUMANodeAffinity.Count() == m.BestNonPreferredAffinityCount { + if candidate.NUMANodeAffinity.Count() != m.BestNonPreferredAffinityCount { return current } - if candidate.NUMANodeAffinity.IsNarrowerThan(current.NUMANodeAffinity) { - return candidate - } - return current + return m.CompareNUMAAffinityMasks(current, candidate) } // Case 3a - if candidate.NUMANodeAffinity.Count() > bestNonPreferredAffinityCount { + if candidate.NUMANodeAffinity.Count() > m.BestNonPreferredAffinityCount { return current } // Case 3b - if candidate.NUMANodeAffinity.Count() == bestNonPreferredAffinityCount { + if candidate.NUMANodeAffinity.Count() == m.BestNonPreferredAffinityCount { return candidate } + // Case 3ca if candidate.NUMANodeAffinity.Count() > current.NUMANodeAffinity.Count() { return candidate @@ -251,35 +286,27 @@ func compareHints(bestNonPreferredAffinityCount int, current *TopologyHint, cand if candidate.NUMANodeAffinity.Count() < current.NUMANodeAffinity.Count() { return current } + // Case 3cc - if candidate.NUMANodeAffinity.IsNarrowerThan(current.NUMANodeAffinity) { - return candidate - } - return current + return m.CompareNUMAAffinityMasks(current, candidate) + } -func mergeFilteredHints(numaNodes []int, filteredHints [][]TopologyHint) TopologyHint { - // Set bestNonPreferredAffinityCount to help decide which affinity mask is - // preferred amongst all non-preferred hints. We calculate this value as - // the maximum of the minimum affinity counts supplied for any given hint - // provider. In other words, prefer a hint that has an affinity mask that - // includes all of the NUMA nodes from the provider that requires the most - // NUMA nodes to satisfy its allocation. - bestNonPreferredAffinityCount := maxOfMinAffinityCounts(filteredHints) +func (m HintMerger) Merge() TopologyHint { + defaultAffinity := m.NUMAInfo.DefaultAffinityMask() var bestHint *TopologyHint - iterateAllProviderTopologyHints(filteredHints, func(permutation []TopologyHint) { + iterateAllProviderTopologyHints(m.Hints, func(permutation []TopologyHint) { // Get the NUMANodeAffinity from each hint in the permutation and see if any // of them encode unpreferred allocations. - mergedHint := mergePermutation(numaNodes, permutation) + mergedHint := mergePermutation(defaultAffinity, permutation) // Compare the current bestHint with the candidate mergedHint and // update bestHint if appropriate. - bestHint = compareHints(bestNonPreferredAffinityCount, bestHint, &mergedHint) + bestHint = m.compare(bestHint, &mergedHint) }) if bestHint == nil { - defaultAffinity, _ := bitmask.NewBitMask(numaNodes...) bestHint = &TopologyHint{defaultAffinity, false} } diff --git a/pkg/kubelet/cm/topologymanager/policy_best_effort.go b/pkg/kubelet/cm/topologymanager/policy_best_effort.go index 651f3a76572..186e622fc1c 100644 --- a/pkg/kubelet/cm/topologymanager/policy_best_effort.go +++ b/pkg/kubelet/cm/topologymanager/policy_best_effort.go @@ -17,8 +17,9 @@ limitations under the License. package topologymanager type bestEffortPolicy struct { - //List of NUMA Nodes available on the underlying machine - numaNodes []int + // numaInfo represents list of NUMA Nodes available on the underlying machine and distances between them + numaInfo *NUMAInfo + opts PolicyOptions } var _ Policy = &bestEffortPolicy{} @@ -27,8 +28,13 @@ var _ Policy = &bestEffortPolicy{} const PolicyBestEffort string = "best-effort" // NewBestEffortPolicy returns best-effort policy. -func NewBestEffortPolicy(numaNodes []int) Policy { - return &bestEffortPolicy{numaNodes: numaNodes} +func NewBestEffortPolicy(numaInfo *NUMAInfo, topologyPolicyOptions map[string]string) (Policy, error) { + opts, err := NewPolicyOptions(topologyPolicyOptions) + if err != nil { + return nil, err + } + + return &bestEffortPolicy{numaInfo: numaInfo, opts: opts}, nil } func (p *bestEffortPolicy) Name() string { @@ -40,8 +46,9 @@ func (p *bestEffortPolicy) canAdmitPodResult(hint *TopologyHint) bool { } func (p *bestEffortPolicy) Merge(providersHints []map[string][]TopologyHint) (TopologyHint, bool) { - filteredProvidersHints := filterProvidersHints(providersHints) - bestHint := mergeFilteredHints(p.numaNodes, filteredProvidersHints) + filteredHints := filterProvidersHints(providersHints) + merger := NewHintMerger(p.numaInfo, filteredHints, p.Name(), p.opts) + bestHint := merger.Merge() admit := p.canAdmitPodResult(&bestHint) return bestHint, admit } diff --git a/pkg/kubelet/cm/topologymanager/policy_best_effort_test.go b/pkg/kubelet/cm/topologymanager/policy_best_effort_test.go index 43bcc6ca933..70b59a4b7f0 100644 --- a/pkg/kubelet/cm/topologymanager/policy_best_effort_test.go +++ b/pkg/kubelet/cm/topologymanager/policy_best_effort_test.go @@ -39,9 +39,9 @@ func TestPolicyBestEffortCanAdmitPodResult(t *testing.T) { } for _, tc := range tcases { - numaNodes := []int{0, 1} - policy := NewBestEffortPolicy(numaNodes) - result := policy.(*bestEffortPolicy).canAdmitPodResult(&tc.hint) + numaInfo := commonNUMAInfoTwoNodes() + policy := &bestEffortPolicy{numaInfo: numaInfo} + result := policy.canAdmitPodResult(&tc.hint) if result != tc.expected { t.Errorf("Expected result to be %t, got %t", tc.expected, result) @@ -50,11 +50,26 @@ func TestPolicyBestEffortCanAdmitPodResult(t *testing.T) { } func TestPolicyBestEffortMerge(t *testing.T) { - numaNodes := []int{0, 1, 2, 3} - policy := NewBestEffortPolicy(numaNodes) + numaInfo := commonNUMAInfoFourNodes() + policy := &bestEffortPolicy{numaInfo: numaInfo} - tcases := commonPolicyMergeTestCases(numaNodes) - tcases = append(tcases, policy.(*bestEffortPolicy).mergeTestCases(numaNodes)...) + tcases := commonPolicyMergeTestCases(numaInfo.Nodes) + tcases = append(tcases, policy.mergeTestCases(numaInfo.Nodes)...) + tcases = append(tcases, policy.mergeTestCasesNoPolicies(numaInfo.Nodes)...) + + testPolicyMerge(policy, tcases, t) +} + +func TestPolicyBestEffortMergeClosestNUMA(t *testing.T) { + numaInfo := commonNUMAInfoEightNodes() + opts := PolicyOptions{ + PreferClosestNUMA: true, + } + policy := &bestEffortPolicy{numaInfo: numaInfo, opts: opts} + + tcases := commonPolicyMergeTestCases(numaInfo.Nodes) + tcases = append(tcases, policy.mergeTestCases(numaInfo.Nodes)...) + tcases = append(tcases, policy.mergeTestCasesClosestNUMA(numaInfo.Nodes)...) testPolicyMerge(policy, tcases, t) } diff --git a/pkg/kubelet/cm/topologymanager/policy_options.go b/pkg/kubelet/cm/topologymanager/policy_options.go new file mode 100644 index 00000000000..39fff52b789 --- /dev/null +++ b/pkg/kubelet/cm/topologymanager/policy_options.go @@ -0,0 +1,81 @@ +/* +Copyright 2022 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package topologymanager + +import ( + "fmt" + "strconv" + + "k8s.io/apimachinery/pkg/util/sets" + utilfeature "k8s.io/apiserver/pkg/util/feature" + kubefeatures "k8s.io/kubernetes/pkg/features" +) + +const ( + PreferClosestNUMANodes string = "prefer-closest-numa-nodes" +) + +var ( + alphaOptions = sets.NewString( + PreferClosestNUMANodes, + ) + betaOptions = sets.NewString() + stableOptions = sets.NewString() +) + +func CheckPolicyOptionAvailable(option string) error { + if !alphaOptions.Has(option) && !betaOptions.Has(option) && !stableOptions.Has(option) { + return fmt.Errorf("unknown Topology Manager Policy option: %q", option) + } + + if alphaOptions.Has(option) && !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.TopologyManagerPolicyAlphaOptions) { + return fmt.Errorf("Topology Manager Policy Alpha-level Options not enabled, but option %q provided", option) + } + + if betaOptions.Has(option) && !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.TopologyManagerPolicyBetaOptions) { + return fmt.Errorf("Topology Manager Policy Beta-level Options not enabled, but option %q provided", option) + } + + return nil +} + +type PolicyOptions struct { + PreferClosestNUMA bool +} + +func NewPolicyOptions(policyOptions map[string]string) (PolicyOptions, error) { + opts := PolicyOptions{} + for name, value := range policyOptions { + if err := CheckPolicyOptionAvailable(name); err != nil { + return opts, err + } + + switch name { + case PreferClosestNUMANodes: + optValue, err := strconv.ParseBool(value) + if err != nil { + return opts, fmt.Errorf("bad value for option %q: %w", name, err) + } + opts.PreferClosestNUMA = optValue + default: + // this should never be reached, we already detect unknown options, + // but we keep it as further safety. + return opts, fmt.Errorf("unsupported topologymanager option: %q (%s)", name, value) + } + } + return opts, nil +} diff --git a/pkg/kubelet/cm/topologymanager/policy_options_test.go b/pkg/kubelet/cm/topologymanager/policy_options_test.go new file mode 100644 index 00000000000..cf4a01caf9d --- /dev/null +++ b/pkg/kubelet/cm/topologymanager/policy_options_test.go @@ -0,0 +1,166 @@ +/* +Copyright 2022 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package topologymanager + +import ( + "fmt" + "strings" + "testing" + + "k8s.io/apimachinery/pkg/util/sets" + utilfeature "k8s.io/apiserver/pkg/util/feature" + "k8s.io/component-base/featuregate" + featuregatetesting "k8s.io/component-base/featuregate/testing" + pkgfeatures "k8s.io/kubernetes/pkg/features" +) + +var fancyBetaOption = "fancy-new-option" + +type optionAvailTest struct { + option string + featureGate featuregate.Feature + featureGateEnable bool + expectedAvailable bool +} + +func TestNewTopologyManagerOptions(t *testing.T) { + testCases := []struct { + description string + policyOptions map[string]string + featureGate featuregate.Feature + expectedErr error + expectedOptions PolicyOptions + }{ + { + description: "return TopologyManagerOptions with PreferClosestNUMA set to true", + featureGate: pkgfeatures.TopologyManagerPolicyAlphaOptions, + expectedOptions: PolicyOptions{ + PreferClosestNUMA: true, + }, + policyOptions: map[string]string{ + PreferClosestNUMANodes: "true", + }, + }, + { + description: "return empty TopologyManagerOptions", + }, + { + description: "fail to parse options", + featureGate: pkgfeatures.TopologyManagerPolicyAlphaOptions, + policyOptions: map[string]string{ + PreferClosestNUMANodes: "not a boolean", + }, + expectedErr: fmt.Errorf("bad value for option"), + }, + { + description: "test beta options success", + featureGate: pkgfeatures.TopologyManagerPolicyBetaOptions, + policyOptions: map[string]string{ + fancyBetaOption: "true", + }, + }, + { + description: "test beta options success", + policyOptions: map[string]string{ + fancyBetaOption: "true", + }, + expectedErr: fmt.Errorf("Topology Manager Policy Beta-level Options not enabled,"), + }, + } + + betaOptions = sets.NewString(fancyBetaOption) + + for _, tcase := range testCases { + t.Run(tcase.description, func(t *testing.T) { + if tcase.featureGate != "" { + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, tcase.featureGate, true)() + } + opts, err := NewPolicyOptions(tcase.policyOptions) + if tcase.expectedErr != nil { + if !strings.Contains(err.Error(), tcase.expectedErr.Error()) { + t.Errorf("Unexpected error message. Have: %s wants %s", err.Error(), tcase.expectedErr.Error()) + } + } + + if opts != tcase.expectedOptions { + t.Errorf("Expected TopologyManagerOptions to equal %v, not %v", tcase.expectedOptions, opts) + + } + }) + } +} + +func TestPolicyDefaultsAvailable(t *testing.T) { + testCases := []optionAvailTest{ + { + option: "this-option-does-not-exist", + expectedAvailable: false, + }, + { + option: PreferClosestNUMANodes, + expectedAvailable: false, + }, + } + for _, testCase := range testCases { + t.Run(testCase.option, func(t *testing.T) { + err := CheckPolicyOptionAvailable(testCase.option) + isEnabled := (err == nil) + if isEnabled != testCase.expectedAvailable { + t.Errorf("option %q available got=%v expected=%v", testCase.option, isEnabled, testCase.expectedAvailable) + } + }) + } +} + +func TestPolicyOptionsAvailable(t *testing.T) { + testCases := []optionAvailTest{ + { + option: "this-option-does-not-exist", + featureGate: pkgfeatures.TopologyManagerPolicyBetaOptions, + featureGateEnable: false, + expectedAvailable: false, + }, + { + option: "this-option-does-not-exist", + featureGate: pkgfeatures.TopologyManagerPolicyBetaOptions, + featureGateEnable: true, + expectedAvailable: false, + }, + { + option: PreferClosestNUMANodes, + featureGate: pkgfeatures.TopologyManagerPolicyAlphaOptions, + featureGateEnable: true, + expectedAvailable: true, + }, + { + option: PreferClosestNUMANodes, + featureGate: pkgfeatures.TopologyManagerPolicyBetaOptions, + featureGateEnable: true, + expectedAvailable: false, + }, + } + for _, testCase := range testCases { + t.Run(testCase.option, func(t *testing.T) { + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, testCase.featureGate, testCase.featureGateEnable)() + err := CheckPolicyOptionAvailable(testCase.option) + isEnabled := (err == nil) + if isEnabled != testCase.expectedAvailable { + t.Errorf("option %q available got=%v expected=%v", testCase.option, isEnabled, testCase.expectedAvailable) + } + }) + } +} diff --git a/pkg/kubelet/cm/topologymanager/policy_restricted.go b/pkg/kubelet/cm/topologymanager/policy_restricted.go index 5ee2f245d63..8d515e3d4c9 100644 --- a/pkg/kubelet/cm/topologymanager/policy_restricted.go +++ b/pkg/kubelet/cm/topologymanager/policy_restricted.go @@ -26,8 +26,13 @@ var _ Policy = &restrictedPolicy{} const PolicyRestricted string = "restricted" // NewRestrictedPolicy returns restricted policy. -func NewRestrictedPolicy(numaNodes []int) Policy { - return &restrictedPolicy{bestEffortPolicy{numaNodes: numaNodes}} +func NewRestrictedPolicy(numaInfo *NUMAInfo, topologyPolicyOptions map[string]string) (Policy, error) { + opts, err := NewPolicyOptions(topologyPolicyOptions) + if err != nil { + return nil, err + } + + return &restrictedPolicy{bestEffortPolicy{numaInfo: numaInfo, opts: opts}}, nil } func (p *restrictedPolicy) Name() string { @@ -40,7 +45,8 @@ func (p *restrictedPolicy) canAdmitPodResult(hint *TopologyHint) bool { func (p *restrictedPolicy) Merge(providersHints []map[string][]TopologyHint) (TopologyHint, bool) { filteredHints := filterProvidersHints(providersHints) - hint := mergeFilteredHints(p.numaNodes, filteredHints) - admit := p.canAdmitPodResult(&hint) - return hint, admit + merger := NewHintMerger(p.numaInfo, filteredHints, p.Name(), p.opts) + bestHint := merger.Merge() + admit := p.canAdmitPodResult(&bestHint) + return bestHint, admit } diff --git a/pkg/kubelet/cm/topologymanager/policy_restricted_test.go b/pkg/kubelet/cm/topologymanager/policy_restricted_test.go index d2623199208..b17fd147696 100644 --- a/pkg/kubelet/cm/topologymanager/policy_restricted_test.go +++ b/pkg/kubelet/cm/topologymanager/policy_restricted_test.go @@ -30,8 +30,9 @@ func TestPolicyRestrictedName(t *testing.T) { expected: "restricted", }, } + numaInfo := commonNUMAInfoTwoNodes() for _, tc := range tcases { - policy := NewRestrictedPolicy([]int{0, 1}) + policy := &restrictedPolicy{bestEffortPolicy{numaInfo: numaInfo, opts: PolicyOptions{}}} if policy.Name() != tc.expected { t.Errorf("Expected Policy Name to be %s, got %s", tc.expected, policy.Name()) } @@ -57,9 +58,9 @@ func TestPolicyRestrictedCanAdmitPodResult(t *testing.T) { } for _, tc := range tcases { - numaNodes := []int{0, 1} - policy := NewRestrictedPolicy(numaNodes) - result := policy.(*restrictedPolicy).canAdmitPodResult(&tc.hint) + numaInfo := commonNUMAInfoTwoNodes() + policy := &restrictedPolicy{bestEffortPolicy{numaInfo: numaInfo}} + result := policy.canAdmitPodResult(&tc.hint) if result != tc.expected { t.Errorf("Expected result to be %t, got %t", tc.expected, result) @@ -68,11 +69,23 @@ func TestPolicyRestrictedCanAdmitPodResult(t *testing.T) { } func TestPolicyRestrictedMerge(t *testing.T) { - numaNodes := []int{0, 1, 2, 3} - policy := NewRestrictedPolicy(numaNodes) + numaInfo := commonNUMAInfoFourNodes() + policy := &restrictedPolicy{bestEffortPolicy{numaInfo: numaInfo}} - tcases := commonPolicyMergeTestCases(numaNodes) - tcases = append(tcases, policy.(*restrictedPolicy).mergeTestCases(numaNodes)...) + tcases := commonPolicyMergeTestCases(numaInfo.Nodes) + tcases = append(tcases, policy.mergeTestCases(numaInfo.Nodes)...) + tcases = append(tcases, policy.mergeTestCasesNoPolicies(numaInfo.Nodes)...) + + testPolicyMerge(policy, tcases, t) +} + +func TestPolicyRestrictedMergeClosestNUMA(t *testing.T) { + numaInfo := commonNUMAInfoEightNodes() + policy := &restrictedPolicy{bestEffortPolicy{numaInfo: numaInfo, opts: PolicyOptions{PreferClosestNUMA: true}}} + + tcases := commonPolicyMergeTestCases(numaInfo.Nodes) + tcases = append(tcases, policy.mergeTestCases(numaInfo.Nodes)...) + tcases = append(tcases, policy.mergeTestCasesClosestNUMA(numaInfo.Nodes)...) testPolicyMerge(policy, tcases, t) } diff --git a/pkg/kubelet/cm/topologymanager/policy_single_numa_node.go b/pkg/kubelet/cm/topologymanager/policy_single_numa_node.go index 7745951c085..54395b5ec1e 100644 --- a/pkg/kubelet/cm/topologymanager/policy_single_numa_node.go +++ b/pkg/kubelet/cm/topologymanager/policy_single_numa_node.go @@ -16,13 +16,10 @@ limitations under the License. package topologymanager -import ( - "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask" -) - type singleNumaNodePolicy struct { - //List of NUMA Nodes available on the underlying machine - numaNodes []int + // numaInfo represents list of NUMA Nodes available on the underlying machine and distances between them + numaInfo *NUMAInfo + opts PolicyOptions } var _ Policy = &singleNumaNodePolicy{} @@ -31,8 +28,13 @@ var _ Policy = &singleNumaNodePolicy{} const PolicySingleNumaNode string = "single-numa-node" // NewSingleNumaNodePolicy returns single-numa-node policy. -func NewSingleNumaNodePolicy(numaNodes []int) Policy { - return &singleNumaNodePolicy{numaNodes: numaNodes} +func NewSingleNumaNodePolicy(numaInfo *NUMAInfo, topologyPolicyOptions map[string]string) (Policy, error) { + opts, err := NewPolicyOptions(topologyPolicyOptions) + if err != nil { + return nil, err + } + + return &singleNumaNodePolicy{numaInfo: numaInfo, opts: opts}, nil } func (p *singleNumaNodePolicy) Name() string { @@ -65,10 +67,11 @@ func (p *singleNumaNodePolicy) Merge(providersHints []map[string][]TopologyHint) filteredHints := filterProvidersHints(providersHints) // Filter to only include don't cares and hints with a single NUMA node. singleNumaHints := filterSingleNumaHints(filteredHints) - bestHint := mergeFilteredHints(p.numaNodes, singleNumaHints) - defaultAffinity, _ := bitmask.NewBitMask(p.numaNodes...) - if bestHint.NUMANodeAffinity.IsEqual(defaultAffinity) { + merger := NewHintMerger(p.numaInfo, singleNumaHints, p.Name(), p.opts) + bestHint := merger.Merge() + + if bestHint.NUMANodeAffinity.IsEqual(p.numaInfo.DefaultAffinityMask()) { bestHint = TopologyHint{nil, bestHint.Preferred} } diff --git a/pkg/kubelet/cm/topologymanager/policy_single_numa_node_test.go b/pkg/kubelet/cm/topologymanager/policy_single_numa_node_test.go index f2303e72a80..39b1fe33c68 100644 --- a/pkg/kubelet/cm/topologymanager/policy_single_numa_node_test.go +++ b/pkg/kubelet/cm/topologymanager/policy_single_numa_node_test.go @@ -33,11 +33,11 @@ func TestPolicySingleNumaNodeCanAdmitPodResult(t *testing.T) { expected: false, }, } + numaInfo := commonNUMAInfoTwoNodes() for _, tc := range tcases { - numaNodes := []int{0, 1} - policy := NewSingleNumaNodePolicy(numaNodes) - result := policy.(*singleNumaNodePolicy).canAdmitPodResult(&tc.hint) + policy := singleNumaNodePolicy{numaInfo: numaInfo, opts: PolicyOptions{}} + result := policy.canAdmitPodResult(&tc.hint) if result != tc.expected { t.Errorf("Expected result to be %t, got %t", tc.expected, result) @@ -156,11 +156,11 @@ func TestPolicySingleNumaNodeFilterHints(t *testing.T) { } func TestPolicySingleNumaNodeMerge(t *testing.T) { - numaNodes := []int{0, 1, 2, 3} - policy := NewSingleNumaNodePolicy(numaNodes) + numaInfo := commonNUMAInfoFourNodes() + policy := singleNumaNodePolicy{numaInfo: numaInfo, opts: PolicyOptions{}} - tcases := commonPolicyMergeTestCases(numaNodes) - tcases = append(tcases, policy.(*singleNumaNodePolicy).mergeTestCases(numaNodes)...) + tcases := commonPolicyMergeTestCases(numaInfo.Nodes) + tcases = append(tcases, policy.mergeTestCases(numaInfo.Nodes)...) - testPolicyMerge(policy, tcases, t) + testPolicyMerge(&policy, tcases, t) } diff --git a/pkg/kubelet/cm/topologymanager/policy_test.go b/pkg/kubelet/cm/topologymanager/policy_test.go index df9f7e8d5f6..e0924507f80 100644 --- a/pkg/kubelet/cm/topologymanager/policy_test.go +++ b/pkg/kubelet/cm/topologymanager/policy_test.go @@ -748,6 +748,11 @@ func (p *bestEffortPolicy) mergeTestCases(numaNodes []int) []policyMergeTestCase Preferred: false, }, }, + } +} + +func (p *bestEffortPolicy) mergeTestCasesNoPolicies(numaNodes []int) []policyMergeTestCase { + return []policyMergeTestCase{ { name: "bestNonPreferredAffinityCount (5)", hp: []HintProvider{ @@ -833,6 +838,167 @@ func (p *bestEffortPolicy) mergeTestCases(numaNodes []int) []policyMergeTestCase } } +func (p *bestEffortPolicy) mergeTestCasesClosestNUMA(numaNodes []int) []policyMergeTestCase { + return []policyMergeTestCase{ + { + name: "Two providers, 2 hints each, same mask (some with different bits), same preferred", + hp: []HintProvider{ + &mockHintProvider{ + map[string][]TopologyHint{ + "resource1": { + { + NUMANodeAffinity: NewTestBitMask(0, 4), + Preferred: true, + }, + { + NUMANodeAffinity: NewTestBitMask(0, 2), + Preferred: true, + }, + }, + }, + }, + &mockHintProvider{ + map[string][]TopologyHint{ + "resource2": { + { + NUMANodeAffinity: NewTestBitMask(0, 4), + Preferred: true, + }, + { + NUMANodeAffinity: NewTestBitMask(0, 2), + Preferred: true, + }, + }, + }, + }, + }, + expected: TopologyHint{ + NUMANodeAffinity: NewTestBitMask(0, 2), + Preferred: true, + }, + }, + { + name: "Two providers, 2 hints each, different mask", + hp: []HintProvider{ + &mockHintProvider{ + map[string][]TopologyHint{ + "resource1": { + { + NUMANodeAffinity: NewTestBitMask(4), + Preferred: true, + }, + { + NUMANodeAffinity: NewTestBitMask(0, 2), + Preferred: true, + }, + }, + }, + }, + &mockHintProvider{ + map[string][]TopologyHint{ + "resource2": { + { + NUMANodeAffinity: NewTestBitMask(4), + Preferred: true, + }, + { + NUMANodeAffinity: NewTestBitMask(0, 2), + Preferred: true, + }, + }, + }, + }, + }, + expected: TopologyHint{ + NUMANodeAffinity: NewTestBitMask(4), + Preferred: true, + }, + }, + { + name: "bestNonPreferredAffinityCount (5)", + hp: []HintProvider{ + &mockHintProvider{ + map[string][]TopologyHint{ + "resource1": { + { + NUMANodeAffinity: NewTestBitMask(0, 1, 2, 3), + Preferred: false, + }, + { + NUMANodeAffinity: NewTestBitMask(0, 1), + Preferred: false, + }, + }, + }, + }, + &mockHintProvider{ + map[string][]TopologyHint{ + "resource2": { + { + NUMANodeAffinity: NewTestBitMask(1, 2), + Preferred: false, + }, + { + NUMANodeAffinity: NewTestBitMask(2, 3), + Preferred: false, + }, + }, + }, + }, + }, + expected: TopologyHint{ + NUMANodeAffinity: NewTestBitMask(2, 3), + Preferred: false, + }, + }, + { + name: "bestNonPreferredAffinityCount (6)", + hp: []HintProvider{ + &mockHintProvider{ + map[string][]TopologyHint{ + "resource1": { + { + NUMANodeAffinity: NewTestBitMask(0, 1, 2, 3), + Preferred: false, + }, + { + NUMANodeAffinity: NewTestBitMask(0, 1), + Preferred: false, + }, + }, + }, + }, + &mockHintProvider{ + map[string][]TopologyHint{ + "resource2": { + { + NUMANodeAffinity: NewTestBitMask(1, 2, 3), + Preferred: false, + }, + { + NUMANodeAffinity: NewTestBitMask(1, 2), + Preferred: false, + }, + { + NUMANodeAffinity: NewTestBitMask(1, 3), + Preferred: false, + }, + { + NUMANodeAffinity: NewTestBitMask(2, 3), + Preferred: false, + }, + }, + }, + }, + }, + expected: TopologyHint{ + NUMANodeAffinity: NewTestBitMask(2, 3), + Preferred: false, + }, + }, + } +} + func (p *singleNumaNodePolicy) mergeTestCases(numaNodes []int) []policyMergeTestCase { return []policyMergeTestCase{ { @@ -1200,7 +1366,7 @@ func TestMaxOfMinAffinityCounts(t *testing.T) { } } -func TestCompareHints(t *testing.T) { +func TestCompareHintsNarrowest(t *testing.T) { tcases := []struct { description string bestNonPreferredAffinityCount int @@ -1387,7 +1553,11 @@ func TestCompareHints(t *testing.T) { for _, tc := range tcases { t.Run(tc.description, func(t *testing.T) { - result := compareHints(tc.bestNonPreferredAffinityCount, tc.current, tc.candidate) + numaInfo := &NUMAInfo{} + merger := NewHintMerger(numaInfo, [][]TopologyHint{}, PolicyBestEffort, PolicyOptions{}) + merger.BestNonPreferredAffinityCount = tc.bestNonPreferredAffinityCount + + result := merger.compare(tc.current, tc.candidate) if result != tc.current && result != tc.candidate { t.Errorf("Expected result to be either 'current' or 'candidate' hint") } @@ -1400,3 +1570,41 @@ func TestCompareHints(t *testing.T) { }) } } + +func commonNUMAInfoTwoNodes() *NUMAInfo { + return &NUMAInfo{ + Nodes: []int{0, 1}, + NUMADistances: NUMADistances{ + {10, 11}, + {11, 10}, + }, + } +} + +func commonNUMAInfoFourNodes() *NUMAInfo { + return &NUMAInfo{ + Nodes: []int{0, 1, 2, 3}, + NUMADistances: NUMADistances{ + {10, 11, 12, 12}, + {11, 10, 12, 12}, + {12, 12, 10, 11}, + {12, 12, 11, 10}, + }, + } +} + +func commonNUMAInfoEightNodes() *NUMAInfo { + return &NUMAInfo{ + Nodes: []int{0, 1, 2, 3, 4, 5, 6, 7}, + NUMADistances: NUMADistances{ + {10, 11, 12, 12, 30, 30, 30, 30}, + {11, 10, 12, 12, 30, 30, 30, 30}, + {12, 12, 10, 11, 30, 30, 30, 30}, + {12, 12, 11, 10, 30, 30, 30, 30}, + {30, 30, 30, 30, 10, 11, 12, 12}, + {30, 30, 30, 30, 11, 10, 12, 12}, + {30, 30, 30, 30, 12, 12, 10, 11}, + {30, 30, 30, 30, 12, 12, 13, 10}, + }, + } +} diff --git a/pkg/kubelet/cm/topologymanager/topology_manager.go b/pkg/kubelet/cm/topologymanager/topology_manager.go index ea5ac91560d..1484af3bd42 100644 --- a/pkg/kubelet/cm/topologymanager/topology_manager.go +++ b/pkg/kubelet/cm/topologymanager/topology_manager.go @@ -130,15 +130,15 @@ func (th *TopologyHint) LessThan(other TopologyHint) bool { var _ Manager = &manager{} // NewManager creates a new TopologyManager based on provided policy and scope -func NewManager(topology []cadvisorapi.Node, topologyPolicyName string, topologyScopeName string) (Manager, error) { +func NewManager(topology []cadvisorapi.Node, topologyPolicyName string, topologyScopeName string, topologyPolicyOptions map[string]string) (Manager, error) { klog.InfoS("Creating topology manager with policy per scope", "topologyPolicyName", topologyPolicyName, "topologyScopeName", topologyScopeName) - var numaNodes []int - for _, node := range topology { - numaNodes = append(numaNodes, node.Id) + numaInfo, err := NewNUMAInfo(topology) + if err != nil { + return nil, fmt.Errorf("cannot discover NUMA topology: %w", err) } - if topologyPolicyName != PolicyNone && len(numaNodes) > maxAllowableNUMANodes { + if topologyPolicyName != PolicyNone && len(numaInfo.Nodes) > maxAllowableNUMANodes { return nil, fmt.Errorf("unsupported on machines with more than %v NUMA Nodes", maxAllowableNUMANodes) } @@ -149,13 +149,22 @@ func NewManager(topology []cadvisorapi.Node, topologyPolicyName string, topology policy = NewNonePolicy() case PolicyBestEffort: - policy = NewBestEffortPolicy(numaNodes) + policy, err = NewBestEffortPolicy(numaInfo, topologyPolicyOptions) + if err != nil { + return nil, err + } case PolicyRestricted: - policy = NewRestrictedPolicy(numaNodes) + policy, err = NewRestrictedPolicy(numaInfo, topologyPolicyOptions) + if err != nil { + return nil, err + } case PolicySingleNumaNode: - policy = NewSingleNumaNodePolicy(numaNodes) + policy, err = NewSingleNumaNodePolicy(numaInfo, topologyPolicyOptions) + if err != nil { + return nil, err + } default: return nil, fmt.Errorf("unknown policy: \"%s\"", topologyPolicyName) diff --git a/pkg/kubelet/cm/topologymanager/topology_manager_test.go b/pkg/kubelet/cm/topologymanager/topology_manager_test.go index c0cc0980c65..3573b045a58 100644 --- a/pkg/kubelet/cm/topologymanager/topology_manager_test.go +++ b/pkg/kubelet/cm/topologymanager/topology_manager_test.go @@ -22,6 +22,9 @@ import ( "testing" "k8s.io/api/core/v1" + + cadvisorapi "github.com/google/cadvisor/info/v1" + "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask" "k8s.io/kubernetes/pkg/kubelet/lifecycle" ) @@ -37,6 +40,8 @@ func TestNewManager(t *testing.T) { policyName string expectedPolicy string expectedError error + topologyError error + policyOptions map[string]string }{ { description: "Policy is set to none", @@ -63,11 +68,30 @@ func TestNewManager(t *testing.T) { policyName: "unknown", expectedError: fmt.Errorf("unknown policy: \"unknown\""), }, + { + description: "Unknown policy name best-effort policy", + policyName: "best-effort", + expectedPolicy: "best-effort", + expectedError: fmt.Errorf("unknown Topology Manager Policy option:"), + policyOptions: map[string]string{ + "unknown-option": "true", + }, + }, + { + description: "Unknown policy name restricted policy", + policyName: "restricted", + expectedPolicy: "restricted", + expectedError: fmt.Errorf("unknown Topology Manager Policy option:"), + policyOptions: map[string]string{ + "unknown-option": "true", + }, + }, } for _, tc := range tcases { - mngr, err := NewManager(nil, tc.policyName, "container") + topology := []cadvisorapi.Node{} + mngr, err := NewManager(topology, tc.policyName, "container", tc.policyOptions) if tc.expectedError != nil { if !strings.Contains(err.Error(), tc.expectedError.Error()) { t.Errorf("Unexpected error message. Have: %s wants %s", err.Error(), tc.expectedError.Error()) @@ -107,7 +131,7 @@ func TestManagerScope(t *testing.T) { } for _, tc := range tcases { - mngr, err := NewManager(nil, "best-effort", tc.scopeName) + mngr, err := NewManager(nil, "best-effort", tc.scopeName, nil) if tc.expectedError != nil { if !strings.Contains(err.Error(), tc.expectedError.Error()) { @@ -179,7 +203,18 @@ func TestAddHintProvider(t *testing.T) { } func TestAdmit(t *testing.T) { - numaNodes := []int{0, 1} + numaInfo := &NUMAInfo{ + Nodes: []int{0, 1}, + NUMADistances: NUMADistances{ + {10, 11}, + {11, 10}, + }, + } + + opts := map[string]string{} + bePolicy, _ := NewBestEffortPolicy(numaInfo, opts) + restrictedPolicy, _ := NewRestrictedPolicy(numaInfo, opts) + singleNumaPolicy, _ := NewSingleNumaNodePolicy(numaInfo, opts) tcases := []struct { name string @@ -206,7 +241,7 @@ func TestAdmit(t *testing.T) { { name: "QOSClass set as BestEffort. single-numa-node Policy. No Hints.", qosClass: v1.PodQOSBestEffort, - policy: NewRestrictedPolicy(numaNodes), + policy: singleNumaPolicy, hp: []HintProvider{ &mockHintProvider{}, }, @@ -215,7 +250,7 @@ func TestAdmit(t *testing.T) { { name: "QOSClass set as BestEffort. Restricted Policy. No Hints.", qosClass: v1.PodQOSBestEffort, - policy: NewRestrictedPolicy(numaNodes), + policy: restrictedPolicy, hp: []HintProvider{ &mockHintProvider{}, }, @@ -224,7 +259,7 @@ func TestAdmit(t *testing.T) { { name: "QOSClass set as Guaranteed. BestEffort Policy. Preferred Affinity.", qosClass: v1.PodQOSGuaranteed, - policy: NewBestEffortPolicy(numaNodes), + policy: bePolicy, hp: []HintProvider{ &mockHintProvider{ map[string][]TopologyHint{ @@ -246,7 +281,7 @@ func TestAdmit(t *testing.T) { { name: "QOSClass set as Guaranteed. BestEffort Policy. More than one Preferred Affinity.", qosClass: v1.PodQOSGuaranteed, - policy: NewBestEffortPolicy(numaNodes), + policy: bePolicy, hp: []HintProvider{ &mockHintProvider{ map[string][]TopologyHint{ @@ -272,7 +307,7 @@ func TestAdmit(t *testing.T) { { name: "QOSClass set as Burstable. BestEffort Policy. More than one Preferred Affinity.", qosClass: v1.PodQOSBurstable, - policy: NewBestEffortPolicy(numaNodes), + policy: bePolicy, hp: []HintProvider{ &mockHintProvider{ map[string][]TopologyHint{ @@ -298,7 +333,7 @@ func TestAdmit(t *testing.T) { { name: "QOSClass set as Guaranteed. BestEffort Policy. No Preferred Affinity.", qosClass: v1.PodQOSGuaranteed, - policy: NewBestEffortPolicy(numaNodes), + policy: bePolicy, hp: []HintProvider{ &mockHintProvider{ map[string][]TopologyHint{ @@ -316,7 +351,7 @@ func TestAdmit(t *testing.T) { { name: "QOSClass set as Guaranteed. Restricted Policy. Preferred Affinity.", qosClass: v1.PodQOSGuaranteed, - policy: NewRestrictedPolicy(numaNodes), + policy: restrictedPolicy, hp: []HintProvider{ &mockHintProvider{ map[string][]TopologyHint{ @@ -338,7 +373,7 @@ func TestAdmit(t *testing.T) { { name: "QOSClass set as Burstable. Restricted Policy. Preferred Affinity.", qosClass: v1.PodQOSBurstable, - policy: NewRestrictedPolicy(numaNodes), + policy: restrictedPolicy, hp: []HintProvider{ &mockHintProvider{ map[string][]TopologyHint{ @@ -360,7 +395,7 @@ func TestAdmit(t *testing.T) { { name: "QOSClass set as Guaranteed. Restricted Policy. More than one Preferred affinity.", qosClass: v1.PodQOSGuaranteed, - policy: NewRestrictedPolicy(numaNodes), + policy: restrictedPolicy, hp: []HintProvider{ &mockHintProvider{ map[string][]TopologyHint{ @@ -386,7 +421,7 @@ func TestAdmit(t *testing.T) { { name: "QOSClass set as Burstable. Restricted Policy. More than one Preferred affinity.", qosClass: v1.PodQOSBurstable, - policy: NewRestrictedPolicy(numaNodes), + policy: restrictedPolicy, hp: []HintProvider{ &mockHintProvider{ map[string][]TopologyHint{ @@ -412,7 +447,7 @@ func TestAdmit(t *testing.T) { { name: "QOSClass set as Guaranteed. Restricted Policy. No Preferred affinity.", qosClass: v1.PodQOSGuaranteed, - policy: NewRestrictedPolicy(numaNodes), + policy: restrictedPolicy, hp: []HintProvider{ &mockHintProvider{ map[string][]TopologyHint{ @@ -430,7 +465,7 @@ func TestAdmit(t *testing.T) { { name: "QOSClass set as Burstable. Restricted Policy. No Preferred affinity.", qosClass: v1.PodQOSBurstable, - policy: NewRestrictedPolicy(numaNodes), + policy: restrictedPolicy, hp: []HintProvider{ &mockHintProvider{ map[string][]TopologyHint{ diff --git a/staging/src/k8s.io/kubelet/config/v1beta1/types.go b/staging/src/k8s.io/kubelet/config/v1beta1/types.go index 2c130f947d0..3fab1abbe9c 100644 --- a/staging/src/k8s.io/kubelet/config/v1beta1/types.go +++ b/staging/src/k8s.io/kubelet/config/v1beta1/types.go @@ -382,6 +382,12 @@ type KubeletConfiguration struct { // Default: "container" // +optional TopologyManagerScope string `json:"topologyManagerScope,omitempty"` + // TopologyManagerPolicyOptions is a set of key=value which allows to set extra options + // to fine tune the behaviour of the topology manager policies. + // Requires both the "TopologyManager" and "TopologyManagerPolicyOptions" feature gates to be enabled. + // Default: nil + // +optional + TopologyManagerPolicyOptions map[string]string `json:"topologyManagerPolicyOptions,omitempty"` // qosReserved is a set of resource name to percentage pairs that specify // the minimum percentage of a resource reserved for exclusive use by the // guaranteed QoS tier. diff --git a/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go b/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go index 63ae4e749c1..d3a3ceca887 100644 --- a/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go +++ b/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go @@ -261,6 +261,13 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) { } } out.CPUManagerReconcilePeriod = in.CPUManagerReconcilePeriod + if in.TopologyManagerPolicyOptions != nil { + in, out := &in.TopologyManagerPolicyOptions, &out.TopologyManagerPolicyOptions + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } if in.QOSReserved != nil { in, out := &in.QOSReserved, &out.QOSReserved *out = make(map[string]string, len(*in))