diff --git a/cmd/kubelet/app/options/options.go b/cmd/kubelet/app/options/options.go index f216bc8484d..e0401faae51 100644 --- a/cmd/kubelet/app/options/options.go +++ b/cmd/kubelet/app/options/options.go @@ -537,6 +537,7 @@ func AddKubeletConfigFlags(mainfs *pflag.FlagSet, c *kubeletconfig.KubeletConfig fs.Int32Var(&c.PodsPerCore, "pods-per-core", c.PodsPerCore, "Number of Pods per core that can run on this Kubelet. The total number of Pods on this Kubelet cannot exceed max-pods, so max-pods will be used if this calculation results in a larger number of Pods allowed on the Kubelet. A value of 0 disables this limit.") fs.BoolVar(&c.ProtectKernelDefaults, "protect-kernel-defaults", c.ProtectKernelDefaults, "Default kubelet behaviour for kernel tuning. If set, kubelet errors if any of kernel tunables is different than kubelet defaults.") fs.StringVar(&c.ReservedSystemCPUs, "reserved-cpus", c.ReservedSystemCPUs, "A comma-separated list of CPUs or CPU ranges that are reserved for system and kubernetes usage. This specific list will supersede cpu counts in --system-reserved and --kube-reserved.") + fs.StringVar(&c.TopologyManagerScope, "topology-manager-scope", c.TopologyManagerScope, "Scope to which topology hints applied. Topology Manager collects hints from Hint Providers and applies them to defined scope to ensure the pod admission. Possible values: 'container' (default), 'pod'.") // Node Allocatable Flags fs.Var(cliflag.NewMapStringString(&c.SystemReserved), "system-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi,ephemeral-storage=1Gi) pairs that describe resources reserved for non-kubernetes components. Currently only cpu and memory are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]") fs.Var(cliflag.NewMapStringString(&c.KubeReserved), "kube-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi,ephemeral-storage=1Gi) pairs that describe resources reserved for kubernetes system components. Currently cpu, memory and local ephemeral storage for root file system are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]") diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go index a3b6fbf04ac..d91ddfdf38a 100644 --- a/cmd/kubelet/app/server.go +++ b/cmd/kubelet/app/server.go @@ -738,6 +738,7 @@ func run(ctx context.Context, s *options.KubeletServer, kubeDeps *kubelet.Depend EnforceCPULimits: s.CPUCFSQuota, CPUCFSQuotaPeriod: s.CPUCFSQuotaPeriod.Duration, ExperimentalTopologyManagerPolicy: s.TopologyManagerPolicy, + ExperimentalTopologyManagerScope: s.TopologyManagerScope, }, s.FailSwapOn, devicePluginEnabled, diff --git a/pkg/kubelet/apis/config/fuzzer/fuzzer.go b/pkg/kubelet/apis/config/fuzzer/fuzzer.go index a1312f1a421..104d8215526 100644 --- a/pkg/kubelet/apis/config/fuzzer/fuzzer.go +++ b/pkg/kubelet/apis/config/fuzzer/fuzzer.go @@ -70,6 +70,7 @@ func Funcs(codecs runtimeserializer.CodecFactory) []interface{} { obj.CPUManagerReconcilePeriod = obj.NodeStatusUpdateFrequency obj.NodeStatusMaxImages = 50 obj.TopologyManagerPolicy = kubeletconfig.NoneTopologyManagerPolicy + obj.TopologyManagerScope = kubeletconfig.ContainerTopologyManagerScope obj.QOSReserved = map[string]string{ "memory": "50%", } diff --git a/pkg/kubelet/apis/config/helpers_test.go b/pkg/kubelet/apis/config/helpers_test.go index c75d4d289d3..0de07106221 100644 --- a/pkg/kubelet/apis/config/helpers_test.go +++ b/pkg/kubelet/apis/config/helpers_test.go @@ -152,6 +152,7 @@ var ( "CPUManagerPolicy", "CPUManagerReconcilePeriod.Duration", "TopologyManagerPolicy", + "TopologyManagerScope", "QOSReserved[*]", "CgroupDriver", "CgroupRoot", diff --git a/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/after/v1beta1.yaml b/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/after/v1beta1.yaml index 9321e8b00ee..a82e874a1e3 100644 --- a/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/after/v1beta1.yaml +++ b/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/after/v1beta1.yaml @@ -70,5 +70,6 @@ serializeImagePulls: true streamingConnectionIdleTimeout: 4h0m0s syncFrequency: 1m0s topologyManagerPolicy: none +topologyManagerScope: container volumePluginDir: /usr/libexec/kubernetes/kubelet-plugins/volume/exec/ volumeStatsAggPeriod: 1m0s diff --git a/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/roundtrip/default/v1beta1.yaml b/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/roundtrip/default/v1beta1.yaml index 9321e8b00ee..a82e874a1e3 100644 --- a/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/roundtrip/default/v1beta1.yaml +++ b/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/roundtrip/default/v1beta1.yaml @@ -70,5 +70,6 @@ serializeImagePulls: true streamingConnectionIdleTimeout: 4h0m0s syncFrequency: 1m0s topologyManagerPolicy: none +topologyManagerScope: container volumePluginDir: /usr/libexec/kubernetes/kubelet-plugins/volume/exec/ volumeStatsAggPeriod: 1m0s diff --git a/pkg/kubelet/apis/config/types.go b/pkg/kubelet/apis/config/types.go index 96481ad9f13..c36aec6a563 100644 --- a/pkg/kubelet/apis/config/types.go +++ b/pkg/kubelet/apis/config/types.go @@ -61,12 +61,18 @@ const ( // BestEffortTopologyManagerPolicy is a mode in which kubelet will favour // pods with NUMA alignment of CPU and device resources. BestEffortTopologyManagerPolicy = "best-effort" - // NoneTopologyManager Policy is a mode in which kubelet has no knowledge + // NoneTopologyManagerPolicy is a mode in which kubelet has no knowledge // of NUMA alignment of a pod's CPU and device resources. NoneTopologyManagerPolicy = "none" - // SingleNumaNodeTopologyManager Policy iis a mode in which kubelet only allows + // SingleNumaNodeTopologyManagerPolicy is a mode in which kubelet only allows // pods with a single NUMA alignment of CPU and device resources. - SingleNumaNodeTopologyManager = "single-numa-node" + SingleNumaNodeTopologyManagerPolicy = "single-numa-node" + // ContainerTopologyManagerScope represents that + // topology policy is applied on a per-container basis. + ContainerTopologyManagerScope = "container" + // PodTopologyManagerScope represents that + // topology policy is applied on a per-pod basis. + PodTopologyManagerScope = "pod" ) // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object @@ -221,6 +227,12 @@ type KubeletConfiguration struct { // TopologyManagerPolicy is the name of the policy to use. // Policies other than "none" require the TopologyManager feature gate to be enabled. TopologyManagerPolicy string + // TopologyManagerScope represents the scope of topology hint generation + // that topology manager requests and hint providers generate. + // "pod" scope requires the TopologyManager feature gate to be enabled. + // Default: "container" + // +optional + TopologyManagerScope string // Map of QoS resource reservation percentages (memory only for now). // Requires the QOSReserved feature gate to be enabled. QOSReserved map[string]string diff --git a/pkg/kubelet/apis/config/v1beta1/defaults.go b/pkg/kubelet/apis/config/v1beta1/defaults.go index 32935604c58..3b46d4164b3 100644 --- a/pkg/kubelet/apis/config/v1beta1/defaults.go +++ b/pkg/kubelet/apis/config/v1beta1/defaults.go @@ -157,6 +157,9 @@ func SetDefaults_KubeletConfiguration(obj *kubeletconfigv1beta1.KubeletConfigura if obj.TopologyManagerPolicy == "" { obj.TopologyManagerPolicy = kubeletconfigv1beta1.NoneTopologyManagerPolicy } + if obj.TopologyManagerScope == "" { + obj.TopologyManagerScope = kubeletconfigv1beta1.ContainerTopologyManagerScope + } if obj.RuntimeRequestTimeout == zeroDuration { obj.RuntimeRequestTimeout = metav1.Duration{Duration: 2 * time.Minute} } diff --git a/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go b/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go index e0b7608745b..cf33ba227c6 100644 --- a/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go +++ b/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go @@ -275,6 +275,7 @@ func autoConvert_v1beta1_KubeletConfiguration_To_config_KubeletConfiguration(in out.CPUManagerPolicy = in.CPUManagerPolicy out.CPUManagerReconcilePeriod = in.CPUManagerReconcilePeriod out.TopologyManagerPolicy = in.TopologyManagerPolicy + out.TopologyManagerScope = in.TopologyManagerScope out.QOSReserved = *(*map[string]string)(unsafe.Pointer(&in.QOSReserved)) out.RuntimeRequestTimeout = in.RuntimeRequestTimeout out.HairpinMode = in.HairpinMode @@ -427,6 +428,7 @@ func autoConvert_config_KubeletConfiguration_To_v1beta1_KubeletConfiguration(in out.CPUManagerPolicy = in.CPUManagerPolicy out.CPUManagerReconcilePeriod = in.CPUManagerReconcilePeriod out.TopologyManagerPolicy = in.TopologyManagerPolicy + out.TopologyManagerScope = in.TopologyManagerScope out.QOSReserved = *(*map[string]string)(unsafe.Pointer(&in.QOSReserved)) out.RuntimeRequestTimeout = in.RuntimeRequestTimeout out.HairpinMode = in.HairpinMode diff --git a/pkg/kubelet/apis/config/validation/validation.go b/pkg/kubelet/apis/config/validation/validation.go index 156a4b4e2e0..50c46334e08 100644 --- a/pkg/kubelet/apis/config/validation/validation.go +++ b/pkg/kubelet/apis/config/validation/validation.go @@ -125,6 +125,9 @@ func ValidateKubeletConfiguration(kc *kubeletconfig.KubeletConfiguration) error if kc.TopologyManagerPolicy != kubeletconfig.NoneTopologyManagerPolicy && !localFeatureGate.Enabled(features.TopologyManager) { allErrors = append(allErrors, fmt.Errorf("invalid configuration: topologyManager %v requires feature gate TopologyManager", kc.TopologyManagerPolicy)) } + if kc.TopologyManagerScope != kubeletconfig.ContainerTopologyManagerScope && !localFeatureGate.Enabled(features.TopologyManager) { + allErrors = append(allErrors, fmt.Errorf("invalid configuration: TopologyManagerScope %v requires feature gate TopologyManager", kc.TopologyManagerScope)) + } for _, val := range kc.EnforceNodeAllocatable { switch val { case kubetypes.NodeAllocatableEnforcementKey: diff --git a/pkg/kubelet/cm/container_manager.go b/pkg/kubelet/cm/container_manager.go index bf8122610a2..ea81d6163a7 100644 --- a/pkg/kubelet/cm/container_manager.go +++ b/pkg/kubelet/cm/container_manager.go @@ -133,6 +133,7 @@ type NodeConfig struct { NodeAllocatableConfig QOSReserved map[v1.ResourceName]int64 ExperimentalCPUManagerPolicy string + ExperimentalTopologyManagerScope string ExperimentalCPUManagerReconcilePeriod time.Duration ExperimentalPodPidsLimit int64 EnforceCPULimits bool diff --git a/staging/src/k8s.io/kubelet/config/v1beta1/types.go b/staging/src/k8s.io/kubelet/config/v1beta1/types.go index d298f3c4b21..ff3dc464f44 100644 --- a/staging/src/k8s.io/kubelet/config/v1beta1/types.go +++ b/staging/src/k8s.io/kubelet/config/v1beta1/types.go @@ -61,12 +61,18 @@ const ( // BestEffortTopologyManagerPolicy is a mode in which kubelet will favour // pods with NUMA alignment of CPU and device resources. BestEffortTopologyManagerPolicy = "best-effort" - // NoneTopologyManager Policy is a mode in which kubelet has no knowledge + // NoneTopologyManagerPolicy is a mode in which kubelet has no knowledge // of NUMA alignment of a pod's CPU and device resources. NoneTopologyManagerPolicy = "none" - // SingleNumaNodeTopologyManager Policy iis a mode in which kubelet only allows + // SingleNumaNodeTopologyManagerPolicy is a mode in which kubelet only allows // pods with a single NUMA alignment of CPU and device resources. - SingleNumaNodeTopologyManager = "single-numa-node" + SingleNumaNodeTopologyManagerPolicy = "single-numa-node" + // ContainerTopologyManagerScope represents that + // topology policy is applied on a per-container basis. + ContainerTopologyManagerScope = "container" + // PodTopologyManagerScope represents that + // topology policy is applied on a per-pod basis. + PodTopologyManagerScope = "pod" ) // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object @@ -434,6 +440,12 @@ type KubeletConfiguration struct { // Default: "none" // +optional TopologyManagerPolicy string `json:"topologyManagerPolicy,omitempty"` + // TopologyManagerScope represents the scope of topology hint generation + // that topology manager requests and hint providers generate. + // "pod" scope requires the TopologyManager feature gate to be enabled. + // Default: "container" + // +optional + TopologyManagerScope string `json:"topologyManagerScope,omitempty"` // qosReserved is a set of resource name to percentage pairs that specify // the minimum percentage of a resource reserved for exclusive use by the // guaranteed QoS tier.