From 9bcd986b23cbb6a8bfcf2035666c7ae150f9106d Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Thu, 12 Apr 2018 20:51:08 -0500 Subject: [PATCH] kubelet: move QOSReserved from experimental to alpha feature gate --- cmd/kubelet/app/options/options.go | 7 +-- cmd/kubelet/app/server.go | 4 +- pkg/features/kube_features.go | 8 +++ .../apis/kubeletconfig/fuzzer/fuzzer.go | 3 ++ .../apis/kubeletconfig/helpers_test.go | 1 + pkg/kubelet/apis/kubeletconfig/types.go | 3 ++ .../apis/kubeletconfig/v1beta1/types.go | 5 ++ .../v1beta1/zz_generated.conversion.go | 2 + .../v1beta1/zz_generated.deepcopy.go | 7 +++ .../kubeletconfig/zz_generated.deepcopy.go | 7 +++ pkg/kubelet/cm/container_manager.go | 2 +- pkg/kubelet/cm/qos_container_manager_linux.go | 53 ++++++++++--------- 12 files changed, 68 insertions(+), 34 deletions(-) diff --git a/cmd/kubelet/app/options/options.go b/cmd/kubelet/app/options/options.go index fbdc2f42ea5..f4540e2d09e 100644 --- a/cmd/kubelet/app/options/options.go +++ b/cmd/kubelet/app/options/options.go @@ -151,10 +151,6 @@ type KubeletFlags struct { // This flag, if set, will avoid including `EvictionHard` limits while computing Node Allocatable. // Refer to [Node Allocatable](https://git.k8s.io/community/contributors/design-proposals/node-allocatable.md) doc for more information. ExperimentalNodeAllocatableIgnoreEvictionThreshold bool - // A set of ResourceName=Percentage (e.g. memory=50%) pairs that describe - // how pod resource requests are reserved at the QoS level. - // Currently only memory is supported. [default=none]" - ExperimentalQOSReserved map[string]string // Node Labels are the node labels to add when registering the node in the cluster NodeLabels map[string]string // volumePluginDir is the full path of the directory in which to search @@ -235,7 +231,6 @@ func NewKubeletFlags() *KubeletFlags { NonMasqueradeCIDR: "10.0.0.0/8", RegisterSchedulable: true, ExperimentalKernelMemcgNotification: false, - ExperimentalQOSReserved: make(map[string]string), RemoteRuntimeEndpoint: remoteRuntimeEndpoint, RotateCertificates: false, // TODO(#54161:v1.11.0): Remove --enable-custom-metrics flag, it is deprecated. @@ -380,7 +375,6 @@ func (f *KubeletFlags) AddFlags(fs *pflag.FlagSet) { fs.StringVar(&f.RemoteImageEndpoint, "image-service-endpoint", f.RemoteImageEndpoint, "[Experimental] The endpoint of remote image service. If not specified, it will be the same with container-runtime-endpoint by default. Currently unix socket is supported on Linux, and tcp is supported on windows. Examples:'unix:///var/run/dockershim.sock', 'tcp://localhost:3735'") fs.BoolVar(&f.ExperimentalCheckNodeCapabilitiesBeforeMount, "experimental-check-node-capabilities-before-mount", f.ExperimentalCheckNodeCapabilitiesBeforeMount, "[Experimental] if set true, the kubelet will check the underlying node for required components (binaries, etc.) before performing the mount") fs.BoolVar(&f.ExperimentalNodeAllocatableIgnoreEvictionThreshold, "experimental-allocatable-ignore-eviction", f.ExperimentalNodeAllocatableIgnoreEvictionThreshold, "When set to 'true', Hard Eviction Thresholds will be ignored while calculating Node Allocatable. See https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/ for more details. [default=false]") - fs.Var(flag.NewMapStringString(&f.ExperimentalQOSReserved), "experimental-qos-reserved", "A set of ResourceName=Percentage (e.g. memory=50%) pairs that describe how pod resource requests are reserved at the QoS level. Currently only memory is supported. [default=none]") bindableNodeLabels := flag.ConfigurationMap(f.NodeLabels) fs.Var(&bindableNodeLabels, "node-labels", " Labels to add when registering the node in the cluster. Labels must be key=value pairs separated by ','.") fs.StringVar(&f.VolumePluginDir, "volume-plugin-dir", f.VolumePluginDir, "The full path of the directory in which to search for additional third party volume plugins") @@ -519,6 +513,7 @@ func AddKubeletConfigFlags(mainfs *pflag.FlagSet, c *kubeletconfig.KubeletConfig fs.StringVar(&c.CgroupRoot, "cgroup-root", c.CgroupRoot, "Optional root cgroup to use for pods. This is handled by the container runtime on a best effort basis. Default: '', which means use the container runtime default.") fs.StringVar(&c.CPUManagerPolicy, "cpu-manager-policy", c.CPUManagerPolicy, "CPU Manager policy to use. Possible values: 'none', 'static'. Default: 'none'") fs.DurationVar(&c.CPUManagerReconcilePeriod.Duration, "cpu-manager-reconcile-period", c.CPUManagerReconcilePeriod.Duration, " CPU Manager reconciliation period. Examples: '10s', or '1m'. If not supplied, defaults to `NodeStatusUpdateFrequency`") + fs.Var(flag.NewMapStringString(&c.QOSReserved), "qos-reserved", " A set of ResourceName=Percentage (e.g. memory=50%) pairs that describe how pod resource requests are reserved at the QoS level. Currently only memory is supported. Requires the QOSReserved feature gate to be enabled.") fs.DurationVar(&c.RuntimeRequestTimeout.Duration, "runtime-request-timeout", c.RuntimeRequestTimeout.Duration, "Timeout of all runtime requests except long running request - pull, logs, exec and attach. When timeout exceeded, kubelet will cancel the request, throw out an error and retry later.") fs.StringVar(&c.HairpinMode, "hairpin-mode", c.HairpinMode, "How should the kubelet setup hairpin NAT. This allows endpoints of a Service to loadbalance back to themselves if they should try to access their own Service. Valid values are \"promiscuous-bridge\", \"hairpin-veth\" and \"none\".") fs.Int32Var(&c.MaxPods, "max-pods", c.MaxPods, "Number of Pods that can run on this Kubelet.") diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go index 6dd9382e40b..7c5c8782159 100644 --- a/cmd/kubelet/app/server.go +++ b/cmd/kubelet/app/server.go @@ -628,7 +628,7 @@ func run(s *options.KubeletServer, kubeDeps *kubelet.Dependencies) (err error) { return err } } - experimentalQOSReserved, err := cm.ParseQOSReserved(s.ExperimentalQOSReserved) + experimentalQOSReserved, err := cm.ParseQOSReserved(s.QOSReserved) if err != nil { return err } @@ -656,7 +656,7 @@ func run(s *options.KubeletServer, kubeDeps *kubelet.Dependencies) (err error) { SystemReserved: systemReserved, HardEvictionThresholds: hardEvictionThresholds, }, - ExperimentalQOSReserved: *experimentalQOSReserved, + QOSReserved: *experimentalQOSReserved, ExperimentalCPUManagerPolicy: s.CPUManagerPolicy, ExperimentalCPUManagerReconcilePeriod: s.CPUManagerReconcilePeriod.Duration, ExperimentalPodPidsLimit: s.PodPidsLimit, diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index 2cf3d8a831a..8df6a287240 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -135,6 +135,13 @@ const ( // Enable mount propagation of volumes. MountPropagation utilfeature.Feature = "MountPropagation" + // owner: @sjenning + // alpha: v1.11 + // + // Allows resource reservations at the QoS level preventing pods at lower QoS levels from + // bursting into resources requested at higher QoS levels (memory only for now) + QOSReserved utilfeature.Feature = "QOSReserved" + // owner: @ConnorDoyle // alpha: v1.8 // @@ -299,6 +306,7 @@ var defaultKubernetesFeatureGates = map[utilfeature.Feature]utilfeature.FeatureS EnableEquivalenceClassCache: {Default: false, PreRelease: utilfeature.Alpha}, TaintNodesByCondition: {Default: false, PreRelease: utilfeature.Alpha}, MountPropagation: {Default: true, PreRelease: utilfeature.Beta}, + QOSReserved: {Default: false, PreRelease: utilfeature.Alpha}, ExpandPersistentVolumes: {Default: false, PreRelease: utilfeature.Alpha}, CPUManager: {Default: true, PreRelease: utilfeature.Beta}, ServiceNodeExclusion: {Default: false, PreRelease: utilfeature.Alpha}, diff --git a/pkg/kubelet/apis/kubeletconfig/fuzzer/fuzzer.go b/pkg/kubelet/apis/kubeletconfig/fuzzer/fuzzer.go index 857646d07e8..d55d8ee9af4 100644 --- a/pkg/kubelet/apis/kubeletconfig/fuzzer/fuzzer.go +++ b/pkg/kubelet/apis/kubeletconfig/fuzzer/fuzzer.go @@ -63,6 +63,9 @@ func Funcs(codecs runtimeserializer.CodecFactory) []interface{} { obj.NodeStatusUpdateFrequency = metav1.Duration{Duration: 10 * time.Second} obj.CPUManagerPolicy = "none" obj.CPUManagerReconcilePeriod = obj.NodeStatusUpdateFrequency + obj.QOSReserved = map[string]string{ + "memory": "50%", + } obj.OOMScoreAdj = int32(qos.KubeletOOMScoreAdj) obj.Port = ports.KubeletPort obj.ReadOnlyPort = ports.KubeletReadOnlyPort diff --git a/pkg/kubelet/apis/kubeletconfig/helpers_test.go b/pkg/kubelet/apis/kubeletconfig/helpers_test.go index 6c8602b2272..ccdbfd7f44d 100644 --- a/pkg/kubelet/apis/kubeletconfig/helpers_test.go +++ b/pkg/kubelet/apis/kubeletconfig/helpers_test.go @@ -147,6 +147,7 @@ var ( "CPUCFSQuota", "CPUManagerPolicy", "CPUManagerReconcilePeriod.Duration", + "QOSReserved[*]", "CgroupDriver", "CgroupRoot", "CgroupsPerQOS", diff --git a/pkg/kubelet/apis/kubeletconfig/types.go b/pkg/kubelet/apis/kubeletconfig/types.go index b593c2c25d8..102fc3573c2 100644 --- a/pkg/kubelet/apis/kubeletconfig/types.go +++ b/pkg/kubelet/apis/kubeletconfig/types.go @@ -160,6 +160,9 @@ type KubeletConfiguration struct { // CPU Manager reconciliation period. // Requires the CPUManager feature gate to be enabled. CPUManagerReconcilePeriod metav1.Duration + // Map of QoS resource reservation percentages (memory only for now). + // Requires the QOSReserved feature gate to be enabled. + QOSReserved map[string]string // runtimeRequestTimeout is the timeout for all runtime requests except long running // requests - pull, logs, exec and attach. RuntimeRequestTimeout metav1.Duration diff --git a/pkg/kubelet/apis/kubeletconfig/v1beta1/types.go b/pkg/kubelet/apis/kubeletconfig/v1beta1/types.go index fd46f3c9dff..292a4e0504f 100644 --- a/pkg/kubelet/apis/kubeletconfig/v1beta1/types.go +++ b/pkg/kubelet/apis/kubeletconfig/v1beta1/types.go @@ -249,6 +249,11 @@ type KubeletConfiguration struct { // Default: "10s" // +optional CPUManagerReconcilePeriod metav1.Duration `json:"cpuManagerReconcilePeriod,omitempty"` + // Map of QoS resource reservation percentages (memory only for now). + // Requires the QOSReserved feature gate to be enabled. + // Default: nil + // +optional + QOSReserved map[string]string // runtimeRequestTimeout is the timeout for all runtime requests except long running // requests - pull, logs, exec and attach. // Default: "2m" diff --git a/pkg/kubelet/apis/kubeletconfig/v1beta1/zz_generated.conversion.go b/pkg/kubelet/apis/kubeletconfig/v1beta1/zz_generated.conversion.go index cd758225a2a..3c89531388d 100644 --- a/pkg/kubelet/apis/kubeletconfig/v1beta1/zz_generated.conversion.go +++ b/pkg/kubelet/apis/kubeletconfig/v1beta1/zz_generated.conversion.go @@ -200,6 +200,7 @@ func autoConvert_v1beta1_KubeletConfiguration_To_kubeletconfig_KubeletConfigurat out.CgroupDriver = in.CgroupDriver out.CPUManagerPolicy = in.CPUManagerPolicy out.CPUManagerReconcilePeriod = in.CPUManagerReconcilePeriod + out.QOSReserved = *(*map[string]string)(unsafe.Pointer(&in.QOSReserved)) out.RuntimeRequestTimeout = in.RuntimeRequestTimeout out.HairpinMode = in.HairpinMode out.MaxPods = in.MaxPods @@ -321,6 +322,7 @@ func autoConvert_kubeletconfig_KubeletConfiguration_To_v1beta1_KubeletConfigurat out.CgroupDriver = in.CgroupDriver out.CPUManagerPolicy = in.CPUManagerPolicy out.CPUManagerReconcilePeriod = in.CPUManagerReconcilePeriod + out.QOSReserved = *(*map[string]string)(unsafe.Pointer(&in.QOSReserved)) out.RuntimeRequestTimeout = in.RuntimeRequestTimeout out.HairpinMode = in.HairpinMode out.MaxPods = in.MaxPods diff --git a/pkg/kubelet/apis/kubeletconfig/v1beta1/zz_generated.deepcopy.go b/pkg/kubelet/apis/kubeletconfig/v1beta1/zz_generated.deepcopy.go index 76f65ea1951..456f551a64e 100644 --- a/pkg/kubelet/apis/kubeletconfig/v1beta1/zz_generated.deepcopy.go +++ b/pkg/kubelet/apis/kubeletconfig/v1beta1/zz_generated.deepcopy.go @@ -193,6 +193,13 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) { } } out.CPUManagerReconcilePeriod = in.CPUManagerReconcilePeriod + if in.QOSReserved != nil { + in, out := &in.QOSReserved, &out.QOSReserved + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } out.RuntimeRequestTimeout = in.RuntimeRequestTimeout if in.PodPidsLimit != nil { in, out := &in.PodPidsLimit, &out.PodPidsLimit diff --git a/pkg/kubelet/apis/kubeletconfig/zz_generated.deepcopy.go b/pkg/kubelet/apis/kubeletconfig/zz_generated.deepcopy.go index b46cc34bf8a..e1ec3346672 100644 --- a/pkg/kubelet/apis/kubeletconfig/zz_generated.deepcopy.go +++ b/pkg/kubelet/apis/kubeletconfig/zz_generated.deepcopy.go @@ -112,6 +112,13 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) { out.ImageMinimumGCAge = in.ImageMinimumGCAge out.VolumeStatsAggPeriod = in.VolumeStatsAggPeriod out.CPUManagerReconcilePeriod = in.CPUManagerReconcilePeriod + if in.QOSReserved != nil { + in, out := &in.QOSReserved, &out.QOSReserved + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } out.RuntimeRequestTimeout = in.RuntimeRequestTimeout if in.EvictionHard != nil { in, out := &in.EvictionHard, &out.EvictionHard diff --git a/pkg/kubelet/cm/container_manager.go b/pkg/kubelet/cm/container_manager.go index 2a64db67143..2e863581707 100644 --- a/pkg/kubelet/cm/container_manager.go +++ b/pkg/kubelet/cm/container_manager.go @@ -107,7 +107,7 @@ type NodeConfig struct { KubeletRootDir string ProtectKernelDefaults bool NodeAllocatableConfig - ExperimentalQOSReserved map[v1.ResourceName]int64 + QOSReserved map[v1.ResourceName]int64 ExperimentalCPUManagerPolicy string ExperimentalCPUManagerReconcilePeriod time.Duration ExperimentalPodPidsLimit int64 diff --git a/pkg/kubelet/cm/qos_container_manager_linux.go b/pkg/kubelet/cm/qos_container_manager_linux.go index 0cd8559998e..62380a2b2fc 100644 --- a/pkg/kubelet/cm/qos_container_manager_linux.go +++ b/pkg/kubelet/cm/qos_container_manager_linux.go @@ -71,7 +71,7 @@ func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot string, nod subsystems: subsystems, cgroupManager: cgroupManager, cgroupRoot: CgroupName(cgroupRoot), - qosReserved: nodeConfig.ExperimentalQOSReserved, + qosReserved: nodeConfig.QOSReserved, }, nil } @@ -300,31 +300,34 @@ func (m *qosContainerManagerImpl) UpdateCgroups() error { } } - for resource, percentReserve := range m.qosReserved { - switch resource { - case v1.ResourceMemory: - m.setMemoryReserve(qosConfigs, percentReserve) + if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.QOSReserved) { + for resource, percentReserve := range m.qosReserved { + switch resource { + case v1.ResourceMemory: + m.setMemoryReserve(qosConfigs, percentReserve) + } } - } - updateSuccess := true - for _, config := range qosConfigs { - err := m.cgroupManager.Update(config) - if err != nil { - updateSuccess = false - } - } - if updateSuccess { - glog.V(4).Infof("[ContainerManager]: Updated QoS cgroup configuration") - return nil - } - // If the resource can adjust the ResourceConfig to increase likelihood of - // success, call the adjustment function here. Otherwise, the Update() will - // be called again with the same values. - for resource, percentReserve := range m.qosReserved { - switch resource { - case v1.ResourceMemory: - m.retrySetMemoryReserve(qosConfigs, percentReserve) + updateSuccess := true + for _, config := range qosConfigs { + err := m.cgroupManager.Update(config) + if err != nil { + updateSuccess = false + } + } + if updateSuccess { + glog.V(4).Infof("[ContainerManager]: Updated QoS cgroup configuration") + return nil + } + + // If the resource can adjust the ResourceConfig to increase likelihood of + // success, call the adjustment function here. Otherwise, the Update() will + // be called again with the same values. + for resource, percentReserve := range m.qosReserved { + switch resource { + case v1.ResourceMemory: + m.retrySetMemoryReserve(qosConfigs, percentReserve) + } } } @@ -336,7 +339,7 @@ func (m *qosContainerManagerImpl) UpdateCgroups() error { } } - glog.V(4).Infof("[ContainerManager]: Updated QoS cgroup configuration on retry") + glog.V(4).Infof("[ContainerManager]: Updated QoS cgroup configuration") return nil }