From c39cf28ed35200bd9b20ee1408edef73cd3ab7e6 Mon Sep 17 00:00:00 2001 From: Amim Knabben Date: Sat, 6 Jun 2020 11:55:43 -0400 Subject: [PATCH] Moving Kubelet kernel-memcg-notification to configuration file --- cluster/gce/config-test.sh | 4 +-- cluster/gce/util.sh | 2 +- cmd/kubelet/app/options/options.go | 35 +++++++++---------- cmd/kubelet/app/server.go | 6 ++-- pkg/kubelet/apis/config/fuzzer/fuzzer.go | 1 + pkg/kubelet/apis/config/helpers_test.go | 1 + pkg/kubelet/apis/config/types.go | 3 ++ .../config/v1beta1/zz_generated.conversion.go | 2 ++ pkg/kubelet/kubelet.go | 4 +-- .../k8s.io/kubelet/config/v1beta1/types.go | 7 ++++ test/e2e_node/remote/node_e2e.go | 2 +- 11 files changed, 40 insertions(+), 27 deletions(-) diff --git a/cluster/gce/config-test.sh b/cluster/gce/config-test.sh index 41400544e98..78078e4de53 100755 --- a/cluster/gce/config-test.sh +++ b/cluster/gce/config-test.sh @@ -220,10 +220,10 @@ TEST_CLUSTER_API_CONTENT_TYPE=${TEST_CLUSTER_API_CONTENT_TYPE:-} KUBELET_TEST_ARGS="${KUBELET_TEST_ARGS:-} --serialize-image-pulls=false ${TEST_CLUSTER_API_CONTENT_TYPE}" if [[ "${NODE_OS_DISTRIBUTION}" = 'gci' ]] || [[ "${NODE_OS_DISTRIBUTION}" = 'ubuntu' ]] || [[ "${NODE_OS_DISTRIBUTION}" = 'custom' ]]; then - NODE_KUBELET_TEST_ARGS="${NODE_KUBELET_TEST_ARGS:-} --experimental-kernel-memcg-notification=true" + NODE_KUBELET_TEST_ARGS="${NODE_KUBELET_TEST_ARGS:-} --kernel-memcg-notification=true" fi if [[ "${MASTER_OS_DISTRIBUTION}" = 'gci' ]] || [[ "${MASTER_OS_DISTRIBUTION}" = 'ubuntu' ]]; then - MASTER_KUBELET_TEST_ARGS="${MASTER_KUBELET_TEST_ARGS:-} --experimental-kernel-memcg-notification=true" + MASTER_KUBELET_TEST_ARGS="${MASTER_KUBELET_TEST_ARGS:-} --kernel-memcg-notification=true" fi APISERVER_TEST_ARGS="${APISERVER_TEST_ARGS:-} --runtime-config=extensions/v1beta1,scheduling.k8s.io/v1alpha1,settings.k8s.io/v1alpha1 ${TEST_CLUSTER_DELETE_COLLECTION_WORKERS} ${TEST_CLUSTER_MAX_REQUESTS_INFLIGHT}" 
CONTROLLER_MANAGER_TEST_ARGS="${CONTROLLER_MANAGER_TEST_ARGS:-} ${TEST_CLUSTER_RESYNC_PERIOD} ${TEST_CLUSTER_API_CONTENT_TYPE}" diff --git a/cluster/gce/util.sh b/cluster/gce/util.sh index 18b676cbe6c..fdc811d3ff3 100755 --- a/cluster/gce/util.sh +++ b/cluster/gce/util.sh @@ -891,7 +891,7 @@ function construct-windows-kubelet-flags { flags+=" --cgroups-per-qos=false --enforce-node-allocatable=" # Turn off kernel memory cgroup notification. - flags+=" --experimental-kernel-memcg-notification=false" + flags+=" --kernel-memcg-notification=false" # TODO(#78628): Re-enable KubeletPodResources when the issue is fixed. # Force disable KubeletPodResources feature on Windows until #78628 is fixed. diff --git a/cmd/kubelet/app/options/options.go b/cmd/kubelet/app/options/options.go index d5473fed69a..bc121233d6a 100644 --- a/cmd/kubelet/app/options/options.go +++ b/cmd/kubelet/app/options/options.go @@ -117,9 +117,6 @@ type KubeletFlags struct { RemoteImageEndpoint string // experimentalMounterPath is the path of mounter binary. Leave empty to use the default mount path ExperimentalMounterPath string - // If enabled, the kubelet will integrate with the kernel memcg notification to determine if memory eviction thresholds are crossed rather than polling. - // +optional - ExperimentalKernelMemcgNotification bool // This flag, if set, enables a check prior to mount operations to verify that the required components // (binaries, etc.) to mount the volume are available on the underlying node. If the check is enabled // and fails the mount operation fails. 
@@ -180,20 +177,19 @@ func NewKubeletFlags() *KubeletFlags { } return &KubeletFlags{ - ContainerRuntimeOptions: *NewContainerRuntimeOptions(), - CertDirectory: "/var/lib/kubelet/pki", - RootDirectory: defaultRootDir, - MasterServiceNamespace: metav1.NamespaceDefault, - MaxContainerCount: -1, - MaxPerPodContainerCount: 1, - MinimumGCAge: metav1.Duration{Duration: 0}, - NonMasqueradeCIDR: "10.0.0.0/8", - RegisterSchedulable: true, - ExperimentalKernelMemcgNotification: false, - RemoteRuntimeEndpoint: remoteRuntimeEndpoint, - NodeLabels: make(map[string]string), - RegisterNode: true, - SeccompProfileRoot: filepath.Join(defaultRootDir, "seccomp"), + ContainerRuntimeOptions: *NewContainerRuntimeOptions(), + CertDirectory: "/var/lib/kubelet/pki", + RootDirectory: defaultRootDir, + MasterServiceNamespace: metav1.NamespaceDefault, + MaxContainerCount: -1, + MaxPerPodContainerCount: 1, + MinimumGCAge: metav1.Duration{Duration: 0}, + NonMasqueradeCIDR: "10.0.0.0/8", + RegisterSchedulable: true, + RemoteRuntimeEndpoint: remoteRuntimeEndpoint, + NodeLabels: make(map[string]string), + RegisterNode: true, + SeccompProfileRoot: filepath.Join(defaultRootDir, "seccomp"), // prior to the introduction of this flag, there was a hardcoded cap of 50 images EnableCAdvisorJSONEndpoints: false, } @@ -345,7 +341,6 @@ func (f *KubeletFlags) AddFlags(mainfs *pflag.FlagSet) { fs.Var(utiltaints.NewTaintsVar(&f.RegisterWithTaints), "register-with-taints", "Register the node with the given list of taints (comma separated \"=:\"). 
No-op if register-node is false.") // EXPERIMENTAL FLAGS - fs.BoolVar(&f.ExperimentalKernelMemcgNotification, "experimental-kernel-memcg-notification", f.ExperimentalKernelMemcgNotification, "If enabled, the kubelet will integrate with the kernel memcg notification to determine if memory eviction thresholds are crossed rather than polling.") fs.StringVar(&f.RemoteRuntimeEndpoint, "container-runtime-endpoint", f.RemoteRuntimeEndpoint, "[Experimental] The endpoint of remote runtime service. Currently unix socket endpoint is supported on Linux, while npipe and tcp endpoints are supported on windows. Examples:'unix:///var/run/dockershim.sock', 'npipe:////./pipe/dockershim'") fs.StringVar(&f.RemoteImageEndpoint, "image-service-endpoint", f.RemoteImageEndpoint, "[Experimental] The endpoint of remote image service. If not specified, it will be the same with container-runtime-endpoint by default. Currently unix socket endpoint is supported on Linux, while npipe and tcp endpoints are supported on windows. Examples:'unix:///var/run/dockershim.sock', 'npipe:////./pipe/dockershim'") fs.BoolVar(&f.ExperimentalNodeAllocatableIgnoreEvictionThreshold, "experimental-allocatable-ignore-eviction", f.ExperimentalNodeAllocatableIgnoreEvictionThreshold, "When set to 'true', Hard Eviction Thresholds will be ignored while calculating Node Allocatable. See https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/ for more details. [default=false]") @@ -519,6 +514,7 @@ func AddKubeletConfigFlags(mainfs *pflag.FlagSet, c *kubeletconfig.KubeletConfig fs.StringSliceVar(&c.AllowedUnsafeSysctls, "allowed-unsafe-sysctls", c.AllowedUnsafeSysctls, "Comma-separated whitelist of unsafe sysctls or unsafe sysctl patterns (ending in *). Use these at your own risk.") fs.Int32Var(&c.NodeStatusMaxImages, "node-status-max-images", c.NodeStatusMaxImages, "The maximum number of images to report in Node.Status.Images. 
If -1 is specified, no cap will be applied.") + fs.BoolVar(&c.KernelMemcgNotification, "kernel-memcg-notification", c.KernelMemcgNotification, "If enabled, the kubelet will integrate with the kernel memcg notification to determine if memory eviction thresholds are crossed rather than polling.") // Flags intended for testing, not recommended used in production environments. fs.Int64Var(&c.MaxOpenFiles, "max-open-files", c.MaxOpenFiles, "Number of files that can be opened by Kubelet process.") @@ -543,4 +539,7 @@ func AddKubeletConfigFlags(mainfs *pflag.FlagSet, c *kubeletconfig.KubeletConfig fs.StringSliceVar(&c.EnforceNodeAllocatable, "enforce-node-allocatable", c.EnforceNodeAllocatable, "A comma separated list of levels of node allocatable enforcement to be enforced by kubelet. Acceptable options are 'none', 'pods', 'system-reserved', and 'kube-reserved'. If the latter two options are specified, '--system-reserved-cgroup' and '--kube-reserved-cgroup' must also be set, respectively. If 'none' is specified, no additional options should be set. See https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/ for more details.") fs.StringVar(&c.SystemReservedCgroup, "system-reserved-cgroup", c.SystemReservedCgroup, "Absolute name of the top level cgroup that is used to manage non-kubernetes components for which compute resources were reserved via '--system-reserved' flag. Ex. '/system-reserved'. [default='']") fs.StringVar(&c.KubeReservedCgroup, "kube-reserved-cgroup", c.KubeReservedCgroup, "Absolute name of the top level cgroup that is used to manage kubernetes components for which compute resources were reserved via '--kube-reserved' flag. Ex. '/kube-reserved'. 
[default='']") + + // Graduated experimental flags, kept for backward compatibility + fs.BoolVar(&c.KernelMemcgNotification, "experimental-kernel-memcg-notification", c.KernelMemcgNotification, "Use kernelMemcgNotification configuration, this flag will be removed in 1.23.") } diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go index ebd220c44e8..e812798df17 100644 --- a/cmd/kubelet/app/server.go +++ b/cmd/kubelet/app/server.go @@ -1109,7 +1109,7 @@ func RunKubelet(kubeServer *options.KubeletServer, kubeDeps *kubelet.Dependencie kubeServer.RegisterWithTaints, kubeServer.AllowedUnsafeSysctls, kubeServer.ExperimentalMounterPath, - kubeServer.ExperimentalKernelMemcgNotification, + kubeServer.KernelMemcgNotification, kubeServer.ExperimentalCheckNodeCapabilitiesBeforeMount, kubeServer.ExperimentalNodeAllocatableIgnoreEvictionThreshold, kubeServer.MinimumGCAge, @@ -1183,7 +1183,7 @@ func createAndInitKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, registerWithTaints []api.Taint, allowedUnsafeSysctls []string, experimentalMounterPath string, - experimentalKernelMemcgNotification bool, + kernelMemcgNotification bool, experimentalCheckNodeCapabilitiesBeforeMount bool, experimentalNodeAllocatableIgnoreEvictionThreshold bool, minimumGCAge metav1.Duration, @@ -1215,7 +1215,7 @@ func createAndInitKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, registerWithTaints, allowedUnsafeSysctls, experimentalMounterPath, - experimentalKernelMemcgNotification, + kernelMemcgNotification, experimentalCheckNodeCapabilitiesBeforeMount, experimentalNodeAllocatableIgnoreEvictionThreshold, minimumGCAge, diff --git a/pkg/kubelet/apis/config/fuzzer/fuzzer.go b/pkg/kubelet/apis/config/fuzzer/fuzzer.go index c9008be145d..06c636be2ad 100644 --- a/pkg/kubelet/apis/config/fuzzer/fuzzer.go +++ b/pkg/kubelet/apis/config/fuzzer/fuzzer.go @@ -59,6 +59,7 @@ func Funcs(codecs runtimeserializer.CodecFactory) []interface{} { obj.ImageMinimumGCAge = 
metav1.Duration{Duration: 2 * time.Minute} obj.ImageGCHighThresholdPercent = 85 obj.ImageGCLowThresholdPercent = 80 + obj.KernelMemcgNotification = false obj.MaxOpenFiles = 1000000 obj.MaxPods = 110 obj.PodPidsLimit = -1 diff --git a/pkg/kubelet/apis/config/helpers_test.go b/pkg/kubelet/apis/config/helpers_test.go index 4cfaf04b5d0..e8b4581589f 100644 --- a/pkg/kubelet/apis/config/helpers_test.go +++ b/pkg/kubelet/apis/config/helpers_test.go @@ -189,6 +189,7 @@ var ( "ImageGCHighThresholdPercent", "ImageGCLowThresholdPercent", "ImageMinimumGCAge.Duration", + "KernelMemcgNotification", "KubeAPIBurst", "KubeAPIQPS", "KubeReservedCgroup", diff --git a/pkg/kubelet/apis/config/types.go b/pkg/kubelet/apis/config/types.go index bcafb460b50..78e76e934d7 100644 --- a/pkg/kubelet/apis/config/types.go +++ b/pkg/kubelet/apis/config/types.go @@ -322,6 +322,9 @@ type KubeletConfiguration struct { // These sysctls are namespaced but not allowed by default. For example: "kernel.msg*,net.ipv4.route.min_pmtu" // +optional AllowedUnsafeSysctls []string + // kernelMemcgNotification if enabled, the kubelet will integrate with the kernel memcg + // notification to determine if memory eviction thresholds are crossed rather than polling. 
+ KernelMemcgNotification bool /* the following fields are meant for Node Allocatable */ diff --git a/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go b/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go index f0579411672..8efd41941c1 100644 --- a/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go +++ b/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go @@ -341,6 +341,7 @@ func autoConvert_v1beta1_KubeletConfiguration_To_config_KubeletConfiguration(in out.AllowedUnsafeSysctls = *(*[]string)(unsafe.Pointer(&in.AllowedUnsafeSysctls)) out.VolumePluginDir = in.VolumePluginDir out.ProviderID = in.ProviderID + out.KernelMemcgNotification = in.KernelMemcgNotification return nil } @@ -477,6 +478,7 @@ func autoConvert_config_KubeletConfiguration_To_v1beta1_KubeletConfiguration(in } out.ConfigMapAndSecretChangeDetectionStrategy = v1beta1.ResourceChangeDetectionStrategy(in.ConfigMapAndSecretChangeDetectionStrategy) out.AllowedUnsafeSysctls = *(*[]string)(unsafe.Pointer(&in.AllowedUnsafeSysctls)) + out.KernelMemcgNotification = in.KernelMemcgNotification out.SystemReserved = *(*map[string]string)(unsafe.Pointer(&in.SystemReserved)) out.KubeReserved = *(*map[string]string)(unsafe.Pointer(&in.KubeReserved)) out.SystemReservedCgroup = in.SystemReservedCgroup diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 2a5619d3b19..0956040a858 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -363,7 +363,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, registerWithTaints []api.Taint, allowedUnsafeSysctls []string, experimentalMounterPath string, - experimentalKernelMemcgNotification bool, + kernelMemcgNotification bool, experimentalCheckNodeCapabilitiesBeforeMount bool, experimentalNodeAllocatableIgnoreEvictionThreshold bool, minimumGCAge metav1.Duration, @@ -432,7 +432,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, PressureTransitionPeriod: 
kubeCfg.EvictionPressureTransitionPeriod.Duration, MaxPodGracePeriodSeconds: int64(kubeCfg.EvictionMaxPodGracePeriod), Thresholds: thresholds, - KernelMemcgNotification: experimentalKernelMemcgNotification, + KernelMemcgNotification: kernelMemcgNotification, PodCgroupRoot: kubeDeps.ContainerManager.GetPodCgroupRoot(), } diff --git a/staging/src/k8s.io/kubelet/config/v1beta1/types.go b/staging/src/k8s.io/kubelet/config/v1beta1/types.go index 5c61c130dd6..e84a5935086 100644 --- a/staging/src/k8s.io/kubelet/config/v1beta1/types.go +++ b/staging/src/k8s.io/kubelet/config/v1beta1/types.go @@ -787,6 +787,13 @@ type KubeletConfiguration struct { // Default: "" // +optional ProviderID string `json:"providerID,omitempty"` + // kernelMemcgNotification, if set, the kubelet will integrate with the kernel memcg notification + // to determine if memory eviction thresholds are crossed rather than polling. + // Dynamic Kubelet Config (beta): If dynamically updating this field, consider that + // it may impact the way Kubelet interacts with the kernel. + // Default: false + // +optional + KernelMemcgNotification bool `json:"kernelMemcgNotification,omitempty"` } type KubeletAuthorizationMode string diff --git a/test/e2e_node/remote/node_e2e.go b/test/e2e_node/remote/node_e2e.go index 047a3b39f12..ef192d881f7 100644 --- a/test/e2e_node/remote/node_e2e.go +++ b/test/e2e_node/remote/node_e2e.go @@ -98,7 +98,7 @@ func prependCOSMounterFlag(args, host, workspace string) (string, error) { // prependMemcgNotificationFlag prepends the flag for enabling memcg // notification to args and returns the result. func prependMemcgNotificationFlag(args string) string { - return "--kubelet-flags=--experimental-kernel-memcg-notification=true " + args + return "--kubelet-flags=--kernel-memcg-notification=true " + args } // updateOSSpecificKubeletFlags updates the Kubelet args with OS specific