diff --git a/cmd/kubelet/app/options/options.go b/cmd/kubelet/app/options/options.go index 6e86e343057..0527c07a3a1 100644 --- a/cmd/kubelet/app/options/options.go +++ b/cmd/kubelet/app/options/options.go @@ -249,6 +249,7 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) { fs.DurationVar(&s.EvictionPressureTransitionPeriod.Duration, "eviction-pressure-transition-period", s.EvictionPressureTransitionPeriod.Duration, "Duration for which the kubelet has to wait before transitioning out of an eviction pressure condition.") fs.Int32Var(&s.EvictionMaxPodGracePeriod, "eviction-max-pod-grace-period", s.EvictionMaxPodGracePeriod, "Maximum allowed grace period (in seconds) to use when terminating pods in response to a soft eviction threshold being met. If negative, defer to pod specified value.") fs.StringVar(&s.EvictionMinimumReclaim, "eviction-minimum-reclaim", s.EvictionMinimumReclaim, "A set of minimum reclaims (e.g. imagefs.available=2Gi) that describes the minimum amount of resource the kubelet will reclaim when performing a pod eviction if that resource is under pressure.") + fs.BoolVar(&s.ExperimentalKernelMemcgNotification, "experimental-kernel-memcg-notification", s.ExperimentalKernelMemcgNotification, "If enabled, the kubelet will integrate with the kernel memcg notification to determine if memory eviction thresholds are crossed rather than polling.") fs.Int32Var(&s.PodsPerCore, "pods-per-core", s.PodsPerCore, "Number of Pods per core that can run on this Kubelet. The total number of Pods on this Kubelet cannot exceed max-pods, so max-pods will be used if this calculation results in a larger number of Pods allowed on the Kubelet. A value of 0 disables this limit.") fs.BoolVar(&s.ProtectKernelDefaults, "protect-kernel-defaults", s.ProtectKernelDefaults, "Default kubelet behaviour for kernel tuning. If set, kubelet errors if any of kernel tunables is different than kubelet defaults.") diff --git a/hack/verify-flags/known-flags.txt b/hack/verify-flags/known-flags.txt index 5e39d83ab96..932466e101e 100644 --- a/hack/verify-flags/known-flags.txt +++ b/hack/verify-flags/known-flags.txt @@ -202,6 +202,7 @@ experimental-nvidia-gpus experimental-prefix experimental-cri experimental-check-node-capabilities-before-mount +experimental-kernel-memcg-notification external-etcd-cafile external-etcd-certfile external-etcd-endpoints diff --git a/pkg/apis/componentconfig/types.go b/pkg/apis/componentconfig/types.go index cd87c983671..c169381f952 100644 --- a/pkg/apis/componentconfig/types.go +++ b/pkg/apis/componentconfig/types.go @@ -430,6 +430,9 @@ type KubeletConfiguration struct { // Comma-delimited list of minimum reclaims (e.g. imagefs.available=2Gi) that describes the minimum amount of resource the kubelet will reclaim when performing a pod eviction if that resource is under pressure. // +optional EvictionMinimumReclaim string `json:"evictionMinimumReclaim,omitempty"` + // If enabled, the kubelet will integrate with the kernel memcg notification to determine if memory eviction thresholds are crossed rather than polling. + // +optional + ExperimentalKernelMemcgNotification bool `json:"experimentalKernelMemcgNotification"` // Maximum number of pods per core. Cannot exceed MaxPods PodsPerCore int32 `json:"podsPerCore"` // enableControllerAttachDetach enables the Attach/Detach controller to diff --git a/pkg/apis/componentconfig/v1alpha1/defaults.go b/pkg/apis/componentconfig/v1alpha1/defaults.go index 59ea6a51397..a22f3417ad6 100644 --- a/pkg/apis/componentconfig/v1alpha1/defaults.go +++ b/pkg/apis/componentconfig/v1alpha1/defaults.go @@ -374,6 +374,9 @@ func SetDefaults_KubeletConfiguration(obj *KubeletConfiguration) { if obj.EvictionPressureTransitionPeriod == zeroDuration { obj.EvictionPressureTransitionPeriod = metav1.Duration{Duration: 5 * time.Minute} } + if obj.ExperimentalKernelMemcgNotification == nil { + obj.ExperimentalKernelMemcgNotification = boolVar(false) + } if obj.SystemReserved == nil { obj.SystemReserved = make(map[string]string) } diff --git a/pkg/apis/componentconfig/v1alpha1/types.go b/pkg/apis/componentconfig/v1alpha1/types.go index 16b1f24fec0..0bcb9e4bd44 100644 --- a/pkg/apis/componentconfig/v1alpha1/types.go +++ b/pkg/apis/componentconfig/v1alpha1/types.go @@ -472,6 +472,8 @@ type KubeletConfiguration struct { EvictionMaxPodGracePeriod int32 `json:"evictionMaxPodGracePeriod"` // Comma-delimited list of minimum reclaims (e.g. imagefs.available=2Gi) that describes the minimum amount of resource the kubelet will reclaim when performing a pod eviction if that resource is under pressure. EvictionMinimumReclaim string `json:"evictionMinimumReclaim"` + // If enabled, the kubelet will integrate with the kernel memcg notification to determine if memory eviction thresholds are crossed rather than polling. + ExperimentalKernelMemcgNotification *bool `json:"experimentalKernelMemcgNotification"` // Maximum number of pods per core. Cannot exceed MaxPods PodsPerCore int32 `json:"podsPerCore"` // enableControllerAttachDetach enables the Attach/Detach controller to diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index 58bf12cb1bb..d1a28cfa86d 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -204,7 +204,8 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act } // attempt to create a threshold notifier to improve eviction response time - if !m.notifiersInitialized { + if m.config.KernelMemcgNotification && !m.notifiersInitialized { + glog.Infof("eviction manager attempting to integrate with kernel memcg notification api") m.notifiersInitialized = true // start soft memory notification err = startMemoryThresholdNotifier(m.config.Thresholds, observations, false, func(desc string) { diff --git a/pkg/kubelet/eviction/types.go b/pkg/kubelet/eviction/types.go index f601dc5f2de..d25f107e70e 100644 --- a/pkg/kubelet/eviction/types.go +++ b/pkg/kubelet/eviction/types.go @@ -69,6 +69,8 @@ type Config struct { MaxPodGracePeriodSeconds int64 // Thresholds define the set of conditions monitored to trigger eviction. Thresholds []Threshold + // KernelMemcgNotification if true will integrate with the kernel memcg notification to determine if memory thresholds are crossed. + KernelMemcgNotification bool } // ThresholdValue is a value holder that abstracts literal versus percentage based quantity diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index d04d25e56a4..680d4f383d5 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -354,6 +354,7 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub PressureTransitionPeriod: kubeCfg.EvictionPressureTransitionPeriod.Duration, MaxPodGracePeriodSeconds: int64(kubeCfg.EvictionMaxPodGracePeriod), Thresholds: thresholds, + KernelMemcgNotification: kubeCfg.ExperimentalKernelMemcgNotification, } reservation, err := ParseReservation(kubeCfg.KubeReserved, kubeCfg.SystemReserved)