From 4f909c14a0b32dba90d5c28f2937964aaf20677a Mon Sep 17 00:00:00 2001 From: utam0k Date: Mon, 14 Oct 2024 14:46:28 +0900 Subject: [PATCH] kubelet: new kubelet config option for disabling group oom kill Signed-off-by: utam0k --- cmd/kubelet/app/server.go | 4 +- cmd/kubelet/app/server_linux.go | 6 -- pkg/generated/openapi/zz_generated.openapi.go | 7 ++ pkg/kubelet/apis/config/fuzzer/fuzzer.go | 9 ++- pkg/kubelet/apis/config/helpers_test.go | 1 + pkg/kubelet/apis/config/types.go | 4 + .../apis/config/v1beta1/defaults_test.go | 9 ++- .../config/v1beta1/zz_generated.conversion.go | 2 + .../config/validation/validation_linux.go | 8 +- .../config/validation/validation_others.go | 6 ++ .../apis/config/validation/validation_test.go | 2 + .../config/validation/validation_windows.go | 7 +- .../apis/config/zz_generated.deepcopy.go | 5 ++ pkg/kubelet/kubelet.go | 16 ++++ pkg/kubelet/kubelet_test.go | 1 + .../kuberuntime_container_linux.go | 4 +- .../kuberuntime_container_linux_test.go | 47 ++++++++--- .../kuberuntime/kuberuntime_manager.go | 7 ++ .../kuberuntime_sandbox_linux_test.go | 2 + pkg/kubelet/util/util_linux.go | 29 +++++++ pkg/kubelet/util/util_others.go | 25 ++++++ pkg/kubelet/util/util_windows.go | 5 ++ .../k8s.io/kubelet/config/v1beta1/types.go | 9 +++ .../config/v1beta1/zz_generated.deepcopy.go | 5 ++ test/e2e_node/oomkiller_linux_test.go | 78 ++++++++++++++----- 25 files changed, 253 insertions(+), 45 deletions(-) create mode 100644 pkg/kubelet/util/util_linux.go create mode 100644 pkg/kubelet/util/util_others.go diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go index 9cff6743de5..4eae74a54d4 100644 --- a/cmd/kubelet/app/server.go +++ b/cmd/kubelet/app/server.go @@ -606,7 +606,7 @@ func run(ctx context.Context, s *options.KubeletServer, kubeDeps *kubelet.Depend // Warn if MemoryQoS enabled with cgroups v1 if utilfeature.DefaultFeatureGate.Enabled(features.MemoryQoS) && - !isCgroup2UnifiedMode() { + !kubeletutil.IsCgroup2UnifiedMode() { klog.InfoS("Warning: MemoryQoS feature only works with cgroups v2 on Linux, but enabled with cgroups v1") } // Obtain Kubelet Lock File @@ -831,7 +831,7 @@ func run(ctx context.Context, s *options.KubeletServer, kubeDeps *kubelet.Depend s.TopologyManagerPolicyOptions, features.TopologyManagerPolicyOptions) } if utilfeature.DefaultFeatureGate.Enabled(features.NodeSwap) { - if !isCgroup2UnifiedMode() && s.MemorySwap.SwapBehavior == kubelettypes.LimitedSwap { + if !kubeletutil.IsCgroup2UnifiedMode() && s.MemorySwap.SwapBehavior == kubelettypes.LimitedSwap { // This feature is not supported for cgroupv1 so we are failing early. 
return fmt.Errorf("swap feature is enabled and LimitedSwap but it is only supported with cgroupv2") } diff --git a/cmd/kubelet/app/server_linux.go b/cmd/kubelet/app/server_linux.go index 473c39a33f7..00c23e30da7 100644 --- a/cmd/kubelet/app/server_linux.go +++ b/cmd/kubelet/app/server_linux.go @@ -19,8 +19,6 @@ package app import ( "k8s.io/klog/v2" "k8s.io/utils/inotify" - - libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" ) func watchForLockfileContention(path string, done chan struct{}) error { @@ -46,7 +44,3 @@ func watchForLockfileContention(path string, done chan struct{}) error { }() return nil } - -func isCgroup2UnifiedMode() bool { - return libcontainercgroups.IsCgroup2UnifiedMode() -} diff --git a/pkg/generated/openapi/zz_generated.openapi.go b/pkg/generated/openapi/zz_generated.openapi.go index 41e0b0fa592..8a9d61e1b62 100644 --- a/pkg/generated/openapi/zz_generated.openapi.go +++ b/pkg/generated/openapi/zz_generated.openapi.go @@ -61947,6 +61947,13 @@ func schema_k8sio_kubelet_config_v1beta1_KubeletConfiguration(ref common.Referen Format: "", }, }, + "singleProcessOOMKill": { + SchemaProps: spec.SchemaProps{ + Description: "singleProcessOOMKill, if true, will prevent the `memory.oom.group` flag from being set for container cgroups in cgroups v2. This causes processes in the container to be OOM killed individually instead of as a group. It means that if true, the behavior aligns with the behavior of cgroups v1. The default value is determined automatically when you don't specify. On non-linux such as windows, only null / absent is allowed. On cgroup v1 linux, only null / absent and true are allowed. On cgroup v2 linux, null / absent, true and false are allowed. The default value is false.", + Type: []string{"boolean"}, + Format: "", + }, + }, "cpuManagerPolicyOptions": { SchemaProps: spec.SchemaProps{ Description: "cpuManagerPolicyOptions is a set of key=value which \tallows to set extra options to fine tune the behaviour of the cpu manager policies. Requires both the \"CPUManager\" and \"CPUManagerPolicyOptions\" feature gates to be enabled. 
Default: nil", diff --git a/pkg/kubelet/apis/config/fuzzer/fuzzer.go b/pkg/kubelet/apis/config/fuzzer/fuzzer.go index bf21a4eae00..efa8120df3a 100644 --- a/pkg/kubelet/apis/config/fuzzer/fuzzer.go +++ b/pkg/kubelet/apis/config/fuzzer/fuzzer.go @@ -73,12 +73,12 @@ func Funcs(codecs runtimeserializer.CodecFactory) []interface{} { obj.NodeStatusReportFrequency = metav1.Duration{Duration: time.Minute} obj.NodeLeaseDurationSeconds = 40 obj.CPUManagerPolicy = "none" - obj.CPUManagerPolicyOptions = make(map[string]string) + obj.CPUManagerPolicyOptions = nil obj.CPUManagerReconcilePeriod = obj.NodeStatusUpdateFrequency obj.NodeStatusMaxImages = 50 obj.TopologyManagerPolicy = kubeletconfig.NoneTopologyManagerPolicy obj.TopologyManagerScope = kubeletconfig.ContainerTopologyManagerScope - obj.TopologyManagerPolicyOptions = make(map[string]string) + obj.TopologyManagerPolicyOptions = nil obj.QOSReserved = map[string]string{ "memory": "50%", } @@ -104,13 +104,14 @@ func Funcs(codecs runtimeserializer.CodecFactory) []interface{} { obj.CgroupsPerQOS = true obj.CgroupDriver = "cgroupfs" obj.EnforceNodeAllocatable = kubeletconfigv1beta1.DefaultNodeAllocatableEnforcement - obj.StaticPodURLHeader = make(map[string][]string) + obj.StaticPodURLHeader = nil + obj.SingleProcessOOMKill = ptr.To(false) obj.ContainerLogMaxFiles = 5 obj.ContainerLogMaxSize = "10Mi" obj.ContainerLogMaxWorkers = 1 obj.ContainerLogMonitorInterval = metav1.Duration{Duration: 10 * time.Second} obj.ConfigMapAndSecretChangeDetectionStrategy = "Watch" - obj.AllowedUnsafeSysctls = []string{} + obj.AllowedUnsafeSysctls = nil obj.VolumePluginDir = kubeletconfigv1beta1.DefaultVolumePluginDir obj.ContainerRuntimeEndpoint = "unix:///run/containerd/containerd.sock" diff --git a/pkg/kubelet/apis/config/helpers_test.go b/pkg/kubelet/apis/config/helpers_test.go index 7e7ca3fdd21..c601cde5daa 100644 --- a/pkg/kubelet/apis/config/helpers_test.go +++ b/pkg/kubelet/apis/config/helpers_test.go @@ -233,6 +233,7 @@ var ( "Logging.Options.Text.OutputRoutingOptions.SplitStream", "Logging.VModule[*].FilePattern", "Logging.VModule[*].Verbosity", + "SingleProcessOOMKill", "Logging.Verbosity", "TLSCipherSuites[*]", "TLSMinVersion", diff --git a/pkg/kubelet/apis/config/types.go b/pkg/kubelet/apis/config/types.go index 9fe3528e20c..4d76687dd05 100644 --- a/pkg/kubelet/apis/config/types.go +++ b/pkg/kubelet/apis/config/types.go @@ -229,6 +229,10 @@ type KubeletConfiguration struct { CgroupsPerQOS bool // driver that the kubelet uses to manipulate cgroups on the host (cgroupfs or systemd) CgroupDriver string + // SingleProcessOOMKill, if true, will prevent the `memory.oom.group` flag from being set for container + // cgroups in cgroups v2. This causes processes in the container to be OOM killed individually instead of as + // a group. It means that if true, the behavior aligns with the behavior of cgroups v1. + SingleProcessOOMKill *bool // CPUManagerPolicy is the name of the policy to use. // Requires the CPUManager feature gate to be enabled. 
CPUManagerPolicy string diff --git a/pkg/kubelet/apis/config/v1beta1/defaults_test.go b/pkg/kubelet/apis/config/v1beta1/defaults_test.go index 2f68e6b7461..808980a41b4 100644 --- a/pkg/kubelet/apis/config/v1beta1/defaults_test.go +++ b/pkg/kubelet/apis/config/v1beta1/defaults_test.go @@ -34,7 +34,6 @@ import ( ) func TestSetDefaultsKubeletConfiguration(t *testing.T) { - tests := []struct { name string config *v1beta1.KubeletConfiguration @@ -130,6 +129,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { RegisterNode: ptr.To(true), LocalStorageCapacityIsolation: ptr.To(true), PodLogsDir: DefaultPodLogsDir, + SingleProcessOOMKill: nil, }, }, { @@ -261,6 +261,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { RegisterNode: ptr.To(false), LocalStorageCapacityIsolation: ptr.To(false), PodLogsDir: "", + SingleProcessOOMKill: ptr.To(false), }, &v1beta1.KubeletConfiguration{ EnableServer: ptr.To(false), @@ -363,6 +364,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { RegisterNode: ptr.To(false), LocalStorageCapacityIsolation: ptr.To(false), PodLogsDir: DefaultPodLogsDir, + SingleProcessOOMKill: ptr.To(false), }, }, { @@ -516,6 +518,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { RegisterNode: ptr.To(true), LocalStorageCapacityIsolation: ptr.To(true), PodLogsDir: "/custom/path", + SingleProcessOOMKill: ptr.To(true), }, &v1beta1.KubeletConfiguration{ EnableServer: ptr.To(true), @@ -666,6 +669,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { RegisterNode: ptr.To(true), LocalStorageCapacityIsolation: ptr.To(true), PodLogsDir: "/custom/path", + SingleProcessOOMKill: ptr.To(true), }, }, { @@ -759,6 +763,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { RegisterNode: ptr.To(true), LocalStorageCapacityIsolation: ptr.To(true), PodLogsDir: DefaultPodLogsDir, + SingleProcessOOMKill: nil, }, }, { @@ -852,6 +857,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { RegisterNode: ptr.To(true), LocalStorageCapacityIsolation: ptr.To(true), PodLogsDir: DefaultPodLogsDir, + SingleProcessOOMKill: nil, }, }, { @@ -945,6 +951,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { RegisterNode: ptr.To(true), LocalStorageCapacityIsolation: ptr.To(true), PodLogsDir: DefaultPodLogsDir, + SingleProcessOOMKill: nil, }, }, } diff --git a/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go b/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go index 731f9087fcf..2b904b093b3 100644 --- a/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go +++ b/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go @@ -409,6 +409,7 @@ func autoConvert_v1beta1_KubeletConfiguration_To_config_KubeletConfiguration(in } out.CgroupDriver = in.CgroupDriver out.CPUManagerPolicy = in.CPUManagerPolicy + out.SingleProcessOOMKill = (*bool)(unsafe.Pointer(in.SingleProcessOOMKill)) out.CPUManagerPolicyOptions = *(*map[string]string)(unsafe.Pointer(&in.CPUManagerPolicyOptions)) out.CPUManagerReconcilePeriod = in.CPUManagerReconcilePeriod out.MemoryManagerPolicy = in.MemoryManagerPolicy @@ -606,6 +607,7 @@ func autoConvert_config_KubeletConfiguration_To_v1beta1_KubeletConfiguration(in return err } out.CgroupDriver = in.CgroupDriver + out.SingleProcessOOMKill = (*bool)(unsafe.Pointer(in.SingleProcessOOMKill)) out.CPUManagerPolicy = in.CPUManagerPolicy out.CPUManagerPolicyOptions = *(*map[string]string)(unsafe.Pointer(&in.CPUManagerPolicyOptions)) out.CPUManagerReconcilePeriod = in.CPUManagerReconcilePeriod diff --git 
a/pkg/kubelet/apis/config/validation/validation_linux.go b/pkg/kubelet/apis/config/validation/validation_linux.go index 97407b7fbe6..a8412dcb4fc 100644 --- a/pkg/kubelet/apis/config/validation/validation_linux.go +++ b/pkg/kubelet/apis/config/validation/validation_linux.go @@ -24,13 +24,19 @@ import ( libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" + "k8s.io/utils/ptr" ) // validateKubeletOSConfiguration validates os specific kubelet configuration and returns an error if it is invalid. func validateKubeletOSConfiguration(kc *kubeletconfig.KubeletConfiguration) error { - if kc.FailCgroupV1 && !libcontainercgroups.IsCgroup2UnifiedMode() { + isCgroup1 := !libcontainercgroups.IsCgroup2UnifiedMode() + if kc.FailCgroupV1 && isCgroup1 { return fmt.Errorf("kubelet is configured to not run on a host using cgroup v1. cgroup v1 support is in maintenance mode") } + if isCgroup1 && kc.SingleProcessOOMKill != nil && !ptr.Deref(kc.SingleProcessOOMKill, true) { + return fmt.Errorf("invalid configuration: singleProcessOOMKill must not be explicitly set to false when using cgroup v1") + } + return nil } diff --git a/pkg/kubelet/apis/config/validation/validation_others.go b/pkg/kubelet/apis/config/validation/validation_others.go index c50143116c2..e019421398e 100644 --- a/pkg/kubelet/apis/config/validation/validation_others.go +++ b/pkg/kubelet/apis/config/validation/validation_others.go @@ -20,10 +20,16 @@ limitations under the License. package validation import ( + "fmt" + kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" ) // validateKubeletOSConfiguration validates os specific kubelet configuration and returns an error if it is invalid. func validateKubeletOSConfiguration(kc *kubeletconfig.KubeletConfiguration) error { + if kc.SingleProcessOOMKill != nil { + return fmt.Errorf("invalid configuration: singleProcessOOMKill is only supported on linux") + } + return nil } diff --git a/pkg/kubelet/apis/config/validation/validation_test.go b/pkg/kubelet/apis/config/validation/validation_test.go index a475478c221..ff17f53aa16 100644 --- a/pkg/kubelet/apis/config/validation/validation_test.go +++ b/pkg/kubelet/apis/config/validation/validation_test.go @@ -29,6 +29,7 @@ import ( kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" "k8s.io/kubernetes/pkg/kubelet/apis/config/validation" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" + kubeletutil "k8s.io/kubernetes/pkg/kubelet/util" "k8s.io/utils/ptr" ) @@ -78,6 +79,7 @@ var ( ContainerRuntimeEndpoint: "unix:///run/containerd/containerd.sock", ContainerLogMaxWorkers: 1, ContainerLogMonitorInterval: metav1.Duration{Duration: 10 * time.Second}, + SingleProcessOOMKill: ptr.To(!kubeletutil.IsCgroup2UnifiedMode()), } ) diff --git a/pkg/kubelet/apis/config/validation/validation_windows.go b/pkg/kubelet/apis/config/validation/validation_windows.go index 325b3cbab1a..65765fe0db5 100644 --- a/pkg/kubelet/apis/config/validation/validation_windows.go +++ b/pkg/kubelet/apis/config/validation/validation_windows.go @@ -20,9 +20,10 @@ limitations under the License. 
package validation import ( - "k8s.io/klog/v2" + "fmt" "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/klog/v2" kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" @@ -36,6 +37,10 @@ func validateKubeletOSConfiguration(kc *kubeletconfig.KubeletConfiguration) erro klog.Warningf(message, "CgroupsPerQOS", "--cgroups-per-qos", kc.CgroupsPerQOS) } + if kc.SingleProcessOOMKill != nil { + return fmt.Errorf("invalid configuration: singleProcessOOMKill is not supported on Windows") + } + enforceNodeAllocatableWithoutNone := sets.New(kc.EnforceNodeAllocatable...).Delete(kubetypes.NodeAllocatableNoneKey) if len(enforceNodeAllocatableWithoutNone) > 0 { klog.Warningf(message, "EnforceNodeAllocatable", "--enforce-node-allocatable", kc.EnforceNodeAllocatable) diff --git a/pkg/kubelet/apis/config/zz_generated.deepcopy.go b/pkg/kubelet/apis/config/zz_generated.deepcopy.go index dc2df3bcee4..b4ab86f64dd 100644 --- a/pkg/kubelet/apis/config/zz_generated.deepcopy.go +++ b/pkg/kubelet/apis/config/zz_generated.deepcopy.go @@ -204,6 +204,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) { out.ImageMinimumGCAge = in.ImageMinimumGCAge out.ImageMaximumGCAge = in.ImageMaximumGCAge out.VolumeStatsAggPeriod = in.VolumeStatsAggPeriod + if in.SingleProcessOOMKill != nil { + in, out := &in.SingleProcessOOMKill, &out.SingleProcessOOMKill + *out = new(bool) + **out = **in + } if in.CPUManagerPolicyOptions != nil { in, out := &in.CPUManagerPolicyOptions, &out.CPUManagerPolicyOptions *out = make(map[string]string, len(*in)) diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 8c10db99cf7..0fe7fe5b8da 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -45,6 +45,7 @@ import ( v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" utilfs "k8s.io/kubernetes/pkg/util/filesystem" netutils "k8s.io/utils/net" + "k8s.io/utils/ptr" inuserns "github.com/moby/sys/userns" v1 "k8s.io/api/core/v1" @@ -661,6 +662,20 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, klet.podCache, ) + var singleProcessOOMKill *bool + if sysruntime.GOOS == "linux" { + if !util.IsCgroup2UnifiedMode() { + // This is a default behavior for cgroups v1. 
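+ // cgroup v1 has no memory.oom.group, so per-process OOM kills are the only possible behavior; force the value to true.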
+ singleProcessOOMKill = ptr.To(true) + } else { + if kubeCfg.SingleProcessOOMKill == nil { + singleProcessOOMKill = ptr.To(false) + } else { + singleProcessOOMKill = kubeCfg.SingleProcessOOMKill + } + } + } + runtime, err := kuberuntime.NewKubeGenericRuntimeManager( kubecontainer.FilterEventRecorder(kubeDeps.Recorder), klet.livenessManager, @@ -680,6 +695,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, int(kubeCfg.RegistryBurst), imageCredentialProviderConfigFile, imageCredentialProviderBinDir, + singleProcessOOMKill, kubeCfg.CPUCFSQuota, kubeCfg.CPUCFSQuotaPeriod, kubeDeps.RemoteRuntimeService, diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go index 70e05796b21..a13d407ab03 100644 --- a/pkg/kubelet/kubelet_test.go +++ b/pkg/kubelet/kubelet_test.go @@ -3225,6 +3225,7 @@ func TestSyncPodSpans(t *testing.T) { int(kubeCfg.RegistryBurst), "", "", + nil, kubeCfg.CPUCFSQuota, kubeCfg.CPUCFSQuotaPeriod, runtimeSvc, diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go index acc2238e39d..61dba0e736c 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go @@ -37,6 +37,7 @@ import ( utilfeature "k8s.io/apiserver/pkg/util/feature" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" "k8s.io/klog/v2" + v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" kubefeatures "k8s.io/kubernetes/pkg/features" @@ -45,6 +46,7 @@ import ( "k8s.io/kubernetes/pkg/kubelet/qos" kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" cgroups "k8s.io/kubernetes/third_party/forked/cgroups" + "k8s.io/utils/ptr" ) var defaultPageSize = int64(os.Getpagesize()) @@ -247,7 +249,7 @@ func (m *kubeGenericRuntimeManager) calculateLinuxResources(cpuRequest, cpuLimit } // runc requires cgroupv2 for unified mode - if isCgroup2UnifiedMode() { + if isCgroup2UnifiedMode() && !ptr.Deref(m.singleProcessOOMKill, true) { resources.Unified = map[string]string{ // Ask the kernel to kill all processes in the container cgroup in case of OOM. 
// See memory.oom.group in https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html for diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go index 37f0cf3562c..b9425b61efa 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go @@ -249,12 +249,13 @@ func TestCalculateLinuxResources(t *testing.T) { } tests := []struct { - name string - cpuReq *resource.Quantity - cpuLim *resource.Quantity - memLim *resource.Quantity - expected *runtimeapi.LinuxContainerResources - cgroupVersion CgroupVersion + name string + cpuReq *resource.Quantity + cpuLim *resource.Quantity + memLim *resource.Quantity + expected *runtimeapi.LinuxContainerResources + cgroupVersion CgroupVersion + singleProcessOOMKill bool }{ { name: "Request128MBLimit256MB", @@ -321,6 +322,20 @@ func TestCalculateLinuxResources(t *testing.T) { }, cgroupVersion: cgroupV2, }, + { + name: "Request128MBLimit256MBSingleProcess", + cpuReq: generateResourceQuantity("1"), + cpuLim: generateResourceQuantity("2"), + memLim: generateResourceQuantity("128Mi"), + expected: &runtimeapi.LinuxContainerResources{ + CpuPeriod: 100000, + CpuQuota: 200000, + CpuShares: 1024, + MemoryLimitInBytes: 134217728, + }, + cgroupVersion: cgroupV2, + singleProcessOOMKill: true, + }, { name: "RequestNoMemory", cpuReq: generateResourceQuantity("2"), @@ -365,6 +380,7 @@ func TestCalculateLinuxResources(t *testing.T) { } for _, test := range tests { setCgroupVersionDuringTest(test.cgroupVersion) + m.singleProcessOOMKill = ptr.To(test.singleProcessOOMKill) linuxContainerResources := m.calculateLinuxResources(test.cpuReq, test.cpuLim, test.memLim) assert.Equal(t, test.expected, linuxContainerResources) } @@ -808,16 +824,18 @@ func TestGenerateLinuxContainerResources(t *testing.T) { } for _, tc := range []struct { - name string - limits v1.ResourceList - requests v1.ResourceList - expected *runtimeapi.LinuxContainerResources - cgroupVersion CgroupVersion + name string + limits v1.ResourceList + requests v1.ResourceList + singleProcessOOMKill bool + expected *runtimeapi.LinuxContainerResources + cgroupVersion CgroupVersion }{ { "requests & limits, cpu & memory, guaranteed qos", v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + true, &runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 524288000, OomScoreAdj: -997}, cgroupV1, }, @@ -825,6 +843,7 @@ func TestGenerateLinuxContainerResources(t *testing.T) { "requests & limits, cpu & memory, burstable qos", v1.ResourceList{v1.ResourceCPU: resource.MustParse("500m"), v1.ResourceMemory: resource.MustParse("750Mi")}, v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + true, &runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 786432000, OomScoreAdj: 970}, cgroupV1, }, @@ -832,6 +851,7 @@ func TestGenerateLinuxContainerResources(t *testing.T) { "best-effort qos", nil, nil, + true, &runtimeapi.LinuxContainerResources{CpuShares: 2, OomScoreAdj: 1000}, cgroupV1, }, @@ -839,6 +859,7 @@ func TestGenerateLinuxContainerResources(t *testing.T) { "requests & limits, cpu & memory, guaranteed qos", v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, 
v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + false, &runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 524288000, OomScoreAdj: -997, Unified: map[string]string{"memory.oom.group": "1"}}, cgroupV2, }, @@ -846,6 +867,7 @@ func TestGenerateLinuxContainerResources(t *testing.T) { "requests & limits, cpu & memory, burstable qos", v1.ResourceList{v1.ResourceCPU: resource.MustParse("500m"), v1.ResourceMemory: resource.MustParse("750Mi")}, v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + false, &runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 786432000, OomScoreAdj: 970, Unified: map[string]string{"memory.oom.group": "1"}}, cgroupV2, }, @@ -853,6 +875,7 @@ func TestGenerateLinuxContainerResources(t *testing.T) { "best-effort qos", nil, nil, + false, &runtimeapi.LinuxContainerResources{CpuShares: 2, OomScoreAdj: 1000, Unified: map[string]string{"memory.oom.group": "1"}}, cgroupV2, }, @@ -863,6 +886,8 @@ func TestGenerateLinuxContainerResources(t *testing.T) { pod.Spec.Containers[0].Resources = v1.ResourceRequirements{Limits: tc.limits, Requests: tc.requests} + m.singleProcessOOMKill = ptr.To(tc.singleProcessOOMKill) + resources := m.generateLinuxContainerResources(pod, &pod.Spec.Containers[0], false) tc.expected.HugepageLimits = resources.HugepageLimits assert.Equal(t, tc.expected, resources) diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager.go b/pkg/kubelet/kuberuntime/kuberuntime_manager.go index 0c7a9b24c63..189156055ab 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_manager.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_manager.go @@ -118,6 +118,11 @@ type kubeGenericRuntimeManager struct { readinessManager proberesults.Manager startupManager proberesults.Manager + // If false, pass "memory.oom.group" to container cgroups when using cgroups v2 to cause processes + // in those cgroups to be killed as a unit by the OOM killer. 
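+ // If true, memory.oom.group is left unset and processes in the container are OOM killed individually (cgroup v1-like behavior).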
+ // It must be nil except for linux + singleProcessOOMKill *bool + // If true, enforce container cpu limits with CFS quota support cpuCFSQuota bool @@ -198,6 +203,7 @@ func NewKubeGenericRuntimeManager( imagePullBurst int, imageCredentialProviderConfigFile string, imageCredentialProviderBinDir string, + singleProcessOOMKill *bool, cpuCFSQuota bool, cpuCFSQuotaPeriod metav1.Duration, runtimeService internalapi.RuntimeService, @@ -218,6 +224,7 @@ func NewKubeGenericRuntimeManager( tracer := tracerProvider.Tracer(instrumentationScope) kubeRuntimeManager := &kubeGenericRuntimeManager{ recorder: recorder, + singleProcessOOMKill: singleProcessOOMKill, cpuCFSQuota: cpuCFSQuota, cpuCFSQuotaPeriod: cpuCFSQuotaPeriod, seccompProfileRoot: filepath.Join(rootDirectory, "seccomp"), diff --git a/pkg/kubelet/kuberuntime/kuberuntime_sandbox_linux_test.go b/pkg/kubelet/kuberuntime/kuberuntime_sandbox_linux_test.go index 6d87e946899..3c5159a34eb 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_sandbox_linux_test.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_sandbox_linux_test.go @@ -28,11 +28,13 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" + "k8s.io/utils/ptr" ) func TestApplySandboxResources(t *testing.T) { _, _, m, err := createTestRuntimeManager() m.cpuCFSQuota = true + m.singleProcessOOMKill = ptr.To(false) config := &runtimeapi.PodSandboxConfig{ Linux: &runtimeapi.LinuxPodSandboxConfig{}, diff --git a/pkg/kubelet/util/util_linux.go b/pkg/kubelet/util/util_linux.go new file mode 100644 index 00000000000..56b9b920fd8 --- /dev/null +++ b/pkg/kubelet/util/util_linux.go @@ -0,0 +1,29 @@ +//go:build linux +// +build linux + +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package util + +import ( + libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" +) + +// IsCgroup2UnifiedMode returns true if the cgroup v2 unified mode is enabled +func IsCgroup2UnifiedMode() bool { + return libcontainercgroups.IsCgroup2UnifiedMode() +} diff --git a/pkg/kubelet/util/util_others.go b/pkg/kubelet/util/util_others.go new file mode 100644 index 00000000000..e2e1c71bac6 --- /dev/null +++ b/pkg/kubelet/util/util_others.go @@ -0,0 +1,25 @@ +//go:build !linux && !windows +// +build !linux,!windows + +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package util + +// IsCgroup2UnifiedMode is a no-op for other OSes. 
+func IsCgroup2UnifiedMode() bool { + return false +} diff --git a/pkg/kubelet/util/util_windows.go b/pkg/kubelet/util/util_windows.go index 81852dcf93a..c944a7d22f2 100644 --- a/pkg/kubelet/util/util_windows.go +++ b/pkg/kubelet/util/util_windows.go @@ -73,3 +73,8 @@ func NormalizePath(path string) string { } return path } + +// IsCgroup2UnifiedMode is a no-op for Windows for now +func IsCgroup2UnifiedMode() bool { + return false +} diff --git a/staging/src/k8s.io/kubelet/config/v1beta1/types.go b/staging/src/k8s.io/kubelet/config/v1beta1/types.go index 330b253ca78..d10578f2c9a 100644 --- a/staging/src/k8s.io/kubelet/config/v1beta1/types.go +++ b/staging/src/k8s.io/kubelet/config/v1beta1/types.go @@ -354,6 +354,15 @@ type KubeletConfiguration struct { // Default: "None" // +optional CPUManagerPolicy string `json:"cpuManagerPolicy,omitempty"` + // singleProcessOOMKill, if true, will prevent the `memory.oom.group` flag from being set for container + // cgroups in cgroups v2. This causes processes in the container to be OOM killed individually instead of as + // a group. It means that if true, the behavior aligns with the behavior of cgroups v1. + // The default value is determined automatically when you don't specify. + // On non-linux such as windows, only null / absent is allowed. + // On cgroup v1 linux, only null / absent and true are allowed. + // On cgroup v2 linux, null / absent, true and false are allowed. The default value is false. + // +optional + SingleProcessOOMKill *bool `json:"singleProcessOOMKill,omitempty"` // cpuManagerPolicyOptions is a set of key=value which allows to set extra options // to fine tune the behaviour of the cpu manager policies. // Requires both the "CPUManager" and "CPUManagerPolicyOptions" feature gates to be enabled. 
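Illustrative usage (not part of the patch): with the new v1beta1 field above, an operator can opt a cgroup v2 node back into per-process OOM kills from the KubeletConfiguration. A minimal Go sketch, assuming only the k8s.io/kubelet/config/v1beta1 types added here and k8s.io/utils/ptr:

package main

import (
	"fmt"

	kubeletv1beta1 "k8s.io/kubelet/config/v1beta1"
	"k8s.io/utils/ptr"
)

func main() {
	// Equivalent to setting `singleProcessOOMKill: true` in the kubelet config file.
	// nil means "decide automatically"; false is only valid on cgroup v2 hosts.
	cfg := kubeletv1beta1.KubeletConfiguration{
		SingleProcessOOMKill: ptr.To(true),
	}
	fmt.Printf("singleProcessOOMKill=%v\n", *cfg.SingleProcessOOMKill)
}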
diff --git a/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go b/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go index 613a039a755..0ab6259f98f 100644 --- a/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go +++ b/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go @@ -254,6 +254,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) { *out = new(bool) **out = **in } + if in.SingleProcessOOMKill != nil { + in, out := &in.SingleProcessOOMKill, &out.SingleProcessOOMKill + *out = new(bool) + **out = **in + } if in.CPUManagerPolicyOptions != nil { in, out := &in.CPUManagerPolicyOptions, &out.CPUManagerPolicyOptions *out = make(map[string]string, len(*in)) diff --git a/test/e2e_node/oomkiller_linux_test.go b/test/e2e_node/oomkiller_linux_test.go index db84cb0f0cf..a9e2c4fe8f5 100644 --- a/test/e2e_node/oomkiller_linux_test.go +++ b/test/e2e_node/oomkiller_linux_test.go @@ -19,6 +19,7 @@ package e2enode import ( "context" "fmt" + "time" "github.com/onsi/gomega" v1 "k8s.io/api/core/v1" @@ -35,9 +36,11 @@ import ( ) type testCase struct { - name string - podSpec *v1.Pod - oomTargetContainerName string + name string + podSpec *v1.Pod + oomTargetContainerName string + enableSingleProcessKill *bool + expectPodRunning bool } // KubeReservedMemory is default fraction value of node capacity memory to @@ -62,7 +65,7 @@ var _ = SIGDescribe("OOMKiller for pod using more memory than node allocatable [ } }) -var _ = SIGDescribe("OOMKiller [LinuxOnly]", framework.WithNodeConformance(), func() { +var _ = SIGDescribe("OOMKiller [LinuxOnly]", framework.WithNodeConformance(), framework.WithSerial(), func() { f := framework.NewDefaultFramework("oomkiller-test") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged @@ -89,6 +92,24 @@ var _ = SIGDescribe("OOMKiller [LinuxOnly]", framework.WithNodeConformance(), fu oomTargetContainerName: "oomkill-multi-target-container", podSpec: getOOMTargetPod("oomkill-target-pod", "oomkill-multi-target-container", getOOMTargetContainerMultiProcess), + enableSingleProcessKill: nil, + }) + + testCases = append(testCases, testCase{ + name: "multi process container (single process kill enabled)", + oomTargetContainerName: "oomkill-multi-target-container", + podSpec: getOOMTargetPod("oomkill-target-pod", "oomkill-multi-target-container", + getOOMTargetContainerMultiProcess), + enableSingleProcessKill: ptr.To(true), + expectPodRunning: true, + }) + + testCases = append(testCases, testCase{ + name: "multi process container (single process kill disabled)", + oomTargetContainerName: "oomkill-multi-target-container", + podSpec: getOOMTargetPod("oomkill-target-pod", "oomkill-multi-target-container", + getOOMTargetContainerMultiProcess), + enableSingleProcessKill: ptr.To(false), }) } for _, tc := range testCases { @@ -99,8 +120,8 @@ var _ = SIGDescribe("OOMKiller [LinuxOnly]", framework.WithNodeConformance(), fu func runOomKillerTest(f *framework.Framework, testCase testCase, kubeReservedMemory float64) { ginkgo.Context(testCase.name, func() { // Update KubeReservedMemory in KubeletConfig. 
- if kubeReservedMemory > 0 { - tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { + tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { + if kubeReservedMemory > 0 { if initialConfig.KubeReserved == nil { initialConfig.KubeReserved = map[string]string{} } @@ -109,8 +130,10 @@ func runOomKillerTest(f *framework.Framework, testCase testCase, kubeReservedMem // K8s components such that node allocatable memory is less than node capacity to // observe OOM kills at cgroup level instead of system OOM kills. initialConfig.KubeReserved["memory"] = fmt.Sprintf("%d", int(kubeReservedMemory*getLocalNode(context.TODO(), f).Status.Capacity.Memory().AsApproximateFloat64())) - }) - } + } + + initialConfig.SingleProcessOOMKill = testCase.enableSingleProcessKill + }) ginkgo.BeforeEach(func() { // Precautionary check that kubelet is healthy before running the test. @@ -120,18 +143,37 @@ func runOomKillerTest(f *framework.Framework, testCase testCase, kubeReservedMem e2epod.NewPodClient(f).Create(context.TODO(), testCase.podSpec) }) - ginkgo.It("The containers terminated by OOM killer should have the reason set to OOMKilled", func() { - ginkgo.By("Waiting for the pod to be failed") - err := e2epod.WaitForPodTerminatedInNamespace(context.TODO(), f.ClientSet, testCase.podSpec.Name, "", f.Namespace.Name) - framework.ExpectNoError(err, "Failed waiting for pod to terminate, %s/%s", f.Namespace.Name, testCase.podSpec.Name) + if testCase.expectPodRunning { + ginkgo.It("The containers should not be OOMKilled", func() { + err := e2epod.WaitForPodsRunning(context.TODO(), f.ClientSet, f.Namespace.Name, 1, framework.PodStartTimeout) + framework.ExpectNoError(err, "Failed waiting for pod to be running state, %s/%s", f.Namespace.Name, testCase.podSpec.Name) - ginkgo.By("Fetching the latest pod status") - pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(context.TODO(), testCase.podSpec.Name, metav1.GetOptions{}) - framework.ExpectNoError(err, "Failed to get the recent pod object for name: %q", pod.Name) + gomega.Consistently(context.TODO(), func(ctx context.Context) error { + pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(ctx, testCase.podSpec.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("expected the pod %s to exist: %w", pod.Name, err) + } + phase := pod.Status.Phase + if phase != v1.PodRunning && phase != v1.PodSucceeded { + return fmt.Errorf("pod %s: unexpected status %s, expected status: %s or %s", pod.Name, pod.Status.Phase, v1.PodRunning, v1.PodSucceeded) + } + return nil + }, 10*time.Second, f.Timeouts.Poll).Should(gomega.BeNil()) + }) + } else { + ginkgo.It("The containers terminated by OOM killer should have the reason set to OOMKilled", func() { + ginkgo.By("Waiting for the pod to be failed") + err := e2epod.WaitForPodTerminatedInNamespace(context.TODO(), f.ClientSet, testCase.podSpec.Name, "", f.Namespace.Name) + framework.ExpectNoError(err, "Failed waiting for pod to terminate, %s/%s", f.Namespace.Name, testCase.podSpec.Name) - ginkgo.By("Verifying the OOM target container has the expected reason") - verifyReasonForOOMKilledContainer(pod, testCase.oomTargetContainerName) - }) + ginkgo.By("Fetching the latest pod status") + pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(context.TODO(), testCase.podSpec.Name, metav1.GetOptions{}) + framework.ExpectNoError(err, "Failed to get the recent pod object for name: %q", pod.Name) + + 
ginkgo.By("Verifying the OOM target container has the expected reason") + verifyReasonForOOMKilledContainer(pod, testCase.oomTargetContainerName) + }) + } ginkgo.AfterEach(func() { ginkgo.By(fmt.Sprintf("deleting pod: %s", testCase.podSpec.Name))