From 4f909c14a0b32dba90d5c28f2937964aaf20677a Mon Sep 17 00:00:00 2001 From: utam0k Date: Mon, 14 Oct 2024 14:46:28 +0900 Subject: [PATCH] kubelet: new kubelet config option for disabling group oom kill Signed-off-by: utam0k --- cmd/kubelet/app/server.go | 4 +- cmd/kubelet/app/server_linux.go | 6 -- pkg/generated/openapi/zz_generated.openapi.go | 7 ++ pkg/kubelet/apis/config/fuzzer/fuzzer.go | 9 ++- pkg/kubelet/apis/config/helpers_test.go | 1 + pkg/kubelet/apis/config/types.go | 4 + .../apis/config/v1beta1/defaults_test.go | 9 ++- .../config/v1beta1/zz_generated.conversion.go | 2 + .../config/validation/validation_linux.go | 8 +- .../config/validation/validation_others.go | 6 ++ .../apis/config/validation/validation_test.go | 2 + .../config/validation/validation_windows.go | 7 +- .../apis/config/zz_generated.deepcopy.go | 5 ++ pkg/kubelet/kubelet.go | 16 ++++ pkg/kubelet/kubelet_test.go | 1 + .../kuberuntime_container_linux.go | 4 +- .../kuberuntime_container_linux_test.go | 47 ++++++++--- .../kuberuntime/kuberuntime_manager.go | 7 ++ .../kuberuntime_sandbox_linux_test.go | 2 + pkg/kubelet/util/util_linux.go | 29 +++++++ pkg/kubelet/util/util_others.go | 25 ++++++ pkg/kubelet/util/util_windows.go | 5 ++ .../k8s.io/kubelet/config/v1beta1/types.go | 9 +++ .../config/v1beta1/zz_generated.deepcopy.go | 5 ++ test/e2e_node/oomkiller_linux_test.go | 78 ++++++++++++++----- 25 files changed, 253 insertions(+), 45 deletions(-) create mode 100644 pkg/kubelet/util/util_linux.go create mode 100644 pkg/kubelet/util/util_others.go diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go index 9cff6743de5..4eae74a54d4 100644 --- a/cmd/kubelet/app/server.go +++ b/cmd/kubelet/app/server.go @@ -606,7 +606,7 @@ func run(ctx context.Context, s *options.KubeletServer, kubeDeps *kubelet.Depend // Warn if MemoryQoS enabled with cgroups v1 if utilfeature.DefaultFeatureGate.Enabled(features.MemoryQoS) && - !isCgroup2UnifiedMode() { + !kubeletutil.IsCgroup2UnifiedMode() { klog.InfoS("Warning: MemoryQoS feature only works with cgroups v2 on Linux, but enabled with cgroups v1") } // Obtain Kubelet Lock File @@ -831,7 +831,7 @@ func run(ctx context.Context, s *options.KubeletServer, kubeDeps *kubelet.Depend s.TopologyManagerPolicyOptions, features.TopologyManagerPolicyOptions) } if utilfeature.DefaultFeatureGate.Enabled(features.NodeSwap) { - if !isCgroup2UnifiedMode() && s.MemorySwap.SwapBehavior == kubelettypes.LimitedSwap { + if !kubeletutil.IsCgroup2UnifiedMode() && s.MemorySwap.SwapBehavior == kubelettypes.LimitedSwap { // This feature is not supported for cgroupv1 so we are failing early. 
return fmt.Errorf("swap feature is enabled and LimitedSwap but it is only supported with cgroupv2") } diff --git a/cmd/kubelet/app/server_linux.go b/cmd/kubelet/app/server_linux.go index 473c39a33f7..00c23e30da7 100644 --- a/cmd/kubelet/app/server_linux.go +++ b/cmd/kubelet/app/server_linux.go @@ -19,8 +19,6 @@ package app import ( "k8s.io/klog/v2" "k8s.io/utils/inotify" - - libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" ) func watchForLockfileContention(path string, done chan struct{}) error { @@ -46,7 +44,3 @@ func watchForLockfileContention(path string, done chan struct{}) error { }() return nil } - -func isCgroup2UnifiedMode() bool { - return libcontainercgroups.IsCgroup2UnifiedMode() -} diff --git a/pkg/generated/openapi/zz_generated.openapi.go b/pkg/generated/openapi/zz_generated.openapi.go index 41e0b0fa592..8a9d61e1b62 100644 --- a/pkg/generated/openapi/zz_generated.openapi.go +++ b/pkg/generated/openapi/zz_generated.openapi.go @@ -61947,6 +61947,13 @@ func schema_k8sio_kubelet_config_v1beta1_KubeletConfiguration(ref common.Referen Format: "", }, }, + "singleProcessOOMKill": { + SchemaProps: spec.SchemaProps{ + Description: "singleProcessOOMKill, if true, will prevent the `memory.oom.group` flag from being set for container cgroups in cgroups v2. This causes processes in the container to be OOM killed individually instead of as a group. It means that if true, the behavior aligns with the behavior of cgroups v1. The default value is determined automatically when you don't specify. On non-linux such as windows, only null / absent is allowed. On cgroup v1 linux, only null / absent and true are allowed. On cgroup v2 linux, null / absent, true and false are allowed. The default value is false.", + Type: []string{"boolean"}, + Format: "", + }, + }, "cpuManagerPolicyOptions": { SchemaProps: spec.SchemaProps{ Description: "cpuManagerPolicyOptions is a set of key=value which \tallows to set extra options to fine tune the behaviour of the cpu manager policies. Requires both the \"CPUManager\" and \"CPUManagerPolicyOptions\" feature gates to be enabled. 
Default: nil", diff --git a/pkg/kubelet/apis/config/fuzzer/fuzzer.go b/pkg/kubelet/apis/config/fuzzer/fuzzer.go index bf21a4eae00..efa8120df3a 100644 --- a/pkg/kubelet/apis/config/fuzzer/fuzzer.go +++ b/pkg/kubelet/apis/config/fuzzer/fuzzer.go @@ -73,12 +73,12 @@ func Funcs(codecs runtimeserializer.CodecFactory) []interface{} { obj.NodeStatusReportFrequency = metav1.Duration{Duration: time.Minute} obj.NodeLeaseDurationSeconds = 40 obj.CPUManagerPolicy = "none" - obj.CPUManagerPolicyOptions = make(map[string]string) + obj.CPUManagerPolicyOptions = nil obj.CPUManagerReconcilePeriod = obj.NodeStatusUpdateFrequency obj.NodeStatusMaxImages = 50 obj.TopologyManagerPolicy = kubeletconfig.NoneTopologyManagerPolicy obj.TopologyManagerScope = kubeletconfig.ContainerTopologyManagerScope - obj.TopologyManagerPolicyOptions = make(map[string]string) + obj.TopologyManagerPolicyOptions = nil obj.QOSReserved = map[string]string{ "memory": "50%", } @@ -104,13 +104,14 @@ func Funcs(codecs runtimeserializer.CodecFactory) []interface{} { obj.CgroupsPerQOS = true obj.CgroupDriver = "cgroupfs" obj.EnforceNodeAllocatable = kubeletconfigv1beta1.DefaultNodeAllocatableEnforcement - obj.StaticPodURLHeader = make(map[string][]string) + obj.StaticPodURLHeader = nil + obj.SingleProcessOOMKill = ptr.To(false) obj.ContainerLogMaxFiles = 5 obj.ContainerLogMaxSize = "10Mi" obj.ContainerLogMaxWorkers = 1 obj.ContainerLogMonitorInterval = metav1.Duration{Duration: 10 * time.Second} obj.ConfigMapAndSecretChangeDetectionStrategy = "Watch" - obj.AllowedUnsafeSysctls = []string{} + obj.AllowedUnsafeSysctls = nil obj.VolumePluginDir = kubeletconfigv1beta1.DefaultVolumePluginDir obj.ContainerRuntimeEndpoint = "unix:///run/containerd/containerd.sock" diff --git a/pkg/kubelet/apis/config/helpers_test.go b/pkg/kubelet/apis/config/helpers_test.go index 7e7ca3fdd21..c601cde5daa 100644 --- a/pkg/kubelet/apis/config/helpers_test.go +++ b/pkg/kubelet/apis/config/helpers_test.go @@ -233,6 +233,7 @@ var ( "Logging.Options.Text.OutputRoutingOptions.SplitStream", "Logging.VModule[*].FilePattern", "Logging.VModule[*].Verbosity", + "SingleProcessOOMKill", "Logging.Verbosity", "TLSCipherSuites[*]", "TLSMinVersion", diff --git a/pkg/kubelet/apis/config/types.go b/pkg/kubelet/apis/config/types.go index 9fe3528e20c..4d76687dd05 100644 --- a/pkg/kubelet/apis/config/types.go +++ b/pkg/kubelet/apis/config/types.go @@ -229,6 +229,10 @@ type KubeletConfiguration struct { CgroupsPerQOS bool // driver that the kubelet uses to manipulate cgroups on the host (cgroupfs or systemd) CgroupDriver string + // SingleProcessOOMKill, if true, will prevent the `memory.oom.group` flag from being set for container + // cgroups in cgroups v2. This causes processes in the container to be OOM killed individually instead of as + // a group. It means that if true, the behavior aligns with the behavior of cgroups v1. + SingleProcessOOMKill *bool // CPUManagerPolicy is the name of the policy to use. // Requires the CPUManager feature gate to be enabled. 
CPUManagerPolicy string diff --git a/pkg/kubelet/apis/config/v1beta1/defaults_test.go b/pkg/kubelet/apis/config/v1beta1/defaults_test.go index 2f68e6b7461..808980a41b4 100644 --- a/pkg/kubelet/apis/config/v1beta1/defaults_test.go +++ b/pkg/kubelet/apis/config/v1beta1/defaults_test.go @@ -34,7 +34,6 @@ import ( ) func TestSetDefaultsKubeletConfiguration(t *testing.T) { - tests := []struct { name string config *v1beta1.KubeletConfiguration @@ -130,6 +129,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { RegisterNode: ptr.To(true), LocalStorageCapacityIsolation: ptr.To(true), PodLogsDir: DefaultPodLogsDir, + SingleProcessOOMKill: nil, }, }, { @@ -261,6 +261,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { RegisterNode: ptr.To(false), LocalStorageCapacityIsolation: ptr.To(false), PodLogsDir: "", + SingleProcessOOMKill: ptr.To(false), }, &v1beta1.KubeletConfiguration{ EnableServer: ptr.To(false), @@ -363,6 +364,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { RegisterNode: ptr.To(false), LocalStorageCapacityIsolation: ptr.To(false), PodLogsDir: DefaultPodLogsDir, + SingleProcessOOMKill: ptr.To(false), }, }, { @@ -516,6 +518,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { RegisterNode: ptr.To(true), LocalStorageCapacityIsolation: ptr.To(true), PodLogsDir: "/custom/path", + SingleProcessOOMKill: ptr.To(true), }, &v1beta1.KubeletConfiguration{ EnableServer: ptr.To(true), @@ -666,6 +669,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { RegisterNode: ptr.To(true), LocalStorageCapacityIsolation: ptr.To(true), PodLogsDir: "/custom/path", + SingleProcessOOMKill: ptr.To(true), }, }, { @@ -759,6 +763,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { RegisterNode: ptr.To(true), LocalStorageCapacityIsolation: ptr.To(true), PodLogsDir: DefaultPodLogsDir, + SingleProcessOOMKill: nil, }, }, { @@ -852,6 +857,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { RegisterNode: ptr.To(true), LocalStorageCapacityIsolation: ptr.To(true), PodLogsDir: DefaultPodLogsDir, + SingleProcessOOMKill: nil, }, }, { @@ -945,6 +951,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) { RegisterNode: ptr.To(true), LocalStorageCapacityIsolation: ptr.To(true), PodLogsDir: DefaultPodLogsDir, + SingleProcessOOMKill: nil, }, }, } diff --git a/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go b/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go index 731f9087fcf..2b904b093b3 100644 --- a/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go +++ b/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go @@ -409,6 +409,7 @@ func autoConvert_v1beta1_KubeletConfiguration_To_config_KubeletConfiguration(in } out.CgroupDriver = in.CgroupDriver out.CPUManagerPolicy = in.CPUManagerPolicy + out.SingleProcessOOMKill = (*bool)(unsafe.Pointer(in.SingleProcessOOMKill)) out.CPUManagerPolicyOptions = *(*map[string]string)(unsafe.Pointer(&in.CPUManagerPolicyOptions)) out.CPUManagerReconcilePeriod = in.CPUManagerReconcilePeriod out.MemoryManagerPolicy = in.MemoryManagerPolicy @@ -606,6 +607,7 @@ func autoConvert_config_KubeletConfiguration_To_v1beta1_KubeletConfiguration(in return err } out.CgroupDriver = in.CgroupDriver + out.SingleProcessOOMKill = (*bool)(unsafe.Pointer(in.SingleProcessOOMKill)) out.CPUManagerPolicy = in.CPUManagerPolicy out.CPUManagerPolicyOptions = *(*map[string]string)(unsafe.Pointer(&in.CPUManagerPolicyOptions)) out.CPUManagerReconcilePeriod = in.CPUManagerReconcilePeriod diff --git 
a/pkg/kubelet/apis/config/validation/validation_linux.go b/pkg/kubelet/apis/config/validation/validation_linux.go index 97407b7fbe6..a8412dcb4fc 100644 --- a/pkg/kubelet/apis/config/validation/validation_linux.go +++ b/pkg/kubelet/apis/config/validation/validation_linux.go @@ -24,13 +24,19 @@ import ( libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" + "k8s.io/utils/ptr" ) // validateKubeletOSConfiguration validates os specific kubelet configuration and returns an error if it is invalid. func validateKubeletOSConfiguration(kc *kubeletconfig.KubeletConfiguration) error { - if kc.FailCgroupV1 && !libcontainercgroups.IsCgroup2UnifiedMode() { + isCgroup1 := !libcontainercgroups.IsCgroup2UnifiedMode() + if kc.FailCgroupV1 && isCgroup1 { return fmt.Errorf("kubelet is configured to not run on a host using cgroup v1. cgroup v1 support is in maintenance mode") } + if isCgroup1 && kc.SingleProcessOOMKill != nil && !ptr.Deref(kc.SingleProcessOOMKill, true) { + return fmt.Errorf("invalid configuration: singleProcessOOMKill must not be explicitly set to false when using cgroup v1") + } + return nil } diff --git a/pkg/kubelet/apis/config/validation/validation_others.go b/pkg/kubelet/apis/config/validation/validation_others.go index c50143116c2..e019421398e 100644 --- a/pkg/kubelet/apis/config/validation/validation_others.go +++ b/pkg/kubelet/apis/config/validation/validation_others.go @@ -20,10 +20,16 @@ limitations under the License. package validation import ( + "fmt" + kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" ) // validateKubeletOSConfiguration validates os specific kubelet configuration and returns an error if it is invalid. func validateKubeletOSConfiguration(kc *kubeletconfig.KubeletConfiguration) error { + if kc.SingleProcessOOMKill != nil { + return fmt.Errorf("invalid configuration: singleProcessOOMKill is only supported on linux") + } + return nil } diff --git a/pkg/kubelet/apis/config/validation/validation_test.go b/pkg/kubelet/apis/config/validation/validation_test.go index a475478c221..ff17f53aa16 100644 --- a/pkg/kubelet/apis/config/validation/validation_test.go +++ b/pkg/kubelet/apis/config/validation/validation_test.go @@ -29,6 +29,7 @@ import ( kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" "k8s.io/kubernetes/pkg/kubelet/apis/config/validation" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" + kubeletutil "k8s.io/kubernetes/pkg/kubelet/util" "k8s.io/utils/ptr" ) @@ -78,6 +79,7 @@ var ( ContainerRuntimeEndpoint: "unix:///run/containerd/containerd.sock", ContainerLogMaxWorkers: 1, ContainerLogMonitorInterval: metav1.Duration{Duration: 10 * time.Second}, + SingleProcessOOMKill: ptr.To(!kubeletutil.IsCgroup2UnifiedMode()), } ) diff --git a/pkg/kubelet/apis/config/validation/validation_windows.go b/pkg/kubelet/apis/config/validation/validation_windows.go index 325b3cbab1a..65765fe0db5 100644 --- a/pkg/kubelet/apis/config/validation/validation_windows.go +++ b/pkg/kubelet/apis/config/validation/validation_windows.go @@ -20,9 +20,10 @@ limitations under the License. 
package validation import ( - "k8s.io/klog/v2" + "fmt" "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/klog/v2" kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" @@ -36,6 +37,10 @@ func validateKubeletOSConfiguration(kc *kubeletconfig.KubeletConfiguration) erro klog.Warningf(message, "CgroupsPerQOS", "--cgroups-per-qos", kc.CgroupsPerQOS) } + if kc.SingleProcessOOMKill != nil { + return fmt.Errorf("invalid configuration: singleProcessOOMKill is not supported on Windows") + } + enforceNodeAllocatableWithoutNone := sets.New(kc.EnforceNodeAllocatable...).Delete(kubetypes.NodeAllocatableNoneKey) if len(enforceNodeAllocatableWithoutNone) > 0 { klog.Warningf(message, "EnforceNodeAllocatable", "--enforce-node-allocatable", kc.EnforceNodeAllocatable) diff --git a/pkg/kubelet/apis/config/zz_generated.deepcopy.go b/pkg/kubelet/apis/config/zz_generated.deepcopy.go index dc2df3bcee4..b4ab86f64dd 100644 --- a/pkg/kubelet/apis/config/zz_generated.deepcopy.go +++ b/pkg/kubelet/apis/config/zz_generated.deepcopy.go @@ -204,6 +204,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) { out.ImageMinimumGCAge = in.ImageMinimumGCAge out.ImageMaximumGCAge = in.ImageMaximumGCAge out.VolumeStatsAggPeriod = in.VolumeStatsAggPeriod + if in.SingleProcessOOMKill != nil { + in, out := &in.SingleProcessOOMKill, &out.SingleProcessOOMKill + *out = new(bool) + **out = **in + } if in.CPUManagerPolicyOptions != nil { in, out := &in.CPUManagerPolicyOptions, &out.CPUManagerPolicyOptions *out = make(map[string]string, len(*in)) diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 8c10db99cf7..0fe7fe5b8da 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -45,6 +45,7 @@ import ( v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" utilfs "k8s.io/kubernetes/pkg/util/filesystem" netutils "k8s.io/utils/net" + "k8s.io/utils/ptr" inuserns "github.com/moby/sys/userns" v1 "k8s.io/api/core/v1" @@ -661,6 +662,20 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, klet.podCache, ) + var singleProcessOOMKill *bool + if sysruntime.GOOS == "linux" { + if !util.IsCgroup2UnifiedMode() { + // This is a default behavior for cgroups v1. 
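+ // cgroup v1 has no memory.oom.group, so per-process OOM kills are the only possible behavior; force the value to true.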
+ singleProcessOOMKill = ptr.To(true) + } else { + if kubeCfg.SingleProcessOOMKill == nil { + singleProcessOOMKill = ptr.To(false) + } else { + singleProcessOOMKill = kubeCfg.SingleProcessOOMKill + } + } + } + runtime, err := kuberuntime.NewKubeGenericRuntimeManager( kubecontainer.FilterEventRecorder(kubeDeps.Recorder), klet.livenessManager, @@ -680,6 +695,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, int(kubeCfg.RegistryBurst), imageCredentialProviderConfigFile, imageCredentialProviderBinDir, + singleProcessOOMKill, kubeCfg.CPUCFSQuota, kubeCfg.CPUCFSQuotaPeriod, kubeDeps.RemoteRuntimeService, diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go index 70e05796b21..a13d407ab03 100644 --- a/pkg/kubelet/kubelet_test.go +++ b/pkg/kubelet/kubelet_test.go @@ -3225,6 +3225,7 @@ func TestSyncPodSpans(t *testing.T) { int(kubeCfg.RegistryBurst), "", "", + nil, kubeCfg.CPUCFSQuota, kubeCfg.CPUCFSQuotaPeriod, runtimeSvc, diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go index acc2238e39d..61dba0e736c 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go @@ -37,6 +37,7 @@ import ( utilfeature "k8s.io/apiserver/pkg/util/feature" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" "k8s.io/klog/v2" + v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" kubefeatures "k8s.io/kubernetes/pkg/features" @@ -45,6 +46,7 @@ import ( "k8s.io/kubernetes/pkg/kubelet/qos" kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" cgroups "k8s.io/kubernetes/third_party/forked/cgroups" + "k8s.io/utils/ptr" ) var defaultPageSize = int64(os.Getpagesize()) @@ -247,7 +249,7 @@ func (m *kubeGenericRuntimeManager) calculateLinuxResources(cpuRequest, cpuLimit } // runc requires cgroupv2 for unified mode - if isCgroup2UnifiedMode() { + if isCgroup2UnifiedMode() && !ptr.Deref(m.singleProcessOOMKill, true) { resources.Unified = map[string]string{ // Ask the kernel to kill all processes in the container cgroup in case of OOM. 
// See memory.oom.group in https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html for diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go index 37f0cf3562c..b9425b61efa 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go @@ -249,12 +249,13 @@ func TestCalculateLinuxResources(t *testing.T) { } tests := []struct { - name string - cpuReq *resource.Quantity - cpuLim *resource.Quantity - memLim *resource.Quantity - expected *runtimeapi.LinuxContainerResources - cgroupVersion CgroupVersion + name string + cpuReq *resource.Quantity + cpuLim *resource.Quantity + memLim *resource.Quantity + expected *runtimeapi.LinuxContainerResources + cgroupVersion CgroupVersion + singleProcessOOMKill bool }{ { name: "Request128MBLimit256MB", @@ -321,6 +322,20 @@ func TestCalculateLinuxResources(t *testing.T) { }, cgroupVersion: cgroupV2, }, + { + name: "Request128MBLimit256MBSingleProcess", + cpuReq: generateResourceQuantity("1"), + cpuLim: generateResourceQuantity("2"), + memLim: generateResourceQuantity("128Mi"), + expected: &runtimeapi.LinuxContainerResources{ + CpuPeriod: 100000, + CpuQuota: 200000, + CpuShares: 1024, + MemoryLimitInBytes: 134217728, + }, + cgroupVersion: cgroupV2, + singleProcessOOMKill: true, + }, { name: "RequestNoMemory", cpuReq: generateResourceQuantity("2"), @@ -365,6 +380,7 @@ func TestCalculateLinuxResources(t *testing.T) { } for _, test := range tests { setCgroupVersionDuringTest(test.cgroupVersion) + m.singleProcessOOMKill = ptr.To(test.singleProcessOOMKill) linuxContainerResources := m.calculateLinuxResources(test.cpuReq, test.cpuLim, test.memLim) assert.Equal(t, test.expected, linuxContainerResources) } @@ -808,16 +824,18 @@ func TestGenerateLinuxContainerResources(t *testing.T) { } for _, tc := range []struct { - name string - limits v1.ResourceList - requests v1.ResourceList - expected *runtimeapi.LinuxContainerResources - cgroupVersion CgroupVersion + name string + limits v1.ResourceList + requests v1.ResourceList + singleProcessOOMKill bool + expected *runtimeapi.LinuxContainerResources + cgroupVersion CgroupVersion }{ { "requests & limits, cpu & memory, guaranteed qos", v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + true, &runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 524288000, OomScoreAdj: -997}, cgroupV1, }, @@ -825,6 +843,7 @@ func TestGenerateLinuxContainerResources(t *testing.T) { "requests & limits, cpu & memory, burstable qos", v1.ResourceList{v1.ResourceCPU: resource.MustParse("500m"), v1.ResourceMemory: resource.MustParse("750Mi")}, v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + true, &runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 786432000, OomScoreAdj: 970}, cgroupV1, }, @@ -832,6 +851,7 @@ func TestGenerateLinuxContainerResources(t *testing.T) { "best-effort qos", nil, nil, + true, &runtimeapi.LinuxContainerResources{CpuShares: 2, OomScoreAdj: 1000}, cgroupV1, }, @@ -839,6 +859,7 @@ func TestGenerateLinuxContainerResources(t *testing.T) { "requests & limits, cpu & memory, guaranteed qos", v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, 
v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + false, &runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 524288000, OomScoreAdj: -997, Unified: map[string]string{"memory.oom.group": "1"}}, cgroupV2, }, @@ -846,6 +867,7 @@ func TestGenerateLinuxContainerResources(t *testing.T) { "requests & limits, cpu & memory, burstable qos", v1.ResourceList{v1.ResourceCPU: resource.MustParse("500m"), v1.ResourceMemory: resource.MustParse("750Mi")}, v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")}, + false, &runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 786432000, OomScoreAdj: 970, Unified: map[string]string{"memory.oom.group": "1"}}, cgroupV2, }, @@ -853,6 +875,7 @@ func TestGenerateLinuxContainerResources(t *testing.T) { "best-effort qos", nil, nil, + false, &runtimeapi.LinuxContainerResources{CpuShares: 2, OomScoreAdj: 1000, Unified: map[string]string{"memory.oom.group": "1"}}, cgroupV2, }, @@ -863,6 +886,8 @@ func TestGenerateLinuxContainerResources(t *testing.T) { pod.Spec.Containers[0].Resources = v1.ResourceRequirements{Limits: tc.limits, Requests: tc.requests} + m.singleProcessOOMKill = ptr.To(tc.singleProcessOOMKill) + resources := m.generateLinuxContainerResources(pod, &pod.Spec.Containers[0], false) tc.expected.HugepageLimits = resources.HugepageLimits assert.Equal(t, tc.expected, resources) diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager.go b/pkg/kubelet/kuberuntime/kuberuntime_manager.go index 0c7a9b24c63..189156055ab 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_manager.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_manager.go @@ -118,6 +118,11 @@ type kubeGenericRuntimeManager struct { readinessManager proberesults.Manager startupManager proberesults.Manager + // If false, pass "memory.oom.group" to container cgroups when using cgroups v2 to cause processes + // in those cgroups to be killed as a unit by the OOM killer. 
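+ // If true, memory.oom.group is left unset and processes in the container are OOM killed individually (cgroup v1-like behavior).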
+ // It must be nil except for linux + singleProcessOOMKill *bool + // If true, enforce container cpu limits with CFS quota support cpuCFSQuota bool @@ -198,6 +203,7 @@ func NewKubeGenericRuntimeManager( imagePullBurst int, imageCredentialProviderConfigFile string, imageCredentialProviderBinDir string, + singleProcessOOMKill *bool, cpuCFSQuota bool, cpuCFSQuotaPeriod metav1.Duration, runtimeService internalapi.RuntimeService, @@ -218,6 +224,7 @@ func NewKubeGenericRuntimeManager( tracer := tracerProvider.Tracer(instrumentationScope) kubeRuntimeManager := &kubeGenericRuntimeManager{ recorder: recorder, + singleProcessOOMKill: singleProcessOOMKill, cpuCFSQuota: cpuCFSQuota, cpuCFSQuotaPeriod: cpuCFSQuotaPeriod, seccompProfileRoot: filepath.Join(rootDirectory, "seccomp"), diff --git a/pkg/kubelet/kuberuntime/kuberuntime_sandbox_linux_test.go b/pkg/kubelet/kuberuntime/kuberuntime_sandbox_linux_test.go index 6d87e946899..3c5159a34eb 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_sandbox_linux_test.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_sandbox_linux_test.go @@ -28,11 +28,13 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" + "k8s.io/utils/ptr" ) func TestApplySandboxResources(t *testing.T) { _, _, m, err := createTestRuntimeManager() m.cpuCFSQuota = true + m.singleProcessOOMKill = ptr.To(false) config := &runtimeapi.PodSandboxConfig{ Linux: &runtimeapi.LinuxPodSandboxConfig{}, diff --git a/pkg/kubelet/util/util_linux.go b/pkg/kubelet/util/util_linux.go new file mode 100644 index 00000000000..56b9b920fd8 --- /dev/null +++ b/pkg/kubelet/util/util_linux.go @@ -0,0 +1,29 @@ +//go:build linux +// +build linux + +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package util + +import ( + libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" +) + +// IsCgroup2UnifiedMode returns true if the cgroup v2 unified mode is enabled +func IsCgroup2UnifiedMode() bool { + return libcontainercgroups.IsCgroup2UnifiedMode() +} diff --git a/pkg/kubelet/util/util_others.go b/pkg/kubelet/util/util_others.go new file mode 100644 index 00000000000..e2e1c71bac6 --- /dev/null +++ b/pkg/kubelet/util/util_others.go @@ -0,0 +1,25 @@ +//go:build !linux && !windows +// +build !linux,!windows + +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package util + +// IsCgroup2UnifiedMode is a no-op for other OSes. 
+func IsCgroup2UnifiedMode() bool { + return false +} diff --git a/pkg/kubelet/util/util_windows.go b/pkg/kubelet/util/util_windows.go index 81852dcf93a..c944a7d22f2 100644 --- a/pkg/kubelet/util/util_windows.go +++ b/pkg/kubelet/util/util_windows.go @@ -73,3 +73,8 @@ func NormalizePath(path string) string { } return path } + +// IsCgroup2UnifiedMode is a no-op for Windows for now +func IsCgroup2UnifiedMode() bool { + return false +} diff --git a/staging/src/k8s.io/kubelet/config/v1beta1/types.go b/staging/src/k8s.io/kubelet/config/v1beta1/types.go index 330b253ca78..d10578f2c9a 100644 --- a/staging/src/k8s.io/kubelet/config/v1beta1/types.go +++ b/staging/src/k8s.io/kubelet/config/v1beta1/types.go @@ -354,6 +354,15 @@ type KubeletConfiguration struct { // Default: "None" // +optional CPUManagerPolicy string `json:"cpuManagerPolicy,omitempty"` + // singleProcessOOMKill, if true, will prevent the `memory.oom.group` flag from being set for container + // cgroups in cgroups v2. This causes processes in the container to be OOM killed individually instead of as + // a group. It means that if true, the behavior aligns with the behavior of cgroups v1. + // The default value is determined automatically when you don't specify. + // On non-linux such as windows, only null / absent is allowed. + // On cgroup v1 linux, only null / absent and true are allowed. + // On cgroup v2 linux, null / absent, true and false are allowed. The default value is false. + // +optional + SingleProcessOOMKill *bool `json:"singleProcessOOMKill,omitempty"` // cpuManagerPolicyOptions is a set of key=value which allows to set extra options // to fine tune the behaviour of the cpu manager policies. // Requires both the "CPUManager" and "CPUManagerPolicyOptions" feature gates to be enabled. 
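Illustrative usage (not part of the patch): with the new v1beta1 field above, an operator can opt a cgroup v2 node back into per-process OOM kills from the KubeletConfiguration. A minimal Go sketch, assuming only the k8s.io/kubelet/config/v1beta1 types added here and k8s.io/utils/ptr:

package main

import (
	"fmt"

	kubeletv1beta1 "k8s.io/kubelet/config/v1beta1"
	"k8s.io/utils/ptr"
)

func main() {
	// Equivalent to setting `singleProcessOOMKill: true` in the kubelet config file.
	// nil means "decide automatically"; false is only valid on cgroup v2 hosts.
	cfg := kubeletv1beta1.KubeletConfiguration{
		SingleProcessOOMKill: ptr.To(true),
	}
	fmt.Printf("singleProcessOOMKill=%v\n", *cfg.SingleProcessOOMKill)
}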
diff --git a/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go b/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go index 613a039a755..0ab6259f98f 100644 --- a/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go +++ b/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go @@ -254,6 +254,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) { *out = new(bool) **out = **in } + if in.SingleProcessOOMKill != nil { + in, out := &in.SingleProcessOOMKill, &out.SingleProcessOOMKill + *out = new(bool) + **out = **in + } if in.CPUManagerPolicyOptions != nil { in, out := &in.CPUManagerPolicyOptions, &out.CPUManagerPolicyOptions *out = make(map[string]string, len(*in)) diff --git a/test/e2e_node/oomkiller_linux_test.go b/test/e2e_node/oomkiller_linux_test.go index db84cb0f0cf..a9e2c4fe8f5 100644 --- a/test/e2e_node/oomkiller_linux_test.go +++ b/test/e2e_node/oomkiller_linux_test.go @@ -19,6 +19,7 @@ package e2enode import ( "context" "fmt" + "time" "github.com/onsi/gomega" v1 "k8s.io/api/core/v1" @@ -35,9 +36,11 @@ import ( ) type testCase struct { - name string - podSpec *v1.Pod - oomTargetContainerName string + name string + podSpec *v1.Pod + oomTargetContainerName string + enableSingleProcessKill *bool + expectPodRunning bool } // KubeReservedMemory is default fraction value of node capacity memory to @@ -62,7 +65,7 @@ var _ = SIGDescribe("OOMKiller for pod using more memory than node allocatable [ } }) -var _ = SIGDescribe("OOMKiller [LinuxOnly]", framework.WithNodeConformance(), func() { +var _ = SIGDescribe("OOMKiller [LinuxOnly]", framework.WithNodeConformance(), framework.WithSerial(), func() { f := framework.NewDefaultFramework("oomkiller-test") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged @@ -89,6 +92,24 @@ var _ = SIGDescribe("OOMKiller [LinuxOnly]", framework.WithNodeConformance(), fu oomTargetContainerName: "oomkill-multi-target-container", podSpec: getOOMTargetPod("oomkill-target-pod", "oomkill-multi-target-container", getOOMTargetContainerMultiProcess), + enableSingleProcessKill: nil, + }) + + testCases = append(testCases, testCase{ + name: "multi process container (single process kill enabled)", + oomTargetContainerName: "oomkill-multi-target-container", + podSpec: getOOMTargetPod("oomkill-target-pod", "oomkill-multi-target-container", + getOOMTargetContainerMultiProcess), + enableSingleProcessKill: ptr.To(true), + expectPodRunning: true, + }) + + testCases = append(testCases, testCase{ + name: "multi process container (single process kill disabled)", + oomTargetContainerName: "oomkill-multi-target-container", + podSpec: getOOMTargetPod("oomkill-target-pod", "oomkill-multi-target-container", + getOOMTargetContainerMultiProcess), + enableSingleProcessKill: ptr.To(false), }) } for _, tc := range testCases { @@ -99,8 +120,8 @@ var _ = SIGDescribe("OOMKiller [LinuxOnly]", framework.WithNodeConformance(), fu func runOomKillerTest(f *framework.Framework, testCase testCase, kubeReservedMemory float64) { ginkgo.Context(testCase.name, func() { // Update KubeReservedMemory in KubeletConfig. 
- if kubeReservedMemory > 0 { - tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { + tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { + if kubeReservedMemory > 0 { if initialConfig.KubeReserved == nil { initialConfig.KubeReserved = map[string]string{} } @@ -109,8 +130,10 @@ func runOomKillerTest(f *framework.Framework, testCase testCase, kubeReservedMem // K8s components such that node allocatable memory is less than node capacity to // observe OOM kills at cgroup level instead of system OOM kills. initialConfig.KubeReserved["memory"] = fmt.Sprintf("%d", int(kubeReservedMemory*getLocalNode(context.TODO(), f).Status.Capacity.Memory().AsApproximateFloat64())) - }) - } + } + + initialConfig.SingleProcessOOMKill = testCase.enableSingleProcessKill + }) ginkgo.BeforeEach(func() { // Precautionary check that kubelet is healthy before running the test. @@ -120,18 +143,37 @@ func runOomKillerTest(f *framework.Framework, testCase testCase, kubeReservedMem e2epod.NewPodClient(f).Create(context.TODO(), testCase.podSpec) }) - ginkgo.It("The containers terminated by OOM killer should have the reason set to OOMKilled", func() { - ginkgo.By("Waiting for the pod to be failed") - err := e2epod.WaitForPodTerminatedInNamespace(context.TODO(), f.ClientSet, testCase.podSpec.Name, "", f.Namespace.Name) - framework.ExpectNoError(err, "Failed waiting for pod to terminate, %s/%s", f.Namespace.Name, testCase.podSpec.Name) + if testCase.expectPodRunning { + ginkgo.It("The containers should not be OOMKilled", func() { + err := e2epod.WaitForPodsRunning(context.TODO(), f.ClientSet, f.Namespace.Name, 1, framework.PodStartTimeout) + framework.ExpectNoError(err, "Failed waiting for pod to be running state, %s/%s", f.Namespace.Name, testCase.podSpec.Name) - ginkgo.By("Fetching the latest pod status") - pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(context.TODO(), testCase.podSpec.Name, metav1.GetOptions{}) - framework.ExpectNoError(err, "Failed to get the recent pod object for name: %q", pod.Name) + gomega.Consistently(context.TODO(), func(ctx context.Context) error { + pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(ctx, testCase.podSpec.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("expected the pod %s to exist: %w", pod.Name, err) + } + phase := pod.Status.Phase + if phase != v1.PodRunning && phase != v1.PodSucceeded { + return fmt.Errorf("pod %s: unexpected status %s, expected status: %s or %s", pod.Name, pod.Status.Phase, v1.PodRunning, v1.PodSucceeded) + } + return nil + }, 10*time.Second, f.Timeouts.Poll).Should(gomega.BeNil()) + }) + } else { + ginkgo.It("The containers terminated by OOM killer should have the reason set to OOMKilled", func() { + ginkgo.By("Waiting for the pod to be failed") + err := e2epod.WaitForPodTerminatedInNamespace(context.TODO(), f.ClientSet, testCase.podSpec.Name, "", f.Namespace.Name) + framework.ExpectNoError(err, "Failed waiting for pod to terminate, %s/%s", f.Namespace.Name, testCase.podSpec.Name) - ginkgo.By("Verifying the OOM target container has the expected reason") - verifyReasonForOOMKilledContainer(pod, testCase.oomTargetContainerName) - }) + ginkgo.By("Fetching the latest pod status") + pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(context.TODO(), testCase.podSpec.Name, metav1.GetOptions{}) + framework.ExpectNoError(err, "Failed to get the recent pod object for name: %q", pod.Name) + + 
ginkgo.By("Verifying the OOM target container has the expected reason") + verifyReasonForOOMKilledContainer(pod, testCase.oomTargetContainerName) + }) + } ginkgo.AfterEach(func() { ginkgo.By(fmt.Sprintf("deleting pod: %s", testCase.podSpec.Name))