kubelet: new kubelet config option for disabling group oom kill

Signed-off-by: utam0k <k0ma@utam0k.jp>
utam0k 2024-10-14 14:46:28 +09:00
parent 3036d107a0
commit 4f909c14a0
25 changed files with 253 additions and 45 deletions

View File

@ -606,7 +606,7 @@ func run(ctx context.Context, s *options.KubeletServer, kubeDeps *kubelet.Depend
// Warn if MemoryQoS enabled with cgroups v1
if utilfeature.DefaultFeatureGate.Enabled(features.MemoryQoS) &&
!isCgroup2UnifiedMode() {
!kubeletutil.IsCgroup2UnifiedMode() {
klog.InfoS("Warning: MemoryQoS feature only works with cgroups v2 on Linux, but enabled with cgroups v1")
}
// Obtain Kubelet Lock File
@ -831,7 +831,7 @@ func run(ctx context.Context, s *options.KubeletServer, kubeDeps *kubelet.Depend
s.TopologyManagerPolicyOptions, features.TopologyManagerPolicyOptions)
}
if utilfeature.DefaultFeatureGate.Enabled(features.NodeSwap) {
if !isCgroup2UnifiedMode() && s.MemorySwap.SwapBehavior == kubelettypes.LimitedSwap {
if !kubeletutil.IsCgroup2UnifiedMode() && s.MemorySwap.SwapBehavior == kubelettypes.LimitedSwap {
// This feature is not supported for cgroupv1 so we are failing early.
return fmt.Errorf("swap feature is enabled and LimitedSwap but it is only supported with cgroupv2")
}

View File

@ -19,8 +19,6 @@ package app
import (
"k8s.io/klog/v2"
"k8s.io/utils/inotify"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
)
func watchForLockfileContention(path string, done chan struct{}) error {
@ -46,7 +44,3 @@ func watchForLockfileContention(path string, done chan struct{}) error {
}()
return nil
}
func isCgroup2UnifiedMode() bool {
return libcontainercgroups.IsCgroup2UnifiedMode()
}

View File

@ -61947,6 +61947,13 @@ func schema_k8sio_kubelet_config_v1beta1_KubeletConfiguration(ref common.Referen
Format: "",
},
},
"singleProcessOOMKill": {
SchemaProps: spec.SchemaProps{
Description: "singleProcessOOMKill, if true, will prevent the `memory.oom.group` flag from being set for container cgroups in cgroups v2. This causes processes in the container to be OOM killed individually instead of as a group. It means that if true, the behavior aligns with the behavior of cgroups v1. The default value is determined automatically when you don't specify. On non-linux such as windows, only null / absent is allowed. On cgroup v1 linux, only null / absent and true are allowed. On cgroup v2 linux, null / absent, true and false are allowed. The default value is false.",
Type: []string{"boolean"},
Format: "",
},
},
"cpuManagerPolicyOptions": {
SchemaProps: spec.SchemaProps{
Description: "cpuManagerPolicyOptions is a set of key=value which \tallows to set extra options to fine tune the behaviour of the cpu manager policies. Requires both the \"CPUManager\" and \"CPUManagerPolicyOptions\" feature gates to be enabled. Default: nil",

View File

@ -73,12 +73,12 @@ func Funcs(codecs runtimeserializer.CodecFactory) []interface{} {
obj.NodeStatusReportFrequency = metav1.Duration{Duration: time.Minute}
obj.NodeLeaseDurationSeconds = 40
obj.CPUManagerPolicy = "none"
obj.CPUManagerPolicyOptions = make(map[string]string)
obj.CPUManagerPolicyOptions = nil
obj.CPUManagerReconcilePeriod = obj.NodeStatusUpdateFrequency
obj.NodeStatusMaxImages = 50
obj.TopologyManagerPolicy = kubeletconfig.NoneTopologyManagerPolicy
obj.TopologyManagerScope = kubeletconfig.ContainerTopologyManagerScope
obj.TopologyManagerPolicyOptions = make(map[string]string)
obj.TopologyManagerPolicyOptions = nil
obj.QOSReserved = map[string]string{
"memory": "50%",
}
@ -104,13 +104,14 @@ func Funcs(codecs runtimeserializer.CodecFactory) []interface{} {
obj.CgroupsPerQOS = true
obj.CgroupDriver = "cgroupfs"
obj.EnforceNodeAllocatable = kubeletconfigv1beta1.DefaultNodeAllocatableEnforcement
obj.StaticPodURLHeader = make(map[string][]string)
obj.StaticPodURLHeader = nil
obj.SingleProcessOOMKill = ptr.To(false)
obj.ContainerLogMaxFiles = 5
obj.ContainerLogMaxSize = "10Mi"
obj.ContainerLogMaxWorkers = 1
obj.ContainerLogMonitorInterval = metav1.Duration{Duration: 10 * time.Second}
obj.ConfigMapAndSecretChangeDetectionStrategy = "Watch"
obj.AllowedUnsafeSysctls = []string{}
obj.AllowedUnsafeSysctls = nil
obj.VolumePluginDir = kubeletconfigv1beta1.DefaultVolumePluginDir
obj.ContainerRuntimeEndpoint = "unix:///run/containerd/containerd.sock"

View File

@ -233,6 +233,7 @@ var (
"Logging.Options.Text.OutputRoutingOptions.SplitStream",
"Logging.VModule[*].FilePattern",
"Logging.VModule[*].Verbosity",
"SingleProcessOOMKill",
"Logging.Verbosity",
"TLSCipherSuites[*]",
"TLSMinVersion",

View File

@ -229,6 +229,10 @@ type KubeletConfiguration struct {
CgroupsPerQOS bool
// driver that the kubelet uses to manipulate cgroups on the host (cgroupfs or systemd)
CgroupDriver string
// SingleProcessOOMKill, if true, will prevent the `memory.oom.group` flag from being set for container
// cgroups in cgroups v2. This causes processes in the container to be OOM killed individually instead of
// as a group, which matches the behavior of cgroups v1.
SingleProcessOOMKill *bool
// CPUManagerPolicy is the name of the policy to use.
// Requires the CPUManager feature gate to be enabled.
CPUManagerPolicy string

View File

@ -34,7 +34,6 @@ import (
)
func TestSetDefaultsKubeletConfiguration(t *testing.T) {
tests := []struct {
name string
config *v1beta1.KubeletConfiguration
@ -130,6 +129,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
RegisterNode: ptr.To(true),
LocalStorageCapacityIsolation: ptr.To(true),
PodLogsDir: DefaultPodLogsDir,
SingleProcessOOMKill: nil,
},
},
{
@ -261,6 +261,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
RegisterNode: ptr.To(false),
LocalStorageCapacityIsolation: ptr.To(false),
PodLogsDir: "",
SingleProcessOOMKill: ptr.To(false),
},
&v1beta1.KubeletConfiguration{
EnableServer: ptr.To(false),
@ -363,6 +364,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
RegisterNode: ptr.To(false),
LocalStorageCapacityIsolation: ptr.To(false),
PodLogsDir: DefaultPodLogsDir,
SingleProcessOOMKill: ptr.To(false),
},
},
{
@ -516,6 +518,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
RegisterNode: ptr.To(true),
LocalStorageCapacityIsolation: ptr.To(true),
PodLogsDir: "/custom/path",
SingleProcessOOMKill: ptr.To(true),
},
&v1beta1.KubeletConfiguration{
EnableServer: ptr.To(true),
@ -666,6 +669,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
RegisterNode: ptr.To(true),
LocalStorageCapacityIsolation: ptr.To(true),
PodLogsDir: "/custom/path",
SingleProcessOOMKill: ptr.To(true),
},
},
{
@ -759,6 +763,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
RegisterNode: ptr.To(true),
LocalStorageCapacityIsolation: ptr.To(true),
PodLogsDir: DefaultPodLogsDir,
SingleProcessOOMKill: nil,
},
},
{
@ -852,6 +857,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
RegisterNode: ptr.To(true),
LocalStorageCapacityIsolation: ptr.To(true),
PodLogsDir: DefaultPodLogsDir,
SingleProcessOOMKill: nil,
},
},
{
@ -945,6 +951,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
RegisterNode: ptr.To(true),
LocalStorageCapacityIsolation: ptr.To(true),
PodLogsDir: DefaultPodLogsDir,
SingleProcessOOMKill: nil,
},
},
}

View File

@ -409,6 +409,7 @@ func autoConvert_v1beta1_KubeletConfiguration_To_config_KubeletConfiguration(in
}
out.CgroupDriver = in.CgroupDriver
out.CPUManagerPolicy = in.CPUManagerPolicy
out.SingleProcessOOMKill = (*bool)(unsafe.Pointer(in.SingleProcessOOMKill))
out.CPUManagerPolicyOptions = *(*map[string]string)(unsafe.Pointer(&in.CPUManagerPolicyOptions))
out.CPUManagerReconcilePeriod = in.CPUManagerReconcilePeriod
out.MemoryManagerPolicy = in.MemoryManagerPolicy
@ -606,6 +607,7 @@ func autoConvert_config_KubeletConfiguration_To_v1beta1_KubeletConfiguration(in
return err
}
out.CgroupDriver = in.CgroupDriver
out.SingleProcessOOMKill = (*bool)(unsafe.Pointer(in.SingleProcessOOMKill))
out.CPUManagerPolicy = in.CPUManagerPolicy
out.CPUManagerPolicyOptions = *(*map[string]string)(unsafe.Pointer(&in.CPUManagerPolicyOptions))
out.CPUManagerReconcilePeriod = in.CPUManagerReconcilePeriod

View File

@ -24,13 +24,19 @@ import (
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
"k8s.io/utils/ptr"
)
// validateKubeletOSConfiguration validates os specific kubelet configuration and returns an error if it is invalid.
func validateKubeletOSConfiguration(kc *kubeletconfig.KubeletConfiguration) error {
if kc.FailCgroupV1 && !libcontainercgroups.IsCgroup2UnifiedMode() {
isCgroup1 := !libcontainercgroups.IsCgroup2UnifiedMode()
if kc.FailCgroupV1 && isCgroup1 {
return fmt.Errorf("kubelet is configured to not run on a host using cgroup v1. cgroup v1 support is in maintenance mode")
}
if isCgroup1 && kc.SingleProcessOOMKill != nil && !ptr.Deref(kc.SingleProcessOOMKill, true) {
return fmt.Errorf("invalid configuration: singleProcessOOMKill must not be explicitly set to false when using cgroup v1")
}
return nil
}
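
A sketch of how this check could be exercised from a test in the same validation package (assuming it sits next to this Linux-only file; the test name below is hypothetical, not part of this change):

//go:build linux

package validation

import (
    "testing"

    libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
    kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
    "k8s.io/utils/ptr"
)

// TestSingleProcessOOMKillOnCgroupV1 illustrates the rule above: on a cgroup v1
// host an explicit false must be rejected, while nil or true is accepted.
func TestSingleProcessOOMKillOnCgroupV1(t *testing.T) {
    if libcontainercgroups.IsCgroup2UnifiedMode() {
        t.Skip("this illustration only applies to cgroup v1 hosts")
    }
    kc := &kubeletconfig.KubeletConfiguration{SingleProcessOOMKill: ptr.To(false)}
    if err := validateKubeletOSConfiguration(kc); err == nil {
        t.Fatal("expected singleProcessOOMKill=false to be rejected on cgroup v1")
    }
    kc.SingleProcessOOMKill = ptr.To(true)
    if err := validateKubeletOSConfiguration(kc); err != nil {
        t.Fatalf("unexpected error for singleProcessOOMKill=true: %v", err)
    }
}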

View File

@ -20,10 +20,16 @@ limitations under the License.
package validation
import (
"fmt"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
)
// validateKubeletOSConfiguration validates os specific kubelet configuration and returns an error if it is invalid.
func validateKubeletOSConfiguration(kc *kubeletconfig.KubeletConfiguration) error {
if kc.SingleProcessOOMKill != nil {
return fmt.Errorf("invalid configuration: singleProcessOOMKill is only supported on linux")
}
return nil
}

View File

@ -29,6 +29,7 @@ import (
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
"k8s.io/kubernetes/pkg/kubelet/apis/config/validation"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
kubeletutil "k8s.io/kubernetes/pkg/kubelet/util"
"k8s.io/utils/ptr"
)
@ -78,6 +79,7 @@ var (
ContainerRuntimeEndpoint: "unix:///run/containerd/containerd.sock",
ContainerLogMaxWorkers: 1,
ContainerLogMonitorInterval: metav1.Duration{Duration: 10 * time.Second},
SingleProcessOOMKill: ptr.To(!kubeletutil.IsCgroup2UnifiedMode()),
}
)

View File

@ -20,9 +20,10 @@ limitations under the License.
package validation
import (
"k8s.io/klog/v2"
"fmt"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/klog/v2"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
@ -36,6 +37,10 @@ func validateKubeletOSConfiguration(kc *kubeletconfig.KubeletConfiguration) erro
klog.Warningf(message, "CgroupsPerQOS", "--cgroups-per-qos", kc.CgroupsPerQOS)
}
if kc.SingleProcessOOMKill != nil {
return fmt.Errorf("invalid configuration: singleProcessOOMKill is not supported on Windows")
}
enforceNodeAllocatableWithoutNone := sets.New(kc.EnforceNodeAllocatable...).Delete(kubetypes.NodeAllocatableNoneKey)
if len(enforceNodeAllocatableWithoutNone) > 0 {
klog.Warningf(message, "EnforceNodeAllocatable", "--enforce-node-allocatable", kc.EnforceNodeAllocatable)

View File

@ -204,6 +204,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) {
out.ImageMinimumGCAge = in.ImageMinimumGCAge
out.ImageMaximumGCAge = in.ImageMaximumGCAge
out.VolumeStatsAggPeriod = in.VolumeStatsAggPeriod
if in.SingleProcessOOMKill != nil {
in, out := &in.SingleProcessOOMKill, &out.SingleProcessOOMKill
*out = new(bool)
**out = **in
}
if in.CPUManagerPolicyOptions != nil {
in, out := &in.CPUManagerPolicyOptions, &out.CPUManagerPolicyOptions
*out = make(map[string]string, len(*in))

View File

@ -45,6 +45,7 @@ import (
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
utilfs "k8s.io/kubernetes/pkg/util/filesystem"
netutils "k8s.io/utils/net"
"k8s.io/utils/ptr"
inuserns "github.com/moby/sys/userns"
v1 "k8s.io/api/core/v1"
@ -661,6 +662,20 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
klet.podCache,
)
var singleProcessOOMKill *bool
if sysruntime.GOOS == "linux" {
if !util.IsCgroup2UnifiedMode() {
// Single-process OOM kill is the only supported behavior on cgroups v1.
singleProcessOOMKill = ptr.To(true)
} else {
if kubeCfg.SingleProcessOOMKill == nil {
singleProcessOOMKill = ptr.To(false)
} else {
singleProcessOOMKill = kubeCfg.SingleProcessOOMKill
}
}
}
runtime, err := kuberuntime.NewKubeGenericRuntimeManager(
kubecontainer.FilterEventRecorder(kubeDeps.Recorder),
klet.livenessManager,
@ -680,6 +695,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
int(kubeCfg.RegistryBurst),
imageCredentialProviderConfigFile,
imageCredentialProviderBinDir,
singleProcessOOMKill,
kubeCfg.CPUCFSQuota,
kubeCfg.CPUCFSQuotaPeriod,
kubeDeps.RemoteRuntimeService,

View File

@ -3225,6 +3225,7 @@ func TestSyncPodSpans(t *testing.T) {
int(kubeCfg.RegistryBurst),
"",
"",
nil,
kubeCfg.CPUCFSQuota,
kubeCfg.CPUCFSQuotaPeriod,
runtimeSvc,

View File

@ -37,6 +37,7 @@ import (
utilfeature "k8s.io/apiserver/pkg/util/feature"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
"k8s.io/klog/v2"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
kubefeatures "k8s.io/kubernetes/pkg/features"
@ -45,6 +46,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/qos"
kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
cgroups "k8s.io/kubernetes/third_party/forked/cgroups"
"k8s.io/utils/ptr"
)
var defaultPageSize = int64(os.Getpagesize())
@ -247,7 +249,7 @@ func (m *kubeGenericRuntimeManager) calculateLinuxResources(cpuRequest, cpuLimit
}
// runc requires cgroupv2 for unified mode
if isCgroup2UnifiedMode() {
if isCgroup2UnifiedMode() && !ptr.Deref(m.singleProcessOOMKill, true) {
resources.Unified = map[string]string{
// Ask the kernel to kill all processes in the container cgroup in case of OOM.
// See memory.oom.group in https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html for
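
The condition above is the heart of the change: the `memory.oom.group` knob is written only on cgroup v2, and only when single-process OOM kill was not requested. A standalone restatement follows; the helper name unifiedOOMSettings is illustrative, not part of the kubelet code.

package main

import "fmt"

// unifiedOOMSettings mirrors the branch above: it returns the cgroup v2
// "unified" settings that ask the kernel to OOM kill the whole container
// cgroup as a group, or nil when single-process kills should be kept.
func unifiedOOMSettings(cgroupV2 bool, singleProcessOOMKill *bool) map[string]string {
    single := true // nil is treated conservatively as "do not set the flag"
    if singleProcessOOMKill != nil {
        single = *singleProcessOOMKill
    }
    if cgroupV2 && !single {
        return map[string]string{"memory.oom.group": "1"}
    }
    return nil
}

func main() {
    f := false
    fmt.Println(unifiedOOMSettings(true, &f))  // map[memory.oom.group:1]
    fmt.Println(unifiedOOMSettings(true, nil)) // map[] (flag left unset)
}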

View File

@ -249,12 +249,13 @@ func TestCalculateLinuxResources(t *testing.T) {
}
tests := []struct {
name string
cpuReq *resource.Quantity
cpuLim *resource.Quantity
memLim *resource.Quantity
expected *runtimeapi.LinuxContainerResources
cgroupVersion CgroupVersion
name string
cpuReq *resource.Quantity
cpuLim *resource.Quantity
memLim *resource.Quantity
expected *runtimeapi.LinuxContainerResources
cgroupVersion CgroupVersion
singleProcessOOMKill bool
}{
{
name: "Request128MBLimit256MB",
@ -321,6 +322,20 @@ func TestCalculateLinuxResources(t *testing.T) {
},
cgroupVersion: cgroupV2,
},
{
name: "Request128MBLimit256MBSingleProcess",
cpuReq: generateResourceQuantity("1"),
cpuLim: generateResourceQuantity("2"),
memLim: generateResourceQuantity("128Mi"),
expected: &runtimeapi.LinuxContainerResources{
CpuPeriod: 100000,
CpuQuota: 200000,
CpuShares: 1024,
MemoryLimitInBytes: 134217728,
},
cgroupVersion: cgroupV2,
singleProcessOOMKill: true,
},
{
name: "RequestNoMemory",
cpuReq: generateResourceQuantity("2"),
@ -365,6 +380,7 @@ func TestCalculateLinuxResources(t *testing.T) {
}
for _, test := range tests {
setCgroupVersionDuringTest(test.cgroupVersion)
m.singleProcessOOMKill = ptr.To(test.singleProcessOOMKill)
linuxContainerResources := m.calculateLinuxResources(test.cpuReq, test.cpuLim, test.memLim)
assert.Equal(t, test.expected, linuxContainerResources)
}
@ -808,16 +824,18 @@ func TestGenerateLinuxContainerResources(t *testing.T) {
}
for _, tc := range []struct {
name string
limits v1.ResourceList
requests v1.ResourceList
expected *runtimeapi.LinuxContainerResources
cgroupVersion CgroupVersion
name string
limits v1.ResourceList
requests v1.ResourceList
singleProcessOOMKill bool
expected *runtimeapi.LinuxContainerResources
cgroupVersion CgroupVersion
}{
{
"requests & limits, cpu & memory, guaranteed qos",
v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")},
v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")},
true,
&runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 524288000, OomScoreAdj: -997},
cgroupV1,
},
@ -825,6 +843,7 @@ func TestGenerateLinuxContainerResources(t *testing.T) {
"requests & limits, cpu & memory, burstable qos",
v1.ResourceList{v1.ResourceCPU: resource.MustParse("500m"), v1.ResourceMemory: resource.MustParse("750Mi")},
v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")},
true,
&runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 786432000, OomScoreAdj: 970},
cgroupV1,
},
@ -832,6 +851,7 @@ func TestGenerateLinuxContainerResources(t *testing.T) {
"best-effort qos",
nil,
nil,
true,
&runtimeapi.LinuxContainerResources{CpuShares: 2, OomScoreAdj: 1000},
cgroupV1,
},
@ -839,6 +859,7 @@ func TestGenerateLinuxContainerResources(t *testing.T) {
"requests & limits, cpu & memory, guaranteed qos",
v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")},
v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")},
false,
&runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 524288000, OomScoreAdj: -997, Unified: map[string]string{"memory.oom.group": "1"}},
cgroupV2,
},
@ -846,6 +867,7 @@ func TestGenerateLinuxContainerResources(t *testing.T) {
"requests & limits, cpu & memory, burstable qos",
v1.ResourceList{v1.ResourceCPU: resource.MustParse("500m"), v1.ResourceMemory: resource.MustParse("750Mi")},
v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")},
false,
&runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 786432000, OomScoreAdj: 970, Unified: map[string]string{"memory.oom.group": "1"}},
cgroupV2,
},
@ -853,6 +875,7 @@ func TestGenerateLinuxContainerResources(t *testing.T) {
"best-effort qos",
nil,
nil,
false,
&runtimeapi.LinuxContainerResources{CpuShares: 2, OomScoreAdj: 1000, Unified: map[string]string{"memory.oom.group": "1"}},
cgroupV2,
},
@ -863,6 +886,8 @@ func TestGenerateLinuxContainerResources(t *testing.T) {
pod.Spec.Containers[0].Resources = v1.ResourceRequirements{Limits: tc.limits, Requests: tc.requests}
m.singleProcessOOMKill = ptr.To(tc.singleProcessOOMKill)
resources := m.generateLinuxContainerResources(pod, &pod.Spec.Containers[0], false)
tc.expected.HugepageLimits = resources.HugepageLimits
assert.Equal(t, tc.expected, resources)

View File

@ -118,6 +118,11 @@ type kubeGenericRuntimeManager struct {
readinessManager proberesults.Manager
startupManager proberesults.Manager
// If false, pass "memory.oom.group" to container cgroups when using cgroups v2 so that processes
// in those cgroups are killed as a unit by the OOM killer.
// It must be nil except on Linux.
singleProcessOOMKill *bool
// If true, enforce container cpu limits with CFS quota support
cpuCFSQuota bool
@ -198,6 +203,7 @@ func NewKubeGenericRuntimeManager(
imagePullBurst int,
imageCredentialProviderConfigFile string,
imageCredentialProviderBinDir string,
singleProcessOOMKill *bool,
cpuCFSQuota bool,
cpuCFSQuotaPeriod metav1.Duration,
runtimeService internalapi.RuntimeService,
@ -218,6 +224,7 @@ func NewKubeGenericRuntimeManager(
tracer := tracerProvider.Tracer(instrumentationScope)
kubeRuntimeManager := &kubeGenericRuntimeManager{
recorder: recorder,
singleProcessOOMKill: singleProcessOOMKill,
cpuCFSQuota: cpuCFSQuota,
cpuCFSQuotaPeriod: cpuCFSQuotaPeriod,
seccompProfileRoot: filepath.Join(rootDirectory, "seccomp"),

View File

@ -28,11 +28,13 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
"k8s.io/utils/ptr"
)
func TestApplySandboxResources(t *testing.T) {
_, _, m, err := createTestRuntimeManager()
m.cpuCFSQuota = true
m.singleProcessOOMKill = ptr.To(false)
config := &runtimeapi.PodSandboxConfig{
Linux: &runtimeapi.LinuxPodSandboxConfig{},

View File

@ -0,0 +1,29 @@
//go:build linux
// +build linux
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package util
import (
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
)
// IsCgroup2UnifiedMode returns true if the cgroup v2 unified mode is enabled
func IsCgroup2UnifiedMode() bool {
return libcontainercgroups.IsCgroup2UnifiedMode()
}

View File

@ -0,0 +1,25 @@
//go:build !linux && !windows
// +build !linux,!windows
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package util
// IsCgroup2UnifiedMode always returns false on operating systems other than Linux and Windows.
func IsCgroup2UnifiedMode() bool {
return false
}

View File

@ -73,3 +73,8 @@ func NormalizePath(path string) string {
}
return path
}
// IsCgroup2UnifiedMode always returns false on Windows for now.
func IsCgroup2UnifiedMode() bool {
return false
}

View File

@ -354,6 +354,15 @@ type KubeletConfiguration struct {
// Default: "None"
// +optional
CPUManagerPolicy string `json:"cpuManagerPolicy,omitempty"`
// singleProcessOOMKill, if true, will prevent the `memory.oom.group` flag from being set for container
// cgroups in cgroups v2. This causes processes in the container to be OOM killed individually instead of
// as a group, which matches the behavior of cgroups v1.
// If unset, the default is determined automatically.
// On non-Linux platforms such as Windows, only null / absent is allowed.
// On Linux with cgroups v1, only null / absent and true are allowed.
// On Linux with cgroups v2, null / absent, true, and false are allowed, and the default is false.
// +optional
SingleProcessOOMKill *bool `json:"singleProcessOOMKill,omitempty"`
// cpuManagerPolicyOptions is a set of key=value which allows to set extra options
// to fine tune the behaviour of the cpu manager policies.
// Requires both the "CPUManager" and "CPUManagerPolicyOptions" feature gates to be enabled.

View File

@ -254,6 +254,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) {
*out = new(bool)
**out = **in
}
if in.SingleProcessOOMKill != nil {
in, out := &in.SingleProcessOOMKill, &out.SingleProcessOOMKill
*out = new(bool)
**out = **in
}
if in.CPUManagerPolicyOptions != nil {
in, out := &in.CPUManagerPolicyOptions, &out.CPUManagerPolicyOptions
*out = make(map[string]string, len(*in))

View File

@ -19,6 +19,7 @@ package e2enode
import (
"context"
"fmt"
"time"
"github.com/onsi/gomega"
v1 "k8s.io/api/core/v1"
@ -35,9 +36,11 @@ import (
)
type testCase struct {
name string
podSpec *v1.Pod
oomTargetContainerName string
name string
podSpec *v1.Pod
oomTargetContainerName string
enableSingleProcessKill *bool
expectPodRunning bool
}
// KubeReservedMemory is default fraction value of node capacity memory to
@ -62,7 +65,7 @@ var _ = SIGDescribe("OOMKiller for pod using more memory than node allocatable [
}
})
var _ = SIGDescribe("OOMKiller [LinuxOnly]", framework.WithNodeConformance(), func() {
var _ = SIGDescribe("OOMKiller [LinuxOnly]", framework.WithNodeConformance(), framework.WithSerial(), func() {
f := framework.NewDefaultFramework("oomkiller-test")
f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
@ -89,6 +92,24 @@ var _ = SIGDescribe("OOMKiller [LinuxOnly]", framework.WithNodeConformance(), fu
oomTargetContainerName: "oomkill-multi-target-container",
podSpec: getOOMTargetPod("oomkill-target-pod", "oomkill-multi-target-container",
getOOMTargetContainerMultiProcess),
enableSingleProcessKill: nil,
})
testCases = append(testCases, testCase{
name: "multi process container (single process kill enabled)",
oomTargetContainerName: "oomkill-multi-target-container",
podSpec: getOOMTargetPod("oomkill-target-pod", "oomkill-multi-target-container",
getOOMTargetContainerMultiProcess),
enableSingleProcessKill: ptr.To(true),
expectPodRunning: true,
})
testCases = append(testCases, testCase{
name: "multi process container (single process kill disabled)",
oomTargetContainerName: "oomkill-multi-target-container",
podSpec: getOOMTargetPod("oomkill-target-pod", "oomkill-multi-target-container",
getOOMTargetContainerMultiProcess),
enableSingleProcessKill: ptr.To(false),
})
}
for _, tc := range testCases {
@ -99,8 +120,8 @@ var _ = SIGDescribe("OOMKiller [LinuxOnly]", framework.WithNodeConformance(), fu
func runOomKillerTest(f *framework.Framework, testCase testCase, kubeReservedMemory float64) {
ginkgo.Context(testCase.name, func() {
// Update KubeReservedMemory in KubeletConfig.
if kubeReservedMemory > 0 {
tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
if kubeReservedMemory > 0 {
if initialConfig.KubeReserved == nil {
initialConfig.KubeReserved = map[string]string{}
}
@ -109,8 +130,10 @@ func runOomKillerTest(f *framework.Framework, testCase testCase, kubeReservedMem
// K8s components such that node allocatable memory is less than node capacity to
// observe OOM kills at cgroup level instead of system OOM kills.
initialConfig.KubeReserved["memory"] = fmt.Sprintf("%d", int(kubeReservedMemory*getLocalNode(context.TODO(), f).Status.Capacity.Memory().AsApproximateFloat64()))
})
}
}
initialConfig.SingleProcessOOMKill = testCase.enableSingleProcessKill
})
ginkgo.BeforeEach(func() {
// Precautionary check that kubelet is healthy before running the test.
@ -120,18 +143,37 @@ func runOomKillerTest(f *framework.Framework, testCase testCase, kubeReservedMem
e2epod.NewPodClient(f).Create(context.TODO(), testCase.podSpec)
})
ginkgo.It("The containers terminated by OOM killer should have the reason set to OOMKilled", func() {
ginkgo.By("Waiting for the pod to be failed")
err := e2epod.WaitForPodTerminatedInNamespace(context.TODO(), f.ClientSet, testCase.podSpec.Name, "", f.Namespace.Name)
framework.ExpectNoError(err, "Failed waiting for pod to terminate, %s/%s", f.Namespace.Name, testCase.podSpec.Name)
if testCase.expectPodRunning {
ginkgo.It("The containers should not be OOMKilled", func() {
err := e2epod.WaitForPodsRunning(context.TODO(), f.ClientSet, f.Namespace.Name, 1, framework.PodStartTimeout)
framework.ExpectNoError(err, "Failed waiting for pod to be running state, %s/%s", f.Namespace.Name, testCase.podSpec.Name)
ginkgo.By("Fetching the latest pod status")
pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(context.TODO(), testCase.podSpec.Name, metav1.GetOptions{})
framework.ExpectNoError(err, "Failed to get the recent pod object for name: %q", pod.Name)
gomega.Consistently(context.TODO(), func(ctx context.Context) error {
pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(ctx, testCase.podSpec.Name, metav1.GetOptions{})
if err != nil {
return fmt.Errorf("expected the pod %s to exist: %w", pod.Name, err)
}
phase := pod.Status.Phase
if phase != v1.PodRunning && phase != v1.PodSucceeded {
return fmt.Errorf("pod %s: unexpected status %s, expected status: %s or %s", pod.Name, pod.Status.Phase, v1.PodRunning, v1.PodSucceeded)
}
return nil
}, 10*time.Second, f.Timeouts.Poll).Should(gomega.BeNil())
})
} else {
ginkgo.It("The containers terminated by OOM killer should have the reason set to OOMKilled", func() {
ginkgo.By("Waiting for the pod to be failed")
err := e2epod.WaitForPodTerminatedInNamespace(context.TODO(), f.ClientSet, testCase.podSpec.Name, "", f.Namespace.Name)
framework.ExpectNoError(err, "Failed waiting for pod to terminate, %s/%s", f.Namespace.Name, testCase.podSpec.Name)
ginkgo.By("Verifying the OOM target container has the expected reason")
verifyReasonForOOMKilledContainer(pod, testCase.oomTargetContainerName)
})
ginkgo.By("Fetching the latest pod status")
pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(context.TODO(), testCase.podSpec.Name, metav1.GetOptions{})
framework.ExpectNoError(err, "Failed to get the recent pod object for name: %q", pod.Name)
ginkgo.By("Verifying the OOM target container has the expected reason")
verifyReasonForOOMKilledContainer(pod, testCase.oomTargetContainerName)
})
}
ginkgo.AfterEach(func() {
ginkgo.By(fmt.Sprintf("deleting pod: %s", testCase.podSpec.Name))