Mirror of https://github.com/k3s-io/kubernetes.git (synced 2025-07-29 14:37:00 +00:00)
kubelet: new kubelet config option for disabling group oom kill

Signed-off-by: utam0k <k0ma@utam0k.jp>

parent 3036d107a0
commit 4f909c14a0
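Not part of the diff itself: a minimal Go sketch of the behavior this commit wires up, so the hunks below are easier to follow. On cgroups v1 the kubelet never uses `memory.oom.group`; on cgroups v2 an unset `singleProcessOOMKill` is treated as false, which keeps group OOM kills on. The helper name usesGroupOOMKill is invented for illustration only.

    package main

    import "fmt"

    // usesGroupOOMKill condenses the decision this commit implements in
    // NewMainKubelet and calculateLinuxResources: whether memory.oom.group
    // should be set on a container cgroup.
    func usesGroupOOMKill(cgroupV2 bool, singleProcessOOMKill *bool) bool {
        if !cgroupV2 {
            // cgroups v1 has no memory.oom.group; processes are OOM killed individually.
            return false
        }
        if singleProcessOOMKill == nil {
            // Unset on cgroups v2 defaults to false, i.e. group kills stay enabled.
            return true
        }
        return !*singleProcessOOMKill
    }

    func main() {
        t := true
        fmt.Println(usesGroupOOMKill(true, nil))  // true: memory.oom.group=1 is set
        fmt.Println(usesGroupOOMKill(true, &t))   // false: per-process OOM kills
        fmt.Println(usesGroupOOMKill(false, nil)) // false: cgroups v1 behavior
    }

The hunks below thread this decision through the kubelet config API, its validation, the defaults, and the container runtime manager.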
@@ -606,7 +606,7 @@ func run(ctx context.Context, s *options.KubeletServer, kubeDeps *kubelet.Depend
 	// Warn if MemoryQoS enabled with cgroups v1
 	if utilfeature.DefaultFeatureGate.Enabled(features.MemoryQoS) &&
-		!isCgroup2UnifiedMode() {
+		!kubeletutil.IsCgroup2UnifiedMode() {
 		klog.InfoS("Warning: MemoryQoS feature only works with cgroups v2 on Linux, but enabled with cgroups v1")
 	}
 	// Obtain Kubelet Lock File
@@ -831,7 +831,7 @@ func run(ctx context.Context, s *options.KubeletServer, kubeDeps *kubelet.Depend
 			s.TopologyManagerPolicyOptions, features.TopologyManagerPolicyOptions)
 	}
 	if utilfeature.DefaultFeatureGate.Enabled(features.NodeSwap) {
-		if !isCgroup2UnifiedMode() && s.MemorySwap.SwapBehavior == kubelettypes.LimitedSwap {
+		if !kubeletutil.IsCgroup2UnifiedMode() && s.MemorySwap.SwapBehavior == kubelettypes.LimitedSwap {
 			// This feature is not supported for cgroupv1 so we are failing early.
 			return fmt.Errorf("swap feature is enabled and LimitedSwap but it is only supported with cgroupv2")
 		}
@@ -19,8 +19,6 @@ package app
 import (
 	"k8s.io/klog/v2"
 	"k8s.io/utils/inotify"
-
-	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
 )

 func watchForLockfileContention(path string, done chan struct{}) error {
@@ -46,7 +44,3 @@ func watchForLockfileContention(path string, done chan struct{}) error {
 	}()
 	return nil
 }
-
-func isCgroup2UnifiedMode() bool {
-	return libcontainercgroups.IsCgroup2UnifiedMode()
-}
pkg/generated/openapi/zz_generated.openapi.go (generated, 7 changes)
@@ -61947,6 +61947,13 @@ func schema_k8sio_kubelet_config_v1beta1_KubeletConfiguration(ref common.Referen
 					Format: "",
 				},
 			},
+			"singleProcessOOMKill": {
+				SchemaProps: spec.SchemaProps{
+					Description: "singleProcessOOMKill, if true, will prevent the `memory.oom.group` flag from being set for container cgroups in cgroups v2. This causes processes in the container to be OOM killed individually instead of as a group. It means that if true, the behavior aligns with the behavior of cgroups v1. The default value is determined automatically when you don't specify. On non-linux such as windows, only null / absent is allowed. On cgroup v1 linux, only null / absent and true are allowed. On cgroup v2 linux, null / absent, true and false are allowed. The default value is false.",
+					Type:        []string{"boolean"},
+					Format:      "",
+				},
+			},
 			"cpuManagerPolicyOptions": {
 				SchemaProps: spec.SchemaProps{
 					Description: "cpuManagerPolicyOptions is a set of key=value which \tallows to set extra options to fine tune the behaviour of the cpu manager policies. Requires both the \"CPUManager\" and \"CPUManagerPolicyOptions\" feature gates to be enabled. Default: nil",
@@ -73,12 +73,12 @@ func Funcs(codecs runtimeserializer.CodecFactory) []interface{} {
 		obj.NodeStatusReportFrequency = metav1.Duration{Duration: time.Minute}
 		obj.NodeLeaseDurationSeconds = 40
 		obj.CPUManagerPolicy = "none"
-		obj.CPUManagerPolicyOptions = make(map[string]string)
+		obj.CPUManagerPolicyOptions = nil
 		obj.CPUManagerReconcilePeriod = obj.NodeStatusUpdateFrequency
 		obj.NodeStatusMaxImages = 50
 		obj.TopologyManagerPolicy = kubeletconfig.NoneTopologyManagerPolicy
 		obj.TopologyManagerScope = kubeletconfig.ContainerTopologyManagerScope
-		obj.TopologyManagerPolicyOptions = make(map[string]string)
+		obj.TopologyManagerPolicyOptions = nil
 		obj.QOSReserved = map[string]string{
 			"memory": "50%",
 		}
@@ -104,13 +104,14 @@ func Funcs(codecs runtimeserializer.CodecFactory) []interface{} {
 		obj.CgroupsPerQOS = true
 		obj.CgroupDriver = "cgroupfs"
 		obj.EnforceNodeAllocatable = kubeletconfigv1beta1.DefaultNodeAllocatableEnforcement
-		obj.StaticPodURLHeader = make(map[string][]string)
+		obj.StaticPodURLHeader = nil
+		obj.SingleProcessOOMKill = ptr.To(false)
 		obj.ContainerLogMaxFiles = 5
 		obj.ContainerLogMaxSize = "10Mi"
 		obj.ContainerLogMaxWorkers = 1
 		obj.ContainerLogMonitorInterval = metav1.Duration{Duration: 10 * time.Second}
 		obj.ConfigMapAndSecretChangeDetectionStrategy = "Watch"
-		obj.AllowedUnsafeSysctls = []string{}
+		obj.AllowedUnsafeSysctls = nil
 		obj.VolumePluginDir = kubeletconfigv1beta1.DefaultVolumePluginDir
 		obj.ContainerRuntimeEndpoint = "unix:///run/containerd/containerd.sock"
@@ -233,6 +233,7 @@ var (
 		"Logging.Options.Text.OutputRoutingOptions.SplitStream",
 		"Logging.VModule[*].FilePattern",
 		"Logging.VModule[*].Verbosity",
+		"SingleProcessOOMKill",
 		"Logging.Verbosity",
 		"TLSCipherSuites[*]",
 		"TLSMinVersion",
@@ -229,6 +229,10 @@ type KubeletConfiguration struct {
 	CgroupsPerQOS bool
 	// driver that the kubelet uses to manipulate cgroups on the host (cgroupfs or systemd)
 	CgroupDriver string
+	// SingleProcessOOMKill, if true, will prevent the `memory.oom.group` flag from being set for container
+	// cgroups in cgroups v2. This causes processes in the container to be OOM killed individually instead of as
+	// a group. It means that if true, the behavior aligns with the behavior of cgroups v1.
+	SingleProcessOOMKill *bool
 	// CPUManagerPolicy is the name of the policy to use.
 	// Requires the CPUManager feature gate to be enabled.
 	CPUManagerPolicy string
@@ -34,7 +34,6 @@ import (
 )

 func TestSetDefaultsKubeletConfiguration(t *testing.T) {
-
 	tests := []struct {
 		name   string
 		config *v1beta1.KubeletConfiguration
@@ -130,6 +129,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
 				RegisterNode:                  ptr.To(true),
 				LocalStorageCapacityIsolation: ptr.To(true),
 				PodLogsDir:                    DefaultPodLogsDir,
+				SingleProcessOOMKill:          nil,
 			},
 		},
 		{
@@ -261,6 +261,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
 				RegisterNode:                  ptr.To(false),
 				LocalStorageCapacityIsolation: ptr.To(false),
 				PodLogsDir:                    "",
+				SingleProcessOOMKill:          ptr.To(false),
 			},
 			&v1beta1.KubeletConfiguration{
 				EnableServer: ptr.To(false),
@@ -363,6 +364,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
 				RegisterNode:                  ptr.To(false),
 				LocalStorageCapacityIsolation: ptr.To(false),
 				PodLogsDir:                    DefaultPodLogsDir,
+				SingleProcessOOMKill:          ptr.To(false),
 			},
 		},
 		{
@@ -516,6 +518,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
 				RegisterNode:                  ptr.To(true),
 				LocalStorageCapacityIsolation: ptr.To(true),
 				PodLogsDir:                    "/custom/path",
+				SingleProcessOOMKill:          ptr.To(true),
 			},
 			&v1beta1.KubeletConfiguration{
 				EnableServer: ptr.To(true),
@@ -666,6 +669,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
 				RegisterNode:                  ptr.To(true),
 				LocalStorageCapacityIsolation: ptr.To(true),
 				PodLogsDir:                    "/custom/path",
+				SingleProcessOOMKill:          ptr.To(true),
 			},
 		},
 		{
@@ -759,6 +763,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
 				RegisterNode:                  ptr.To(true),
 				LocalStorageCapacityIsolation: ptr.To(true),
 				PodLogsDir:                    DefaultPodLogsDir,
+				SingleProcessOOMKill:          nil,
 			},
 		},
 		{
@@ -852,6 +857,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
 				RegisterNode:                  ptr.To(true),
 				LocalStorageCapacityIsolation: ptr.To(true),
 				PodLogsDir:                    DefaultPodLogsDir,
+				SingleProcessOOMKill:          nil,
 			},
 		},
 		{
@@ -945,6 +951,7 @@ func TestSetDefaultsKubeletConfiguration(t *testing.T) {
 				RegisterNode:                  ptr.To(true),
 				LocalStorageCapacityIsolation: ptr.To(true),
 				PodLogsDir:                    DefaultPodLogsDir,
+				SingleProcessOOMKill:          nil,
 			},
 		},
 	}
@@ -409,6 +409,7 @@ func autoConvert_v1beta1_KubeletConfiguration_To_config_KubeletConfiguration(in
 	}
 	out.CgroupDriver = in.CgroupDriver
 	out.CPUManagerPolicy = in.CPUManagerPolicy
+	out.SingleProcessOOMKill = (*bool)(unsafe.Pointer(in.SingleProcessOOMKill))
 	out.CPUManagerPolicyOptions = *(*map[string]string)(unsafe.Pointer(&in.CPUManagerPolicyOptions))
 	out.CPUManagerReconcilePeriod = in.CPUManagerReconcilePeriod
 	out.MemoryManagerPolicy = in.MemoryManagerPolicy
@@ -606,6 +607,7 @@ func autoConvert_config_KubeletConfiguration_To_v1beta1_KubeletConfiguration(in
 		return err
 	}
 	out.CgroupDriver = in.CgroupDriver
+	out.SingleProcessOOMKill = (*bool)(unsafe.Pointer(in.SingleProcessOOMKill))
 	out.CPUManagerPolicy = in.CPUManagerPolicy
 	out.CPUManagerPolicyOptions = *(*map[string]string)(unsafe.Pointer(&in.CPUManagerPolicyOptions))
 	out.CPUManagerReconcilePeriod = in.CPUManagerReconcilePeriod
@@ -24,13 +24,19 @@ import (

 	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
+	"k8s.io/utils/ptr"
 )

 // validateKubeletOSConfiguration validates os specific kubelet configuration and returns an error if it is invalid.
 func validateKubeletOSConfiguration(kc *kubeletconfig.KubeletConfiguration) error {
-	if kc.FailCgroupV1 && !libcontainercgroups.IsCgroup2UnifiedMode() {
+	isCgroup1 := !libcontainercgroups.IsCgroup2UnifiedMode()
+	if kc.FailCgroupV1 && isCgroup1 {
 		return fmt.Errorf("kubelet is configured to not run on a host using cgroup v1. cgroup v1 support is in maintenance mode")
 	}

+	if isCgroup1 && kc.SingleProcessOOMKill != nil && !ptr.Deref(kc.SingleProcessOOMKill, true) {
+		return fmt.Errorf("invalid configuration: singleProcessOOMKill must not be explicitly set to false when using cgroup v1")
+	}
+
 	return nil
 }
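Taken together with the non-Linux variants in the next hunks, the per-platform rules condense to the sketch below. This is an illustrative summary, not code from the commit; validateSingleProcessOOMKill is an invented name.

    package main

    import (
        "errors"
        "fmt"
    )

    // validateSingleProcessOOMKill condenses the rules added in this commit:
    // non-Linux platforms must leave the field unset, and on Linux with
    // cgroups v1 an explicit false is rejected.
    func validateSingleProcessOOMKill(goos string, cgroupV2 bool, v *bool) error {
        if v == nil {
            return nil // unset is always allowed
        }
        if goos != "linux" {
            return errors.New("singleProcessOOMKill is only supported on linux")
        }
        if !cgroupV2 && !*v {
            return errors.New("singleProcessOOMKill must not be explicitly set to false when using cgroup v1")
        }
        return nil
    }

    func main() {
        f := false
        fmt.Println(validateSingleProcessOOMKill("linux", false, &f)) // rejected on cgroup v1
        fmt.Println(validateSingleProcessOOMKill("linux", true, &f))  // nil: allowed on cgroup v2
    }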
@@ -20,10 +20,16 @@ limitations under the License.
 package validation

 import (
+	"fmt"
+
 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
 )

 // validateKubeletOSConfiguration validates os specific kubelet configuration and returns an error if it is invalid.
 func validateKubeletOSConfiguration(kc *kubeletconfig.KubeletConfiguration) error {
+	if kc.SingleProcessOOMKill != nil {
+		return fmt.Errorf("invalid configuration: singleProcessOOMKill is only supported on linux")
+	}
+
 	return nil
 }
@@ -29,6 +29,7 @@ import (
 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
 	"k8s.io/kubernetes/pkg/kubelet/apis/config/validation"
 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
+	kubeletutil "k8s.io/kubernetes/pkg/kubelet/util"
 	"k8s.io/utils/ptr"
 )

@@ -78,6 +79,7 @@ var (
 		ContainerRuntimeEndpoint:    "unix:///run/containerd/containerd.sock",
 		ContainerLogMaxWorkers:      1,
 		ContainerLogMonitorInterval: metav1.Duration{Duration: 10 * time.Second},
+		SingleProcessOOMKill:        ptr.To(!kubeletutil.IsCgroup2UnifiedMode()),
 	}
 )
@@ -20,9 +20,10 @@ limitations under the License.
 package validation

 import (
-	"k8s.io/klog/v2"
+	"fmt"

 	"k8s.io/apimachinery/pkg/util/sets"
+	"k8s.io/klog/v2"

 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
@@ -36,6 +37,10 @@ func validateKubeletOSConfiguration(kc *kubeletconfig.KubeletConfiguration) erro
 		klog.Warningf(message, "CgroupsPerQOS", "--cgroups-per-qos", kc.CgroupsPerQOS)
 	}

+	if kc.SingleProcessOOMKill != nil {
+		return fmt.Errorf("invalid configuration: singleProcessOOMKill is not supported on Windows")
+	}
+
 	enforceNodeAllocatableWithoutNone := sets.New(kc.EnforceNodeAllocatable...).Delete(kubetypes.NodeAllocatableNoneKey)
 	if len(enforceNodeAllocatableWithoutNone) > 0 {
 		klog.Warningf(message, "EnforceNodeAllocatable", "--enforce-node-allocatable", kc.EnforceNodeAllocatable)
pkg/kubelet/apis/config/zz_generated.deepcopy.go (generated, 5 changes)
@@ -204,6 +204,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) {
 	out.ImageMinimumGCAge = in.ImageMinimumGCAge
 	out.ImageMaximumGCAge = in.ImageMaximumGCAge
 	out.VolumeStatsAggPeriod = in.VolumeStatsAggPeriod
+	if in.SingleProcessOOMKill != nil {
+		in, out := &in.SingleProcessOOMKill, &out.SingleProcessOOMKill
+		*out = new(bool)
+		**out = **in
+	}
 	if in.CPUManagerPolicyOptions != nil {
 		in, out := &in.CPUManagerPolicyOptions, &out.CPUManagerPolicyOptions
 		*out = make(map[string]string, len(*in))
@@ -45,6 +45,7 @@ import (
 	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
 	utilfs "k8s.io/kubernetes/pkg/util/filesystem"
 	netutils "k8s.io/utils/net"
+	"k8s.io/utils/ptr"

 	inuserns "github.com/moby/sys/userns"
 	v1 "k8s.io/api/core/v1"
@@ -661,6 +662,20 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
 		klet.podCache,
 	)

+	var singleProcessOOMKill *bool
+	if sysruntime.GOOS == "linux" {
+		if !util.IsCgroup2UnifiedMode() {
+			// This is a default behavior for cgroups v1.
+			singleProcessOOMKill = ptr.To(true)
+		} else {
+			if kubeCfg.SingleProcessOOMKill == nil {
+				singleProcessOOMKill = ptr.To(false)
+			} else {
+				singleProcessOOMKill = kubeCfg.SingleProcessOOMKill
+			}
+		}
+	}
+
 	runtime, err := kuberuntime.NewKubeGenericRuntimeManager(
 		kubecontainer.FilterEventRecorder(kubeDeps.Recorder),
 		klet.livenessManager,
@@ -680,6 +695,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
 		int(kubeCfg.RegistryBurst),
 		imageCredentialProviderConfigFile,
 		imageCredentialProviderBinDir,
+		singleProcessOOMKill,
 		kubeCfg.CPUCFSQuota,
 		kubeCfg.CPUCFSQuotaPeriod,
 		kubeDeps.RemoteRuntimeService,
@@ -3225,6 +3225,7 @@ func TestSyncPodSpans(t *testing.T) {
 		int(kubeCfg.RegistryBurst),
 		"",
 		"",
+		nil,
 		kubeCfg.CPUCFSQuota,
 		kubeCfg.CPUCFSQuotaPeriod,
 		runtimeSvc,
@@ -37,6 +37,7 @@ import (
 	utilfeature "k8s.io/apiserver/pkg/util/feature"
 	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
 	"k8s.io/klog/v2"

 	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
 	kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
 	kubefeatures "k8s.io/kubernetes/pkg/features"
@@ -45,6 +46,7 @@ import (
 	"k8s.io/kubernetes/pkg/kubelet/qos"
 	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
 	cgroups "k8s.io/kubernetes/third_party/forked/cgroups"
+	"k8s.io/utils/ptr"
 )

 var defaultPageSize = int64(os.Getpagesize())
@@ -247,7 +249,7 @@ func (m *kubeGenericRuntimeManager) calculateLinuxResources(cpuRequest, cpuLimit
 	}

 	// runc requires cgroupv2 for unified mode
-	if isCgroup2UnifiedMode() {
+	if isCgroup2UnifiedMode() && !ptr.Deref(m.singleProcessOOMKill, true) {
 		resources.Unified = map[string]string{
 			// Ask the kernel to kill all processes in the container cgroup in case of OOM.
 			// See memory.oom.group in https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html for
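When group kills stay enabled (cgroups v2 and singleProcessOOMKill not true), calculateLinuxResources asks the kernel for whole-cgroup OOM kills through the CRI Unified map. A rough, standalone sketch of the resulting CRI resources for a container with a 128Mi limit; the literal values mirror the test expectations later in this diff, and the snippet is illustrative rather than code from the commit.

    package main

    import (
        "fmt"

        runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
    )

    func main() {
        // On cgroups v2 with group OOM kills enabled, the kubelet emits
        // memory.oom.group=1 alongside the usual CPU and memory settings.
        resources := &runtimeapi.LinuxContainerResources{
            CpuPeriod:          100000,
            CpuQuota:           200000,
            CpuShares:          1024,
            MemoryLimitInBytes: 134217728,
            Unified:            map[string]string{"memory.oom.group": "1"},
        }
        fmt.Println(resources.Unified["memory.oom.group"]) // "1"
    }

With singleProcessOOMKill set to true, the Unified entry is simply omitted and the kernel falls back to killing individual processes.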
@@ -249,12 +249,13 @@ func TestCalculateLinuxResources(t *testing.T) {
 	}

 	tests := []struct {
-		name          string
-		cpuReq        *resource.Quantity
-		cpuLim        *resource.Quantity
-		memLim        *resource.Quantity
-		expected      *runtimeapi.LinuxContainerResources
-		cgroupVersion CgroupVersion
+		name                 string
+		cpuReq               *resource.Quantity
+		cpuLim               *resource.Quantity
+		memLim               *resource.Quantity
+		expected             *runtimeapi.LinuxContainerResources
+		cgroupVersion        CgroupVersion
+		singleProcessOOMKill bool
 	}{
 		{
 			name: "Request128MBLimit256MB",
@@ -321,6 +322,20 @@ func TestCalculateLinuxResources(t *testing.T) {
 			},
 			cgroupVersion: cgroupV2,
 		},
+		{
+			name:   "Request128MBLimit256MBSingleProcess",
+			cpuReq: generateResourceQuantity("1"),
+			cpuLim: generateResourceQuantity("2"),
+			memLim: generateResourceQuantity("128Mi"),
+			expected: &runtimeapi.LinuxContainerResources{
+				CpuPeriod:          100000,
+				CpuQuota:           200000,
+				CpuShares:          1024,
+				MemoryLimitInBytes: 134217728,
+			},
+			cgroupVersion:        cgroupV2,
+			singleProcessOOMKill: true,
+		},
 		{
 			name:   "RequestNoMemory",
 			cpuReq: generateResourceQuantity("2"),
@@ -365,6 +380,7 @@ func TestCalculateLinuxResources(t *testing.T) {
 	}
 	for _, test := range tests {
 		setCgroupVersionDuringTest(test.cgroupVersion)
+		m.singleProcessOOMKill = ptr.To(test.singleProcessOOMKill)
 		linuxContainerResources := m.calculateLinuxResources(test.cpuReq, test.cpuLim, test.memLim)
 		assert.Equal(t, test.expected, linuxContainerResources)
 	}
@@ -808,16 +824,18 @@ func TestGenerateLinuxContainerResources(t *testing.T) {
 	}

 	for _, tc := range []struct {
-		name          string
-		limits        v1.ResourceList
-		requests      v1.ResourceList
-		expected      *runtimeapi.LinuxContainerResources
-		cgroupVersion CgroupVersion
+		name                 string
+		limits               v1.ResourceList
+		requests             v1.ResourceList
+		singleProcessOOMKill bool
+		expected             *runtimeapi.LinuxContainerResources
+		cgroupVersion        CgroupVersion
 	}{
 		{
 			"requests & limits, cpu & memory, guaranteed qos",
 			v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")},
 			v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")},
+			true,
 			&runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 524288000, OomScoreAdj: -997},
 			cgroupV1,
 		},
@@ -825,6 +843,7 @@ func TestGenerateLinuxContainerResources(t *testing.T) {
 			"requests & limits, cpu & memory, burstable qos",
 			v1.ResourceList{v1.ResourceCPU: resource.MustParse("500m"), v1.ResourceMemory: resource.MustParse("750Mi")},
 			v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")},
+			true,
 			&runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 786432000, OomScoreAdj: 970},
 			cgroupV1,
 		},
@@ -832,6 +851,7 @@ func TestGenerateLinuxContainerResources(t *testing.T) {
 			"best-effort qos",
 			nil,
 			nil,
+			true,
 			&runtimeapi.LinuxContainerResources{CpuShares: 2, OomScoreAdj: 1000},
 			cgroupV1,
 		},
@@ -839,6 +859,7 @@ func TestGenerateLinuxContainerResources(t *testing.T) {
 			"requests & limits, cpu & memory, guaranteed qos",
 			v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")},
 			v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")},
+			false,
 			&runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 524288000, OomScoreAdj: -997, Unified: map[string]string{"memory.oom.group": "1"}},
 			cgroupV2,
 		},
@@ -846,6 +867,7 @@ func TestGenerateLinuxContainerResources(t *testing.T) {
 			"requests & limits, cpu & memory, burstable qos",
 			v1.ResourceList{v1.ResourceCPU: resource.MustParse("500m"), v1.ResourceMemory: resource.MustParse("750Mi")},
 			v1.ResourceList{v1.ResourceCPU: resource.MustParse("250m"), v1.ResourceMemory: resource.MustParse("500Mi")},
+			false,
 			&runtimeapi.LinuxContainerResources{CpuShares: 256, MemoryLimitInBytes: 786432000, OomScoreAdj: 970, Unified: map[string]string{"memory.oom.group": "1"}},
 			cgroupV2,
 		},
@@ -853,6 +875,7 @@ func TestGenerateLinuxContainerResources(t *testing.T) {
 			"best-effort qos",
 			nil,
 			nil,
+			false,
 			&runtimeapi.LinuxContainerResources{CpuShares: 2, OomScoreAdj: 1000, Unified: map[string]string{"memory.oom.group": "1"}},
 			cgroupV2,
 		},
@@ -863,6 +886,8 @@ func TestGenerateLinuxContainerResources(t *testing.T) {

 			pod.Spec.Containers[0].Resources = v1.ResourceRequirements{Limits: tc.limits, Requests: tc.requests}

+			m.singleProcessOOMKill = ptr.To(tc.singleProcessOOMKill)
+
 			resources := m.generateLinuxContainerResources(pod, &pod.Spec.Containers[0], false)
 			tc.expected.HugepageLimits = resources.HugepageLimits
 			assert.Equal(t, tc.expected, resources)
@@ -118,6 +118,11 @@ type kubeGenericRuntimeManager struct {
 	readinessManager proberesults.Manager
 	startupManager   proberesults.Manager

+	// If false, pass "memory.oom.group" to container cgroups when using cgroups v2 to cause processes
+	// in those cgroups to be killed as a unit by the OOM killer.
+	// It must be nil except for linux
+	singleProcessOOMKill *bool
+
 	// If true, enforce container cpu limits with CFS quota support
 	cpuCFSQuota bool

@@ -198,6 +203,7 @@ func NewKubeGenericRuntimeManager(
 	imagePullBurst int,
 	imageCredentialProviderConfigFile string,
 	imageCredentialProviderBinDir string,
+	singleProcessOOMKill *bool,
 	cpuCFSQuota bool,
 	cpuCFSQuotaPeriod metav1.Duration,
 	runtimeService internalapi.RuntimeService,
@@ -218,6 +224,7 @@ func NewKubeGenericRuntimeManager(
 	tracer := tracerProvider.Tracer(instrumentationScope)
 	kubeRuntimeManager := &kubeGenericRuntimeManager{
 		recorder:             recorder,
+		singleProcessOOMKill: singleProcessOOMKill,
 		cpuCFSQuota:          cpuCFSQuota,
 		cpuCFSQuotaPeriod:    cpuCFSQuotaPeriod,
 		seccompProfileRoot:   filepath.Join(rootDirectory, "seccomp"),
@@ -28,11 +28,13 @@ import (
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
+	"k8s.io/utils/ptr"
 )

 func TestApplySandboxResources(t *testing.T) {
 	_, _, m, err := createTestRuntimeManager()
 	m.cpuCFSQuota = true
+	m.singleProcessOOMKill = ptr.To(false)

 	config := &runtimeapi.PodSandboxConfig{
 		Linux: &runtimeapi.LinuxPodSandboxConfig{},
pkg/kubelet/util/util_linux.go (new file, 29 lines)
@@ -0,0 +1,29 @@
+//go:build linux
+// +build linux
+
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package util
+
+import (
+	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
+)
+
+// IsCgroup2UnifiedMode returns true if the cgroup v2 unified mode is enabled
+func IsCgroup2UnifiedMode() bool {
+	return libcontainercgroups.IsCgroup2UnifiedMode()
+}
pkg/kubelet/util/util_others.go (new file, 25 lines)
@@ -0,0 +1,25 @@
+//go:build !linux && !windows
+// +build !linux,!windows
+
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package util
+
+// IsCgroup2UnifiedMode is a no-op for other OSes.
+func IsCgroup2UnifiedMode() bool {
+	return false
+}
@@ -73,3 +73,8 @@ func NormalizePath(path string) string {
 	}
 	return path
 }
+
+// IsCgroup2UnifiedMode is a no-op for Windows for now
+func IsCgroup2UnifiedMode() bool {
+	return false
+}
@@ -354,6 +354,15 @@ type KubeletConfiguration struct {
 	// Default: "None"
 	// +optional
 	CPUManagerPolicy string `json:"cpuManagerPolicy,omitempty"`
+	// singleProcessOOMKill, if true, will prevent the `memory.oom.group` flag from being set for container
+	// cgroups in cgroups v2. This causes processes in the container to be OOM killed individually instead of as
+	// a group. It means that if true, the behavior aligns with the behavior of cgroups v1.
+	// The default value is determined automatically when you don't specify.
+	// On non-linux such as windows, only null / absent is allowed.
+	// On cgroup v1 linux, only null / absent and true are allowed.
+	// On cgroup v2 linux, null / absent, true and false are allowed. The default value is false.
+	// +optional
+	SingleProcessOOMKill *bool `json:"singleProcessOOMKill,omitempty"`
 	// cpuManagerPolicyOptions is a set of key=value which allows to set extra options
 	// to fine tune the behaviour of the cpu manager policies.
 	// Requires both the "CPUManager" and "CPUManagerPolicyOptions" feature gates to be enabled.
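As a usage illustration (a minimal sketch, not part of this commit, and assuming the external types live in the usual k8s.io/kubelet/config/v1beta1 staging package), the new field is set like any other optional boolean pointer in the v1beta1 KubeletConfiguration:

    package main

    import (
        "fmt"

        kubeletconfigv1beta1 "k8s.io/kubelet/config/v1beta1"
        "k8s.io/utils/ptr"
    )

    func main() {
        // Opt out of group OOM kills on a cgroups v2 node: each process in a
        // container cgroup is then OOM killed individually, as on cgroups v1.
        cfg := &kubeletconfigv1beta1.KubeletConfiguration{
            SingleProcessOOMKill: ptr.To(true),
        }
        fmt.Println(*cfg.SingleProcessOOMKill)
    }

Leaving the field unset keeps the platform-dependent default described in the doc comment above.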
@@ -254,6 +254,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) {
 		*out = new(bool)
 		**out = **in
 	}
+	if in.SingleProcessOOMKill != nil {
+		in, out := &in.SingleProcessOOMKill, &out.SingleProcessOOMKill
+		*out = new(bool)
+		**out = **in
+	}
 	if in.CPUManagerPolicyOptions != nil {
 		in, out := &in.CPUManagerPolicyOptions, &out.CPUManagerPolicyOptions
 		*out = make(map[string]string, len(*in))
@@ -19,6 +19,7 @@ package e2enode
 import (
 	"context"
 	"fmt"
+	"time"

 	"github.com/onsi/gomega"
 	v1 "k8s.io/api/core/v1"
@@ -35,9 +36,11 @@
 )

 type testCase struct {
-	name                   string
-	podSpec                *v1.Pod
-	oomTargetContainerName string
+	name                    string
+	podSpec                 *v1.Pod
+	oomTargetContainerName  string
+	enableSingleProcessKill *bool
+	expectPodRunning        bool
 }

 // KubeReservedMemory is default fraction value of node capacity memory to
@@ -62,7 +65,7 @@ var _ = SIGDescribe("OOMKiller for pod using more memory than node allocatable [
 	}
 })

-var _ = SIGDescribe("OOMKiller [LinuxOnly]", framework.WithNodeConformance(), func() {
+var _ = SIGDescribe("OOMKiller [LinuxOnly]", framework.WithNodeConformance(), framework.WithSerial(), func() {
 	f := framework.NewDefaultFramework("oomkiller-test")
 	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
@@ -89,6 +92,24 @@ var _ = SIGDescribe("OOMKiller [LinuxOnly]", framework.WithNodeConformance(), fu
 			oomTargetContainerName: "oomkill-multi-target-container",
 			podSpec: getOOMTargetPod("oomkill-target-pod", "oomkill-multi-target-container",
 				getOOMTargetContainerMultiProcess),
+			enableSingleProcessKill: nil,
 		})

+		testCases = append(testCases, testCase{
+			name:                   "multi process container (single process kill enabled)",
+			oomTargetContainerName: "oomkill-multi-target-container",
+			podSpec: getOOMTargetPod("oomkill-target-pod", "oomkill-multi-target-container",
+				getOOMTargetContainerMultiProcess),
+			enableSingleProcessKill: ptr.To(true),
+			expectPodRunning:        true,
+		})
+
+		testCases = append(testCases, testCase{
+			name:                   "multi process container (single process kill disabled)",
+			oomTargetContainerName: "oomkill-multi-target-container",
+			podSpec: getOOMTargetPod("oomkill-target-pod", "oomkill-multi-target-container",
+				getOOMTargetContainerMultiProcess),
+			enableSingleProcessKill: ptr.To(false),
+		})
 	}
 	for _, tc := range testCases {
@@ -99,8 +120,8 @@ var _ = SIGDescribe("OOMKiller [LinuxOnly]", framework.WithNodeConformance(), fu
 func runOomKillerTest(f *framework.Framework, testCase testCase, kubeReservedMemory float64) {
 	ginkgo.Context(testCase.name, func() {
 		// Update KubeReservedMemory in KubeletConfig.
-		if kubeReservedMemory > 0 {
-			tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
+		tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
+			if kubeReservedMemory > 0 {
 				if initialConfig.KubeReserved == nil {
 					initialConfig.KubeReserved = map[string]string{}
 				}
@@ -109,8 +130,10 @@ func runOomKillerTest(f *framework.Framework, testCase testCase, kubeReservedMem
 				// K8s components such that node allocatable memory is less than node capacity to
 				// observe OOM kills at cgroup level instead of system OOM kills.
 				initialConfig.KubeReserved["memory"] = fmt.Sprintf("%d", int(kubeReservedMemory*getLocalNode(context.TODO(), f).Status.Capacity.Memory().AsApproximateFloat64()))
-			})
-		}
+			}
+
+			initialConfig.SingleProcessOOMKill = testCase.enableSingleProcessKill
+		})

 		ginkgo.BeforeEach(func() {
 			// Precautionary check that kubelet is healthy before running the test.
@@ -120,18 +143,37 @@ func runOomKillerTest(f *framework.Framework, testCase testCase, kubeReservedMem
 			e2epod.NewPodClient(f).Create(context.TODO(), testCase.podSpec)
 		})

-		ginkgo.It("The containers terminated by OOM killer should have the reason set to OOMKilled", func() {
-			ginkgo.By("Waiting for the pod to be failed")
-			err := e2epod.WaitForPodTerminatedInNamespace(context.TODO(), f.ClientSet, testCase.podSpec.Name, "", f.Namespace.Name)
-			framework.ExpectNoError(err, "Failed waiting for pod to terminate, %s/%s", f.Namespace.Name, testCase.podSpec.Name)
+		if testCase.expectPodRunning {
+			ginkgo.It("The containers should not be OOMKilled", func() {
+				err := e2epod.WaitForPodsRunning(context.TODO(), f.ClientSet, f.Namespace.Name, 1, framework.PodStartTimeout)
+				framework.ExpectNoError(err, "Failed waiting for pod to be running state, %s/%s", f.Namespace.Name, testCase.podSpec.Name)

-			ginkgo.By("Fetching the latest pod status")
-			pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(context.TODO(), testCase.podSpec.Name, metav1.GetOptions{})
-			framework.ExpectNoError(err, "Failed to get the recent pod object for name: %q", pod.Name)
+				gomega.Consistently(context.TODO(), func(ctx context.Context) error {
+					pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(ctx, testCase.podSpec.Name, metav1.GetOptions{})
+					if err != nil {
+						return fmt.Errorf("expected the pod %s to exist: %w", pod.Name, err)
+					}
+					phase := pod.Status.Phase
+					if phase != v1.PodRunning && phase != v1.PodSucceeded {
+						return fmt.Errorf("pod %s: unexpected status %s, expected status: %s or %s", pod.Name, pod.Status.Phase, v1.PodRunning, v1.PodSucceeded)
+					}
+					return nil
+				}, 10*time.Second, f.Timeouts.Poll).Should(gomega.BeNil())
+			})
+		} else {
+			ginkgo.It("The containers terminated by OOM killer should have the reason set to OOMKilled", func() {
+				ginkgo.By("Waiting for the pod to be failed")
+				err := e2epod.WaitForPodTerminatedInNamespace(context.TODO(), f.ClientSet, testCase.podSpec.Name, "", f.Namespace.Name)
+				framework.ExpectNoError(err, "Failed waiting for pod to terminate, %s/%s", f.Namespace.Name, testCase.podSpec.Name)

-			ginkgo.By("Verifying the OOM target container has the expected reason")
-			verifyReasonForOOMKilledContainer(pod, testCase.oomTargetContainerName)
-		})
+				ginkgo.By("Fetching the latest pod status")
+				pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(context.TODO(), testCase.podSpec.Name, metav1.GetOptions{})
+				framework.ExpectNoError(err, "Failed to get the recent pod object for name: %q", pod.Name)
+
+				ginkgo.By("Verifying the OOM target container has the expected reason")
+				verifyReasonForOOMKilledContainer(pod, testCase.oomTargetContainerName)
+			})
+		}

 		ginkgo.AfterEach(func() {
 			ginkgo.By(fmt.Sprintf("deleting pod: %s", testCase.podSpec.Name))