diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go index a7ed8e0014a..3bd3ac16eb7 100644 --- a/cmd/kubelet/app/server.go +++ b/cmd/kubelet/app/server.go @@ -834,7 +834,7 @@ func run(ctx context.Context, s *options.KubeletServer, kubeDeps *kubelet.Depend s.TopologyManagerPolicyOptions, features.TopologyManagerPolicyOptions) } if utilfeature.DefaultFeatureGate.Enabled(features.NodeSwap) { - if !kubeletutil.IsCgroup2UnifiedMode() && s.MemorySwap.SwapBehavior == kubelettypes.LimitedSwap { + if !kubeletutil.IsCgroup2UnifiedMode() && s.MemorySwap.SwapBehavior == string(kubelettypes.LimitedSwap) { // This feature is not supported for cgroupv1 so we are failing early. return fmt.Errorf("swap feature is enabled and LimitedSwap but it is only supported with cgroupv2") } diff --git a/pkg/kubelet/apis/config/validation/validation.go b/pkg/kubelet/apis/config/validation/validation.go index 6f5197ca92c..8d1c2869475 100644 --- a/pkg/kubelet/apis/config/validation/validation.go +++ b/pkg/kubelet/apis/config/validation/validation.go @@ -203,8 +203,8 @@ func ValidateKubeletConfiguration(kc *kubeletconfig.KubeletConfiguration, featur if localFeatureGate.Enabled(features.NodeSwap) { switch kc.MemorySwap.SwapBehavior { case "": - case kubetypes.NoSwap: - case kubetypes.LimitedSwap: + case string(kubetypes.NoSwap): + case string(kubetypes.LimitedSwap): default: allErrors = append(allErrors, fmt.Errorf("invalid configuration: memorySwap.swapBehavior %q must be one of: \"\", %q or %q", kc.MemorySwap.SwapBehavior, kubetypes.LimitedSwap, kubetypes.NoSwap)) } diff --git a/pkg/kubelet/apis/config/validation/validation_test.go b/pkg/kubelet/apis/config/validation/validation_test.go index 08da05c2efc..75fccec60ec 100644 --- a/pkg/kubelet/apis/config/validation/validation_test.go +++ b/pkg/kubelet/apis/config/validation/validation_test.go @@ -385,7 +385,7 @@ func TestValidateKubeletConfiguration(t *testing.T) { name: "specify MemorySwap.SwapBehavior without enabling NodeSwap", configure: func(conf *kubeletconfig.KubeletConfiguration) *kubeletconfig.KubeletConfiguration { conf.FeatureGates = map[string]bool{"NodeSwap": false} - conf.MemorySwap.SwapBehavior = kubetypes.LimitedSwap + conf.MemorySwap.SwapBehavior = string(kubetypes.LimitedSwap) return conf }, errMsg: "invalid configuration: memorySwap.swapBehavior cannot be set when NodeSwap feature flag is disabled", diff --git a/pkg/kubelet/container/runtime.go b/pkg/kubelet/container/runtime.go index 025579ab80e..1073f734031 100644 --- a/pkg/kubelet/container/runtime.go +++ b/pkg/kubelet/container/runtime.go @@ -34,6 +34,7 @@ import ( runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/credentialprovider" + kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/volume" ) @@ -138,6 +139,9 @@ type Runtime interface { ListPodSandboxMetrics(ctx context.Context) ([]*runtimeapi.PodSandboxMetrics, error) // GetContainerStatus returns the status for the container. GetContainerStatus(ctx context.Context, id ContainerID) (*Status, error) + // GetContainerSwapBehavior reports whether a container could be swappable. + // This is used to decide whether to handle InPlacePodVerticalScaling for containers. + GetContainerSwapBehavior(pod *v1.Pod, container *v1.Container) kubelettypes.SwapBehavior } // StreamingRuntime is the interface implemented by runtimes that handle the serving of the diff --git a/pkg/kubelet/container/testing/fake_runtime.go b/pkg/kubelet/container/testing/fake_runtime.go index e360ba40bb8..226972a44a7 100644 --- a/pkg/kubelet/container/testing/fake_runtime.go +++ b/pkg/kubelet/container/testing/fake_runtime.go @@ -30,6 +30,7 @@ import ( runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" "k8s.io/kubernetes/pkg/credentialprovider" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" + kubetypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/volume" ) @@ -70,6 +71,7 @@ type FakeRuntime struct { // from container runtime. BlockImagePulls bool imagePullTokenBucket chan bool + SwapBehavior map[string]kubetypes.SwapBehavior T TB } @@ -536,3 +538,10 @@ func (f *FakeRuntime) GetContainerStatus(_ context.Context, _ kubecontainer.Cont f.CalledFunctions = append(f.CalledFunctions, "GetContainerStatus") return nil, f.Err } + +func (f *FakeRuntime) GetContainerSwapBehavior(pod *v1.Pod, container *v1.Container) kubetypes.SwapBehavior { + if f.SwapBehavior != nil && f.SwapBehavior[container.Name] != "" { + return f.SwapBehavior[container.Name] + } + return kubetypes.NoSwap +} diff --git a/pkg/kubelet/container/testing/runtime_mock.go b/pkg/kubelet/container/testing/runtime_mock.go index 1f021ea06d7..d53f2089b1a 100644 --- a/pkg/kubelet/container/testing/runtime_mock.go +++ b/pkg/kubelet/container/testing/runtime_mock.go @@ -33,7 +33,9 @@ import ( mock "github.com/stretchr/testify/mock" - types "k8s.io/apimachinery/pkg/types" + pkgtypes "k8s.io/apimachinery/pkg/types" + + types "k8s.io/kubernetes/pkg/kubelet/types" v1 "k8s.io/cri-api/pkg/apis/runtime/v1" ) @@ -419,6 +421,53 @@ func (_c *MockRuntime_GetContainerStatus_Call) RunAndReturn(run func(context.Con return _c } +// GetContainerSwapBehavior provides a mock function with given fields: pod, _a1 +func (_m *MockRuntime) GetContainerSwapBehavior(pod *corev1.Pod, _a1 *corev1.Container) types.SwapBehavior { + ret := _m.Called(pod, _a1) + + if len(ret) == 0 { + panic("no return value specified for GetContainerSwapBehavior") + } + + var r0 types.SwapBehavior + if rf, ok := ret.Get(0).(func(*corev1.Pod, *corev1.Container) types.SwapBehavior); ok { + r0 = rf(pod, _a1) + } else { + r0 = ret.Get(0).(types.SwapBehavior) + } + + return r0 +} + +// MockRuntime_GetContainerSwapBehavior_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'GetContainerSwapBehavior' +type MockRuntime_GetContainerSwapBehavior_Call struct { + *mock.Call +} + +// GetContainerSwapBehavior is a helper method to define mock.On call +// - pod *corev1.Pod +// - _a1 *corev1.Container +func (_e *MockRuntime_Expecter) GetContainerSwapBehavior(pod interface{}, _a1 interface{}) *MockRuntime_GetContainerSwapBehavior_Call { + return &MockRuntime_GetContainerSwapBehavior_Call{Call: _e.mock.On("GetContainerSwapBehavior", pod, _a1)} +} + +func (_c *MockRuntime_GetContainerSwapBehavior_Call) Run(run func(pod *corev1.Pod, _a1 *corev1.Container)) *MockRuntime_GetContainerSwapBehavior_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(*corev1.Pod), args[1].(*corev1.Container)) + }) + return _c +} + +func (_c *MockRuntime_GetContainerSwapBehavior_Call) Return(_a0 types.SwapBehavior) *MockRuntime_GetContainerSwapBehavior_Call { + _c.Call.Return(_a0) + return _c +} + +func (_c *MockRuntime_GetContainerSwapBehavior_Call) RunAndReturn(run func(*corev1.Pod, *corev1.Container) types.SwapBehavior) *MockRuntime_GetContainerSwapBehavior_Call { + _c.Call.Return(run) + return _c +} + // GetImageRef provides a mock function with given fields: ctx, image func (_m *MockRuntime) GetImageRef(ctx context.Context, image container.ImageSpec) (string, error) { ret := _m.Called(ctx, image) @@ -534,7 +583,7 @@ func (_c *MockRuntime_GetImageSize_Call) RunAndReturn(run func(context.Context, } // GetPodStatus provides a mock function with given fields: ctx, uid, name, namespace -func (_m *MockRuntime) GetPodStatus(ctx context.Context, uid types.UID, name string, namespace string) (*container.PodStatus, error) { +func (_m *MockRuntime) GetPodStatus(ctx context.Context, uid pkgtypes.UID, name string, namespace string) (*container.PodStatus, error) { ret := _m.Called(ctx, uid, name, namespace) if len(ret) == 0 { @@ -543,10 +592,10 @@ func (_m *MockRuntime) GetPodStatus(ctx context.Context, uid types.UID, name str var r0 *container.PodStatus var r1 error - if rf, ok := ret.Get(0).(func(context.Context, types.UID, string, string) (*container.PodStatus, error)); ok { + if rf, ok := ret.Get(0).(func(context.Context, pkgtypes.UID, string, string) (*container.PodStatus, error)); ok { return rf(ctx, uid, name, namespace) } - if rf, ok := ret.Get(0).(func(context.Context, types.UID, string, string) *container.PodStatus); ok { + if rf, ok := ret.Get(0).(func(context.Context, pkgtypes.UID, string, string) *container.PodStatus); ok { r0 = rf(ctx, uid, name, namespace) } else { if ret.Get(0) != nil { @@ -554,7 +603,7 @@ func (_m *MockRuntime) GetPodStatus(ctx context.Context, uid types.UID, name str } } - if rf, ok := ret.Get(1).(func(context.Context, types.UID, string, string) error); ok { + if rf, ok := ret.Get(1).(func(context.Context, pkgtypes.UID, string, string) error); ok { r1 = rf(ctx, uid, name, namespace) } else { r1 = ret.Error(1) @@ -570,16 +619,16 @@ type MockRuntime_GetPodStatus_Call struct { // GetPodStatus is a helper method to define mock.On call // - ctx context.Context -// - uid types.UID +// - uid pkgtypes.UID // - name string // - namespace string func (_e *MockRuntime_Expecter) GetPodStatus(ctx interface{}, uid interface{}, name interface{}, namespace interface{}) *MockRuntime_GetPodStatus_Call { return &MockRuntime_GetPodStatus_Call{Call: _e.mock.On("GetPodStatus", ctx, uid, name, namespace)} } -func (_c *MockRuntime_GetPodStatus_Call) Run(run func(ctx context.Context, uid types.UID, name string, namespace string)) *MockRuntime_GetPodStatus_Call { +func (_c *MockRuntime_GetPodStatus_Call) Run(run func(ctx context.Context, uid pkgtypes.UID, name string, namespace string)) *MockRuntime_GetPodStatus_Call { _c.Call.Run(func(args mock.Arguments) { - run(args[0].(context.Context), args[1].(types.UID), args[2].(string), args[3].(string)) + run(args[0].(context.Context), args[1].(pkgtypes.UID), args[2].(string), args[3].(string)) }) return _c } @@ -589,7 +638,7 @@ func (_c *MockRuntime_GetPodStatus_Call) Return(_a0 *container.PodStatus, _a1 er return _c } -func (_c *MockRuntime_GetPodStatus_Call) RunAndReturn(run func(context.Context, types.UID, string, string) (*container.PodStatus, error)) *MockRuntime_GetPodStatus_Call { +func (_c *MockRuntime_GetPodStatus_Call) RunAndReturn(run func(context.Context, pkgtypes.UID, string, string) (*container.PodStatus, error)) *MockRuntime_GetPodStatus_Call { _c.Call.Return(run) return _c } diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 6d999fa6def..fedf14af503 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -2920,6 +2920,40 @@ func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, string, string) { return true, "", "" } +func disallowResizeForSwappableContainers(runtime kubecontainer.Runtime, desiredPod, allocatedPod *v1.Pod) (bool, string) { + if desiredPod == nil || allocatedPod == nil { + return false, "" + } + restartableMemoryResizePolicy := func(resizePolicies []v1.ContainerResizePolicy) bool { + for _, policy := range resizePolicies { + if policy.ResourceName == v1.ResourceMemory { + return policy.RestartPolicy == v1.RestartContainer + } + } + return false + } + allocatedContainers := make(map[string]v1.Container) + for _, container := range append(allocatedPod.Spec.Containers, allocatedPod.Spec.InitContainers...) { + allocatedContainers[container.Name] = container + } + for _, desiredContainer := range append(desiredPod.Spec.Containers, desiredPod.Spec.InitContainers...) { + allocatedContainer, ok := allocatedContainers[desiredContainer.Name] + if !ok { + continue + } + origMemRequest := desiredContainer.Resources.Requests[v1.ResourceMemory] + newMemRequest := allocatedContainer.Resources.Requests[v1.ResourceMemory] + if !origMemRequest.Equal(newMemRequest) && !restartableMemoryResizePolicy(allocatedContainer.ResizePolicy) { + aSwapBehavior := runtime.GetContainerSwapBehavior(desiredPod, &desiredContainer) + bSwapBehavior := runtime.GetContainerSwapBehavior(allocatedPod, &allocatedContainer) + if aSwapBehavior != kubetypes.NoSwap || bSwapBehavior != kubetypes.NoSwap { + return true, "In-place resize of containers with swap is not supported." + } + } + } + return false, "" +} + // handlePodResourcesResize returns the "allocated pod", which should be used for all resource // calculations after this function is called. It also updates the cached ResizeStatus according to // the allocation decision and pod status. @@ -2949,6 +2983,10 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod, podStatus *kubecontaine // If there is a pending resize but the resize is not allowed, always use the allocated resources. kl.statusManager.SetPodResizePendingCondition(pod.UID, v1.PodReasonInfeasible, msg) return podFromAllocation, nil + } else if resizeNotAllowed, msg := disallowResizeForSwappableContainers(kl.containerRuntime, pod, podFromAllocation); resizeNotAllowed { + // If this resize involve swap recalculation, set as infeasible, as IPPR with swap is not supported for beta. + kl.statusManager.SetPodResizePendingCondition(pod.UID, v1.PodReasonInfeasible, msg) + return podFromAllocation, nil } kl.podResizeMutex.Lock() diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go index 6dbe8ee6456..0fb6b124a8b 100644 --- a/pkg/kubelet/kubelet_test.go +++ b/pkg/kubelet/kubelet_test.go @@ -2612,6 +2612,185 @@ func TestPodResourceAllocationReset(t *testing.T) { } } +func TestHandlePodResourcesResizeWithSwap(t *testing.T) { + if goruntime.GOOS == "windows" { + t.Skip("InPlacePodVerticalScaling is not currently supported for Windows") + } + featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.InPlacePodVerticalScaling, true) + featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.NodeSwap, true) + testKubelet := newTestKubelet(t, false) + defer testKubelet.Cleanup() + noSwapContainerName, swapContainerName := "test-container-noswap", "test-container-limitedswap" + runtime := testKubelet.fakeRuntime + runtime.SwapBehavior = map[string]kubetypes.SwapBehavior{ + noSwapContainerName: kubetypes.NoSwap, + swapContainerName: kubetypes.LimitedSwap, + } + kubelet := testKubelet.kubelet + cpu500m := resource.MustParse("500m") + cpu1000m := resource.MustParse("1") + mem500M := resource.MustParse("500Mi") + mem1000M := resource.MustParse("1Gi") + nodes := []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{Name: testKubeletHostname}, + Status: v1.NodeStatus{ + Capacity: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("8"), + v1.ResourceMemory: resource.MustParse("8Gi"), + }, + Allocatable: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("4"), + v1.ResourceMemory: resource.MustParse("4Gi"), + v1.ResourcePods: *resource.NewQuantity(40, resource.DecimalSI), + }, + }, + }, + } + kubelet.nodeLister = testNodeLister{nodes: nodes} + testPod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "1111", + Name: "pod1", + Namespace: "ns1", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "c1", + Image: "i1", + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceCPU: cpu1000m, v1.ResourceMemory: mem1000M}, + }, + }, + }, + }, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + { + Name: "c1", + AllocatedResources: v1.ResourceList{v1.ResourceCPU: cpu1000m, v1.ResourceMemory: mem1000M}, + Resources: &v1.ResourceRequirements{}, + }, + }, + }, + } + + testKubelet.fakeKubeClient = fake.NewSimpleClientset(testPod) + kubelet.kubeClient = testKubelet.fakeKubeClient + defer testKubelet.fakeKubeClient.ClearActions() + kubelet.podManager.AddPod(testPod) + kubelet.podWorkers.(*fakePodWorkers).running = map[types.UID]bool{ + testPod.UID: true, + } + defer kubelet.podManager.RemovePod(testPod) + tests := []struct { + name string + newRequests v1.ResourceList + expectedAllocatedReqs v1.ResourceList + resizePolicy v1.ContainerResizePolicy + swapBehavior kubetypes.SwapBehavior + expectedResize []*v1.PodCondition + }{ + { + name: "NoSwap Request Memory decrease ResizePolicy RestartContainer - expect InProgress", + newRequests: v1.ResourceList{v1.ResourceCPU: cpu500m, v1.ResourceMemory: mem500M}, + expectedAllocatedReqs: v1.ResourceList{v1.ResourceCPU: cpu500m, v1.ResourceMemory: mem500M}, + swapBehavior: kubetypes.NoSwap, + resizePolicy: v1.ContainerResizePolicy{ResourceName: v1.ResourceMemory, RestartPolicy: v1.RestartContainer}, + expectedResize: []*v1.PodCondition{ + { + Type: v1.PodResizeInProgress, + Status: "True", + }, + }, + }, + { + name: "LimitedSwap Request Memory increase with ResizePolicy RestartContainer - expect InProgress", + newRequests: v1.ResourceList{v1.ResourceCPU: cpu500m, v1.ResourceMemory: mem500M}, + expectedAllocatedReqs: v1.ResourceList{v1.ResourceCPU: cpu500m, v1.ResourceMemory: mem500M}, + swapBehavior: kubetypes.LimitedSwap, + resizePolicy: v1.ContainerResizePolicy{ResourceName: v1.ResourceMemory, RestartPolicy: v1.RestartContainer}, + expectedResize: []*v1.PodCondition{ + { + Type: v1.PodResizeInProgress, + Status: "True", + }, + }, + }, + { + name: "LimitedSwap Request Memory increase with ResizePolicy NotRequired - expect Infeasible", + newRequests: v1.ResourceList{v1.ResourceCPU: cpu500m, v1.ResourceMemory: mem500M}, + expectedAllocatedReqs: v1.ResourceList{v1.ResourceCPU: cpu1000m, v1.ResourceMemory: mem1000M}, + swapBehavior: kubetypes.LimitedSwap, + resizePolicy: v1.ContainerResizePolicy{ResourceName: v1.ResourceMemory, RestartPolicy: v1.NotRequired}, + expectedResize: []*v1.PodCondition{ + { + Type: v1.PodResizePending, + Status: "True", + Reason: "Infeasible", + Message: "In-place resize of containers with swap is not supported", + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + originalPod := testPod.DeepCopy() + originalPod.Spec.Containers[0].ResizePolicy = []v1.ContainerResizePolicy{tt.resizePolicy} + if tt.swapBehavior == kubetypes.NoSwap { + originalPod.Spec.Containers[0].Name = noSwapContainerName + } else { + originalPod.Spec.Containers[0].Name = swapContainerName + } + kubelet.podManager.UpdatePod(originalPod) + newPod := originalPod.DeepCopy() + newPod.Spec.Containers[0].Resources.Requests = tt.newRequests + require.NoError(t, kubelet.allocationManager.SetAllocatedResources(originalPod)) + require.NoError(t, kubelet.allocationManager.SetActuatedResources(originalPod, nil)) + t.Cleanup(func() { kubelet.allocationManager.RemovePod(originalPod.UID) }) + podStatus := &kubecontainer.PodStatus{ + ID: originalPod.UID, + Name: originalPod.Name, + Namespace: originalPod.Namespace, + } + setContainerStatus := func(podStatus *kubecontainer.PodStatus, c *v1.Container, idx int) { + podStatus.ContainerStatuses[idx] = &kubecontainer.Status{ + Name: c.Name, + State: kubecontainer.ContainerStateRunning, + Resources: &kubecontainer.ContainerResources{ + CPURequest: c.Resources.Requests.Cpu(), + CPULimit: c.Resources.Limits.Cpu(), + MemoryLimit: c.Resources.Limits.Memory(), + }, + } + } + podStatus.ContainerStatuses = make([]*kubecontainer.Status, len(originalPod.Spec.Containers)) + for i, c := range originalPod.Spec.Containers { + setContainerStatus(podStatus, &c, i) + } + updatedPod, err := kubelet.handlePodResourcesResize(newPod, podStatus) + require.NoError(t, err) + updatedPodCtr := updatedPod.Spec.Containers[0] + assert.Equal(t, tt.expectedAllocatedReqs, updatedPodCtr.Resources.Requests, "updated pod spec requests") + + alloc, found := kubelet.allocationManager.GetContainerResourceAllocation(newPod.UID, updatedPodCtr.Name) + require.True(t, found, "container allocation") + assert.Equal(t, tt.expectedAllocatedReqs, alloc.Requests, "stored container request allocation") + resizeStatus := kubelet.statusManager.GetPodResizeConditions(newPod.UID) + for i := range resizeStatus { + // Ignore probe time and last transition time during comparison. + resizeStatus[i].LastProbeTime = metav1.Time{} + resizeStatus[i].LastTransitionTime = metav1.Time{} + assert.Contains(t, resizeStatus[i].Message, tt.expectedResize[i].Message) + resizeStatus[i].Message = tt.expectedResize[i].Message + } + assert.Equal(t, tt.expectedResize, resizeStatus) + }) + } +} + func TestHandlePodResourcesResize(t *testing.T) { if goruntime.GOOS == "windows" { t.Skip("InPlacePodVerticalScaling is not currently supported for Windows") diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go index d44432b7e61..92321bd9bd4 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go @@ -45,7 +45,7 @@ import ( "k8s.io/kubernetes/pkg/kubelet/cm" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" "k8s.io/kubernetes/pkg/kubelet/qos" - kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" + "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/utils/ptr" ) @@ -206,35 +206,45 @@ func (m *kubeGenericRuntimeManager) configureContainerSwapResources(lcr *runtime } swapConfigurationHelper := newSwapConfigurationHelper(*m.machineInfo) - if m.memorySwapBehavior == kubelettypes.LimitedSwap { - if !isCgroup2UnifiedMode() { - swapConfigurationHelper.ConfigureNoSwap(lcr) - return - } - } - - if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) { - swapConfigurationHelper.ConfigureNoSwap(lcr) - return - } - - if kubelettypes.IsCriticalPod(pod) { - swapConfigurationHelper.ConfigureNoSwap(lcr) - return - } - // NOTE(ehashman): Behavior is defined in the opencontainers runtime spec: // https://github.com/opencontainers/runtime-spec/blob/1c3f411f041711bbeecf35ff7e93461ea6789220/config-linux.md#memory - switch m.memorySwapBehavior { - case kubelettypes.NoSwap: + switch m.GetContainerSwapBehavior(pod, container) { + case types.NoSwap: swapConfigurationHelper.ConfigureNoSwap(lcr) - case kubelettypes.LimitedSwap: + case types.LimitedSwap: swapConfigurationHelper.ConfigureLimitedSwap(lcr, pod, container) default: swapConfigurationHelper.ConfigureNoSwap(lcr) } } +// GetContainerSwapBehavior checks what swap behavior should be configured for a container, +// considering the requirements for enabling swap. +func (m *kubeGenericRuntimeManager) GetContainerSwapBehavior(pod *v1.Pod, container *v1.Container) types.SwapBehavior { + c := types.SwapBehavior(m.memorySwapBehavior) + if c == types.LimitedSwap { + if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) || !swapControllerAvailable() { + return types.NoSwap + } + + if !isCgroup2UnifiedMode() { + return types.NoSwap + } + + if types.IsCriticalPod(pod) { + return types.NoSwap + } + podQos := kubeapiqos.GetPodQOS(pod) + containerDoesNotRequestMemory := container.Resources.Requests.Memory().IsZero() && container.Resources.Limits.Memory().IsZero() + memoryRequestEqualsToLimit := container.Resources.Requests.Memory().Cmp(*container.Resources.Limits.Memory()) == 0 + if podQos != v1.PodQOSBurstable || containerDoesNotRequestMemory || memoryRequestEqualsToLimit { + return types.NoSwap + } + return c + } + return types.NoSwap +} + // generateContainerResources generates platform specific (linux) container resources config for runtime func (m *kubeGenericRuntimeManager) generateContainerResources(pod *v1.Pod, container *v1.Container) *runtimeapi.ContainerResources { enforceMemoryQoS := false @@ -428,15 +438,6 @@ func newSwapConfigurationHelper(machineInfo cadvisorv1.MachineInfo) *swapConfigu } func (m swapConfigurationHelper) ConfigureLimitedSwap(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) { - podQos := kubeapiqos.GetPodQOS(pod) - containerDoesNotRequestMemory := container.Resources.Requests.Memory().IsZero() && container.Resources.Limits.Memory().IsZero() - memoryRequestEqualsToLimit := container.Resources.Requests.Memory().Cmp(*container.Resources.Limits.Memory()) == 0 - - if podQos != v1.PodQOSBurstable || containerDoesNotRequestMemory || !isCgroup2UnifiedMode() || memoryRequestEqualsToLimit { - m.ConfigureNoSwap(lcr) - return - } - containerMemoryRequest := container.Resources.Requests.Memory() swapLimit, err := calcSwapForBurstablePods(containerMemoryRequest.Value(), int64(m.machineInfo.MemoryCapacity), int64(m.machineInfo.SwapCapacity)) if err != nil { diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go index 5c0549e20b0..8d21488e9ed 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go @@ -924,6 +924,145 @@ func TestGenerateLinuxContainerResources(t *testing.T) { } } +func TestGetContainerSwapBehavior(t *testing.T) { + _, _, m, err := createTestRuntimeManager() + require.NoError(t, err) + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "12345678", + Name: "foo", + Namespace: "bar", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "c1", + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceMemory: resource.MustParse("1Gi")}, + Limits: v1.ResourceList{v1.ResourceMemory: resource.MustParse("2Gi")}, + }, + }, + }, + }, + Status: v1.PodStatus{}, + } + tests := []struct { + name string + configuredMemorySwap types.SwapBehavior + nodeSwapFeatureGateEnabled bool + isSwapControllerAvailable bool + cgroupVersion CgroupVersion + isCriticalPod bool + qosClass v1.PodQOSClass + containerResourceOverride func(container *v1.Container) + expected types.SwapBehavior + }{ + { + name: "NoSwap, user set NoSwap behavior", + configuredMemorySwap: types.NoSwap, + nodeSwapFeatureGateEnabled: false, + expected: types.NoSwap, + }, + { + name: "NoSwap, feature gate turned off", + configuredMemorySwap: types.LimitedSwap, + nodeSwapFeatureGateEnabled: false, + expected: types.NoSwap, + }, + { + name: "NoSwap, swap controller unavailable", + configuredMemorySwap: types.LimitedSwap, + nodeSwapFeatureGateEnabled: true, + isSwapControllerAvailable: false, + expected: types.NoSwap, + }, + { + name: "NoSwap, cgroup v1", + configuredMemorySwap: types.LimitedSwap, + nodeSwapFeatureGateEnabled: true, + isSwapControllerAvailable: true, + cgroupVersion: cgroupV1, + expected: types.NoSwap, + }, + { + name: "NoSwap, qos is Best-Effort", + configuredMemorySwap: types.LimitedSwap, + nodeSwapFeatureGateEnabled: true, + isSwapControllerAvailable: true, + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBestEffort, + expected: types.NoSwap, + }, + { + name: "NoSwap, qos is Guaranteed", + configuredMemorySwap: types.LimitedSwap, + nodeSwapFeatureGateEnabled: true, + isSwapControllerAvailable: true, + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSGuaranteed, + expected: types.NoSwap, + }, + { + name: "NoSwap, zero memory", + configuredMemorySwap: types.LimitedSwap, + nodeSwapFeatureGateEnabled: true, + isSwapControllerAvailable: true, + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + containerResourceOverride: func(c *v1.Container) { + c.Resources.Requests = v1.ResourceList{ + v1.ResourceMemory: resource.MustParse("0"), + } + c.Resources.Limits = v1.ResourceList{ + v1.ResourceMemory: resource.MustParse("0"), + } + }, + expected: types.NoSwap, + }, + { + name: "NoSwap, memory request equal to limit", + configuredMemorySwap: types.LimitedSwap, + nodeSwapFeatureGateEnabled: true, + isSwapControllerAvailable: true, + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + containerResourceOverride: func(c *v1.Container) { + c.Resources.Requests = v1.ResourceList{ + v1.ResourceMemory: resource.MustParse("100Mi"), + } + c.Resources.Limits = v1.ResourceList{ + v1.ResourceMemory: resource.MustParse("100Mi"), + } + }, + expected: types.NoSwap, + }, + { + name: "LimitedSwap, cgroup v2", + configuredMemorySwap: types.LimitedSwap, + nodeSwapFeatureGateEnabled: true, + isSwapControllerAvailable: true, + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + expected: types.LimitedSwap, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + m.memorySwapBehavior = string(tt.configuredMemorySwap) + setCgroupVersionDuringTest(tt.cgroupVersion) + defer setSwapControllerAvailableDuringTest(tt.isSwapControllerAvailable)() + featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.NodeSwap, tt.nodeSwapFeatureGateEnabled) + testpod := pod.DeepCopy() + testpod.Status.QOSClass = tt.qosClass + if tt.containerResourceOverride != nil { + tt.containerResourceOverride(&testpod.Spec.Containers[0]) + } + assert.Equal(t, tt.expected, m.GetContainerSwapBehavior(testpod, &testpod.Spec.Containers[0])) + }) + } +} + func TestGenerateLinuxContainerResourcesWithSwap(t *testing.T) { _, _, m, err := createTestRuntimeManager() assert.NoError(t, err) @@ -999,7 +1138,7 @@ func TestGenerateLinuxContainerResourcesWithSwap(t *testing.T) { qosClass v1.PodQOSClass swapDisabledOnNode bool nodeSwapFeatureGateEnabled bool - swapBehavior string + swapBehavior types.SwapBehavior addContainerWithoutRequests bool addGuaranteedContainer bool isCriticalPod bool @@ -1195,7 +1334,7 @@ func TestGenerateLinuxContainerResourcesWithSwap(t *testing.T) { setCgroupVersionDuringTest(tc.cgroupVersion) defer setSwapControllerAvailableDuringTest(!tc.swapDisabledOnNode)() featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.NodeSwap, tc.nodeSwapFeatureGateEnabled) - m.memorySwapBehavior = tc.swapBehavior + m.memorySwapBehavior = string(tc.swapBehavior) var resourceReqsC1, resourceReqsC2 v1.ResourceRequirements switch tc.qosClass { diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_unsupported.go b/pkg/kubelet/kuberuntime/kuberuntime_container_unsupported.go index 8d3b483f7b0..bf7ccaba635 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_unsupported.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_unsupported.go @@ -24,6 +24,7 @@ import ( runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" "k8s.io/kubernetes/pkg/kubelet/cm" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" + "k8s.io/kubernetes/pkg/kubelet/types" ) // applyPlatformSpecificContainerConfig applies platform specific configurations to runtimeapi.ContainerConfig. @@ -48,3 +49,7 @@ func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *k func toKubeContainerUser(statusUser *runtimeapi.ContainerUser) *kubecontainer.ContainerUser { return nil } + +func (m *kubeGenericRuntimeManager) GetContainerSwapBehavior(pod *v1.Pod, container *v1.Container) types.SwapBehavior { + return types.NoSwap +} diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_windows.go b/pkg/kubelet/kuberuntime/kuberuntime_container_windows.go index abf71f55b71..d4a77555c28 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_windows.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_windows.go @@ -26,6 +26,7 @@ import ( "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/kubelet/cm" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" + "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/kubelet/winstats" "k8s.io/kubernetes/pkg/securitycontext" ) @@ -183,3 +184,7 @@ func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *k func toKubeContainerUser(statusUser *runtimeapi.ContainerUser) *kubecontainer.ContainerUser { return nil } + +func (m *kubeGenericRuntimeManager) GetContainerSwapBehavior(pod *v1.Pod, container *v1.Container) types.SwapBehavior { + return types.NoSwap +} diff --git a/pkg/kubelet/types/constants.go b/pkg/kubelet/types/constants.go index 56cba9c43b8..791052dbbce 100644 --- a/pkg/kubelet/types/constants.go +++ b/pkg/kubelet/types/constants.go @@ -32,7 +32,9 @@ const ( ) // SwapBehavior types +type SwapBehavior string + const ( - LimitedSwap = "LimitedSwap" - NoSwap = "NoSwap" + LimitedSwap SwapBehavior = "LimitedSwap" + NoSwap SwapBehavior = "NoSwap" ) diff --git a/test/e2e_node/swap_test.go b/test/e2e_node/swap_test.go index eb1cd3d5367..5b8eb1452fb 100644 --- a/test/e2e_node/swap_test.go +++ b/test/e2e_node/swap_test.go @@ -76,7 +76,7 @@ var _ = SIGDescribe("Swap", "[LinuxOnly]", ginkgo.Ordered, feature.Swap, framewo if !isPodCgroupV2(f, pod) { e2eskipper.Skipf("swap tests require cgroup v2") } - gomega.Expect(getSwapBehavior()).To(gomega.Or(gomega.Equal(types.NoSwap), gomega.BeEmpty()), "NodeConformance is expected to run with NoSwap") + gomega.Expect(getSwapBehavior()).To(gomega.Or(gomega.Equal(string(types.NoSwap)), gomega.BeEmpty()), "NodeConformance is expected to run with NoSwap") expectNoSwap(f, pod) }, @@ -105,8 +105,8 @@ var _ = SIGDescribe("Swap", "[LinuxOnly]", ginkgo.Ordered, feature.Swap, framewo enableLimitedSwap := func(ctx context.Context, initialConfig *config.KubeletConfiguration) { msg := "swap behavior is already set to LimitedSwap" - if swapBehavior := initialConfig.MemorySwap.SwapBehavior; swapBehavior != types.LimitedSwap { - initialConfig.MemorySwap.SwapBehavior = types.LimitedSwap + if swapBehavior := initialConfig.MemorySwap.SwapBehavior; swapBehavior != string(types.LimitedSwap) { + initialConfig.MemorySwap.SwapBehavior = string(types.LimitedSwap) msg = "setting swap behavior to LimitedSwap" } @@ -124,7 +124,7 @@ var _ = SIGDescribe("Swap", "[LinuxOnly]", ginkgo.Ordered, feature.Swap, framewo if !isPodCgroupV2(f, pod) { e2eskipper.Skipf("swap tests require cgroup v2") } - gomega.Expect(getSwapBehavior()).To(gomega.Equal(types.LimitedSwap)) + gomega.Expect(getSwapBehavior()).To(gomega.Equal(string(types.LimitedSwap))) expectedSwapLimit := calcSwapForBurstablePod(f, pod) expectLimitedSwap(f, pod, expectedSwapLimit)