diff --git a/hack/local-up-cluster.sh b/hack/local-up-cluster.sh index 62117a9e8d6..e6195d73383 100755 --- a/hack/local-up-cluster.sh +++ b/hack/local-up-cluster.sh @@ -47,6 +47,8 @@ CGROUP_DRIVER=${CGROUP_DRIVER:-""} CGROUP_ROOT=${CGROUP_ROOT:-""} # owner of client certs, default to current user if not specified USER=${USER:-$(whoami)} +# if true, limited swap is being used instead of unlimited swap (default) +LIMITED_SWAP=${LIMITED_SWAP:-""} # required for cni installation CNI_CONFIG_DIR=${CNI_CONFIG_DIR:-/etc/cni/net.d} @@ -832,6 +834,13 @@ tracing: EOF fi + if [[ "$LIMITED_SWAP" == "true" ]]; then + cat <<EOF >> "${TMP_DIR}"/kubelet.yaml +memorySwap: + swapBehavior: LimitedSwap +EOF + fi + { # authentication echo "authentication:" diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index 8951673e0fb..43ee081674c 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -596,8 +596,9 @@ const ( // Allow pods to failover to a different node in case of non graceful node shutdown NodeOutOfServiceVolumeDetach featuregate.Feature = "NodeOutOfServiceVolumeDetach" - // owner: @ehashman + // owner: @iholder101 // alpha: v1.22 + // beta1: v1.28. For more info, please look at the KEP: https://kep.k8s.io/2400. // // Permits kubelet to run with swap enabled NodeSwap featuregate.Feature = "NodeSwap" @@ -1074,7 +1075,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS NodeOutOfServiceVolumeDetach: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // remove in 1.31 - NodeSwap: {Default: false, PreRelease: featuregate.Alpha}, + NodeSwap: {Default: false, PreRelease: featuregate.Beta}, PDBUnhealthyPodEvictionPolicy: {Default: true, PreRelease: featuregate.Beta}, diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go index c4be02a45b2..f54eaa2979f 100644 --- a/pkg/kubelet/cm/cgroup_manager_linux.go +++ b/pkg/kubelet/cm/cgroup_manager_linux.go @@ -45,11 +45,12 @@ import ( const ( // systemdSuffix is the cgroup name suffix for systemd systemdSuffix string = ".slice" - // MemoryMin is memory.min for cgroup v2 - MemoryMin string = "memory.min" - // MemoryHigh is memory.high for cgroup v2 - MemoryHigh string = "memory.high" - Cgroup2MaxCpuLimit string = "max" + // Cgroup2MemoryMin is memory.min for cgroup v2 + Cgroup2MemoryMin string = "memory.min" + // Cgroup2MemoryHigh is memory.high for cgroup v2 + Cgroup2MemoryHigh string = "memory.high" + Cgroup2MaxCpuLimit string = "max" + Cgroup2MaxSwapFilename string = "memory.swap.max" ) var RootCgroupName = CgroupName([]string{}) diff --git a/pkg/kubelet/cm/helpers_linux.go b/pkg/kubelet/cm/helpers_linux.go index 18b0df17bfc..8a144e7a73c 100644 --- a/pkg/kubelet/cm/helpers_linux.go +++ b/pkg/kubelet/cm/helpers_linux.go @@ -196,7 +196,7 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64, } if memoryMin > 0 { result.Unified = map[string]string{ - MemoryMin: strconv.FormatInt(memoryMin, 10), + Cgroup2MemoryMin: strconv.FormatInt(memoryMin, 10), } } } diff --git a/pkg/kubelet/cm/node_container_manager_linux.go b/pkg/kubelet/cm/node_container_manager_linux.go index 74221c67047..b57403dd95b 100644 --- a/pkg/kubelet/cm/node_container_manager_linux.go +++ b/pkg/kubelet/cm/node_container_manager_linux.go @@ -147,7 +147,7 @@ func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.
if rp.Unified == nil { rp.Unified = make(map[string]string) } - rp.Unified[MemoryMin] = strconv.FormatInt(*rp.Memory, 10) + rp.Unified[Cgroup2MemoryMin] = strconv.FormatInt(*rp.Memory, 10) } } diff --git a/pkg/kubelet/cm/qos_container_manager_linux.go b/pkg/kubelet/cm/qos_container_manager_linux.go index 89b3adae9af..abf4487ee5d 100644 --- a/pkg/kubelet/cm/qos_container_manager_linux.go +++ b/pkg/kubelet/cm/qos_container_manager_linux.go @@ -292,7 +292,7 @@ func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*Cgrou if configs[v1.PodQOSBurstable].ResourceParameters.Unified == nil { configs[v1.PodQOSBurstable].ResourceParameters.Unified = make(map[string]string) } - configs[v1.PodQOSBurstable].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(burstableMin, 10) + configs[v1.PodQOSBurstable].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(burstableMin, 10) klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSBurstable, "memoryMin", burstableMin) } @@ -300,7 +300,7 @@ func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*Cgrou if configs[v1.PodQOSGuaranteed].ResourceParameters.Unified == nil { configs[v1.PodQOSGuaranteed].ResourceParameters.Unified = make(map[string]string) } - configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(guaranteedMin, 10) + configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(guaranteedMin, 10) klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSGuaranteed, "memoryMin", guaranteedMin) } } diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go index 466378deda3..c600d49bc25 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go @@ -20,6 +20,9 @@ limitations under the License. package kuberuntime import ( + "fmt" + cadvisorv1 "github.com/google/cadvisor/info/v1" + kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" "math" "os" "strconv" @@ -46,7 +49,7 @@ func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config enforceMemoryQoS := false // Set memory.min and memory.high if MemoryQoS enabled with cgroups v2 if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) && - libcontainercgroups.IsCgroup2UnifiedMode() { + isCgroup2UnifiedMode() { enforceMemoryQoS = true } cl, err := m.generateLinuxContainerConfig(container, pod, uid, username, nsTarget, enforceMemoryQoS) @@ -99,21 +102,17 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod, lcr.HugepageLimits = GetHugepageLimitsFromResources(container.Resources) - if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) { + if swapConfigurationHelper := newSwapConfigurationHelper(*m.machineInfo); utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) { // NOTE(ehashman): Behaviour is defined in the opencontainers runtime spec: // https://github.com/opencontainers/runtime-spec/blob/1c3f411f041711bbeecf35ff7e93461ea6789220/config-linux.md#memory switch m.memorySwapBehavior { - case kubelettypes.UnlimitedSwap: - // -1 = unlimited swap - lcr.MemorySwapLimitInBytes = -1 case kubelettypes.LimitedSwap: - fallthrough + swapConfigurationHelper.ConfigureLimitedSwap(lcr, pod, container) default: - // memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit - // Some swapping is still possible. 
- // Note that if memory limit is 0, memory swap limit is ignored. - lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes + swapConfigurationHelper.ConfigureUnlimitedSwap(lcr) } + } else { + swapConfigurationHelper.ConfigureNoSwap(lcr) } // Set memory.min and memory.high to enforce MemoryQoS @@ -122,7 +121,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod, memoryRequest := container.Resources.Requests.Memory().Value() memoryLimit := container.Resources.Limits.Memory().Value() if memoryRequest != 0 { - unified[cm.MemoryMin] = strconv.FormatInt(memoryRequest, 10) + unified[cm.Cgroup2MemoryMin] = strconv.FormatInt(memoryRequest, 10) } // Guaranteed pods by their QoS definition requires that memory request equals memory limit and cpu request must equal cpu limit. @@ -148,7 +147,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod, } } if memoryHigh != 0 && memoryHigh > memoryRequest { - unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10) + unified[cm.Cgroup2MemoryHigh] = strconv.FormatInt(memoryHigh, 10) } } if len(unified) > 0 { @@ -171,7 +170,7 @@ func (m *kubeGenericRuntimeManager) generateContainerResources(pod *v1.Pod, cont enforceMemoryQoS := false // Set memory.min and memory.high if MemoryQoS enabled with cgroups v2 if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) && - libcontainercgroups.IsCgroup2UnifiedMode() { + isCgroup2UnifiedMode() { enforceMemoryQoS = true } return &runtimeapi.ContainerResources{ @@ -216,7 +215,7 @@ func (m *kubeGenericRuntimeManager) calculateLinuxResources(cpuRequest, cpuLimit } // runc requires cgroupv2 for unified mode - if libcontainercgroups.IsCgroup2UnifiedMode() { + if isCgroup2UnifiedMode() { resources.Unified = map[string]string{ // Ask the kernel to kill all processes in the container cgroup in case of OOM. // See memory.oom.group in https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html for @@ -298,3 +297,94 @@ func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *k } return cStatusResources } + +// Note: this function variable is being added here so it would be possible to mock +// the cgroup version for unit tests by assigning a new mocked function into it. Without it, +// the cgroup version would solely depend on the environment running the test. 
+var isCgroup2UnifiedMode = func() bool { + return libcontainercgroups.IsCgroup2UnifiedMode() +} + +type swapConfigurationHelper struct { + machineInfo cadvisorv1.MachineInfo +} + +func newSwapConfigurationHelper(machineInfo cadvisorv1.MachineInfo) *swapConfigurationHelper { + return &swapConfigurationHelper{machineInfo: machineInfo} +} + +func (m swapConfigurationHelper) ConfigureLimitedSwap(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) { + podQos := kubeapiqos.GetPodQOS(pod) + containerDoesNotRequestMemory := container.Resources.Requests.Memory().IsZero() && container.Resources.Limits.Memory().IsZero() + memoryRequestEqualsToLimit := container.Resources.Requests.Memory().Cmp(*container.Resources.Limits.Memory()) == 0 + + if podQos != v1.PodQOSBurstable || containerDoesNotRequestMemory || !isCgroup2UnifiedMode() || memoryRequestEqualsToLimit { + m.ConfigureNoSwap(lcr) + return + } + + containerMemoryRequest := container.Resources.Requests.Memory() + swapLimit, err := calcSwapForBurstablePods(containerMemoryRequest.Value(), int64(m.machineInfo.MemoryCapacity), int64(m.machineInfo.SwapCapacity)) + + if err != nil { + klog.ErrorS(err, "cannot calculate swap allocation amount; disallowing swap") + m.ConfigureNoSwap(lcr) + return + } + + m.configureSwap(lcr, swapLimit) +} + +func (m swapConfigurationHelper) ConfigureNoSwap(lcr *runtimeapi.LinuxContainerResources) { + if !isCgroup2UnifiedMode() { + // memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit + // Some swapping is still possible. + // Note that if memory limit is 0, memory swap limit is ignored. + lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes + return + } + + m.configureSwap(lcr, 0) +} + +func (m swapConfigurationHelper) ConfigureUnlimitedSwap(lcr *runtimeapi.LinuxContainerResources) { + if !isCgroup2UnifiedMode() { + m.ConfigureNoSwap(lcr) + return + } + + if lcr.Unified == nil { + lcr.Unified = map[string]string{} + } + + lcr.Unified[cm.Cgroup2MaxSwapFilename] = "max" +} + +func (m swapConfigurationHelper) configureSwap(lcr *runtimeapi.LinuxContainerResources, swapMemory int64) { + if !isCgroup2UnifiedMode() { + klog.ErrorS(fmt.Errorf("swap configuration is not supported with cgroup v1"), "swap configuration under cgroup v1 is unexpected") + return + } + + if lcr.Unified == nil { + lcr.Unified = map[string]string{} + } + + lcr.Unified[cm.Cgroup2MaxSwapFilename] = fmt.Sprintf("%d", swapMemory) +} + +// The swap limit is calculated as (<containerMemoryRequest>/<nodeTotalMemory>)*<totalPodsSwapAvailable>.
+// For more info, please look at the following KEP: https://kep.k8s.io/2400 +func calcSwapForBurstablePods(containerMemoryRequest, nodeTotalMemory, totalPodsSwapAvailable int64) (int64, error) { + if nodeTotalMemory <= 0 { + return 0, fmt.Errorf("total node memory is 0") + } + if containerMemoryRequest > nodeTotalMemory { + return 0, fmt.Errorf("container request %d is larger than total node memory %d", containerMemoryRequest, nodeTotalMemory) + } + + containerMemoryProportion := float64(containerMemoryRequest) / float64(nodeTotalMemory) + swapAllocation := containerMemoryProportion * float64(totalPodsSwapAvailable) + + return int64(swapAllocation), nil +} diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go index ec56dc733c4..b50eee1d4ee 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go @@ -21,6 +21,9 @@ package kuberuntime import ( "context" + "fmt" + "k8s.io/kubernetes/pkg/kubelet/cm" + "k8s.io/kubernetes/pkg/kubelet/types" "math" "os" "reflect" @@ -38,7 +41,6 @@ import ( runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" "k8s.io/kubernetes/pkg/features" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" - kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" ) func makeExpectedConfig(m *kubeGenericRuntimeManager, pod *v1.Pod, containerIndex int, enforceMemoryQoS bool) *runtimeapi.ContainerConfig { @@ -244,11 +246,12 @@ func TestCalculateLinuxResources(t *testing.T) { } tests := []struct { - name string - cpuReq *resource.Quantity - cpuLim *resource.Quantity - memLim *resource.Quantity - expected *runtimeapi.LinuxContainerResources + name string + cpuReq *resource.Quantity + cpuLim *resource.Quantity + memLim *resource.Quantity + expected *runtimeapi.LinuxContainerResources + cgroupVersion CgroupVersion }{ { name: "Request128MBLimit256MB", @@ -261,6 +264,7 @@ func TestCalculateLinuxResources(t *testing.T) { CpuShares: 1024, MemoryLimitInBytes: 134217728, }, + cgroupVersion: cgroupV1, }, { name: "RequestNoMemory", @@ -273,6 +277,7 @@ func TestCalculateLinuxResources(t *testing.T) { CpuShares: 2048, MemoryLimitInBytes: 0, }, + cgroupVersion: cgroupV1, }, { name: "RequestNilCPU", @@ -284,6 +289,7 @@ func TestCalculateLinuxResources(t *testing.T) { CpuShares: 2048, MemoryLimitInBytes: 0, }, + cgroupVersion: cgroupV1, }, { name: "RequestZeroCPU", @@ -296,9 +302,66 @@ func TestCalculateLinuxResources(t *testing.T) { CpuShares: 2, MemoryLimitInBytes: 0, }, + cgroupVersion: cgroupV1, + }, + { + name: "Request128MBLimit256MB", + cpuReq: generateResourceQuantity("1"), + cpuLim: generateResourceQuantity("2"), + memLim: generateResourceQuantity("128Mi"), + expected: &runtimeapi.LinuxContainerResources{ + CpuPeriod: 100000, + CpuQuota: 200000, + CpuShares: 1024, + MemoryLimitInBytes: 134217728, + Unified: map[string]string{"memory.oom.group": "1"}, + }, + cgroupVersion: cgroupV2, + }, + { + name: "RequestNoMemory", + cpuReq: generateResourceQuantity("2"), + cpuLim: generateResourceQuantity("8"), + memLim: generateResourceQuantity("0"), + expected: &runtimeapi.LinuxContainerResources{ + CpuPeriod: 100000, + CpuQuota: 800000, + CpuShares: 2048, + MemoryLimitInBytes: 0, + Unified: map[string]string{"memory.oom.group": "1"}, + }, + cgroupVersion: cgroupV2, + }, + { + name: "RequestNilCPU", + cpuLim: generateResourceQuantity("2"), + memLim: generateResourceQuantity("0"), + expected: &runtimeapi.LinuxContainerResources{ + CpuPeriod: 
100000, + CpuQuota: 200000, + CpuShares: 2048, + MemoryLimitInBytes: 0, + Unified: map[string]string{"memory.oom.group": "1"}, + }, + cgroupVersion: cgroupV2, + }, + { + name: "RequestZeroCPU", + cpuReq: generateResourceQuantity("0"), + cpuLim: generateResourceQuantity("2"), + memLim: generateResourceQuantity("0"), + expected: &runtimeapi.LinuxContainerResources{ + CpuPeriod: 100000, + CpuQuota: 200000, + CpuShares: 2, + MemoryLimitInBytes: 0, + Unified: map[string]string{"memory.oom.group": "1"}, + }, + cgroupVersion: cgroupV2, }, } for _, test := range tests { + setCgroupVersionDuringTest(test.cgroupVersion) linuxContainerResources := m.calculateLinuxResources(test.cpuReq, test.cpuLim, test.memLim) assert.Equal(t, test.expected, linuxContainerResources) } @@ -634,96 +697,6 @@ func TestGenerateLinuxContainerConfigNamespaces(t *testing.T) { } } -func TestGenerateLinuxContainerConfigSwap(t *testing.T) { - defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.NodeSwap, true)() - _, _, m, err := createTestRuntimeManager() - if err != nil { - t.Fatalf("error creating test RuntimeManager: %v", err) - } - m.machineInfo.MemoryCapacity = 1000000 - containerName := "test" - - for _, tc := range []struct { - name string - swapSetting string - pod *v1.Pod - expected int64 - }{ - { - name: "config unset, memory limit set", - // no swap setting - pod: &v1.Pod{ - Spec: v1.PodSpec{ - Containers: []v1.Container{{ - Name: containerName, - Resources: v1.ResourceRequirements{ - Limits: v1.ResourceList{ - "memory": resource.MustParse("1000"), - }, - Requests: v1.ResourceList{ - "memory": resource.MustParse("1000"), - }, - }, - }}, - }, - }, - expected: 1000, - }, - { - name: "config unset, no memory limit", - // no swap setting - pod: &v1.Pod{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - {Name: containerName}, - }, - }, - }, - expected: 0, - }, - { - // Note: behaviour will be the same as previous two cases - name: "config set to LimitedSwap, memory limit set", - swapSetting: kubelettypes.LimitedSwap, - pod: &v1.Pod{ - Spec: v1.PodSpec{ - Containers: []v1.Container{{ - Name: containerName, - Resources: v1.ResourceRequirements{ - Limits: v1.ResourceList{ - "memory": resource.MustParse("1000"), - }, - Requests: v1.ResourceList{ - "memory": resource.MustParse("1000"), - }, - }, - }}, - }, - }, - expected: 1000, - }, - { - name: "UnlimitedSwap enabled", - swapSetting: kubelettypes.UnlimitedSwap, - pod: &v1.Pod{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - {Name: containerName}, - }, - }, - }, - expected: -1, - }, - } { - t.Run(tc.name, func(t *testing.T) { - m.memorySwapBehavior = tc.swapSetting - actual, err := m.generateLinuxContainerConfig(&tc.pod.Spec.Containers[0], tc.pod, nil, "", nil, false) - assert.NoError(t, err) - assert.Equal(t, tc.expected, actual.Resources.MemorySwapLimitInBytes, "memory swap config for %s", tc.name) - }) - } -} - func TestGenerateLinuxContainerResources(t *testing.T) { _, _, m, err := createTestRuntimeManager() assert.NoError(t, err) @@ -875,6 +848,10 @@ func TestGenerateLinuxContainerResources(t *testing.T) { if tc.scalingFg { defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.InPlacePodVerticalScaling, true)() } + + setCgroupVersionDuringTest(cgroupV1) + tc.expected.MemorySwapLimitInBytes = tc.expected.MemoryLimitInBytes + pod.Spec.Containers[0].Resources = v1.ResourceRequirements{Limits: tc.limits, Requests: tc.requests} if len(tc.cStatus) > 0 { pod.Status.ContainerStatuses = 
tc.cStatus @@ -888,3 +865,289 @@ func TestGenerateLinuxContainerResources(t *testing.T) { } //TODO(vinaykul,InPlacePodVerticalScaling): Add unit tests for cgroup v1 & v2 } + +func TestGenerateLinuxContainerResourcesWithSwap(t *testing.T) { + _, _, m, err := createTestRuntimeManager() + assert.NoError(t, err) + m.machineInfo.MemoryCapacity = 42949672960 // 40Gb == 40 * 1024^3 + m.machineInfo.SwapCapacity = 5368709120 // 5Gb == 5 * 1024^3 + + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "12345678", + Name: "foo", + Namespace: "bar", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "c1", + }, + { + Name: "c2", + }, + }, + }, + Status: v1.PodStatus{}, + } + + expectNoSwap := func(cgroupVersion CgroupVersion, resources ...*runtimeapi.LinuxContainerResources) { + const msg = "container is expected to not have swap access" + + for _, r := range resources { + switch cgroupVersion { + case cgroupV1: + assert.Equal(t, r.MemoryLimitInBytes, r.MemorySwapLimitInBytes, msg) + case cgroupV2: + assert.Equal(t, "0", r.Unified[cm.Cgroup2MaxSwapFilename], msg) + } + } + } + + expectUnlimitedSwap := func(cgroupVersion CgroupVersion, resources ...*runtimeapi.LinuxContainerResources) { + const msg = "container is expected to have unlimited swap access" + + for _, r := range resources { + switch cgroupVersion { + case cgroupV1: + assert.Equal(t, int64(-1), r.MemorySwapLimitInBytes, msg) + case cgroupV2: + assert.Equal(t, "max", r.Unified[cm.Cgroup2MaxSwapFilename], msg) + } + } + } + + expectSwap := func(cgroupVersion CgroupVersion, swapBytesExpected int64, resources *runtimeapi.LinuxContainerResources) { + msg := fmt.Sprintf("container swap is expected to be limited by %d bytes", swapBytesExpected) + + switch cgroupVersion { + case cgroupV1: + assert.Equal(t, resources.MemoryLimitInBytes+swapBytesExpected, resources.MemorySwapLimitInBytes, msg) + case cgroupV2: + assert.Equal(t, fmt.Sprintf("%d", swapBytesExpected), resources.Unified[cm.Cgroup2MaxSwapFilename], msg) + } + } + + calcSwapForBurstablePods := func(containerMemoryRequest int64) int64 { + swapSize, err := calcSwapForBurstablePods(containerMemoryRequest, int64(m.machineInfo.MemoryCapacity), int64(m.machineInfo.SwapCapacity)) + assert.NoError(t, err) + + return swapSize + } + + for _, tc := range []struct { + name string + cgroupVersion CgroupVersion + qosClass v1.PodQOSClass + nodeSwapFeatureGateEnabled bool + swapBehavior string + addContainerWithoutRequests bool + addGuaranteedContainer bool + }{ + // With cgroup v1 + { + name: "cgroups v1, LimitedSwap, Burstable QoS", + cgroupVersion: cgroupV1, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.LimitedSwap, + }, + { + name: "cgroups v1, UnlimitedSwap, Burstable QoS", + cgroupVersion: cgroupV1, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.UnlimitedSwap, + }, + { + name: "cgroups v1, LimitedSwap, Best-effort QoS", + cgroupVersion: cgroupV1, + qosClass: v1.PodQOSBestEffort, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.LimitedSwap, + }, + + // With feature gate turned off + { + name: "NodeSwap feature gate turned off, cgroups v2, LimitedSwap", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: false, + swapBehavior: types.LimitedSwap, + }, + { + name: "NodeSwap feature gate turned off, cgroups v2, UnlimitedSwap", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: false, + swapBehavior: 
types.UnlimitedSwap, + }, + + // With no swapBehavior, UnlimitedSwap should be the default + { + name: "With no swapBehavior - UnlimitedSwap should be the default", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBestEffort, + nodeSwapFeatureGateEnabled: true, + swapBehavior: "", + }, + + // With Guaranteed and Best-effort QoS + { + name: "Best-effort Qos, cgroups v2, LimitedSwap", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.LimitedSwap, + }, + { + name: "Best-effort Qos, cgroups v2, UnlimitedSwap", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.UnlimitedSwap, + }, + { + name: "Guaranteed Qos, cgroups v2, LimitedSwap", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSGuaranteed, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.LimitedSwap, + }, + { + name: "Guaranteed Qos, cgroups v2, UnlimitedSwap", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSGuaranteed, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.UnlimitedSwap, + }, + + // With a "guaranteed" container (when memory requests equal to limits) + { + name: "Burstable Qos, cgroups v2, LimitedSwap, with a guaranteed container", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.LimitedSwap, + addContainerWithoutRequests: false, + addGuaranteedContainer: true, + }, + { + name: "Burstable Qos, cgroups v2, UnlimitedSwap, with a guaranteed container", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.UnlimitedSwap, + addContainerWithoutRequests: false, + addGuaranteedContainer: true, + }, + + // Swap is expected to be allocated + { + name: "Burstable Qos, cgroups v2, LimitedSwap", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.LimitedSwap, + addContainerWithoutRequests: false, + addGuaranteedContainer: false, + }, + { + name: "Burstable Qos, cgroups v2, UnlimitedSwap", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.UnlimitedSwap, + addContainerWithoutRequests: false, + addGuaranteedContainer: false, + }, + { + name: "Burstable Qos, cgroups v2, LimitedSwap, with a container with no requests", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.LimitedSwap, + addContainerWithoutRequests: true, + addGuaranteedContainer: false, + }, + { + name: "Burstable Qos, cgroups v2, UnlimitedSwap, with a container with no requests", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.UnlimitedSwap, + addContainerWithoutRequests: true, + addGuaranteedContainer: false, + }, + } { + t.Run(tc.name, func(t *testing.T) { + setCgroupVersionDuringTest(tc.cgroupVersion) + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.NodeSwap, tc.nodeSwapFeatureGateEnabled)() + m.memorySwapBehavior = tc.swapBehavior + + var resourceReqsC1, resourceReqsC2 v1.ResourceRequirements + switch tc.qosClass { + case v1.PodQOSBurstable: + resourceReqsC1 = v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceMemory: resource.MustParse("1Gi")}, + } + + if !tc.addContainerWithoutRequests { + resourceReqsC2 = v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceMemory: 
resource.MustParse("2Gi")}, + } + + if tc.addGuaranteedContainer { + resourceReqsC2.Limits = v1.ResourceList{v1.ResourceMemory: resource.MustParse("2Gi")} + } + } + case v1.PodQOSGuaranteed: + resourceReqsC1 = v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceMemory: resource.MustParse("1Gi"), v1.ResourceCPU: resource.MustParse("1")}, + Limits: v1.ResourceList{v1.ResourceMemory: resource.MustParse("1Gi"), v1.ResourceCPU: resource.MustParse("1")}, + } + resourceReqsC2 = v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceMemory: resource.MustParse("2Gi"), v1.ResourceCPU: resource.MustParse("1")}, + Limits: v1.ResourceList{v1.ResourceMemory: resource.MustParse("2Gi"), v1.ResourceCPU: resource.MustParse("1")}, + } + } + pod.Spec.Containers[0].Resources = resourceReqsC1 + pod.Spec.Containers[1].Resources = resourceReqsC2 + + resourcesC1 := m.generateLinuxContainerResources(pod, &pod.Spec.Containers[0], false) + resourcesC2 := m.generateLinuxContainerResources(pod, &pod.Spec.Containers[1], false) + + if !tc.nodeSwapFeatureGateEnabled || tc.cgroupVersion == cgroupV1 || (tc.swapBehavior == types.LimitedSwap && tc.qosClass != v1.PodQOSBurstable) { + expectNoSwap(tc.cgroupVersion, resourcesC1, resourcesC2) + return + } + + if tc.swapBehavior == types.UnlimitedSwap || tc.swapBehavior == "" { + expectUnlimitedSwap(tc.cgroupVersion, resourcesC1, resourcesC2) + return + } + + c1ExpectedSwap := calcSwapForBurstablePods(resourceReqsC1.Requests.Memory().Value()) + c2ExpectedSwap := int64(0) + if !tc.addContainerWithoutRequests && !tc.addGuaranteedContainer { + c2ExpectedSwap = calcSwapForBurstablePods(resourceReqsC2.Requests.Memory().Value()) + } + + expectSwap(tc.cgroupVersion, c1ExpectedSwap, resourcesC1) + expectSwap(tc.cgroupVersion, c2ExpectedSwap, resourcesC2) + }) + } +} + +type CgroupVersion string + +const ( + cgroupV1 CgroupVersion = "v1" + cgroupV2 CgroupVersion = "v2" +) + +func setCgroupVersionDuringTest(version CgroupVersion) { + isCgroup2UnifiedMode = func() bool { + return version == cgroupV2 + } +} diff --git a/pkg/kubelet/kuberuntime/kuberuntime_sandbox_linux_test.go b/pkg/kubelet/kuberuntime/kuberuntime_sandbox_linux_test.go index 648a218549f..e302ee9c263 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_sandbox_linux_test.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_sandbox_linux_test.go @@ -38,6 +38,59 @@ func TestApplySandboxResources(t *testing.T) { Linux: &runtimeapi.LinuxPodSandboxConfig{}, } + getPodWithOverhead := func() *v1.Pod { + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "12345678", + Name: "bar", + Namespace: "new", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceMemory: resource.MustParse("128Mi"), + v1.ResourceCPU: resource.MustParse("2"), + }, + Limits: v1.ResourceList{ + v1.ResourceMemory: resource.MustParse("256Mi"), + v1.ResourceCPU: resource.MustParse("4"), + }, + }, + }, + }, + Overhead: v1.ResourceList{ + v1.ResourceMemory: resource.MustParse("128Mi"), + v1.ResourceCPU: resource.MustParse("1"), + }, + }, + } + } + getPodWithoutOverhead := func() *v1.Pod { + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "12345678", + Name: "bar", + Namespace: "new", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceMemory: resource.MustParse("128Mi"), + }, + Limits: v1.ResourceList{ + v1.ResourceMemory: resource.MustParse("256Mi"), + }, + }, + }, + }, + }, + 
} + } + require.NoError(t, err) tests := []struct { @@ -45,36 +98,11 @@ func TestApplySandboxResources(t *testing.T) { pod *v1.Pod expectedResource *runtimeapi.LinuxContainerResources expectedOverhead *runtimeapi.LinuxContainerResources + cgroupVersion CgroupVersion }{ { description: "pod with overhead defined", - pod: &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - UID: "12345678", - Name: "bar", - Namespace: "new", - }, - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - v1.ResourceMemory: resource.MustParse("128Mi"), - v1.ResourceCPU: resource.MustParse("2"), - }, - Limits: v1.ResourceList{ - v1.ResourceMemory: resource.MustParse("256Mi"), - v1.ResourceCPU: resource.MustParse("4"), - }, - }, - }, - }, - Overhead: v1.ResourceList{ - v1.ResourceMemory: resource.MustParse("128Mi"), - v1.ResourceCPU: resource.MustParse("1"), - }, - }, - }, + pod: getPodWithOverhead(), expectedResource: &runtimeapi.LinuxContainerResources{ MemoryLimitInBytes: 268435456, CpuPeriod: 100000, @@ -87,30 +115,11 @@ func TestApplySandboxResources(t *testing.T) { CpuQuota: 100000, CpuShares: 1024, }, + cgroupVersion: cgroupV1, }, { description: "pod without overhead defined", - pod: &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - UID: "12345678", - Name: "bar", - Namespace: "new", - }, - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - v1.ResourceMemory: resource.MustParse("128Mi"), - }, - Limits: v1.ResourceList{ - v1.ResourceMemory: resource.MustParse("256Mi"), - }, - }, - }, - }, - }, - }, + pod: getPodWithoutOverhead(), expectedResource: &runtimeapi.LinuxContainerResources{ MemoryLimitInBytes: 268435456, CpuPeriod: 100000, @@ -118,10 +127,45 @@ func TestApplySandboxResources(t *testing.T) { CpuShares: 2, }, expectedOverhead: &runtimeapi.LinuxContainerResources{}, + cgroupVersion: cgroupV1, + }, + { + description: "pod with overhead defined", + pod: getPodWithOverhead(), + expectedResource: &runtimeapi.LinuxContainerResources{ + MemoryLimitInBytes: 268435456, + CpuPeriod: 100000, + CpuQuota: 400000, + CpuShares: 2048, + Unified: map[string]string{"memory.oom.group": "1"}, + }, + expectedOverhead: &runtimeapi.LinuxContainerResources{ + MemoryLimitInBytes: 134217728, + CpuPeriod: 100000, + CpuQuota: 100000, + CpuShares: 1024, + Unified: map[string]string{"memory.oom.group": "1"}, + }, + cgroupVersion: cgroupV2, + }, + { + description: "pod without overhead defined", + pod: getPodWithoutOverhead(), + expectedResource: &runtimeapi.LinuxContainerResources{ + MemoryLimitInBytes: 268435456, + CpuPeriod: 100000, + CpuQuota: 0, + CpuShares: 2, + Unified: map[string]string{"memory.oom.group": "1"}, + }, + expectedOverhead: &runtimeapi.LinuxContainerResources{}, + cgroupVersion: cgroupV2, }, } for i, test := range tests { + setCgroupVersionDuringTest(test.cgroupVersion) + m.applySandboxResources(test.pod, config) assert.Equal(t, test.expectedResource, config.Linux.Resources, "TestCase[%d]: %s", i, test.description) assert.Equal(t, test.expectedOverhead, config.Linux.Overhead, "TestCase[%d]: %s", i, test.description) diff --git a/test/e2e_node/swap_test.go b/test/e2e_node/swap_test.go new file mode 100644 index 00000000000..7df97af920c --- /dev/null +++ b/test/e2e_node/swap_test.go @@ -0,0 +1,254 @@ +/* +Copyright 2023 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2enode + +import ( + "context" + "fmt" + "github.com/onsi/ginkgo/v2" + "github.com/onsi/gomega" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/rand" + utilfeature "k8s.io/apiserver/pkg/util/feature" + "k8s.io/kubernetes/pkg/features" + "k8s.io/kubernetes/pkg/kubelet/types" + "k8s.io/kubernetes/test/e2e/framework" + e2epod "k8s.io/kubernetes/test/e2e/framework/pod" + testutils "k8s.io/kubernetes/test/utils" + admissionapi "k8s.io/pod-security-admission/api" + "path/filepath" + "strconv" +) + +const ( + cgroupBasePath = "/sys/fs/cgroup/" + cgroupV1SwapLimitFile = "/memory/memory.memsw.limit_in_bytes" + cgroupV2SwapLimitFile = "memory.swap.max" + cgroupV1MemLimitFile = "/memory/memory.limit_in_bytes" +) + +var _ = SIGDescribe("Swap [NodeConformance][LinuxOnly]", func() { + f := framework.NewDefaultFramework("swap-test") + f.NamespacePodSecurityEnforceLevel = admissionapi.LevelBaseline + + ginkgo.DescribeTable("with configuration", func(qosClass v1.PodQOSClass, memoryRequestEqualLimit bool) { + ginkgo.By(fmt.Sprintf("Creating a pod of QOS class %s. memoryRequestEqualLimit: %t", qosClass, memoryRequestEqualLimit)) + pod := getSwapTestPod(f, qosClass, memoryRequestEqualLimit) + pod = runPodAndWaitUntilScheduled(f, pod) + + isCgroupV2 := isPodCgroupV2(f, pod) + isLimitedSwap := isLimitedSwap(f, pod) + + if !isSwapFeatureGateEnabled() || !isCgroupV2 || (isLimitedSwap && (qosClass != v1.PodQOSBurstable || memoryRequestEqualLimit)) { + ginkgo.By(fmt.Sprintf("Expecting no swap. feature gate on? %t isCgroupV2? %t is QoS burstable? %t", isSwapFeatureGateEnabled(), isCgroupV2, qosClass == v1.PodQOSBurstable)) + expectNoSwap(f, pod, isCgroupV2) + return + } + + if !isLimitedSwap { + ginkgo.By("expecting unlimited swap") + expectUnlimitedSwap(f, pod, isCgroupV2) + return + } + + ginkgo.By("expecting limited swap") + expectedSwapLimit := calcSwapForBurstablePod(f, pod) + expectLimitedSwap(f, pod, expectedSwapLimit) + }, + ginkgo.Entry("QOS Best-effort", v1.PodQOSBestEffort, false), + ginkgo.Entry("QOS Burstable", v1.PodQOSBurstable, false), + ginkgo.Entry("QOS Burstable with memory request equal to limit", v1.PodQOSBurstable, true), + ginkgo.Entry("QOS Guaranteed", v1.PodQOSGuaranteed, false), + ) +}) + +// Note that memoryRequestEqualLimit is effective only when qosClass is PodQOSBurstable.
+func getSwapTestPod(f *framework.Framework, qosClass v1.PodQOSClass, memoryRequestEqualLimit bool) *v1.Pod { + podMemoryAmount := resource.MustParse("128Mi") + + var resources v1.ResourceRequirements + switch qosClass { + case v1.PodQOSBestEffort: + // nothing to do in this case + case v1.PodQOSBurstable: + resources = v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceMemory: podMemoryAmount, + }, + } + + if memoryRequestEqualLimit { + resources.Limits = resources.Requests + } + case v1.PodQOSGuaranteed: + resources = v1.ResourceRequirements{ + Limits: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("200m"), + v1.ResourceMemory: podMemoryAmount, + }, + } + resources.Requests = resources.Limits + } + + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod-swap-" + rand.String(5), + Namespace: f.Namespace.Name, + }, + Spec: v1.PodSpec{ + RestartPolicy: v1.RestartPolicyAlways, + Containers: []v1.Container{ + { + Name: "busybox-container", + Image: busyboxImage, + Command: []string{"sleep", "600"}, + Resources: resources, + }, + }, + }, + } + + return pod +} + +func runPodAndWaitUntilScheduled(f *framework.Framework, pod *v1.Pod) *v1.Pod { + ginkgo.By("running swap test pod") + podClient := e2epod.NewPodClient(f) + + pod = podClient.CreateSync(context.Background(), pod) + pod, err := podClient.Get(context.Background(), pod.Name, metav1.GetOptions{}) + + framework.ExpectNoError(err) + isReady, err := testutils.PodRunningReady(pod) + framework.ExpectNoError(err) + gomega.ExpectWithOffset(1, isReady).To(gomega.BeTrue(), "pod should be ready") + + return pod +} + +func isSwapFeatureGateEnabled() bool { + ginkgo.By("figuring out if the NodeSwap feature gate is turned on") + return utilfeature.DefaultFeatureGate.Enabled(features.NodeSwap) +} + +func readCgroupFile(f *framework.Framework, pod *v1.Pod, filename string) string { + filePath := filepath.Join(cgroupBasePath, filename) + + ginkgo.By("reading cgroup file " + filePath) + output := e2epod.ExecCommandInContainer(f, pod.Name, pod.Spec.Containers[0].Name, "/bin/sh", "-ec", "cat "+filePath) + + return output +} + +func isPodCgroupV2(f *framework.Framework, pod *v1.Pod) bool { + ginkgo.By("figuring out if the test pod runs under cgroup v2") + output := e2epod.ExecCommandInContainer(f, pod.Name, pod.Spec.Containers[0].Name, "/bin/sh", "-ec", `if test -f "/sys/fs/cgroup/cgroup.controllers"; then echo "true"; else echo "false"; fi`) + + return output == "true" +} + +func expectNoSwap(f *framework.Framework, pod *v1.Pod, isCgroupV2 bool) { + if isCgroupV2 { + swapLimit := readCgroupFile(f, pod, cgroupV2SwapLimitFile) + gomega.ExpectWithOffset(1, swapLimit).To(gomega.Equal("0"), "max swap allowed should be zero") + } else { + swapPlusMemLimit := readCgroupFile(f, pod, cgroupV1SwapLimitFile) + memLimit := readCgroupFile(f, pod, cgroupV1MemLimitFile) + gomega.ExpectWithOffset(1, swapPlusMemLimit).ToNot(gomega.BeEmpty()) + gomega.ExpectWithOffset(1, swapPlusMemLimit).To(gomega.Equal(memLimit)) + } +} + +func expectUnlimitedSwap(f *framework.Framework, pod *v1.Pod, isCgroupV2 bool) { + if isCgroupV2 { + swapLimit := readCgroupFile(f, pod, cgroupV2SwapLimitFile) + gomega.ExpectWithOffset(1, swapLimit).To(gomega.Equal("max"), "max swap allowed should be \"max\"") + } else { + swapPlusMemLimit := readCgroupFile(f, pod, cgroupV1SwapLimitFile) + gomega.ExpectWithOffset(1, swapPlusMemLimit).To(gomega.Equal("-1")) + } +} + +// supports v2 only as v1 shouldn't support LimitedSwap +func expectLimitedSwap(f *framework.Framework, pod *v1.Pod,
expectedSwapLimit int64) { + swapLimitStr := readCgroupFile(f, pod, cgroupV2SwapLimitFile) + + swapLimit, err := strconv.Atoi(swapLimitStr) + framework.ExpectNoError(err, "cannot convert swap limit to int") + + // cgroup values are always aligned w.r.t. the page size, which is usually 4Ki + const cgroupAlignment int64 = 4 * 1024 // 4Ki + const errMsg = "swap limitation is not as expected" + + gomega.ExpectWithOffset(1, int64(swapLimit)).To( + gomega.And( + gomega.BeNumerically(">=", expectedSwapLimit-cgroupAlignment), + gomega.BeNumerically("<=", expectedSwapLimit+cgroupAlignment), + ), + errMsg, + ) +} + +func getSwapCapacity(f *framework.Framework, pod *v1.Pod) int64 { + output := e2epod.ExecCommandInContainer(f, pod.Name, pod.Spec.Containers[0].Name, "/bin/sh", "-ec", "free -b | grep Swap | xargs | cut -d\" \" -f2") + + swapCapacity, err := strconv.Atoi(output) + framework.ExpectNoError(err, "cannot convert swap size to int") + + ginkgo.By(fmt.Sprintf("providing swap capacity: %d", swapCapacity)) + + return int64(swapCapacity) +} + +func getMemoryCapacity(f *framework.Framework, pod *v1.Pod) int64 { + nodes, err := f.ClientSet.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{}) + framework.ExpectNoError(err, "failed listing nodes") + + for _, node := range nodes.Items { + if node.Name != pod.Spec.NodeName { + continue + } + + memCapacity := node.Status.Capacity[v1.ResourceMemory] + return memCapacity.Value() + } + + framework.ExpectNoError(fmt.Errorf("node %s wasn't found", pod.Spec.NodeName)) + return 0 +} + +func calcSwapForBurstablePod(f *framework.Framework, pod *v1.Pod) int64 { + nodeMemoryCapacity := getMemoryCapacity(f, pod) + nodeSwapCapacity := getSwapCapacity(f, pod) + containerMemoryRequest := pod.Spec.Containers[0].Resources.Requests.Memory().Value() + + containerMemoryProportion := float64(containerMemoryRequest) / float64(nodeMemoryCapacity) + swapAllocation := containerMemoryProportion * float64(nodeSwapCapacity) + ginkgo.By(fmt.Sprintf("Calculating swap for burstable pods: nodeMemoryCapacity: %d, nodeSwapCapacity: %d, containerMemoryRequest: %d, swapAllocation: %d", + nodeMemoryCapacity, nodeSwapCapacity, containerMemoryRequest, int64(swapAllocation))) + + return int64(swapAllocation) +} + +func isLimitedSwap(f *framework.Framework, pod *v1.Pod) bool { + kubeletCfg, err := getCurrentKubeletConfig(context.Background()) + framework.ExpectNoError(err, "cannot get kubelet config") + + return kubeletCfg.MemorySwap.SwapBehavior == types.LimitedSwap +}
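
Note on the LimitedSwap math: the kubelet-side calcSwapForBurstablePods and the e2e-side calcSwapForBurstablePod both apply the same proportional formula from KEP-2400, swapLimit = (containerMemoryRequest / nodeTotalMemory) * totalPodsSwapAvailable. The standalone Go sketch below is illustrative only and not part of this patch (the swapForBurstable helper and the main function are hypothetical names); it mirrors that arithmetic using the figures from TestGenerateLinuxContainerResourcesWithSwap, where a node with 40Gi of memory and 5Gi of swap gives a 1Gi-request container a 128Mi swap limit and a 2Gi-request container a 256Mi swap limit.

package main

import "fmt"

// swapForBurstable mirrors the proportional formula used by the patch:
// (containerMemoryRequest / nodeTotalMemory) * totalPodsSwapAvailable.
// It is a standalone illustration, not the kubelet implementation itself.
func swapForBurstable(containerMemoryRequest, nodeTotalMemory, totalPodsSwapAvailable int64) (int64, error) {
	if nodeTotalMemory <= 0 {
		return 0, fmt.Errorf("total node memory is 0")
	}
	if containerMemoryRequest > nodeTotalMemory {
		return 0, fmt.Errorf("container request %d is larger than total node memory %d", containerMemoryRequest, nodeTotalMemory)
	}
	proportion := float64(containerMemoryRequest) / float64(nodeTotalMemory)
	return int64(proportion * float64(totalPodsSwapAvailable)), nil
}

func main() {
	const gi = int64(1024 * 1024 * 1024)
	nodeMemory := 40 * gi // same value as machineInfo.MemoryCapacity in the unit test
	nodeSwap := 5 * gi    // same value as machineInfo.SwapCapacity in the unit test

	for _, request := range []int64{1 * gi, 2 * gi} {
		limit, err := swapForBurstable(request, nodeMemory, nodeSwap)
		if err != nil {
			panic(err)
		}
		// 1Gi request -> (1/40)*5Gi = 134217728 bytes (128Mi); 2Gi request -> 268435456 bytes (256Mi).
		fmt.Printf("request=%d bytes, swap limit=%d bytes\n", request, limit)
	}
}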