From f37aec6c57f2d3f47e6142da33a6cd2b8fa854c2 Mon Sep 17 00:00:00 2001 From: Itamar Holder Date: Wed, 15 Mar 2023 13:31:08 +0200 Subject: [PATCH 1/6] Add LIMITED_SWAP env var to enable limited swap Signed-off-by: Itamar Holder --- hack/local-up-cluster.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/hack/local-up-cluster.sh b/hack/local-up-cluster.sh index 62117a9e8d6..e6195d73383 100755 --- a/hack/local-up-cluster.sh +++ b/hack/local-up-cluster.sh @@ -47,6 +47,8 @@ CGROUP_DRIVER=${CGROUP_DRIVER:-""} CGROUP_ROOT=${CGROUP_ROOT:-""} # owner of client certs, default to current user if not specified USER=${USER:-$(whoami)} +# if true, limited swap is being used instead of unlimited swap (default) +LIMITED_SWAP=${LIMITED_SWAP:-""} # required for cni installation CNI_CONFIG_DIR=${CNI_CONFIG_DIR:-/etc/cni/net.d} @@ -832,6 +834,13 @@ tracing: EOF fi + if [[ "$LIMITED_SWAP" == "true" ]]; then + cat <<EOF >> "${TMP_DIR}"/kubelet.yaml +memorySwap: + swapBehavior: LimitedSwap +EOF + fi + { # authentication echo "authentication:" From e4da568f3319bb44e1b16cd021d866c81b758d82 Mon Sep 17 00:00:00 2001 From: Itamar Holder Date: Tue, 20 Jun 2023 11:28:21 +0200 Subject: [PATCH 2/6] Make kuberuntime unit tests environment independent + support cgroup v2 Before this commit, to find out the current node's cgroup version, a libcontainer function was used directly. This way, the cgroup version is hard to mock and is dependent on the environment in which the unit tests are being run. After this commit, libcontainer's function is wrapped within a variable function that can be re-assigned by the tests. This way the tests can easily mock the cgroup version and become environment independent. After this commit both cgroup versions v1 and v2 are being tested, no matter in which environment the tests run. Signed-off-by: Itamar Holder --- .../kuberuntime_container_linux.go | 10 +- .../kuberuntime_container_linux_test.go | 84 ++++++++++- .../kuberuntime_sandbox_linux_test.go | 140 ++++++++++++------ 3 files changed, 178 insertions(+), 56 deletions(-) diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go index 466378deda3..4153ab7e13c 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go @@ -46,7 +46,7 @@ func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config enforceMemoryQoS := false // Set memory.min and memory.high if MemoryQoS enabled with cgroups v2 if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) && - libcontainercgroups.IsCgroup2UnifiedMode() { + isCgroup2UnifiedMode() { enforceMemoryQoS = true } cl, err := m.generateLinuxContainerConfig(container, pod, uid, username, nsTarget, enforceMemoryQoS) @@ -171,7 +171,7 @@ func (m *kubeGenericRuntimeManager) generateContainerResources(pod *v1.Pod, cont enforceMemoryQoS := false // Set memory.min and memory.high if MemoryQoS enabled with cgroups v2 if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) && - libcontainercgroups.IsCgroup2UnifiedMode() { + isCgroup2UnifiedMode() { enforceMemoryQoS = true } return &runtimeapi.ContainerResources{ @@ -216,7 +216,7 @@ func (m *kubeGenericRuntimeManager) calculateLinuxResources(cpuRequest, cpuLimit } // runc requires cgroupv2 for unified mode - if libcontainercgroups.IsCgroup2UnifiedMode() { + if isCgroup2UnifiedMode() { resources.Unified = map[string]string{ // Ask the kernel to kill all processes in the container cgroup in case of OOM.
// See memory.oom.group in https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html for @@ -298,3 +298,7 @@ func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *k } return cStatusResources } + +var isCgroup2UnifiedMode = func() bool { + return libcontainercgroups.IsCgroup2UnifiedMode() +} diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go index ec56dc733c4..996754c2417 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go @@ -244,11 +244,12 @@ func TestCalculateLinuxResources(t *testing.T) { } tests := []struct { - name string - cpuReq *resource.Quantity - cpuLim *resource.Quantity - memLim *resource.Quantity - expected *runtimeapi.LinuxContainerResources + name string + cpuReq *resource.Quantity + cpuLim *resource.Quantity + memLim *resource.Quantity + expected *runtimeapi.LinuxContainerResources + cgroupVersion CgroupVersion }{ { name: "Request128MBLimit256MB", @@ -261,6 +262,7 @@ func TestCalculateLinuxResources(t *testing.T) { CpuShares: 1024, MemoryLimitInBytes: 134217728, }, + cgroupVersion: cgroupV1, }, { name: "RequestNoMemory", @@ -273,6 +275,7 @@ func TestCalculateLinuxResources(t *testing.T) { CpuShares: 2048, MemoryLimitInBytes: 0, }, + cgroupVersion: cgroupV1, }, { name: "RequestNilCPU", @@ -284,6 +287,7 @@ func TestCalculateLinuxResources(t *testing.T) { CpuShares: 2048, MemoryLimitInBytes: 0, }, + cgroupVersion: cgroupV1, }, { name: "RequestZeroCPU", @@ -296,9 +300,66 @@ func TestCalculateLinuxResources(t *testing.T) { CpuShares: 2, MemoryLimitInBytes: 0, }, + cgroupVersion: cgroupV1, + }, + { + name: "Request128MBLimit256MB", + cpuReq: generateResourceQuantity("1"), + cpuLim: generateResourceQuantity("2"), + memLim: generateResourceQuantity("128Mi"), + expected: &runtimeapi.LinuxContainerResources{ + CpuPeriod: 100000, + CpuQuota: 200000, + CpuShares: 1024, + MemoryLimitInBytes: 134217728, + Unified: map[string]string{"memory.oom.group": "1"}, + }, + cgroupVersion: cgroupV2, + }, + { + name: "RequestNoMemory", + cpuReq: generateResourceQuantity("2"), + cpuLim: generateResourceQuantity("8"), + memLim: generateResourceQuantity("0"), + expected: &runtimeapi.LinuxContainerResources{ + CpuPeriod: 100000, + CpuQuota: 800000, + CpuShares: 2048, + MemoryLimitInBytes: 0, + Unified: map[string]string{"memory.oom.group": "1"}, + }, + cgroupVersion: cgroupV2, + }, + { + name: "RequestNilCPU", + cpuLim: generateResourceQuantity("2"), + memLim: generateResourceQuantity("0"), + expected: &runtimeapi.LinuxContainerResources{ + CpuPeriod: 100000, + CpuQuota: 200000, + CpuShares: 2048, + MemoryLimitInBytes: 0, + Unified: map[string]string{"memory.oom.group": "1"}, + }, + cgroupVersion: cgroupV2, + }, + { + name: "RequestZeroCPU", + cpuReq: generateResourceQuantity("0"), + cpuLim: generateResourceQuantity("2"), + memLim: generateResourceQuantity("0"), + expected: &runtimeapi.LinuxContainerResources{ + CpuPeriod: 100000, + CpuQuota: 200000, + CpuShares: 2, + MemoryLimitInBytes: 0, + Unified: map[string]string{"memory.oom.group": "1"}, + }, + cgroupVersion: cgroupV2, }, } for _, test := range tests { + setCgroupVersionDuringTest(test.cgroupVersion) linuxContainerResources := m.calculateLinuxResources(test.cpuReq, test.cpuLim, test.memLim) assert.Equal(t, test.expected, linuxContainerResources) } @@ -888,3 +949,16 @@ func TestGenerateLinuxContainerResources(t *testing.T) { } 
//TODO(vinaykul,InPlacePodVerticalScaling): Add unit tests for cgroup v1 & v2 } + +type CgroupVersion string + +const ( + cgroupV1 CgroupVersion = "v1" + cgroupV2 CgroupVersion = "v2" +) + +func setCgroupVersionDuringTest(version CgroupVersion) { + isCgroup2UnifiedMode = func() bool { + return version == cgroupV2 + } +} diff --git a/pkg/kubelet/kuberuntime/kuberuntime_sandbox_linux_test.go b/pkg/kubelet/kuberuntime/kuberuntime_sandbox_linux_test.go index 648a218549f..e302ee9c263 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_sandbox_linux_test.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_sandbox_linux_test.go @@ -38,6 +38,59 @@ func TestApplySandboxResources(t *testing.T) { Linux: &runtimeapi.LinuxPodSandboxConfig{}, } + getPodWithOverhead := func() *v1.Pod { + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "12345678", + Name: "bar", + Namespace: "new", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceMemory: resource.MustParse("128Mi"), + v1.ResourceCPU: resource.MustParse("2"), + }, + Limits: v1.ResourceList{ + v1.ResourceMemory: resource.MustParse("256Mi"), + v1.ResourceCPU: resource.MustParse("4"), + }, + }, + }, + }, + Overhead: v1.ResourceList{ + v1.ResourceMemory: resource.MustParse("128Mi"), + v1.ResourceCPU: resource.MustParse("1"), + }, + }, + } + } + getPodWithoutOverhead := func() *v1.Pod { + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "12345678", + Name: "bar", + Namespace: "new", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceMemory: resource.MustParse("128Mi"), + }, + Limits: v1.ResourceList{ + v1.ResourceMemory: resource.MustParse("256Mi"), + }, + }, + }, + }, + }, + } + } + require.NoError(t, err) tests := []struct { @@ -45,36 +98,11 @@ func TestApplySandboxResources(t *testing.T) { pod *v1.Pod expectedResource *runtimeapi.LinuxContainerResources expectedOverhead *runtimeapi.LinuxContainerResources + cgroupVersion CgroupVersion }{ { description: "pod with overhead defined", - pod: &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - UID: "12345678", - Name: "bar", - Namespace: "new", - }, - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - v1.ResourceMemory: resource.MustParse("128Mi"), - v1.ResourceCPU: resource.MustParse("2"), - }, - Limits: v1.ResourceList{ - v1.ResourceMemory: resource.MustParse("256Mi"), - v1.ResourceCPU: resource.MustParse("4"), - }, - }, - }, - }, - Overhead: v1.ResourceList{ - v1.ResourceMemory: resource.MustParse("128Mi"), - v1.ResourceCPU: resource.MustParse("1"), - }, - }, - }, + pod: getPodWithOverhead(), expectedResource: &runtimeapi.LinuxContainerResources{ MemoryLimitInBytes: 268435456, CpuPeriod: 100000, @@ -87,30 +115,11 @@ func TestApplySandboxResources(t *testing.T) { CpuQuota: 100000, CpuShares: 1024, }, + cgroupVersion: cgroupV1, }, { description: "pod without overhead defined", - pod: &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - UID: "12345678", - Name: "bar", - Namespace: "new", - }, - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ - v1.ResourceMemory: resource.MustParse("128Mi"), - }, - Limits: v1.ResourceList{ - v1.ResourceMemory: resource.MustParse("256Mi"), - }, - }, - }, - }, - }, - }, + pod: getPodWithoutOverhead(), expectedResource: &runtimeapi.LinuxContainerResources{ MemoryLimitInBytes: 
268435456, CpuPeriod: 100000, @@ -118,10 +127,45 @@ func TestApplySandboxResources(t *testing.T) { CpuQuota: 0, CpuShares: 2, }, expectedOverhead: &runtimeapi.LinuxContainerResources{}, + cgroupVersion: cgroupV1, + }, + { + description: "pod with overhead defined", + pod: getPodWithOverhead(), + expectedResource: &runtimeapi.LinuxContainerResources{ + MemoryLimitInBytes: 268435456, + CpuPeriod: 100000, + CpuQuota: 400000, + CpuShares: 2048, + Unified: map[string]string{"memory.oom.group": "1"}, + }, + expectedOverhead: &runtimeapi.LinuxContainerResources{ + MemoryLimitInBytes: 134217728, + CpuPeriod: 100000, + CpuQuota: 100000, + CpuShares: 1024, + Unified: map[string]string{"memory.oom.group": "1"}, + }, + cgroupVersion: cgroupV2, + }, + { + description: "pod without overhead defined", + pod: getPodWithoutOverhead(), + expectedResource: &runtimeapi.LinuxContainerResources{ + MemoryLimitInBytes: 268435456, + CpuPeriod: 100000, + CpuQuota: 0, + CpuShares: 2, + Unified: map[string]string{"memory.oom.group": "1"}, + }, + expectedOverhead: &runtimeapi.LinuxContainerResources{}, + cgroupVersion: cgroupV2, + }, } for i, test := range tests { + setCgroupVersionDuringTest(test.cgroupVersion) + + m.applySandboxResources(test.pod, config) assert.Equal(t, test.expectedResource, config.Linux.Resources, "TestCase[%d]: %s", i, test.description) assert.Equal(t, test.expectedOverhead, config.Linux.Overhead, "TestCase[%d]: %s", i, test.description) From a30410d9ceb7a5dce0d038925f894a1e1064b4ec Mon Sep 17 00:00:00 2001 From: Itamar Holder Date: Thu, 13 Apr 2023 17:36:04 +0300 Subject: [PATCH 3/6] LimitedSwap: Automatically configure swap limit for Burstable QoS Pods After this commit, when LimitedSwap is enabled, containers get their swap access limited with respect to the container's memory request, the total physical memory on the node, and the swap size on the node. Pods of Best-Effort / Guaranteed QoS classes don't get to swap. In addition, containers with memory requests that are equal to their memory limits also don't get to swap. The swap limitation is calculated in the following way: 1. Calculate the container's memory proportion relative to the node's memory: - Divide the container's memory request by the node's total physical memory. Let's call this value ContainerMemoryProportion. 2. Multiply the container memory proportion by the available swap memory for Pods: Meaning: ContainerMemoryProportion * TotalPodsSwapAvailable.
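As a rough, hypothetical illustration of the two steps above (the helper below is a self-contained sketch mirroring the calcSwapForBurstablePods function added in this patch; the node sizes and the container request are made-up numbers):

    package main

    import "fmt"

    // calcSwap sketches the LimitedSwap formula: the fraction of node memory
    // requested by the container, multiplied by the swap available to pods.
    func calcSwap(containerMemoryRequest, nodeTotalMemory, totalPodsSwapAvailable int64) int64 {
        proportion := float64(containerMemoryRequest) / float64(nodeTotalMemory)
        return int64(proportion * float64(totalPodsSwapAvailable))
    }

    func main() {
        // Hypothetical node with 40Gi of memory and 5Gi of swap; the container requests 1Gi.
        fmt.Println(calcSwap(1<<30, 40<<30, 5<<30)) // prints 134217728, i.e. 128Mi of swap
    }

In other words, a container requesting 1Gi on such a node would be allowed (1Gi / 40Gi) * 5Gi = 128Mi of swap.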
For more information: https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/2400-node-swap/README.md Signed-off-by: Itamar Holder --- pkg/kubelet/cm/cgroup_manager_linux.go | 11 +- pkg/kubelet/cm/helpers_linux.go | 2 +- .../cm/node_container_manager_linux.go | 2 +- pkg/kubelet/cm/qos_container_manager_linux.go | 4 +- .../kuberuntime_container_linux.go | 108 ++++++++++++++++-- 5 files changed, 107 insertions(+), 20 deletions(-) diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go index c4be02a45b2..f54eaa2979f 100644 --- a/pkg/kubelet/cm/cgroup_manager_linux.go +++ b/pkg/kubelet/cm/cgroup_manager_linux.go @@ -45,11 +45,12 @@ import ( const ( // systemdSuffix is the cgroup name suffix for systemd systemdSuffix string = ".slice" - // MemoryMin is memory.min for cgroup v2 - MemoryMin string = "memory.min" - // MemoryHigh is memory.high for cgroup v2 - MemoryHigh string = "memory.high" - Cgroup2MaxCpuLimit string = "max" + // Cgroup2MemoryMin is memory.min for cgroup v2 + Cgroup2MemoryMin string = "memory.min" + // Cgroup2MemoryHigh is memory.high for cgroup v2 + Cgroup2MemoryHigh string = "memory.high" + Cgroup2MaxCpuLimit string = "max" + Cgroup2MaxSwapFilename string = "memory.swap.max" ) var RootCgroupName = CgroupName([]string{}) diff --git a/pkg/kubelet/cm/helpers_linux.go b/pkg/kubelet/cm/helpers_linux.go index 18b0df17bfc..8a144e7a73c 100644 --- a/pkg/kubelet/cm/helpers_linux.go +++ b/pkg/kubelet/cm/helpers_linux.go @@ -196,7 +196,7 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64, } if memoryMin > 0 { result.Unified = map[string]string{ - MemoryMin: strconv.FormatInt(memoryMin, 10), + Cgroup2MemoryMin: strconv.FormatInt(memoryMin, 10), } } } diff --git a/pkg/kubelet/cm/node_container_manager_linux.go b/pkg/kubelet/cm/node_container_manager_linux.go index 74221c67047..b57403dd95b 100644 --- a/pkg/kubelet/cm/node_container_manager_linux.go +++ b/pkg/kubelet/cm/node_container_manager_linux.go @@ -147,7 +147,7 @@ func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.
if rp.Unified == nil { rp.Unified = make(map[string]string) } - rp.Unified[MemoryMin] = strconv.FormatInt(*rp.Memory, 10) + rp.Unified[Cgroup2MemoryMin] = strconv.FormatInt(*rp.Memory, 10) } } diff --git a/pkg/kubelet/cm/qos_container_manager_linux.go b/pkg/kubelet/cm/qos_container_manager_linux.go index 89b3adae9af..abf4487ee5d 100644 --- a/pkg/kubelet/cm/qos_container_manager_linux.go +++ b/pkg/kubelet/cm/qos_container_manager_linux.go @@ -292,7 +292,7 @@ func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*Cgrou if configs[v1.PodQOSBurstable].ResourceParameters.Unified == nil { configs[v1.PodQOSBurstable].ResourceParameters.Unified = make(map[string]string) } - configs[v1.PodQOSBurstable].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(burstableMin, 10) + configs[v1.PodQOSBurstable].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(burstableMin, 10) klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSBurstable, "memoryMin", burstableMin) } @@ -300,7 +300,7 @@ func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*Cgrou if configs[v1.PodQOSGuaranteed].ResourceParameters.Unified == nil { configs[v1.PodQOSGuaranteed].ResourceParameters.Unified = make(map[string]string) } - configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(guaranteedMin, 10) + configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(guaranteedMin, 10) klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSGuaranteed, "memoryMin", guaranteedMin) } } diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go index 4153ab7e13c..c600d49bc25 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go @@ -20,6 +20,9 @@ limitations under the License. package kuberuntime import ( + "fmt" + cadvisorv1 "github.com/google/cadvisor/info/v1" + kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" "math" "os" "strconv" @@ -99,21 +102,17 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod, lcr.HugepageLimits = GetHugepageLimitsFromResources(container.Resources) - if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) { + if swapConfigurationHelper := newSwapConfigurationHelper(*m.machineInfo); utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) { // NOTE(ehashman): Behaviour is defined in the opencontainers runtime spec: // https://github.com/opencontainers/runtime-spec/blob/1c3f411f041711bbeecf35ff7e93461ea6789220/config-linux.md#memory switch m.memorySwapBehavior { - case kubelettypes.UnlimitedSwap: - // -1 = unlimited swap - lcr.MemorySwapLimitInBytes = -1 case kubelettypes.LimitedSwap: - fallthrough + swapConfigurationHelper.ConfigureLimitedSwap(lcr, pod, container) default: - // memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit - // Some swapping is still possible. - // Note that if memory limit is 0, memory swap limit is ignored. 
- lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes + swapConfigurationHelper.ConfigureUnlimitedSwap(lcr) } + } else { + swapConfigurationHelper.ConfigureNoSwap(lcr) } // Set memory.min and memory.high to enforce MemoryQoS @@ -122,7 +121,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod, memoryRequest := container.Resources.Requests.Memory().Value() memoryLimit := container.Resources.Limits.Memory().Value() if memoryRequest != 0 { - unified[cm.MemoryMin] = strconv.FormatInt(memoryRequest, 10) + unified[cm.Cgroup2MemoryMin] = strconv.FormatInt(memoryRequest, 10) } // Guaranteed pods by their QoS definition requires that memory request equals memory limit and cpu request must equal cpu limit. @@ -148,7 +147,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod, } } if memoryHigh != 0 && memoryHigh > memoryRequest { - unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10) + unified[cm.Cgroup2MemoryHigh] = strconv.FormatInt(memoryHigh, 10) } } if len(unified) > 0 { @@ -299,6 +298,93 @@ func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *k return cStatusResources } +// Note: this function variable is being added here so it would be possible to mock +// the cgroup version for unit tests by assigning a new mocked function into it. Without it, +// the cgroup version would solely depend on the environment running the test. var isCgroup2UnifiedMode = func() bool { return libcontainercgroups.IsCgroup2UnifiedMode() } + +type swapConfigurationHelper struct { + machineInfo cadvisorv1.MachineInfo +} + +func newSwapConfigurationHelper(machineInfo cadvisorv1.MachineInfo) *swapConfigurationHelper { + return &swapConfigurationHelper{machineInfo: machineInfo} +} + +func (m swapConfigurationHelper) ConfigureLimitedSwap(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) { + podQos := kubeapiqos.GetPodQOS(pod) + containerDoesNotRequestMemory := container.Resources.Requests.Memory().IsZero() && container.Resources.Limits.Memory().IsZero() + memoryRequestEqualsToLimit := container.Resources.Requests.Memory().Cmp(*container.Resources.Limits.Memory()) == 0 + + if podQos != v1.PodQOSBurstable || containerDoesNotRequestMemory || !isCgroup2UnifiedMode() || memoryRequestEqualsToLimit { + m.ConfigureNoSwap(lcr) + return + } + + containerMemoryRequest := container.Resources.Requests.Memory() + swapLimit, err := calcSwapForBurstablePods(containerMemoryRequest.Value(), int64(m.machineInfo.MemoryCapacity), int64(m.machineInfo.SwapCapacity)) + + if err != nil { + klog.ErrorS(err, "cannot calculate swap allocation amount; disallowing swap") + m.ConfigureNoSwap(lcr) + return + } + + m.configureSwap(lcr, swapLimit) +} + +func (m swapConfigurationHelper) ConfigureNoSwap(lcr *runtimeapi.LinuxContainerResources) { + if !isCgroup2UnifiedMode() { + // memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit + // Some swapping is still possible. + // Note that if memory limit is 0, memory swap limit is ignored. 
+ lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes + return + } + + m.configureSwap(lcr, 0) +} + +func (m swapConfigurationHelper) ConfigureUnlimitedSwap(lcr *runtimeapi.LinuxContainerResources) { + if !isCgroup2UnifiedMode() { + m.ConfigureNoSwap(lcr) + return + } + + if lcr.Unified == nil { + lcr.Unified = map[string]string{} + } + + lcr.Unified[cm.Cgroup2MaxSwapFilename] = "max" +} + +func (m swapConfigurationHelper) configureSwap(lcr *runtimeapi.LinuxContainerResources, swapMemory int64) { + if !isCgroup2UnifiedMode() { + klog.ErrorS(fmt.Errorf("swap configuration is not supported with cgroup v1"), "swap configuration under cgroup v1 is unexpected") + return + } + + if lcr.Unified == nil { + lcr.Unified = map[string]string{} + } + + lcr.Unified[cm.Cgroup2MaxSwapFilename] = fmt.Sprintf("%d", swapMemory) +} + +// The swap limit is calculated as (containerMemoryRequest / nodeTotalMemory) * totalPodsSwapAvailable. +// For more info, please look at the following KEP: https://kep.k8s.io/2400 +func calcSwapForBurstablePods(containerMemoryRequest, nodeTotalMemory, totalPodsSwapAvailable int64) (int64, error) { + if nodeTotalMemory <= 0 { + return 0, fmt.Errorf("total node memory is 0") + } + if containerMemoryRequest > nodeTotalMemory { + return 0, fmt.Errorf("container request %d is larger than total node memory %d", containerMemoryRequest, nodeTotalMemory) + } + + containerMemoryProportion := float64(containerMemoryRequest) / float64(nodeTotalMemory) + swapAllocation := containerMemoryProportion * float64(totalPodsSwapAvailable) + + return int64(swapAllocation), nil +} From 4b6314f815b114dacc7fddcdc74fb6243c8a80c9 Mon Sep 17 00:00:00 2001 From: Itamar Holder Date: Wed, 31 May 2023 13:39:44 +0300 Subject: [PATCH 4/6] Unit test: Swap - Limited/Unlimited Swap, cgroups v1/v2, etc Signed-off-by: Itamar Holder --- .../kuberuntime_container_linux_test.go | 371 +++++++++++++----- 1 file changed, 280 insertions(+), 91 deletions(-) diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go index 996754c2417..b50eee1d4ee 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go @@ -21,6 +21,9 @@ package kuberuntime import ( "context" + "fmt" + "k8s.io/kubernetes/pkg/kubelet/cm" + "k8s.io/kubernetes/pkg/kubelet/types" "math" "os" "reflect" @@ -38,7 +41,6 @@ import ( runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" "k8s.io/kubernetes/pkg/features" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" - kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" ) func makeExpectedConfig(m *kubeGenericRuntimeManager, pod *v1.Pod, containerIndex int, enforceMemoryQoS bool) *runtimeapi.ContainerConfig { @@ -695,96 +697,6 @@ func TestGenerateLinuxContainerConfigNamespaces(t *testing.T) { } } -func TestGenerateLinuxContainerConfigSwap(t *testing.T) { - defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.NodeSwap, true)() - _, _, m, err := createTestRuntimeManager() - if err != nil { - t.Fatalf("error creating test RuntimeManager: %v", err) - } - m.machineInfo.MemoryCapacity = 1000000 - containerName := "test" - - for _, tc := range []struct { - name string - swapSetting string - pod *v1.Pod - expected int64 - }{ - { - name: "config unset, memory limit set", - // no swap setting - pod: &v1.Pod{ - Spec: v1.PodSpec{ - Containers: []v1.Container{{ - Name: containerName, - Resources: v1.ResourceRequirements{ - Limits: v1.ResourceList{ - "memory": resource.MustParse("1000"), },
Requests: v1.ResourceList{ - "memory": resource.MustParse("1000"), - }, - }, - }}, - }, - }, - expected: 1000, - }, - { - name: "config unset, no memory limit", - // no swap setting - pod: &v1.Pod{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - {Name: containerName}, - }, - }, - }, - expected: 0, - }, - { - // Note: behaviour will be the same as previous two cases - name: "config set to LimitedSwap, memory limit set", - swapSetting: kubelettypes.LimitedSwap, - pod: &v1.Pod{ - Spec: v1.PodSpec{ - Containers: []v1.Container{{ - Name: containerName, - Resources: v1.ResourceRequirements{ - Limits: v1.ResourceList{ - "memory": resource.MustParse("1000"), - }, - Requests: v1.ResourceList{ - "memory": resource.MustParse("1000"), - }, - }, - }}, - }, - }, - expected: 1000, - }, - { - name: "UnlimitedSwap enabled", - swapSetting: kubelettypes.UnlimitedSwap, - pod: &v1.Pod{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - {Name: containerName}, - }, - }, - }, - expected: -1, - }, - } { - t.Run(tc.name, func(t *testing.T) { - m.memorySwapBehavior = tc.swapSetting - actual, err := m.generateLinuxContainerConfig(&tc.pod.Spec.Containers[0], tc.pod, nil, "", nil, false) - assert.NoError(t, err) - assert.Equal(t, tc.expected, actual.Resources.MemorySwapLimitInBytes, "memory swap config for %s", tc.name) - }) - } -} - func TestGenerateLinuxContainerResources(t *testing.T) { _, _, m, err := createTestRuntimeManager() assert.NoError(t, err) @@ -936,6 +848,10 @@ func TestGenerateLinuxContainerResources(t *testing.T) { if tc.scalingFg { defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.InPlacePodVerticalScaling, true)() } + + setCgroupVersionDuringTest(cgroupV1) + tc.expected.MemorySwapLimitInBytes = tc.expected.MemoryLimitInBytes + pod.Spec.Containers[0].Resources = v1.ResourceRequirements{Limits: tc.limits, Requests: tc.requests} if len(tc.cStatus) > 0 { pod.Status.ContainerStatuses = tc.cStatus @@ -950,6 +866,279 @@ func TestGenerateLinuxContainerResources(t *testing.T) { //TODO(vinaykul,InPlacePodVerticalScaling): Add unit tests for cgroup v1 & v2 } +func TestGenerateLinuxContainerResourcesWithSwap(t *testing.T) { + _, _, m, err := createTestRuntimeManager() + assert.NoError(t, err) + m.machineInfo.MemoryCapacity = 42949672960 // 40Gb == 40 * 1024^3 + m.machineInfo.SwapCapacity = 5368709120 // 5Gb == 5 * 1024^3 + + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "12345678", + Name: "foo", + Namespace: "bar", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "c1", + }, + { + Name: "c2", + }, + }, + }, + Status: v1.PodStatus{}, + } + + expectNoSwap := func(cgroupVersion CgroupVersion, resources ...*runtimeapi.LinuxContainerResources) { + const msg = "container is expected to not have swap access" + + for _, r := range resources { + switch cgroupVersion { + case cgroupV1: + assert.Equal(t, r.MemoryLimitInBytes, r.MemorySwapLimitInBytes, msg) + case cgroupV2: + assert.Equal(t, "0", r.Unified[cm.Cgroup2MaxSwapFilename], msg) + } + } + } + + expectUnlimitedSwap := func(cgroupVersion CgroupVersion, resources ...*runtimeapi.LinuxContainerResources) { + const msg = "container is expected to have unlimited swap access" + + for _, r := range resources { + switch cgroupVersion { + case cgroupV1: + assert.Equal(t, int64(-1), r.MemorySwapLimitInBytes, msg) + case cgroupV2: + assert.Equal(t, "max", r.Unified[cm.Cgroup2MaxSwapFilename], msg) + } + } + } + + expectSwap := func(cgroupVersion CgroupVersion, swapBytesExpected int64, resources 
*runtimeapi.LinuxContainerResources) { + msg := fmt.Sprintf("container swap is expected to be limited by %d bytes", swapBytesExpected) + + switch cgroupVersion { + case cgroupV1: + assert.Equal(t, resources.MemoryLimitInBytes+swapBytesExpected, resources.MemorySwapLimitInBytes, msg) + case cgroupV2: + assert.Equal(t, fmt.Sprintf("%d", swapBytesExpected), resources.Unified[cm.Cgroup2MaxSwapFilename], msg) + } + } + + calcSwapForBurstablePods := func(containerMemoryRequest int64) int64 { + swapSize, err := calcSwapForBurstablePods(containerMemoryRequest, int64(m.machineInfo.MemoryCapacity), int64(m.machineInfo.SwapCapacity)) + assert.NoError(t, err) + + return swapSize + } + + for _, tc := range []struct { + name string + cgroupVersion CgroupVersion + qosClass v1.PodQOSClass + nodeSwapFeatureGateEnabled bool + swapBehavior string + addContainerWithoutRequests bool + addGuaranteedContainer bool + }{ + // With cgroup v1 + { + name: "cgroups v1, LimitedSwap, Burstable QoS", + cgroupVersion: cgroupV1, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.LimitedSwap, + }, + { + name: "cgroups v1, UnlimitedSwap, Burstable QoS", + cgroupVersion: cgroupV1, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.UnlimitedSwap, + }, + { + name: "cgroups v1, LimitedSwap, Best-effort QoS", + cgroupVersion: cgroupV1, + qosClass: v1.PodQOSBestEffort, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.LimitedSwap, + }, + + // With feature gate turned off + { + name: "NodeSwap feature gate turned off, cgroups v2, LimitedSwap", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: false, + swapBehavior: types.LimitedSwap, + }, + { + name: "NodeSwap feature gate turned off, cgroups v2, UnlimitedSwap", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: false, + swapBehavior: types.UnlimitedSwap, + }, + + // With no swapBehavior, UnlimitedSwap should be the default + { + name: "With no swapBehavior - UnlimitedSwap should be the default", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBestEffort, + nodeSwapFeatureGateEnabled: true, + swapBehavior: "", + }, + + // With Guaranteed and Best-effort QoS + { + name: "Best-effort Qos, cgroups v2, LimitedSwap", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.LimitedSwap, + }, + { + name: "Best-effort Qos, cgroups v2, UnlimitedSwap", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.UnlimitedSwap, + }, + { + name: "Guaranteed Qos, cgroups v2, LimitedSwap", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSGuaranteed, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.LimitedSwap, + }, + { + name: "Guaranteed Qos, cgroups v2, UnlimitedSwap", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSGuaranteed, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.UnlimitedSwap, + }, + + // With a "guaranteed" container (when memory requests equal to limits) + { + name: "Burstable Qos, cgroups v2, LimitedSwap, with a guaranteed container", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.LimitedSwap, + addContainerWithoutRequests: false, + addGuaranteedContainer: true, + }, + { + name: "Burstable Qos, cgroups v2, UnlimitedSwap, with a guaranteed container", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + 
nodeSwapFeatureGateEnabled: true, + swapBehavior: types.UnlimitedSwap, + addContainerWithoutRequests: false, + addGuaranteedContainer: true, + }, + + // Swap is expected to be allocated + { + name: "Burstable Qos, cgroups v2, LimitedSwap", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.LimitedSwap, + addContainerWithoutRequests: false, + addGuaranteedContainer: false, + }, + { + name: "Burstable Qos, cgroups v2, UnlimitedSwap", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.UnlimitedSwap, + addContainerWithoutRequests: false, + addGuaranteedContainer: false, + }, + { + name: "Burstable Qos, cgroups v2, LimitedSwap, with a container with no requests", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.LimitedSwap, + addContainerWithoutRequests: true, + addGuaranteedContainer: false, + }, + { + name: "Burstable Qos, cgroups v2, UnlimitedSwap, with a container with no requests", + cgroupVersion: cgroupV2, + qosClass: v1.PodQOSBurstable, + nodeSwapFeatureGateEnabled: true, + swapBehavior: types.UnlimitedSwap, + addContainerWithoutRequests: true, + addGuaranteedContainer: false, + }, + } { + t.Run(tc.name, func(t *testing.T) { + setCgroupVersionDuringTest(tc.cgroupVersion) + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.NodeSwap, tc.nodeSwapFeatureGateEnabled)() + m.memorySwapBehavior = tc.swapBehavior + + var resourceReqsC1, resourceReqsC2 v1.ResourceRequirements + switch tc.qosClass { + case v1.PodQOSBurstable: + resourceReqsC1 = v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceMemory: resource.MustParse("1Gi")}, + } + + if !tc.addContainerWithoutRequests { + resourceReqsC2 = v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceMemory: resource.MustParse("2Gi")}, + } + + if tc.addGuaranteedContainer { + resourceReqsC2.Limits = v1.ResourceList{v1.ResourceMemory: resource.MustParse("2Gi")} + } + } + case v1.PodQOSGuaranteed: + resourceReqsC1 = v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceMemory: resource.MustParse("1Gi"), v1.ResourceCPU: resource.MustParse("1")}, + Limits: v1.ResourceList{v1.ResourceMemory: resource.MustParse("1Gi"), v1.ResourceCPU: resource.MustParse("1")}, + } + resourceReqsC2 = v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceMemory: resource.MustParse("2Gi"), v1.ResourceCPU: resource.MustParse("1")}, + Limits: v1.ResourceList{v1.ResourceMemory: resource.MustParse("2Gi"), v1.ResourceCPU: resource.MustParse("1")}, + } + } + pod.Spec.Containers[0].Resources = resourceReqsC1 + pod.Spec.Containers[1].Resources = resourceReqsC2 + + resourcesC1 := m.generateLinuxContainerResources(pod, &pod.Spec.Containers[0], false) + resourcesC2 := m.generateLinuxContainerResources(pod, &pod.Spec.Containers[1], false) + + if !tc.nodeSwapFeatureGateEnabled || tc.cgroupVersion == cgroupV1 || (tc.swapBehavior == types.LimitedSwap && tc.qosClass != v1.PodQOSBurstable) { + expectNoSwap(tc.cgroupVersion, resourcesC1, resourcesC2) + return + } + + if tc.swapBehavior == types.UnlimitedSwap || tc.swapBehavior == "" { + expectUnlimitedSwap(tc.cgroupVersion, resourcesC1, resourcesC2) + return + } + + c1ExpectedSwap := calcSwapForBurstablePods(resourceReqsC1.Requests.Memory().Value()) + c2ExpectedSwap := int64(0) + if !tc.addContainerWithoutRequests && !tc.addGuaranteedContainer { + c2ExpectedSwap = 
calcSwapForBurstablePods(resourceReqsC2.Requests.Memory().Value()) + } + + expectSwap(tc.cgroupVersion, c1ExpectedSwap, resourcesC1) + expectSwap(tc.cgroupVersion, c2ExpectedSwap, resourcesC2) + }) + } +} + type CgroupVersion string const ( From 619be9c15372238edbb442e6b2ffd7a6cc827cfc Mon Sep 17 00:00:00 2001 From: Itamar Holder Date: Wed, 5 Jul 2023 13:59:02 +0300 Subject: [PATCH 5/6] Add a swap e2e test Signed-off-by: Itamar Holder --- test/e2e_node/swap_test.go | 254 +++++++++++++++++++++++++++++++++++++ 1 file changed, 254 insertions(+) create mode 100644 test/e2e_node/swap_test.go diff --git a/test/e2e_node/swap_test.go b/test/e2e_node/swap_test.go new file mode 100644 index 00000000000..7df97af920c --- /dev/null +++ b/test/e2e_node/swap_test.go @@ -0,0 +1,254 @@ +/* +Copyright 2023 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2enode + +import ( + "context" + "fmt" + "github.com/onsi/ginkgo/v2" + "github.com/onsi/gomega" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/rand" + utilfeature "k8s.io/apiserver/pkg/util/feature" + "k8s.io/kubernetes/pkg/features" + "k8s.io/kubernetes/pkg/kubelet/types" + "k8s.io/kubernetes/test/e2e/framework" + e2epod "k8s.io/kubernetes/test/e2e/framework/pod" + testutils "k8s.io/kubernetes/test/utils" + admissionapi "k8s.io/pod-security-admission/api" + "path/filepath" + "strconv" +) + +const ( + cgroupBasePath = "/sys/fs/cgroup/" + cgroupV1SwapLimitFile = "/memory/memory.memsw.limit_in_bytes" + cgroupV2SwapLimitFile = "memory.swap.max" + cgroupV1MemLimitFile = "/memory/memory.limit_in_bytes" +) + +var _ = SIGDescribe("Swap [NodeConformance][LinuxOnly]", func() { + f := framework.NewDefaultFramework("swap-test") + f.NamespacePodSecurityEnforceLevel = admissionapi.LevelBaseline + + ginkgo.DescribeTable("with configuration", func(qosClass v1.PodQOSClass, memoryRequestEqualLimit bool) { + ginkgo.By(fmt.Sprintf("Creating a pod of QOS class %s. memoryRequestEqualLimit: %t", qosClass, memoryRequestEqualLimit)) + pod := getSwapTestPod(f, qosClass, memoryRequestEqualLimit) + pod = runPodAndWaitUntilScheduled(f, pod) + + isCgroupV2 := isPodCgroupV2(f, pod) + isLimitedSwap := isLimitedSwap(f, pod) + + if !isSwapFeatureGateEnabled() || !isCgroupV2 || (isLimitedSwap && (qosClass != v1.PodQOSBurstable || memoryRequestEqualLimit)) { + ginkgo.By(fmt.Sprintf("Expecting no swap. feature gate on? %t isCgroupV2? %t is QoS burstable? 
%t", isSwapFeatureGateEnabled(), isCgroupV2, qosClass == v1.PodQOSBurstable)) + expectNoSwap(f, pod, isCgroupV2) + return + } + + if !isLimitedSwap { + ginkgo.By("expecting unlimited swap") + expectUnlimitedSwap(f, pod, isCgroupV2) + return + } + + ginkgo.By("expecting limited swap") + expectedSwapLimit := calcSwapForBurstablePod(f, pod) + expectLimitedSwap(f, pod, expectedSwapLimit) + }, + ginkgo.Entry("QOS Best-effort", v1.PodQOSBestEffort, false), + ginkgo.Entry("QOS Burstable", v1.PodQOSBurstable, false), + ginkgo.Entry("QOS Burstable with memory request equals to limit", v1.PodQOSBurstable, true), + ginkgo.Entry("QOS Guaranteed", v1.PodQOSGuaranteed, false), + ) +}) + +// Note that memoryRequestEqualLimit is effective only when qosClass is PodQOSBestEffort. +func getSwapTestPod(f *framework.Framework, qosClass v1.PodQOSClass, memoryRequestEqualLimit bool) *v1.Pod { + podMemoryAmount := resource.MustParse("128Mi") + + var resources v1.ResourceRequirements + switch qosClass { + case v1.PodQOSBestEffort: + // nothing to do in this case + case v1.PodQOSBurstable: + resources = v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceMemory: podMemoryAmount, + }, + } + + if memoryRequestEqualLimit { + resources.Limits = resources.Requests + } + case v1.PodQOSGuaranteed: + resources = v1.ResourceRequirements{ + Limits: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("200m"), + v1.ResourceMemory: podMemoryAmount, + }, + } + resources.Requests = resources.Limits + } + + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod-swap-" + rand.String(5), + Namespace: f.Namespace.Name, + }, + Spec: v1.PodSpec{ + RestartPolicy: v1.RestartPolicyAlways, + Containers: []v1.Container{ + { + Name: "busybox-container", + Image: busyboxImage, + Command: []string{"sleep", "600"}, + Resources: resources, + }, + }, + }, + } + + return pod +} + +func runPodAndWaitUntilScheduled(f *framework.Framework, pod *v1.Pod) *v1.Pod { + ginkgo.By("running swap test pod") + podClient := e2epod.NewPodClient(f) + + pod = podClient.CreateSync(context.Background(), pod) + pod, err := podClient.Get(context.Background(), pod.Name, metav1.GetOptions{}) + + framework.ExpectNoError(err) + isReady, err := testutils.PodRunningReady(pod) + framework.ExpectNoError(err) + gomega.ExpectWithOffset(1, isReady).To(gomega.BeTrue(), "pod should be ready") + + return pod +} + +func isSwapFeatureGateEnabled() bool { + ginkgo.By("figuring if NodeSwap feature gate is turned on") + return utilfeature.DefaultFeatureGate.Enabled(features.NodeSwap) +} + +func readCgroupFile(f *framework.Framework, pod *v1.Pod, filename string) string { + filePath := filepath.Join(cgroupBasePath, filename) + + ginkgo.By("reading cgroup file " + filePath) + output := e2epod.ExecCommandInContainer(f, pod.Name, pod.Spec.Containers[0].Name, "/bin/sh", "-ec", "cat "+filePath) + + return output +} + +func isPodCgroupV2(f *framework.Framework, pod *v1.Pod) bool { + ginkgo.By("figuring is test pod runs cgroup v2") + output := e2epod.ExecCommandInContainer(f, pod.Name, pod.Spec.Containers[0].Name, "/bin/sh", "-ec", `if test -f "/sys/fs/cgroup/cgroup.controllers"; then echo "true"; else echo "false"; fi`) + + return output == "true" +} + +func expectNoSwap(f *framework.Framework, pod *v1.Pod, isCgroupV2 bool) { + if isCgroupV2 { + swapLimit := readCgroupFile(f, pod, cgroupV2SwapLimitFile) + gomega.ExpectWithOffset(1, swapLimit).To(gomega.Equal("0"), "max swap allowed should be zero") + } else { + swapPlusMemLimit := readCgroupFile(f, pod, 
cgroupV1SwapLimitFile) + memLimit := readCgroupFile(f, pod, cgroupV1MemLimitFile) + gomega.ExpectWithOffset(1, swapPlusMemLimit).ToNot(gomega.BeEmpty()) + gomega.ExpectWithOffset(1, swapPlusMemLimit).To(gomega.Equal(memLimit)) + } +} + +func expectUnlimitedSwap(f *framework.Framework, pod *v1.Pod, isCgroupV2 bool) { + if isCgroupV2 { + swapLimit := readCgroupFile(f, pod, cgroupV2SwapLimitFile) + gomega.ExpectWithOffset(1, swapLimit).To(gomega.Equal("max"), "max swap allowed should be \"max\"") + } else { + swapPlusMemLimit := readCgroupFile(f, pod, cgroupV1SwapLimitFile) + gomega.ExpectWithOffset(1, swapPlusMemLimit).To(gomega.Equal("-1")) + } +} + +// supports v2 only as v1 shouldn't support LimitedSwap +func expectLimitedSwap(f *framework.Framework, pod *v1.Pod, expectedSwapLimit int64) { + swapLimitStr := readCgroupFile(f, pod, cgroupV2SwapLimitFile) + + swapLimit, err := strconv.Atoi(swapLimitStr) + framework.ExpectNoError(err, "cannot convert swap limit to int") + + // cgroup values are always aligned w.r.t. the page size, which is usually 4Ki + const cgroupAlignment int64 = 4 * 1024 // 4Ki + const errMsg = "swap limitation is not as expected" + + gomega.ExpectWithOffset(1, int64(swapLimit)).To( + gomega.And( + gomega.BeNumerically(">=", expectedSwapLimit-cgroupAlignment), + gomega.BeNumerically("<=", expectedSwapLimit+cgroupAlignment), + ), + errMsg, + ) +} + +func getSwapCapacity(f *framework.Framework, pod *v1.Pod) int64 { + output := e2epod.ExecCommandInContainer(f, pod.Name, pod.Spec.Containers[0].Name, "/bin/sh", "-ec", "free -b | grep Swap | xargs | cut -d\" \" -f2") + + swapCapacity, err := strconv.Atoi(output) + framework.ExpectNoError(err, "cannot convert swap size to int") + + ginkgo.By(fmt.Sprintf("providing swap capacity: %d", swapCapacity)) + + return int64(swapCapacity) +} + +func getMemoryCapacity(f *framework.Framework, pod *v1.Pod) int64 { + nodes, err := f.ClientSet.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{}) + framework.ExpectNoError(err, "failed listing nodes") + + for _, node := range nodes.Items { + if node.Name != pod.Spec.NodeName { + continue + } + + memCapacity := node.Status.Capacity[v1.ResourceMemory] + return memCapacity.Value() + } + + framework.ExpectNoError(fmt.Errorf("node %s wasn't found", pod.Spec.NodeName)) + return 0 +} + +func calcSwapForBurstablePod(f *framework.Framework, pod *v1.Pod) int64 { + nodeMemoryCapacity := getMemoryCapacity(f, pod) + nodeSwapCapacity := getSwapCapacity(f, pod) + containerMemoryRequest := pod.Spec.Containers[0].Resources.Requests.Memory().Value() + + containerMemoryProportion := float64(containerMemoryRequest) / float64(nodeMemoryCapacity) + swapAllocation := containerMemoryProportion * float64(nodeSwapCapacity) + ginkgo.By(fmt.Sprintf("Calculating swap for burstable pods: nodeMemoryCapacity: %d, nodeSwapCapacity: %d, containerMemoryRequest: %d, swapAllocation: %d", + nodeMemoryCapacity, nodeSwapCapacity, containerMemoryRequest, int64(swapAllocation))) + + return int64(swapAllocation) +} + +func isLimitedSwap(f *framework.Framework, pod *v1.Pod) bool { + kubeletCfg, err := getCurrentKubeletConfig(context.Background()) + framework.ExpectNoError(err, "cannot get kubelet config") + + return kubeletCfg.MemorySwap.SwapBehavior == types.LimitedSwap +} From 4321d8c60f32ed0899b3b7b33bb54787281c5448 Mon Sep 17 00:00:00 2001 From: Itamar Holder Date: Tue, 4 Jul 2023 17:55:37 +0300 Subject: [PATCH 6/6] Graduate NodeSwap to Beta1 Signed-off-by: Itamar Holder --- pkg/features/kube_features.go | 5 +++-- 1 file
changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index 7cd55002036..13a385bbd76 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -566,8 +566,9 @@ const ( // Allow pods to failover to a different node in case of non graceful node shutdown NodeOutOfServiceVolumeDetach featuregate.Feature = "NodeOutOfServiceVolumeDetach" - // owner: @ehashman + // owner: @iholder101 // alpha: v1.22 + // beta1: v1.28. For more info, please look at the KEP: https://kep.k8s.io/2400. // // Permits kubelet to run with swap enabled NodeSwap featuregate.Feature = "NodeSwap" @@ -1010,7 +1011,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS NodeOutOfServiceVolumeDetach: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // remove in 1.31 - NodeSwap: {Default: false, PreRelease: featuregate.Alpha}, + NodeSwap: {Default: false, PreRelease: featuregate.Beta}, PDBUnhealthyPodEvictionPolicy: {Default: true, PreRelease: featuregate.Beta},