From a30410d9ceb7a5dce0d038925f894a1e1064b4ec Mon Sep 17 00:00:00 2001 From: Itamar Holder Date: Thu, 13 Apr 2023 17:36:04 +0300 Subject: [PATCH] LimitedSwap: Automatically configure swap limit for Burstable QoS Pods After this commit, when LimitedSwap is enabled, containers would get swap access limited with respect to the container memory request, total physical memory on the node, and the swap size on the node. Pods of Best-Effort / Guaranteed QoS classes don't get to swap. In addition, containers with memory requests that are equal to their memory limits also don't get to swap. The swap limitation is calculated in the following way: 1. Calculate the container's memory proportionate to the node's memory: - Divide the container's memory request by the total node's physical memory. Let's call this value ContainerMemoryProportion. 2. Multiply the container memory proportion by the available swap memory for Pods: Meaning: ContainerMemoryProportion * TotalPodsSwapAvailable. For more information: https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/2400-node-swap/README.md Signed-off-by: Itamar Holder --- pkg/kubelet/cm/cgroup_manager_linux.go | 11 +- pkg/kubelet/cm/helpers_linux.go | 2 +- .../cm/node_container_manager_linux.go | 2 +- pkg/kubelet/cm/qos_container_manager_linux.go | 4 +- .../kuberuntime_container_linux.go | 108 ++++++++++++++++-- 5 files changed, 107 insertions(+), 20 deletions(-) diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go index c4be02a45b2..f54eaa2979f 100644 --- a/pkg/kubelet/cm/cgroup_manager_linux.go +++ b/pkg/kubelet/cm/cgroup_manager_linux.go @@ -45,11 +45,12 @@ import ( const ( // systemdSuffix is the cgroup name suffix for systemd systemdSuffix string = ".slice" - // MemoryMin is memory.min for cgroup v2 - MemoryMin string = "memory.min" - // MemoryHigh is memory.high for cgroup v2 - MemoryHigh string = "memory.high" - Cgroup2MaxCpuLimit string = "max" + // Cgroup2MemoryMin is 
memory.min for cgroup v2 + Cgroup2MemoryMin string = "memory.min" + // Cgroup2MemoryHigh is memory.high for cgroup v2 + Cgroup2MemoryHigh string = "memory.high" + Cgroup2MaxCpuLimit string = "max" + Cgroup2MaxSwapFilename string = "memory.swap.max" ) var RootCgroupName = CgroupName([]string{}) diff --git a/pkg/kubelet/cm/helpers_linux.go b/pkg/kubelet/cm/helpers_linux.go index 18b0df17bfc..8a144e7a73c 100644 --- a/pkg/kubelet/cm/helpers_linux.go +++ b/pkg/kubelet/cm/helpers_linux.go @@ -196,7 +196,7 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64, } if memoryMin > 0 { result.Unified = map[string]string{ - MemoryMin: strconv.FormatInt(memoryMin, 10), + Cgroup2MemoryMin: strconv.FormatInt(memoryMin, 10), } } } diff --git a/pkg/kubelet/cm/node_container_manager_linux.go b/pkg/kubelet/cm/node_container_manager_linux.go index 74221c67047..b57403dd95b 100644 --- a/pkg/kubelet/cm/node_container_manager_linux.go +++ b/pkg/kubelet/cm/node_container_manager_linux.go @@ -147,7 +147,7 @@ func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1. 
if rp.Unified == nil { rp.Unified = make(map[string]string) } - rp.Unified[MemoryMin] = strconv.FormatInt(*rp.Memory, 10) + rp.Unified[Cgroup2MemoryMin] = strconv.FormatInt(*rp.Memory, 10) } } diff --git a/pkg/kubelet/cm/qos_container_manager_linux.go b/pkg/kubelet/cm/qos_container_manager_linux.go index 89b3adae9af..abf4487ee5d 100644 --- a/pkg/kubelet/cm/qos_container_manager_linux.go +++ b/pkg/kubelet/cm/qos_container_manager_linux.go @@ -292,7 +292,7 @@ func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*Cgrou if configs[v1.PodQOSBurstable].ResourceParameters.Unified == nil { configs[v1.PodQOSBurstable].ResourceParameters.Unified = make(map[string]string) } - configs[v1.PodQOSBurstable].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(burstableMin, 10) + configs[v1.PodQOSBurstable].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(burstableMin, 10) klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSBurstable, "memoryMin", burstableMin) } @@ -300,7 +300,7 @@ func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*Cgrou if configs[v1.PodQOSGuaranteed].ResourceParameters.Unified == nil { configs[v1.PodQOSGuaranteed].ResourceParameters.Unified = make(map[string]string) } - configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(guaranteedMin, 10) + configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(guaranteedMin, 10) klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSGuaranteed, "memoryMin", guaranteedMin) } } diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go index 4153ab7e13c..c600d49bc25 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go @@ -20,6 +20,9 @@ limitations under the License. 
package kuberuntime import ( + "fmt" + cadvisorv1 "github.com/google/cadvisor/info/v1" + kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" "math" "os" "strconv" @@ -99,21 +102,17 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod, lcr.HugepageLimits = GetHugepageLimitsFromResources(container.Resources) - if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) { + if swapConfigurationHelper := newSwapConfigurationHelper(*m.machineInfo); utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) { // NOTE(ehashman): Behaviour is defined in the opencontainers runtime spec: // https://github.com/opencontainers/runtime-spec/blob/1c3f411f041711bbeecf35ff7e93461ea6789220/config-linux.md#memory switch m.memorySwapBehavior { - case kubelettypes.UnlimitedSwap: - // -1 = unlimited swap - lcr.MemorySwapLimitInBytes = -1 case kubelettypes.LimitedSwap: - fallthrough + swapConfigurationHelper.ConfigureLimitedSwap(lcr, pod, container) default: - // memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit - // Some swapping is still possible. - // Note that if memory limit is 0, memory swap limit is ignored. - lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes + swapConfigurationHelper.ConfigureUnlimitedSwap(lcr) } + } else { + swapConfigurationHelper.ConfigureNoSwap(lcr) } // Set memory.min and memory.high to enforce MemoryQoS @@ -122,7 +121,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod, memoryRequest := container.Resources.Requests.Memory().Value() memoryLimit := container.Resources.Limits.Memory().Value() if memoryRequest != 0 { - unified[cm.MemoryMin] = strconv.FormatInt(memoryRequest, 10) + unified[cm.Cgroup2MemoryMin] = strconv.FormatInt(memoryRequest, 10) } // Guaranteed pods by their QoS definition requires that memory request equals memory limit and cpu request must equal cpu limit. 
@@ -148,7 +147,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod, } } if memoryHigh != 0 && memoryHigh > memoryRequest { - unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10) + unified[cm.Cgroup2MemoryHigh] = strconv.FormatInt(memoryHigh, 10) } } if len(unified) > 0 { @@ -299,6 +298,93 @@ func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *k return cStatusResources } +// Note: this function variable is being added here so it would be possible to mock +// the cgroup version for unit tests by assigning a new mocked function into it. Without it, +// the cgroup version would solely depend on the environment running the test. var isCgroup2UnifiedMode = func() bool { return libcontainercgroups.IsCgroup2UnifiedMode() } + +type swapConfigurationHelper struct { + machineInfo cadvisorv1.MachineInfo +} + +func newSwapConfigurationHelper(machineInfo cadvisorv1.MachineInfo) *swapConfigurationHelper { + return &swapConfigurationHelper{machineInfo: machineInfo} +} + +func (m swapConfigurationHelper) ConfigureLimitedSwap(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) { + podQos := kubeapiqos.GetPodQOS(pod) + containerDoesNotRequestMemory := container.Resources.Requests.Memory().IsZero() && container.Resources.Limits.Memory().IsZero() + memoryRequestEqualsToLimit := container.Resources.Requests.Memory().Cmp(*container.Resources.Limits.Memory()) == 0 + + if podQos != v1.PodQOSBurstable || containerDoesNotRequestMemory || !isCgroup2UnifiedMode() || memoryRequestEqualsToLimit { + m.ConfigureNoSwap(lcr) + return + } + + containerMemoryRequest := container.Resources.Requests.Memory() + swapLimit, err := calcSwapForBurstablePods(containerMemoryRequest.Value(), int64(m.machineInfo.MemoryCapacity), int64(m.machineInfo.SwapCapacity)) + + if err != nil { + klog.ErrorS(err, "cannot calculate swap allocation amount; disallowing swap") + m.ConfigureNoSwap(lcr) + return + } + + 
 m.configureSwap(lcr, swapLimit) +} + +func (m swapConfigurationHelper) ConfigureNoSwap(lcr *runtimeapi.LinuxContainerResources) { + if !isCgroup2UnifiedMode() { + // memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit + // Some swapping is still possible. + // Note that if memory limit is 0, memory swap limit is ignored. + lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes + return + } + + m.configureSwap(lcr, 0) +} + +func (m swapConfigurationHelper) ConfigureUnlimitedSwap(lcr *runtimeapi.LinuxContainerResources) { + if !isCgroup2UnifiedMode() { + m.ConfigureNoSwap(lcr) + return + } + + if lcr.Unified == nil { + lcr.Unified = map[string]string{} + } + + lcr.Unified[cm.Cgroup2MaxSwapFilename] = "max" +} + +func (m swapConfigurationHelper) configureSwap(lcr *runtimeapi.LinuxContainerResources, swapMemory int64) { + if !isCgroup2UnifiedMode() { + klog.ErrorS(fmt.Errorf("swap configuration is not supported with cgroup v1"), "swap configuration under cgroup v1 is unexpected") + return + } + + if lcr.Unified == nil { + lcr.Unified = map[string]string{} + } + + lcr.Unified[cm.Cgroup2MaxSwapFilename] = fmt.Sprintf("%d", swapMemory) +} + +// The swap limit is calculated as (<containerMemoryRequest>/<nodeTotalMemory>)*<totalPodsSwapAvailable>. +// For more info, please look at the following KEP: https://kep.k8s.io/2400 +func calcSwapForBurstablePods(containerMemoryRequest, nodeTotalMemory, totalPodsSwapAvailable int64) (int64, error) { + if nodeTotalMemory <= 0 { + return 0, fmt.Errorf("total node memory is 0") + } + if containerMemoryRequest > nodeTotalMemory { + return 0, fmt.Errorf("container request %d is larger than total node memory %d", containerMemoryRequest, nodeTotalMemory) + } + + containerMemoryProportion := float64(containerMemoryRequest) / float64(nodeTotalMemory) + swapAllocation := containerMemoryProportion * float64(totalPodsSwapAvailable) + + return int64(swapAllocation), nil +}