LimitedSwap: Automatically configure swap limit for Burstable QoS Pods

After this commit, when LimitedSwap is enabled,
containers get swap access limited with respect to
the container memory request, the total physical memory
on the node, and the swap size on the node.

Pods of Best-Effort / Guaranteed QoS classes don't get
to swap. In addition, containers whose memory requests
are equal to their memory limits also don't get to
swap.

The swap limitation is calculated in the following way:
1. Calculate the container's memory proportionate to the node's memory:
- Divide the container's memory request by the total node's physical memory.
  Let's call this value ContainerMemoryProportion.

2. Multiply the container memory proportion by the available
swap memory for Pods:
Meaning: ContainerMemoryProportion * TotalPodsSwapAvailable.

For more information:
https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/2400-node-swap/README.md

Signed-off-by: Itamar Holder <iholder@redhat.com>
This commit is contained in:
Itamar Holder
2023-04-13 17:36:04 +03:00
parent e4da568f33
commit a30410d9ce
5 changed files with 107 additions and 20 deletions

View File

@@ -45,11 +45,12 @@ import (
const (
// systemdSuffix is the cgroup name suffix for systemd
systemdSuffix string = ".slice"
// MemoryMin is memory.min for cgroup v2
MemoryMin string = "memory.min"
// MemoryHigh is memory.high for cgroup v2
MemoryHigh string = "memory.high"
Cgroup2MaxCpuLimit string = "max"
// Cgroup2MemoryMin is memory.min for cgroup v2
Cgroup2MemoryMin string = "memory.min"
// Cgroup2MemoryHigh is memory.high for cgroup v2
Cgroup2MemoryHigh string = "memory.high"
// Cgroup2MaxCpuLimit is the literal "max".
// NOTE(review): presumably the cgroup v2 value meaning "no CPU limit" - confirm against its users.
Cgroup2MaxCpuLimit string = "max"
// Cgroup2MaxSwapFilename is memory.swap.max for cgroup v2; it is the key under
// which a container's swap limit is stored in the unified resource map.
Cgroup2MaxSwapFilename string = "memory.swap.max"
)
// RootCgroupName is a CgroupName built from an empty list of components.
// NOTE(review): presumably this names the root of the cgroup hierarchy - confirm against CgroupName's definition.
var RootCgroupName = CgroupName([]string{})

View File

@@ -196,7 +196,7 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64,
}
if memoryMin > 0 {
result.Unified = map[string]string{
MemoryMin: strconv.FormatInt(memoryMin, 10),
Cgroup2MemoryMin: strconv.FormatInt(memoryMin, 10),
}
}
}

View File

@@ -147,7 +147,7 @@ func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.
if rp.Unified == nil {
rp.Unified = make(map[string]string)
}
rp.Unified[MemoryMin] = strconv.FormatInt(*rp.Memory, 10)
rp.Unified[Cgroup2MemoryMin] = strconv.FormatInt(*rp.Memory, 10)
}
}

View File

@@ -292,7 +292,7 @@ func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*Cgrou
if configs[v1.PodQOSBurstable].ResourceParameters.Unified == nil {
configs[v1.PodQOSBurstable].ResourceParameters.Unified = make(map[string]string)
}
configs[v1.PodQOSBurstable].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(burstableMin, 10)
configs[v1.PodQOSBurstable].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(burstableMin, 10)
klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSBurstable, "memoryMin", burstableMin)
}
@@ -300,7 +300,7 @@ func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*Cgrou
if configs[v1.PodQOSGuaranteed].ResourceParameters.Unified == nil {
configs[v1.PodQOSGuaranteed].ResourceParameters.Unified = make(map[string]string)
}
configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(guaranteedMin, 10)
configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(guaranteedMin, 10)
klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSGuaranteed, "memoryMin", guaranteedMin)
}
}

View File

@@ -20,6 +20,9 @@ limitations under the License.
package kuberuntime
import (
"fmt"
cadvisorv1 "github.com/google/cadvisor/info/v1"
kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
"math"
"os"
"strconv"
@@ -99,21 +102,17 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
lcr.HugepageLimits = GetHugepageLimitsFromResources(container.Resources)
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) {
if swapConfigurationHelper := newSwapConfigurationHelper(*m.machineInfo); utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) {
// NOTE(ehashman): Behaviour is defined in the opencontainers runtime spec:
// https://github.com/opencontainers/runtime-spec/blob/1c3f411f041711bbeecf35ff7e93461ea6789220/config-linux.md#memory
switch m.memorySwapBehavior {
case kubelettypes.UnlimitedSwap:
// -1 = unlimited swap
lcr.MemorySwapLimitInBytes = -1
case kubelettypes.LimitedSwap:
fallthrough
swapConfigurationHelper.ConfigureLimitedSwap(lcr, pod, container)
default:
// memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit
// Some swapping is still possible.
// Note that if memory limit is 0, memory swap limit is ignored.
lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes
swapConfigurationHelper.ConfigureUnlimitedSwap(lcr)
}
} else {
swapConfigurationHelper.ConfigureNoSwap(lcr)
}
// Set memory.min and memory.high to enforce MemoryQoS
@@ -122,7 +121,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
memoryRequest := container.Resources.Requests.Memory().Value()
memoryLimit := container.Resources.Limits.Memory().Value()
if memoryRequest != 0 {
unified[cm.MemoryMin] = strconv.FormatInt(memoryRequest, 10)
unified[cm.Cgroup2MemoryMin] = strconv.FormatInt(memoryRequest, 10)
}
// Guaranteed pods by their QoS definition requires that memory request equals memory limit and cpu request must equal cpu limit.
@@ -148,7 +147,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
}
}
if memoryHigh != 0 && memoryHigh > memoryRequest {
unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
unified[cm.Cgroup2MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
}
}
if len(unified) > 0 {
@@ -299,6 +298,93 @@ func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *k
return cStatusResources
}
// isCgroup2UnifiedMode tells whether cgroup v2 (unified mode) is in use, as
// reported by libcontainer. It is kept as a package-level function variable,
// not a direct call, so unit tests can assign a stub to it; otherwise the
// detected cgroup version would depend on the host running the tests.
var isCgroup2UnifiedMode = libcontainercgroups.IsCgroup2UnifiedMode
// swapConfigurationHelper fills in the swap-related fields of a container's
// Linux resource configuration.
type swapConfigurationHelper struct {
// machineInfo is the cadvisor machine description; its MemoryCapacity and
// SwapCapacity are read when computing a limited swap allocation.
machineInfo cadvisorv1.MachineInfo
}
// newSwapConfigurationHelper returns a helper that holds its own copy of the
// given machine info.
func newSwapConfigurationHelper(machineInfo cadvisorv1.MachineInfo) *swapConfigurationHelper {
	helper := new(swapConfigurationHelper)
	helper.machineInfo = machineInfo
	return helper
}
// ConfigureLimitedSwap sets the swap limit on lcr for a container running
// under the LimitedSwap behavior.
//
// Swap is turned off (see ConfigureNoSwap) unless all of the following hold:
// the pod's QoS class is Burstable, the container has a non-zero memory
// request or limit, cgroup v2 is in use, and the memory request differs from
// the memory limit. When they do hold, the limit is the container's share of
// the node's swap, proportional to its memory request relative to the node's
// memory capacity. If that share cannot be computed, the error is logged and
// swap is turned off.
func (m swapConfigurationHelper) ConfigureLimitedSwap(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) {
	memRequest := container.Resources.Requests.Memory()
	memLimit := container.Resources.Limits.Memory()

	// The cgroup version is probed only after the QoS and "has memory"
	// checks pass, matching the short-circuit order callers observe.
	eligible := kubeapiqos.GetPodQOS(pod) == v1.PodQOSBurstable &&
		!(memRequest.IsZero() && memLimit.IsZero()) &&
		isCgroup2UnifiedMode() &&
		memRequest.Cmp(*memLimit) != 0
	if !eligible {
		m.ConfigureNoSwap(lcr)
		return
	}

	nodeMemory := int64(m.machineInfo.MemoryCapacity)
	nodeSwap := int64(m.machineInfo.SwapCapacity)
	swapLimit, err := calcSwapForBurstablePods(container.Resources.Requests.Memory().Value(), nodeMemory, nodeSwap)
	if err != nil {
		klog.ErrorS(err, "cannot calculate swap allocation amount; disallowing swap")
		m.ConfigureNoSwap(lcr)
		return
	}

	m.configureSwap(lcr, swapLimit)
}
// ConfigureNoSwap configures lcr so the container gets no swap allowance.
// With cgroup v2 a swap limit of 0 is written through configureSwap; with
// cgroup v1 the combined memory+swap limit is pinned to the memory limit.
func (m swapConfigurationHelper) ConfigureNoSwap(lcr *runtimeapi.LinuxContainerResources) {
	if isCgroup2UnifiedMode() {
		m.configureSwap(lcr, 0)
		return
	}

	// cgroup v1: memorySwapLimit is the total of memory plus swap, so making it
	// equal to the memory limit leaves no swap above the memory limit.
	// Some swapping is still possible.
	// Note that if memory limit is 0, memory swap limit is ignored.
	lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes
}
// ConfigureUnlimitedSwap lifts the swap limit for the container by storing
// "max" under the cgroup v2 swap key of lcr.Unified. With cgroup v1 it falls
// back to ConfigureNoSwap instead.
func (m swapConfigurationHelper) ConfigureUnlimitedSwap(lcr *runtimeapi.LinuxContainerResources) {
	if isCgroup2UnifiedMode() {
		// The unified map may not have been allocated yet.
		if lcr.Unified == nil {
			lcr.Unified = make(map[string]string)
		}
		lcr.Unified[cm.Cgroup2MaxSwapFilename] = "max"
		return
	}

	m.ConfigureNoSwap(lcr)
}
// configureSwap stores swapMemory, formatted as a decimal number, under the
// cgroup v2 swap key of lcr.Unified, allocating the map when needed. It is
// only meaningful with cgroup v2; when called under cgroup v1 it logs an
// error and leaves lcr untouched.
func (m swapConfigurationHelper) configureSwap(lcr *runtimeapi.LinuxContainerResources, swapMemory int64) {
	if !isCgroup2UnifiedMode() {
		klog.ErrorS(fmt.Errorf("swap configuration is not supported with cgroup v1"), "swap configuration under cgroup v1 is unexpected")
		return
	}

	unified := lcr.Unified
	if unified == nil {
		unified = make(map[string]string)
		lcr.Unified = unified
	}
	unified[cm.Cgroup2MaxSwapFilename] = strconv.FormatInt(swapMemory, 10)
}
// calcSwapForBurstablePods computes how much swap a container may use:
//
//	(containerMemoryRequest / nodeTotalMemory) * totalPodsSwapAvailable
//
// The ratio is evaluated in floating point and the product is truncated to an
// int64. An error (with a zero result) is returned when nodeTotalMemory is not
// positive or when the request is larger than nodeTotalMemory.
// For more info, please look at the following KEP: https://kep.k8s.io/2400
func calcSwapForBurstablePods(containerMemoryRequest, nodeTotalMemory, totalPodsSwapAvailable int64) (int64, error) {
	switch {
	case nodeTotalMemory <= 0:
		return 0, fmt.Errorf("total node memory is 0")
	case containerMemoryRequest > nodeTotalMemory:
		return 0, fmt.Errorf("container request %d is larger than total node memory %d", containerMemoryRequest, nodeTotalMemory)
	}

	// Divide first, then scale by the available swap, so the rounding
	// sequence is request/total followed by the multiplication.
	share := float64(containerMemoryRequest) / float64(nodeTotalMemory)
	return int64(share * float64(totalPodsSwapAvailable)), nil
}