diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go
index c4be02a45b2..f54eaa2979f 100644
--- a/pkg/kubelet/cm/cgroup_manager_linux.go
+++ b/pkg/kubelet/cm/cgroup_manager_linux.go
@@ -45,11 +45,12 @@ import (
 const (
 	// systemdSuffix is the cgroup name suffix for systemd
 	systemdSuffix string = ".slice"
-	// MemoryMin is memory.min for cgroup v2
-	MemoryMin string = "memory.min"
-	// MemoryHigh is memory.high for cgroup v2
-	MemoryHigh string = "memory.high"
-	Cgroup2MaxCpuLimit string = "max"
+	// Cgroup2MemoryMin is memory.min for cgroup v2
+	Cgroup2MemoryMin string = "memory.min"
+	// Cgroup2MemoryHigh is memory.high for cgroup v2
+	Cgroup2MemoryHigh string = "memory.high"
+	Cgroup2MaxCpuLimit string = "max"
+	Cgroup2MaxSwapFilename string = "memory.swap.max"
 )
 
 var RootCgroupName = CgroupName([]string{})
diff --git a/pkg/kubelet/cm/helpers_linux.go b/pkg/kubelet/cm/helpers_linux.go
index 18b0df17bfc..8a144e7a73c 100644
--- a/pkg/kubelet/cm/helpers_linux.go
+++ b/pkg/kubelet/cm/helpers_linux.go
@@ -196,7 +196,7 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64,
 		}
 		if memoryMin > 0 {
 			result.Unified = map[string]string{
-				MemoryMin: strconv.FormatInt(memoryMin, 10),
+				Cgroup2MemoryMin: strconv.FormatInt(memoryMin, 10),
 			}
 		}
 	}
diff --git a/pkg/kubelet/cm/node_container_manager_linux.go b/pkg/kubelet/cm/node_container_manager_linux.go
index 74221c67047..b57403dd95b 100644
--- a/pkg/kubelet/cm/node_container_manager_linux.go
+++ b/pkg/kubelet/cm/node_container_manager_linux.go
@@ -147,7 +147,7 @@ func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.
 		if rp.Unified == nil {
 			rp.Unified = make(map[string]string)
 		}
-		rp.Unified[MemoryMin] = strconv.FormatInt(*rp.Memory, 10)
+		rp.Unified[Cgroup2MemoryMin] = strconv.FormatInt(*rp.Memory, 10)
 	}
 }
diff --git a/pkg/kubelet/cm/qos_container_manager_linux.go b/pkg/kubelet/cm/qos_container_manager_linux.go
index 89b3adae9af..abf4487ee5d 100644
--- a/pkg/kubelet/cm/qos_container_manager_linux.go
+++ b/pkg/kubelet/cm/qos_container_manager_linux.go
@@ -292,7 +292,7 @@ func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*Cgrou
 		if configs[v1.PodQOSBurstable].ResourceParameters.Unified == nil {
 			configs[v1.PodQOSBurstable].ResourceParameters.Unified = make(map[string]string)
 		}
-		configs[v1.PodQOSBurstable].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(burstableMin, 10)
+		configs[v1.PodQOSBurstable].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(burstableMin, 10)
 		klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSBurstable, "memoryMin", burstableMin)
 	}
 
@@ -300,7 +300,7 @@ func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*Cgrou
 		if configs[v1.PodQOSGuaranteed].ResourceParameters.Unified == nil {
 			configs[v1.PodQOSGuaranteed].ResourceParameters.Unified = make(map[string]string)
 		}
-		configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(guaranteedMin, 10)
+		configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(guaranteedMin, 10)
 		klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSGuaranteed, "memoryMin", guaranteedMin)
 	}
 }
diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go
index 4153ab7e13c..c600d49bc25 100644
--- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go
+++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go
@@ -20,6 +20,9 @@ limitations under the License.
 package kuberuntime
 
 import (
+	"fmt"
+	cadvisorv1 "github.com/google/cadvisor/info/v1"
+	kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
 	"math"
 	"os"
 	"strconv"
@@ -99,21 +102,17 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
 	lcr.HugepageLimits = GetHugepageLimitsFromResources(container.Resources)
 
-	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) {
+	if swapConfigurationHelper := newSwapConfigurationHelper(*m.machineInfo); utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) {
 		// NOTE(ehashman): Behaviour is defined in the opencontainers runtime spec:
 		// https://github.com/opencontainers/runtime-spec/blob/1c3f411f041711bbeecf35ff7e93461ea6789220/config-linux.md#memory
 		switch m.memorySwapBehavior {
-		case kubelettypes.UnlimitedSwap:
-			// -1 = unlimited swap
-			lcr.MemorySwapLimitInBytes = -1
 		case kubelettypes.LimitedSwap:
-			fallthrough
+			swapConfigurationHelper.ConfigureLimitedSwap(lcr, pod, container)
 		default:
-			// memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit
-			// Some swapping is still possible.
-			// Note that if memory limit is 0, memory swap limit is ignored.
-			lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes
+			swapConfigurationHelper.ConfigureUnlimitedSwap(lcr)
 		}
+	} else {
+		swapConfigurationHelper.ConfigureNoSwap(lcr)
 	}
 
 	// Set memory.min and memory.high to enforce MemoryQoS
@@ -122,7 +121,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
 		memoryRequest := container.Resources.Requests.Memory().Value()
 		memoryLimit := container.Resources.Limits.Memory().Value()
 		if memoryRequest != 0 {
-			unified[cm.MemoryMin] = strconv.FormatInt(memoryRequest, 10)
+			unified[cm.Cgroup2MemoryMin] = strconv.FormatInt(memoryRequest, 10)
 		}
 
 		// Guaranteed pods by their QoS definition requires that memory request equals memory limit and cpu request must equal cpu limit.
@@ -148,7 +147,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
 			}
 		}
 		if memoryHigh != 0 && memoryHigh > memoryRequest {
-			unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
+			unified[cm.Cgroup2MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
 		}
 	}
 	if len(unified) > 0 {
@@ -299,6 +298,93 @@ func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *k
 	return cStatusResources
 }
 
+// Note: this function variable is being added here so it would be possible to mock
+// the cgroup version for unit tests by assigning a new mocked function into it. Without it,
+// the cgroup version would solely depend on the environment running the test.
+var isCgroup2UnifiedMode = func() bool { return libcontainercgroups.IsCgroup2UnifiedMode() }
+
+type swapConfigurationHelper struct {
+	machineInfo cadvisorv1.MachineInfo
+}
+
+func newSwapConfigurationHelper(machineInfo cadvisorv1.MachineInfo) *swapConfigurationHelper {
+	return &swapConfigurationHelper{machineInfo: machineInfo}
+}
+
+func (m swapConfigurationHelper) ConfigureLimitedSwap(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) {
+	podQos := kubeapiqos.GetPodQOS(pod)
+	containerDoesNotRequestMemory := container.Resources.Requests.Memory().IsZero() && container.Resources.Limits.Memory().IsZero()
+	memoryRequestEqualsToLimit := container.Resources.Requests.Memory().Cmp(*container.Resources.Limits.Memory()) == 0
+
+	if podQos != v1.PodQOSBurstable || containerDoesNotRequestMemory || !isCgroup2UnifiedMode() || memoryRequestEqualsToLimit {
+		m.ConfigureNoSwap(lcr)
+		return
+	}
+
+	containerMemoryRequest := container.Resources.Requests.Memory()
+	swapLimit, err := calcSwapForBurstablePods(containerMemoryRequest.Value(), int64(m.machineInfo.MemoryCapacity), int64(m.machineInfo.SwapCapacity))
+
+	if err != nil {
+		klog.ErrorS(err, "cannot calculate swap allocation amount; disallowing swap")
+		m.ConfigureNoSwap(lcr)
+		return
+	}
+
+	m.configureSwap(lcr, swapLimit)
+}
+
+func (m swapConfigurationHelper) ConfigureNoSwap(lcr *runtimeapi.LinuxContainerResources) {
+	if !isCgroup2UnifiedMode() {
+		// memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit
+		// Some swapping is still possible.
+		// Note that if memory limit is 0, memory swap limit is ignored.
+		lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes
+		return
+	}
+
+	m.configureSwap(lcr, 0)
+}
+
+func (m swapConfigurationHelper) ConfigureUnlimitedSwap(lcr *runtimeapi.LinuxContainerResources) {
+	if !isCgroup2UnifiedMode() {
+		m.ConfigureNoSwap(lcr)
+		return
+	}
+
+	if lcr.Unified == nil {
+		lcr.Unified = map[string]string{}
+	}
+
+	lcr.Unified[cm.Cgroup2MaxSwapFilename] = "max"
+}
+
+func (m swapConfigurationHelper) configureSwap(lcr *runtimeapi.LinuxContainerResources, swapMemory int64) {
+	if !isCgroup2UnifiedMode() {
+		klog.ErrorS(fmt.Errorf("swap configuration is not supported with cgroup v1"), "swap configuration under cgroup v1 is unexpected")
+		return
+	}
+
+	if lcr.Unified == nil {
+		lcr.Unified = map[string]string{}
+	}
+
+	lcr.Unified[cm.Cgroup2MaxSwapFilename] = fmt.Sprintf("%d", swapMemory)
+}
+
+// The swap limit is calculated as (<containerMemoryRequest>/<nodeTotalMemory>)*<totalPodsSwapAvailable>.
+// For more info, please look at the following KEP: https://kep.k8s.io/2400
+func calcSwapForBurstablePods(containerMemoryRequest, nodeTotalMemory, totalPodsSwapAvailable int64) (int64, error) {
+	if nodeTotalMemory <= 0 {
+		return 0, fmt.Errorf("total node memory is 0")
+	}
+	if containerMemoryRequest > nodeTotalMemory {
+		return 0, fmt.Errorf("container request %d is larger than total node memory %d", containerMemoryRequest, nodeTotalMemory)
+	}
+
+	containerMemoryProportion := float64(containerMemoryRequest) / float64(nodeTotalMemory)
+	swapAllocation := containerMemoryProportion * float64(totalPodsSwapAvailable)
+
+	return int64(swapAllocation), nil
+}
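
Reviewer note (not part of the patch): a minimal sketch of how the proportional swap formula above plays out, written as a hypothetical unit test that would sit alongside `calcSwapForBurstablePods` in the `kuberuntime` package. The node figures (32 GiB of memory, 8 GiB of swap) are made up for illustration; a Burstable container requesting 4 GiB would then be allowed (4/32)*8 GiB = 1 GiB of swap.

```go
package kuberuntime

import "testing"

// Hypothetical check of the proportional swap limit: expected values are
// derived from (containerMemoryRequest/nodeTotalMemory)*totalPodsSwapAvailable.
func TestCalcSwapForBurstablePods(t *testing.T) {
	const gib = int64(1) << 30

	// 4 GiB request on a 32 GiB node with 8 GiB of swap => (4/32)*8 GiB = 1 GiB.
	got, err := calcSwapForBurstablePods(4*gib, 32*gib, 8*gib)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if want := 1 * gib; got != want {
		t.Errorf("swap limit = %d, want %d", got, want)
	}

	// A node reporting zero memory capacity must be rejected rather than divided by.
	if _, err := calcSwapForBurstablePods(4*gib, 0, 8*gib); err == nil {
		t.Errorf("expected an error when node memory capacity is 0")
	}
}
```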
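Likewise, a rough sketch (hypothetical, assuming the same `kuberuntime` package plus the cadvisor and CRI `runtimeapi` types this file already imports) of how the new `isCgroup2UnifiedMode` function variable is meant to be used in tests: swap it out to force the cgroup v2 path, assert on the resulting `memory.swap.max` entry, and restore it afterwards.

```go
package kuberuntime

import (
	"testing"

	cadvisorv1 "github.com/google/cadvisor/info/v1"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
)

// Hypothetical test showing the mocking pattern the comment in the diff describes:
// the package-level function variable is overridden so the assertion does not
// depend on the cgroup version of the machine running the test.
func TestConfigureUnlimitedSwapWritesMax(t *testing.T) {
	origIsCgroup2UnifiedMode := isCgroup2UnifiedMode
	defer func() { isCgroup2UnifiedMode = origIsCgroup2UnifiedMode }()
	isCgroup2UnifiedMode = func() bool { return true }

	helper := newSwapConfigurationHelper(cadvisorv1.MachineInfo{})
	lcr := &runtimeapi.LinuxContainerResources{}
	helper.ConfigureUnlimitedSwap(lcr)

	if got := lcr.Unified["memory.swap.max"]; got != "max" {
		t.Errorf(`memory.swap.max = %q, want "max"`, got)
	}
}
```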