mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-08-31 16:46:54 +00:00
LimitedSwap: Automatically configure swap limit for Burstable QoS Pods
After this commit, when LimitedSwap is enabled, containers would get swap access limited with respect to the container memory request, total physical memory on the node, and the swap size on the node. Pods of Best-Effort / Guaranteed QoS classes don't get to swap. In addition, containers with memory requests that are equal to their memory limits also don't get to swap. The swap limitation is calculated in the following way: 1. Calculate the container's memory proportionate to the node's memory: - Divide the container's memory request by the total node's physical memory. Let's call this value ContainerMemoryProportion. 2. Multiply the container memory proportion by the available swap memory for Pods: Meaning: ContainerMemoryProportion * TotalPodsSwapAvailable. For more information: https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/2400-node-swap/README.md Signed-off-by: Itamar Holder <iholder@redhat.com>
This commit is contained in:
@@ -45,11 +45,12 @@ import (
|
||||
const (
	// systemdSuffix is the cgroup name suffix for systemd
	systemdSuffix string = ".slice"
	// MemoryMin is memory.min for cgroup v2
	MemoryMin string = "memory.min"
	// MemoryHigh is memory.high for cgroup v2
	MemoryHigh string = "memory.high"
	// Cgroup2MemoryMin is memory.min for cgroup v2
	Cgroup2MemoryMin string = "memory.min"
	// Cgroup2MemoryHigh is memory.high for cgroup v2
	Cgroup2MemoryHigh string = "memory.high"
	// Cgroup2MaxCpuLimit is the cgroup v2 value written to lift the CPU cap.
	// NOTE: it was declared twice in the merged text, which is a compile
	// error in Go; it is declared exactly once here.
	Cgroup2MaxCpuLimit string = "max"
	// Cgroup2MaxSwapFilename is the cgroup v2 interface file that holds the
	// swap limit (memory.swap.max).
	Cgroup2MaxSwapFilename string = "memory.swap.max"
)
|
||||
|
||||
// RootCgroupName is the CgroupName of the root of the cgroup hierarchy
// (an empty path component list).
var RootCgroupName = CgroupName([]string{})
|
||||
|
@@ -196,7 +196,7 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64,
|
||||
}
|
||||
if memoryMin > 0 {
|
||||
result.Unified = map[string]string{
|
||||
MemoryMin: strconv.FormatInt(memoryMin, 10),
|
||||
Cgroup2MemoryMin: strconv.FormatInt(memoryMin, 10),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -147,7 +147,7 @@ func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.
|
||||
if rp.Unified == nil {
|
||||
rp.Unified = make(map[string]string)
|
||||
}
|
||||
rp.Unified[MemoryMin] = strconv.FormatInt(*rp.Memory, 10)
|
||||
rp.Unified[Cgroup2MemoryMin] = strconv.FormatInt(*rp.Memory, 10)
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -292,7 +292,7 @@ func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*Cgrou
|
||||
if configs[v1.PodQOSBurstable].ResourceParameters.Unified == nil {
|
||||
configs[v1.PodQOSBurstable].ResourceParameters.Unified = make(map[string]string)
|
||||
}
|
||||
configs[v1.PodQOSBurstable].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(burstableMin, 10)
|
||||
configs[v1.PodQOSBurstable].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(burstableMin, 10)
|
||||
klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSBurstable, "memoryMin", burstableMin)
|
||||
}
|
||||
|
||||
@@ -300,7 +300,7 @@ func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*Cgrou
|
||||
if configs[v1.PodQOSGuaranteed].ResourceParameters.Unified == nil {
|
||||
configs[v1.PodQOSGuaranteed].ResourceParameters.Unified = make(map[string]string)
|
||||
}
|
||||
configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(guaranteedMin, 10)
|
||||
configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(guaranteedMin, 10)
|
||||
klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSGuaranteed, "memoryMin", guaranteedMin)
|
||||
}
|
||||
}
|
||||
|
@@ -20,6 +20,9 @@ limitations under the License.
|
||||
package kuberuntime
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
cadvisorv1 "github.com/google/cadvisor/info/v1"
|
||||
kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
|
||||
"math"
|
||||
"os"
|
||||
"strconv"
|
||||
@@ -99,21 +102,17 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
|
||||
|
||||
lcr.HugepageLimits = GetHugepageLimitsFromResources(container.Resources)
|
||||
|
||||
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) {
|
||||
if swapConfigurationHelper := newSwapConfigurationHelper(*m.machineInfo); utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) {
|
||||
// NOTE(ehashman): Behaviour is defined in the opencontainers runtime spec:
|
||||
// https://github.com/opencontainers/runtime-spec/blob/1c3f411f041711bbeecf35ff7e93461ea6789220/config-linux.md#memory
|
||||
switch m.memorySwapBehavior {
|
||||
case kubelettypes.UnlimitedSwap:
|
||||
// -1 = unlimited swap
|
||||
lcr.MemorySwapLimitInBytes = -1
|
||||
case kubelettypes.LimitedSwap:
|
||||
fallthrough
|
||||
swapConfigurationHelper.ConfigureLimitedSwap(lcr, pod, container)
|
||||
default:
|
||||
// memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit
|
||||
// Some swapping is still possible.
|
||||
// Note that if memory limit is 0, memory swap limit is ignored.
|
||||
lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes
|
||||
swapConfigurationHelper.ConfigureUnlimitedSwap(lcr)
|
||||
}
|
||||
} else {
|
||||
swapConfigurationHelper.ConfigureNoSwap(lcr)
|
||||
}
|
||||
|
||||
// Set memory.min and memory.high to enforce MemoryQoS
|
||||
@@ -122,7 +121,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
|
||||
memoryRequest := container.Resources.Requests.Memory().Value()
|
||||
memoryLimit := container.Resources.Limits.Memory().Value()
|
||||
if memoryRequest != 0 {
|
||||
unified[cm.MemoryMin] = strconv.FormatInt(memoryRequest, 10)
|
||||
unified[cm.Cgroup2MemoryMin] = strconv.FormatInt(memoryRequest, 10)
|
||||
}
|
||||
|
||||
// Guaranteed pods by their QoS definition requires that memory request equals memory limit and cpu request must equal cpu limit.
|
||||
@@ -148,7 +147,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
|
||||
}
|
||||
}
|
||||
if memoryHigh != 0 && memoryHigh > memoryRequest {
|
||||
unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
|
||||
unified[cm.Cgroup2MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
|
||||
}
|
||||
}
|
||||
if len(unified) > 0 {
|
||||
@@ -299,6 +298,93 @@ func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *k
|
||||
return cStatusResources
|
||||
}
|
||||
|
||||
// Note: this function variable is being added here so it would be possible to mock
|
||||
// the cgroup version for unit tests by assigning a new mocked function into it. Without it,
|
||||
// the cgroup version would solely depend on the environment running the test.
|
||||
var isCgroup2UnifiedMode = func() bool {
|
||||
return libcontainercgroups.IsCgroup2UnifiedMode()
|
||||
}
|
||||
|
||||
// swapConfigurationHelper computes and applies per-container swap limits.
// It carries the node's cadvisor machine info, whose memory and swap
// capacities feed the swap-limit calculation for Burstable pods.
type swapConfigurationHelper struct {
	machineInfo cadvisorv1.MachineInfo
}
|
||||
|
||||
func newSwapConfigurationHelper(machineInfo cadvisorv1.MachineInfo) *swapConfigurationHelper {
|
||||
return &swapConfigurationHelper{machineInfo: machineInfo}
|
||||
}
|
||||
|
||||
// ConfigureLimitedSwap caps the container's swap usage proportionally to its
// memory request, per KEP-2400 (node swap): the limit is
// (containerMemoryRequest / nodeTotalMemory) * nodeSwapCapacity.
// Swap is disabled outright when any of the following holds:
//   - the pod is not Burstable QoS (Best-Effort / Guaranteed pods don't swap),
//   - the container requests no memory at all,
//   - the node is not on cgroup v2, or
//   - the container's memory request equals its memory limit.
func (m swapConfigurationHelper) ConfigureLimitedSwap(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) {
	podQos := kubeapiqos.GetPodQOS(pod)
	// Both request and limit zero means the container asked for no memory.
	containerDoesNotRequestMemory := container.Resources.Requests.Memory().IsZero() && container.Resources.Limits.Memory().IsZero()
	memoryRequestEqualsToLimit := container.Resources.Requests.Memory().Cmp(*container.Resources.Limits.Memory()) == 0

	if podQos != v1.PodQOSBurstable || containerDoesNotRequestMemory || !isCgroup2UnifiedMode() || memoryRequestEqualsToLimit {
		m.ConfigureNoSwap(lcr)
		return
	}

	containerMemoryRequest := container.Resources.Requests.Memory()
	swapLimit, err := calcSwapForBurstablePods(containerMemoryRequest.Value(), int64(m.machineInfo.MemoryCapacity), int64(m.machineInfo.SwapCapacity))

	if err != nil {
		// Fail closed: if the proportion cannot be computed, forbid swap.
		klog.ErrorS(err, "cannot calculate swap allocation amount; disallowing swap")
		m.ConfigureNoSwap(lcr)
		return
	}

	m.configureSwap(lcr, swapLimit)
}
|
||||
|
||||
func (m swapConfigurationHelper) ConfigureNoSwap(lcr *runtimeapi.LinuxContainerResources) {
|
||||
if !isCgroup2UnifiedMode() {
|
||||
// memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit
|
||||
// Some swapping is still possible.
|
||||
// Note that if memory limit is 0, memory swap limit is ignored.
|
||||
lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes
|
||||
return
|
||||
}
|
||||
|
||||
m.configureSwap(lcr, 0)
|
||||
}
|
||||
|
||||
func (m swapConfigurationHelper) ConfigureUnlimitedSwap(lcr *runtimeapi.LinuxContainerResources) {
|
||||
if !isCgroup2UnifiedMode() {
|
||||
m.ConfigureNoSwap(lcr)
|
||||
return
|
||||
}
|
||||
|
||||
if lcr.Unified == nil {
|
||||
lcr.Unified = map[string]string{}
|
||||
}
|
||||
|
||||
lcr.Unified[cm.Cgroup2MaxSwapFilename] = "max"
|
||||
}
|
||||
|
||||
func (m swapConfigurationHelper) configureSwap(lcr *runtimeapi.LinuxContainerResources, swapMemory int64) {
|
||||
if !isCgroup2UnifiedMode() {
|
||||
klog.ErrorS(fmt.Errorf("swap configuration is not supported with cgroup v1"), "swap configuration under cgroup v1 is unexpected")
|
||||
return
|
||||
}
|
||||
|
||||
if lcr.Unified == nil {
|
||||
lcr.Unified = map[string]string{}
|
||||
}
|
||||
|
||||
lcr.Unified[cm.Cgroup2MaxSwapFilename] = fmt.Sprintf("%d", swapMemory)
|
||||
}
|
||||
|
||||
// calcSwapForBurstablePods returns the swap limit for a Burstable container,
// computed as (<containerMemoryRequest>/<nodeTotalMemory>)*<totalPodsSwapAvailable>.
// It errors when the node memory is non-positive or the request exceeds it.
// For more info, please look at the following KEP: https://kep.k8s.io/2400
func calcSwapForBurstablePods(containerMemoryRequest, nodeTotalMemory, totalPodsSwapAvailable int64) (int64, error) {
	if nodeTotalMemory <= 0 {
		return 0, fmt.Errorf("total node memory is 0")
	}
	if containerMemoryRequest > nodeTotalMemory {
		return 0, fmt.Errorf("container request %d is larger than total node memory %d", containerMemoryRequest, nodeTotalMemory)
	}

	// Scale the available swap by the container's share of node memory.
	proportion := float64(containerMemoryRequest) / float64(nodeTotalMemory)
	return int64(proportion * float64(totalPodsSwapAvailable)), nil
}
|
||||
|
Reference in New Issue
Block a user