Merge pull request #118764 from iholder101/Swap/burstableQoS-impl

Add full cgroup v2 swap support with automatically calculated swap limit for LimitedSwap and Burstable QoS Pods
Kubernetes Prow Robot 2023-07-17 19:49:07 -07:00 committed by GitHub
commit da2fdf8cc3
10 changed files with 831 additions and 169 deletions
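For orientation: under LimitedSwap on cgroup v2, this PR grants swap only to Burstable QoS containers that request memory and whose request differs from their limit, in proportion to the container's memory request. A minimal standalone sketch of that decision logic (illustrative only, not the PR's own code; the real helpers appear in the kuberuntime hunks below):

func swapLimitFor(qos string, memRequest, memLimit, nodeMemory, nodeSwap int64) int64 {
	// No swap for non-Burstable pods, containers without a memory request,
	// or containers whose request equals their limit.
	if qos != "Burstable" || memRequest == 0 || memRequest == memLimit {
		return 0
	}
	// Proportional share: (request / node memory) * node swap capacity.
	return int64(float64(memRequest) / float64(nodeMemory) * float64(nodeSwap))
}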

@ -47,6 +47,8 @@ CGROUP_DRIVER=${CGROUP_DRIVER:-""}
CGROUP_ROOT=${CGROUP_ROOT:-""}
# owner of client certs, default to current user if not specified
USER=${USER:-$(whoami)}
# If true, LimitedSwap is used instead of UnlimitedSwap (the default)
LIMITED_SWAP=${LIMITED_SWAP:-""}
# required for cni installation
CNI_CONFIG_DIR=${CNI_CONFIG_DIR:-/etc/cni/net.d}
@ -832,6 +834,13 @@ tracing:
EOF
fi
if [[ "$LIMITED_SWAP" == "true" ]]; then
cat <<EOF >> "${TMP_DIR}"/kubelet.yaml
memorySwap:
swapBehavior: LimitedSwap
EOF
fi
{
# authentication
echo "authentication:"

@ -596,8 +596,9 @@ const (
// Allow pods to failover to a different node in case of non graceful node shutdown
NodeOutOfServiceVolumeDetach featuregate.Feature = "NodeOutOfServiceVolumeDetach"
-// owner: @ehashman
+// owner: @iholder101
// alpha: v1.22
+// beta1: v1.28. For more info, please look at the KEP: https://kep.k8s.io/2400.
//
// Permits kubelet to run with swap enabled
NodeSwap featuregate.Feature = "NodeSwap"
@ -1074,7 +1075,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS
NodeOutOfServiceVolumeDetach: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // remove in 1.31
-NodeSwap: {Default: false, PreRelease: featuregate.Alpha},
+NodeSwap: {Default: false, PreRelease: featuregate.Beta},
PDBUnhealthyPodEvictionPolicy: {Default: true, PreRelease: featuregate.Beta},
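
Even at beta the gate stays default-off, so swap handling must be enabled explicitly and guarded in code. A trimmed sketch of the guard pattern the kuberuntime hunks below use:

// Assumes: utilfeature "k8s.io/apiserver/pkg/util/feature" and
// kubefeatures "k8s.io/kubernetes/pkg/features".
func swapFeatureEnabled() bool {
	return utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap)
}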

@ -45,11 +45,12 @@ import (
const (
// systemdSuffix is the cgroup name suffix for systemd
systemdSuffix string = ".slice"
-// MemoryMin is memory.min for cgroup v2
-MemoryMin string = "memory.min"
-// MemoryHigh is memory.high for cgroup v2
-MemoryHigh string = "memory.high"
+// Cgroup2MemoryMin is memory.min for cgroup v2
+Cgroup2MemoryMin string = "memory.min"
+// Cgroup2MemoryHigh is memory.high for cgroup v2
+Cgroup2MemoryHigh string = "memory.high"
Cgroup2MaxCpuLimit string = "max"
+Cgroup2MaxSwapFilename string = "memory.swap.max"
)
var RootCgroupName = CgroupName([]string{})
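
These constants name cgroup v2 interface files: entries placed in the CRI Unified map are materialized by the container runtime as files in the container's cgroup directory. A hedged illustration (the cgroup path is made up; in reality the runtime, not kubelet code like this, performs the writes):

unified := map[string]string{
	"memory.min":      "134217728", // Cgroup2MemoryMin: memory protected from reclaim
	"memory.swap.max": "67108864",  // Cgroup2MaxSwapFilename: hard swap limit
}
for file, value := range unified {
	// Sketch only; needs "os" and "path/filepath".
	path := filepath.Join("/sys/fs/cgroup/kubepods.slice/example.scope", file)
	_ = os.WriteFile(path, []byte(value), 0o644)
}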

@ -196,7 +196,7 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64,
}
if memoryMin > 0 {
result.Unified = map[string]string{
-MemoryMin: strconv.FormatInt(memoryMin, 10),
+Cgroup2MemoryMin: strconv.FormatInt(memoryMin, 10),
}
}
}

@ -147,7 +147,7 @@ func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.
if rp.Unified == nil {
rp.Unified = make(map[string]string)
}
-rp.Unified[MemoryMin] = strconv.FormatInt(*rp.Memory, 10)
+rp.Unified[Cgroup2MemoryMin] = strconv.FormatInt(*rp.Memory, 10)
}
}

@ -292,7 +292,7 @@ func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*Cgrou
if configs[v1.PodQOSBurstable].ResourceParameters.Unified == nil {
configs[v1.PodQOSBurstable].ResourceParameters.Unified = make(map[string]string)
}
-configs[v1.PodQOSBurstable].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(burstableMin, 10)
+configs[v1.PodQOSBurstable].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(burstableMin, 10)
klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSBurstable, "memoryMin", burstableMin)
}
@ -300,7 +300,7 @@ func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*Cgrou
if configs[v1.PodQOSGuaranteed].ResourceParameters.Unified == nil {
configs[v1.PodQOSGuaranteed].ResourceParameters.Unified = make(map[string]string)
}
-configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(guaranteedMin, 10)
+configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(guaranteedMin, 10)
klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSGuaranteed, "memoryMin", guaranteedMin)
}
}

@ -20,6 +20,9 @@ limitations under the License.
package kuberuntime
import (
"fmt"
cadvisorv1 "github.com/google/cadvisor/info/v1"
kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
"math"
"os"
"strconv"
@ -46,7 +49,7 @@ func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config
enforceMemoryQoS := false
// Set memory.min and memory.high if MemoryQoS enabled with cgroups v2
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
-libcontainercgroups.IsCgroup2UnifiedMode() {
+isCgroup2UnifiedMode() {
enforceMemoryQoS = true
}
cl, err := m.generateLinuxContainerConfig(container, pod, uid, username, nsTarget, enforceMemoryQoS)
@ -99,21 +102,17 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
lcr.HugepageLimits = GetHugepageLimitsFromResources(container.Resources)
-if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) {
+if swapConfigurationHelper := newSwapConfigurationHelper(*m.machineInfo); utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) {
// NOTE(ehashman): Behaviour is defined in the opencontainers runtime spec:
// https://github.com/opencontainers/runtime-spec/blob/1c3f411f041711bbeecf35ff7e93461ea6789220/config-linux.md#memory
switch m.memorySwapBehavior {
-case kubelettypes.UnlimitedSwap:
-// -1 = unlimited swap
-lcr.MemorySwapLimitInBytes = -1
case kubelettypes.LimitedSwap:
-fallthrough
+swapConfigurationHelper.ConfigureLimitedSwap(lcr, pod, container)
default:
-// memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit
-// Some swapping is still possible.
-// Note that if memory limit is 0, memory swap limit is ignored.
-lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes
+swapConfigurationHelper.ConfigureUnlimitedSwap(lcr)
}
+} else {
+swapConfigurationHelper.ConfigureNoSwap(lcr)
}
// Set memory.min and memory.high to enforce MemoryQoS
@ -122,7 +121,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
memoryRequest := container.Resources.Requests.Memory().Value()
memoryLimit := container.Resources.Limits.Memory().Value()
if memoryRequest != 0 {
-unified[cm.MemoryMin] = strconv.FormatInt(memoryRequest, 10)
+unified[cm.Cgroup2MemoryMin] = strconv.FormatInt(memoryRequest, 10)
}
// Guaranteed pods by their QoS definition require that memory request equals memory limit and cpu request equals cpu limit.
@ -148,7 +147,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
}
}
if memoryHigh != 0 && memoryHigh > memoryRequest {
-unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
+unified[cm.Cgroup2MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
}
}
if len(unified) > 0 {
@ -171,7 +170,7 @@ func (m *kubeGenericRuntimeManager) generateContainerResources(pod *v1.Pod, cont
enforceMemoryQoS := false
// Set memory.min and memory.high if MemoryQoS enabled with cgroups v2
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
-libcontainercgroups.IsCgroup2UnifiedMode() {
+isCgroup2UnifiedMode() {
enforceMemoryQoS = true
}
return &runtimeapi.ContainerResources{
@ -216,7 +215,7 @@ func (m *kubeGenericRuntimeManager) calculateLinuxResources(cpuRequest, cpuLimit
}
// runc requires cgroupv2 for unified mode
-if libcontainercgroups.IsCgroup2UnifiedMode() {
+if isCgroup2UnifiedMode() {
resources.Unified = map[string]string{
// Ask the kernel to kill all processes in the container cgroup in case of OOM.
// See memory.oom.group in https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html for
@ -298,3 +297,94 @@ func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *k
}
return cStatusResources
}
// Note: this function variable is defined here so that unit tests can mock the
// cgroup version by assigning a mocked function to it. Without it, the cgroup
// version would depend solely on the environment running the test.
var isCgroup2UnifiedMode = func() bool {
return libcontainercgroups.IsCgroup2UnifiedMode()
}
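
A hedged example of stubbing this hook in a test. The helper setCgroupVersionDuringTest, defined later in this PR, does the assignment; the save/restore shown here is an addition of this sketch:

func TestSomethingOnCgroupV2(t *testing.T) {
	orig := isCgroup2UnifiedMode
	t.Cleanup(func() { isCgroup2UnifiedMode = orig }) // restore the real probe

	isCgroup2UnifiedMode = func() bool { return true }
	// ... exercise code that branches on the cgroup version ...
}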
type swapConfigurationHelper struct {
machineInfo cadvisorv1.MachineInfo
}
func newSwapConfigurationHelper(machineInfo cadvisorv1.MachineInfo) *swapConfigurationHelper {
return &swapConfigurationHelper{machineInfo: machineInfo}
}
func (m swapConfigurationHelper) ConfigureLimitedSwap(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) {
podQos := kubeapiqos.GetPodQOS(pod)
containerDoesNotRequestMemory := container.Resources.Requests.Memory().IsZero() && container.Resources.Limits.Memory().IsZero()
memoryRequestEqualsToLimit := container.Resources.Requests.Memory().Cmp(*container.Resources.Limits.Memory()) == 0
if podQos != v1.PodQOSBurstable || containerDoesNotRequestMemory || !isCgroup2UnifiedMode() || memoryRequestEqualsToLimit {
m.ConfigureNoSwap(lcr)
return
}
containerMemoryRequest := container.Resources.Requests.Memory()
swapLimit, err := calcSwapForBurstablePods(containerMemoryRequest.Value(), int64(m.machineInfo.MemoryCapacity), int64(m.machineInfo.SwapCapacity))
if err != nil {
klog.ErrorS(err, "cannot calculate swap allocation amount; disallowing swap")
m.ConfigureNoSwap(lcr)
return
}
m.configureSwap(lcr, swapLimit)
}
func (m swapConfigurationHelper) ConfigureNoSwap(lcr *runtimeapi.LinuxContainerResources) {
if !isCgroup2UnifiedMode() {
// memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit
// Some swapping is still possible.
// Note that if memory limit is 0, memory swap limit is ignored.
lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes
return
}
m.configureSwap(lcr, 0)
}
func (m swapConfigurationHelper) ConfigureUnlimitedSwap(lcr *runtimeapi.LinuxContainerResources) {
if !isCgroup2UnifiedMode() {
m.ConfigureNoSwap(lcr)
return
}
if lcr.Unified == nil {
lcr.Unified = map[string]string{}
}
lcr.Unified[cm.Cgroup2MaxSwapFilename] = "max"
}
func (m swapConfigurationHelper) configureSwap(lcr *runtimeapi.LinuxContainerResources, swapMemory int64) {
if !isCgroup2UnifiedMode() {
klog.ErrorS(fmt.Errorf("swap configuration is not supported with cgroup v1"), "swap configuration under cgroup v1 is unexpected")
return
}
if lcr.Unified == nil {
lcr.Unified = map[string]string{}
}
lcr.Unified[cm.Cgroup2MaxSwapFilename] = fmt.Sprintf("%d", swapMemory)
}
// The swap limit is calculated as (<containerMemoryRequest>/<nodeTotalMemory>)*<totalPodsSwapAvailable>.
// For more info, please look at the following KEP: https://kep.k8s.io/2400
func calcSwapForBurstablePods(containerMemoryRequest, nodeTotalMemory, totalPodsSwapAvailable int64) (int64, error) {
if nodeTotalMemory <= 0 {
return 0, fmt.Errorf("total node memory is 0")
}
if containerMemoryRequest > nodeTotalMemory {
return 0, fmt.Errorf("container request %d is larger than total node memory %d", containerMemoryRequest, nodeTotalMemory)
}
containerMemoryProportion := float64(containerMemoryRequest) / float64(nodeTotalMemory)
swapAllocation := containerMemoryProportion * float64(totalPodsSwapAvailable)
return int64(swapAllocation), nil
}
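
A worked example with the capacities the unit test below configures (40GiB of memory, 5GiB of swap): a container requesting 1GiB receives (1/40) * 5GiB = 128MiB of swap. As an example-test-style sketch:

func ExampleCalcSwapForBurstablePods() {
	const gib = int64(1 << 30)
	swap, err := calcSwapForBurstablePods(1*gib, 40*gib, 5*gib)
	fmt.Println(swap, err)
	// Output: 134217728 <nil>
}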

@ -21,6 +21,9 @@ package kuberuntime
import (
"context"
"fmt"
"k8s.io/kubernetes/pkg/kubelet/cm"
"k8s.io/kubernetes/pkg/kubelet/types"
"math"
"os"
"reflect"
@ -38,7 +41,6 @@ import (
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
"k8s.io/kubernetes/pkg/features"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
)
func makeExpectedConfig(m *kubeGenericRuntimeManager, pod *v1.Pod, containerIndex int, enforceMemoryQoS bool) *runtimeapi.ContainerConfig {
@ -244,11 +246,12 @@ func TestCalculateLinuxResources(t *testing.T) {
}
tests := []struct {
name string
cpuReq *resource.Quantity
cpuLim *resource.Quantity
memLim *resource.Quantity
expected *runtimeapi.LinuxContainerResources
+cgroupVersion CgroupVersion
}{
{
name: "Request128MBLimit256MB",
@ -261,6 +264,7 @@ func TestCalculateLinuxResources(t *testing.T) {
CpuShares: 1024,
MemoryLimitInBytes: 134217728,
},
cgroupVersion: cgroupV1,
},
{
name: "RequestNoMemory",
@ -273,6 +277,7 @@ func TestCalculateLinuxResources(t *testing.T) {
CpuShares: 2048,
MemoryLimitInBytes: 0,
},
cgroupVersion: cgroupV1,
},
{
name: "RequestNilCPU",
@ -284,6 +289,7 @@ func TestCalculateLinuxResources(t *testing.T) {
CpuShares: 2048,
MemoryLimitInBytes: 0,
},
cgroupVersion: cgroupV1,
},
{
name: "RequestZeroCPU",
@ -296,9 +302,66 @@ func TestCalculateLinuxResources(t *testing.T) {
CpuShares: 2,
MemoryLimitInBytes: 0,
},
cgroupVersion: cgroupV1,
},
{
name: "Request128MBLimit256MB",
cpuReq: generateResourceQuantity("1"),
cpuLim: generateResourceQuantity("2"),
memLim: generateResourceQuantity("128Mi"),
expected: &runtimeapi.LinuxContainerResources{
CpuPeriod: 100000,
CpuQuota: 200000,
CpuShares: 1024,
MemoryLimitInBytes: 134217728,
Unified: map[string]string{"memory.oom.group": "1"},
},
cgroupVersion: cgroupV2,
},
{
name: "RequestNoMemory",
cpuReq: generateResourceQuantity("2"),
cpuLim: generateResourceQuantity("8"),
memLim: generateResourceQuantity("0"),
expected: &runtimeapi.LinuxContainerResources{
CpuPeriod: 100000,
CpuQuota: 800000,
CpuShares: 2048,
MemoryLimitInBytes: 0,
Unified: map[string]string{"memory.oom.group": "1"},
},
cgroupVersion: cgroupV2,
},
{
name: "RequestNilCPU",
cpuLim: generateResourceQuantity("2"),
memLim: generateResourceQuantity("0"),
expected: &runtimeapi.LinuxContainerResources{
CpuPeriod: 100000,
CpuQuota: 200000,
CpuShares: 2048,
MemoryLimitInBytes: 0,
Unified: map[string]string{"memory.oom.group": "1"},
},
cgroupVersion: cgroupV2,
},
{
name: "RequestZeroCPU",
cpuReq: generateResourceQuantity("0"),
cpuLim: generateResourceQuantity("2"),
memLim: generateResourceQuantity("0"),
expected: &runtimeapi.LinuxContainerResources{
CpuPeriod: 100000,
CpuQuota: 200000,
CpuShares: 2,
MemoryLimitInBytes: 0,
Unified: map[string]string{"memory.oom.group": "1"},
},
cgroupVersion: cgroupV2,
},
}
for _, test := range tests {
setCgroupVersionDuringTest(test.cgroupVersion)
linuxContainerResources := m.calculateLinuxResources(test.cpuReq, test.cpuLim, test.memLim)
assert.Equal(t, test.expected, linuxContainerResources)
}
@ -634,96 +697,6 @@ func TestGenerateLinuxContainerConfigNamespaces(t *testing.T) {
}
}
-func TestGenerateLinuxContainerConfigSwap(t *testing.T) {
-defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.NodeSwap, true)()
-_, _, m, err := createTestRuntimeManager()
-if err != nil {
-t.Fatalf("error creating test RuntimeManager: %v", err)
-}
-m.machineInfo.MemoryCapacity = 1000000
-containerName := "test"
-for _, tc := range []struct {
-name string
-swapSetting string
-pod *v1.Pod
-expected int64
-}{
-{
-name: "config unset, memory limit set",
-// no swap setting
-pod: &v1.Pod{
-Spec: v1.PodSpec{
-Containers: []v1.Container{{
-Name: containerName,
-Resources: v1.ResourceRequirements{
-Limits: v1.ResourceList{
-"memory": resource.MustParse("1000"),
-},
-Requests: v1.ResourceList{
-"memory": resource.MustParse("1000"),
-},
-},
-}},
-},
-},
-expected: 1000,
-},
-{
-name: "config unset, no memory limit",
-// no swap setting
-pod: &v1.Pod{
-Spec: v1.PodSpec{
-Containers: []v1.Container{
-{Name: containerName},
-},
-},
-},
-expected: 0,
-},
-{
-// Note: behaviour will be the same as previous two cases
-name: "config set to LimitedSwap, memory limit set",
-swapSetting: kubelettypes.LimitedSwap,
-pod: &v1.Pod{
-Spec: v1.PodSpec{
-Containers: []v1.Container{{
-Name: containerName,
-Resources: v1.ResourceRequirements{
-Limits: v1.ResourceList{
-"memory": resource.MustParse("1000"),
-},
-Requests: v1.ResourceList{
-"memory": resource.MustParse("1000"),
-},
-},
-}},
-},
-},
-expected: 1000,
-},
-{
-name: "UnlimitedSwap enabled",
-swapSetting: kubelettypes.UnlimitedSwap,
-pod: &v1.Pod{
-Spec: v1.PodSpec{
-Containers: []v1.Container{
-{Name: containerName},
-},
-},
-},
-expected: -1,
-},
-} {
-t.Run(tc.name, func(t *testing.T) {
-m.memorySwapBehavior = tc.swapSetting
-actual, err := m.generateLinuxContainerConfig(&tc.pod.Spec.Containers[0], tc.pod, nil, "", nil, false)
-assert.NoError(t, err)
-assert.Equal(t, tc.expected, actual.Resources.MemorySwapLimitInBytes, "memory swap config for %s", tc.name)
-})
-}
-}
func TestGenerateLinuxContainerResources(t *testing.T) {
_, _, m, err := createTestRuntimeManager()
assert.NoError(t, err)
@ -875,6 +848,10 @@ func TestGenerateLinuxContainerResources(t *testing.T) {
if tc.scalingFg {
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.InPlacePodVerticalScaling, true)()
}
setCgroupVersionDuringTest(cgroupV1)
tc.expected.MemorySwapLimitInBytes = tc.expected.MemoryLimitInBytes
pod.Spec.Containers[0].Resources = v1.ResourceRequirements{Limits: tc.limits, Requests: tc.requests}
if len(tc.cStatus) > 0 {
pod.Status.ContainerStatuses = tc.cStatus
@ -888,3 +865,289 @@ func TestGenerateLinuxContainerResources(t *testing.T) {
}
//TODO(vinaykul,InPlacePodVerticalScaling): Add unit tests for cgroup v1 & v2
}
func TestGenerateLinuxContainerResourcesWithSwap(t *testing.T) {
_, _, m, err := createTestRuntimeManager()
assert.NoError(t, err)
m.machineInfo.MemoryCapacity = 42949672960 // 40GiB == 40 * 1024^3 bytes
m.machineInfo.SwapCapacity = 5368709120 // 5GiB == 5 * 1024^3 bytes
pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: "12345678",
Name: "foo",
Namespace: "bar",
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "c1",
},
{
Name: "c2",
},
},
},
Status: v1.PodStatus{},
}
expectNoSwap := func(cgroupVersion CgroupVersion, resources ...*runtimeapi.LinuxContainerResources) {
const msg = "container is expected to not have swap access"
for _, r := range resources {
switch cgroupVersion {
case cgroupV1:
assert.Equal(t, r.MemoryLimitInBytes, r.MemorySwapLimitInBytes, msg)
case cgroupV2:
assert.Equal(t, "0", r.Unified[cm.Cgroup2MaxSwapFilename], msg)
}
}
}
expectUnlimitedSwap := func(cgroupVersion CgroupVersion, resources ...*runtimeapi.LinuxContainerResources) {
const msg = "container is expected to have unlimited swap access"
for _, r := range resources {
switch cgroupVersion {
case cgroupV1:
assert.Equal(t, int64(-1), r.MemorySwapLimitInBytes, msg)
case cgroupV2:
assert.Equal(t, "max", r.Unified[cm.Cgroup2MaxSwapFilename], msg)
}
}
}
expectSwap := func(cgroupVersion CgroupVersion, swapBytesExpected int64, resources *runtimeapi.LinuxContainerResources) {
msg := fmt.Sprintf("container swap is expected to be limited by %d bytes", swapBytesExpected)
switch cgroupVersion {
case cgroupV1:
assert.Equal(t, resources.MemoryLimitInBytes+swapBytesExpected, resources.MemorySwapLimitInBytes, msg)
case cgroupV2:
assert.Equal(t, fmt.Sprintf("%d", swapBytesExpected), resources.Unified[cm.Cgroup2MaxSwapFilename], msg)
}
}
calcSwapForBurstablePods := func(containerMemoryRequest int64) int64 {
swapSize, err := calcSwapForBurstablePods(containerMemoryRequest, int64(m.machineInfo.MemoryCapacity), int64(m.machineInfo.SwapCapacity))
assert.NoError(t, err)
return swapSize
}
for _, tc := range []struct {
name string
cgroupVersion CgroupVersion
qosClass v1.PodQOSClass
nodeSwapFeatureGateEnabled bool
swapBehavior string
addContainerWithoutRequests bool
addGuaranteedContainer bool
}{
// With cgroup v1
{
name: "cgroups v1, LimitedSwap, Burstable QoS",
cgroupVersion: cgroupV1,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.LimitedSwap,
},
{
name: "cgroups v1, UnlimitedSwap, Burstable QoS",
cgroupVersion: cgroupV1,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.UnlimitedSwap,
},
{
name: "cgroups v1, LimitedSwap, Best-effort QoS",
cgroupVersion: cgroupV1,
qosClass: v1.PodQOSBestEffort,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.LimitedSwap,
},
// With feature gate turned off
{
name: "NodeSwap feature gate turned off, cgroups v2, LimitedSwap",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: false,
swapBehavior: types.LimitedSwap,
},
{
name: "NodeSwap feature gate turned off, cgroups v2, UnlimitedSwap",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: false,
swapBehavior: types.UnlimitedSwap,
},
// With no swapBehavior, UnlimitedSwap should be the default
{
name: "With no swapBehavior - UnlimitedSwap should be the default",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBestEffort,
nodeSwapFeatureGateEnabled: true,
swapBehavior: "",
},
// With Guaranteed and Best-effort QoS
{
name: "Best-effort Qos, cgroups v2, LimitedSwap",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBestEffort,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.LimitedSwap,
},
{
name: "Best-effort Qos, cgroups v2, UnlimitedSwap",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBestEffort,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.UnlimitedSwap,
},
{
name: "Guaranteed Qos, cgroups v2, LimitedSwap",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSGuaranteed,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.LimitedSwap,
},
{
name: "Guaranteed Qos, cgroups v2, UnlimitedSwap",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSGuaranteed,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.UnlimitedSwap,
},
// With a "guaranteed" container (when memory requests equal to limits)
{
name: "Burstable Qos, cgroups v2, LimitedSwap, with a guaranteed container",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.LimitedSwap,
addContainerWithoutRequests: false,
addGuaranteedContainer: true,
},
{
name: "Burstable Qos, cgroups v2, UnlimitedSwap, with a guaranteed container",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.UnlimitedSwap,
addContainerWithoutRequests: false,
addGuaranteedContainer: true,
},
// Swap is expected to be allocated
{
name: "Burstable Qos, cgroups v2, LimitedSwap",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.LimitedSwap,
addContainerWithoutRequests: false,
addGuaranteedContainer: false,
},
{
name: "Burstable Qos, cgroups v2, UnlimitedSwap",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.UnlimitedSwap,
addContainerWithoutRequests: false,
addGuaranteedContainer: false,
},
{
name: "Burstable Qos, cgroups v2, LimitedSwap, with a container with no requests",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.LimitedSwap,
addContainerWithoutRequests: true,
addGuaranteedContainer: false,
},
{
name: "Burstable Qos, cgroups v2, UnlimitedSwap, with a container with no requests",
cgroupVersion: cgroupV2,
qosClass: v1.PodQOSBurstable,
nodeSwapFeatureGateEnabled: true,
swapBehavior: types.UnlimitedSwap,
addContainerWithoutRequests: true,
addGuaranteedContainer: false,
},
} {
t.Run(tc.name, func(t *testing.T) {
setCgroupVersionDuringTest(tc.cgroupVersion)
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.NodeSwap, tc.nodeSwapFeatureGateEnabled)()
m.memorySwapBehavior = tc.swapBehavior
var resourceReqsC1, resourceReqsC2 v1.ResourceRequirements
switch tc.qosClass {
case v1.PodQOSBurstable:
resourceReqsC1 = v1.ResourceRequirements{
Requests: v1.ResourceList{v1.ResourceMemory: resource.MustParse("1Gi")},
}
if !tc.addContainerWithoutRequests {
resourceReqsC2 = v1.ResourceRequirements{
Requests: v1.ResourceList{v1.ResourceMemory: resource.MustParse("2Gi")},
}
if tc.addGuaranteedContainer {
resourceReqsC2.Limits = v1.ResourceList{v1.ResourceMemory: resource.MustParse("2Gi")}
}
}
case v1.PodQOSGuaranteed:
resourceReqsC1 = v1.ResourceRequirements{
Requests: v1.ResourceList{v1.ResourceMemory: resource.MustParse("1Gi"), v1.ResourceCPU: resource.MustParse("1")},
Limits: v1.ResourceList{v1.ResourceMemory: resource.MustParse("1Gi"), v1.ResourceCPU: resource.MustParse("1")},
}
resourceReqsC2 = v1.ResourceRequirements{
Requests: v1.ResourceList{v1.ResourceMemory: resource.MustParse("2Gi"), v1.ResourceCPU: resource.MustParse("1")},
Limits: v1.ResourceList{v1.ResourceMemory: resource.MustParse("2Gi"), v1.ResourceCPU: resource.MustParse("1")},
}
}
pod.Spec.Containers[0].Resources = resourceReqsC1
pod.Spec.Containers[1].Resources = resourceReqsC2
resourcesC1 := m.generateLinuxContainerResources(pod, &pod.Spec.Containers[0], false)
resourcesC2 := m.generateLinuxContainerResources(pod, &pod.Spec.Containers[1], false)
if !tc.nodeSwapFeatureGateEnabled || tc.cgroupVersion == cgroupV1 || (tc.swapBehavior == types.LimitedSwap && tc.qosClass != v1.PodQOSBurstable) {
expectNoSwap(tc.cgroupVersion, resourcesC1, resourcesC2)
return
}
if tc.swapBehavior == types.UnlimitedSwap || tc.swapBehavior == "" {
expectUnlimitedSwap(tc.cgroupVersion, resourcesC1, resourcesC2)
return
}
c1ExpectedSwap := calcSwapForBurstablePods(resourceReqsC1.Requests.Memory().Value())
c2ExpectedSwap := int64(0)
if !tc.addContainerWithoutRequests && !tc.addGuaranteedContainer {
c2ExpectedSwap = calcSwapForBurstablePods(resourceReqsC2.Requests.Memory().Value())
}
expectSwap(tc.cgroupVersion, c1ExpectedSwap, resourcesC1)
expectSwap(tc.cgroupVersion, c2ExpectedSwap, resourcesC2)
})
}
}
type CgroupVersion string
const (
cgroupV1 CgroupVersion = "v1"
cgroupV2 CgroupVersion = "v2"
)
func setCgroupVersionDuringTest(version CgroupVersion) {
isCgroup2UnifiedMode = func() bool {
return version == cgroupV2
}
}

@ -38,6 +38,59 @@ func TestApplySandboxResources(t *testing.T) {
Linux: &runtimeapi.LinuxPodSandboxConfig{},
}
getPodWithOverhead := func() *v1.Pod {
return &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: "12345678",
Name: "bar",
Namespace: "new",
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceMemory: resource.MustParse("128Mi"),
v1.ResourceCPU: resource.MustParse("2"),
},
Limits: v1.ResourceList{
v1.ResourceMemory: resource.MustParse("256Mi"),
v1.ResourceCPU: resource.MustParse("4"),
},
},
},
},
Overhead: v1.ResourceList{
v1.ResourceMemory: resource.MustParse("128Mi"),
v1.ResourceCPU: resource.MustParse("1"),
},
},
}
}
getPodWithoutOverhead := func() *v1.Pod {
return &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: "12345678",
Name: "bar",
Namespace: "new",
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceMemory: resource.MustParse("128Mi"),
},
Limits: v1.ResourceList{
v1.ResourceMemory: resource.MustParse("256Mi"),
},
},
},
},
},
}
}
require.NoError(t, err)
tests := []struct {
@ -45,36 +98,11 @@ func TestApplySandboxResources(t *testing.T) {
pod *v1.Pod
expectedResource *runtimeapi.LinuxContainerResources
expectedOverhead *runtimeapi.LinuxContainerResources
cgroupVersion CgroupVersion
}{
{
description: "pod with overhead defined",
-pod: &v1.Pod{
-ObjectMeta: metav1.ObjectMeta{
-UID: "12345678",
-Name: "bar",
-Namespace: "new",
-},
-Spec: v1.PodSpec{
-Containers: []v1.Container{
-{
-Resources: v1.ResourceRequirements{
-Requests: v1.ResourceList{
-v1.ResourceMemory: resource.MustParse("128Mi"),
-v1.ResourceCPU: resource.MustParse("2"),
-},
-Limits: v1.ResourceList{
-v1.ResourceMemory: resource.MustParse("256Mi"),
-v1.ResourceCPU: resource.MustParse("4"),
-},
-},
-},
-},
-Overhead: v1.ResourceList{
-v1.ResourceMemory: resource.MustParse("128Mi"),
-v1.ResourceCPU: resource.MustParse("1"),
-},
-},
-},
+pod: getPodWithOverhead(),
expectedResource: &runtimeapi.LinuxContainerResources{
MemoryLimitInBytes: 268435456,
CpuPeriod: 100000,
@ -87,30 +115,11 @@ func TestApplySandboxResources(t *testing.T) {
CpuQuota: 100000,
CpuShares: 1024,
},
cgroupVersion: cgroupV1,
},
{
description: "pod without overhead defined",
-pod: &v1.Pod{
-ObjectMeta: metav1.ObjectMeta{
-UID: "12345678",
-Name: "bar",
-Namespace: "new",
-},
-Spec: v1.PodSpec{
-Containers: []v1.Container{
-{
-Resources: v1.ResourceRequirements{
-Requests: v1.ResourceList{
-v1.ResourceMemory: resource.MustParse("128Mi"),
-},
-Limits: v1.ResourceList{
-v1.ResourceMemory: resource.MustParse("256Mi"),
-},
-},
-},
-},
-},
-},
+pod: getPodWithoutOverhead(),
expectedResource: &runtimeapi.LinuxContainerResources{
MemoryLimitInBytes: 268435456,
CpuPeriod: 100000,
@ -118,10 +127,45 @@ func TestApplySandboxResources(t *testing.T) {
CpuShares: 2,
},
expectedOverhead: &runtimeapi.LinuxContainerResources{},
cgroupVersion: cgroupV1,
},
{
description: "pod with overhead defined",
pod: getPodWithOverhead(),
expectedResource: &runtimeapi.LinuxContainerResources{
MemoryLimitInBytes: 268435456,
CpuPeriod: 100000,
CpuQuota: 400000,
CpuShares: 2048,
Unified: map[string]string{"memory.oom.group": "1"},
},
expectedOverhead: &runtimeapi.LinuxContainerResources{
MemoryLimitInBytes: 134217728,
CpuPeriod: 100000,
CpuQuota: 100000,
CpuShares: 1024,
Unified: map[string]string{"memory.oom.group": "1"},
},
cgroupVersion: cgroupV2,
},
{
description: "pod without overhead defined",
pod: getPodWithoutOverhead(),
expectedResource: &runtimeapi.LinuxContainerResources{
MemoryLimitInBytes: 268435456,
CpuPeriod: 100000,
CpuQuota: 0,
CpuShares: 2,
Unified: map[string]string{"memory.oom.group": "1"},
},
expectedOverhead: &runtimeapi.LinuxContainerResources{},
cgroupVersion: cgroupV2,
},
}
for i, test := range tests {
setCgroupVersionDuringTest(test.cgroupVersion)
m.applySandboxResources(test.pod, config)
assert.Equal(t, test.expectedResource, config.Linux.Resources, "TestCase[%d]: %s", i, test.description)
assert.Equal(t, test.expectedOverhead, config.Linux.Overhead, "TestCase[%d]: %s", i, test.description)

test/e2e_node/swap_test.go (new file, 254 lines)

@ -0,0 +1,254 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2enode
import (
"context"
"fmt"
"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/rand"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/test/e2e/framework"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
testutils "k8s.io/kubernetes/test/utils"
admissionapi "k8s.io/pod-security-admission/api"
"path/filepath"
"strconv"
)
const (
cgroupBasePath = "/sys/fs/cgroup/"
cgroupV1SwapLimitFile = "/memory/memory.memsw.limit_in_bytes"
cgroupV2SwapLimitFile = "memory.swap.max"
cgroupV1MemLimitFile = "/memory/memory.limit_in_bytes"
)
var _ = SIGDescribe("Swap [NodeConformance][LinuxOnly]", func() {
f := framework.NewDefaultFramework("swap-test")
f.NamespacePodSecurityEnforceLevel = admissionapi.LevelBaseline
ginkgo.DescribeTable("with configuration", func(qosClass v1.PodQOSClass, memoryRequestEqualLimit bool) {
ginkgo.By(fmt.Sprintf("Creating a pod of QOS class %s. memoryRequestEqualLimit: %t", qosClass, memoryRequestEqualLimit))
pod := getSwapTestPod(f, qosClass, memoryRequestEqualLimit)
pod = runPodAndWaitUntilScheduled(f, pod)
isCgroupV2 := isPodCgroupV2(f, pod)
isLimitedSwap := isLimitedSwap(f, pod)
if !isSwapFeatureGateEnabled() || !isCgroupV2 || (isLimitedSwap && (qosClass != v1.PodQOSBurstable || memoryRequestEqualLimit)) {
ginkgo.By(fmt.Sprintf("Expecting no swap. feature gate on? %t isCgroupV2? %t is QoS burstable? %t", isSwapFeatureGateEnabled(), isCgroupV2, qosClass == v1.PodQOSBurstable))
expectNoSwap(f, pod, isCgroupV2)
return
}
if !isLimitedSwap {
ginkgo.By("expecting unlimited swap")
expectUnlimitedSwap(f, pod, isCgroupV2)
return
}
ginkgo.By("expecting limited swap")
expectedSwapLimit := calcSwapForBurstablePod(f, pod)
expectLimitedSwap(f, pod, expectedSwapLimit)
},
ginkgo.Entry("QOS Best-effort", v1.PodQOSBestEffort, false),
ginkgo.Entry("QOS Burstable", v1.PodQOSBurstable, false),
ginkgo.Entry("QOS Burstable with memory request equals to limit", v1.PodQOSBurstable, true),
ginkgo.Entry("QOS Guaranteed", v1.PodQOSGuaranteed, false),
)
})
// Note that memoryRequestEqualLimit is effective only when qosClass is PodQOSBurstable.
func getSwapTestPod(f *framework.Framework, qosClass v1.PodQOSClass, memoryRequestEqualLimit bool) *v1.Pod {
podMemoryAmount := resource.MustParse("128Mi")
var resources v1.ResourceRequirements
switch qosClass {
case v1.PodQOSBestEffort:
// nothing to do in this case
case v1.PodQOSBurstable:
resources = v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceMemory: podMemoryAmount,
},
}
if memoryRequestEqualLimit {
resources.Limits = resources.Requests
}
case v1.PodQOSGuaranteed:
resources = v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceCPU: resource.MustParse("200m"),
v1.ResourceMemory: podMemoryAmount,
},
}
resources.Requests = resources.Limits
}
pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "test-pod-swap-" + rand.String(5),
Namespace: f.Namespace.Name,
},
Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyAlways,
Containers: []v1.Container{
{
Name: "busybox-container",
Image: busyboxImage,
Command: []string{"sleep", "600"},
Resources: resources,
},
},
},
}
return pod
}
func runPodAndWaitUntilScheduled(f *framework.Framework, pod *v1.Pod) *v1.Pod {
ginkgo.By("running swap test pod")
podClient := e2epod.NewPodClient(f)
pod = podClient.CreateSync(context.Background(), pod)
pod, err := podClient.Get(context.Background(), pod.Name, metav1.GetOptions{})
framework.ExpectNoError(err)
isReady, err := testutils.PodRunningReady(pod)
framework.ExpectNoError(err)
gomega.ExpectWithOffset(1, isReady).To(gomega.BeTrue(), "pod should be ready")
return pod
}
func isSwapFeatureGateEnabled() bool {
ginkgo.By("figuring if NodeSwap feature gate is turned on")
return utilfeature.DefaultFeatureGate.Enabled(features.NodeSwap)
}
func readCgroupFile(f *framework.Framework, pod *v1.Pod, filename string) string {
filePath := filepath.Join(cgroupBasePath, filename)
ginkgo.By("reading cgroup file " + filePath)
output := e2epod.ExecCommandInContainer(f, pod.Name, pod.Spec.Containers[0].Name, "/bin/sh", "-ec", "cat "+filePath)
return output
}
func isPodCgroupV2(f *framework.Framework, pod *v1.Pod) bool {
ginkgo.By("figuring is test pod runs cgroup v2")
output := e2epod.ExecCommandInContainer(f, pod.Name, pod.Spec.Containers[0].Name, "/bin/sh", "-ec", `if test -f "/sys/fs/cgroup/cgroup.controllers"; then echo "true"; else echo "false"; fi`)
return output == "true"
}
func expectNoSwap(f *framework.Framework, pod *v1.Pod, isCgroupV2 bool) {
if isCgroupV2 {
swapLimit := readCgroupFile(f, pod, cgroupV2SwapLimitFile)
gomega.ExpectWithOffset(1, swapLimit).To(gomega.Equal("0"), "max swap allowed should be zero")
} else {
swapPlusMemLimit := readCgroupFile(f, pod, cgroupV1SwapLimitFile)
memLimit := readCgroupFile(f, pod, cgroupV1MemLimitFile)
gomega.ExpectWithOffset(1, swapPlusMemLimit).ToNot(gomega.BeEmpty())
gomega.ExpectWithOffset(1, swapPlusMemLimit).To(gomega.Equal(memLimit))
}
}
func expectUnlimitedSwap(f *framework.Framework, pod *v1.Pod, isCgroupV2 bool) {
if isCgroupV2 {
swapLimit := readCgroupFile(f, pod, cgroupV2SwapLimitFile)
gomega.ExpectWithOffset(1, swapLimit).To(gomega.Equal("max"), "max swap allowed should be \"max\"")
} else {
swapPlusMemLimit := readCgroupFile(f, pod, cgroupV1SwapLimitFile)
gomega.ExpectWithOffset(1, swapPlusMemLimit).To(gomega.Equal("-1"))
}
}
// Supports cgroup v2 only, since LimitedSwap is not supported with cgroup v1.
func expectLimitedSwap(f *framework.Framework, pod *v1.Pod, expectedSwapLimit int64) {
swapLimitStr := readCgroupFile(f, pod, cgroupV2SwapLimitFile)
swapLimit, err := strconv.Atoi(swapLimitStr)
framework.ExpectNoError(err, "cannot convert swap limit to int")
// cgroup values are always aligned w.r.t. the page size, which is usually 4Ki
const cgroupAlignment int64 = 4 * 1024 // 4Ki
const errMsg = "swap limitation is not as expected"
gomega.ExpectWithOffset(1, int64(swapLimit)).To(
gomega.And(
gomega.BeNumerically(">=", expectedSwapLimit-cgroupAlignment),
gomega.BeNumerically("<=", expectedSwapLimit+cgroupAlignment),
),
errMsg,
)
}
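
Written out in plain arithmetic, the matcher above accepts any value within one 4KiB page of the expected limit; an equivalent hand-rolled check (a sketch, not part of the PR):

func withinOnePage(swapLimit, expectedSwapLimit int64) bool {
	const page = int64(4 * 1024)
	return swapLimit >= expectedSwapLimit-page && swapLimit <= expectedSwapLimit+page
}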
func getSwapCapacity(f *framework.Framework, pod *v1.Pod) int64 {
output := e2epod.ExecCommandInContainer(f, pod.Name, pod.Spec.Containers[0].Name, "/bin/sh", "-ec", "free -b | grep Swap | xargs | cut -d\" \" -f2")
swapCapacity, err := strconv.Atoi(output)
framework.ExpectNoError(err, "cannot convert swap size to int")
ginkgo.By(fmt.Sprintf("providing swap capacity: %d", swapCapacity))
return int64(swapCapacity)
}
func getMemoryCapacity(f *framework.Framework, pod *v1.Pod) int64 {
nodes, err := f.ClientSet.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{})
framework.ExpectNoError(err, "failed listing nodes")
for _, node := range nodes.Items {
if node.Name != pod.Spec.NodeName {
continue
}
memCapacity := node.Status.Capacity[v1.ResourceMemory]
return memCapacity.Value()
}
framework.ExpectNoError(fmt.Errorf("node %s wasn't found", pod.Spec.NodeName))
return 0
}
func calcSwapForBurstablePod(f *framework.Framework, pod *v1.Pod) int64 {
nodeMemoryCapacity := getMemoryCapacity(f, pod)
nodeSwapCapacity := getSwapCapacity(f, pod)
containerMemoryRequest := pod.Spec.Containers[0].Resources.Requests.Memory().Value()
containerMemoryProportion := float64(containerMemoryRequest) / float64(nodeMemoryCapacity)
swapAllocation := containerMemoryProportion * float64(nodeSwapCapacity)
ginkgo.By(fmt.Sprintf("Calculating swap for burstable pods: nodeMemoryCapacity: %d, nodeSwapCapacity: %d, containerMemoryRequest: %d, swapAllocation: %d",
nodeMemoryCapacity, nodeSwapCapacity, containerMemoryRequest, int64(swapAllocation)))
return int64(swapAllocation)
}
func isLimitedSwap(f *framework.Framework, pod *v1.Pod) bool {
kubeletCfg, err := getCurrentKubeletConfig(context.Background())
framework.ExpectNoError(err, "cannot get kubelet config")
return kubeletCfg.MemorySwap.SwapBehavior == types.LimitedSwap
}