diff --git a/pkg/generated/openapi/zz_generated.openapi.go b/pkg/generated/openapi/zz_generated.openapi.go index 79e16f66089..674d32a48aa 100644 --- a/pkg/generated/openapi/zz_generated.openapi.go +++ b/pkg/generated/openapi/zz_generated.openapi.go @@ -58207,7 +58207,7 @@ func schema_k8sio_kubelet_config_v1beta1_KubeletConfiguration(ref common.Referen }, "memoryThrottlingFactor": { SchemaProps: spec.SchemaProps{ - Description: "MemoryThrottlingFactor specifies the factor multiplied by the memory limit or node allocatable memory when setting the cgroupv2 memory.high value to enforce MemoryQoS. Decreasing this factor will set lower high limit for container cgroups and put heavier reclaim pressure while increasing will put less reclaim pressure. See https://kep.k8s.io/2570 for more details. Default: 0.8", + Description: "MemoryThrottlingFactor specifies the factor multiplied by the memory limit or node allocatable memory when setting the cgroupv2 memory.high value to enforce MemoryQoS. Decreasing this factor will set lower high limit for container cgroups and put heavier reclaim pressure while increasing will put less reclaim pressure. See https://kep.k8s.io/2570 for more details. Default: 0.9", Type: []string{"number"}, Format: "double", },
diff --git a/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/after/v1beta1.yaml b/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/after/v1beta1.yaml index c7b59e6404b..8306dd69d38 100644 --- a/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/after/v1beta1.yaml +++ b/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/after/v1beta1.yaml @@ -61,7 +61,7 @@ maxOpenFiles: 1000000 maxPods: 110 memoryManagerPolicy: None memorySwap: {} -memoryThrottlingFactor: 0.8 +memoryThrottlingFactor: 0.9 nodeLeaseDurationSeconds: 40 nodeStatusMaxImages: 50 nodeStatusReportFrequency: 5m0s
diff --git a/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/roundtrip/default/v1beta1.yaml b/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/roundtrip/default/v1beta1.yaml index 753708fd88c..785010e2abb 100644 --- a/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/roundtrip/default/v1beta1.yaml +++ b/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/roundtrip/default/v1beta1.yaml @@ -61,7 +61,7 @@ maxOpenFiles: 1000000 maxPods: 110 memoryManagerPolicy: None memorySwap: {} -memoryThrottlingFactor: 0.8 +memoryThrottlingFactor: 0.9 nodeLeaseDurationSeconds: 40 nodeStatusMaxImages: 50 nodeStatusReportFrequency: 5m0s
diff --git a/pkg/kubelet/apis/config/types.go b/pkg/kubelet/apis/config/types.go index be29dae1fd1..059c20f6050 100644 --- a/pkg/kubelet/apis/config/types.go +++ b/pkg/kubelet/apis/config/types.go @@ -438,7 +438,7 @@ type KubeletConfiguration struct { // Decreasing this factor will set lower high limit for container cgroups and put heavier reclaim pressure // while increasing will put less reclaim pressure. // See https://kep.k8s.io/2570 for more details. - // Default: 0.8 + // Default: 0.9 // +featureGate=MemoryQoS // +optional MemoryThrottlingFactor *float64
diff --git a/pkg/kubelet/apis/config/v1beta1/defaults.go b/pkg/kubelet/apis/config/v1beta1/defaults.go index 5d4e8c117fb..bf52fc2396a 100644 --- a/pkg/kubelet/apis/config/v1beta1/defaults.go +++ b/pkg/kubelet/apis/config/v1beta1/defaults.go @@ -38,7 +38,7 @@ const ( DefaultVolumePluginDir = "/usr/libexec/kubernetes/kubelet-plugins/volume/exec/" // See https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2570-memory-qos - DefaultMemoryThrottlingFactor = 0.8 + DefaultMemoryThrottlingFactor = 0.9 ) var (
diff --git a/pkg/kubelet/apis/config/validation/validation_test.go b/pkg/kubelet/apis/config/validation/validation_test.go index ab9c9abc53b..a4de1238b12 100644 --- a/pkg/kubelet/apis/config/validation/validation_test.go +++ b/pkg/kubelet/apis/config/validation/validation_test.go @@ -65,7 +65,7 @@ var ( TopologyManagerPolicy: kubeletconfig.SingleNumaNodeTopologyManagerPolicy, ShutdownGracePeriod: metav1.Duration{Duration: 30 * time.Second}, ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second}, - MemoryThrottlingFactor: utilpointer.Float64(0.8), + MemoryThrottlingFactor: utilpointer.Float64(0.9), FeatureGates: map[string]bool{ "CustomCPUCFSQuotaPeriod": true, "GracefulNodeShutdown": true,
diff --git a/pkg/kubelet/kuberuntime/fake_kuberuntime_manager.go b/pkg/kubelet/kuberuntime/fake_kuberuntime_manager.go index 30a340c0952..37479dd5b58 100644 --- a/pkg/kubelet/kuberuntime/fake_kuberuntime_manager.go +++ b/pkg/kubelet/kuberuntime/fake_kuberuntime_manager.go @@ -113,7 +113,7 @@ func newFakeKubeRuntimeManager(runtimeService internalapi.RuntimeService, imageS internalLifecycle: cm.NewFakeInternalContainerLifecycle(), logReduction: logreduction.NewLogReduction(identicalErrorDelay), logManager: logManager, - memoryThrottlingFactor: 0.8, + memoryThrottlingFactor: 0.9, } typedVersion, err := runtimeService.Version(ctx, kubeRuntimeAPIVersion)
diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go index 202ff2bce48..3cb9c968fb1 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go @@ -20,6 +20,8 @@ limitations under the License. package kuberuntime import ( + "math" + "os" "strconv" "time" @@ -37,6 +39,8 @@ import ( kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" ) +var defaultPageSize = int64(os.Getpagesize()) + // applyPlatformSpecificContainerConfig applies platform specific configurations to runtimeapi.ContainerConfig. func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config *runtimeapi.ContainerConfig, container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID) error { enforceMemoryQoS := false @@ -112,22 +116,31 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod, unified[cm.MemoryMin] = strconv.FormatInt(memoryRequest, 10) } - // If container sets limits.memory, we set memory.high=pod.spec.containers[i].resources.limits[memory] * memory_throttling_factor - // for container level cgroup if memory.high>memory.min. - // If container doesn't set limits.memory, we set memory.high=node_allocatable_memory * memory_throttling_factor - // for container level cgroup. - memoryHigh := int64(0) - if memoryLimit != 0 { - memoryHigh = int64(float64(memoryLimit) * m.memoryThrottlingFactor) - } else { - allocatable := m.getNodeAllocatable() - allocatableMemory, ok := allocatable[v1.ResourceMemory] - if ok && allocatableMemory.Value() > 0 { - memoryHigh = int64(float64(allocatableMemory.Value()) * m.memoryThrottlingFactor) + // Guaranteed pods by their QoS definition requires that memory request equals memory limit and cpu request must equal cpu limit. + // Here, we only check from memory perspective. Hence MemoryQoS feature is disabled on those QoS pods by not setting memory.high. + if memoryRequest != memoryLimit { + // The formula for memory.high for container cgroup is modified in Alpha stage of the feature in K8s v1.27. + // It will be set based on formula: + // `memory.high=floor[(requests.memory + memory throttling factor * (limits.memory or node allocatable memory - requests.memory))/pageSize] * pageSize` + // where default value of memory throttling factor is set to 0.9 + // More info: https://git.k8s.io/enhancements/keps/sig-node/2570-memory-qos + memoryHigh := int64(0) + if memoryLimit != 0 { + memoryHigh = int64(math.Floor( + float64(memoryRequest)+ + (float64(memoryLimit)-float64(memoryRequest))*float64(m.memoryThrottlingFactor))/float64(defaultPageSize)) * defaultPageSize + } else { + allocatable := m.getNodeAllocatable() + allocatableMemory, ok := allocatable[v1.ResourceMemory] + if ok && allocatableMemory.Value() > 0 { + memoryHigh = int64(math.Floor( + float64(memoryRequest)+ + (float64(allocatableMemory.Value())-float64(memoryRequest))*float64(m.memoryThrottlingFactor))/float64(defaultPageSize)) * defaultPageSize + } + } + if memoryHigh != 0 && memoryHigh > memoryRequest { + unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10) } - } - if memoryHigh > memoryRequest { - unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10) } if len(unified) > 0 { if lcr.Unified == nil {
diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go index 82b3ab16ff0..9d1e0d40871 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go @@ -21,6 +21,8 @@ package kuberuntime import ( "context" + "math" + "os" "reflect" "strconv" "testing" @@ -307,6 +309,8 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) { _, _, m, err := createTestRuntimeManager() assert.NoError(t, err) + podRequestMemory := resource.MustParse("128Mi") + pod1LimitMemory := resource.MustParse("256Mi") pod1 := &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ UID: "12345678", @@ -323,10 +327,10 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) { WorkingDir: "testWorkingDir", Resources: v1.ResourceRequirements{ Requests: v1.ResourceList{ - v1.ResourceMemory: resource.MustParse("128Mi"), + v1.ResourceMemory: podRequestMemory, }, Limits: v1.ResourceList{ - v1.ResourceMemory: resource.MustParse("256Mi"), + v1.ResourceMemory: pod1LimitMemory, }, }, }, @@ -350,15 +354,21 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) { WorkingDir: "testWorkingDir", Resources: v1.ResourceRequirements{ Requests: v1.ResourceList{ - v1.ResourceMemory: resource.MustParse("128Mi"), + v1.ResourceMemory: podRequestMemory, }, }, }, }, }, } + pageSize := int64(os.Getpagesize()) memoryNodeAllocatable := resource.MustParse(fakeNodeAllocatableMemory) - pod2MemoryHigh := float64(memoryNodeAllocatable.Value()) * m.memoryThrottlingFactor + pod1MemoryHigh := int64(math.Floor( + float64(podRequestMemory.Value())+ + (float64(pod1LimitMemory.Value())-float64(podRequestMemory.Value()))*float64(m.memoryThrottlingFactor))/float64(pageSize)) * pageSize + pod2MemoryHigh := int64(math.Floor( + float64(podRequestMemory.Value())+ + (float64(memoryNodeAllocatable.Value())-float64(podRequestMemory.Value()))*float64(m.memoryThrottlingFactor))/float64(pageSize)) * pageSize type expectedResult struct { containerConfig *runtimeapi.LinuxContainerConfig @@ -378,7 +388,7 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) { expected: &expectedResult{ l1, 128 * 1024 * 1024, - int64(float64(256*1024*1024) * m.memoryThrottlingFactor), + int64(pod1MemoryHigh), }, }, {
diff --git a/staging/src/k8s.io/kubelet/config/v1beta1/types.go b/staging/src/k8s.io/kubelet/config/v1beta1/types.go index c7919ca5679..5822cb3f924 100644 --- a/staging/src/k8s.io/kubelet/config/v1beta1/types.go +++ b/staging/src/k8s.io/kubelet/config/v1beta1/types.go @@ -776,7 +776,7 @@ type KubeletConfiguration struct { // Decreasing this factor will set lower high limit for container cgroups and put heavier reclaim pressure // while increasing will put less reclaim pressure. // See https://kep.k8s.io/2570 for more details. - // Default: 0.8 + // Default: 0.9 // +featureGate=MemoryQoS // +optional MemoryThrottlingFactor *float64 `json:"memoryThrottlingFactor,omitempty"`