Merge pull request #115371 from pacoxu/cgroup-v2-memory-tuning

default memoryThrottlingFactor to 0.9 and optimize the memory.high formulas
commit 625b8be09e (committed by GitHub)
Author: Kubernetes Prow Robot
Date: 2023-03-08 18:46:00 -08:00

10 changed files with 51 additions and 28 deletions
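To see what the change does in practice, here is a small standalone sketch (not kubelet code; the 128Mi request, 256Mi limit, and 4KiB page size are illustrative assumptions) contrasting the old memory.high computation at the old 0.8 default with the new one at the new 0.9 default:

// memoryhigh_sketch.go: standalone comparison of the old and new memory.high
// computations from this PR. Example values are hypothetical, not kubelet code.
package main

import (
	"fmt"
	"math"
	"os"
)

func main() {
	memoryRequest := int64(128 * 1024 * 1024) // requests.memory = 128Mi
	memoryLimit := int64(256 * 1024 * 1024)   // limits.memory = 256Mi
	pageSize := int64(os.Getpagesize())       // typically 4096

	// Old formula (default factor 0.8): memory.high = limits.memory * factor.
	// The request is ignored entirely.
	oldHigh := int64(float64(memoryLimit) * 0.8)

	// New formula (default factor 0.9): interpolate between request and limit,
	// then round down to a page boundary (int64 truncation floors the positive
	// quotient, matching the math.Floor-based expression in the PR).
	newHigh := int64(math.Floor(
		float64(memoryRequest)+
			(float64(memoryLimit)-float64(memoryRequest))*0.9)/float64(pageSize)) * pageSize

	fmt.Println("old memory.high:", oldHigh) // 214748364 (~204.8Mi)
	fmt.Println("new memory.high:", newHigh) // 255012864 (~243.2Mi) with 4KiB pages
}

The motivation is visible in the numbers: the old formula is anchored only to the limit, so a request above 0.8x the limit would leave memory.high at or below the request and the throttle would never engage; the new formula always lands between requests.memory and the limit (or node allocatable memory) whenever request < limit.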


@@ -58207,7 +58207,7 @@ func schema_k8sio_kubelet_config_v1beta1_KubeletConfiguration(ref common.Referen
 				},
 				"memoryThrottlingFactor": {
 					SchemaProps: spec.SchemaProps{
-						Description: "MemoryThrottlingFactor specifies the factor multiplied by the memory limit or node allocatable memory when setting the cgroupv2 memory.high value to enforce MemoryQoS. Decreasing this factor will set lower high limit for container cgroups and put heavier reclaim pressure while increasing will put less reclaim pressure. See https://kep.k8s.io/2570 for more details. Default: 0.8",
+						Description: "MemoryThrottlingFactor specifies the factor multiplied by the memory limit or node allocatable memory when setting the cgroupv2 memory.high value to enforce MemoryQoS. Decreasing this factor will set lower high limit for container cgroups and put heavier reclaim pressure while increasing will put less reclaim pressure. See https://kep.k8s.io/2570 for more details. Default: 0.9",
 						Type:   []string{"number"},
 						Format: "double",
 					},


@@ -61,7 +61,7 @@ maxOpenFiles: 1000000
 maxPods: 110
 memoryManagerPolicy: None
 memorySwap: {}
-memoryThrottlingFactor: 0.8
+memoryThrottlingFactor: 0.9
 nodeLeaseDurationSeconds: 40
 nodeStatusMaxImages: 50
 nodeStatusReportFrequency: 5m0s


@@ -61,7 +61,7 @@ maxOpenFiles: 1000000
 maxPods: 110
 memoryManagerPolicy: None
 memorySwap: {}
-memoryThrottlingFactor: 0.8
+memoryThrottlingFactor: 0.9
 nodeLeaseDurationSeconds: 40
 nodeStatusMaxImages: 50
 nodeStatusReportFrequency: 5m0s


@@ -438,7 +438,7 @@ type KubeletConfiguration struct {
 	// Decreasing this factor will set lower high limit for container cgroups and put heavier reclaim pressure
 	// while increasing will put less reclaim pressure.
 	// See https://kep.k8s.io/2570 for more details.
-	// Default: 0.8
+	// Default: 0.9
 	// +featureGate=MemoryQoS
 	// +optional
 	MemoryThrottlingFactor *float64


@@ -38,7 +38,7 @@ const (
 	DefaultVolumePluginDir = "/usr/libexec/kubernetes/kubelet-plugins/volume/exec/"
 	// See https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2570-memory-qos
-	DefaultMemoryThrottlingFactor = 0.8
+	DefaultMemoryThrottlingFactor = 0.9
 )

 var (


@@ -65,7 +65,7 @@ var (
 	TopologyManagerPolicy:           kubeletconfig.SingleNumaNodeTopologyManagerPolicy,
 	ShutdownGracePeriod:             metav1.Duration{Duration: 30 * time.Second},
 	ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second},
-	MemoryThrottlingFactor:          utilpointer.Float64(0.8),
+	MemoryThrottlingFactor:          utilpointer.Float64(0.9),
 	FeatureGates: map[string]bool{
 		"CustomCPUCFSQuotaPeriod": true,
 		"GracefulNodeShutdown":    true,


@@ -113,7 +113,7 @@ func newFakeKubeRuntimeManager(runtimeService internalapi.RuntimeService, imageS
 		internalLifecycle:      cm.NewFakeInternalContainerLifecycle(),
 		logReduction:           logreduction.NewLogReduction(identicalErrorDelay),
 		logManager:             logManager,
-		memoryThrottlingFactor: 0.8,
+		memoryThrottlingFactor: 0.9,
 	}
 	typedVersion, err := runtimeService.Version(ctx, kubeRuntimeAPIVersion)


@@ -20,6 +20,8 @@ limitations under the License.
 package kuberuntime

 import (
+	"math"
+	"os"
 	"strconv"
 	"time"
@@ -37,6 +39,8 @@
 	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
 )

+var defaultPageSize = int64(os.Getpagesize())
+
 // applyPlatformSpecificContainerConfig applies platform specific configurations to runtimeapi.ContainerConfig.
 func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config *runtimeapi.ContainerConfig, container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID) error {
 	enforceMemoryQoS := false
@@ -112,22 +116,31 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
 			unified[cm.MemoryMin] = strconv.FormatInt(memoryRequest, 10)
 		}
-		// If container sets limits.memory, we set memory.high=pod.spec.containers[i].resources.limits[memory] * memory_throttling_factor
-		// for container level cgroup if memory.high>memory.min.
-		// If container doesn't set limits.memory, we set memory.high=node_allocatable_memory * memory_throttling_factor
-		// for container level cgroup.
-		memoryHigh := int64(0)
-		if memoryLimit != 0 {
-			memoryHigh = int64(float64(memoryLimit) * m.memoryThrottlingFactor)
-		} else {
-			allocatable := m.getNodeAllocatable()
-			allocatableMemory, ok := allocatable[v1.ResourceMemory]
-			if ok && allocatableMemory.Value() > 0 {
-				memoryHigh = int64(float64(allocatableMemory.Value()) * m.memoryThrottlingFactor)
-			}
-		}
-		if memoryHigh > memoryRequest {
-			unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
-		}
+		// Guaranteed pods by their QoS definition requires that memory request equals memory limit and cpu request must equal cpu limit.
+		// Here, we only check from memory perspective. Hence MemoryQoS feature is disabled on those QoS pods by not setting memory.high.
+		if memoryRequest != memoryLimit {
+			// The formula for memory.high for container cgroup is modified in Alpha stage of the feature in K8s v1.27.
+			// It will be set based on formula:
+			// `memory.high=floor[(requests.memory + memory throttling factor * (limits.memory or node allocatable memory - requests.memory))/pageSize] * pageSize`
+			// where default value of memory throttling factor is set to 0.9
+			// More info: https://git.k8s.io/enhancements/keps/sig-node/2570-memory-qos
+			memoryHigh := int64(0)
+			if memoryLimit != 0 {
+				memoryHigh = int64(math.Floor(
+					float64(memoryRequest)+
+						(float64(memoryLimit)-float64(memoryRequest))*float64(m.memoryThrottlingFactor))/float64(defaultPageSize)) * defaultPageSize
+			} else {
+				allocatable := m.getNodeAllocatable()
+				allocatableMemory, ok := allocatable[v1.ResourceMemory]
+				if ok && allocatableMemory.Value() > 0 {
+					memoryHigh = int64(math.Floor(
+						float64(memoryRequest)+
+							(float64(allocatableMemory.Value())-float64(memoryRequest))*float64(m.memoryThrottlingFactor))/float64(defaultPageSize)) * defaultPageSize
+				}
+			}
+			if memoryHigh != 0 && memoryHigh > memoryRequest {
+				unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
+			}
+		}

 	if len(unified) > 0 {
 		if lcr.Unified == nil {
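The hunk above changes more than the formula: containers whose memory request equals their limit (the memory half of the Guaranteed QoS condition) now get no memory.high at all, and the computed value is only written when it exceeds memory.min. A condensed sketch of that decision flow, using a hypothetical helper (memoryHighFor is not a kubelet function; byte counts and factor are illustrative):

package main

import (
	"fmt"
	"math"
	"os"
)

// memoryHighFor condenses the new decision flow; request, limit, and
// nodeAllocatable are byte counts.
func memoryHighFor(request, limit, nodeAllocatable int64, factor float64) (int64, bool) {
	pageSize := int64(os.Getpagesize())
	// Guaranteed-style containers (request == limit) are skipped entirely:
	// there is no headroom between request and limit to throttle within.
	if request == limit {
		return 0, false
	}
	ceiling := limit
	if ceiling == 0 { // no limit set: fall back to node allocatable memory
		ceiling = nodeAllocatable
	}
	if ceiling <= 0 {
		return 0, false
	}
	high := int64(math.Floor(
		float64(request)+
			(float64(ceiling)-float64(request))*factor)/float64(pageSize)) * pageSize
	// memory.high is only enforced when it sits above memory.min (the request).
	if high == 0 || high <= request {
		return 0, false
	}
	return high, true
}

func main() {
	if high, ok := memoryHighFor(128<<20, 256<<20, 0, 0.9); ok {
		fmt.Println("memory.high:", high) // 255012864 with 4KiB pages
	}
	_, ok := memoryHighFor(256<<20, 256<<20, 0, 0.9)
	fmt.Println("guaranteed container throttled:", ok) // false
}

Rounding down to a page boundary is a deliberate choice: the kernel accounts memory.high in whole pages, so an unaligned value would be silently truncated anyway.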


@@ -21,6 +21,8 @@
 package kuberuntime

 import (
 	"context"
+	"math"
+	"os"
 	"reflect"
 	"strconv"
 	"testing"
@@ -307,6 +309,8 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
 	_, _, m, err := createTestRuntimeManager()
 	assert.NoError(t, err)

+	podRequestMemory := resource.MustParse("128Mi")
+	pod1LimitMemory := resource.MustParse("256Mi")
 	pod1 := &v1.Pod{
 		ObjectMeta: metav1.ObjectMeta{
 			UID:       "12345678",
@@ -323,10 +327,10 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
 				WorkingDir: "testWorkingDir",
 				Resources: v1.ResourceRequirements{
 					Requests: v1.ResourceList{
-						v1.ResourceMemory: resource.MustParse("128Mi"),
+						v1.ResourceMemory: podRequestMemory,
 					},
 					Limits: v1.ResourceList{
-						v1.ResourceMemory: resource.MustParse("256Mi"),
+						v1.ResourceMemory: pod1LimitMemory,
 					},
 				},
 			},
@@ -350,15 +354,21 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
 				WorkingDir: "testWorkingDir",
 				Resources: v1.ResourceRequirements{
 					Requests: v1.ResourceList{
-						v1.ResourceMemory: resource.MustParse("128Mi"),
+						v1.ResourceMemory: podRequestMemory,
 					},
 				},
 			},
 		},
 	}
+	pageSize := int64(os.Getpagesize())
 	memoryNodeAllocatable := resource.MustParse(fakeNodeAllocatableMemory)
-	pod2MemoryHigh := float64(memoryNodeAllocatable.Value()) * m.memoryThrottlingFactor
+	pod1MemoryHigh := int64(math.Floor(
+		float64(podRequestMemory.Value())+
+			(float64(pod1LimitMemory.Value())-float64(podRequestMemory.Value()))*float64(m.memoryThrottlingFactor))/float64(pageSize)) * pageSize
+	pod2MemoryHigh := int64(math.Floor(
+		float64(podRequestMemory.Value())+
+			(float64(memoryNodeAllocatable.Value())-float64(podRequestMemory.Value()))*float64(m.memoryThrottlingFactor))/float64(pageSize)) * pageSize

 	type expectedResult struct {
 		containerConfig *runtimeapi.LinuxContainerConfig
@@ -378,7 +388,7 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
 			expected: &expectedResult{
 				l1,
 				128 * 1024 * 1024,
-				int64(float64(256*1024*1024) * m.memoryThrottlingFactor),
+				int64(pod1MemoryHigh),
 			},
 		},
 		{
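For reference, a quick standalone check (not part of the PR) of the value pod1MemoryHigh works out to, assuming the common 4KiB page size; os.Getpagesize() can differ by platform:

package kuberuntime_sketch

import (
	"math"
	"testing"
)

// Pins down the rewritten test's expected value for pod1:
// floor((128Mi + 0.9*(256Mi-128Mi))/4096) * 4096.
func TestPod1MemoryHighValue(t *testing.T) {
	request, limit, pageSize := int64(128<<20), int64(256<<20), int64(4096)
	got := int64(math.Floor(
		float64(request)+
			(float64(limit)-float64(request))*0.9)/float64(pageSize)) * pageSize
	if want := int64(255012864); got != want { // ~243.2Mi, page-aligned
		t.Fatalf("pod1MemoryHigh = %d, want %d", got, want)
	}
}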


@@ -776,7 +776,7 @@ type KubeletConfiguration struct {
 	// Decreasing this factor will set lower high limit for container cgroups and put heavier reclaim pressure
 	// while increasing will put less reclaim pressure.
 	// See https://kep.k8s.io/2570 for more details.
-	// Default: 0.8
+	// Default: 0.9
 	// +featureGate=MemoryQoS
 	// +optional
 	MemoryThrottlingFactor *float64 `json:"memoryThrottlingFactor,omitempty"`