From 7dab6253e1d644c28200db57d215afda9dc7182c Mon Sep 17 00:00:00 2001
From: Paco Xu
Date: Sat, 28 Jan 2023 17:35:29 +0800
Subject: [PATCH 1/2] default memoryThrottlingFactor to 0.9 and optimize the
 memory.high calculation formulas

---
 pkg/generated/openapi/zz_generated.openapi.go        |  2 +-
 .../KubeletConfiguration/after/v1beta1.yaml          |  2 +-
 .../roundtrip/default/v1beta1.yaml                   |  2 +-
 pkg/kubelet/apis/config/types.go                     |  2 +-
 pkg/kubelet/apis/config/v1beta1/defaults.go          |  2 +-
 .../apis/config/validation/validation_test.go        |  2 +-
 pkg/kubelet/kuberuntime/fake_kuberuntime_manager.go  |  2 +-
 .../kuberuntime/kuberuntime_container_linux.go       |  4 ++--
 .../kuberuntime/kuberuntime_container_linux_test.go  | 13 ++++++++-----
 staging/src/k8s.io/kubelet/config/v1beta1/types.go   |  2 +-
 10 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/pkg/generated/openapi/zz_generated.openapi.go b/pkg/generated/openapi/zz_generated.openapi.go
index 3c9bca19fe1..25aa97fb182 100644
--- a/pkg/generated/openapi/zz_generated.openapi.go
+++ b/pkg/generated/openapi/zz_generated.openapi.go
@@ -58071,7 +58071,7 @@ func schema_k8sio_kubelet_config_v1beta1_KubeletConfiguration(ref common.Referen
 				},
 				"memoryThrottlingFactor": {
 					SchemaProps: spec.SchemaProps{
-						Description: "MemoryThrottlingFactor specifies the factor multiplied by the memory limit or node allocatable memory when setting the cgroupv2 memory.high value to enforce MemoryQoS. Decreasing this factor will set lower high limit for container cgroups and put heavier reclaim pressure while increasing will put less reclaim pressure. See https://kep.k8s.io/2570 for more details. Default: 0.8",
+						Description: "MemoryThrottlingFactor specifies the factor multiplied by the memory limit or node allocatable memory when setting the cgroupv2 memory.high value to enforce MemoryQoS. Decreasing this factor will set lower high limit for container cgroups and put heavier reclaim pressure while increasing will put less reclaim pressure. See https://kep.k8s.io/2570 for more details. Default: 0.9",
 						Type:        []string{"number"},
 						Format:      "double",
 					},
diff --git a/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/after/v1beta1.yaml b/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/after/v1beta1.yaml
index 401cc7b8eb4..3dd905dbc24 100644
--- a/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/after/v1beta1.yaml
+++ b/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/after/v1beta1.yaml
@@ -61,7 +61,7 @@ maxOpenFiles: 1000000
 maxPods: 110
 memoryManagerPolicy: None
 memorySwap: {}
-memoryThrottlingFactor: 0.8
+memoryThrottlingFactor: 0.9
 nodeLeaseDurationSeconds: 40
 nodeStatusMaxImages: 50
 nodeStatusReportFrequency: 5m0s
diff --git a/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/roundtrip/default/v1beta1.yaml b/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/roundtrip/default/v1beta1.yaml
index 401cc7b8eb4..3dd905dbc24 100644
--- a/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/roundtrip/default/v1beta1.yaml
+++ b/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/roundtrip/default/v1beta1.yaml
@@ -61,7 +61,7 @@ maxOpenFiles: 1000000
 maxPods: 110
 memoryManagerPolicy: None
 memorySwap: {}
-memoryThrottlingFactor: 0.8
+memoryThrottlingFactor: 0.9
 nodeLeaseDurationSeconds: 40
 nodeStatusMaxImages: 50
 nodeStatusReportFrequency: 5m0s
diff --git a/pkg/kubelet/apis/config/types.go b/pkg/kubelet/apis/config/types.go
index 526d89eb6e1..e9f4c335d9b 100644
--- a/pkg/kubelet/apis/config/types.go
+++ b/pkg/kubelet/apis/config/types.go
@@ -440,7 +440,7 @@ type KubeletConfiguration struct {
 	// Decreasing this factor will set lower high limit for container cgroups and put heavier reclaim pressure
 	// while increasing will put less reclaim pressure.
 	// See https://kep.k8s.io/2570 for more details.
-	// Default: 0.8
+	// Default: 0.9
 	// +featureGate=MemoryQoS
 	// +optional
 	MemoryThrottlingFactor *float64
diff --git a/pkg/kubelet/apis/config/v1beta1/defaults.go b/pkg/kubelet/apis/config/v1beta1/defaults.go
index 37f714b2227..ee2e5870b07 100644
--- a/pkg/kubelet/apis/config/v1beta1/defaults.go
+++ b/pkg/kubelet/apis/config/v1beta1/defaults.go
@@ -38,7 +38,7 @@ const (
 	DefaultVolumePluginDir = "/usr/libexec/kubernetes/kubelet-plugins/volume/exec/"
 
 	// See https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2570-memory-qos
-	DefaultMemoryThrottlingFactor = 0.8
+	DefaultMemoryThrottlingFactor = 0.9
 )
 
 var (
diff --git a/pkg/kubelet/apis/config/validation/validation_test.go b/pkg/kubelet/apis/config/validation/validation_test.go
index db86577b6b5..611d9db62d6 100644
--- a/pkg/kubelet/apis/config/validation/validation_test.go
+++ b/pkg/kubelet/apis/config/validation/validation_test.go
@@ -65,7 +65,7 @@ var (
 		TopologyManagerPolicy:           kubeletconfig.SingleNumaNodeTopologyManagerPolicy,
 		ShutdownGracePeriod:             metav1.Duration{Duration: 30 * time.Second},
 		ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second},
-		MemoryThrottlingFactor:          utilpointer.Float64(0.8),
+		MemoryThrottlingFactor:          utilpointer.Float64(0.9),
 		FeatureGates: map[string]bool{
 			"CustomCPUCFSQuotaPeriod": true,
 			"GracefulNodeShutdown":    true,
diff --git a/pkg/kubelet/kuberuntime/fake_kuberuntime_manager.go b/pkg/kubelet/kuberuntime/fake_kuberuntime_manager.go
index 30a340c0952..37479dd5b58 100644
--- a/pkg/kubelet/kuberuntime/fake_kuberuntime_manager.go
+++ b/pkg/kubelet/kuberuntime/fake_kuberuntime_manager.go
@@ -113,7 +113,7 @@ func newFakeKubeRuntimeManager(runtimeService internalapi.RuntimeService, imageS
 		internalLifecycle:      cm.NewFakeInternalContainerLifecycle(),
 		logReduction:           logreduction.NewLogReduction(identicalErrorDelay),
 		logManager:             logManager,
-		memoryThrottlingFactor: 0.8,
+		memoryThrottlingFactor: 0.9,
 	}
 
 	typedVersion, err := runtimeService.Version(ctx, kubeRuntimeAPIVersion)
diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go
index 202ff2bce48..af140001e07 100644
--- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go
+++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go
@@ -118,12 +118,12 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
 		// for container level cgroup.
 		memoryHigh := int64(0)
 		if memoryLimit != 0 {
-			memoryHigh = int64(float64(memoryLimit) * m.memoryThrottlingFactor)
+			memoryHigh = int64(float64(memoryRequest) + (float64(memoryLimit)-float64(memoryRequest))*m.memoryThrottlingFactor)
 		} else {
 			allocatable := m.getNodeAllocatable()
 			allocatableMemory, ok := allocatable[v1.ResourceMemory]
 			if ok && allocatableMemory.Value() > 0 {
-				memoryHigh = int64(float64(allocatableMemory.Value()) * m.memoryThrottlingFactor)
+				memoryHigh = int64(float64(memoryRequest) + (float64(allocatableMemory.Value())-float64(memoryRequest))*m.memoryThrottlingFactor)
 			}
 		}
 		if memoryHigh > memoryRequest {
diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go
index 82b3ab16ff0..99c790da417 100644
--- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go
+++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go
@@ -307,6 +307,8 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
 	_, _, m, err := createTestRuntimeManager()
 	assert.NoError(t, err)
 
+	podRequestMemory := resource.MustParse("128Mi")
+	pod1LimitMemory := resource.MustParse("256Mi")
 	pod1 := &v1.Pod{
 		ObjectMeta: metav1.ObjectMeta{
 			UID:       "12345678",
@@ -323,10 +325,10 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
 					WorkingDir: "testWorkingDir",
 					Resources: v1.ResourceRequirements{
 						Requests: v1.ResourceList{
-							v1.ResourceMemory: resource.MustParse("128Mi"),
+							v1.ResourceMemory: podRequestMemory,
 						},
 						Limits: v1.ResourceList{
-							v1.ResourceMemory: resource.MustParse("256Mi"),
+							v1.ResourceMemory: pod1LimitMemory,
 						},
 					},
 				},
@@ -350,7 +352,7 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
 					WorkingDir: "testWorkingDir",
 					Resources: v1.ResourceRequirements{
 						Requests: v1.ResourceList{
-							v1.ResourceMemory: resource.MustParse("128Mi"),
+							v1.ResourceMemory: podRequestMemory,
 						},
 					},
 				},
@@ -358,7 +360,8 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
 			},
 		},
 	}
 	memoryNodeAllocatable := resource.MustParse(fakeNodeAllocatableMemory)
-	pod2MemoryHigh := float64(memoryNodeAllocatable.Value()) * m.memoryThrottlingFactor
+	pod1MemoryHigh := float64(podRequestMemory.Value()) + (float64(pod1LimitMemory.Value())-float64(podRequestMemory.Value()))*m.memoryThrottlingFactor
+	pod2MemoryHigh := float64(podRequestMemory.Value()) + (float64(memoryNodeAllocatable.Value())-float64(podRequestMemory.Value()))*m.memoryThrottlingFactor
 	type expectedResult struct {
 		containerConfig *runtimeapi.LinuxContainerConfig
@@ -378,7 +381,7 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
 			expected: &expectedResult{
 				l1,
 				128 * 1024 * 1024,
-				int64(float64(256*1024*1024) * m.memoryThrottlingFactor),
+				int64(pod1MemoryHigh),
 			},
 		},
 		{
diff --git a/staging/src/k8s.io/kubelet/config/v1beta1/types.go b/staging/src/k8s.io/kubelet/config/v1beta1/types.go
index 0e80321791e..97b3ab29243 100644
--- a/staging/src/k8s.io/kubelet/config/v1beta1/types.go
+++ b/staging/src/k8s.io/kubelet/config/v1beta1/types.go
@@ -778,7 +778,7 @@ type KubeletConfiguration struct {
 	// Decreasing this factor will set lower high limit for container cgroups and put heavier reclaim pressure
 	// while increasing will put less reclaim pressure.
 	// See https://kep.k8s.io/2570 for more details.
-	// Default: 0.8
+	// Default: 0.9
 	// +featureGate=MemoryQoS
 	// +optional
 	MemoryThrottlingFactor *float64 `json:"memoryThrottlingFactor,omitempty"`

From 81c5a122c37e5b25ca0880c2edec209896dbd871 Mon Sep 17 00:00:00 2001
From: Paco Xu
Date: Wed, 8 Feb 2023 13:47:30 +0800
Subject: [PATCH 2/2] add pageSize to memory.high formula

---
 .../kuberuntime_container_linux.go       | 43 ++++++++++++-------
 .../kuberuntime_container_linux_test.go  | 11 ++++-
 2 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go
index af140001e07..3cb9c968fb1 100644
--- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go
+++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go
@@ -20,6 +20,8 @@ limitations under the License.
 package kuberuntime
 
 import (
+	"math"
+	"os"
 	"strconv"
 	"time"
 
@@ -37,6 +39,8 @@ import (
 	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
 )
 
+var defaultPageSize = int64(os.Getpagesize())
+
 // applyPlatformSpecificContainerConfig applies platform specific configurations to runtimeapi.ContainerConfig.
 func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config *runtimeapi.ContainerConfig, container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID) error {
 	enforceMemoryQoS := false
@@ -112,22 +116,31 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
 			unified[cm.MemoryMin] = strconv.FormatInt(memoryRequest, 10)
 		}
 
-		// If container sets limits.memory, we set memory.high=pod.spec.containers[i].resources.limits[memory] * memory_throttling_factor
-		// for container level cgroup if memory.high>memory.min.
-		// If container doesn't set limits.memory, we set memory.high=node_allocatable_memory * memory_throttling_factor
-		// for container level cgroup.
-		memoryHigh := int64(0)
-		if memoryLimit != 0 {
-			memoryHigh = int64(float64(memoryRequest) + (float64(memoryLimit)-float64(memoryRequest))*m.memoryThrottlingFactor)
-		} else {
-			allocatable := m.getNodeAllocatable()
-			allocatableMemory, ok := allocatable[v1.ResourceMemory]
-			if ok && allocatableMemory.Value() > 0 {
-				memoryHigh = int64(float64(memoryRequest) + (float64(allocatableMemory.Value())-float64(memoryRequest))*m.memoryThrottlingFactor)
+		// Guaranteed pods by their QoS definition requires that memory request equals memory limit and cpu request must equal cpu limit.
+		// Here, we only check from memory perspective. Hence MemoryQoS feature is disabled on those QoS pods by not setting memory.high.
+		if memoryRequest != memoryLimit {
+			// The formula for memory.high for container cgroup is modified in Alpha stage of the feature in K8s v1.27.
+			// It will be set based on formula:
+			// `memory.high=floor[(requests.memory + memory throttling factor * (limits.memory or node allocatable memory - requests.memory))/pageSize] * pageSize`
+			// where default value of memory throttling factor is set to 0.9
+			// More info: https://git.k8s.io/enhancements/keps/sig-node/2570-memory-qos
+			memoryHigh := int64(0)
+			if memoryLimit != 0 {
+				memoryHigh = int64(math.Floor(
+					float64(memoryRequest)+
+						(float64(memoryLimit)-float64(memoryRequest))*float64(m.memoryThrottlingFactor))/float64(defaultPageSize)) * defaultPageSize
+			} else {
+				allocatable := m.getNodeAllocatable()
+				allocatableMemory, ok := allocatable[v1.ResourceMemory]
+				if ok && allocatableMemory.Value() > 0 {
+					memoryHigh = int64(math.Floor(
+						float64(memoryRequest)+
+							(float64(allocatableMemory.Value())-float64(memoryRequest))*float64(m.memoryThrottlingFactor))/float64(defaultPageSize)) * defaultPageSize
+				}
+			}
+			if memoryHigh != 0 && memoryHigh > memoryRequest {
+				unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
+			}
 			}
-		}
-		if memoryHigh > memoryRequest {
-			unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
 		}
 		if len(unified) > 0 {
 			if lcr.Unified == nil {
diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go
index 99c790da417..9d1e0d40871 100644
--- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go
+++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go
@@ -21,6 +21,8 @@ package kuberuntime
 
 import (
 	"context"
+	"math"
+	"os"
 	"reflect"
 	"strconv"
 	"testing"
@@ -359,9 +361,14 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
 			},
 		},
 	}
+	pageSize := int64(os.Getpagesize())
 	memoryNodeAllocatable := resource.MustParse(fakeNodeAllocatableMemory)
-	pod1MemoryHigh := float64(podRequestMemory.Value()) + (float64(pod1LimitMemory.Value())-float64(podRequestMemory.Value()))*m.memoryThrottlingFactor
-	pod2MemoryHigh := float64(podRequestMemory.Value()) + (float64(memoryNodeAllocatable.Value())-float64(podRequestMemory.Value()))*m.memoryThrottlingFactor
+	pod1MemoryHigh := int64(math.Floor(
+		float64(podRequestMemory.Value())+
+			(float64(pod1LimitMemory.Value())-float64(podRequestMemory.Value()))*float64(m.memoryThrottlingFactor))/float64(pageSize)) * pageSize
+	pod2MemoryHigh := int64(math.Floor(
+		float64(podRequestMemory.Value())+
+			(float64(memoryNodeAllocatable.Value())-float64(podRequestMemory.Value()))*float64(m.memoryThrottlingFactor))/float64(pageSize)) * pageSize
 	type expectedResult struct {
 		containerConfig *runtimeapi.LinuxContainerConfig
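
The net effect of the two patches on the container-level memory.high value can be checked with a short standalone sketch (not part of the patch). It plugs the request and limit used by the test pod above into the formula from PATCH 2/2, with the new default memoryThrottlingFactor of 0.9. The helper name `memoryHigh`, the `main` wrapper, and the 4 KiB page size used in the worked numbers are illustrative assumptions, not kubelet code:

```go
package main

import (
	"fmt"
	"math"
	"os"
)

// memoryHigh mirrors the formula added in PATCH 2/2 for the case where the
// container sets limits.memory; for the no-limit case the patch substitutes
// node allocatable memory for the limit.
// memory.high = floor[(request + factor*(limit-request)) / pageSize] * pageSize
func memoryHigh(request, limit int64, factor float64, pageSize int64) int64 {
	return int64(math.Floor(
		float64(request)+
			(float64(limit)-float64(request))*factor)/float64(pageSize)) * pageSize
}

func main() {
	request := int64(128 * 1024 * 1024) // 128Mi, as in the test pod
	limit := int64(256 * 1024 * 1024)   // 256Mi
	factor := 0.9                       // new default memoryThrottlingFactor
	pageSize := int64(os.Getpagesize()) // assumed to be 4096 in the numbers below

	// 128Mi + 0.9*(256Mi-128Mi) = 255013683.2 bytes; flooring to a 4 KiB page
	// boundary gives 255012864 bytes (about 243.2Mi), which lands between the
	// request and the limit.
	fmt.Println(memoryHigh(request, limit, factor, pageSize))
}
```

Two properties of the new formula are visible in the worked example. Because it interpolates between the request and the limit instead of scaling the limit alone, memory.high now tends to sit above memory.min rather than below it when requests are close to limits (under the old `limit * factor` formula the kubelet had to skip memory.high entirely whenever the product fell at or below the request), and containers with `memoryRequest == memoryLimit` are skipped altogether, as the new `if memoryRequest != memoryLimit` guard in the patch shows.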