Merge pull request #115371 from pacoxu/cgroup-v2-memory-tuning

default memoryThrottlingFactor to 0.9 and optimize the memory.high formulas
commit 625b8be09e
Kubernetes Prow Robot authored on 2023-03-08 18:46:00 -08:00, committed by GitHub
GPG Key ID: 4AEE18F83AFDEB23 (no known key found for this signature in database)
10 changed files with 51 additions and 28 deletions

View File

@@ -58207,7 +58207,7 @@ func schema_k8sio_kubelet_config_v1beta1_KubeletConfiguration(ref common.Referen
             },
             "memoryThrottlingFactor": {
                 SchemaProps: spec.SchemaProps{
-                    Description: "MemoryThrottlingFactor specifies the factor multiplied by the memory limit or node allocatable memory when setting the cgroupv2 memory.high value to enforce MemoryQoS. Decreasing this factor will set lower high limit for container cgroups and put heavier reclaim pressure while increasing will put less reclaim pressure. See https://kep.k8s.io/2570 for more details. Default: 0.8",
+                    Description: "MemoryThrottlingFactor specifies the factor multiplied by the memory limit or node allocatable memory when setting the cgroupv2 memory.high value to enforce MemoryQoS. Decreasing this factor will set lower high limit for container cgroups and put heavier reclaim pressure while increasing will put less reclaim pressure. See https://kep.k8s.io/2570 for more details. Default: 0.9",
                     Type:        []string{"number"},
                     Format:      "double",
                 },

View File

@@ -61,7 +61,7 @@ maxOpenFiles: 1000000
 maxPods: 110
 memoryManagerPolicy: None
 memorySwap: {}
-memoryThrottlingFactor: 0.8
+memoryThrottlingFactor: 0.9
 nodeLeaseDurationSeconds: 40
 nodeStatusMaxImages: 50
 nodeStatusReportFrequency: 5m0s

View File

@@ -61,7 +61,7 @@ maxOpenFiles: 1000000
 maxPods: 110
 memoryManagerPolicy: None
 memorySwap: {}
-memoryThrottlingFactor: 0.8
+memoryThrottlingFactor: 0.9
 nodeLeaseDurationSeconds: 40
 nodeStatusMaxImages: 50
 nodeStatusReportFrequency: 5m0s

View File

@@ -438,7 +438,7 @@ type KubeletConfiguration struct {
     // Decreasing this factor will set lower high limit for container cgroups and put heavier reclaim pressure
     // while increasing will put less reclaim pressure.
     // See https://kep.k8s.io/2570 for more details.
-    // Default: 0.8
+    // Default: 0.9
     // +featureGate=MemoryQoS
     // +optional
     MemoryThrottlingFactor *float64
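
To make the doc comment concrete, here is a rough, illustrative sketch (not code from this PR): it applies the request-aware memory.high formula introduced later in this commit to a hypothetical container with a 512Mi request and a 1Gi limit, with page-size rounding omitted, to show why the higher 0.9 default puts less reclaim pressure on containers than the old 0.8.

package main

import "fmt"

// memoryHigh follows the shape of the new formula (page alignment omitted):
// memory.high = requests.memory + factor * (limits.memory - requests.memory)
func memoryHigh(request, limit int64, factor float64) int64 {
    return request + int64(factor*float64(limit-request))
}

func main() {
    request := int64(512 << 20) // 512Mi, hypothetical values
    limit := int64(1 << 30)     // 1Gi
    fmt.Println(memoryHigh(request, limit, 0.8)) // 966367641  (~921.6Mi): old default, reclaim kicks in earlier
    fmt.Println(memoryHigh(request, limit, 0.9)) // 1020054732 (~972.8Mi): new default, more headroom before throttling
}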

View File

@@ -38,7 +38,7 @@ const (
     DefaultVolumePluginDir = "/usr/libexec/kubernetes/kubelet-plugins/volume/exec/"
     // See https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2570-memory-qos
-    DefaultMemoryThrottlingFactor = 0.8
+    DefaultMemoryThrottlingFactor = 0.9
 )
 var (

View File

@@ -65,7 +65,7 @@ var (
     TopologyManagerPolicy:           kubeletconfig.SingleNumaNodeTopologyManagerPolicy,
     ShutdownGracePeriod:             metav1.Duration{Duration: 30 * time.Second},
     ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second},
-    MemoryThrottlingFactor:          utilpointer.Float64(0.8),
+    MemoryThrottlingFactor:          utilpointer.Float64(0.9),
     FeatureGates: map[string]bool{
         "CustomCPUCFSQuotaPeriod": true,
         "GracefulNodeShutdown":    true,

View File

@@ -113,7 +113,7 @@ func newFakeKubeRuntimeManager(runtimeService internalapi.RuntimeService, imageS
         internalLifecycle:      cm.NewFakeInternalContainerLifecycle(),
         logReduction:           logreduction.NewLogReduction(identicalErrorDelay),
         logManager:             logManager,
-        memoryThrottlingFactor: 0.8,
+        memoryThrottlingFactor: 0.9,
     }
     typedVersion, err := runtimeService.Version(ctx, kubeRuntimeAPIVersion)

View File

@@ -20,6 +20,8 @@ limitations under the License.
 package kuberuntime
 
 import (
+    "math"
+    "os"
     "strconv"
     "time"
@@ -37,6 +39,8 @@ import (
     kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
 )
 
+var defaultPageSize = int64(os.Getpagesize())
+
 // applyPlatformSpecificContainerConfig applies platform specific configurations to runtimeapi.ContainerConfig.
 func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config *runtimeapi.ContainerConfig, container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID) error {
     enforceMemoryQoS := false
@@ -112,22 +116,31 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
         unified[cm.MemoryMin] = strconv.FormatInt(memoryRequest, 10)
     }
-    // If container sets limits.memory, we set memory.high=pod.spec.containers[i].resources.limits[memory] * memory_throttling_factor
-    // for container level cgroup if memory.high>memory.min.
-    // If container doesn't set limits.memory, we set memory.high=node_allocatable_memory * memory_throttling_factor
-    // for container level cgroup.
-    memoryHigh := int64(0)
-    if memoryLimit != 0 {
-        memoryHigh = int64(float64(memoryLimit) * m.memoryThrottlingFactor)
-    } else {
-        allocatable := m.getNodeAllocatable()
-        allocatableMemory, ok := allocatable[v1.ResourceMemory]
-        if ok && allocatableMemory.Value() > 0 {
-            memoryHigh = int64(float64(allocatableMemory.Value()) * m.memoryThrottlingFactor)
-        }
-    }
-    if memoryHigh > memoryRequest {
-        unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
+    // Guaranteed pods by their QoS definition requires that memory request equals memory limit and cpu request must equal cpu limit.
+    // Here, we only check from memory perspective. Hence MemoryQoS feature is disabled on those QoS pods by not setting memory.high.
+    if memoryRequest != memoryLimit {
+        // The formula for memory.high for container cgroup is modified in Alpha stage of the feature in K8s v1.27.
+        // It will be set based on formula:
+        // `memory.high=floor[(requests.memory + memory throttling factor * (limits.memory or node allocatable memory - requests.memory))/pageSize] * pageSize`
+        // where default value of memory throttling factor is set to 0.9
+        // More info: https://git.k8s.io/enhancements/keps/sig-node/2570-memory-qos
+        memoryHigh := int64(0)
+        if memoryLimit != 0 {
+            memoryHigh = int64(math.Floor(
+                float64(memoryRequest)+
+                    (float64(memoryLimit)-float64(memoryRequest))*float64(m.memoryThrottlingFactor))/float64(defaultPageSize)) * defaultPageSize
+        } else {
+            allocatable := m.getNodeAllocatable()
+            allocatableMemory, ok := allocatable[v1.ResourceMemory]
+            if ok && allocatableMemory.Value() > 0 {
+                memoryHigh = int64(math.Floor(
+                    float64(memoryRequest)+
+                        (float64(allocatableMemory.Value())-float64(memoryRequest))*float64(m.memoryThrottlingFactor))/float64(defaultPageSize)) * defaultPageSize
+            }
+        }
+        if memoryHigh != 0 && memoryHigh > memoryRequest {
+            unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
+        }
     }
 
     if len(unified) > 0 {
         if lcr.Unified == nil {
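
To see the new calculation end to end, the following self-contained sketch reimplements it outside the kubelet; memoryHighFor, the sample request/limit values, and the 4 KiB page size mentioned in the comments are illustrative assumptions, while the guard conditions and the floor-to-page-boundary arithmetic mirror the hunk above.

package main

import (
    "fmt"
    "math"
    "os"
)

// memoryHighFor mirrors the formula from the hunk above:
// memory.high = floor[(requests.memory + factor * (limits.memory or node allocatable - requests.memory)) / pageSize] * pageSize
func memoryHighFor(request, limit, nodeAllocatable int64, factor float64, pageSize int64) int64 {
    if request == limit {
        return 0 // Guaranteed pods: memory.high is left unset, so MemoryQoS throttling does not apply to them
    }
    upper := limit
    if limit == 0 {
        upper = nodeAllocatable // no memory limit: fall back to node allocatable memory
    }
    high := int64(math.Floor(
        float64(request)+(float64(upper)-float64(request))*factor)/float64(pageSize)) * pageSize
    if high > request {
        return high
    }
    return 0 // never set memory.high at or below the request
}

func main() {
    pageSize := int64(os.Getpagesize()) // typically 4096
    request := int64(128 << 20)         // 128Mi
    limit := int64(256 << 20)           // 256Mi
    // 128Mi + 0.9*(256Mi-128Mi) = 255013683.2 bytes; rounded down to a 4KiB page
    // boundary this comes to 254980864 (~243Mi).
    fmt.Println(memoryHighFor(request, limit, 0, 0.9, pageSize))
}

Rounding down to a page boundary keeps the value aligned with how the kernel accounts memory, and the request != limit guard is what disables memory.high for Guaranteed pods, as the new comment in the hunk explains.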

View File

@@ -21,6 +21,8 @@ package kuberuntime
 import (
     "context"
+    "math"
+    "os"
     "reflect"
     "strconv"
     "testing"
@@ -307,6 +309,8 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
     _, _, m, err := createTestRuntimeManager()
     assert.NoError(t, err)
+    podRequestMemory := resource.MustParse("128Mi")
+    pod1LimitMemory := resource.MustParse("256Mi")
     pod1 := &v1.Pod{
         ObjectMeta: metav1.ObjectMeta{
             UID: "12345678",
@@ -323,10 +327,10 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
                 WorkingDir: "testWorkingDir",
                 Resources: v1.ResourceRequirements{
                     Requests: v1.ResourceList{
-                        v1.ResourceMemory: resource.MustParse("128Mi"),
+                        v1.ResourceMemory: podRequestMemory,
                     },
                     Limits: v1.ResourceList{
-                        v1.ResourceMemory: resource.MustParse("256Mi"),
+                        v1.ResourceMemory: pod1LimitMemory,
                     },
                 },
             },
@@ -350,15 +354,21 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
                 WorkingDir: "testWorkingDir",
                 Resources: v1.ResourceRequirements{
                     Requests: v1.ResourceList{
-                        v1.ResourceMemory: resource.MustParse("128Mi"),
+                        v1.ResourceMemory: podRequestMemory,
                    },
                 },
             },
         },
     },
 }
+    pageSize := int64(os.Getpagesize())
     memoryNodeAllocatable := resource.MustParse(fakeNodeAllocatableMemory)
-    pod2MemoryHigh := float64(memoryNodeAllocatable.Value()) * m.memoryThrottlingFactor
+    pod1MemoryHigh := int64(math.Floor(
+        float64(podRequestMemory.Value())+
+            (float64(pod1LimitMemory.Value())-float64(podRequestMemory.Value()))*float64(m.memoryThrottlingFactor))/float64(pageSize)) * pageSize
+    pod2MemoryHigh := int64(math.Floor(
+        float64(podRequestMemory.Value())+
+            (float64(memoryNodeAllocatable.Value())-float64(podRequestMemory.Value()))*float64(m.memoryThrottlingFactor))/float64(pageSize)) * pageSize
     type expectedResult struct {
         containerConfig *runtimeapi.LinuxContainerConfig
@@ -378,7 +388,7 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
             expected: &expectedResult{
                 l1,
                 128 * 1024 * 1024,
-                int64(float64(256*1024*1024) * m.memoryThrottlingFactor),
+                int64(pod1MemoryHigh),
             },
         },
         {
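
For concreteness: with the fixture values above (128Mi request, 256Mi limit) and the 0.9 default, pod1MemoryHigh is floor((134217728 + 0.9 × (268435456 − 134217728)) / pageSize) × pageSize; assuming the common 4 KiB page size this comes to 254980864 bytes (roughly 243Mi), whereas the old limit × factor expectation with no page alignment would give 241591910 at the same 0.9 factor. pod2MemoryHigh is derived the same way from fakeNodeAllocatableMemory, since the second container sets a memory request but no limit.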

View File

@@ -776,7 +776,7 @@ type KubeletConfiguration struct {
     // Decreasing this factor will set lower high limit for container cgroups and put heavier reclaim pressure
     // while increasing will put less reclaim pressure.
     // See https://kep.k8s.io/2570 for more details.
-    // Default: 0.8
+    // Default: 0.9
     // +featureGate=MemoryQoS
     // +optional
     MemoryThrottlingFactor *float64 `json:"memoryThrottlingFactor,omitempty"`