diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go
index af140001e07..3cb9c968fb1 100644
--- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go
+++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go
@@ -20,6 +20,8 @@ limitations under the License.
 package kuberuntime
 
 import (
+	"math"
+	"os"
 	"strconv"
 	"time"
 
@@ -37,6 +39,8 @@ import (
 	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
 )
 
+var defaultPageSize = int64(os.Getpagesize())
+
 // applyPlatformSpecificContainerConfig applies platform specific configurations to runtimeapi.ContainerConfig.
 func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config *runtimeapi.ContainerConfig, container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID) error {
 	enforceMemoryQoS := false
@@ -112,22 +116,31 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
 			unified[cm.MemoryMin] = strconv.FormatInt(memoryRequest, 10)
 		}
 
-		// If container sets limits.memory, we set memory.high=pod.spec.containers[i].resources.limits[memory] * memory_throttling_factor
-		// for container level cgroup if memory.high>memory.min.
-		// If container doesn't set limits.memory, we set memory.high=node_allocatable_memory * memory_throttling_factor
-		// for container level cgroup.
-		memoryHigh := int64(0)
-		if memoryLimit != 0 {
-			memoryHigh = int64(float64(memoryRequest) + (float64(memoryLimit)-float64(memoryRequest))*m.memoryThrottlingFactor)
-		} else {
-			allocatable := m.getNodeAllocatable()
-			allocatableMemory, ok := allocatable[v1.ResourceMemory]
-			if ok && allocatableMemory.Value() > 0 {
-				memoryHigh = int64(float64(memoryRequest) + (float64(allocatableMemory.Value())-float64(memoryRequest))*m.memoryThrottlingFactor)
+		// By their QoS definition, Guaranteed pods require that memory request equals memory limit and that cpu request equals cpu limit.
+		// Here, we only check from the memory perspective; hence the MemoryQoS feature is disabled on those pods by not setting memory.high.
+		if memoryRequest != memoryLimit {
+			// The formula for memory.high for the container cgroup was modified in the Alpha stage of the feature in K8s v1.27.
+			// It is set based on the formula:
+			// `memory.high=floor[(requests.memory + memory throttling factor * (limits.memory or node allocatable memory - requests.memory))/pageSize] * pageSize`
+			// where the default value of the memory throttling factor is 0.9.
+			// More info: https://git.k8s.io/enhancements/keps/sig-node/2570-memory-qos
+			memoryHigh := int64(0)
+			if memoryLimit != 0 {
+				memoryHigh = int64(math.Floor(
+					float64(memoryRequest)+
+						(float64(memoryLimit)-float64(memoryRequest))*float64(m.memoryThrottlingFactor))/float64(defaultPageSize)) * defaultPageSize
+			} else {
+				allocatable := m.getNodeAllocatable()
+				allocatableMemory, ok := allocatable[v1.ResourceMemory]
+				if ok && allocatableMemory.Value() > 0 {
+					memoryHigh = int64(math.Floor(
+						float64(memoryRequest)+
+							(float64(allocatableMemory.Value())-float64(memoryRequest))*float64(m.memoryThrottlingFactor))/float64(defaultPageSize)) * defaultPageSize
+				}
+			}
+			if memoryHigh != 0 && memoryHigh > memoryRequest {
+				unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
 			}
-		}
-		if memoryHigh > memoryRequest {
-			unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
 		}
 		if len(unified) > 0 {
 			if lcr.Unified == nil {
diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go
index 99c790da417..9d1e0d40871 100644
--- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go
+++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux_test.go
@@ -21,6 +21,8 @@ package kuberuntime
 
 import (
 	"context"
+	"math"
+	"os"
 	"reflect"
 	"strconv"
 	"testing"
@@ -359,9 +361,14 @@ func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
 			},
 		},
 	}
+	pageSize := int64(os.Getpagesize())
 	memoryNodeAllocatable := resource.MustParse(fakeNodeAllocatableMemory)
-	pod1MemoryHigh := float64(podRequestMemory.Value()) + (float64(pod1LimitMemory.Value())-float64(podRequestMemory.Value()))*m.memoryThrottlingFactor
-	pod2MemoryHigh := float64(podRequestMemory.Value()) + (float64(memoryNodeAllocatable.Value())-float64(podRequestMemory.Value()))*m.memoryThrottlingFactor
+	pod1MemoryHigh := int64(math.Floor(
+		float64(podRequestMemory.Value())+
+			(float64(pod1LimitMemory.Value())-float64(podRequestMemory.Value()))*float64(m.memoryThrottlingFactor))/float64(pageSize)) * pageSize
+	pod2MemoryHigh := int64(math.Floor(
+		float64(podRequestMemory.Value())+
+			(float64(memoryNodeAllocatable.Value())-float64(podRequestMemory.Value()))*float64(m.memoryThrottlingFactor))/float64(pageSize)) * pageSize
 
 	type expectedResult struct {
 		containerConfig *runtimeapi.LinuxContainerConfig
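A minimal standalone sketch of the page-aligned memory.high formula introduced above; the memoryHigh helper and the request/limit values are hypothetical and not part of the diff, but the arithmetic mirrors generateLinuxContainerResources:

package main

import (
	"fmt"
	"math"
	"os"
)

// memoryHigh is a hypothetical helper mirroring the formula from the diff:
//   memory.high = floor[(request + factor*(limit - request)) / pageSize] * pageSize
func memoryHigh(request, limit int64, factor float64, pageSize int64) int64 {
	raw := float64(request) + (float64(limit)-float64(request))*factor
	// int64() truncates toward zero, so for positive values this is the same
	// floor-to-page-boundary computation used in the kubelet code above.
	return int64(math.Floor(raw)/float64(pageSize)) * pageSize
}

func main() {
	pageSize := int64(os.Getpagesize()) // commonly 4096
	// Hypothetical container: 100 MB request, 250 MB limit, default factor 0.9.
	high := memoryHigh(100_000_000, 250_000_000, 0.9, pageSize)
	// Raw value: 100e6 + 0.9*(250e6-100e6) = 235000000, which is not a
	// multiple of 4096; flooring to the page boundary yields 234999808.
	fmt.Println(high)
}

The alignment step is the substance of this change: the kernel manages memory in page-sized units, so the kubelet now rounds memory.high down to a page boundary itself instead of writing an unaligned value, and the tests compute their expected values with the same floor-and-multiply expression.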