feature: support Memory QoS for cgroups v2

Li Bo 2021-06-17 20:11:58 +08:00
parent f915aa39e8
commit c3d9b10ca8
38 changed files with 796 additions and 60 deletions

View File

@ -522,6 +522,11 @@ func run(ctx context.Context, s *options.KubeletServer, kubeDeps *kubelet.Depend
return err
}
// Warn if MemoryQoS enabled with cgroups v1
if utilfeature.DefaultFeatureGate.Enabled(features.MemoryQoS) &&
!isCgroup2UnifiedMode() {
klog.InfoS("Warning: MemoryQoS feature only works with cgroups v2 on Linux, but it is enabled while the node is running with cgroups v1")
}
// Obtain Kubelet Lock File
if s.ExitOnLockContention && s.LockFilePath == "" {
return errors.New("cannot exit on lock file contention: no lock file specified")

View File

@ -19,6 +19,8 @@ package app
import (
"k8s.io/klog/v2"
"k8s.io/utils/inotify"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
)
func watchForLockfileContention(path string, done chan struct{}) error {
@ -44,3 +46,7 @@ func watchForLockfileContention(path string, done chan struct{}) error {
}()
return nil
}
func isCgroup2UnifiedMode() bool {
return libcontainercgroups.IsCgroup2UnifiedMode()
}

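The warning in run() and the helpers above hinge on a single check: whether the host has the cgroup v2 unified hierarchy mounted. The kubelet defers to libcontainercgroups.IsCgroup2UnifiedMode() for this; as a rough standalone sketch (not the library's actual implementation), the detection amounts to a statfs of /sys/fs/cgroup:

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// isCgroup2UnifiedMode reports whether /sys/fs/cgroup is mounted as cgroup2.
// Illustrative only; the kubelet uses libcontainercgroups.IsCgroup2UnifiedMode().
func isCgroup2UnifiedMode() bool {
	var st unix.Statfs_t
	if err := unix.Statfs("/sys/fs/cgroup", &st); err != nil {
		return false
	}
	return st.Type == unix.CGROUP2_SUPER_MAGIC
}

func main() {
	fmt.Println("cgroup v2 unified mode:", isCgroup2UnifiedMode())
}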
View File

@ -23,3 +23,7 @@ import "errors"
func watchForLockfileContention(path string, done chan struct{}) error {
return errors.New("kubelet unsupported in this build")
}
func isCgroup2UnifiedMode() bool {
return false
}

View File

@ -773,6 +773,13 @@ const (
// The user namespace has to be created before running kubelet.
// All the node components such as CRI need to be running in the same user namespace.
KubeletInUserNamespace featuregate.Feature = "KubeletInUserNamespace"
// owner: @xiaoxubeii
// kep: http://kep.k8s.io/2570
// alpha: v1.22
//
// Enables kubelet to support memory QoS with cgroups v2.
MemoryQoS featuregate.Feature = "MemoryQoS"
)
func init() {
@ -888,6 +895,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS
CSRDuration: {Default: true, PreRelease: featuregate.Beta},
DelegateFSGroupToCSIDriver: {Default: false, PreRelease: featuregate.Alpha},
KubeletInUserNamespace: {Default: false, PreRelease: featuregate.Alpha},
MemoryQoS: {Default: false, PreRelease: featuregate.Alpha},
// inherited features from generic apiserver, relisted here to get a conflict if it is changed
// unintentionally on either side:

View File

@ -17,6 +17,7 @@ limitations under the License.
package fuzzer
import (
"math/rand"
"time"
"github.com/google/gofuzz"
@ -29,6 +30,7 @@ import (
kubeletconfigv1beta1 "k8s.io/kubernetes/pkg/kubelet/apis/config/v1beta1"
"k8s.io/kubernetes/pkg/kubelet/qos"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
utilpointer "k8s.io/utils/pointer"
)
// Funcs returns the fuzzer functions for the kubeletconfig apis.
@ -106,6 +108,7 @@ func Funcs(codecs runtimeserializer.CodecFactory) []interface{} {
obj.Logging.Format = "text"
}
obj.EnableSystemLogHandler = true
obj.MemoryThrottlingFactor = utilpointer.Float64Ptr(rand.Float64())
},
}
}

View File

@ -249,5 +249,6 @@ var (
"VolumePluginDir",
"ShutdownGracePeriod.Duration",
"ShutdownGracePeriodCriticalPods.Duration",
"MemoryThrottlingFactor",
)
)

View File

@ -59,6 +59,7 @@ maxOpenFiles: 1000000
maxPods: 110
memoryManagerPolicy: None
memorySwap: {}
memoryThrottlingFactor: 0.8
nodeLeaseDurationSeconds: 40
nodeStatusMaxImages: 50
nodeStatusReportFrequency: 5m0s

View File

@ -59,6 +59,7 @@ maxOpenFiles: 1000000
maxPods: 110
memoryManagerPolicy: None
memorySwap: {}
memoryThrottlingFactor: 0.8
nodeLeaseDurationSeconds: 40
nodeStatusMaxImages: 50
nodeStatusReportFrequency: 5m0s

View File

@ -413,6 +413,15 @@ type KubeletConfiguration struct {
EnableDebugFlagsHandler bool
// SeccompDefault enables the use of `RuntimeDefault` as the default seccomp profile for all workloads.
SeccompDefault bool
// MemoryThrottlingFactor specifies the factor multiplied by the memory limit or node allocatable memory
// when setting the cgroupv2 memory.high value to enforce MemoryQoS.
// Decreasing this factor sets a lower memory.high limit for container cgroups and applies heavier reclaim pressure,
// while increasing it applies less reclaim pressure.
// See http://kep.k8s.io/2570 for more details.
// Default: 0.8
// +featureGate=MemoryQoS
// +optional
MemoryThrottlingFactor *float64
}
// KubeletAuthorizationMode denotes the authorization mode for the kubelet

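A quick worked example of the memory.high arithmetic described by the MemoryThrottlingFactor comment above (the constants below are illustrative, not part of this change):

package main

import "fmt"

// memoryHigh computes the cgroup v2 memory.high value described above:
// the memory limit (or node allocatable memory when no limit is set)
// multiplied by the throttling factor.
func memoryHigh(limitBytes, nodeAllocatableBytes int64, factor float64) int64 {
	if limitBytes != 0 {
		return int64(float64(limitBytes) * factor)
	}
	return int64(float64(nodeAllocatableBytes) * factor)
}

func main() {
	const gi = int64(1024 * 1024 * 1024)
	allocatable := 32 * gi // hypothetical node allocatable memory

	// Container with a 1Gi limit and the default factor of 0.8.
	fmt.Println(memoryHigh(1*gi, allocatable, 0.8)) // 858993459 (~0.8Gi)

	// Container without a memory limit falls back to node allocatable.
	fmt.Println(memoryHigh(0, allocatable, 0.8)) // 27487790694 (~25.6Gi)
}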
View File

@ -36,6 +36,9 @@ const (
DefaultIPTablesMasqueradeBit = 14
DefaultIPTablesDropBit = 15
DefaultVolumePluginDir = "/usr/libexec/kubernetes/kubelet-plugins/volume/exec/"
// See https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2570-memory-qos
DefaultMemoryThrottlingFactor = 0.8
)
var (
@ -255,4 +258,7 @@ func SetDefaults_KubeletConfiguration(obj *kubeletconfigv1beta1.KubeletConfigura
if obj.SeccompDefault == nil {
obj.SeccompDefault = utilpointer.BoolPtr(false)
}
if obj.MemoryThrottlingFactor == nil {
obj.MemoryThrottlingFactor = utilpointer.Float64Ptr(DefaultMemoryThrottlingFactor)
}
}

View File

@ -387,6 +387,7 @@ func autoConvert_v1beta1_KubeletConfiguration_To_config_KubeletConfiguration(in
if err := v1.Convert_Pointer_bool_To_bool(&in.SeccompDefault, &out.SeccompDefault, s); err != nil {
return err
}
out.MemoryThrottlingFactor = (*float64)(unsafe.Pointer(in.MemoryThrottlingFactor))
return nil
}
@ -554,6 +555,7 @@ func autoConvert_config_KubeletConfiguration_To_v1beta1_KubeletConfiguration(in
if err := v1.Convert_bool_To_Pointer_bool(&in.SeccompDefault, &out.SeccompDefault, s); err != nil {
return err
}
out.MemoryThrottlingFactor = (*float64)(unsafe.Pointer(in.MemoryThrottlingFactor))
return nil
}

View File

@ -212,5 +212,13 @@ func ValidateKubeletConfiguration(kc *kubeletconfig.KubeletConfiguration) error
if errs := logs.ValidateLoggingConfiguration(&kc.Logging, field.NewPath("logging")); len(errs) > 0 {
allErrors = append(allErrors, errs.ToAggregate().Errors()...)
}
if localFeatureGate.Enabled(features.MemoryQoS) && kc.MemoryThrottlingFactor == nil {
allErrors = append(allErrors, fmt.Errorf("invalid configuration: memoryThrottlingFactor is required when MemoryQoS feature flag is enabled"))
}
if kc.MemoryThrottlingFactor != nil && (*kc.MemoryThrottlingFactor <= 0 || *kc.MemoryThrottlingFactor > 1.0) {
allErrors = append(allErrors, fmt.Errorf("invalid configuration: memoryThrottlingFactor %v must be greater than 0 and less than or equal to 1.0", *kc.MemoryThrottlingFactor))
}
return utilerrors.NewAggregate(allErrors)
}

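For reference, the accepted range for the factor is (0, 1.0]; a minimal standalone sketch of that range check (separate from the kubelet's validation package) behaves as follows:

package main

import "fmt"

// validateThrottlingFactor mirrors the range check added above: when set,
// the factor must satisfy 0 < factor <= 1.0.
func validateThrottlingFactor(factor *float64) error {
	if factor == nil {
		return nil // only required when the MemoryQoS feature gate is enabled
	}
	if *factor <= 0 || *factor > 1.0 {
		return fmt.Errorf("memoryThrottlingFactor %v must be greater than 0 and less than or equal to 1.0", *factor)
	}
	return nil
}

func main() {
	for _, f := range []float64{0.8, 1.0, 0, 1.1} {
		v := f
		fmt.Println(v, "->", validateThrottlingFactor(&v))
	}
}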
View File

@ -25,6 +25,7 @@ import (
componentbaseconfig "k8s.io/component-base/config"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
utilpointer "k8s.io/utils/pointer"
)
func TestValidateKubeletConfiguration(t *testing.T) {
@ -59,9 +60,11 @@ func TestValidateKubeletConfiguration(t *testing.T) {
TopologyManagerPolicy: kubeletconfig.SingleNumaNodeTopologyManagerPolicy,
ShutdownGracePeriod: metav1.Duration{Duration: 30 * time.Second},
ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second},
MemoryThrottlingFactor: utilpointer.Float64Ptr(0.8),
FeatureGates: map[string]bool{
"CustomCPUCFSQuotaPeriod": true,
"GracefulNodeShutdown": true,
"MemoryQoS": true,
},
Logging: componentbaseconfig.LoggingConfiguration{
Format: "text",
@ -103,8 +106,10 @@ func TestValidateKubeletConfiguration(t *testing.T) {
TopologyManagerPolicy: kubeletconfig.NoneTopologyManagerPolicy,
ShutdownGracePeriod: metav1.Duration{Duration: 10 * time.Minute},
ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 0},
MemoryThrottlingFactor: utilpointer.Float64Ptr(0.9),
FeatureGates: map[string]bool{
"CustomCPUCFSQuotaPeriod": true,
"MemoryQoS": true,
},
Logging: componentbaseconfig.LoggingConfiguration{
Format: "text",
@ -147,10 +152,12 @@ func TestValidateKubeletConfiguration(t *testing.T) {
ShutdownGracePeriod: metav1.Duration{Duration: 10 * time.Minute},
ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 0},
MemorySwap: kubeletconfig.MemorySwapConfiguration{SwapBehavior: kubetypes.UnlimitedSwap},
MemoryThrottlingFactor: utilpointer.Float64Ptr(0.5),
FeatureGates: map[string]bool{
"CustomCPUCFSQuotaPeriod": true,
"GracefulNodeShutdown": true,
"NodeSwapEnabled": true,
"MemoryQoS": true,
},
Logging: componentbaseconfig.LoggingConfiguration{
Format: "text",
@ -230,16 +237,18 @@ func TestValidateKubeletConfiguration(t *testing.T) {
ShutdownGracePeriod: metav1.Duration{Duration: 40 * time.Second},
ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second},
MemorySwap: kubeletconfig.MemorySwapConfiguration{SwapBehavior: "invalid"},
MemoryThrottlingFactor: utilpointer.Float64Ptr(1.1),
FeatureGates: map[string]bool{
"CustomCPUCFSQuotaPeriod": true,
"GracefulNodeShutdown": true,
"NodeSwapEnabled": true,
"MemoryQoS": true,
},
Logging: componentbaseconfig.LoggingConfiguration{
Format: "text",
},
}
const numErrsErrorCase2 = 4
const numErrsErrorCase2 = 5
if allErrors := ValidateKubeletConfiguration(errorCase2); len(allErrors.(utilerrors.Aggregate).Errors()) != numErrsErrorCase2 {
t.Errorf("expect %d errors, got %v", numErrsErrorCase2, len(allErrors.(utilerrors.Aggregate).Errors()))
}

View File

@ -282,6 +282,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
if in.MemoryThrottlingFactor != nil {
in, out := &in.MemoryThrottlingFactor, &out.MemoryThrottlingFactor
*out = new(float64)
**out = **in
}
return
}

View File

@ -53,6 +53,10 @@ const (
libcontainerSystemd libcontainerCgroupManagerType = "systemd"
// systemdSuffix is the cgroup name suffix for systemd
systemdSuffix string = ".slice"
// MemoryMin is memory.min for cgroup v2
MemoryMin string = "memory.min"
// MemoryHigh is memory.high for cgroup v2
MemoryHigh string = "memory.high"
)
var RootCgroupName = CgroupName([]string{})
@ -434,6 +438,15 @@ func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcont
Limit: uint64(0),
})
}
// Ideally unified is used for all the resources when running on cgroup v2.
// It doesn't make a difference for the memory.max limit, but for e.g. the cpu controller
// you can specify the correct setting without relying on the conversions performed by the OCI runtime.
if resourceConfig.Unified != nil && libcontainercgroups.IsCgroup2UnifiedMode() {
resources.Unified = make(map[string]string)
for k, v := range resourceConfig.Unified {
resources.Unified[k] = v
}
}
return resources
}

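On cgroup v2 the keys in Unified are raw interface file names under the cgroup directory, and the values are written into those files verbatim by the cgroup driver. A hedged helper for inspecting the result on a node (the path below is hypothetical and depends on the cgroup driver and hierarchy):

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

// readUnified reads selected cgroup v2 interface files from a cgroup directory,
// e.g. the memory.min / memory.high values the kubelet set via Unified.
func readUnified(cgroupDir string, files ...string) (map[string]string, error) {
	out := make(map[string]string, len(files))
	for _, f := range files {
		b, err := os.ReadFile(filepath.Join(cgroupDir, f))
		if err != nil {
			return nil, err
		}
		out[f] = strings.TrimSpace(string(b))
	}
	return out, nil
}

func main() {
	// Hypothetical QoS-level cgroup path; adjust for the local driver and layout.
	vals, err := readUnified("/sys/fs/cgroup/kubepods.slice/kubepods-burstable.slice",
		"memory.min", "memory.high")
	if err != nil {
		fmt.Println("read error:", err)
		return
	}
	fmt.Println(vals)
}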
View File

@ -112,6 +112,9 @@ type ContainerManager interface {
// GetAllocateResourcesPodAdmitHandler returns an instance of a PodAdmitHandler responsible for allocating pod resources.
GetAllocateResourcesPodAdmitHandler() lifecycle.PodAdmitHandler
// GetNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement.
GetNodeAllocatableAbsolute() v1.ResourceList
// Implements the podresources Provider API for CPUs, Memory and Devices
podresources.CPUsProvider
podresources.DevicesProvider

View File

@ -491,7 +491,7 @@ func (cm *containerManagerImpl) setupNode(activePods ActivePodsFunc) error {
if err := cm.createNodeAllocatableCgroups(); err != nil {
return err
}
err = cm.qosContainerManager.Start(cm.getNodeAllocatableAbsolute, activePods)
err = cm.qosContainerManager.Start(cm.GetNodeAllocatableAbsolute, activePods)
if err != nil {
return fmt.Errorf("failed to initialize top level QOS containers: %v", err)
}

View File

@ -147,6 +147,10 @@ func (cm *containerManagerStub) GetAllocatableMemory() []*podresourcesapi.Contai
return nil
}
func (cm *containerManagerStub) GetNodeAllocatableAbsolute() v1.ResourceList {
return nil
}
func NewStubContainerManager() ContainerManager {
return &containerManagerStub{shouldResetExtendedResourceCapacity: false}
}

View File

@ -251,3 +251,7 @@ func (cm *containerManagerImpl) GetMemory(_, _ string) []*podresourcesapi.Contai
func (cm *containerManagerImpl) GetAllocatableMemory() []*podresourcesapi.ContainerMemory {
return nil
}
func (cm *containerManagerImpl) GetNodeAllocatableAbsolute() v1.ResourceList {
return nil
}

View File

@ -227,3 +227,9 @@ func (cm *FakeContainerManager) GetAllocatableMemory() []*podresourcesapi.Contai
defer cm.Unlock()
return nil
}
func (cm *FakeContainerManager) GetNodeAllocatableAbsolute() v1.ResourceList {
cm.Lock()
defer cm.Unlock()
return nil
}

View File

@ -113,7 +113,7 @@ func HugePageLimits(resourceList v1.ResourceList) map[int64]int64 {
}
// ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64) *ResourceConfig {
func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64, enforceMemoryQoS bool) *ResourceConfig {
// sum requests and limits.
reqs, limits := resource.PodRequestsAndLimits(pod)
@ -185,6 +185,19 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64)
result.CpuShares = &shares
}
result.HugePageLimit = hugePageLimits
if enforceMemoryQoS {
memoryMin := int64(0)
if request, found := reqs[v1.ResourceMemory]; found {
memoryMin = request.Value()
}
if memoryMin > 0 {
result.Unified = map[string]string{
MemoryMin: strconv.FormatInt(memoryMin, 10),
}
}
}
return result
}

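When enforceMemoryQoS is true, the pod-level memory.min is simply the sum of the containers' memory requests. A small standalone sketch of that calculation (not the kubelet's types):

package main

import (
	"fmt"
	"strconv"
)

// podMemoryMin mirrors the enforceMemoryQoS branch above: sum the containers'
// memory requests and format the result for the cgroup v2 "memory.min" file.
func podMemoryMin(containerRequestsBytes []int64) map[string]string {
	var sum int64
	for _, r := range containerRequestsBytes {
		sum += r
	}
	if sum <= 0 {
		return nil
	}
	return map[string]string{"memory.min": strconv.FormatInt(sum, 10)}
}

func main() {
	const mi = int64(1024 * 1024)
	// Two containers requesting 100Mi each, as in the "burstable-partial-limits"
	// test case in the next file: memory.min = 209715200.
	fmt.Println(podMemoryMin([]int64{100 * mi, 100 * mi}))
}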
View File

@ -252,7 +252,7 @@ func TestResourceConfigForPod(t *testing.T) {
for testName, testCase := range testCases {
actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod)
actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod, false)
if !reflect.DeepEqual(actual.CpuPeriod, testCase.expected.CpuPeriod) {
t.Errorf("unexpected result, test: %v, cpu period not as expected", testName)
@ -472,7 +472,7 @@ func TestResourceConfigForPodWithCustomCPUCFSQuotaPeriod(t *testing.T) {
for testName, testCase := range testCases {
actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod)
actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod, false)
if !reflect.DeepEqual(actual.CpuPeriod, testCase.expected.CpuPeriod) {
t.Errorf("unexpected result, test: %v, cpu period not as expected", testName)
@ -622,3 +622,211 @@ func TestHugePageLimits(t *testing.T) {
}
}
func TestResourceConfigForPodWithEnforceMemoryQoS(t *testing.T) {
defaultQuotaPeriod := uint64(100 * time.Millisecond / time.Microsecond)
tunedQuotaPeriod := uint64(5 * time.Millisecond / time.Microsecond)
minShares := uint64(MinShares)
burstableShares := MilliCPUToShares(100)
memoryQuantity := resource.MustParse("200Mi")
burstableMemory := memoryQuantity.Value()
burstablePartialShares := MilliCPUToShares(200)
burstableQuota := MilliCPUToQuota(200, int64(defaultQuotaPeriod))
guaranteedShares := MilliCPUToShares(100)
guaranteedQuota := MilliCPUToQuota(100, int64(defaultQuotaPeriod))
guaranteedTunedQuota := MilliCPUToQuota(100, int64(tunedQuotaPeriod))
memoryQuantity = resource.MustParse("100Mi")
cpuNoLimit := int64(-1)
guaranteedMemory := memoryQuantity.Value()
testCases := map[string]struct {
pod *v1.Pod
expected *ResourceConfig
enforceCPULimits bool
quotaPeriod uint64
}{
"besteffort": {
pod: &v1.Pod{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("", ""), getResourceList("", "")),
},
},
},
},
enforceCPULimits: true,
quotaPeriod: defaultQuotaPeriod,
expected: &ResourceConfig{CpuShares: &minShares},
},
"burstable-no-limits": {
pod: &v1.Pod{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("", "")),
},
},
},
},
enforceCPULimits: true,
quotaPeriod: defaultQuotaPeriod,
expected: &ResourceConfig{CpuShares: &burstableShares, Unified: map[string]string{"memory.min": "104857600"}},
},
"burstable-with-limits": {
pod: &v1.Pod{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")),
},
},
},
},
enforceCPULimits: true,
quotaPeriod: defaultQuotaPeriod,
expected: &ResourceConfig{CpuShares: &burstableShares, CpuQuota: &burstableQuota, CpuPeriod: &defaultQuotaPeriod, Memory: &burstableMemory, Unified: map[string]string{"memory.min": "104857600"}},
},
"burstable-with-limits-no-cpu-enforcement": {
pod: &v1.Pod{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")),
},
},
},
},
enforceCPULimits: false,
quotaPeriod: defaultQuotaPeriod,
expected: &ResourceConfig{CpuShares: &burstableShares, CpuQuota: &cpuNoLimit, CpuPeriod: &defaultQuotaPeriod, Memory: &burstableMemory, Unified: map[string]string{"memory.min": "104857600"}},
},
"burstable-partial-limits": {
pod: &v1.Pod{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")),
},
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("", "")),
},
},
},
},
enforceCPULimits: true,
quotaPeriod: defaultQuotaPeriod,
expected: &ResourceConfig{CpuShares: &burstablePartialShares, Unified: map[string]string{"memory.min": "209715200"}},
},
"burstable-with-limits-with-tuned-quota": {
pod: &v1.Pod{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")),
},
},
},
},
enforceCPULimits: true,
quotaPeriod: tunedQuotaPeriod,
expected: &ResourceConfig{CpuShares: &burstableShares, CpuQuota: &burstableQuota, CpuPeriod: &tunedQuotaPeriod, Memory: &burstableMemory, Unified: map[string]string{"memory.min": "104857600"}},
},
"burstable-with-limits-no-cpu-enforcement-with-tuned-quota": {
pod: &v1.Pod{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")),
},
},
},
},
enforceCPULimits: false,
quotaPeriod: tunedQuotaPeriod,
expected: &ResourceConfig{CpuShares: &burstableShares, CpuQuota: &cpuNoLimit, CpuPeriod: &tunedQuotaPeriod, Memory: &burstableMemory, Unified: map[string]string{"memory.min": "104857600"}},
},
"burstable-partial-limits-with-tuned-quota": {
pod: &v1.Pod{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("200m", "200Mi")),
},
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("", "")),
},
},
},
},
enforceCPULimits: true,
quotaPeriod: tunedQuotaPeriod,
expected: &ResourceConfig{CpuShares: &burstablePartialShares, Unified: map[string]string{"memory.min": "209715200"}},
},
"guaranteed": {
pod: &v1.Pod{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")),
},
},
},
},
enforceCPULimits: true,
quotaPeriod: defaultQuotaPeriod,
expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedQuota, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory, Unified: map[string]string{"memory.min": "104857600"}},
},
"guaranteed-no-cpu-enforcement": {
pod: &v1.Pod{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")),
},
},
},
},
enforceCPULimits: false,
quotaPeriod: defaultQuotaPeriod,
expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &defaultQuotaPeriod, Memory: &guaranteedMemory, Unified: map[string]string{"memory.min": "104857600"}},
},
"guaranteed-with-tuned-quota": {
pod: &v1.Pod{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")),
},
},
},
},
enforceCPULimits: true,
quotaPeriod: tunedQuotaPeriod,
expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &guaranteedTunedQuota, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory, Unified: map[string]string{"memory.min": "104857600"}},
},
"guaranteed-no-cpu-enforcement-with-tuned-quota": {
pod: &v1.Pod{
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Resources: getResourceRequirements(getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")),
},
},
},
},
enforceCPULimits: false,
quotaPeriod: tunedQuotaPeriod,
expected: &ResourceConfig{CpuShares: &guaranteedShares, CpuQuota: &cpuNoLimit, CpuPeriod: &tunedQuotaPeriod, Memory: &guaranteedMemory, Unified: map[string]string{"memory.min": "104857600"}},
},
}
for testName, testCase := range testCases {
actual := ResourceConfigForPod(testCase.pod, testCase.enforceCPULimits, testCase.quotaPeriod, true)
if !reflect.DeepEqual(actual.Unified, testCase.expected.Unified) {
t.Errorf("unexpected result, test: %v, unified not as expected", testName)
}
}
}

View File

@ -42,7 +42,7 @@ func MilliCPUToShares(milliCPU int64) int64 {
}
// ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
func ResourceConfigForPod(pod *v1.Pod, enforceCPULimit bool, cpuPeriod uint64) *ResourceConfig {
func ResourceConfigForPod(pod *v1.Pod, enforceCPULimit bool, cpuPeriod uint64, enforceMemoryQoS bool) *ResourceConfig {
return nil
}

View File

@ -20,13 +20,16 @@ package cm
import (
"fmt"
"strconv"
"strings"
"time"
"k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/types"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/klog/v2"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/events"
"k8s.io/kubernetes/pkg/kubelet/stats/pidlimit"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
@ -131,9 +134,22 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
// enforceExistingCgroup updates the limits `rl` on existing cgroup `cName` using `cgroupManager` interface.
func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.ResourceList) error {
rp := getCgroupConfig(rl)
// Enforce MemoryQoS for cgroups of kube-reserved/system-reserved. For more information,
// see https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2570-memory-qos
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) {
if rp.Memory != nil {
if rp.Unified == nil {
rp.Unified = make(map[string]string)
}
rp.Unified[MemoryMin] = strconv.FormatInt(*rp.Memory, 10)
}
}
cgroupConfig := &CgroupConfig{
Name: cName,
ResourceParameters: getCgroupConfig(rl),
ResourceParameters: rp,
}
if cgroupConfig.ResourceParameters == nil {
return fmt.Errorf("%q cgroup is not configured properly", cgroupConfig.Name)
@ -174,10 +190,10 @@ func getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
return &rc
}
// getNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement.
// GetNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement.
// Note that not all resources that are available on the node are included in the returned list of resources.
// Returns a ResourceList.
func (cm *containerManagerImpl) getNodeAllocatableAbsolute() v1.ResourceList {
func (cm *containerManagerImpl) GetNodeAllocatableAbsolute() v1.ResourceList {
return cm.getNodeAllocatableAbsoluteImpl(cm.capacity)
}

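The effect of the MemoryQoS branch in enforceExistingCgroup is that any memory reservation configured for the kube-reserved/system-reserved cgroups is also written to that cgroup's memory.min, so the reservation is protected from reclaim on cgroup v2. A minimal sketch, assuming a hypothetical 1Gi reservation:

package main

import (
	"fmt"
	"strconv"
)

// reservedCgroupUnified mirrors the branch above: a configured memory limit
// for a reserved cgroup is mirrored into its cgroup v2 memory.min.
func reservedCgroupUnified(memoryLimitBytes *int64) map[string]string {
	if memoryLimitBytes == nil {
		return nil
	}
	return map[string]string{"memory.min": strconv.FormatInt(*memoryLimitBytes, 10)}
}

func main() {
	// Hypothetical --kube-reserved=memory=1Gi reservation.
	reserved := int64(1 << 30)
	fmt.Println(reservedCgroupUnified(&reserved)) // map[memory.min:1073741824]
}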
View File

@ -250,7 +250,7 @@ func TestNodeAllocatableForEnforcement(t *testing.T) {
NodeConfig: nc,
capacity: tc.capacity,
}
for k, v := range cm.getNodeAllocatableAbsolute() {
for k, v := range cm.GetNodeAllocatableAbsolute() {
expected, exists := tc.expected[k]
assert.True(t, exists)
assert.Equal(t, expected.MilliValue(), v.MilliValue(), "test case %d failed for resource %q", idx+1, k)

View File

@ -23,11 +23,14 @@ import (
"path"
"strings"
v1 "k8s.io/api/core/v1"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/klog/v2"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
kubefeatures "k8s.io/kubernetes/pkg/features"
)
const (
@ -71,14 +74,22 @@ func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error {
// check if container already exist
alreadyExists := m.Exists(pod)
if !alreadyExists {
enforceMemoryQoS := false
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
libcontainercgroups.IsCgroup2UnifiedMode() {
enforceMemoryQoS = true
}
// Create the pod container
containerConfig := &CgroupConfig{
Name: podContainerName,
ResourceParameters: ResourceConfigForPod(pod, m.enforceCPULimits, m.cpuCFSQuotaPeriod),
ResourceParameters: ResourceConfigForPod(pod, m.enforceCPULimits, m.cpuCFSQuotaPeriod, enforceMemoryQoS),
}
if m.podPidsLimit > 0 {
containerConfig.ResourceParameters.PidsLimit = &m.podPidsLimit
}
if enforceMemoryQoS {
klog.V(4).InfoS("MemoryQoS config for pod", "pod", klog.KObj(pod), "unified", containerConfig.ResourceParameters.Unified)
}
if err := m.cgroupManager.Create(containerConfig); err != nil {
return fmt.Errorf("failed to create container for %v : %v", podContainerName, err)
}

View File

@ -18,6 +18,7 @@ package cm
import (
"fmt"
"strconv"
"strings"
"sync"
"time"
@ -27,6 +28,7 @@ import (
"k8s.io/apimachinery/pkg/util/wait"
units "github.com/docker/go-units"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
v1 "k8s.io/api/core/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
@ -191,10 +193,9 @@ func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]
return nil
}
// setMemoryReserve sums the memory limits of all pods in a QOS class,
// calculates QOS class memory limits, and set those limits in the
// CgroupConfig for each QOS class.
func (m *qosContainerManagerImpl) setMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
// getQoSMemoryRequests sums and returns the memory request of all pods for
// guaranteed and burstable qos classes.
func (m *qosContainerManagerImpl) getQoSMemoryRequests() map[v1.PodQOSClass]int64 {
qosMemoryRequests := map[v1.PodQOSClass]int64{
v1.PodQOSGuaranteed: 0,
v1.PodQOSBurstable: 0,
@ -216,6 +217,15 @@ func (m *qosContainerManagerImpl) setMemoryReserve(configs map[v1.PodQOSClass]*C
qosMemoryRequests[qosClass] += podMemoryRequest
}
return qosMemoryRequests
}
// setMemoryReserve sums the memory limits of all pods in a QOS class,
// calculates QOS class memory limits, and set those limits in the
// CgroupConfig for each QOS class.
func (m *qosContainerManagerImpl) setMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
qosMemoryRequests := m.getQoSMemoryRequests()
resources := m.getNodeAllocatable()
allocatableResource, ok := resources[v1.ResourceMemory]
if !ok {
@ -265,11 +275,43 @@ func (m *qosContainerManagerImpl) retrySetMemoryReserve(configs map[v1.PodQOSCla
}
}
// setMemoryQoS sums the memory requests of all pods in the Guaranteed and Burstable classes,
// and sets the sums as memory.min in the Unified field of the corresponding CgroupConfig.
func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*CgroupConfig) {
qosMemoryRequests := m.getQoSMemoryRequests()
// Calculate the memory.min:
// for burstable(/kubepods/burstable): sum of all burstable pods
// for guaranteed(/kubepods): sum of all guaranteed and burstable pods
burstableMin := qosMemoryRequests[v1.PodQOSBurstable]
guaranteedMin := qosMemoryRequests[v1.PodQOSGuaranteed] + burstableMin
if burstableMin > 0 {
if configs[v1.PodQOSBurstable].ResourceParameters.Unified == nil {
configs[v1.PodQOSBurstable].ResourceParameters.Unified = make(map[string]string)
}
configs[v1.PodQOSBurstable].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(burstableMin, 10)
klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSBurstable, "memory.min", burstableMin)
}
if guaranteedMin > 0 {
if configs[v1.PodQOSGuaranteed].ResourceParameters.Unified == nil {
configs[v1.PodQOSGuaranteed].ResourceParameters.Unified = make(map[string]string)
}
configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(guaranteedMin, 10)
klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSGuaranteed, "memory.min", guaranteedMin)
}
}
func (m *qosContainerManagerImpl) UpdateCgroups() error {
m.Lock()
defer m.Unlock()
qosConfigs := map[v1.PodQOSClass]*CgroupConfig{
v1.PodQOSGuaranteed: {
Name: m.qosContainersInfo.Guaranteed,
ResourceParameters: &ResourceConfig{},
},
v1.PodQOSBurstable: {
Name: m.qosContainersInfo.Burstable,
ResourceParameters: &ResourceConfig{},
@ -290,6 +332,12 @@ func (m *qosContainerManagerImpl) UpdateCgroups() error {
return err
}
// update the qos level cgroups v2 settings of memory qos if the feature is enabled
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
libcontainercgroups.IsCgroup2UnifiedMode() {
m.setMemoryQoS(qosConfigs)
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.QOSReserved) {
for resource, percentReserve := range m.qosReserved {
switch resource {

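The QoS-level values form a hierarchy: the burstable cgroup's memory.min is the sum of burstable pod requests, while the guaranteed config (applied at the kubepods level) also includes that sum because the burstable cgroup is nested under it. A worked sketch using the same pods as the test in the next file (one guaranteed pod requesting 128Mi, two burstable pods requesting 128Mi and 256Mi):

package main

import "fmt"

// qosMemoryMin mirrors setMemoryQoS above: burstable gets the burstable sum,
// guaranteed (kubepods level) gets guaranteed + burstable.
func qosMemoryMin(guaranteedReq, burstableReq int64) (burstableMin, guaranteedMin int64) {
	burstableMin = burstableReq
	guaranteedMin = guaranteedReq + burstableMin
	return burstableMin, guaranteedMin
}

func main() {
	const mi = int64(1024 * 1024)
	b, g := qosMemoryMin(128*mi, (128+256)*mi)
	fmt.Println("burstable memory.min:", b)  // 402653184 (384Mi)
	fmt.Println("guaranteed memory.min:", g) // 536870912 (512Mi)
}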
View File

@ -0,0 +1,154 @@
// +build linux
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
"strconv"
"testing"
"github.com/stretchr/testify/assert"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
func activeTestPods() []*v1.Pod {
return []*v1.Pod{
{
ObjectMeta: metav1.ObjectMeta{
UID: "12345678",
Name: "guaranteed-pod",
Namespace: "test",
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "foo",
Image: "busybox",
Resources: v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceMemory: resource.MustParse("128Mi"),
v1.ResourceCPU: resource.MustParse("1"),
},
Limits: v1.ResourceList{
v1.ResourceMemory: resource.MustParse("128Mi"),
v1.ResourceCPU: resource.MustParse("1"),
},
},
},
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
UID: "87654321",
Name: "burstable-pod-1",
Namespace: "test",
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "foo",
Image: "busybox",
Resources: v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceMemory: resource.MustParse("128Mi"),
v1.ResourceCPU: resource.MustParse("1"),
},
Limits: v1.ResourceList{
v1.ResourceMemory: resource.MustParse("256Mi"),
v1.ResourceCPU: resource.MustParse("2"),
},
},
},
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
UID: "01234567",
Name: "burstable-pod-2",
Namespace: "test",
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "foo",
Image: "busybox",
Resources: v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceMemory: resource.MustParse("256Mi"),
v1.ResourceCPU: resource.MustParse("2"),
},
},
},
},
},
},
}
}
func createTestQOSContainerManager() (*qosContainerManagerImpl, error) {
subsystems, err := GetCgroupSubsystems()
if err != nil {
return nil, fmt.Errorf("failed to get mounted cgroup subsystems: %v", err)
}
cgroupRoot := ParseCgroupfsToCgroupName("/")
cgroupRoot = NewCgroupName(cgroupRoot, defaultNodeAllocatableCgroupName)
qosContainerManager := &qosContainerManagerImpl{
subsystems: subsystems,
cgroupManager: NewCgroupManager(subsystems, "cgroupfs"),
cgroupRoot: cgroupRoot,
qosReserved: nil,
}
qosContainerManager.activePods = activeTestPods
return qosContainerManager, nil
}
func TestQoSContainerCgroup(t *testing.T) {
m, err := createTestQOSContainerManager()
assert.Nil(t, err)
qosConfigs := map[v1.PodQOSClass]*CgroupConfig{
v1.PodQOSGuaranteed: {
Name: m.qosContainersInfo.Guaranteed,
ResourceParameters: &ResourceConfig{},
},
v1.PodQOSBurstable: {
Name: m.qosContainersInfo.Burstable,
ResourceParameters: &ResourceConfig{},
},
v1.PodQOSBestEffort: {
Name: m.qosContainersInfo.BestEffort,
ResourceParameters: &ResourceConfig{},
},
}
m.setMemoryQoS(qosConfigs)
burstableMin := resource.MustParse("384Mi")
guaranteedMin := resource.MustParse("128Mi")
assert.Equal(t, qosConfigs[v1.PodQOSGuaranteed].ResourceParameters.Unified["memory.min"], strconv.FormatInt(burstableMin.Value()+guaranteedMin.Value(), 10))
assert.Equal(t, qosConfigs[v1.PodQOSBurstable].ResourceParameters.Unified["memory.min"], strconv.FormatInt(burstableMin.Value(), 10))
}

View File

@ -35,6 +35,8 @@ type ResourceConfig struct {
HugePageLimit map[int64]int64
// Maximum number of pids
PidsLimit *int64
// Unified for cgroup v2
Unified map[string]string
}
// CgroupName is the abstract name of a cgroup prior to any driver specific conversion.

View File

@ -666,6 +666,8 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
klet.runtimeClassManager,
seccompDefault,
kubeCfg.MemorySwap.SwapBehavior,
kubeDeps.ContainerManager.GetNodeAllocatableAbsolute,
*kubeCfg.MemoryThrottlingFactor,
)
if err != nil {
return nil, err

View File

@ -21,6 +21,8 @@ import (
"time"
cadvisorapi "github.com/google/cadvisor/info/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/record"
@ -38,6 +40,9 @@ import (
const (
fakeSeccompProfileRoot = "/fakeSeccompProfileRoot"
fakeNodeAllocatableMemory = "32Gi"
fakeNodeAllocatableCPU = "16"
)
type fakeHTTP struct {
@ -79,21 +84,22 @@ func newFakeKubeRuntimeManager(runtimeService internalapi.RuntimeService, imageS
return nil, err
}
kubeRuntimeManager := &kubeGenericRuntimeManager{
recorder: recorder,
cpuCFSQuota: false,
cpuCFSQuotaPeriod: metav1.Duration{Duration: time.Microsecond * 100},
livenessManager: proberesults.NewManager(),
startupManager: proberesults.NewManager(),
machineInfo: machineInfo,
osInterface: osInterface,
runtimeHelper: runtimeHelper,
runtimeService: runtimeService,
imageService: imageService,
keyring: keyring,
seccompProfileRoot: fakeSeccompProfileRoot,
internalLifecycle: cm.NewFakeInternalContainerLifecycle(),
logReduction: logreduction.NewLogReduction(identicalErrorDelay),
logManager: logManager,
recorder: recorder,
cpuCFSQuota: false,
cpuCFSQuotaPeriod: metav1.Duration{Duration: time.Microsecond * 100},
livenessManager: proberesults.NewManager(),
startupManager: proberesults.NewManager(),
machineInfo: machineInfo,
osInterface: osInterface,
runtimeHelper: runtimeHelper,
runtimeService: runtimeService,
imageService: imageService,
keyring: keyring,
seccompProfileRoot: fakeSeccompProfileRoot,
internalLifecycle: cm.NewFakeInternalContainerLifecycle(),
logReduction: logreduction.NewLogReduction(identicalErrorDelay),
logManager: logManager,
memoryThrottlingFactor: 0.8,
}
typedVersion, err := runtimeService.Version(kubeRuntimeAPIVersion)
@ -118,5 +124,12 @@ func newFakeKubeRuntimeManager(runtimeService internalapi.RuntimeService, imageS
kubeRuntimeManager,
kubeRuntimeManager)
kubeRuntimeManager.getNodeAllocatable = func() v1.ResourceList {
return v1.ResourceList{
v1.ResourceMemory: resource.MustParse(fakeNodeAllocatableMemory),
v1.ResourceCPU: resource.MustParse(fakeNodeAllocatableCPU),
}
}
return kubeRuntimeManager, nil
}

View File

@ -19,8 +19,10 @@ limitations under the License.
package kuberuntime
import (
"strconv"
"time"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
v1 "k8s.io/api/core/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
@ -28,6 +30,7 @@ import (
"k8s.io/klog/v2"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/cm"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/qos"
kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
@ -35,12 +38,18 @@ import (
// applyPlatformSpecificContainerConfig applies platform specific configurations to runtimeapi.ContainerConfig.
func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config *runtimeapi.ContainerConfig, container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID) error {
config.Linux = m.generateLinuxContainerConfig(container, pod, uid, username, nsTarget)
enforceMemoryQoS := false
// Set memory.min and memory.high if MemoryQoS enabled with cgroups v2
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
libcontainercgroups.IsCgroup2UnifiedMode() {
enforceMemoryQoS = true
}
config.Linux = m.generateLinuxContainerConfig(container, pod, uid, username, nsTarget, enforceMemoryQoS)
return nil
}
// generateLinuxContainerConfig generates linux container config for kubelet runtime v1.
func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID) *runtimeapi.LinuxContainerConfig {
func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID, enforceMemoryQoS bool) *runtimeapi.LinuxContainerConfig {
lc := &runtimeapi.LinuxContainerConfig{
Resources: &runtimeapi.LinuxContainerResources{},
SecurityContext: m.determineEffectiveSecurityContext(pod, container, uid, username),
@ -56,6 +65,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.C
cpuRequest := container.Resources.Requests.Cpu()
cpuLimit := container.Resources.Limits.Cpu()
memoryLimit := container.Resources.Limits.Memory().Value()
memoryRequest := container.Resources.Requests.Memory().Value()
oomScoreAdj := int64(qos.GetContainerOOMScoreAdjust(pod, container,
int64(m.machineInfo.MemoryCapacity)))
// If request is not specified, but limit is, we want request to default to limit.
@ -107,6 +117,43 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.C
}
}
// Set memory.min and memory.high to enforce MemoryQoS
if enforceMemoryQoS {
unified := map[string]string{}
if memoryRequest != 0 {
unified[cm.MemoryMin] = strconv.FormatInt(memoryRequest, 10)
}
// If container sets limits.memory, we set memory.high=pod.spec.containers[i].resources.limits[memory] * memory_throttling_factor
// for container level cgroup if memory.high>memory.min.
// If container doesn't set limits.memory, we set memory.high=node_allocatable_memory * memory_throttling_factor
// for container level cgroup.
memoryHigh := int64(0)
if memoryLimit != 0 {
memoryHigh = int64(float64(memoryLimit) * m.memoryThrottlingFactor)
} else {
allocatable := m.getNodeAllocatable()
allocatableMemory, ok := allocatable[v1.ResourceMemory]
if ok && allocatableMemory.Value() > 0 {
memoryHigh = int64(float64(allocatableMemory.Value()) * m.memoryThrottlingFactor)
}
}
if memoryHigh > memoryRequest {
unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
}
if len(unified) > 0 {
if lc.Resources.Unified == nil {
lc.Resources.Unified = unified
} else {
for k, v := range unified {
lc.Resources.Unified[k] = v
}
}
klog.V(4).InfoS("MemoryQoS config for container", "pod", klog.KObj(pod), "containerName", container.Name, "unified", unified)
}
}
return lc
}

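Putting the container-level pieces together: memory.min comes from the container's memory request, and memory.high is the throttled limit (or throttled node allocatable when there is no limit), set only when it exceeds the request. A hedged standalone sketch with hypothetical inputs:

package main

import (
	"fmt"
	"strconv"
)

// containerMemoryQoS mirrors the container-level branch above.
func containerMemoryQoS(requestBytes, limitBytes, nodeAllocatableBytes int64, factor float64) map[string]string {
	unified := map[string]string{}
	if requestBytes != 0 {
		unified["memory.min"] = strconv.FormatInt(requestBytes, 10)
	}
	memoryHigh := int64(0)
	if limitBytes != 0 {
		memoryHigh = int64(float64(limitBytes) * factor)
	} else if nodeAllocatableBytes > 0 {
		memoryHigh = int64(float64(nodeAllocatableBytes) * factor)
	}
	if memoryHigh > requestBytes {
		unified["memory.high"] = strconv.FormatInt(memoryHigh, 10)
	}
	return unified
}

func main() {
	const mi = int64(1024 * 1024)
	// Request 128Mi, limit 256Mi, factor 0.8, as in the test case in the next file:
	// memory.min=134217728, memory.high=214748364.
	fmt.Println(containerMemoryQoS(128*mi, 256*mi, 32*1024*mi, 0.8))
}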
View File

@ -20,6 +20,7 @@ package kuberuntime
import (
"reflect"
"strconv"
"testing"
"github.com/google/go-cmp/cmp"
@ -36,7 +37,7 @@ import (
kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
)
func makeExpectedConfig(m *kubeGenericRuntimeManager, pod *v1.Pod, containerIndex int) *runtimeapi.ContainerConfig {
func makeExpectedConfig(m *kubeGenericRuntimeManager, pod *v1.Pod, containerIndex int, enforceMemoryQoS bool) *runtimeapi.ContainerConfig {
container := &pod.Spec.Containers[containerIndex]
podIP := ""
restartCount := 0
@ -62,7 +63,7 @@ func makeExpectedConfig(m *kubeGenericRuntimeManager, pod *v1.Pod, containerInde
Stdin: container.Stdin,
StdinOnce: container.StdinOnce,
Tty: container.TTY,
Linux: m.generateLinuxContainerConfig(container, pod, new(int64), "", nil),
Linux: m.generateLinuxContainerConfig(container, pod, new(int64), "", nil, enforceMemoryQoS),
Envs: envs,
}
return expectedConfig
@ -97,7 +98,7 @@ func TestGenerateContainerConfig(t *testing.T) {
},
}
expectedConfig := makeExpectedConfig(m, pod, 0)
expectedConfig := makeExpectedConfig(m, pod, 0, false)
containerConfig, _, err := m.generateContainerConfig(&pod.Spec.Containers[0], pod, 0, "", pod.Spec.Containers[0].Image, []string{}, nil)
assert.NoError(t, err)
assert.Equal(t, expectedConfig, containerConfig, "generate container config for kubelet runtime v1.")
@ -145,6 +146,101 @@ func TestGenerateContainerConfig(t *testing.T) {
assert.Error(t, err, "RunAsNonRoot should fail for non-numeric username")
}
func TestGenerateContainerConfigWithMemoryQoSEnforced(t *testing.T) {
_, _, m, err := createTestRuntimeManager()
assert.NoError(t, err)
pod1 := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: "12345678",
Name: "bar",
Namespace: "new",
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "foo",
Image: "busybox",
ImagePullPolicy: v1.PullIfNotPresent,
Command: []string{"testCommand"},
WorkingDir: "testWorkingDir",
Resources: v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceMemory: resource.MustParse("128Mi"),
},
Limits: v1.ResourceList{
v1.ResourceMemory: resource.MustParse("256Mi"),
},
},
},
},
},
}
pod2 := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: "12345678",
Name: "bar",
Namespace: "new",
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "foo",
Image: "busybox",
ImagePullPolicy: v1.PullIfNotPresent,
Command: []string{"testCommand"},
WorkingDir: "testWorkingDir",
Resources: v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceMemory: resource.MustParse("128Mi"),
},
},
},
},
},
}
memoryNodeAllocatable := resource.MustParse(fakeNodeAllocatableMemory)
pod2MemoryHigh := float64(memoryNodeAllocatable.Value()) * m.memoryThrottlingFactor
type expectedResult struct {
containerConfig *runtimeapi.LinuxContainerConfig
memoryLow int64
memoryHigh int64
}
tests := []struct {
name string
pod *v1.Pod
expected *expectedResult
}{
{
name: "Request128MBLimit256MB",
pod: pod1,
expected: &expectedResult{
m.generateLinuxContainerConfig(&pod1.Spec.Containers[0], pod1, new(int64), "", nil, true),
128 * 1024 * 1024,
int64(float64(256*1024*1024) * m.memoryThrottlingFactor),
},
},
{
name: "Request128MBWithoutLimit",
pod: pod2,
expected: &expectedResult{
m.generateLinuxContainerConfig(&pod2.Spec.Containers[0], pod2, new(int64), "", nil, true),
128 * 1024 * 1024,
int64(pod2MemoryHigh),
},
},
}
for _, test := range tests {
linuxConfig := m.generateLinuxContainerConfig(&test.pod.Spec.Containers[0], test.pod, new(int64), "", nil, true)
assert.Equal(t, test.expected.containerConfig, linuxConfig, test.name)
assert.Equal(t, linuxConfig.GetResources().GetUnified()["memory.min"], strconv.FormatInt(test.expected.memoryLow, 10), test.name)
assert.Equal(t, linuxConfig.GetResources().GetUnified()["memory.high"], strconv.FormatInt(test.expected.memoryHigh, 10), test.name)
}
}
func TestGetHugepageLimitsFromResources(t *testing.T) {
var baseHugepage []*runtimeapi.HugepageLimit
@ -361,7 +457,7 @@ func TestGenerateLinuxContainerConfigNamespaces(t *testing.T) {
},
} {
t.Run(tc.name, func(t *testing.T) {
got := m.generateLinuxContainerConfig(&tc.pod.Spec.Containers[0], tc.pod, nil, "", tc.target)
got := m.generateLinuxContainerConfig(&tc.pod.Spec.Containers[0], tc.pod, nil, "", tc.target, false)
if diff := cmp.Diff(tc.want, got.SecurityContext.NamespaceOptions); diff != "" {
t.Errorf("%v: diff (-want +got):\n%v", t.Name(), diff)
}
@ -452,7 +548,7 @@ func TestGenerateLinuxContainerConfigSwap(t *testing.T) {
} {
t.Run(tc.name, func(t *testing.T) {
m.memorySwapBehavior = tc.swapSetting
actual := m.generateLinuxContainerConfig(&tc.pod.Spec.Containers[0], tc.pod, nil, "", nil)
actual := m.generateLinuxContainerConfig(&tc.pod.Spec.Containers[0], tc.pod, nil, "", nil, false)
assert.Equal(t, tc.expected, actual.Resources.MemorySwapLimitInBytes, "memory swap config for %s", tc.name)
})
}

View File

@ -147,6 +147,12 @@ type kubeGenericRuntimeManager struct {
// MemorySwapBehavior defines how swap is used
memorySwapBehavior string
// Function to get node allocatable resources
getNodeAllocatable func() v1.ResourceList
// Memory throttling factor for MemoryQoS
memoryThrottlingFactor float64
}
// KubeGenericRuntime is a interface contains interfaces for container runtime and command.
@ -190,27 +196,31 @@ func NewKubeGenericRuntimeManager(
runtimeClassManager *runtimeclass.Manager,
seccompDefault bool,
memorySwapBehavior string,
getNodeAllocatable func() v1.ResourceList,
memoryThrottlingFactor float64,
) (KubeGenericRuntime, error) {
kubeRuntimeManager := &kubeGenericRuntimeManager{
recorder: recorder,
cpuCFSQuota: cpuCFSQuota,
cpuCFSQuotaPeriod: cpuCFSQuotaPeriod,
seccompProfileRoot: seccompProfileRoot,
livenessManager: livenessManager,
readinessManager: readinessManager,
startupManager: startupManager,
machineInfo: machineInfo,
osInterface: osInterface,
runtimeHelper: runtimeHelper,
runtimeService: newInstrumentedRuntimeService(runtimeService),
imageService: newInstrumentedImageManagerService(imageService),
internalLifecycle: internalLifecycle,
legacyLogProvider: legacyLogProvider,
logManager: logManager,
runtimeClassManager: runtimeClassManager,
logReduction: logreduction.NewLogReduction(identicalErrorDelay),
seccompDefault: seccompDefault,
memorySwapBehavior: memorySwapBehavior,
recorder: recorder,
cpuCFSQuota: cpuCFSQuota,
cpuCFSQuotaPeriod: cpuCFSQuotaPeriod,
seccompProfileRoot: seccompProfileRoot,
livenessManager: livenessManager,
readinessManager: readinessManager,
startupManager: startupManager,
machineInfo: machineInfo,
osInterface: osInterface,
runtimeHelper: runtimeHelper,
runtimeService: newInstrumentedRuntimeService(runtimeService),
imageService: newInstrumentedImageManagerService(imageService),
internalLifecycle: internalLifecycle,
legacyLogProvider: legacyLogProvider,
logManager: logManager,
runtimeClassManager: runtimeClassManager,
logReduction: logreduction.NewLogReduction(identicalErrorDelay),
seccompDefault: seccompDefault,
memorySwapBehavior: memorySwapBehavior,
getNodeAllocatable: getNodeAllocatable,
memoryThrottlingFactor: memoryThrottlingFactor,
}
typedVersion, err := kubeRuntimeManager.getTypedVersion()

View File

@ -28,6 +28,7 @@ import (
"github.com/stretchr/testify/require"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
@ -58,7 +59,10 @@ func customTestRuntimeManager(keyring *credentialprovider.BasicDockerKeyring) (*
// Only an empty machineInfo is needed here, because in unit test all containers are besteffort,
// data in machineInfo is not used. If burstable containers are used in unit test in the future,
// we may want to set memory capacity.
machineInfo := &cadvisorapi.MachineInfo{}
memoryCapacityQuantity := resource.MustParse(fakeNodeAllocatableMemory)
machineInfo := &cadvisorapi.MachineInfo{
MemoryCapacity: uint64(memoryCapacityQuantity.Value()),
}
osInterface := &containertest.FakeOS{}
manager, err := newFakeKubeRuntimeManager(fakeRuntimeService, fakeImageService, machineInfo, osInterface, &containertest.FakeRuntimeHelper{}, keyring)
return fakeRuntimeService, fakeImageService, manager, err

View File

@ -121,7 +121,7 @@ func calculateEmptyDirMemorySize(nodeAllocatableMemory *resource.Quantity, spec
// determine pod resource allocation
// we use the same function for pod cgroup assignment to maintain consistent behavior
// NOTE: this could be nil on systems that do not support pod memory containment (i.e. windows)
podResourceConfig := cm.ResourceConfigForPod(pod, false, uint64(100000))
podResourceConfig := cm.ResourceConfigForPod(pod, false, uint64(100000), false)
if podResourceConfig != nil && podResourceConfig.Memory != nil {
podMemoryLimit := resource.NewQuantity(*(podResourceConfig.Memory), resource.BinarySI)
// ensure 0 < value < size

View File

@ -947,6 +947,15 @@ type KubeletConfiguration struct {
// Default: false
// +optional
SeccompDefault *bool `json:"seccompDefault,omitempty"`
// MemoryThrottlingFactor specifies the factor multiplied by the memory limit or node allocatable memory
// when setting the cgroupv2 memory.high value to enforce MemoryQoS.
// Decreasing this factor sets a lower memory.high limit for container cgroups and applies heavier reclaim pressure,
// while increasing it applies less reclaim pressure.
// See http://kep.k8s.io/2570 for more details.
// Default: 0.8
// +featureGate=MemoryQoS
// +optional
MemoryThrottlingFactor *float64 `json:"memoryThrottlingFactor,omitempty"`
}
type KubeletAuthorizationMode string

View File

@ -327,6 +327,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) {
*out = new(bool)
**out = **in
}
if in.MemoryThrottlingFactor != nil {
in, out := &in.MemoryThrottlingFactor, &out.MemoryThrottlingFactor
*out = new(float64)
**out = **in
}
return
}