From a673b64864924d3ae8e4e781f44fbf691e4262ac Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 14 Dec 2021 22:46:38 -0800 Subject: [PATCH 1/5] kubelet/cm: speed up cgroup creation There's no need to call m.Update (which will create another instance of libcontainer cgroup manager, convert all the resources and then set them). All this is already done here, except for Set(). Signed-off-by: Kir Kolyshkin --- pkg/kubelet/cm/cgroup_manager_linux.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go index 230173690d5..c64067a3712 100644 --- a/pkg/kubelet/cm/cgroup_manager_linux.go +++ b/pkg/kubelet/cm/cgroup_manager_linux.go @@ -534,7 +534,7 @@ func (m *cgroupManagerImpl) Create(cgroupConfig *CgroupConfig) error { // it may confuse why we call set after we do apply, but the issue is that runc // follows a similar pattern. it's needed to ensure cpu quota is set properly. - if err := m.Update(cgroupConfig); err != nil { + if err := manager.Set(resources); err != nil { utilruntime.HandleError(fmt.Errorf("cgroup update failed %v", err)) } From 59148e22d069975ab1889a5ae892e45653096aeb Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 23 Aug 2021 14:06:46 -0700 Subject: [PATCH 2/5] pkg/kubelet/cm: rm dup code Commit ecd6361f added setting PidsLimit to Create and Update. Commit bce9d5f2 added setting PidsLimit to m.toResources. Now, PidsLimit is assigned twice. Remove the duplicate. Fixes: bce9d5f2 Signed-off-by: Kir Kolyshkin --- pkg/kubelet/cm/cgroup_manager_linux.go | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go index c64067a3712..7c153703bb9 100644 --- a/pkg/kubelet/cm/cgroup_manager_linux.go +++ b/pkg/kubelet/cm/cgroup_manager_linux.go @@ -449,8 +449,7 @@ func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error { }() // Extract the cgroup resource parameters - resourceConfig := cgroupConfig.ResourceParameters - resources := m.toResources(resourceConfig) + resources := m.toResources(cgroupConfig.ResourceParameters) libcontainerCgroupConfig := &libcontainerconfigs.Cgroup{ Resources: resources, @@ -470,10 +469,6 @@ func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error { updateSystemdCgroupInfo(libcontainerCgroupConfig, cgroupConfig.Name) } - if cgroupConfig.ResourceParameters != nil && cgroupConfig.ResourceParameters.PidsLimit != nil { - resources.PidsLimit = *cgroupConfig.ResourceParameters.PidsLimit - } - if unified { supportedControllers := getSupportedUnifiedControllers() if !supportedControllers.Has("hugetlb") { @@ -512,10 +507,6 @@ func (m *cgroupManagerImpl) Create(cgroupConfig *CgroupConfig) error { libcontainerCgroupConfig.Path = cgroupConfig.Name.ToCgroupfs() } - if cgroupConfig.ResourceParameters != nil && cgroupConfig.ResourceParameters.PidsLimit != nil { - libcontainerCgroupConfig.PidsLimit = *cgroupConfig.ResourceParameters.PidsLimit - } - // get the manager with the specified cgroup configuration manager, err := m.adapter.newManager(libcontainerCgroupConfig, nil) if err != nil { From 11b0d57c9375e24746a7f781ef3d2bca63e9d87a Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 20 Dec 2021 14:48:40 -0800 Subject: [PATCH 3/5] pkg/kubelet/cm/cgroup_manager: simplify setting hugetlb Commit 79be8be10edba made hugetlb settings optional if cgroup v2 is used and hugetlb is not available, fixing issue 92933. Note at that time this was only needed for v2, because for v1 the resources were set one-by-one, and only for supported resources. Commit d312ef7eb6730099 switched the code to using Set from runc/libcontainer cgroups manager, and expanded the check to cgroup v1 as well. Move this check earlier, to inside m.toResources, so instead of converting all hugetlb resources from ResourceConfig to libcontainers's Resources.HugetlbLimit, and then setting it to nil, we can skip the conversion entirely if hugetlb is not supported, thus not doing the work that is not needed. Signed-off-by: Kir Kolyshkin --- pkg/kubelet/cm/cgroup_manager_linux.go | 51 ++++++++++++++------------ 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go index 7c153703bb9..e09930e33aa 100644 --- a/pkg/kubelet/cm/cgroup_manager_linux.go +++ b/pkg/kubelet/cm/cgroup_manager_linux.go @@ -404,8 +404,34 @@ func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcont if resourceConfig.PidsLimit != nil { resources.PidsLimit = *resourceConfig.PidsLimit } - // if huge pages are enabled, we set them in libcontainer - // for each page size enumerated, set that value + + m.maybeSetHugetlb(resourceConfig, resources) + + // Ideally unified is used for all the resources when running on cgroup v2. + // It doesn't make difference for the memory.max limit, but for e.g. the cpu controller + // you can specify the correct setting without relying on the conversions performed by the OCI runtime. + if resourceConfig.Unified != nil && libcontainercgroups.IsCgroup2UnifiedMode() { + resources.Unified = make(map[string]string) + for k, v := range resourceConfig.Unified { + resources.Unified[k] = v + } + } + return resources +} + +func (m *cgroupManagerImpl) maybeSetHugetlb(resourceConfig *ResourceConfig, resources *libcontainerconfigs.Resources) { + // Check if hugetlb is supported. + if libcontainercgroups.IsCgroup2UnifiedMode() { + if !getSupportedUnifiedControllers().Has("hugetlb") { + klog.V(6).InfoS("Optional subsystem not supported: hugetlb") + return + } + } else if _, ok := m.subsystems.MountPoints["hugetlb"]; !ok { + klog.V(6).InfoS("Optional subsystem not supported: hugetlb") + return + } + + // For each page size enumerated, set that value. pageSizes := sets.NewString() for pageSize, limit := range resourceConfig.HugePageLimit { sizeString, err := v1helper.HugePageUnitSizeFromByteSize(pageSize) @@ -429,16 +455,6 @@ func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcont Limit: uint64(0), }) } - // Ideally unified is used for all the resources when running on cgroup v2. - // It doesn't make difference for the memory.max limit, but for e.g. the cpu controller - // you can specify the correct setting without relying on the conversions performed by the OCI runtime. - if resourceConfig.Unified != nil && libcontainercgroups.IsCgroup2UnifiedMode() { - resources.Unified = make(map[string]string) - for k, v := range resourceConfig.Unified { - resources.Unified[k] = v - } - } - return resources } // Update updates the cgroup with the specified Cgroup Configuration @@ -469,17 +485,6 @@ func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error { updateSystemdCgroupInfo(libcontainerCgroupConfig, cgroupConfig.Name) } - if unified { - supportedControllers := getSupportedUnifiedControllers() - if !supportedControllers.Has("hugetlb") { - resources.HugetlbLimit = nil - klog.V(6).InfoS("Optional subsystem not supported: hugetlb") - } - } else if _, ok := m.subsystems.MountPoints["hugetlb"]; !ok { - resources.HugetlbLimit = nil - klog.V(6).InfoS("Optional subsystem not supported: hugetlb") - } - manager, err := m.adapter.newManager(libcontainerCgroupConfig, paths) if err != nil { return fmt.Errorf("failed to create cgroup manager: %v", err) From 9652d0cedcc6709307db4b1a2c18e57c5b96280f Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 21 Dec 2021 12:16:39 -0800 Subject: [PATCH 4/5] pkg/kubelet/cm: move common code to libctCgroupConfig Instead of doing (almost) the same thing from the three different methods (Create, Update, Destroy), move the functionality to libctCgroupConfig, replacing updateSystemdCgroupInfo. The needResources bool is needed because we do not need resources during Destroy, so we skip the unneeded resource conversion. Signed-off-by: Kir Kolyshkin --- pkg/kubelet/cm/cgroup_manager_linux.go | 80 ++++++++++++-------------- 1 file changed, 36 insertions(+), 44 deletions(-) diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go index e09930e33aa..4acdfe51253 100644 --- a/pkg/kubelet/cm/cgroup_manager_linux.go +++ b/pkg/kubelet/cm/cgroup_manager_linux.go @@ -237,20 +237,41 @@ func (m *cgroupManagerImpl) buildCgroupUnifiedPath(name CgroupName) string { return path.Join(cmutil.CgroupRoot, cgroupFsAdaptedName) } -// TODO(filbranden): This logic belongs in libcontainer/cgroup/systemd instead. -// It should take a libcontainerconfigs.Cgroup.Path field (rather than Name and Parent) -// and split it appropriately, using essentially the logic below. -// This was done for cgroupfs in opencontainers/runc#497 but a counterpart -// for systemd was never introduced. -func updateSystemdCgroupInfo(cgroupConfig *libcontainerconfigs.Cgroup, cgroupName CgroupName) { - dir, base := path.Split(cgroupName.ToSystemd()) +// libctCgroupConfig converts CgroupConfig to libcontainer's Cgroup config. +func (m *cgroupManagerImpl) libctCgroupConfig(in *CgroupConfig, needResources bool) *libcontainerconfigs.Cgroup { + config := &libcontainerconfigs.Cgroup{} + if needResources { + config.Resources = m.toResources(in.ResourceParameters) + } else { + config.Resources = &libcontainerconfigs.Resources{} + } + + if m.adapter.cgroupManagerType == libcontainerCgroupfs { + // For fs cgroup manager, we can either set Path or Name and Parent. + // Setting Path is easier. + config.Path = in.Name.ToCgroupfs() + + return config + } + + // For systemd, we have to set Name and Parent, as they are needed to talk to systemd. + // Setting Path is optional as it can be deduced from Name and Parent. + + // TODO(filbranden): This logic belongs in libcontainer/cgroup/systemd instead. + // It should take a libcontainerconfigs.Cgroup.Path field (rather than Name and Parent) + // and split it appropriately, using essentially the logic below. + // This was done for cgroupfs in opencontainers/runc#497 but a counterpart + // for systemd was never introduced. + dir, base := path.Split(in.Name.ToSystemd()) if dir == "/" { dir = "-.slice" } else { dir = path.Base(dir) } - cgroupConfig.Parent = dir - cgroupConfig.Name = base + config.Parent = dir + config.Name = base + + return config } // Exists checks if all subsystem cgroups already exist @@ -313,14 +334,7 @@ func (m *cgroupManagerImpl) Destroy(cgroupConfig *CgroupConfig) error { cgroupPaths := m.buildCgroupPaths(cgroupConfig.Name) - libcontainerCgroupConfig := &libcontainerconfigs.Cgroup{} - // libcontainer consumes a different field and expects a different syntax - // depending on the cgroup driver in use, so we need this conditional here. - if m.adapter.cgroupManagerType == libcontainerSystemd { - updateSystemdCgroupInfo(libcontainerCgroupConfig, cgroupConfig.Name) - } else { - libcontainerCgroupConfig.Path = cgroupConfig.Name.ToCgroupfs() - } + libcontainerCgroupConfig := m.libctCgroupConfig(cgroupConfig, false) manager, err := m.adapter.newManager(libcontainerCgroupConfig, cgroupPaths) if err != nil { @@ -464,12 +478,7 @@ func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error { metrics.CgroupManagerDuration.WithLabelValues("update").Observe(metrics.SinceInSeconds(start)) }() - // Extract the cgroup resource parameters - resources := m.toResources(cgroupConfig.ResourceParameters) - - libcontainerCgroupConfig := &libcontainerconfigs.Cgroup{ - Resources: resources, - } + libcontainerCgroupConfig := m.libctCgroupConfig(cgroupConfig, true) unified := libcontainercgroups.IsCgroup2UnifiedMode() var paths map[string]string @@ -479,17 +488,11 @@ func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error { paths = m.buildCgroupPaths(cgroupConfig.Name) } - // libcontainer consumes a different field and expects a different syntax - // depending on the cgroup driver in use, so we need this conditional here. - if m.adapter.cgroupManagerType == libcontainerSystemd { - updateSystemdCgroupInfo(libcontainerCgroupConfig, cgroupConfig.Name) - } - manager, err := m.adapter.newManager(libcontainerCgroupConfig, paths) if err != nil { return fmt.Errorf("failed to create cgroup manager: %v", err) } - return manager.Set(resources) + return manager.Set(libcontainerCgroupConfig.Resources) } // Create creates the specified cgroup @@ -499,18 +502,7 @@ func (m *cgroupManagerImpl) Create(cgroupConfig *CgroupConfig) error { metrics.CgroupManagerDuration.WithLabelValues("create").Observe(metrics.SinceInSeconds(start)) }() - resources := m.toResources(cgroupConfig.ResourceParameters) - - libcontainerCgroupConfig := &libcontainerconfigs.Cgroup{ - Resources: resources, - } - // libcontainer consumes a different field and expects a different syntax - // depending on the cgroup driver in use, so we need this conditional here. - if m.adapter.cgroupManagerType == libcontainerSystemd { - updateSystemdCgroupInfo(libcontainerCgroupConfig, cgroupConfig.Name) - } else { - libcontainerCgroupConfig.Path = cgroupConfig.Name.ToCgroupfs() - } + libcontainerCgroupConfig := m.libctCgroupConfig(cgroupConfig, true) // get the manager with the specified cgroup configuration manager, err := m.adapter.newManager(libcontainerCgroupConfig, nil) @@ -530,8 +522,8 @@ func (m *cgroupManagerImpl) Create(cgroupConfig *CgroupConfig) error { // it may confuse why we call set after we do apply, but the issue is that runc // follows a similar pattern. it's needed to ensure cpu quota is set properly. - if err := manager.Set(resources); err != nil { - utilruntime.HandleError(fmt.Errorf("cgroup update failed %v", err)) + if err := manager.Set(libcontainerCgroupConfig.Resources); err != nil { + utilruntime.HandleError(fmt.Errorf("cgroup manager.Set failed: %w", err)) } return nil From de5a69d8475f26d12b1b40bf8c3c5123724ade73 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 22 Dec 2021 17:26:08 -0800 Subject: [PATCH 5/5] pkg/kubelet/cm: fix potential nil dereference in enforceExistingCgroup Move the rl == nil check to before we dereference it. Signed-off-by: Kir Kolyshkin --- pkg/kubelet/cm/node_container_manager_linux.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/kubelet/cm/node_container_manager_linux.go b/pkg/kubelet/cm/node_container_manager_linux.go index 3217be59873..0ec90bec255 100644 --- a/pkg/kubelet/cm/node_container_manager_linux.go +++ b/pkg/kubelet/cm/node_container_manager_linux.go @@ -136,6 +136,9 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error { // enforceExistingCgroup updates the limits `rl` on existing cgroup `cName` using `cgroupManager` interface. func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.ResourceList) error { rp := getCgroupConfig(rl) + if rp == nil { + return fmt.Errorf("%q cgroup is not configured properly", cName) + } // Enforce MemoryQoS for cgroups of kube-reserved/system-reserved. For more information, // see https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2570-memory-qos @@ -152,9 +155,6 @@ func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1. Name: cName, ResourceParameters: rp, } - if cgroupConfig.ResourceParameters == nil { - return fmt.Errorf("%q cgroup is not config properly", cgroupConfig.Name) - } klog.V(4).InfoS("Enforcing limits on cgroup", "cgroupName", cName, "cpuShares", cgroupConfig.ResourceParameters.CpuShares, "memory", cgroupConfig.ResourceParameters.Memory, "pidsLimit", cgroupConfig.ResourceParameters.PidsLimit) if !cgroupManager.Exists(cgroupConfig.Name) { return fmt.Errorf("%q cgroup does not exist", cgroupConfig.Name)