From 6d16fee22934829583184caca32672bf473d077c Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Fri, 3 Apr 2020 15:54:00 +0200 Subject: [PATCH 1/4] kubelet: cpu hard capping is supported on cgroup v2 Signed-off-by: Giuseppe Scrivano --- pkg/kubelet/cm/container_manager_linux.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go index d6af9835883..a95dbafd9f0 100644 --- a/pkg/kubelet/cm/container_manager_linux.go +++ b/pkg/kubelet/cm/container_manager_linux.go @@ -162,6 +162,7 @@ func validateSystemRequirements(mountUtil mount.Interface) (features, error) { } if cgroups.IsCgroup2UnifiedMode() { + f.cpuHardcapping = true return f, nil } From a9772b2290b493aa0b5f6311c692243785c6b347 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Fri, 3 Apr 2020 15:53:39 +0200 Subject: [PATCH 2/4] kubelet: adapt cgroup_manager to cgroup v2 Signed-off-by: Giuseppe Scrivano --- pkg/kubelet/cm/BUILD | 2 + pkg/kubelet/cm/cgroup_manager_linux.go | 171 +++++++++++++++++++++++-- 2 files changed, 162 insertions(+), 11 deletions(-) diff --git a/pkg/kubelet/cm/BUILD b/pkg/kubelet/cm/BUILD index 1ad243e0b3a..9e67fa9019a 100644 --- a/pkg/kubelet/cm/BUILD +++ b/pkg/kubelet/cm/BUILD @@ -70,6 +70,7 @@ go_library( "//vendor/github.com/docker/go-units:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs:go_default_library", + "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library", "//vendor/k8s.io/utils/io:go_default_library", @@ -121,6 +122,7 @@ go_library( "//vendor/github.com/docker/go-units:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs:go_default_library", + "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library", "//vendor/k8s.io/utils/io:go_default_library", diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go index 4d8b26e5eb0..8a53688dbee 100644 --- a/pkg/kubelet/cm/cgroup_manager_linux.go +++ b/pkg/kubelet/cm/cgroup_manager_linux.go @@ -18,15 +18,18 @@ package cm import ( "fmt" + "io/ioutil" "os" "path" "path/filepath" "strconv" "strings" + "sync" "time" libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs" + cgroupfs2 "github.com/opencontainers/runc/libcontainer/cgroups/fs2" cgroupsystemd "github.com/opencontainers/runc/libcontainer/cgroups/systemd" libcontainerconfigs "github.com/opencontainers/runc/libcontainer/configs" "k8s.io/klog" @@ -36,6 +39,7 @@ import ( "k8s.io/apimachinery/pkg/util/sets" utilfeature "k8s.io/apiserver/pkg/util/feature" kubefeatures "k8s.io/kubernetes/pkg/features" + cmutil "k8s.io/kubernetes/pkg/kubelet/cm/util" "k8s.io/kubernetes/pkg/kubelet/metrics" ) @@ -228,6 +232,12 @@ func (m *cgroupManagerImpl) buildCgroupPaths(name CgroupName) map[string]string return cgroupPaths } +// buildCgroupUnifiedPath builds a path to the specified name. 
+func (m *cgroupManagerImpl) buildCgroupUnifiedPath(name CgroupName) string { + cgroupFsAdaptedName := m.Name(name) + return path.Join(cmutil.CgroupRoot, cgroupFsAdaptedName) +} + // TODO(filbranden): This logic belongs in libcontainer/cgroup/systemd instead. // It should take a libcontainerconfigs.Cgroup.Path field (rather than Name and Parent) // and split it appropriately, using essentially the logic below. @@ -246,6 +256,21 @@ func updateSystemdCgroupInfo(cgroupConfig *libcontainerconfigs.Cgroup, cgroupNam // Exists checks if all subsystem cgroups already exist func (m *cgroupManagerImpl) Exists(name CgroupName) bool { + if libcontainercgroups.IsCgroup2UnifiedMode() { + cgroupPath := m.buildCgroupUnifiedPath(name) + neededControllers := getSupportedUnifiedControllers() + enabledControllers, err := readUnifiedControllers(cgroupPath) + if err != nil { + return false + } + difference := neededControllers.Difference(enabledControllers) + if difference.Len() > 0 { + klog.V(4).Infof("The Cgroup %v has some missing controllers: %v", name, difference) + return false + } + return true + } + // Get map of all cgroup paths on the system for the particular cgroup cgroupPaths := m.buildCgroupPaths(name) @@ -338,7 +363,7 @@ func getSupportedSubsystems() map[subsystem]bool { return supportedSubsystems } -// setSupportedSubsystems sets cgroup resource limits only on the supported +// setSupportedSubsystemsV1 sets cgroup resource limits on cgroup v1 only on the supported // subsystems. ie. cpu and memory. We don't use libcontainer's cgroup/fs/Set() // method as it doesn't allow us to skip updates on the devices cgroup // Allowing or denying all devices by writing 'a' to devices.allow or devices.deny is @@ -347,7 +372,7 @@ func getSupportedSubsystems() map[subsystem]bool { // We would like to skip setting any values on the device cgroup in this case // but this is not possible with libcontainers Set() method // See https://github.com/opencontainers/runc/issues/932 -func setSupportedSubsystems(cgroupConfig *libcontainerconfigs.Cgroup) error { +func setSupportedSubsystemsV1(cgroupConfig *libcontainerconfigs.Cgroup) error { for sys, required := range getSupportedSubsystems() { if _, ok := cgroupConfig.Paths[sys.Name()]; !ok { if required { @@ -388,6 +413,104 @@ func getCpuMax(cpuQuota *int64, cpuPeriod *uint64) string { return fmt.Sprintf("%s %s", quotaStr, periodStr) } +// readUnifiedControllers reads the controllers available at the specified cgroup +func readUnifiedControllers(path string) (sets.String, error) { + controllersFileContent, err := ioutil.ReadFile(filepath.Join(path, "cgroup.controllers")) + if err != nil { + return nil, err + } + controllers := strings.Fields(string(controllersFileContent)) + return sets.NewString(controllers...), nil +} + +var ( + availableRootControllersOnce sync.Once + availableRootControllers sets.String +) + +// getSupportedUnifiedControllers returns a set of supported controllers when running on cgroup v2 +func getSupportedUnifiedControllers() sets.String { + // This is the set of controllers used by the Kubelet + supportedControllers := sets.NewString("cpu", "cpuset", "memory", "hugetlb") + if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) || utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportNodePidsLimit) { + supportedControllers.Insert("pids") + } + // Memoize the set of controllers that are present in the root cgroup + availableRootControllersOnce.Do(func() { + var err error + availableRootControllers, err = 
readUnifiedControllers(cmutil.CgroupRoot)
+ if err != nil {
+ panic(fmt.Errorf("cannot read cgroup controllers at %s", cmutil.CgroupRoot))
+ }
+ })
+ // Return the set of controllers that are supported both by the Kubelet and by the kernel
+ return supportedControllers.Intersection(availableRootControllers)
+}
+
+// propagateControllers on a unified hierarchy enables all the supported controllers for the specified cgroup
+func propagateControllers(path string) error {
+ if err := os.MkdirAll(path, 0755); err != nil {
+ return fmt.Errorf("failed to create cgroup %q: %v", path, err)
+ }
+
+ // Retrieve all the supported controllers from the cgroup root
+ controllersFileContent, err := ioutil.ReadFile(filepath.Join(cmutil.CgroupRoot, "cgroup.controllers"))
+ if err != nil {
+ return fmt.Errorf("failed to read controllers from %q: %v", cmutil.CgroupRoot, err)
+ }
+
+ supportedControllers := getSupportedUnifiedControllers()
+
+ // The retrieved content looks like: "cpuset cpu io memory hugetlb pids". Prepend each of the controllers
+ // with '+', so we have something like "+cpuset +cpu +io +memory +hugetlb +pids"
+ controllers := ""
+ for _, controller := range strings.Fields(string(controllersFileContent)) {
+ // ignore controllers we don't care about
+ if !supportedControllers.Has(controller) {
+ continue
+ }
+
+ sep := " +"
+ if controllers == "" {
+ sep = "+"
+ }
+ controllers = controllers + sep + controller
+ }
+
+ current := cmutil.CgroupRoot
+ relPath, err := filepath.Rel(cmutil.CgroupRoot, path)
+ if err != nil {
+ return fmt.Errorf("failed to get relative path to cgroup root from %q: %v", path, err)
+ }
+ // Write the controllers list to each "cgroup.subtree_control" file until it reaches the parent cgroup.
+ // For the /foo/bar/baz cgroup, controllers must be enabled sequentially in the files:
+ // - /sys/fs/cgroup/foo/cgroup.subtree_control
+ // - /sys/fs/cgroup/foo/bar/cgroup.subtree_control
+ for _, p := range strings.Split(filepath.Dir(relPath), "/") {
+ current = filepath.Join(current, p)
+ if err := ioutil.WriteFile(filepath.Join(current, "cgroup.subtree_control"), []byte(controllers), 0755); err != nil {
+ return fmt.Errorf("failed to enable controllers on %q: %v", current, err)
+ }
+ }
+ return nil
+}
+
+// setResourcesV2 sets cgroup resource limits on cgroup v2
+func setResourcesV2(cgroupConfig *libcontainerconfigs.Cgroup) error {
+ if err := propagateControllers(cgroupConfig.Path); err != nil {
+ return err
+ }
+
+ manager, err := cgroupfs2.NewManager(cgroupConfig, cgroupConfig.Path, false)
+ if err != nil {
+ return fmt.Errorf("failed to create cgroup v2 manager: %v", err)
+ }
+ config := &libcontainerconfigs.Config{
+ Cgroups: cgroupConfig,
+ }
+ return manager.Set(config)
+}
+
 func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcontainerconfigs.Resources {
 resources := &libcontainerconfigs.Resources{}
 if resourceConfig == nil {
@@ -454,12 +577,17 @@ func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error {
 resourceConfig := cgroupConfig.ResourceParameters
 resources := m.toResources(resourceConfig)
 
- cgroupPaths := m.buildCgroupPaths(cgroupConfig.Name)
-
 libcontainerCgroupConfig := &libcontainerconfigs.Cgroup{
 Resources: resources,
- Paths: cgroupPaths,
 }
+
+ unified := libcontainercgroups.IsCgroup2UnifiedMode()
+ if unified {
+ libcontainerCgroupConfig.Path = m.buildCgroupUnifiedPath(cgroupConfig.Name)
+ } else {
+ libcontainerCgroupConfig.Paths = m.buildCgroupPaths(cgroupConfig.Name)
+ }
+
 // libcontainer consumes a 
different field and expects a different syntax // depending on the cgroup driver in use, so we need this conditional here. if m.adapter.cgroupManagerType == libcontainerSystemd { @@ -472,8 +600,14 @@ func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error { libcontainerCgroupConfig.PidsLimit = *cgroupConfig.ResourceParameters.PidsLimit } - if err := setSupportedSubsystems(libcontainerCgroupConfig); err != nil { - return fmt.Errorf("failed to set supported cgroup subsystems for cgroup %v: %v", cgroupConfig.Name, err) + if unified { + if err := setResourcesV2(libcontainerCgroupConfig); err != nil { + return fmt.Errorf("failed to set resources for cgroup %v: %v", cgroupConfig.Name, err) + } + } else { + if err := setSupportedSubsystemsV1(libcontainerCgroupConfig); err != nil { + return fmt.Errorf("failed to set supported cgroup subsystems for cgroup %v: %v", cgroupConfig.Name, err) + } } return nil } @@ -619,10 +753,25 @@ func toResourceStats(stats *libcontainercgroups.Stats) *ResourceStats { // Get sets the ResourceParameters of the specified cgroup as read from the cgroup fs func (m *cgroupManagerImpl) GetResourceStats(name CgroupName) (*ResourceStats, error) { - cgroupPaths := m.buildCgroupPaths(name) - stats, err := getStatsSupportedSubsystems(cgroupPaths) - if err != nil { - return nil, fmt.Errorf("failed to get stats supported cgroup subsystems for cgroup %v: %v", name, err) + var err error + var stats *libcontainercgroups.Stats + if libcontainercgroups.IsCgroup2UnifiedMode() { + cgroupPath := m.buildCgroupUnifiedPath(name) + manager, err := cgroupfs2.NewManager(nil, cgroupPath, false) + if err != nil { + return nil, fmt.Errorf("failed to create cgroup v2 manager: %v", err) + } + + stats, err = manager.GetStats() + if err != nil { + return nil, fmt.Errorf("failed to get stats for cgroup %v: %v", name, err) + } + } else { + cgroupPaths := m.buildCgroupPaths(name) + stats, err = getStatsSupportedSubsystems(cgroupPaths) + if err != nil { + return nil, fmt.Errorf("failed to get stats supported cgroup subsystems for cgroup %v: %v", name, err) + } } return toResourceStats(stats), nil } From 43c56eb4038ee9f6b7275f5aefe3aba6fd5f181b Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Tue, 7 Apr 2020 13:09:45 +0200 Subject: [PATCH 3/4] e2e_node: adapt tests to cgroup v2 and fix node_container_manager_test to run with the systemd cgroup manager. 
Signed-off-by: Giuseppe Scrivano --- test/e2e_node/BUILD | 2 + test/e2e_node/hugepages_test.go | 9 ++- test/e2e_node/node_container_manager_test.go | 81 +++++++++++++++----- test/e2e_node/pids_test.go | 8 +- test/e2e_node/pods_container_manager_test.go | 18 ++++- test/e2e_node/services/kubelet.go | 4 + test/e2e_node/summary_test.go | 6 +- test/e2e_node/utils_linux.go | 28 +++++++ test/e2e_node/utils_unsupported.go | 24 ++++++ 9 files changed, 155 insertions(+), 25 deletions(-) create mode 100644 test/e2e_node/utils_linux.go create mode 100644 test/e2e_node/utils_unsupported.go diff --git a/test/e2e_node/BUILD b/test/e2e_node/BUILD index 7125bc64969..54f7109a99d 100644 --- a/test/e2e_node/BUILD +++ b/test/e2e_node/BUILD @@ -21,6 +21,8 @@ go_library( "util_sriov.go", "util_xfs_linux.go", "util_xfs_unsupported.go", + "utils_linux.go", + "utils_unsupported.go", ], importpath = "k8s.io/kubernetes/test/e2e_node", visibility = ["//visibility:public"], diff --git a/test/e2e_node/hugepages_test.go b/test/e2e_node/hugepages_test.go index c15d2a2a541..2f3a933774c 100644 --- a/test/e2e_node/hugepages_test.go +++ b/test/e2e_node/hugepages_test.go @@ -50,8 +50,13 @@ func makePodToVerifyHugePages(baseName string, hugePagesLimit resource.Quantity) cgroupFsName = cgroupName.ToCgroupfs() } - // this command takes the expected value and compares it against the actual value for the pod cgroup hugetlb.2MB.limit_in_bytes - command := fmt.Sprintf("expected=%v; actual=$(cat /tmp/hugetlb/%v/hugetlb.2MB.limit_in_bytes); if [ \"$expected\" -ne \"$actual\" ]; then exit 1; fi; ", hugePagesLimit.Value(), cgroupFsName) + command := "" + // this command takes the expected value and compares it against the actual value for the pod cgroup hugetlb.2MB. + if IsCgroup2UnifiedMode() { + command = fmt.Sprintf("expected=%v; actual=$(cat /tmp/%v/hugetlb.2MB.max); if [ \"$expected\" -ne \"$actual\" ]; then exit 1; fi; ", hugePagesLimit.Value(), cgroupFsName) + } else { + command = fmt.Sprintf("expected=%v; actual=$(cat /tmp/hugetlb/%v/hugetlb.2MB.limit_in_bytes); if [ \"$expected\" -ne \"$actual\" ]; then exit 1; fi; ", hugePagesLimit.Value(), cgroupFsName) + } framework.Logf("Pod to run command: %v", command) pod := &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ diff --git a/test/e2e_node/node_container_manager_test.go b/test/e2e_node/node_container_manager_test.go index 45cf79f73df..d128cecaee6 100644 --- a/test/e2e_node/node_container_manager_test.go +++ b/test/e2e_node/node_container_manager_test.go @@ -149,6 +149,11 @@ func destroyTemporaryCgroupsForReservation(cgroupManager cm.CgroupManager) error return cgroupManager.Destroy(cgroupConfig) } +// convertSharesToWeight converts from cgroup v1 cpu.shares to cgroup v2 cpu.weight +func convertSharesToWeight(shares int64) int64 { + return 1 + ((shares-2)*9999)/262142 +} + func runTest(f *framework.Framework) error { var oldCfg *kubeletconfig.KubeletConfiguration subsystems, err := cm.GetCgroupSubsystems() @@ -187,8 +192,14 @@ func runTest(f *framework.Framework) error { expectedNAPodCgroup := cm.ParseCgroupfsToCgroupName(currentConfig.CgroupRoot) expectedNAPodCgroup = cm.NewCgroupName(expectedNAPodCgroup, "kubepods") if !cgroupManager.Exists(expectedNAPodCgroup) { - return fmt.Errorf("Expected Node Allocatable Cgroup Does not exist") + return fmt.Errorf("Expected Node Allocatable Cgroup %q does not exist", expectedNAPodCgroup) } + + memoryLimitFile := "memory.limit_in_bytes" + if IsCgroup2UnifiedMode() { + memoryLimitFile = "memory.max" + } + // TODO: Update cgroupManager to expose a 
Status interface to get current Cgroup Settings. // The node may not have updated capacity and allocatable yet, so check that it happens eventually. gomega.Eventually(func() error { @@ -199,20 +210,33 @@ func runTest(f *framework.Framework) error { if len(nodeList.Items) != 1 { return fmt.Errorf("Unexpected number of node objects for node e2e. Expects only one node: %+v", nodeList) } + cgroupName := "kubepods" + if currentConfig.CgroupDriver == "systemd" { + cgroupName = "kubepods.slice" + } + node := nodeList.Items[0] capacity := node.Status.Capacity allocatableCPU, allocatableMemory, allocatablePIDs := getAllocatableLimits("200m", "200Mi", "1738", capacity) // Total Memory reservation is 200Mi excluding eviction thresholds. // Expect CPU shares on node allocatable cgroup to equal allocatable. - if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], "kubepods", "cpu.shares"), int64(cm.MilliCPUToShares(allocatableCPU.MilliValue())), 10); err != nil { - return err + shares := int64(cm.MilliCPUToShares(allocatableCPU.MilliValue())) + if IsCgroup2UnifiedMode() { + // convert to the cgroup v2 cpu.weight value + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], cgroupName, "cpu.weight"), convertSharesToWeight(shares), 10); err != nil { + return err + } + } else { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], cgroupName, "cpu.shares"), shares, 10); err != nil { + return err + } } // Expect Memory limit on node allocatable cgroup to equal allocatable. - if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["memory"], "kubepods", "memory.limit_in_bytes"), allocatableMemory.Value(), 0); err != nil { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["memory"], cgroupName, memoryLimitFile), allocatableMemory.Value(), 0); err != nil { return err } // Expect PID limit on node allocatable cgroup to equal allocatable. - if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["pids"], "kubepods", "pids.max"), allocatablePIDs.Value(), 0); err != nil { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["pids"], cgroupName, "pids.max"), allocatablePIDs.Value(), 0); err != nil { return err } @@ -235,42 +259,61 @@ func runTest(f *framework.Framework) error { return nil }, time.Minute, 5*time.Second).Should(gomega.BeNil()) - kubeReservedCgroupName := cm.NewCgroupName(cm.RootCgroupName, kubeReservedCgroup) - if !cgroupManager.Exists(kubeReservedCgroupName) { - return fmt.Errorf("Expected kube reserved cgroup Does not exist") + cgroupPath := "" + if currentConfig.CgroupDriver == "systemd" { + cgroupPath = cm.ParseSystemdToCgroupName(kubeReservedCgroup).ToSystemd() + } else { + cgroupPath = cgroupManager.Name(cm.NewCgroupName(cm.RootCgroupName, kubeReservedCgroup)) } // Expect CPU shares on kube reserved cgroup to equal it's reservation which is `100m`. 
kubeReservedCPU := resource.MustParse(currentConfig.KubeReserved[string(v1.ResourceCPU)]) - if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], cgroupManager.Name(kubeReservedCgroupName), "cpu.shares"), int64(cm.MilliCPUToShares(kubeReservedCPU.MilliValue())), 10); err != nil { - return err + shares := int64(cm.MilliCPUToShares(kubeReservedCPU.MilliValue())) + if IsCgroup2UnifiedMode() { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], cgroupPath, "cpu.weight"), convertSharesToWeight(shares), 10); err != nil { + return err + } + } else { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], cgroupPath, "cpu.shares"), shares, 10); err != nil { + return err + } } // Expect Memory limit kube reserved cgroup to equal configured value `100Mi`. kubeReservedMemory := resource.MustParse(currentConfig.KubeReserved[string(v1.ResourceMemory)]) - if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["memory"], cgroupManager.Name(kubeReservedCgroupName), "memory.limit_in_bytes"), kubeReservedMemory.Value(), 0); err != nil { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["memory"], cgroupPath, memoryLimitFile), kubeReservedMemory.Value(), 0); err != nil { return err } // Expect process ID limit kube reserved cgroup to equal configured value `738`. kubeReservedPIDs := resource.MustParse(currentConfig.KubeReserved[string(pidlimit.PIDs)]) - if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["pids"], cgroupManager.Name(kubeReservedCgroupName), "pids.max"), kubeReservedPIDs.Value(), 0); err != nil { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["pids"], cgroupPath, "pids.max"), kubeReservedPIDs.Value(), 0); err != nil { return err } - systemReservedCgroupName := cm.NewCgroupName(cm.RootCgroupName, systemReservedCgroup) - if !cgroupManager.Exists(systemReservedCgroupName) { - return fmt.Errorf("Expected system reserved cgroup Does not exist") + + if currentConfig.CgroupDriver == "systemd" { + cgroupPath = cm.ParseSystemdToCgroupName(systemReservedCgroup).ToSystemd() + } else { + cgroupPath = cgroupManager.Name(cm.NewCgroupName(cm.RootCgroupName, systemReservedCgroup)) } + // Expect CPU shares on system reserved cgroup to equal it's reservation which is `100m`. systemReservedCPU := resource.MustParse(currentConfig.SystemReserved[string(v1.ResourceCPU)]) - if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], cgroupManager.Name(systemReservedCgroupName), "cpu.shares"), int64(cm.MilliCPUToShares(systemReservedCPU.MilliValue())), 10); err != nil { - return err + shares = int64(cm.MilliCPUToShares(systemReservedCPU.MilliValue())) + if IsCgroup2UnifiedMode() { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], cgroupPath, "cpu.weight"), convertSharesToWeight(shares), 10); err != nil { + return err + } + } else { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], cgroupPath, "cpu.shares"), shares, 10); err != nil { + return err + } } // Expect Memory limit on node allocatable cgroup to equal allocatable. 
systemReservedMemory := resource.MustParse(currentConfig.SystemReserved[string(v1.ResourceMemory)]) - if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["memory"], cgroupManager.Name(systemReservedCgroupName), "memory.limit_in_bytes"), systemReservedMemory.Value(), 0); err != nil { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["memory"], cgroupPath, memoryLimitFile), systemReservedMemory.Value(), 0); err != nil { return err } // Expect process ID limit system reserved cgroup to equal configured value `1000`. systemReservedPIDs := resource.MustParse(currentConfig.SystemReserved[string(pidlimit.PIDs)]) - if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["pids"], cgroupManager.Name(systemReservedCgroupName), "pids.max"), systemReservedPIDs.Value(), 0); err != nil { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["pids"], cgroupPath, "pids.max"), systemReservedPIDs.Value(), 0); err != nil { return err } return nil diff --git a/test/e2e_node/pids_test.go b/test/e2e_node/pids_test.go index 9d5d34ecb0f..0c3192aaefd 100644 --- a/test/e2e_node/pids_test.go +++ b/test/e2e_node/pids_test.go @@ -45,7 +45,13 @@ func makePodToVerifyPids(baseName string, pidsLimit resource.Quantity) *v1.Pod { } // this command takes the expected value and compares it against the actual value for the pod cgroup pids.max - command := fmt.Sprintf("expected=%v; actual=$(cat /tmp/pids/%v/pids.max); if [ \"$expected\" -ne \"$actual\" ]; then exit 1; fi; ", pidsLimit.Value(), cgroupFsName) + command := "" + if IsCgroup2UnifiedMode() { + command = fmt.Sprintf("expected=%v; actual=$(cat /tmp/%v/pids.max); if [ \"$expected\" -ne \"$actual\" ]; then exit 1; fi; ", pidsLimit.Value(), cgroupFsName) + } else { + command = fmt.Sprintf("expected=%v; actual=$(cat /tmp/pids/%v/pids.max); if [ \"$expected\" -ne \"$actual\" ]; then exit 1; fi; ", pidsLimit.Value(), cgroupFsName) + } + framework.Logf("Pod to run command: %v", command) pod := &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ diff --git a/test/e2e_node/pods_container_manager_test.go b/test/e2e_node/pods_container_manager_test.go index 4551c8a4cbe..540aa098db8 100644 --- a/test/e2e_node/pods_container_manager_test.go +++ b/test/e2e_node/pods_container_manager_test.go @@ -75,8 +75,14 @@ func makePodToVerifyCgroups(cgroupNames []string) *v1.Pod { klog.Infof("expecting %v cgroups to be found", cgroupFsNames) // build the pod command to either verify cgroups exist command := "" + for _, cgroupFsName := range cgroupFsNames { - localCommand := "if [ ! -d /tmp/memory/" + cgroupFsName + " ] || [ ! -d /tmp/cpu/" + cgroupFsName + " ]; then exit 1; fi; " + localCommand := "" + if IsCgroup2UnifiedMode() { + localCommand = "if [ ! -d /tmp/" + cgroupFsName + " ]; then exit 1; fi; " + } else { + localCommand = "if [ ! -d /tmp/memory/" + cgroupFsName + " ] || [ ! -d /tmp/cpu/" + cgroupFsName + " ]; then exit 1; fi; " + } command += localCommand } @@ -117,6 +123,14 @@ func makePodToVerifyCgroupRemoved(baseName string) *v1.Pod { components := strings.Split(baseName, "/") cgroupName := cm.NewCgroupName(cm.RootCgroupName, components...) cgroupFsName := toCgroupFsName(cgroupName) + + command := "" + if IsCgroup2UnifiedMode() { + command = "for i in `seq 1 10`; do if [ ! -d /tmp/" + cgroupFsName + " ]; then exit 0; else sleep 10; fi; done; exit 1" + } else { + command = "for i in `seq 1 10`; do if [ ! -d /tmp/memory/" + cgroupFsName + " ] && [ ! 
-d /tmp/cpu/" + cgroupFsName + " ]; then exit 0; else sleep 10; fi; done; exit 1" + } + pod := &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: "pod" + string(uuid.NewUUID()), @@ -127,7 +141,7 @@ func makePodToVerifyCgroupRemoved(baseName string) *v1.Pod { { Image: busyboxImage, Name: "container" + string(uuid.NewUUID()), - Command: []string{"sh", "-c", "for i in `seq 1 10`; do if [ ! -d /tmp/memory/" + cgroupFsName + " ] && [ ! -d /tmp/cpu/" + cgroupFsName + " ]; then exit 0; else sleep 10; fi; done; exit 1"}, + Command: []string{"sh", "-c", command}, VolumeMounts: []v1.VolumeMount{ { Name: "sysfscgroup", diff --git a/test/e2e_node/services/kubelet.go b/test/e2e_node/services/kubelet.go index 7ffd95d675f..a6e863f2a6b 100644 --- a/test/e2e_node/services/kubelet.go +++ b/test/e2e_node/services/kubelet.go @@ -194,6 +194,10 @@ func (e *E2EServices) startKubelet() (*server, error) { unitName := fmt.Sprintf("kubelet-%s.service", unitTimestamp) cmdArgs = append(cmdArgs, systemdRun, + "-p", "Delegate=true", + "-p", "CPUAccounting=true", + "-p", "MemoryAccounting=true", + "-p", "TasksAccounting=true", "--unit="+unitName, "--slice=runtime.slice", "--remain-after-exit", diff --git a/test/e2e_node/summary_test.go b/test/e2e_node/summary_test.go index acd6fe55375..c2f5d350635 100644 --- a/test/e2e_node/summary_test.go +++ b/test/e2e_node/summary_test.go @@ -435,7 +435,11 @@ func recordSystemCgroupProcesses() { continue } - pids, err := ioutil.ReadFile(fmt.Sprintf("/sys/fs/cgroup/cpu/%s/cgroup.procs", cgroup)) + filePattern := "/sys/fs/cgroup/cpu/%s/cgroup.procs" + if IsCgroup2UnifiedMode() { + filePattern = "/sys/fs/cgroup/%s/cgroup.procs" + } + pids, err := ioutil.ReadFile(fmt.Sprintf(filePattern, cgroup)) if err != nil { framework.Logf("Failed to read processes in cgroup %s: %v", name, err) continue diff --git a/test/e2e_node/utils_linux.go b/test/e2e_node/utils_linux.go new file mode 100644 index 00000000000..50f59401ba9 --- /dev/null +++ b/test/e2e_node/utils_linux.go @@ -0,0 +1,28 @@ +// +build linux + +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2enode + +import ( + libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" +) + +// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode. +func IsCgroup2UnifiedMode() bool { + return libcontainercgroups.IsCgroup2UnifiedMode() +} diff --git a/test/e2e_node/utils_unsupported.go b/test/e2e_node/utils_unsupported.go new file mode 100644 index 00000000000..69322913b75 --- /dev/null +++ b/test/e2e_node/utils_unsupported.go @@ -0,0 +1,24 @@ +// +build !linux + +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2enode + +// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode. +func IsCgroup2UnifiedMode() bool { + return false +} From 26d94ad628691a61de52b67a05902eff4cf1753d Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Thu, 9 Apr 2020 12:18:40 +0200 Subject: [PATCH 4/4] kubelet: do not configure the device cgroup Signed-off-by: Giuseppe Scrivano --- pkg/kubelet/cm/cgroup_manager_linux.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go index 8a53688dbee..a9177d94e75 100644 --- a/pkg/kubelet/cm/cgroup_manager_linux.go +++ b/pkg/kubelet/cm/cgroup_manager_linux.go @@ -500,6 +500,8 @@ func setResourcesV2(cgroupConfig *libcontainerconfigs.Cgroup) error { if err := propagateControllers(cgroupConfig.Path); err != nil { return err } + allowAll := true + cgroupConfig.Resources.AllowAllDevices = &allowAll manager, err := cgroupfs2.NewManager(cgroupConfig, cgroupConfig.Path, false) if err != nil {
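
A note on the detection used throughout the series: IsCgroup2UnifiedMode comes from libcontainer, and it boils down to checking whether /sys/fs/cgroup is itself a cgroup2 filesystem. A minimal standalone sketch of the same check (using golang.org/x/sys/unix; illustrative, not code from these patches):

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// isCgroup2UnifiedMode reports whether the host booted with the unified
// hierarchy, i.e. /sys/fs/cgroup is a pure cgroup v2 mount.
func isCgroup2UnifiedMode() bool {
	var st unix.Statfs_t
	if err := unix.Statfs("/sys/fs/cgroup", &st); err != nil {
		return false
	}
	return st.Type == unix.CGROUP2_SUPER_MAGIC
}

func main() {
	fmt.Println("cgroup v2 unified mode:", isCgroup2UnifiedMode())
}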
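
The controller propagation in patch 2 is the subtle part: on cgroup v2 a controller is only usable in a child cgroup if every ancestor lists it in its cgroup.subtree_control file, each controller name prefixed with '+'. The following self-contained sketch (helper names are illustrative, not the kubelet code) shows the same read-then-enable sequence for a single cgroup level:

package main

import (
	"fmt"
	"io/ioutil"
	"path/filepath"
	"strings"
)

// availableControllers reads cgroup.controllers, which lists what the
// kernel offers at this level, e.g. "cpuset cpu io memory hugetlb pids".
func availableControllers(cgroupPath string) ([]string, error) {
	data, err := ioutil.ReadFile(filepath.Join(cgroupPath, "cgroup.controllers"))
	if err != nil {
		return nil, err
	}
	return strings.Fields(string(data)), nil
}

// enableInChildren writes "+cpu +memory ..." to cgroup.subtree_control so
// that the controllers become available in this cgroup's children.
func enableInChildren(cgroupPath string, controllers []string) error {
	parts := make([]string, 0, len(controllers))
	for _, c := range controllers {
		parts = append(parts, "+"+c)
	}
	payload := strings.Join(parts, " ")
	return ioutil.WriteFile(filepath.Join(cgroupPath, "cgroup.subtree_control"), []byte(payload), 0644)
}

func main() {
	root := "/sys/fs/cgroup"
	ctrls, err := availableControllers(root)
	if err != nil {
		fmt.Println("cannot read controllers (cgroup v1 host?):", err)
		return
	}
	// Delegate everything the root offers; propagateControllers in the patch
	// instead filters down to the controllers the kubelet actually manages.
	if err := enableInChildren(root, ctrls); err != nil {
		fmt.Println("enable failed:", err)
	}
}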
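
Patch 3's convertSharesToWeight encodes the standard v1-to-v2 CPU mapping: cpu.shares lives in [2, 262144], cpu.weight in [1, 10000], hence weight = 1 + ((shares - 2) * 9999) / 262142. Together with the "cpu.max" syntax from patch 2 ("<quota> <period>", where a nil quota is written as the literal "max"), the whole CPU translation fits in a few lines. A sketch of both (the 100000us default period is the kernel's, assumed here rather than taken from the diff context):

package main

import (
	"fmt"
	"strconv"
)

// sharesToWeight converts cgroup v1 cpu.shares ([2, 262144]) to
// cgroup v2 cpu.weight ([1, 10000]); same formula as the patch's
// convertSharesToWeight helper. shares=2 maps to 1, 262144 to 10000.
func sharesToWeight(shares int64) int64 {
	return 1 + ((shares-2)*9999)/262142
}

// cpuMax renders the cgroup v2 "cpu.max" value as "<quota> <period>",
// with "max" standing in for an unlimited quota.
func cpuMax(quota *int64, period *uint64) string {
	quotaStr := "max"
	if quota != nil {
		quotaStr = strconv.FormatInt(*quota, 10)
	}
	periodStr := "100000" // kernel default period in microseconds
	if period != nil {
		periodStr = strconv.FormatUint(*period, 10)
	}
	return fmt.Sprintf("%s %s", quotaStr, periodStr)
}

func main() {
	fmt.Println(sharesToWeight(1024)) // default v1 shares -> weight 39
	quota := int64(20000)
	fmt.Println(cpuMax(&quota, nil)) // "20000 100000", i.e. a 200m CPU cap
}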