diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go
index 4d759f98a93..c7c7026da40 100644
--- a/pkg/kubelet/cm/cgroup_manager_linux.go
+++ b/pkg/kubelet/cm/cgroup_manager_linux.go
@@ -22,7 +22,6 @@ import (
 	"os"
 	"path"
 	"path/filepath"
-	"strconv"
 	"strings"
 	"sync"
 	"time"
@@ -146,20 +145,19 @@ func newLibcontainerAdapter(cgroupManagerType libcontainerCgroupManagerType) *li
 func (l *libcontainerAdapter) newManager(cgroups *libcontainerconfigs.Cgroup, paths map[string]string) (libcontainercgroups.Manager, error) {
 	switch l.cgroupManagerType {
 	case libcontainerCgroupfs:
-		return &cgroupfs.Manager{
-			Cgroups: cgroups,
-			Paths:   paths,
-		}, nil
+		if libcontainercgroups.IsCgroup2UnifiedMode() {
+			return cgroupfs2.NewManager(cgroups, paths["memory"], false)
+		}
+		return cgroupfs.NewManager(cgroups, paths, false), nil
 	case libcontainerSystemd:
 		// this means you asked systemd to manage cgroups, but systemd was not on the host, so all you can do is panic...
-		if !cgroupsystemd.UseSystemd() {
+		if !cgroupsystemd.IsRunningSystemd() {
 			panic("systemd cgroup manager not available")
 		}
-		f, err := cgroupsystemd.NewSystemdCgroupsManager()
-		if err != nil {
-			return nil, err
+		if libcontainercgroups.IsCgroup2UnifiedMode() {
+			return cgroupsystemd.NewUnifiedManager(cgroups, paths["memory"], false), nil
 		}
-		return f(cgroups, paths), nil
+		return cgroupsystemd.NewLegacyManager(cgroups, paths), nil
 	}
 	return nil, fmt.Errorf("invalid cgroup manager configuration")
 }
@@ -320,7 +318,11 @@ func (m *cgroupManagerImpl) Destroy(cgroupConfig *CgroupConfig) error {
 	if m.adapter.cgroupManagerType == libcontainerSystemd {
 		updateSystemdCgroupInfo(libcontainerCgroupConfig, cgroupConfig.Name)
 	} else {
-		libcontainerCgroupConfig.Path = cgroupConfig.Name.ToCgroupfs()
+		if libcontainercgroups.IsCgroup2UnifiedMode() {
+			libcontainerCgroupConfig.Path = m.buildCgroupUnifiedPath(cgroupConfig.Name)
+		} else {
+			libcontainerCgroupConfig.Path = cgroupConfig.Name.ToCgroupfs()
+		}
 	}
 
 	manager, err := m.adapter.newManager(libcontainerCgroupConfig, cgroupPaths)
@@ -397,19 +399,6 @@ func getCpuWeight(cpuShares *uint64) uint64 {
 	return 1 + ((*cpuShares-2)*9999)/262142
 }
 
-// getCpuMax returns the cgroup v2 cpu.max setting given the cpu quota and the cpu period
-func getCpuMax(cpuQuota *int64, cpuPeriod *uint64) string {
-	quotaStr := "max"
-	periodStr := "100000"
-	if cpuQuota != nil {
-		quotaStr = strconv.FormatInt(*cpuQuota, 10)
-	}
-	if cpuPeriod != nil {
-		periodStr = strconv.FormatUint(*cpuPeriod, 10)
-	}
-	return fmt.Sprintf("%s %s", quotaStr, periodStr)
-}
-
 // readUnifiedControllers reads the controllers available at the specified cgroup
 func readUnifiedControllers(path string) (sets.String, error) {
 	controllersFileContent, err := ioutil.ReadFile(filepath.Join(path, "cgroup.controllers"))
@@ -497,8 +486,15 @@ func setResourcesV2(cgroupConfig *libcontainerconfigs.Cgroup) error {
 	if err := propagateControllers(cgroupConfig.Path); err != nil {
 		return err
 	}
-	allowAll := true
-	cgroupConfig.Resources.AllowAllDevices = &allowAll
+	cgroupConfig.Resources.Devices = []*libcontainerconfigs.DeviceRule{
+		{
+			Type:        'a',
+			Permissions: "rwm",
+			Allow:       true,
+			Minor:       libcontainerconfigs.Wildcard,
+			Major:       libcontainerconfigs.Wildcard,
+		},
+	}
 
 	manager, err := cgroupfs2.NewManager(cgroupConfig, cgroupConfig.Path, false)
 	if err != nil {
@@ -511,26 +507,35 @@ func setResourcesV2(cgroupConfig *libcontainerconfigs.Cgroup) error {
 }
 
 func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcontainerconfigs.Resources {
-	resources := &libcontainerconfigs.Resources{}
+	resources := &libcontainerconfigs.Resources{
+		Devices: []*libcontainerconfigs.DeviceRule{
+			{
+				Type:        'a',
+				Permissions: "rwm",
+				Allow:       true,
+				Minor:       libcontainerconfigs.Wildcard,
+				Major:       libcontainerconfigs.Wildcard,
+			},
+		},
+	}
 	if resourceConfig == nil {
 		return resources
 	}
 	if resourceConfig.Memory != nil {
 		resources.Memory = *resourceConfig.Memory
 	}
-	if libcontainercgroups.IsCgroup2UnifiedMode() {
-		resources.CpuWeight = getCpuWeight(resourceConfig.CpuShares)
-		resources.CpuMax = getCpuMax(resourceConfig.CpuQuota, resourceConfig.CpuPeriod)
-	} else {
-		if resourceConfig.CpuShares != nil {
+	if resourceConfig.CpuShares != nil {
+		if libcontainercgroups.IsCgroup2UnifiedMode() {
+			resources.CpuWeight = getCpuWeight(resourceConfig.CpuShares)
+		} else {
 			resources.CpuShares = *resourceConfig.CpuShares
 		}
-		if resourceConfig.CpuQuota != nil {
-			resources.CpuQuota = *resourceConfig.CpuQuota
-		}
-		if resourceConfig.CpuPeriod != nil {
-			resources.CpuPeriod = *resourceConfig.CpuPeriod
-		}
+	}
+	if resourceConfig.CpuQuota != nil {
+		resources.CpuQuota = *resourceConfig.CpuQuota
+	}
+	if resourceConfig.CpuPeriod != nil {
+		resources.CpuPeriod = *resourceConfig.CpuPeriod
 	}
 	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) || utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportNodePidsLimit) {
 		if resourceConfig.PidsLimit != nil {
@@ -591,8 +596,6 @@ func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error {
 	// depending on the cgroup driver in use, so we need this conditional here.
 	if m.adapter.cgroupManagerType == libcontainerSystemd {
 		updateSystemdCgroupInfo(libcontainerCgroupConfig, cgroupConfig.Name)
-	} else {
-		libcontainerCgroupConfig.Path = cgroupConfig.Name.ToCgroupfs()
 	}
 
 	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && cgroupConfig.ResourceParameters != nil && cgroupConfig.ResourceParameters.PidsLimit != nil {
@@ -628,7 +631,11 @@ func (m *cgroupManagerImpl) Create(cgroupConfig *CgroupConfig) error {
 	if m.adapter.cgroupManagerType == libcontainerSystemd {
 		updateSystemdCgroupInfo(libcontainerCgroupConfig, cgroupConfig.Name)
 	} else {
-		libcontainerCgroupConfig.Path = cgroupConfig.Name.ToCgroupfs()
+		if libcontainercgroups.IsCgroup2UnifiedMode() {
+			libcontainerCgroupConfig.Path = m.buildCgroupUnifiedPath(cgroupConfig.Name)
+		} else {
+			libcontainerCgroupConfig.Path = cgroupConfig.Name.ToCgroupfs()
+		}
 	}
 
 	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && cgroupConfig.ResourceParameters != nil && cgroupConfig.ResourceParameters.PidsLimit != nil {
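Note (illustrative sketch, not part of the patch): on cgroup v2 hosts the kubelet stops writing cpu.shares and instead sets cpu.weight via getCpuWeight, which maps the v1 cpu.shares range [2, 262144] linearly onto the v2 cpu.weight range [1, 10000]. The self-contained program below mirrors that mapping; the nil and saturation cases are the ones pinned down by TestGetCpuWeight in the test diff further down, and the clamp is an assumption inferred from those tests:

package main

import "fmt"

// cpuWeight mirrors getCpuWeight from the patch: nil means "unset" (0),
// values at or above 262144 saturate at 10000 (behavior implied by the
// tests), and everything in between is mapped linearly from [2, 262144]
// onto [1, 10000].
func cpuWeight(cpuShares *uint64) uint64 {
	if cpuShares == nil {
		return 0
	}
	if *cpuShares >= 262144 {
		return 10000
	}
	return 1 + ((*cpuShares-2)*9999)/262142
}

func main() {
	for _, shares := range []uint64{2, 1024, 262144} {
		fmt.Printf("shares=%d -> weight=%d\n", shares, cpuWeight(&shares))
	}
	// shares=2 -> weight=1
	// shares=1024 -> weight=39
	// shares=262144 -> weight=10000
}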
diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go
index a56bf8dc83d..3bc8e5f45a5 100644
--- a/pkg/kubelet/cm/container_manager_linux.go
+++ b/pkg/kubelet/cm/container_manager_linux.go
@@ -30,7 +30,8 @@ import (
 	"time"
 
 	"github.com/opencontainers/runc/libcontainer/cgroups"
-	"github.com/opencontainers/runc/libcontainer/cgroups/fs"
+	cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
+	cgroupfs2 "github.com/opencontainers/runc/libcontainer/cgroups/fs2"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"k8s.io/klog/v2"
 	utilio "k8s.io/utils/io"
@@ -91,17 +92,21 @@ type systemContainer struct {
 
 	// Function that ensures the state of the container.
 	// m is the cgroup manager for the specified container.
-	ensureStateFunc func(m *fs.Manager) error
+	ensureStateFunc func(m cgroups.Manager) error
 
 	// Manager for the cgroups of the external container.
-	manager *fs.Manager
+	manager cgroups.Manager
 }
 
-func newSystemCgroups(containerName string) *systemContainer {
+func newSystemCgroups(containerName string) (*systemContainer, error) {
+	manager, err := createManager(containerName)
+	if err != nil {
+		return nil, err
+	}
 	return &systemContainer{
 		name:    containerName,
-		manager: createManager(containerName),
-	}
+		manager: manager,
+	}, nil
 }
 
 type containerManagerImpl struct {
@@ -365,17 +370,28 @@ func (cm *containerManagerImpl) InternalContainerLifecycle() InternalContainerLi
 }
 
 // Create a cgroup container manager.
-func createManager(containerName string) *fs.Manager {
-	allowAllDevices := true
-	return &fs.Manager{
-		Cgroups: &configs.Cgroup{
-			Parent: "/",
-			Name:   containerName,
-			Resources: &configs.Resources{
-				AllowAllDevices: &allowAllDevices,
+func createManager(containerName string) (cgroups.Manager, error) {
+	cg := &configs.Cgroup{
+		Parent: "/",
+		Name:   containerName,
+		Resources: &configs.Resources{
+			Devices: []*configs.DeviceRule{
+				{
+					Type:        'a',
+					Permissions: "rwm",
+					Allow:       true,
+					Minor:       configs.Wildcard,
+					Major:       configs.Wildcard,
+				},
 			},
 		},
 	}
+
+	if cgroups.IsCgroup2UnifiedMode() {
+		return cgroupfs2.NewManager(cg, "", false)
+	}
+
+	return cgroupfs.NewManager(cg, nil, false), nil
 }
 
 type KernelTunableBehavior string
@@ -483,27 +499,28 @@ func (cm *containerManagerImpl) setupNode(activePods ActivePodsFunc) error {
 		if cm.SystemCgroupsName == "/" {
 			return fmt.Errorf("system container cannot be root (\"/\")")
 		}
-		cont := newSystemCgroups(cm.SystemCgroupsName)
-		cont.ensureStateFunc = func(manager *fs.Manager) error {
+		cont, err := newSystemCgroups(cm.SystemCgroupsName)
+		if err != nil {
+			return err
+		}
+		cont.ensureStateFunc = func(manager cgroups.Manager) error {
 			return ensureSystemCgroups("/", manager)
 		}
 		systemContainers = append(systemContainers, cont)
 	}
 
 	if cm.KubeletCgroupsName != "" {
-		cont := newSystemCgroups(cm.KubeletCgroupsName)
-		allowAllDevices := true
-		manager := fs.Manager{
-			Cgroups: &configs.Cgroup{
-				Parent: "/",
-				Name:   cm.KubeletCgroupsName,
-				Resources: &configs.Resources{
-					AllowAllDevices: &allowAllDevices,
-				},
-			},
+		cont, err := newSystemCgroups(cm.KubeletCgroupsName)
+		if err != nil {
+			return err
 		}
-		cont.ensureStateFunc = func(_ *fs.Manager) error {
-			return ensureProcessInContainerWithOOMScore(os.Getpid(), qos.KubeletOOMScoreAdj, &manager)
+
+		manager, err := createManager(cm.KubeletCgroupsName)
+		if err != nil {
+			return err
+		}
+		cont.ensureStateFunc = func(_ cgroups.Manager) error {
+			return ensureProcessInContainerWithOOMScore(os.Getpid(), qos.KubeletOOMScoreAdj, manager)
 		}
 		systemContainers = append(systemContainers, cont)
 	} else {
@@ -821,7 +838,7 @@ func getPidsForProcess(name, pidFile string) ([]int, error) {
 // Temporarily export the function to be used by dockershim.
 // TODO(yujuhong): Move this function to dockershim once kubelet migrates to
 // dockershim as the default.
-func EnsureDockerInContainer(dockerAPIVersion *utilversion.Version, oomScoreAdj int, manager *fs.Manager) error {
+func EnsureDockerInContainer(dockerAPIVersion *utilversion.Version, oomScoreAdj int, manager cgroups.Manager) error {
 	type process struct{ name, file string }
 	dockerProcs := []process{{dockerProcessName, dockerPidFile}}
 	if dockerAPIVersion.AtLeast(containerdAPIVersion) {
@@ -845,7 +862,7 @@ func EnsureDockerInContainer(dockerAPIVersion *utilversion.Version, oomScoreAdj
 	return utilerrors.NewAggregate(errs)
 }
 
-func ensureProcessInContainerWithOOMScore(pid int, oomScoreAdj int, manager *fs.Manager) error {
+func ensureProcessInContainerWithOOMScore(pid int, oomScoreAdj int, manager cgroups.Manager) error {
 	if runningInHost, err := isProcessRunningInHost(pid); err != nil {
 		// Err on the side of caution. Avoid moving the docker daemon unless we are able to identify its context.
 		return err
@@ -862,10 +879,18 @@ func ensureProcessInContainerWithOOMScore(pid int, oomScoreAdj int, manager *fs.
 		errs = append(errs, fmt.Errorf("failed to find container of PID %d: %v", pid, err))
 	}
 
-	if cont != manager.Cgroups.Name {
+	name := ""
+	cgroups, err := manager.GetCgroups()
+	if err != nil {
+		errs = append(errs, fmt.Errorf("failed to get cgroups for %d: %v", pid, err))
+	} else {
+		name = cgroups.Name
+	}
+
+	if cont != name {
 		err = manager.Apply(pid)
 		if err != nil {
-			errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q: %v", pid, cont, manager.Cgroups.Name, err))
+			errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q: %v", pid, cont, name, err))
 		}
 	}
 
@@ -890,7 +915,10 @@ func getContainer(pid int) (string, error) {
 	}
 
 	if cgroups.IsCgroup2UnifiedMode() {
-		c, _ := cgs[""]
+		c, found := cgs[""]
+		if !found {
+			return "", cgroups.NewNotFoundError("unified")
+		}
 		return c, nil
 	}
 
@@ -936,7 +964,7 @@ func getContainer(pid int) (string, error) {
 // The reason of leaving kernel threads at root cgroup is that we don't want to tie the
 // execution of these threads with to-be defined /system quota and create priority inversions.
 //
-func ensureSystemCgroups(rootCgroupPath string, manager *fs.Manager) error {
+func ensureSystemCgroups(rootCgroupPath string, manager cgroups.Manager) error {
 	// Move non-kernel PIDs to the system container.
 	// Only keep errors on latest attempt.
 	var finalErr error
@@ -967,7 +995,13 @@ func ensureSystemCgroups(rootCgroupPath string, manager *fs.Manager) error {
 		for _, pid := range pids {
 			err := manager.Apply(pid)
 			if err != nil {
-				finalErr = fmt.Errorf("failed to move PID %d into the system container %q: %v", pid, manager.Cgroups.Name, err)
+				name := ""
+				cgroups, getErr := manager.GetCgroups()
+				if getErr == nil {
+					name = cgroups.Name
+				}
+
+				finalErr = fmt.Errorf("failed to move PID %d into the system container %q: %v", pid, name, err)
 			}
 		}
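Note (illustrative sketch, not part of the patch): the driver behind most hunks in this file is that the concrete *fs.Manager is gone; through the runc cgroups.Manager interface the cgroup name is no longer reachable as manager.Cgroups.Name and has to be fetched via GetCgroups(), which can fail. The patch inlines that error handling at each call site; a hypothetical helper capturing the pattern would be:

package cm

import "github.com/opencontainers/runc/libcontainer/cgroups"

// managerName returns the configured cgroup name of a cgroups.Manager, or
// "" when the configuration cannot be read. GetCgroups returns an error
// because the interface hides whether a v1 or v2 backend sits behind it.
func managerName(m cgroups.Manager) string {
	cg, err := m.GetCgroups()
	if err != nil {
		return ""
	}
	return cg.Name
}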
diff --git a/pkg/kubelet/cm/container_manager_linux_test.go b/pkg/kubelet/cm/container_manager_linux_test.go
index a8a40a52f6a..2a7115f2764 100644
--- a/pkg/kubelet/cm/container_manager_linux_test.go
+++ b/pkg/kubelet/cm/container_manager_linux_test.go
@@ -60,7 +60,11 @@ func fakeContainerMgrMountInt() mount.Interface {
 func TestCgroupMountValidationSuccess(t *testing.T) {
 	f, err := validateSystemRequirements(fakeContainerMgrMountInt())
 	assert.Nil(t, err)
-	assert.False(t, f.cpuHardcapping, "cpu hardcapping is expected to be disabled")
+	if cgroups.IsCgroup2UnifiedMode() {
+		assert.True(t, f.cpuHardcapping, "cpu hardcapping is expected to be enabled")
+	} else {
+		assert.False(t, f.cpuHardcapping, "cpu hardcapping is expected to be disabled")
+	}
 }
 
 func TestCgroupMountValidationMemoryMissing(t *testing.T) {
@@ -115,6 +119,19 @@ func TestCgroupMountValidationMultipleSubsystem(t *testing.T) {
 	assert.Nil(t, err)
 }
 
+func TestGetCpuWeight(t *testing.T) {
+	assert.Equal(t, uint64(0), getCpuWeight(nil))
+
+	v := uint64(2)
+	assert.Equal(t, uint64(1), getCpuWeight(&v))
+
+	v = uint64(262144)
+	assert.Equal(t, uint64(10000), getCpuWeight(&v))
+
+	v = uint64(1000000000)
+	assert.Equal(t, uint64(10000), getCpuWeight(&v))
+}
+
 func TestSoftRequirementsValidationSuccess(t *testing.T) {
 	if cgroups.IsCgroup2UnifiedMode() {
 		t.Skip("skipping cgroup v1 test on a cgroup v2 system")
 	}
@@ -148,28 +165,3 @@ func TestSoftRequirementsValidationSuccess(t *testing.T) {
 	assert.NoError(t, err)
 	assert.True(t, f.cpuHardcapping, "cpu hardcapping is expected to be enabled")
 }
-
-func TestGetCpuWeight(t *testing.T) {
-	assert.Equal(t, uint64(0), getCpuWeight(nil))
-
-	v := uint64(2)
-	assert.Equal(t, uint64(1), getCpuWeight(&v))
-
-	v = uint64(262144)
-	assert.Equal(t, uint64(10000), getCpuWeight(&v))
-
-	v = uint64(1000000000)
-	assert.Equal(t, uint64(10000), getCpuWeight(&v))
-}
-
-func TestGetCpuMax(t *testing.T) {
-	assert.Equal(t, getCpuMax(nil, nil), "max 100000")
-
-	quota := int64(50000)
-	period := uint64(200000)
-	assert.Equal(t, "50000 200000", getCpuMax(&quota, &period))
-
-	assert.Equal(t, "max 200000", getCpuMax(nil, &period))
-
-	assert.Equal(t, "50000 100000", getCpuMax(&quota, nil))
-}
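Note, not part of the patch: TestGetCpuMax is deleted rather than moved because getCpuMax itself is gone from cgroup_manager_linux.go; toResources now passes CpuQuota and CpuPeriod through unchanged and runc's fs2 manager renders the cgroup v2 cpu.max file. For reference, this sketch reproduces the deleted helper, whose output format is "<quota> <period>" with "max" meaning an unlimited quota and 100000µs as the default period:

package main

import (
	"fmt"
	"strconv"
)

// cpuMax renders a cgroup v2 cpu.max value the way the deleted getCpuMax
// helper did: an unset quota becomes "max" (unlimited) and an unset period
// defaults to 100000 microseconds.
func cpuMax(cpuQuota *int64, cpuPeriod *uint64) string {
	quotaStr := "max"
	periodStr := "100000"
	if cpuQuota != nil {
		quotaStr = strconv.FormatInt(*cpuQuota, 10)
	}
	if cpuPeriod != nil {
		periodStr = strconv.FormatUint(*cpuPeriod, 10)
	}
	return fmt.Sprintf("%s %s", quotaStr, periodStr)
}

func main() {
	quota, period := int64(50000), uint64(200000)
	fmt.Println(cpuMax(&quota, &period)) // 50000 200000
	fmt.Println(cpuMax(nil, nil))        // max 100000
}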
diff --git a/pkg/kubelet/dockershim/cm/BUILD b/pkg/kubelet/dockershim/cm/BUILD
index 71c2b8f29cf..3392465c75f 100644
--- a/pkg/kubelet/dockershim/cm/BUILD
+++ b/pkg/kubelet/dockershim/cm/BUILD
@@ -20,6 +20,7 @@ go_library(
         "//pkg/kubelet/dockershim/libdocker:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/util/version:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
+        "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
         "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs:go_default_library",
         "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
         "//vendor/k8s.io/klog/v2:go_default_library",
@@ -41,6 +42,7 @@ go_library(
         "//pkg/kubelet/dockershim/libdocker:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/util/version:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
+        "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
         "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs:go_default_library",
         "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
         "//vendor/k8s.io/klog/v2:go_default_library",
diff --git a/pkg/kubelet/dockershim/cm/container_manager_linux.go b/pkg/kubelet/dockershim/cm/container_manager_linux.go
index e133134b0c0..2599517716d 100644
--- a/pkg/kubelet/dockershim/cm/container_manager_linux.go
+++ b/pkg/kubelet/dockershim/cm/container_manager_linux.go
@@ -25,7 +25,8 @@ import (
 	"strconv"
 	"time"
 
-	"github.com/opencontainers/runc/libcontainer/cgroups/fs"
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	utilversion "k8s.io/apimachinery/pkg/util/version"
 	"k8s.io/apimachinery/pkg/util/wait"
@@ -68,7 +69,7 @@ type containerManager struct {
 	// Name of the cgroups.
 	cgroupsName string
 	// Manager for the cgroups.
-	cgroupsManager *fs.Manager
+	cgroupsManager cgroups.Manager
 }
 
 func (m *containerManager) Start() error {
@@ -103,7 +104,7 @@ func (m *containerManager) doWork() {
 	}
 }
 
-func createCgroupManager(name string) (*fs.Manager, error) {
+func createCgroupManager(name string) (cgroups.Manager, error) {
 	var memoryLimit uint64
 
 	memoryCapacity, err := getMemoryCapacity()
@@ -118,19 +119,24 @@ func createCgroupManager(name string) (*fs.Manager, error) {
 	}
 
 	klog.V(2).Infof("Configure resource-only container %q with memory limit: %d", name, memoryLimit)
-	allowAllDevices := true
-	cm := &fs.Manager{
-		Cgroups: &configs.Cgroup{
-			Parent: "/",
-			Name:   name,
-			Resources: &configs.Resources{
-				Memory:          int64(memoryLimit),
-				MemorySwap:      -1,
-				AllowAllDevices: &allowAllDevices,
+	cg := &configs.Cgroup{
+		Parent: "/",
+		Name:   name,
+		Resources: &configs.Resources{
+			Memory:     int64(memoryLimit),
+			MemorySwap: -1,
+			Devices: []*configs.DeviceRule{
+				{
+					Minor:       configs.Wildcard,
+					Major:       configs.Wildcard,
+					Type:        'a',
+					Permissions: "rwm",
+					Allow:       true,
+				},
 			},
 		},
 	}
-	return cm, nil
+	return cgroupfs.NewManager(cg, nil, false), nil
 }
 
 // getMemoryCapacity returns the memory capacity on the machine in bytes.
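Note (hypothetical refactor, not part of the patch): the same allow-all device rule now appears verbatim in setResourcesV2, toResources, the kubelet's createManager, and the dockershim manager above, because runc replaced the Resources.AllowAllDevices flag with explicit device rules. A shared helper, if one were wanted, could look like this:

package cm

import "github.com/opencontainers/runc/libcontainer/configs"

// allowAllDevices builds the rule that stands in for runc's removed
// AllowAllDevices flag: type 'a' matches all device types, the wildcard
// major/minor match every device node, and "rwm" grants read, write,
// and mknod.
func allowAllDevices() []*configs.DeviceRule {
	return []*configs.DeviceRule{{
		Type:        'a',
		Permissions: "rwm",
		Allow:       true,
		Major:       configs.Wildcard,
		Minor:       configs.Wildcard,
	}}
}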
"//staging/src/k8s.io/apimachinery/pkg/util/version:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library", + "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library", "//vendor/k8s.io/klog/v2:go_default_library", diff --git a/pkg/kubelet/dockershim/cm/container_manager_linux.go b/pkg/kubelet/dockershim/cm/container_manager_linux.go index e133134b0c0..2599517716d 100644 --- a/pkg/kubelet/dockershim/cm/container_manager_linux.go +++ b/pkg/kubelet/dockershim/cm/container_manager_linux.go @@ -25,7 +25,8 @@ import ( "strconv" "time" - "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/cgroups" + cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs" "github.com/opencontainers/runc/libcontainer/configs" utilversion "k8s.io/apimachinery/pkg/util/version" "k8s.io/apimachinery/pkg/util/wait" @@ -68,7 +69,7 @@ type containerManager struct { // Name of the cgroups. cgroupsName string // Manager for the cgroups. - cgroupsManager *fs.Manager + cgroupsManager cgroups.Manager } func (m *containerManager) Start() error { @@ -103,7 +104,7 @@ func (m *containerManager) doWork() { } } -func createCgroupManager(name string) (*fs.Manager, error) { +func createCgroupManager(name string) (cgroups.Manager, error) { var memoryLimit uint64 memoryCapacity, err := getMemoryCapacity() @@ -118,19 +119,24 @@ func createCgroupManager(name string) (*fs.Manager, error) { } klog.V(2).Infof("Configure resource-only container %q with memory limit: %d", name, memoryLimit) - allowAllDevices := true - cm := &fs.Manager{ - Cgroups: &configs.Cgroup{ - Parent: "/", - Name: name, - Resources: &configs.Resources{ - Memory: int64(memoryLimit), - MemorySwap: -1, - AllowAllDevices: &allowAllDevices, + cg := &configs.Cgroup{ + Parent: "/", + Name: name, + Resources: &configs.Resources{ + Memory: int64(memoryLimit), + MemorySwap: -1, + Devices: []*configs.DeviceRule{ + { + Minor: configs.Wildcard, + Major: configs.Wildcard, + Type: 'a', + Permissions: "rwm", + Allow: true, + }, }, }, } - return cm, nil + return cgroupfs.NewManager(cg, nil, false), nil } // getMemoryCapacity returns the memory capacity on the machine in bytes. diff --git a/test/e2e_node/resource_collector.go b/test/e2e_node/resource_collector.go index 6145ce6a7e4..8ab4fdaddf1 100644 --- a/test/e2e_node/resource_collector.go +++ b/test/e2e_node/resource_collector.go @@ -518,6 +518,14 @@ func getContainer(pid int) (string, error) { return "", err } + if cgroups.IsCgroup2UnifiedMode() { + unified, found := cgs[""] + if !found { + return "", cgroups.NewNotFoundError("unified") + } + return unified, nil + } + cpu, found := cgs["cpu"] if !found { return "", cgroups.NewNotFoundError("cpu")