From 6d16fee22934829583184caca32672bf473d077c Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Fri, 3 Apr 2020 15:54:00 +0200 Subject: [PATCH 1/4] kubelet: cpu hard capping is supported on cgroup v2 Signed-off-by: Giuseppe Scrivano --- pkg/kubelet/cm/container_manager_linux.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go index d6af9835883..a95dbafd9f0 100644 --- a/pkg/kubelet/cm/container_manager_linux.go +++ b/pkg/kubelet/cm/container_manager_linux.go @@ -162,6 +162,7 @@ func validateSystemRequirements(mountUtil mount.Interface) (features, error) { } if cgroups.IsCgroup2UnifiedMode() { + f.cpuHardcapping = true return f, nil } From a9772b2290b493aa0b5f6311c692243785c6b347 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Fri, 3 Apr 2020 15:53:39 +0200 Subject: [PATCH 2/4] kubelet: adapt cgroup_manager to cgroup v2 Signed-off-by: Giuseppe Scrivano --- pkg/kubelet/cm/BUILD | 2 + pkg/kubelet/cm/cgroup_manager_linux.go | 171 +++++++++++++++++++++++-- 2 files changed, 162 insertions(+), 11 deletions(-) diff --git a/pkg/kubelet/cm/BUILD b/pkg/kubelet/cm/BUILD index 1ad243e0b3a..9e67fa9019a 100644 --- a/pkg/kubelet/cm/BUILD +++ b/pkg/kubelet/cm/BUILD @@ -70,6 +70,7 @@ go_library( "//vendor/github.com/docker/go-units:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs:go_default_library", + "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library", "//vendor/k8s.io/utils/io:go_default_library", @@ -121,6 +122,7 @@ go_library( "//vendor/github.com/docker/go-units:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs:go_default_library", + "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library", "//vendor/k8s.io/utils/io:go_default_library", diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go index 4d8b26e5eb0..8a53688dbee 100644 --- a/pkg/kubelet/cm/cgroup_manager_linux.go +++ b/pkg/kubelet/cm/cgroup_manager_linux.go @@ -18,15 +18,18 @@ package cm import ( "fmt" + "io/ioutil" "os" "path" "path/filepath" "strconv" "strings" + "sync" "time" libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs" + cgroupfs2 "github.com/opencontainers/runc/libcontainer/cgroups/fs2" cgroupsystemd "github.com/opencontainers/runc/libcontainer/cgroups/systemd" libcontainerconfigs "github.com/opencontainers/runc/libcontainer/configs" "k8s.io/klog" @@ -36,6 +39,7 @@ import ( "k8s.io/apimachinery/pkg/util/sets" utilfeature "k8s.io/apiserver/pkg/util/feature" kubefeatures "k8s.io/kubernetes/pkg/features" + cmutil "k8s.io/kubernetes/pkg/kubelet/cm/util" "k8s.io/kubernetes/pkg/kubelet/metrics" ) @@ -228,6 +232,12 @@ func (m *cgroupManagerImpl) buildCgroupPaths(name CgroupName) map[string]string return cgroupPaths } +// buildCgroupUnifiedPath builds a path to the specified name. 
+func (m *cgroupManagerImpl) buildCgroupUnifiedPath(name CgroupName) string { + cgroupFsAdaptedName := m.Name(name) + return path.Join(cmutil.CgroupRoot, cgroupFsAdaptedName) +} + // TODO(filbranden): This logic belongs in libcontainer/cgroup/systemd instead. // It should take a libcontainerconfigs.Cgroup.Path field (rather than Name and Parent) // and split it appropriately, using essentially the logic below. @@ -246,6 +256,21 @@ func updateSystemdCgroupInfo(cgroupConfig *libcontainerconfigs.Cgroup, cgroupNam // Exists checks if all subsystem cgroups already exist func (m *cgroupManagerImpl) Exists(name CgroupName) bool { + if libcontainercgroups.IsCgroup2UnifiedMode() { + cgroupPath := m.buildCgroupUnifiedPath(name) + neededControllers := getSupportedUnifiedControllers() + enabledControllers, err := readUnifiedControllers(cgroupPath) + if err != nil { + return false + } + difference := neededControllers.Difference(enabledControllers) + if difference.Len() > 0 { + klog.V(4).Infof("The Cgroup %v has some missing controllers: %v", name, difference) + return false + } + return true + } + // Get map of all cgroup paths on the system for the particular cgroup cgroupPaths := m.buildCgroupPaths(name) @@ -338,7 +363,7 @@ func getSupportedSubsystems() map[subsystem]bool { return supportedSubsystems } -// setSupportedSubsystems sets cgroup resource limits only on the supported +// setSupportedSubsystemsV1 sets cgroup resource limits on cgroup v1 only on the supported // subsystems. ie. cpu and memory. We don't use libcontainer's cgroup/fs/Set() // method as it doesn't allow us to skip updates on the devices cgroup // Allowing or denying all devices by writing 'a' to devices.allow or devices.deny is @@ -347,7 +372,7 @@ func getSupportedSubsystems() map[subsystem]bool { // We would like to skip setting any values on the device cgroup in this case // but this is not possible with libcontainers Set() method // See https://github.com/opencontainers/runc/issues/932 -func setSupportedSubsystems(cgroupConfig *libcontainerconfigs.Cgroup) error { +func setSupportedSubsystemsV1(cgroupConfig *libcontainerconfigs.Cgroup) error { for sys, required := range getSupportedSubsystems() { if _, ok := cgroupConfig.Paths[sys.Name()]; !ok { if required { @@ -388,6 +413,104 @@ func getCpuMax(cpuQuota *int64, cpuPeriod *uint64) string { return fmt.Sprintf("%s %s", quotaStr, periodStr) } +// readUnifiedControllers reads the controllers available at the specified cgroup +func readUnifiedControllers(path string) (sets.String, error) { + controllersFileContent, err := ioutil.ReadFile(filepath.Join(path, "cgroup.controllers")) + if err != nil { + return nil, err + } + controllers := strings.Fields(string(controllersFileContent)) + return sets.NewString(controllers...), nil +} + +var ( + availableRootControllersOnce sync.Once + availableRootControllers sets.String +) + +// getSupportedUnifiedControllers returns a set of supported controllers when running on cgroup v2 +func getSupportedUnifiedControllers() sets.String { + // This is the set of controllers used by the Kubelet + supportedControllers := sets.NewString("cpu", "cpuset", "memory", "hugetlb") + if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) || utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportNodePidsLimit) { + supportedControllers.Insert("pids") + } + // Memoize the set of controllers that are present in the root cgroup + availableRootControllersOnce.Do(func() { + var err error + availableRootControllers, err = 
readUnifiedControllers(cmutil.CgroupRoot)
+ if err != nil {
+ panic(fmt.Errorf("cannot read cgroup controllers at %s", cmutil.CgroupRoot))
+ }
+ })
+ // Return the set of controllers that are supported both by the Kubelet and by the kernel
+ return supportedControllers.Intersection(availableRootControllers)
+}
+
+// propagateControllers on a unified hierarchy enables all the supported controllers for the specified cgroup
+func propagateControllers(path string) error {
+ if err := os.MkdirAll(path, 0755); err != nil {
+ return fmt.Errorf("failed to create cgroup %q: %v", path, err)
+ }
+
+ // Retrieve all the supported controllers from the cgroup root
+ controllersFileContent, err := ioutil.ReadFile(filepath.Join(cmutil.CgroupRoot, "cgroup.controllers"))
+ if err != nil {
+ return fmt.Errorf("failed to read controllers from %q: %v", cmutil.CgroupRoot, err)
+ }
+
+ supportedControllers := getSupportedUnifiedControllers()
+
+ // The retrieved content looks like: "cpuset cpu io memory hugetlb pids". Prepend each of the controllers
+ // with '+', so we have something like "+cpuset +cpu +io +memory +hugetlb +pids"
+ controllers := ""
+ for _, controller := range strings.Fields(string(controllersFileContent)) {
+ // ignore controllers we don't care about
+ if !supportedControllers.Has(controller) {
+ continue
+ }
+
+ sep := " +"
+ if controllers == "" {
+ sep = "+"
+ }
+ controllers = controllers + sep + controller
+ }
+
+ current := cmutil.CgroupRoot
+ relPath, err := filepath.Rel(cmutil.CgroupRoot, path)
+ if err != nil {
+ return fmt.Errorf("failed to get relative path to cgroup root from %q: %v", path, err)
+ }
+ // Write the controllers list to each "cgroup.subtree_control" file until it reaches the parent cgroup.
+ // For the /foo/bar/baz cgroup, controllers must be enabled sequentially in the files:
+ // - /sys/fs/cgroup/foo/cgroup.subtree_control
+ // - /sys/fs/cgroup/foo/bar/cgroup.subtree_control
+ for _, p := range strings.Split(filepath.Dir(relPath), "/") {
+ current = filepath.Join(current, p)
+ if err := ioutil.WriteFile(filepath.Join(current, "cgroup.subtree_control"), []byte(controllers), 0755); err != nil {
+ return fmt.Errorf("failed to enable controllers on %q: %v", current, err)
+ }
+ }
+ return nil
+}
+
+// setResourcesV2 sets cgroup resource limits on cgroup v2
+func setResourcesV2(cgroupConfig *libcontainerconfigs.Cgroup) error {
+ if err := propagateControllers(cgroupConfig.Path); err != nil {
+ return err
+ }
+
+ manager, err := cgroupfs2.NewManager(cgroupConfig, cgroupConfig.Path, false)
+ if err != nil {
+ return fmt.Errorf("failed to create cgroup v2 manager: %v", err)
+ }
+ config := &libcontainerconfigs.Config{
+ Cgroups: cgroupConfig,
+ }
+ return manager.Set(config)
+}
+
 func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcontainerconfigs.Resources {
 resources := &libcontainerconfigs.Resources{}
 if resourceConfig == nil {
@@ -454,12 +577,17 @@ func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error {
 resourceConfig := cgroupConfig.ResourceParameters
 resources := m.toResources(resourceConfig)
 
- cgroupPaths := m.buildCgroupPaths(cgroupConfig.Name)
-
 libcontainerCgroupConfig := &libcontainerconfigs.Cgroup{
 Resources: resources,
- Paths: cgroupPaths,
 }
+
+ unified := libcontainercgroups.IsCgroup2UnifiedMode()
+ if unified {
+ libcontainerCgroupConfig.Path = m.buildCgroupUnifiedPath(cgroupConfig.Name)
+ } else {
+ libcontainerCgroupConfig.Paths = m.buildCgroupPaths(cgroupConfig.Name)
+ }
+
 // libcontainer consumes a 
different field and expects a different syntax // depending on the cgroup driver in use, so we need this conditional here. if m.adapter.cgroupManagerType == libcontainerSystemd { @@ -472,8 +600,14 @@ func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error { libcontainerCgroupConfig.PidsLimit = *cgroupConfig.ResourceParameters.PidsLimit } - if err := setSupportedSubsystems(libcontainerCgroupConfig); err != nil { - return fmt.Errorf("failed to set supported cgroup subsystems for cgroup %v: %v", cgroupConfig.Name, err) + if unified { + if err := setResourcesV2(libcontainerCgroupConfig); err != nil { + return fmt.Errorf("failed to set resources for cgroup %v: %v", cgroupConfig.Name, err) + } + } else { + if err := setSupportedSubsystemsV1(libcontainerCgroupConfig); err != nil { + return fmt.Errorf("failed to set supported cgroup subsystems for cgroup %v: %v", cgroupConfig.Name, err) + } } return nil } @@ -619,10 +753,25 @@ func toResourceStats(stats *libcontainercgroups.Stats) *ResourceStats { // Get sets the ResourceParameters of the specified cgroup as read from the cgroup fs func (m *cgroupManagerImpl) GetResourceStats(name CgroupName) (*ResourceStats, error) { - cgroupPaths := m.buildCgroupPaths(name) - stats, err := getStatsSupportedSubsystems(cgroupPaths) - if err != nil { - return nil, fmt.Errorf("failed to get stats supported cgroup subsystems for cgroup %v: %v", name, err) + var err error + var stats *libcontainercgroups.Stats + if libcontainercgroups.IsCgroup2UnifiedMode() { + cgroupPath := m.buildCgroupUnifiedPath(name) + manager, err := cgroupfs2.NewManager(nil, cgroupPath, false) + if err != nil { + return nil, fmt.Errorf("failed to create cgroup v2 manager: %v", err) + } + + stats, err = manager.GetStats() + if err != nil { + return nil, fmt.Errorf("failed to get stats for cgroup %v: %v", name, err) + } + } else { + cgroupPaths := m.buildCgroupPaths(name) + stats, err = getStatsSupportedSubsystems(cgroupPaths) + if err != nil { + return nil, fmt.Errorf("failed to get stats supported cgroup subsystems for cgroup %v: %v", name, err) + } } return toResourceStats(stats), nil } From 43c56eb4038ee9f6b7275f5aefe3aba6fd5f181b Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Tue, 7 Apr 2020 13:09:45 +0200 Subject: [PATCH 3/4] e2e_node: adapt tests to cgroup v2 and fix node_container_manager_test to run with the systemd cgroup manager. 
Signed-off-by: Giuseppe Scrivano --- test/e2e_node/BUILD | 2 + test/e2e_node/hugepages_test.go | 9 ++- test/e2e_node/node_container_manager_test.go | 81 +++++++++++++++----- test/e2e_node/pids_test.go | 8 +- test/e2e_node/pods_container_manager_test.go | 18 ++++- test/e2e_node/services/kubelet.go | 4 + test/e2e_node/summary_test.go | 6 +- test/e2e_node/utils_linux.go | 28 +++++++ test/e2e_node/utils_unsupported.go | 24 ++++++ 9 files changed, 155 insertions(+), 25 deletions(-) create mode 100644 test/e2e_node/utils_linux.go create mode 100644 test/e2e_node/utils_unsupported.go diff --git a/test/e2e_node/BUILD b/test/e2e_node/BUILD index 7125bc64969..54f7109a99d 100644 --- a/test/e2e_node/BUILD +++ b/test/e2e_node/BUILD @@ -21,6 +21,8 @@ go_library( "util_sriov.go", "util_xfs_linux.go", "util_xfs_unsupported.go", + "utils_linux.go", + "utils_unsupported.go", ], importpath = "k8s.io/kubernetes/test/e2e_node", visibility = ["//visibility:public"], diff --git a/test/e2e_node/hugepages_test.go b/test/e2e_node/hugepages_test.go index c15d2a2a541..2f3a933774c 100644 --- a/test/e2e_node/hugepages_test.go +++ b/test/e2e_node/hugepages_test.go @@ -50,8 +50,13 @@ func makePodToVerifyHugePages(baseName string, hugePagesLimit resource.Quantity) cgroupFsName = cgroupName.ToCgroupfs() } - // this command takes the expected value and compares it against the actual value for the pod cgroup hugetlb.2MB.limit_in_bytes - command := fmt.Sprintf("expected=%v; actual=$(cat /tmp/hugetlb/%v/hugetlb.2MB.limit_in_bytes); if [ \"$expected\" -ne \"$actual\" ]; then exit 1; fi; ", hugePagesLimit.Value(), cgroupFsName) + command := "" + // this command takes the expected value and compares it against the actual value for the pod cgroup hugetlb.2MB. + if IsCgroup2UnifiedMode() { + command = fmt.Sprintf("expected=%v; actual=$(cat /tmp/%v/hugetlb.2MB.max); if [ \"$expected\" -ne \"$actual\" ]; then exit 1; fi; ", hugePagesLimit.Value(), cgroupFsName) + } else { + command = fmt.Sprintf("expected=%v; actual=$(cat /tmp/hugetlb/%v/hugetlb.2MB.limit_in_bytes); if [ \"$expected\" -ne \"$actual\" ]; then exit 1; fi; ", hugePagesLimit.Value(), cgroupFsName) + } framework.Logf("Pod to run command: %v", command) pod := &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ diff --git a/test/e2e_node/node_container_manager_test.go b/test/e2e_node/node_container_manager_test.go index 45cf79f73df..d128cecaee6 100644 --- a/test/e2e_node/node_container_manager_test.go +++ b/test/e2e_node/node_container_manager_test.go @@ -149,6 +149,11 @@ func destroyTemporaryCgroupsForReservation(cgroupManager cm.CgroupManager) error return cgroupManager.Destroy(cgroupConfig) } +// convertSharesToWeight converts from cgroup v1 cpu.shares to cgroup v2 cpu.weight +func convertSharesToWeight(shares int64) int64 { + return 1 + ((shares-2)*9999)/262142 +} + func runTest(f *framework.Framework) error { var oldCfg *kubeletconfig.KubeletConfiguration subsystems, err := cm.GetCgroupSubsystems() @@ -187,8 +192,14 @@ func runTest(f *framework.Framework) error { expectedNAPodCgroup := cm.ParseCgroupfsToCgroupName(currentConfig.CgroupRoot) expectedNAPodCgroup = cm.NewCgroupName(expectedNAPodCgroup, "kubepods") if !cgroupManager.Exists(expectedNAPodCgroup) { - return fmt.Errorf("Expected Node Allocatable Cgroup Does not exist") + return fmt.Errorf("Expected Node Allocatable Cgroup %q does not exist", expectedNAPodCgroup) } + + memoryLimitFile := "memory.limit_in_bytes" + if IsCgroup2UnifiedMode() { + memoryLimitFile = "memory.max" + } + // TODO: Update cgroupManager to expose a 
Status interface to get current Cgroup Settings. // The node may not have updated capacity and allocatable yet, so check that it happens eventually. gomega.Eventually(func() error { @@ -199,20 +210,33 @@ func runTest(f *framework.Framework) error { if len(nodeList.Items) != 1 { return fmt.Errorf("Unexpected number of node objects for node e2e. Expects only one node: %+v", nodeList) } + cgroupName := "kubepods" + if currentConfig.CgroupDriver == "systemd" { + cgroupName = "kubepods.slice" + } + node := nodeList.Items[0] capacity := node.Status.Capacity allocatableCPU, allocatableMemory, allocatablePIDs := getAllocatableLimits("200m", "200Mi", "1738", capacity) // Total Memory reservation is 200Mi excluding eviction thresholds. // Expect CPU shares on node allocatable cgroup to equal allocatable. - if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], "kubepods", "cpu.shares"), int64(cm.MilliCPUToShares(allocatableCPU.MilliValue())), 10); err != nil { - return err + shares := int64(cm.MilliCPUToShares(allocatableCPU.MilliValue())) + if IsCgroup2UnifiedMode() { + // convert to the cgroup v2 cpu.weight value + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], cgroupName, "cpu.weight"), convertSharesToWeight(shares), 10); err != nil { + return err + } + } else { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], cgroupName, "cpu.shares"), shares, 10); err != nil { + return err + } } // Expect Memory limit on node allocatable cgroup to equal allocatable. - if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["memory"], "kubepods", "memory.limit_in_bytes"), allocatableMemory.Value(), 0); err != nil { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["memory"], cgroupName, memoryLimitFile), allocatableMemory.Value(), 0); err != nil { return err } // Expect PID limit on node allocatable cgroup to equal allocatable. - if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["pids"], "kubepods", "pids.max"), allocatablePIDs.Value(), 0); err != nil { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["pids"], cgroupName, "pids.max"), allocatablePIDs.Value(), 0); err != nil { return err } @@ -235,42 +259,61 @@ func runTest(f *framework.Framework) error { return nil }, time.Minute, 5*time.Second).Should(gomega.BeNil()) - kubeReservedCgroupName := cm.NewCgroupName(cm.RootCgroupName, kubeReservedCgroup) - if !cgroupManager.Exists(kubeReservedCgroupName) { - return fmt.Errorf("Expected kube reserved cgroup Does not exist") + cgroupPath := "" + if currentConfig.CgroupDriver == "systemd" { + cgroupPath = cm.ParseSystemdToCgroupName(kubeReservedCgroup).ToSystemd() + } else { + cgroupPath = cgroupManager.Name(cm.NewCgroupName(cm.RootCgroupName, kubeReservedCgroup)) } // Expect CPU shares on kube reserved cgroup to equal it's reservation which is `100m`. 
kubeReservedCPU := resource.MustParse(currentConfig.KubeReserved[string(v1.ResourceCPU)]) - if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], cgroupManager.Name(kubeReservedCgroupName), "cpu.shares"), int64(cm.MilliCPUToShares(kubeReservedCPU.MilliValue())), 10); err != nil { - return err + shares := int64(cm.MilliCPUToShares(kubeReservedCPU.MilliValue())) + if IsCgroup2UnifiedMode() { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], cgroupPath, "cpu.weight"), convertSharesToWeight(shares), 10); err != nil { + return err + } + } else { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], cgroupPath, "cpu.shares"), shares, 10); err != nil { + return err + } } // Expect Memory limit kube reserved cgroup to equal configured value `100Mi`. kubeReservedMemory := resource.MustParse(currentConfig.KubeReserved[string(v1.ResourceMemory)]) - if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["memory"], cgroupManager.Name(kubeReservedCgroupName), "memory.limit_in_bytes"), kubeReservedMemory.Value(), 0); err != nil { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["memory"], cgroupPath, memoryLimitFile), kubeReservedMemory.Value(), 0); err != nil { return err } // Expect process ID limit kube reserved cgroup to equal configured value `738`. kubeReservedPIDs := resource.MustParse(currentConfig.KubeReserved[string(pidlimit.PIDs)]) - if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["pids"], cgroupManager.Name(kubeReservedCgroupName), "pids.max"), kubeReservedPIDs.Value(), 0); err != nil { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["pids"], cgroupPath, "pids.max"), kubeReservedPIDs.Value(), 0); err != nil { return err } - systemReservedCgroupName := cm.NewCgroupName(cm.RootCgroupName, systemReservedCgroup) - if !cgroupManager.Exists(systemReservedCgroupName) { - return fmt.Errorf("Expected system reserved cgroup Does not exist") + + if currentConfig.CgroupDriver == "systemd" { + cgroupPath = cm.ParseSystemdToCgroupName(systemReservedCgroup).ToSystemd() + } else { + cgroupPath = cgroupManager.Name(cm.NewCgroupName(cm.RootCgroupName, systemReservedCgroup)) } + // Expect CPU shares on system reserved cgroup to equal it's reservation which is `100m`. systemReservedCPU := resource.MustParse(currentConfig.SystemReserved[string(v1.ResourceCPU)]) - if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], cgroupManager.Name(systemReservedCgroupName), "cpu.shares"), int64(cm.MilliCPUToShares(systemReservedCPU.MilliValue())), 10); err != nil { - return err + shares = int64(cm.MilliCPUToShares(systemReservedCPU.MilliValue())) + if IsCgroup2UnifiedMode() { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], cgroupPath, "cpu.weight"), convertSharesToWeight(shares), 10); err != nil { + return err + } + } else { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], cgroupPath, "cpu.shares"), shares, 10); err != nil { + return err + } } // Expect Memory limit on node allocatable cgroup to equal allocatable. 
systemReservedMemory := resource.MustParse(currentConfig.SystemReserved[string(v1.ResourceMemory)]) - if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["memory"], cgroupManager.Name(systemReservedCgroupName), "memory.limit_in_bytes"), systemReservedMemory.Value(), 0); err != nil { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["memory"], cgroupPath, memoryLimitFile), systemReservedMemory.Value(), 0); err != nil { return err } // Expect process ID limit system reserved cgroup to equal configured value `1000`. systemReservedPIDs := resource.MustParse(currentConfig.SystemReserved[string(pidlimit.PIDs)]) - if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["pids"], cgroupManager.Name(systemReservedCgroupName), "pids.max"), systemReservedPIDs.Value(), 0); err != nil { + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["pids"], cgroupPath, "pids.max"), systemReservedPIDs.Value(), 0); err != nil { return err } return nil diff --git a/test/e2e_node/pids_test.go b/test/e2e_node/pids_test.go index 9d5d34ecb0f..0c3192aaefd 100644 --- a/test/e2e_node/pids_test.go +++ b/test/e2e_node/pids_test.go @@ -45,7 +45,13 @@ func makePodToVerifyPids(baseName string, pidsLimit resource.Quantity) *v1.Pod { } // this command takes the expected value and compares it against the actual value for the pod cgroup pids.max - command := fmt.Sprintf("expected=%v; actual=$(cat /tmp/pids/%v/pids.max); if [ \"$expected\" -ne \"$actual\" ]; then exit 1; fi; ", pidsLimit.Value(), cgroupFsName) + command := "" + if IsCgroup2UnifiedMode() { + command = fmt.Sprintf("expected=%v; actual=$(cat /tmp/%v/pids.max); if [ \"$expected\" -ne \"$actual\" ]; then exit 1; fi; ", pidsLimit.Value(), cgroupFsName) + } else { + command = fmt.Sprintf("expected=%v; actual=$(cat /tmp/pids/%v/pids.max); if [ \"$expected\" -ne \"$actual\" ]; then exit 1; fi; ", pidsLimit.Value(), cgroupFsName) + } + framework.Logf("Pod to run command: %v", command) pod := &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ diff --git a/test/e2e_node/pods_container_manager_test.go b/test/e2e_node/pods_container_manager_test.go index 4551c8a4cbe..540aa098db8 100644 --- a/test/e2e_node/pods_container_manager_test.go +++ b/test/e2e_node/pods_container_manager_test.go @@ -75,8 +75,14 @@ func makePodToVerifyCgroups(cgroupNames []string) *v1.Pod { klog.Infof("expecting %v cgroups to be found", cgroupFsNames) // build the pod command to either verify cgroups exist command := "" + for _, cgroupFsName := range cgroupFsNames { - localCommand := "if [ ! -d /tmp/memory/" + cgroupFsName + " ] || [ ! -d /tmp/cpu/" + cgroupFsName + " ]; then exit 1; fi; " + localCommand := "" + if IsCgroup2UnifiedMode() { + localCommand = "if [ ! -d /tmp/" + cgroupFsName + " ]; then exit 1; fi; " + } else { + localCommand = "if [ ! -d /tmp/memory/" + cgroupFsName + " ] || [ ! -d /tmp/cpu/" + cgroupFsName + " ]; then exit 1; fi; " + } command += localCommand } @@ -117,6 +123,14 @@ func makePodToVerifyCgroupRemoved(baseName string) *v1.Pod { components := strings.Split(baseName, "/") cgroupName := cm.NewCgroupName(cm.RootCgroupName, components...) cgroupFsName := toCgroupFsName(cgroupName) + + command := "" + if IsCgroup2UnifiedMode() { + command = "for i in `seq 1 10`; do if [ ! -d /tmp/" + cgroupFsName + " ]; then exit 0; else sleep 10; fi; done; exit 1" + } else { + command = "for i in `seq 1 10`; do if [ ! -d /tmp/memory/" + cgroupFsName + " ] && [ ! 
-d /tmp/cpu/" + cgroupFsName + " ]; then exit 0; else sleep 10; fi; done; exit 1" + } + pod := &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: "pod" + string(uuid.NewUUID()), @@ -127,7 +141,7 @@ func makePodToVerifyCgroupRemoved(baseName string) *v1.Pod { { Image: busyboxImage, Name: "container" + string(uuid.NewUUID()), - Command: []string{"sh", "-c", "for i in `seq 1 10`; do if [ ! -d /tmp/memory/" + cgroupFsName + " ] && [ ! -d /tmp/cpu/" + cgroupFsName + " ]; then exit 0; else sleep 10; fi; done; exit 1"}, + Command: []string{"sh", "-c", command}, VolumeMounts: []v1.VolumeMount{ { Name: "sysfscgroup", diff --git a/test/e2e_node/services/kubelet.go b/test/e2e_node/services/kubelet.go index 7ffd95d675f..a6e863f2a6b 100644 --- a/test/e2e_node/services/kubelet.go +++ b/test/e2e_node/services/kubelet.go @@ -194,6 +194,10 @@ func (e *E2EServices) startKubelet() (*server, error) { unitName := fmt.Sprintf("kubelet-%s.service", unitTimestamp) cmdArgs = append(cmdArgs, systemdRun, + "-p", "Delegate=true", + "-p", "CPUAccounting=true", + "-p", "MemoryAccounting=true", + "-p", "TasksAccounting=true", "--unit="+unitName, "--slice=runtime.slice", "--remain-after-exit", diff --git a/test/e2e_node/summary_test.go b/test/e2e_node/summary_test.go index acd6fe55375..c2f5d350635 100644 --- a/test/e2e_node/summary_test.go +++ b/test/e2e_node/summary_test.go @@ -435,7 +435,11 @@ func recordSystemCgroupProcesses() { continue } - pids, err := ioutil.ReadFile(fmt.Sprintf("/sys/fs/cgroup/cpu/%s/cgroup.procs", cgroup)) + filePattern := "/sys/fs/cgroup/cpu/%s/cgroup.procs" + if IsCgroup2UnifiedMode() { + filePattern = "/sys/fs/cgroup/%s/cgroup.procs" + } + pids, err := ioutil.ReadFile(fmt.Sprintf(filePattern, cgroup)) if err != nil { framework.Logf("Failed to read processes in cgroup %s: %v", name, err) continue diff --git a/test/e2e_node/utils_linux.go b/test/e2e_node/utils_linux.go new file mode 100644 index 00000000000..50f59401ba9 --- /dev/null +++ b/test/e2e_node/utils_linux.go @@ -0,0 +1,28 @@ +// +build linux + +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2enode + +import ( + libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" +) + +// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode. +func IsCgroup2UnifiedMode() bool { + return libcontainercgroups.IsCgroup2UnifiedMode() +} diff --git a/test/e2e_node/utils_unsupported.go b/test/e2e_node/utils_unsupported.go new file mode 100644 index 00000000000..69322913b75 --- /dev/null +++ b/test/e2e_node/utils_unsupported.go @@ -0,0 +1,24 @@ +// +build !linux + +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2enode + +// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode. +func IsCgroup2UnifiedMode() bool { + return false +} From 26d94ad628691a61de52b67a05902eff4cf1753d Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Thu, 9 Apr 2020 12:18:40 +0200 Subject: [PATCH 4/4] kubelet: do not configure the device cgroup Signed-off-by: Giuseppe Scrivano --- pkg/kubelet/cm/cgroup_manager_linux.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go index 8a53688dbee..a9177d94e75 100644 --- a/pkg/kubelet/cm/cgroup_manager_linux.go +++ b/pkg/kubelet/cm/cgroup_manager_linux.go @@ -500,6 +500,8 @@ func setResourcesV2(cgroupConfig *libcontainerconfigs.Cgroup) error { if err := propagateControllers(cgroupConfig.Path); err != nil { return err } + allowAll := true + cgroupConfig.Resources.AllowAllDevices = &allowAll manager, err := cgroupfs2.NewManager(cgroupConfig, cgroupConfig.Path, false) if err != nil {
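
A note on the detection used throughout the series: IsCgroup2UnifiedMode comes from libcontainer, and it boils down to checking whether /sys/fs/cgroup is itself a cgroup2 filesystem. A minimal standalone sketch of the same check (using golang.org/x/sys/unix; illustrative, not code from these patches):

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// isCgroup2UnifiedMode reports whether the host booted with the unified
// hierarchy, i.e. /sys/fs/cgroup is a pure cgroup v2 mount.
func isCgroup2UnifiedMode() bool {
	var st unix.Statfs_t
	if err := unix.Statfs("/sys/fs/cgroup", &st); err != nil {
		return false
	}
	return st.Type == unix.CGROUP2_SUPER_MAGIC
}

func main() {
	fmt.Println("cgroup v2 unified mode:", isCgroup2UnifiedMode())
}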
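
The controller propagation in patch 2 is the subtle part: on cgroup v2 a controller is only usable in a child cgroup if every ancestor lists it in its cgroup.subtree_control file, each controller name prefixed with '+'. The following self-contained sketch (helper names are illustrative, not the kubelet code) shows the same read-then-enable sequence for a single cgroup level:

package main

import (
	"fmt"
	"io/ioutil"
	"path/filepath"
	"strings"
)

// availableControllers reads cgroup.controllers, which lists what the
// kernel offers at this level, e.g. "cpuset cpu io memory hugetlb pids".
func availableControllers(cgroupPath string) ([]string, error) {
	data, err := ioutil.ReadFile(filepath.Join(cgroupPath, "cgroup.controllers"))
	if err != nil {
		return nil, err
	}
	return strings.Fields(string(data)), nil
}

// enableInChildren writes "+cpu +memory ..." to cgroup.subtree_control so
// that the controllers become available in this cgroup's children.
func enableInChildren(cgroupPath string, controllers []string) error {
	parts := make([]string, 0, len(controllers))
	for _, c := range controllers {
		parts = append(parts, "+"+c)
	}
	payload := strings.Join(parts, " ")
	return ioutil.WriteFile(filepath.Join(cgroupPath, "cgroup.subtree_control"), []byte(payload), 0644)
}

func main() {
	root := "/sys/fs/cgroup"
	ctrls, err := availableControllers(root)
	if err != nil {
		fmt.Println("cannot read controllers (cgroup v1 host?):", err)
		return
	}
	// Delegate everything the root offers; propagateControllers in the patch
	// instead filters down to the controllers the kubelet actually manages.
	if err := enableInChildren(root, ctrls); err != nil {
		fmt.Println("enable failed:", err)
	}
}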
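
Patch 3's convertSharesToWeight encodes the standard v1-to-v2 CPU mapping: cpu.shares lives in [2, 262144], cpu.weight in [1, 10000], hence weight = 1 + ((shares - 2) * 9999) / 262142. Together with the "cpu.max" syntax from patch 2 ("<quota> <period>", where a nil quota is written as the literal "max"), the whole CPU translation fits in a few lines. A sketch of both (the 100000us default period is the kernel's, assumed here rather than taken from the diff context):

package main

import (
	"fmt"
	"strconv"
)

// sharesToWeight converts cgroup v1 cpu.shares ([2, 262144]) to
// cgroup v2 cpu.weight ([1, 10000]); same formula as the patch's
// convertSharesToWeight helper. shares=2 maps to 1, 262144 to 10000.
func sharesToWeight(shares int64) int64 {
	return 1 + ((shares-2)*9999)/262142
}

// cpuMax renders the cgroup v2 "cpu.max" value as "<quota> <period>",
// with "max" standing in for an unlimited quota.
func cpuMax(quota *int64, period *uint64) string {
	quotaStr := "max"
	if quota != nil {
		quotaStr = strconv.FormatInt(*quota, 10)
	}
	periodStr := "100000" // kernel default period in microseconds
	if period != nil {
		periodStr = strconv.FormatUint(*period, 10)
	}
	return fmt.Sprintf("%s %s", quotaStr, periodStr)
}

func main() {
	fmt.Println(sharesToWeight(1024)) // default v1 shares -> weight 39
	quota := int64(20000)
	fmt.Println(cpuMax(&quota, nil)) // "20000 100000", i.e. a 200m CPU cap
}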