virtcontainers: Rename and clean the cgroup interface

We now call it a ResourceController, and we make it less Linux specific.
The Linux implementation is now the cgroups one.

Signed-off-by: Samuel Ortiz <s.ortiz@apple.com>
This commit is contained in:
Samuel Ortiz 2022-02-02 15:49:06 +00:00 committed by Samuel Ortiz
parent ad10e201e1
commit 0d1a7da682
4 changed files with 394 additions and 345 deletions

View File

@ -1,4 +1,6 @@
// Copyright (c) 2021 Apple Inc. // +build linux
//
// Copyright (c) 2021-2022 Apple Inc.
// //
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
// //
@ -7,8 +9,11 @@ package cgroups
import ( import (
"fmt" "fmt"
"os"
"path/filepath" "path/filepath"
"sync"
"github.com/containerd/cgroups"
v1 "github.com/containerd/cgroups/stats/v1" v1 "github.com/containerd/cgroups/stats/v1"
"github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
@ -23,17 +28,6 @@ const CgroupKataPrefix = "kata"
// DefaultCgroupPath runtime-determined location in the cgroups hierarchy. // DefaultCgroupPath runtime-determined location in the cgroups hierarchy.
const DefaultCgroupPath = "/vc" const DefaultCgroupPath = "/vc"
var (
cgroupsLogger = logrus.WithField("source", "virtcontainers/pkg/cgroups")
)
// SetLogger sets up a logger for this pkg
func SetLogger(logger *logrus.Entry) {
fields := cgroupsLogger.Data
cgroupsLogger = logger.WithFields(fields)
}
func RenameCgroupPath(path string) (string, error) { func RenameCgroupPath(path string) (string, error) {
if path == "" { if path == "" {
path = DefaultCgroupPath path = DefaultCgroupPath
@ -45,16 +39,297 @@ func RenameCgroupPath(path string) (string, error) {
} }
type Cgroup interface { type LinuxCgroup struct {
Delete() error cgroup cgroups.Cgroup
Stat() (*v1.Metrics, error) path string
AddProcess(int, ...string) error cpusets *specs.LinuxCPU
AddTask(int, ...string) error devices []specs.LinuxDeviceCgroup
Update(*specs.LinuxResources) error
MoveTo(string) error sync.Mutex
MoveToParent() error }
AddDevice(string) error
RemoveDevice(string) error func sandboxDevices() []specs.LinuxDeviceCgroup {
UpdateCpuSet(string, string) error devices := []specs.LinuxDeviceCgroup{}
Path() string
defaultDevices := []string{
"/dev/null",
"/dev/random",
"/dev/full",
"/dev/tty",
"/dev/zero",
"/dev/urandom",
"/dev/console",
}
// Processes running in a device cgroup are constrained: they have access
// only to the devices listed in the devices.list file.
// In order to run virtual machines and create virtqueues, hypervisors
// need access to certain character devices in the host, like kvm and vhost-net.
hypervisorDevices := []string{
"/dev/kvm", // To run virtual machines
"/dev/vhost-net", // To create virtqueues
"/dev/vfio/vfio", // To access VFIO devices
"/dev/vhost-vsock", // To interact with vsock if
}
defaultDevices = append(defaultDevices, hypervisorDevices...)
for _, device := range defaultDevices {
ldevice, err := DeviceToLinuxDevice(device)
if err != nil {
controllerLogger.WithField("source", "cgroups").Warnf("Could not add %s to the devices cgroup", device)
continue
}
devices = append(devices, ldevice)
}
wildcardMajor := int64(-1)
wildcardMinor := int64(-1)
ptsMajor := int64(136)
tunMajor := int64(10)
tunMinor := int64(200)
wildcardDevices := []specs.LinuxDeviceCgroup{
// allow mknod for any device
{
Allow: true,
Type: "c",
Major: &wildcardMajor,
Minor: &wildcardMinor,
Access: "m",
},
{
Allow: true,
Type: "b",
Major: &wildcardMajor,
Minor: &wildcardMinor,
Access: "m",
},
// /dev/pts/ - pts namespaces are "coming soon"
{
Allow: true,
Type: "c",
Major: &ptsMajor,
Minor: &wildcardMinor,
Access: "rwm",
},
// tuntap
{
Allow: true,
Type: "c",
Major: &tunMajor,
Minor: &tunMinor,
Access: "rwm",
},
}
devices = append(devices, wildcardDevices...)
return devices
}
// NewResourceController creates a cgroups v1 resource controller at path,
// configured with resources.
//
// The path is first validated with ValidCgroupPath; the validated path is the
// one stored in the returned controller. The devices and CPU settings found in
// resources are remembered so that later device and cpuset updates can build
// on them.
func NewResourceController(path string, resources *specs.LinuxResources) (ResourceController, error) {
	validPath, err := ValidCgroupPath(path, IsSystemdCgroup(path))
	if err != nil {
		return nil, err
	}

	cg, err := cgroups.New(cgroups.V1, cgroups.StaticPath(validPath), resources)
	if err != nil {
		return nil, err
	}

	controller := &LinuxCgroup{
		cgroup:  cg,
		path:    validPath,
		cpusets: resources.CPU,
		devices: resources.Devices,
	}

	return controller, nil
}
// NewSandboxResourceController creates the resource controller for a sandbox.
//
// The sandbox default devices (see sandboxDevices) are appended to a shallow
// copy of resources. When path is not a systemd cgroup path, or when
// sandboxCgroupOnly is false, the controller is created through
// NewResourceController. Otherwise the cgroup is created directly through
// systemd, loaded, and updated with the sandbox resources.
func NewSandboxResourceController(path string, resources *specs.LinuxResources, sandboxCgroupOnly bool) (ResourceController, error) {
	sandboxResources := *resources
	sandboxResources.Devices = append(sandboxResources.Devices, sandboxDevices()...)

	// Currently we know how to handle a systemd cgroup path only when it is the
	// only cgroup (no overhead group). Hence, if sandboxCgroupOnly is not true
	// we treat it as a cgroupfs path as it used to be, although it may be incorrect.
	if !(IsSystemdCgroup(path) && sandboxCgroupOnly) {
		return NewResourceController(path, &sandboxResources)
	}

	slice, unit, err := getSliceAndUnit(path)
	if err != nil {
		return nil, err
	}

	// github.com/containerd/cgroups doesn't support creating a scope unit with
	// v1 cgroups against systemd. The following interacts directly with systemd
	// to create the cgroup and then loads it using containerd's API.
	// The runtime process is added here, which makes calling setupCgroups redundant.
	if err := createCgroupsSystemd(slice, unit, uint32(os.Getpid())); err != nil {
		return nil, err
	}

	cgHierarchy, cgPath, err := cgroupHierarchy(path)
	if err != nil {
		return nil, err
	}

	// Load the created cgroup, then update it with the sandbox resources.
	cg, err := cgroups.Load(cgHierarchy, cgPath)
	if err != nil {
		return nil, err
	}
	if err := cg.Update(&sandboxResources); err != nil {
		return nil, err
	}

	controller := &LinuxCgroup{
		cgroup:  cg,
		path:    path,
		cpusets: sandboxResources.CPU,
		devices: sandboxResources.Devices,
	}

	return controller, nil
}
// LoadResourceController loads the already existing cgroup found at path and
// wraps it into a resource controller.
//
// Only the path and the loaded cgroup are recorded: the returned controller
// starts with no remembered devices or cpusets.
func LoadResourceController(path string) (ResourceController, error) {
	hierarchy, hierarchyPath, err := cgroupHierarchy(path)
	if err != nil {
		return nil, err
	}

	loaded, err := cgroups.Load(hierarchy, hierarchyPath)
	if err != nil {
		return nil, err
	}

	controller := &LinuxCgroup{
		cgroup: loaded,
		path:   path,
	}

	return controller, nil
}
// Logger returns the package logger tagged with the "cgroups" source field.
func (c *LinuxCgroup) Logger() *logrus.Entry {
	entry := controllerLogger.WithField("source", "cgroups")
	return entry
}
// Delete deletes the underlying cgroup and reports any error it returns.
func (c *LinuxCgroup) Delete() error {
	if err := c.cgroup.Delete(); err != nil {
		return err
	}

	return nil
}
// Stat returns the metrics of the underlying cgroup. The IgnoreNotExist
// error handler is passed to the underlying Stat call.
func (c *LinuxCgroup) Stat() (*v1.Metrics, error) {
	ignoreMissing := cgroups.ErrorHandler(cgroups.IgnoreNotExist)
	return c.cgroup.Stat(ignoreMissing)
}
// AddProcess adds the process identified by pid to the underlying cgroup.
// The subsystems argument is accepted for interface compatibility but is
// not used by this implementation.
func (c *LinuxCgroup) AddProcess(pid int, subsystems ...string) error {
	process := cgroups.Process{Pid: pid}
	return c.cgroup.Add(process)
}
// AddThread adds the thread identified by pid to the underlying cgroup
// through its AddTask call. The subsystems argument is accepted for
// interface compatibility but is not used by this implementation.
func (c *LinuxCgroup) AddThread(pid int, subsystems ...string) error {
	thread := cgroups.Process{Pid: pid}
	return c.cgroup.AddTask(thread)
}
// Update applies resources to the underlying cgroup. The devices and cpusets
// remembered by the controller are left untouched.
func (c *LinuxCgroup) Update(resources *specs.LinuxResources) error {
	if err := c.cgroup.Update(resources); err != nil {
		return err
	}

	return nil
}
// MoveTo loads the cgroup found at path and moves the underlying cgroup to
// it, using the MoveTo call of the cgroups library. The destination must
// already exist, since it is loaded rather than created.
func (c *LinuxCgroup) MoveTo(path string) error {
	hierarchy, hierarchyPath, err := cgroupHierarchy(path)
	if err != nil {
		return err
	}

	destination, err := cgroups.Load(hierarchy, hierarchyPath)
	if err != nil {
		return err
	}

	return c.cgroup.MoveTo(destination)
}
// AddDevice converts the host device at deviceHostPath into a device cgroup
// rule, appends it to the remembered device list and pushes the whole list
// to the underlying cgroup.
//
// The controller lock is held while the list is modified and applied.
func (c *LinuxCgroup) AddDevice(deviceHostPath string) error {
	rule, err := DeviceToLinuxDevice(deviceHostPath)
	if err != nil {
		return err
	}

	c.Lock()
	defer c.Unlock()

	c.devices = append(c.devices, rule)

	update := &specs.LinuxResources{Devices: c.devices}
	return c.cgroup.Update(update)
}
// RemoveDevice converts the host device at deviceHostPath into a device
// cgroup rule, drops every remembered rule with the same type, major and
// minor numbers, and pushes the remaining list to the underlying cgroup.
//
// The controller lock is held while the list is modified and applied.
func (c *LinuxCgroup) RemoveDevice(deviceHostPath string) error {
	deviceResource, err := DeviceToLinuxDevice(deviceHostPath)
	if err != nil {
		return err
	}

	// Major and Minor are *int64: comparing them with == compares addresses,
	// which never match for a freshly converted device. Compare the values
	// they point to, treating two nil pointers as equal.
	sameNumber := func(a, b *int64) bool {
		if a == nil || b == nil {
			return a == b
		}
		return *a == *b
	}

	c.Lock()
	defer c.Unlock()

	// Build the filtered list separately instead of shrinking c.devices while
	// ranging over it: the latter skips entries and can panic when several
	// rules match.
	kept := make([]specs.LinuxDeviceCgroup, 0, len(c.devices))
	for _, d := range c.devices {
		if d.Type == deviceResource.Type &&
			sameNumber(d.Major, deviceResource.Major) &&
			sameNumber(d.Minor, deviceResource.Minor) {
			continue
		}
		kept = append(kept, d)
	}
	c.devices = kept

	return c.cgroup.Update(&specs.LinuxResources{
		Devices: c.devices,
	})
}
// UpdateCpuSet records cpuset and memset in the remembered CPU settings and
// applies those settings to the underlying cgroup.
//
// An empty cpuset or memset leaves the corresponding remembered value
// unchanged. The controller lock is held for the whole operation.
func (c *LinuxCgroup) UpdateCpuSet(cpuset, memset string) error {
	c.Lock()
	defer c.Unlock()

	// Lazily create the CPU settings the first time there is a value to store.
	if c.cpusets == nil && (cpuset != "" || memset != "") {
		c.cpusets = &specs.LinuxCPU{}
	}

	if cpuset != "" {
		c.cpusets.Cpus = cpuset
	}

	if memset != "" {
		c.cpusets.Mems = memset
	}

	update := &specs.LinuxResources{CPU: c.cpusets}
	return c.cgroup.Update(update)
}
// Type reports that this controller is backed by Linux cgroups.
func (c *LinuxCgroup) Type() ResourceControllerType {
	var controllerType ResourceControllerType = LinuxCgroups
	return controllerType
}
// ID returns the controller identifier, which is the cgroup path recorded
// when the controller was created or loaded.
func (c *LinuxCgroup) ID() string {
	id := c.path
	return id
}
// Parent returns the identifier of the parent controller: the directory
// component of the recorded cgroup path, as computed by filepath.Dir.
func (c *LinuxCgroup) Parent() string {
return filepath.Dir(c.path)
} }

View File

@ -1,310 +0,0 @@
// Copyright (c) 2021-2022 Apple Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
package cgroups
import (
"os"
"path/filepath"
"sync"
"github.com/containerd/cgroups"
v1 "github.com/containerd/cgroups/stats/v1"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
)
// LinuxCgroup is the Linux cgroups implementation of the Cgroup interface.
// It wraps a containerd cgroups handle and remembers the device rules and
// CPU settings it applied, so that later incremental updates can build on them.
type LinuxCgroup struct {
// cgroup is the underlying containerd cgroups handle.
cgroup cgroups.Cgroup
// path is the cgroup path recorded at creation or load time.
path string
// cpusets holds the last known CPU settings; it may be nil.
cpusets *specs.LinuxCPU
// devices holds the device cgroup rules applied so far.
devices []specs.LinuxDeviceCgroup
// The embedded mutex is locked by AddDevice, RemoveDevice and UpdateCpuSet
// while they modify devices or cpusets.
sync.Mutex
}
// sandboxDevices returns the device cgroup rules granted to every sandbox:
// a set of standard devices, the host devices a hypervisor needs, and a few
// fixed rules (mknod for any device, pts, tuntap).
//
// A device that cannot be converted by DeviceToLinuxDevice is logged with a
// warning and skipped.
func sandboxDevices() []specs.LinuxDeviceCgroup {
	hostPaths := []string{
		"/dev/null",
		"/dev/random",
		"/dev/full",
		"/dev/tty",
		"/dev/zero",
		"/dev/urandom",
		"/dev/console",
	}

	// Processes running in a device cgroup are constrained: they have access
	// only to the devices listed in the devices.list file.
	// In order to run virtual machines and create virtqueues, hypervisors
	// need access to certain character devices in the host, like kvm and vhost-net.
	hostPaths = append(hostPaths,
		"/dev/kvm",         // To run virtual machines
		"/dev/vhost-net",   // To create virtqueues
		"/dev/vfio/vfio",   // To access VFIO devices
		"/dev/vhost-vsock", // To interact with vsock
	)

	rules := []specs.LinuxDeviceCgroup{}
	for _, hostPath := range hostPaths {
		rule, err := DeviceToLinuxDevice(hostPath)
		if err != nil {
			cgroupsLogger.WithField("source", "cgroups").Warnf("Could not add %s to the devices cgroup", hostPath)
			continue
		}
		rules = append(rules, rule)
	}

	var (
		anyMajor = int64(-1)
		anyMinor = int64(-1)
		ptsMajor = int64(136)
		tunMajor = int64(10)
		tunMinor = int64(200)
	)

	return append(rules,
		// allow mknod for any character device
		specs.LinuxDeviceCgroup{
			Allow:  true,
			Type:   "c",
			Major:  &anyMajor,
			Minor:  &anyMinor,
			Access: "m",
		},
		// allow mknod for any block device
		specs.LinuxDeviceCgroup{
			Allow:  true,
			Type:   "b",
			Major:  &anyMajor,
			Minor:  &anyMinor,
			Access: "m",
		},
		// /dev/pts/ - pts namespaces are "coming soon"
		specs.LinuxDeviceCgroup{
			Allow:  true,
			Type:   "c",
			Major:  &ptsMajor,
			Minor:  &anyMinor,
			Access: "rwm",
		},
		// tuntap
		specs.LinuxDeviceCgroup{
			Allow:  true,
			Type:   "c",
			Major:  &tunMajor,
			Minor:  &tunMinor,
			Access: "rwm",
		},
	)
}
// NewCgroup creates a cgroups v1 cgroup at path, configured with resources.
//
// The path is first validated with ValidCgroupPath; the validated path is the
// one stored in the returned cgroup. The devices and CPU settings found in
// resources are remembered so that later device and cpuset updates can build
// on them.
func NewCgroup(path string, resources *specs.LinuxResources) (Cgroup, error) {
	validPath, err := ValidCgroupPath(path, IsSystemdCgroup(path))
	if err != nil {
		return nil, err
	}

	cg, err := cgroups.New(cgroups.V1, cgroups.StaticPath(validPath), resources)
	if err != nil {
		return nil, err
	}

	linuxCgroup := &LinuxCgroup{
		cgroup:  cg,
		path:    validPath,
		cpusets: resources.CPU,
		devices: resources.Devices,
	}

	return linuxCgroup, nil
}
// NewSandboxCgroup creates the cgroup for a sandbox.
//
// The sandbox default devices (see sandboxDevices) are appended to a shallow
// copy of resources. When path is not a systemd cgroup path, or when
// sandboxCgroupOnly is false, the cgroup is created through NewCgroup.
// Otherwise the cgroup is created directly through systemd, loaded, and
// updated with the sandbox resources.
func NewSandboxCgroup(path string, resources *specs.LinuxResources, sandboxCgroupOnly bool) (Cgroup, error) {
	sandboxResources := *resources
	sandboxResources.Devices = append(sandboxResources.Devices, sandboxDevices()...)

	// Currently we know how to handle a systemd cgroup path only when it is the
	// only cgroup (no overhead group). Hence, if sandboxCgroupOnly is not true
	// we treat it as a cgroupfs path as it used to be, although it may be incorrect.
	if !(IsSystemdCgroup(path) && sandboxCgroupOnly) {
		return NewCgroup(path, &sandboxResources)
	}

	slice, unit, err := getSliceAndUnit(path)
	if err != nil {
		return nil, err
	}

	// github.com/containerd/cgroups doesn't support creating a scope unit with
	// v1 cgroups against systemd. The following interacts directly with systemd
	// to create the cgroup and then loads it using containerd's API.
	// The runtime process is added here, which makes calling setupCgroups redundant.
	if err := createCgroupsSystemd(slice, unit, uint32(os.Getpid())); err != nil {
		return nil, err
	}

	cgHierarchy, cgPath, err := cgroupHierarchy(path)
	if err != nil {
		return nil, err
	}

	// Load the created cgroup, then update it with the sandbox resources.
	cg, err := cgroups.Load(cgHierarchy, cgPath)
	if err != nil {
		return nil, err
	}
	if err := cg.Update(&sandboxResources); err != nil {
		return nil, err
	}

	linuxCgroup := &LinuxCgroup{
		cgroup:  cg,
		path:    path,
		cpusets: sandboxResources.CPU,
		devices: sandboxResources.Devices,
	}

	return linuxCgroup, nil
}
// LoadCgroup loads the already existing cgroup found at path.
//
// Only the path and the loaded cgroup are recorded: the returned cgroup
// starts with no remembered devices or cpusets.
func LoadCgroup(path string) (Cgroup, error) {
	hierarchy, hierarchyPath, err := cgroupHierarchy(path)
	if err != nil {
		return nil, err
	}

	loaded, err := cgroups.Load(hierarchy, hierarchyPath)
	if err != nil {
		return nil, err
	}

	linuxCgroup := &LinuxCgroup{
		cgroup: loaded,
		path:   path,
	}

	return linuxCgroup, nil
}
// Logger returns the package logger tagged with the "cgroups" source field.
func (c *LinuxCgroup) Logger() *logrus.Entry {
	entry := cgroupsLogger.WithField("source", "cgroups")
	return entry
}
// Delete deletes the underlying cgroup and reports any error it returns.
func (c *LinuxCgroup) Delete() error {
	if err := c.cgroup.Delete(); err != nil {
		return err
	}

	return nil
}
// Stat returns the metrics of the underlying cgroup. The IgnoreNotExist
// error handler is passed to the underlying Stat call.
func (c *LinuxCgroup) Stat() (*v1.Metrics, error) {
	ignoreMissing := cgroups.ErrorHandler(cgroups.IgnoreNotExist)
	return c.cgroup.Stat(ignoreMissing)
}
// AddProcess adds the process identified by pid to the underlying cgroup.
// The subsystems argument is accepted for interface compatibility but is
// not used by this implementation.
func (c *LinuxCgroup) AddProcess(pid int, subsystems ...string) error {
	process := cgroups.Process{Pid: pid}
	return c.cgroup.Add(process)
}
// AddTask adds the task identified by pid to the underlying cgroup through
// its AddTask call. The subsystems argument is accepted for interface
// compatibility but is not used by this implementation.
func (c *LinuxCgroup) AddTask(pid int, subsystems ...string) error {
	task := cgroups.Process{Pid: pid}
	return c.cgroup.AddTask(task)
}
// Update applies resources to the underlying cgroup. The devices and cpusets
// remembered by the cgroup wrapper are left untouched.
func (c *LinuxCgroup) Update(resources *specs.LinuxResources) error {
	if err := c.cgroup.Update(resources); err != nil {
		return err
	}

	return nil
}
// MoveTo loads the cgroup found at path and moves the underlying cgroup to
// it, using the MoveTo call of the cgroups library. The destination must
// already exist, since it is loaded rather than created.
func (c *LinuxCgroup) MoveTo(path string) error {
	hierarchy, hierarchyPath, err := cgroupHierarchy(path)
	if err != nil {
		return err
	}

	destination, err := cgroups.Load(hierarchy, hierarchyPath)
	if err != nil {
		return err
	}

	return c.cgroup.MoveTo(destination)
}
// MoveToParent moves the cgroup to its parent, i.e. the directory component
// of the recorded cgroup path as computed by filepath.Dir.
func (c *LinuxCgroup) MoveToParent() error {
	return c.MoveTo(filepath.Dir(c.path))
}
// AddDevice converts the host device at deviceHostPath into a device cgroup
// rule, appends it to the remembered device list and pushes the whole list
// to the underlying cgroup.
//
// The cgroup lock is held while the list is modified and applied.
func (c *LinuxCgroup) AddDevice(deviceHostPath string) error {
	rule, err := DeviceToLinuxDevice(deviceHostPath)
	if err != nil {
		return err
	}

	c.Lock()
	defer c.Unlock()

	c.devices = append(c.devices, rule)

	update := &specs.LinuxResources{Devices: c.devices}
	return c.cgroup.Update(update)
}
// RemoveDevice converts the host device at deviceHostPath into a device
// cgroup rule, drops every remembered rule with the same type, major and
// minor numbers, and pushes the remaining list to the underlying cgroup.
//
// The cgroup lock is held while the list is modified and applied.
func (c *LinuxCgroup) RemoveDevice(deviceHostPath string) error {
	deviceResource, err := DeviceToLinuxDevice(deviceHostPath)
	if err != nil {
		return err
	}

	// Major and Minor are *int64: comparing them with == compares addresses,
	// which never match for a freshly converted device. Compare the values
	// they point to, treating two nil pointers as equal.
	sameNumber := func(a, b *int64) bool {
		if a == nil || b == nil {
			return a == b
		}
		return *a == *b
	}

	c.Lock()
	defer c.Unlock()

	// Build the filtered list separately instead of shrinking c.devices while
	// ranging over it: the latter skips entries and can panic when several
	// rules match.
	kept := make([]specs.LinuxDeviceCgroup, 0, len(c.devices))
	for _, d := range c.devices {
		if d.Type == deviceResource.Type &&
			sameNumber(d.Major, deviceResource.Major) &&
			sameNumber(d.Minor, deviceResource.Minor) {
			continue
		}
		kept = append(kept, d)
	}
	c.devices = kept

	return c.cgroup.Update(&specs.LinuxResources{
		Devices: c.devices,
	})
}
// UpdateCpuSet records cpuset and memset in the remembered CPU settings and
// applies those settings to the underlying cgroup.
//
// An empty cpuset or memset leaves the corresponding remembered value
// unchanged. The cgroup lock is held for the whole operation.
func (c *LinuxCgroup) UpdateCpuSet(cpuset, memset string) error {
	c.Lock()
	defer c.Unlock()

	// Lazily create the CPU settings the first time there is a value to store.
	if c.cpusets == nil && (cpuset != "" || memset != "") {
		c.cpusets = &specs.LinuxCPU{}
	}

	if cpuset != "" {
		c.cpusets.Cpus = cpuset
	}

	if memset != "" {
		c.cpusets.Mems = memset
	}

	update := &specs.LinuxResources{CPU: c.cpusets}
	return c.cgroup.Update(update)
}
// Path returns the cgroup path recorded when the cgroup was created or loaded.
func (c *LinuxCgroup) Path() string {
	path := c.path
	return path
}

View File

@ -0,0 +1,82 @@
// Copyright (c) 2021 Apple Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
package cgroups
import (
v1 "github.com/containerd/cgroups/stats/v1"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
)
// controllerLogger is the package-wide logger. It carries a "source" field
// identifying this package.
var controllerLogger = logrus.WithField("source", "virtcontainers/pkg/cgroups")

// SetLogger replaces the package logger with logger, carrying over the
// fields attached to the current package logger.
func SetLogger(logger *logrus.Entry) {
	controllerLogger = logger.WithFields(controllerLogger.Data)
}
// ResourceControllerType describes a resource controller implementation type.
type ResourceControllerType string

const (
	// LinuxCgroups identifies the Linux cgroups based resource controller.
	LinuxCgroups ResourceControllerType = "cgroups"
)

// String converts a resource controller type to a string. Unknown types are
// reported as "Unknown controller type".
//
// The method uses a value receiver so that constants such as LinuxCgroups and
// other non-addressable values can call it and be used wherever a String
// method is expected; pointers to a ResourceControllerType keep working too.
func (rType ResourceControllerType) String() string {
	switch rType {
	case LinuxCgroups:
		return string(LinuxCgroups)
	default:
		return "Unknown controller type"
	}
}
// ResourceController represents a system resources controller.
// On Linux this interface is implemented through the cgroups API.
type ResourceController interface {
	// Type returns the resource controller implementation type.
	Type() ResourceControllerType

	// ID returns the controller identifier, e.g. a Linux cgroups path.
	ID() string

	// Parent returns the identifier of the parent controller, for
	// hierarchically defined resources (e.g. Linux cgroups).
	Parent() string

	// Delete deletes the controller.
	Delete() error

	// Stat returns the statistics for the controller.
	Stat() (*v1.Metrics, error)

	// AddProcess adds a process to a set of controllers.
	AddProcess(pid int, subsystems ...string) error

	// AddThread adds a process thread to a set of controllers.
	AddThread(pid int, subsystems ...string) error

	// Update updates the set of resources controlled, based on
	// an OCI resources description.
	Update(resources *specs.LinuxResources) error

	// MoveTo moves a controller to another one, identified by path.
	MoveTo(path string) error

	// AddDevice adds a device resource to the controller.
	AddDevice(deviceHostPath string) error

	// RemoveDevice removes a device resource from the controller.
	RemoveDevice(deviceHostPath string) error

	// UpdateCpuSet updates the set of controlled CPUs and memory nodes.
	UpdateCpuSet(cpuset, memset string) error
}

View File

@ -199,8 +199,8 @@ type Sandbox struct {
config *SandboxConfig config *SandboxConfig
annotationsLock *sync.RWMutex annotationsLock *sync.RWMutex
wg *sync.WaitGroup wg *sync.WaitGroup
sandboxCgroup cgroups.Cgroup sandboxCgroup cgroups.ResourceController
overheadCgroup cgroups.Cgroup overheadCgroup cgroups.ResourceController
cw *consoleWatcher cw *consoleWatcher
containers map[string]*Container containers map[string]*Container
@ -672,13 +672,13 @@ func (s *Sandbox) createCgroups() error {
// Depending on the SandboxCgroupOnly value, this cgroup // Depending on the SandboxCgroupOnly value, this cgroup
// will either hold all the pod threads (SandboxCgroupOnly is true) // will either hold all the pod threads (SandboxCgroupOnly is true)
// or only the virtual CPU ones (SandboxCgroupOnly is false). // or only the virtual CPU ones (SandboxCgroupOnly is false).
s.sandboxCgroup, err = cgroups.NewSandboxCgroup(cgroupPath, &resources, s.config.SandboxCgroupOnly) s.sandboxCgroup, err = cgroups.NewSandboxResourceController(cgroupPath, &resources, s.config.SandboxCgroupOnly)
if err != nil { if err != nil {
return fmt.Errorf("Could not create the sandbox cgroup %v", err) return fmt.Errorf("Could not create the sandbox cgroup %v", err)
} }
// Now that the sandbox cgroup is created, we can set the state cgroup root paths. // Now that the sandbox cgroup is created, we can set the state cgroup root paths.
s.state.SandboxCgroupPath = s.sandboxCgroup.Path() s.state.SandboxCgroupPath = s.sandboxCgroup.ID()
s.state.OverheadCgroupPath = "" s.state.OverheadCgroupPath = ""
if s.config.SandboxCgroupOnly { if s.config.SandboxCgroupOnly {
@ -688,14 +688,14 @@ func (s *Sandbox) createCgroups() error {
// into the sandbox cgroup. // into the sandbox cgroup.
// We're creating an overhead cgroup, with no constraints. Everything but // We're creating an overhead cgroup, with no constraints. Everything but
// the vCPU threads will eventually make it there. // the vCPU threads will eventually make it there.
overheadCgroup, err := cgroups.NewCgroup(fmt.Sprintf("/%s/%s", cgroupKataOverheadPath, s.id), &specs.LinuxResources{}) overheadCgroup, err := cgroups.NewResourceController(fmt.Sprintf("/%s/%s", cgroupKataOverheadPath, s.id), &specs.LinuxResources{})
// TODO: support systemd cgroups overhead cgroup // TODO: support systemd cgroups overhead cgroup
// https://github.com/kata-containers/kata-containers/issues/2963 // https://github.com/kata-containers/kata-containers/issues/2963
if err != nil { if err != nil {
return err return err
} }
s.overheadCgroup = overheadCgroup s.overheadCgroup = overheadCgroup
s.state.OverheadCgroupPath = s.overheadCgroup.Path() s.state.OverheadCgroupPath = s.overheadCgroup.ID()
} }
return nil return nil
@ -2131,12 +2131,13 @@ func (s *Sandbox) cgroupsDelete() error {
return nil return nil
} }
sandboxCgroup, err := cgroups.LoadCgroup(s.state.SandboxCgroupPath) sandboxCgroup, err := cgroups.LoadResourceController(s.state.SandboxCgroupPath)
if err != nil { if err != nil {
return err return err
} }
if err := sandboxCgroup.MoveToParent(); err != nil { resCtrlParent := sandboxCgroup.Parent()
if err := sandboxCgroup.MoveTo(resCtrlParent); err != nil {
return err return err
} }
@ -2145,12 +2146,13 @@ func (s *Sandbox) cgroupsDelete() error {
} }
if s.state.OverheadCgroupPath != "" { if s.state.OverheadCgroupPath != "" {
overheadCgroup, err := cgroups.LoadCgroup(s.state.OverheadCgroupPath) overheadCgroup, err := cgroups.LoadResourceController(s.state.OverheadCgroupPath)
if err != nil { if err != nil {
return err return err
} }
if err := s.overheadCgroup.MoveToParent(); err != nil { resCtrlParent := overheadCgroup.Parent()
if err := s.overheadCgroup.MoveTo(resCtrlParent); err != nil {
return err return err
} }
@ -2171,7 +2173,7 @@ func (s *Sandbox) constrainHypervisor(ctx context.Context) error {
// All vCPU threads move to the sandbox cgroup. // All vCPU threads move to the sandbox cgroup.
for _, i := range tids.vcpus { for _, i := range tids.vcpus {
if err := s.sandboxCgroup.AddTask(i); err != nil { if err := s.sandboxCgroup.AddThread(i); err != nil {
return err return err
} }
} }