From b42ed39349bceae80d54c518be79bda31026a55d Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Mon, 19 Jul 2021 16:57:28 +0200 Subject: [PATCH] virtcontainers: cgroups: Add a containerd API based cgroups package Eventually, we will convert the virtcontainers and the whole Kata runtime code base to only rely on that package. This will make Kata only depends on the simpler containerd cgroups API. Signed-off-by: Samuel Ortiz --- .../virtcontainers/pkg/cgroups/cgroups.go | 289 ++++++++++++++++++ 1 file changed, 289 insertions(+) create mode 100644 src/runtime/virtcontainers/pkg/cgroups/cgroups.go diff --git a/src/runtime/virtcontainers/pkg/cgroups/cgroups.go b/src/runtime/virtcontainers/pkg/cgroups/cgroups.go new file mode 100644 index 0000000000..a138be092f --- /dev/null +++ b/src/runtime/virtcontainers/pkg/cgroups/cgroups.go @@ -0,0 +1,289 @@ +// Copyright (c) 2021 Apple Inc. +// +// SPDX-License-Identifier: Apache-2.0 +// + +package cgroups + +import ( + "path/filepath" + "sync" + + "github.com/containerd/cgroups" + v1 "github.com/containerd/cgroups/stats/v1" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + + "golang.org/x/sys/unix" +) + +type Cgroup struct { + cgroup cgroups.Cgroup + path string + cpusets *specs.LinuxCPU + devices []specs.LinuxDeviceCgroup + + sync.Mutex +} + +func deviceToDeviceCgroup(device string) (*specs.LinuxDeviceCgroup, error) { + var st unix.Stat_t + + if err := unix.Stat(device, &st); err != nil { + return nil, err + } + + devType := "" + switch st.Mode & unix.S_IFMT { + case unix.S_IFCHR: + devType = "c" + case unix.S_IFBLK: + devType = "b" + } + + major := int64(unix.Major(st.Rdev)) + minor := int64(unix.Minor(st.Rdev)) + + return &specs.LinuxDeviceCgroup{ + Allow: true, + Type: devType, + Major: &major, + Minor: &minor, + Access: "rwm", + }, nil +} + +func sandboxDevices() []specs.LinuxDeviceCgroup { + devices := []specs.LinuxDeviceCgroup{} + + defaultDevices := []string{ + "/dev/null", + "/dev/random", + "/dev/full", + "/dev/tty", + "/dev/zero", + "/dev/urandom", + "/dev/console", + } + + // Processes running in a device-cgroup are constrained, they have acccess + // only to the devices listed in the devices.list file. + // In order to run Virtual Machines and create virtqueues, hypervisors + // need access to certain character devices in the host, like kvm and vhost-net. + hypervisorDevices := []string{ + "/dev/kvm", // To run virtual machines + "/dev/vhost-net", // To create virtqueues + "/dev/vfio/vfio", // To access VFIO devices + } + + defaultDevices = append(defaultDevices, hypervisorDevices...) + + for _, device := range defaultDevices { + ldevice, err := deviceToDeviceCgroup(device) + if err != nil { + cgroupsLogger.WithField("source", "cgroups").Warnf("Could not add %s to the devices cgroup", device) + continue + } + devices = append(devices, *ldevice) + } + + wildcardMajor := int64(-1) + wildcardMinor := int64(-1) + ptsMajor := int64(136) + tunMajor := int64(10) + tunMinor := int64(200) + + wildcardDevices := []specs.LinuxDeviceCgroup{ + // allow mknod for any device + { + Allow: true, + Type: "c", + Major: &wildcardMajor, + Minor: &wildcardMinor, + Access: "m", + }, + { + Allow: true, + Type: "b", + Major: &wildcardMajor, + Minor: &wildcardMinor, + Access: "m", + }, + // /dev/pts/ - pts namespaces are "coming soon" + { + Allow: true, + Type: "c", + Major: &ptsMajor, + Minor: &wildcardMinor, + Access: "rwm", + }, + // tuntap + { + Allow: true, + Type: "c", + Major: &tunMajor, + Minor: &tunMinor, + Access: "rwm", + }, + } + + devices = append(devices, wildcardDevices...) + + return devices +} + +func NewCgroup(path string, resources *specs.LinuxResources) (*Cgroup, error) { + var err error + + cgroupPath, err := ValidCgroupPath(path, IsSystemdCgroup(path)) + if err != nil { + return nil, err + } + + cgroup, err := cgroups.New(cgroups.V1, cgroups.StaticPath(cgroupPath), resources) + if err != nil { + return nil, err + } + + return &Cgroup{ + path: cgroupPath, + devices: resources.Devices, + cpusets: resources.CPU, + cgroup: cgroup, + }, nil +} + +func NewSandboxCgroup(path string, resources *specs.LinuxResources) (*Cgroup, error) { + sandboxResources := *resources + sandboxResources.Devices = append(sandboxResources.Devices, sandboxDevices()...) + + return NewCgroup(path, &sandboxResources) +} + +func Load(path string) (*Cgroup, error) { + cgroup, err := cgroups.Load(cgroups.V1, cgroups.StaticPath(path)) + if err != nil { + return nil, err + } + + return &Cgroup{ + path: path, + cgroup: cgroup, + }, nil +} + +func (c *Cgroup) Logger() *logrus.Entry { + return cgroupsLogger.WithField("source", "cgroups") +} + +func (c *Cgroup) Delete() error { + return c.cgroup.Delete() +} + +func (c *Cgroup) Stat() (*v1.Metrics, error) { + return c.cgroup.Stat(cgroups.ErrorHandler(cgroups.IgnoreNotExist)) +} + +func (c *Cgroup) AddProcess(pid int, subsystems ...string) error { + return c.cgroup.Add(cgroups.Process{Pid: pid}) +} + +func (c *Cgroup) AddTask(pid int, subsystems ...string) error { + return c.cgroup.AddTask(cgroups.Process{Pid: pid}) +} + +func (c *Cgroup) Update(resources *specs.LinuxResources) error { + return c.cgroup.Update(resources) +} + +func (c *Cgroup) MoveTo(path string) error { + newCgroup, err := cgroups.Load(cgroups.V1, cgroups.StaticPath(path)) + if err != nil { + return err + } + + return c.cgroup.MoveTo(newCgroup) +} + +func (c *Cgroup) MoveToParent() error { + parentPath := filepath.Dir(c.path) + + return c.MoveTo(parentPath) +} + +func (c *Cgroup) AddDevice(deviceHostPath string) error { + deviceResource, err := DeviceToLinuxDevice(deviceHostPath) + if err != nil { + return err + } + + c.Lock() + defer c.Unlock() + + c.devices = append(c.devices, deviceResource) + + if err := c.cgroup.Update(&specs.LinuxResources{ + Devices: c.devices, + }); err != nil { + return err + } + + return nil +} + +func (c *Cgroup) RemoveDevice(deviceHostPath string) error { + deviceResource, err := DeviceToLinuxDevice(deviceHostPath) + if err != nil { + return err + } + + c.Lock() + defer c.Unlock() + + for i, d := range c.devices { + if d.Type == deviceResource.Type && + d.Major == deviceResource.Major && + d.Minor == deviceResource.Minor { + c.devices = append(c.devices[:i], c.devices[i+1:]...) + } + } + + if err := c.cgroup.Update(&specs.LinuxResources{ + Devices: c.devices, + }); err != nil { + return err + } + + return nil +} + +func (c *Cgroup) UpdateCpuSet(cpuset, memset string) error { + c.Lock() + defer c.Unlock() + + if len(cpuset) > 0 { + // If we didn't have a cpuset defined, let's create: + if c.cpusets == nil { + c.cpusets = &specs.LinuxCPU{} + } + + c.cpusets.Cpus = cpuset + } + + if len(memset) > 0 { + // If we didn't have a cpuset defined, let's now create: + if c.cpusets == nil { + c.cpusets = &specs.LinuxCPU{} + } + + c.cpusets.Mems = memset + } + + return c.cgroup.Update(&specs.LinuxResources{ + CPU: c.cpusets, + }) +} + +func (c *Cgroup) Path() string { + return c.path +}