diff --git a/src/runtime/virtcontainers/pkg/cgroups/cgroups.go b/src/runtime/virtcontainers/pkg/cgroups/cgroups.go index bc12cb2b26..19aeb8ced6 100644 --- a/src/runtime/virtcontainers/pkg/cgroups/cgroups.go +++ b/src/runtime/virtcontainers/pkg/cgroups/cgroups.go @@ -1,4 +1,6 @@ -// Copyright (c) 2021 Apple Inc. +// +build linux +// +// Copyright (c) 2021-2022 Apple Inc. // // SPDX-License-Identifier: Apache-2.0 // @@ -7,8 +9,11 @@ package cgroups import ( "fmt" + "os" "path/filepath" + "sync" + "github.com/containerd/cgroups" v1 "github.com/containerd/cgroups/stats/v1" "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" @@ -23,17 +28,6 @@ const CgroupKataPrefix = "kata" // DefaultCgroupPath runtime-determined location in the cgroups hierarchy. const DefaultCgroupPath = "/vc" -var ( - cgroupsLogger = logrus.WithField("source", "virtcontainers/pkg/cgroups") -) - -// SetLogger sets up a logger for this pkg -func SetLogger(logger *logrus.Entry) { - fields := cgroupsLogger.Data - - cgroupsLogger = logger.WithFields(fields) -} - func RenameCgroupPath(path string) (string, error) { if path == "" { path = DefaultCgroupPath @@ -45,16 +39,297 @@ func RenameCgroupPath(path string) (string, error) { } -type Cgroup interface { - Delete() error - Stat() (*v1.Metrics, error) - AddProcess(int, ...string) error - AddTask(int, ...string) error - Update(*specs.LinuxResources) error - MoveTo(string) error - MoveToParent() error - AddDevice(string) error - RemoveDevice(string) error - UpdateCpuSet(string, string) error - Path() string +type LinuxCgroup struct { + cgroup cgroups.Cgroup + path string + cpusets *specs.LinuxCPU + devices []specs.LinuxDeviceCgroup + + sync.Mutex +} + +func sandboxDevices() []specs.LinuxDeviceCgroup { + devices := []specs.LinuxDeviceCgroup{} + + defaultDevices := []string{ + "/dev/null", + "/dev/random", + "/dev/full", + "/dev/tty", + "/dev/zero", + "/dev/urandom", + "/dev/console", + } + + // Processes running in a device-cgroup 
are constrained, they have access + // only to the devices listed in the devices.list file. + // In order to run Virtual Machines and create virtqueues, hypervisors + // need access to certain character devices in the host, like kvm and vhost-net. + hypervisorDevices := []string{ + "/dev/kvm", // To run virtual machines + "/dev/vhost-net", // To create virtqueues + "/dev/vfio/vfio", // To access VFIO devices + "/dev/vhost-vsock", // To interact with vsock + } + + defaultDevices = append(defaultDevices, hypervisorDevices...) + + for _, device := range defaultDevices { + ldevice, err := DeviceToLinuxDevice(device) + if err != nil { + controllerLogger.WithField("source", "cgroups").Warnf("Could not add %s to the devices cgroup", device) + continue + } + devices = append(devices, ldevice) + } + + wildcardMajor := int64(-1) + wildcardMinor := int64(-1) + ptsMajor := int64(136) + tunMajor := int64(10) + tunMinor := int64(200) + + wildcardDevices := []specs.LinuxDeviceCgroup{ + // allow mknod for any device + { + Allow: true, + Type: "c", + Major: &wildcardMajor, + Minor: &wildcardMinor, + Access: "m", + }, + { + Allow: true, + Type: "b", + Major: &wildcardMajor, + Minor: &wildcardMinor, + Access: "m", + }, + // /dev/pts/ - pts namespaces are "coming soon" + { + Allow: true, + Type: "c", + Major: &ptsMajor, + Minor: &wildcardMinor, + Access: "rwm", + }, + // tuntap + { + Allow: true, + Type: "c", + Major: &tunMajor, + Minor: &tunMinor, + Access: "rwm", + }, + } + + devices = append(devices, wildcardDevices...)
+ + return devices +} + +func NewResourceController(path string, resources *specs.LinuxResources) (ResourceController, error) { + var err error + + cgroupPath, err := ValidCgroupPath(path, IsSystemdCgroup(path)) + if err != nil { + return nil, err + } + + cgroup, err := cgroups.New(cgroups.V1, cgroups.StaticPath(cgroupPath), resources) + if err != nil { + return nil, err + } + + return &LinuxCgroup{ + path: cgroupPath, + devices: resources.Devices, + cpusets: resources.CPU, + cgroup: cgroup, + }, nil +} + +func NewSandboxResourceController(path string, resources *specs.LinuxResources, sandboxCgroupOnly bool) (ResourceController, error) { + var cgroup cgroups.Cgroup + sandboxResources := *resources + sandboxResources.Devices = append(sandboxResources.Devices, sandboxDevices()...) + + // Currently we know to handle systemd cgroup path only when it's the only cgroup (no overhead group), hence, + // if sandboxCgroupOnly is not true we treat it as cgroupfs path as it used to be, although it may be incorrect + if !IsSystemdCgroup(path) || !sandboxCgroupOnly { + return NewResourceController(path, &sandboxResources) + } + + slice, unit, err := getSliceAndUnit(path) + if err != nil { + return nil, err + } + // github.com/containerd/cgroups doesn't support creating a scope unit with + // v1 cgroups against systemd, the following interacts directly with systemd + // to create the cgroup and then load it using containerd's api + err = createCgroupsSystemd(slice, unit, uint32(os.Getpid())) // adding runtime process, it makes calling setupCgroups redundant + if err != nil { + return nil, err + } + + cgHierarchy, cgPath, err := cgroupHierarchy(path) + if err != nil { + return nil, err + } + + // load created cgroup and update with resources + if cgroup, err = cgroups.Load(cgHierarchy, cgPath); err == nil { + err = cgroup.Update(&sandboxResources) + } + + if err != nil { + return nil, err + } + + return &LinuxCgroup{ + path: path, + devices: sandboxResources.Devices, + cpusets: 
sandboxResources.CPU, + cgroup: cgroup, + }, nil +} + +func LoadResourceController(path string) (ResourceController, error) { + cgHierarchy, cgPath, err := cgroupHierarchy(path) + if err != nil { + return nil, err + } + + cgroup, err := cgroups.Load(cgHierarchy, cgPath) + if err != nil { + return nil, err + } + + return &LinuxCgroup{ + path: path, + cgroup: cgroup, + }, nil +} + +func (c *LinuxCgroup) Logger() *logrus.Entry { + return controllerLogger.WithField("source", "cgroups") +} + +func (c *LinuxCgroup) Delete() error { + return c.cgroup.Delete() +} + +func (c *LinuxCgroup) Stat() (*v1.Metrics, error) { + return c.cgroup.Stat(cgroups.ErrorHandler(cgroups.IgnoreNotExist)) +} + +func (c *LinuxCgroup) AddProcess(pid int, subsystems ...string) error { + return c.cgroup.Add(cgroups.Process{Pid: pid}) +} + +func (c *LinuxCgroup) AddThread(pid int, subsystems ...string) error { + return c.cgroup.AddTask(cgroups.Process{Pid: pid}) +} + +func (c *LinuxCgroup) Update(resources *specs.LinuxResources) error { + return c.cgroup.Update(resources) +} + +func (c *LinuxCgroup) MoveTo(path string) error { + cgHierarchy, cgPath, err := cgroupHierarchy(path) + if err != nil { + return err + } + + newCgroup, err := cgroups.Load(cgHierarchy, cgPath) + if err != nil { + return err + } + + return c.cgroup.MoveTo(newCgroup) +} + +func (c *LinuxCgroup) AddDevice(deviceHostPath string) error { + deviceResource, err := DeviceToLinuxDevice(deviceHostPath) + if err != nil { + return err + } + + c.Lock() + defer c.Unlock() + + c.devices = append(c.devices, deviceResource) + + if err := c.cgroup.Update(&specs.LinuxResources{ + Devices: c.devices, + }); err != nil { + return err + } + + return nil +} + +// RemoveDevice removes the device at deviceHostPath from the tracked device +// list and applies the resulting list to the cgroup. +func (c *LinuxCgroup) RemoveDevice(deviceHostPath string) error { + deviceResource, err := DeviceToLinuxDevice(deviceHostPath) + if err != nil { + return err + } + + c.Lock() + defer c.Unlock() + + for i, d := range c.devices { + // Major and Minor are pointers: compare the values they point to, not the addresses. + if d.Type == deviceResource.Type &&
+ d.Major != nil && deviceResource.Major != nil && *d.Major == *deviceResource.Major && + d.Minor != nil && deviceResource.Minor != nil && *d.Minor == *deviceResource.Minor { + c.devices = append(c.devices[:i], c.devices[i+1:]...) + break // first match only: the slice was just modified under the range loop + } + } + + return c.cgroup.Update(&specs.LinuxResources{ + Devices: c.devices, + }) +} + +func (c *LinuxCgroup) UpdateCpuSet(cpuset, memset string) error { + c.Lock() + defer c.Unlock() + + if len(cpuset) > 0 { + // If we didn't have a cpuset defined, let's create: + if c.cpusets == nil { + c.cpusets = &specs.LinuxCPU{} + } + + c.cpusets.Cpus = cpuset + } + + if len(memset) > 0 { + // If we didn't have a cpuset defined, let's now create: + if c.cpusets == nil { + c.cpusets = &specs.LinuxCPU{} + } + + c.cpusets.Mems = memset + } + + return c.cgroup.Update(&specs.LinuxResources{ + CPU: c.cpusets, + }) +} + +func (c *LinuxCgroup) Type() ResourceControllerType { + return LinuxCgroups +} + +func (c *LinuxCgroup) ID() string { + return c.path +} + +func (c *LinuxCgroup) Parent() string { + return filepath.Dir(c.path) } diff --git a/src/runtime/virtcontainers/pkg/cgroups/cgroups_linux.go b/src/runtime/virtcontainers/pkg/cgroups/cgroups_linux.go deleted file mode 100644 index 6675396836..0000000000 --- a/src/runtime/virtcontainers/pkg/cgroups/cgroups_linux.go +++ /dev/null @@ -1,310 +0,0 @@ -// Copyright (c) 2021-2022 Apple Inc.
-// -// SPDX-License-Identifier: Apache-2.0 -// - -package cgroups - -import ( - "os" - "path/filepath" - "sync" - - "github.com/containerd/cgroups" - v1 "github.com/containerd/cgroups/stats/v1" - "github.com/opencontainers/runtime-spec/specs-go" - "github.com/sirupsen/logrus" -) - -type LinuxCgroup struct { - cgroup cgroups.Cgroup - path string - cpusets *specs.LinuxCPU - devices []specs.LinuxDeviceCgroup - - sync.Mutex -} - -func sandboxDevices() []specs.LinuxDeviceCgroup { - devices := []specs.LinuxDeviceCgroup{} - - defaultDevices := []string{ - "/dev/null", - "/dev/random", - "/dev/full", - "/dev/tty", - "/dev/zero", - "/dev/urandom", - "/dev/console", - } - - // Processes running in a device-cgroup are constrained, they have acccess - // only to the devices listed in the devices.list file. - // In order to run Virtual Machines and create virtqueues, hypervisors - // need access to certain character devices in the host, like kvm and vhost-net. - hypervisorDevices := []string{ - "/dev/kvm", // To run virtual machines - "/dev/vhost-net", // To create virtqueues - "/dev/vfio/vfio", // To access VFIO devices - "/dev/vhost-vsock", // To interact with vsock if - } - - defaultDevices = append(defaultDevices, hypervisorDevices...) 
- - for _, device := range defaultDevices { - ldevice, err := DeviceToLinuxDevice(device) - if err != nil { - cgroupsLogger.WithField("source", "cgroups").Warnf("Could not add %s to the devices cgroup", device) - continue - } - devices = append(devices, ldevice) - } - - wildcardMajor := int64(-1) - wildcardMinor := int64(-1) - ptsMajor := int64(136) - tunMajor := int64(10) - tunMinor := int64(200) - - wildcardDevices := []specs.LinuxDeviceCgroup{ - // allow mknod for any device - { - Allow: true, - Type: "c", - Major: &wildcardMajor, - Minor: &wildcardMinor, - Access: "m", - }, - { - Allow: true, - Type: "b", - Major: &wildcardMajor, - Minor: &wildcardMinor, - Access: "m", - }, - // /dev/pts/ - pts namespaces are "coming soon" - { - Allow: true, - Type: "c", - Major: &ptsMajor, - Minor: &wildcardMinor, - Access: "rwm", - }, - // tuntap - { - Allow: true, - Type: "c", - Major: &tunMajor, - Minor: &tunMinor, - Access: "rwm", - }, - } - - devices = append(devices, wildcardDevices...) - - return devices -} - -func NewCgroup(path string, resources *specs.LinuxResources) (Cgroup, error) { - var err error - - cgroupPath, err := ValidCgroupPath(path, IsSystemdCgroup(path)) - if err != nil { - return nil, err - } - - cgroup, err := cgroups.New(cgroups.V1, cgroups.StaticPath(cgroupPath), resources) - if err != nil { - return nil, err - } - - return &LinuxCgroup{ - path: cgroupPath, - devices: resources.Devices, - cpusets: resources.CPU, - cgroup: cgroup, - }, nil -} - -func NewSandboxCgroup(path string, resources *specs.LinuxResources, sandboxCgroupOnly bool) (Cgroup, error) { - var cgroup cgroups.Cgroup - sandboxResources := *resources - sandboxResources.Devices = append(sandboxResources.Devices, sandboxDevices()...) 
- - // Currently we know to handle systemd cgroup path only when it's the only cgroup (no overhead group), hence, - // if sandboxCgroupOnly is not true we treat it as cgroupfs path as it used to be, although it may be incorrect - if !IsSystemdCgroup(path) || !sandboxCgroupOnly { - return NewCgroup(path, &sandboxResources) - } - - slice, unit, err := getSliceAndUnit(path) - if err != nil { - return nil, err - } - // github.com/containerd/cgroups doesn't support creating a scope unit with - // v1 cgroups against systemd, the following interacts directly with systemd - // to create the cgroup and then load it using containerd's api - err = createCgroupsSystemd(slice, unit, uint32(os.Getpid())) // adding runtime process, it makes calling setupCgroups redundant - if err != nil { - return nil, err - } - - cgHierarchy, cgPath, err := cgroupHierarchy(path) - if err != nil { - return nil, err - } - - // load created cgroup and update with resources - if cgroup, err = cgroups.Load(cgHierarchy, cgPath); err == nil { - err = cgroup.Update(&sandboxResources) - } - - if err != nil { - return nil, err - } - - return &LinuxCgroup{ - path: path, - devices: sandboxResources.Devices, - cpusets: sandboxResources.CPU, - cgroup: cgroup, - }, nil -} - -func LoadCgroup(path string) (Cgroup, error) { - cgHierarchy, cgPath, err := cgroupHierarchy(path) - if err != nil { - return nil, err - } - - cgroup, err := cgroups.Load(cgHierarchy, cgPath) - if err != nil { - return nil, err - } - - return &LinuxCgroup{ - path: path, - cgroup: cgroup, - }, nil -} - -func (c *LinuxCgroup) Logger() *logrus.Entry { - return cgroupsLogger.WithField("source", "cgroups") -} - -func (c *LinuxCgroup) Delete() error { - return c.cgroup.Delete() -} - -func (c *LinuxCgroup) Stat() (*v1.Metrics, error) { - return c.cgroup.Stat(cgroups.ErrorHandler(cgroups.IgnoreNotExist)) -} - -func (c *LinuxCgroup) AddProcess(pid int, subsystems ...string) error { - return c.cgroup.Add(cgroups.Process{Pid: pid}) -} - -func (c 
*LinuxCgroup) AddTask(pid int, subsystems ...string) error { - return c.cgroup.AddTask(cgroups.Process{Pid: pid}) -} - -func (c *LinuxCgroup) Update(resources *specs.LinuxResources) error { - return c.cgroup.Update(resources) -} - -func (c *LinuxCgroup) MoveTo(path string) error { - cgHierarchy, cgPath, err := cgroupHierarchy(path) - if err != nil { - return err - } - - newCgroup, err := cgroups.Load(cgHierarchy, cgPath) - if err != nil { - return err - } - - return c.cgroup.MoveTo(newCgroup) -} - -func (c *LinuxCgroup) MoveToParent() error { - parentPath := filepath.Dir(c.path) - - return c.MoveTo(parentPath) -} - -func (c *LinuxCgroup) AddDevice(deviceHostPath string) error { - deviceResource, err := DeviceToLinuxDevice(deviceHostPath) - if err != nil { - return err - } - - c.Lock() - defer c.Unlock() - - c.devices = append(c.devices, deviceResource) - - if err := c.cgroup.Update(&specs.LinuxResources{ - Devices: c.devices, - }); err != nil { - return err - } - - return nil -} - -func (c *LinuxCgroup) RemoveDevice(deviceHostPath string) error { - deviceResource, err := DeviceToLinuxDevice(deviceHostPath) - if err != nil { - return err - } - - c.Lock() - defer c.Unlock() - - for i, d := range c.devices { - if d.Type == deviceResource.Type && - d.Major == deviceResource.Major && - d.Minor == deviceResource.Minor { - c.devices = append(c.devices[:i], c.devices[i+1:]...) 
- } - } - - if err := c.cgroup.Update(&specs.LinuxResources{ - Devices: c.devices, - }); err != nil { - return err - } - - return nil -} - -func (c *LinuxCgroup) UpdateCpuSet(cpuset, memset string) error { - c.Lock() - defer c.Unlock() - - if len(cpuset) > 0 { - // If we didn't have a cpuset defined, let's create: - if c.cpusets == nil { - c.cpusets = &specs.LinuxCPU{} - } - - c.cpusets.Cpus = cpuset - } - - if len(memset) > 0 { - // If we didn't have a cpuset defined, let's now create: - if c.cpusets == nil { - c.cpusets = &specs.LinuxCPU{} - } - - c.cpusets.Mems = memset - } - - return c.cgroup.Update(&specs.LinuxResources{ - CPU: c.cpusets, - }) -} - -func (c *LinuxCgroup) Path() string { - return c.path -} diff --git a/src/runtime/virtcontainers/pkg/cgroups/controller.go b/src/runtime/virtcontainers/pkg/cgroups/controller.go new file mode 100644 index 0000000000..fbdc4e4ab7 --- /dev/null +++ b/src/runtime/virtcontainers/pkg/cgroups/controller.go @@ -0,0 +1,82 @@ +// Copyright (c) 2021 Apple Inc. +// +// SPDX-License-Identifier: Apache-2.0 +// + +package cgroups + +import ( + v1 "github.com/containerd/cgroups/stats/v1" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" +) + +var ( + controllerLogger = logrus.WithField("source", "virtcontainers/pkg/cgroups") +) + +// SetLogger sets up a logger for this pkg +func SetLogger(logger *logrus.Entry) { + fields := controllerLogger.Data + + controllerLogger = logger.WithFields(fields) +} + +// ResourceControllerType describes a resource controller type. +type ResourceControllerType string + +const ( + LinuxCgroups ResourceControllerType = "cgroups" +) + +// String converts a resource controller type to a string. +func (rType *ResourceControllerType) String() string { + switch *rType { + case LinuxCgroups: + return string(LinuxCgroups) + default: + return "Unknown controller type" + } +} + +// ResourceController represents a system resources controller.
+// On Linux this interface is implemented through the cgroups API. +type ResourceController interface { + // Type returns the resource controller implementation type. + Type() ResourceControllerType + + // ID returns the controller identifier, e.g. a Linux cgroups path. + ID() string + + // Parent returns the identifier of the parent controller, for + // hierarchically defined resources (e.g. Linux cgroups). + Parent() string + + // Delete deletes the controller. + Delete() error + + // Stat returns the statistics for the controller. + Stat() (*v1.Metrics, error) + + // AddProcess adds a process to a set of controllers. + AddProcess(int, ...string) error + + // AddThread adds a process thread to a set of controllers. + AddThread(int, ...string) error + + // Update updates the set of resources controlled, based on + // an OCI resources description. + Update(*specs.LinuxResources) error + + // MoveTo moves a controller to another one. + MoveTo(string) error + + // AddDevice adds a device resource to the controller. + AddDevice(string) error + + // RemoveDevice removes a device resource from the controller. + RemoveDevice(string) error + + // UpdateCpuSet updates the set of controlled CPUs and memory nodes.
+ UpdateCpuSet(string, string) error +} diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 37a36346d2..6b4ff14ec9 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -199,8 +199,8 @@ type Sandbox struct { config *SandboxConfig annotationsLock *sync.RWMutex wg *sync.WaitGroup - sandboxCgroup cgroups.Cgroup - overheadCgroup cgroups.Cgroup + sandboxCgroup cgroups.ResourceController + overheadCgroup cgroups.ResourceController cw *consoleWatcher containers map[string]*Container @@ -672,13 +672,13 @@ func (s *Sandbox) createCgroups() error { // Depending on the SandboxCgroupOnly value, this cgroup // will either hold all the pod threads (SandboxCgroupOnly is true) // or only the virtual CPU ones (SandboxCgroupOnly is false). - s.sandboxCgroup, err = cgroups.NewSandboxCgroup(cgroupPath, &resources, s.config.SandboxCgroupOnly) + s.sandboxCgroup, err = cgroups.NewSandboxResourceController(cgroupPath, &resources, s.config.SandboxCgroupOnly) if err != nil { return fmt.Errorf("Could not create the sandbox cgroup %v", err) } // Now that the sandbox cgroup is created, we can set the state cgroup root paths. - s.state.SandboxCgroupPath = s.sandboxCgroup.Path() + s.state.SandboxCgroupPath = s.sandboxCgroup.ID() s.state.OverheadCgroupPath = "" if s.config.SandboxCgroupOnly { @@ -688,14 +688,14 @@ func (s *Sandbox) createCgroups() error { // into the sandbox cgroup. // We're creating an overhead cgroup, with no constraints. Everything but // the vCPU threads will eventually make it there. 
- overheadCgroup, err := cgroups.NewCgroup(fmt.Sprintf("/%s/%s", cgroupKataOverheadPath, s.id), &specs.LinuxResources{}) + overheadCgroup, err := cgroups.NewResourceController(fmt.Sprintf("/%s/%s", cgroupKataOverheadPath, s.id), &specs.LinuxResources{}) // TODO: support systemd cgroups overhead cgroup // https://github.com/kata-containers/kata-containers/issues/2963 if err != nil { return err } s.overheadCgroup = overheadCgroup - s.state.OverheadCgroupPath = s.overheadCgroup.Path() + s.state.OverheadCgroupPath = s.overheadCgroup.ID() } return nil @@ -2131,12 +2131,13 @@ func (s *Sandbox) cgroupsDelete() error { return nil } - sandboxCgroup, err := cgroups.LoadCgroup(s.state.SandboxCgroupPath) + sandboxCgroup, err := cgroups.LoadResourceController(s.state.SandboxCgroupPath) if err != nil { return err } - if err := sandboxCgroup.MoveToParent(); err != nil { + resCtrlParent := sandboxCgroup.Parent() + if err := sandboxCgroup.MoveTo(resCtrlParent); err != nil { return err } @@ -2145,12 +2146,13 @@ func (s *Sandbox) cgroupsDelete() error { } if s.state.OverheadCgroupPath != "" { - overheadCgroup, err := cgroups.LoadCgroup(s.state.OverheadCgroupPath) + overheadCgroup, err := cgroups.LoadResourceController(s.state.OverheadCgroupPath) if err != nil { return err } - if err := s.overheadCgroup.MoveToParent(); err != nil { + resCtrlParent := overheadCgroup.Parent() + if err := s.overheadCgroup.MoveTo(resCtrlParent); err != nil { return err } @@ -2171,7 +2173,7 @@ func (s *Sandbox) constrainHypervisor(ctx context.Context) error { // All vCPU threads move to the sandbox cgroup. for _, i := range tids.vcpus { - if err := s.sandboxCgroup.AddTask(i); err != nil { + if err := s.sandboxCgroup.AddThread(i); err != nil { return err } }