diff --git a/src/runtime/virtcontainers/pkg/cgroups/cgroups.go b/src/runtime/virtcontainers/pkg/cgroups/cgroups.go index ea912db3d5..bc12cb2b26 100644 --- a/src/runtime/virtcontainers/pkg/cgroups/cgroups.go +++ b/src/runtime/virtcontainers/pkg/cgroups/cgroups.go @@ -6,24 +6,22 @@ package cgroups import ( - "os" + "fmt" "path/filepath" - "sync" - "github.com/containerd/cgroups" v1 "github.com/containerd/cgroups/stats/v1" "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" ) -type Cgroup struct { - cgroup cgroups.Cgroup - path string - cpusets *specs.LinuxCPU - devices []specs.LinuxDeviceCgroup +// prepend a kata specific string to oci cgroup path to +// form a different cgroup path, thus cAdvisor couldn't +// find kata containers cgroup path on host to prevent it +// from grabbing the stats data. +const CgroupKataPrefix = "kata" - sync.Mutex -} +// DefaultCgroupPath runtime-determined location in the cgroups hierarchy. +const DefaultCgroupPath = "/vc" var ( cgroupsLogger = logrus.WithField("source", "virtcontainers/pkg/cgroups") @@ -36,286 +34,27 @@ func SetLogger(logger *logrus.Entry) { cgroupsLogger = logger.WithFields(fields) } -func sandboxDevices() []specs.LinuxDeviceCgroup { - devices := []specs.LinuxDeviceCgroup{} - - defaultDevices := []string{ - "/dev/null", - "/dev/random", - "/dev/full", - "/dev/tty", - "/dev/zero", - "/dev/urandom", - "/dev/console", +func RenameCgroupPath(path string) (string, error) { + if path == "" { + path = DefaultCgroupPath } - // Processes running in a device-cgroup are constrained, they have acccess - // only to the devices listed in the devices.list file. - // In order to run Virtual Machines and create virtqueues, hypervisors - // need access to certain character devices in the host, like kvm and vhost-net. 
- hypervisorDevices := []string{ - "/dev/kvm", // To run virtual machines - "/dev/vhost-net", // To create virtqueues - "/dev/vfio/vfio", // To access VFIO devices - "/dev/vhost-vsock", // To interact with vsock if - } + cgroupPathDir := filepath.Dir(path) + cgroupPathName := fmt.Sprintf("%s_%s", CgroupKataPrefix, filepath.Base(path)) + return filepath.Join(cgroupPathDir, cgroupPathName), nil - defaultDevices = append(defaultDevices, hypervisorDevices...) - - for _, device := range defaultDevices { - ldevice, err := DeviceToLinuxDevice(device) - if err != nil { - cgroupsLogger.WithField("source", "cgroups").Warnf("Could not add %s to the devices cgroup", device) - continue - } - devices = append(devices, ldevice) - } - - wildcardMajor := int64(-1) - wildcardMinor := int64(-1) - ptsMajor := int64(136) - tunMajor := int64(10) - tunMinor := int64(200) - - wildcardDevices := []specs.LinuxDeviceCgroup{ - // allow mknod for any device - { - Allow: true, - Type: "c", - Major: &wildcardMajor, - Minor: &wildcardMinor, - Access: "m", - }, - { - Allow: true, - Type: "b", - Major: &wildcardMajor, - Minor: &wildcardMinor, - Access: "m", - }, - // /dev/pts/ - pts namespaces are "coming soon" - { - Allow: true, - Type: "c", - Major: &ptsMajor, - Minor: &wildcardMinor, - Access: "rwm", - }, - // tuntap - { - Allow: true, - Type: "c", - Major: &tunMajor, - Minor: &tunMinor, - Access: "rwm", - }, - } - - devices = append(devices, wildcardDevices...) 
- - return devices } -func NewCgroup(path string, resources *specs.LinuxResources) (*Cgroup, error) { - var err error - - cgroupPath, err := ValidCgroupPath(path, IsSystemdCgroup(path)) - if err != nil { - return nil, err - } - - cgroup, err := cgroups.New(cgroups.V1, cgroups.StaticPath(cgroupPath), resources) - if err != nil { - return nil, err - } - - return &Cgroup{ - path: cgroupPath, - devices: resources.Devices, - cpusets: resources.CPU, - cgroup: cgroup, - }, nil -} - -func NewSandboxCgroup(path string, resources *specs.LinuxResources, sandboxCgroupOnly bool) (*Cgroup, error) { - var cgroup cgroups.Cgroup - sandboxResources := *resources - sandboxResources.Devices = append(sandboxResources.Devices, sandboxDevices()...) - - // Currently we know to handle systemd cgroup path only when it's the only cgroup (no overhead group), hence, - // if sandboxCgroupOnly is not true we treat it as cgroupfs path as it used to be, although it may be incorrect - if !IsSystemdCgroup(path) || !sandboxCgroupOnly { - return NewCgroup(path, &sandboxResources) - } - - slice, unit, err := getSliceAndUnit(path) - if err != nil { - return nil, err - } - // github.com/containerd/cgroups doesn't support creating a scope unit with - // v1 cgroups against systemd, the following interacts directly with systemd - // to create the cgroup and then load it using containerd's api - err = createCgroupsSystemd(slice, unit, uint32(os.Getpid())) // adding runtime process, it makes calling setupCgroups redundant - if err != nil { - return nil, err - } - - cgHierarchy, cgPath, err := cgroupHierarchy(path) - if err != nil { - return nil, err - } - - // load created cgroup and update with resources - if cgroup, err = cgroups.Load(cgHierarchy, cgPath); err == nil { - err = cgroup.Update(&sandboxResources) - } - - if err != nil { - return nil, err - } - - return &Cgroup{ - path: path, - devices: sandboxResources.Devices, - cpusets: sandboxResources.CPU, - cgroup: cgroup, - }, nil -} - -func Load(path 
string) (*Cgroup, error) { - cgHierarchy, cgPath, err := cgroupHierarchy(path) - if err != nil { - return nil, err - } - - cgroup, err := cgroups.Load(cgHierarchy, cgPath) - if err != nil { - return nil, err - } - - return &Cgroup{ - path: path, - cgroup: cgroup, - }, nil -} - -func (c *Cgroup) Logger() *logrus.Entry { - return cgroupsLogger.WithField("source", "cgroups") -} - -func (c *Cgroup) Delete() error { - return c.cgroup.Delete() -} - -func (c *Cgroup) Stat() (*v1.Metrics, error) { - return c.cgroup.Stat(cgroups.ErrorHandler(cgroups.IgnoreNotExist)) -} - -func (c *Cgroup) AddProcess(pid int, subsystems ...string) error { - return c.cgroup.Add(cgroups.Process{Pid: pid}) -} - -func (c *Cgroup) AddTask(pid int, subsystems ...string) error { - return c.cgroup.AddTask(cgroups.Process{Pid: pid}) -} - -func (c *Cgroup) Update(resources *specs.LinuxResources) error { - return c.cgroup.Update(resources) -} - -func (c *Cgroup) MoveTo(path string) error { - cgHierarchy, cgPath, err := cgroupHierarchy(path) - if err != nil { - return err - } - - newCgroup, err := cgroups.Load(cgHierarchy, cgPath) - if err != nil { - return err - } - - return c.cgroup.MoveTo(newCgroup) -} - -func (c *Cgroup) MoveToParent() error { - parentPath := filepath.Dir(c.path) - - return c.MoveTo(parentPath) -} - -func (c *Cgroup) AddDevice(deviceHostPath string) error { - deviceResource, err := DeviceToLinuxDevice(deviceHostPath) - if err != nil { - return err - } - - c.Lock() - defer c.Unlock() - - c.devices = append(c.devices, deviceResource) - - if err := c.cgroup.Update(&specs.LinuxResources{ - Devices: c.devices, - }); err != nil { - return err - } - - return nil -} - -func (c *Cgroup) RemoveDevice(deviceHostPath string) error { - deviceResource, err := DeviceToLinuxDevice(deviceHostPath) - if err != nil { - return err - } - - c.Lock() - defer c.Unlock() - - for i, d := range c.devices { - if d.Type == deviceResource.Type && - d.Major == deviceResource.Major && - d.Minor == 
deviceResource.Minor { - c.devices = append(c.devices[:i], c.devices[i+1:]...) - } - } - - if err := c.cgroup.Update(&specs.LinuxResources{ - Devices: c.devices, - }); err != nil { - return err - } - - return nil -} - -func (c *Cgroup) UpdateCpuSet(cpuset, memset string) error { - c.Lock() - defer c.Unlock() - - if len(cpuset) > 0 { - // If we didn't have a cpuset defined, let's create: - if c.cpusets == nil { - c.cpusets = &specs.LinuxCPU{} - } - - c.cpusets.Cpus = cpuset - } - - if len(memset) > 0 { - // If we didn't have a cpuset defined, let's now create: - if c.cpusets == nil { - c.cpusets = &specs.LinuxCPU{} - } - - c.cpusets.Mems = memset - } - - return c.cgroup.Update(&specs.LinuxResources{ - CPU: c.cpusets, - }) -} - -func (c *Cgroup) Path() string { - return c.path +type Cgroup interface { + Delete() error + Stat() (*v1.Metrics, error) + AddProcess(int, ...string) error + AddTask(int, ...string) error + Update(*specs.LinuxResources) error + MoveTo(string) error + MoveToParent() error + AddDevice(string) error + RemoveDevice(string) error + UpdateCpuSet(string, string) error + Path() string } diff --git a/src/runtime/virtcontainers/pkg/cgroups/cgroups_linux.go b/src/runtime/virtcontainers/pkg/cgroups/cgroups_linux.go new file mode 100644 index 0000000000..6675396836 --- /dev/null +++ b/src/runtime/virtcontainers/pkg/cgroups/cgroups_linux.go @@ -0,0 +1,310 @@ +// Copyright (c) 2021-2022 Apple Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +// + +package cgroups + +import ( + "os" + "path/filepath" + "sync" + + "github.com/containerd/cgroups" + v1 "github.com/containerd/cgroups/stats/v1" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" +) + +type LinuxCgroup struct { + cgroup cgroups.Cgroup + path string + cpusets *specs.LinuxCPU + devices []specs.LinuxDeviceCgroup + + sync.Mutex +} + +func sandboxDevices() []specs.LinuxDeviceCgroup { + devices := []specs.LinuxDeviceCgroup{} + + defaultDevices := []string{ + "/dev/null", + "/dev/random", + "/dev/full", + "/dev/tty", + "/dev/zero", + "/dev/urandom", + "/dev/console", + } + + // Processes running in a device-cgroup are constrained, they have access + // only to the devices listed in the devices.list file. + // In order to run Virtual Machines and create virtqueues, hypervisors + // need access to certain character devices in the host, like kvm and vhost-net. + hypervisorDevices := []string{ + "/dev/kvm", // To run virtual machines + "/dev/vhost-net", // To create virtqueues + "/dev/vfio/vfio", // To access VFIO devices + "/dev/vhost-vsock", // To interact with vsock + } + + defaultDevices = append(defaultDevices, hypervisorDevices...) 
+ + for _, device := range defaultDevices { + ldevice, err := DeviceToLinuxDevice(device) + if err != nil { + cgroupsLogger.WithField("source", "cgroups").Warnf("Could not add %s to the devices cgroup", device) + continue + } + devices = append(devices, ldevice) + } + + wildcardMajor := int64(-1) + wildcardMinor := int64(-1) + ptsMajor := int64(136) + tunMajor := int64(10) + tunMinor := int64(200) + + wildcardDevices := []specs.LinuxDeviceCgroup{ + // allow mknod for any device + { + Allow: true, + Type: "c", + Major: &wildcardMajor, + Minor: &wildcardMinor, + Access: "m", + }, + { + Allow: true, + Type: "b", + Major: &wildcardMajor, + Minor: &wildcardMinor, + Access: "m", + }, + // /dev/pts/ - pts namespaces are "coming soon" + { + Allow: true, + Type: "c", + Major: &ptsMajor, + Minor: &wildcardMinor, + Access: "rwm", + }, + // tuntap + { + Allow: true, + Type: "c", + Major: &tunMajor, + Minor: &tunMinor, + Access: "rwm", + }, + } + + devices = append(devices, wildcardDevices...) + + return devices +} + +func NewCgroup(path string, resources *specs.LinuxResources) (Cgroup, error) { + var err error + + cgroupPath, err := ValidCgroupPath(path, IsSystemdCgroup(path)) + if err != nil { + return nil, err + } + + cgroup, err := cgroups.New(cgroups.V1, cgroups.StaticPath(cgroupPath), resources) + if err != nil { + return nil, err + } + + return &LinuxCgroup{ + path: cgroupPath, + devices: resources.Devices, + cpusets: resources.CPU, + cgroup: cgroup, + }, nil +} + +func NewSandboxCgroup(path string, resources *specs.LinuxResources, sandboxCgroupOnly bool) (Cgroup, error) { + var cgroup cgroups.Cgroup + sandboxResources := *resources + sandboxResources.Devices = append(sandboxResources.Devices, sandboxDevices()...) 
+ + // Currently we know to handle systemd cgroup path only when it's the only cgroup (no overhead group), hence, + // if sandboxCgroupOnly is not true we treat it as cgroupfs path as it used to be, although it may be incorrect + if !IsSystemdCgroup(path) || !sandboxCgroupOnly { + return NewCgroup(path, &sandboxResources) + } + + slice, unit, err := getSliceAndUnit(path) + if err != nil { + return nil, err + } + // github.com/containerd/cgroups doesn't support creating a scope unit with + // v1 cgroups against systemd, the following interacts directly with systemd + // to create the cgroup and then load it using containerd's api + err = createCgroupsSystemd(slice, unit, uint32(os.Getpid())) // adding runtime process, it makes calling setupCgroups redundant + if err != nil { + return nil, err + } + + cgHierarchy, cgPath, err := cgroupHierarchy(path) + if err != nil { + return nil, err + } + + // load created cgroup and update with resources + if cgroup, err = cgroups.Load(cgHierarchy, cgPath); err == nil { + err = cgroup.Update(&sandboxResources) + } + + if err != nil { + return nil, err + } + + return &LinuxCgroup{ + path: path, + devices: sandboxResources.Devices, + cpusets: sandboxResources.CPU, + cgroup: cgroup, + }, nil +} + +func LoadCgroup(path string) (Cgroup, error) { + cgHierarchy, cgPath, err := cgroupHierarchy(path) + if err != nil { + return nil, err + } + + cgroup, err := cgroups.Load(cgHierarchy, cgPath) + if err != nil { + return nil, err + } + + return &LinuxCgroup{ + path: path, + cgroup: cgroup, + }, nil +} + +func (c *LinuxCgroup) Logger() *logrus.Entry { + return cgroupsLogger.WithField("source", "cgroups") +} + +func (c *LinuxCgroup) Delete() error { + return c.cgroup.Delete() +} + +func (c *LinuxCgroup) Stat() (*v1.Metrics, error) { + return c.cgroup.Stat(cgroups.ErrorHandler(cgroups.IgnoreNotExist)) +} + +func (c *LinuxCgroup) AddProcess(pid int, subsystems ...string) error { + return c.cgroup.Add(cgroups.Process{Pid: pid}) +} + +func (c 
*LinuxCgroup) AddTask(pid int, subsystems ...string) error { + return c.cgroup.AddTask(cgroups.Process{Pid: pid}) +} + +func (c *LinuxCgroup) Update(resources *specs.LinuxResources) error { + return c.cgroup.Update(resources) +} + +func (c *LinuxCgroup) MoveTo(path string) error { + cgHierarchy, cgPath, err := cgroupHierarchy(path) + if err != nil { + return err + } + + newCgroup, err := cgroups.Load(cgHierarchy, cgPath) + if err != nil { + return err + } + + return c.cgroup.MoveTo(newCgroup) +} + +func (c *LinuxCgroup) MoveToParent() error { + parentPath := filepath.Dir(c.path) + + return c.MoveTo(parentPath) +} + +func (c *LinuxCgroup) AddDevice(deviceHostPath string) error { + deviceResource, err := DeviceToLinuxDevice(deviceHostPath) + if err != nil { + return err + } + + c.Lock() + defer c.Unlock() + + c.devices = append(c.devices, deviceResource) + + if err := c.cgroup.Update(&specs.LinuxResources{ + Devices: c.devices, + }); err != nil { + return err + } + + return nil +} + +func (c *LinuxCgroup) RemoveDevice(deviceHostPath string) error { + deviceResource, err := DeviceToLinuxDevice(deviceHostPath) + if err != nil { + return err + } + + c.Lock() + defer c.Unlock() + + for i, d := range c.devices { + // Major/Minor are *int64: compare the pointed-to values, and stop after the first removal because c.devices is modified while being ranged over. + if d.Type == deviceResource.Type && d.Major != nil && deviceResource.Major != nil && *d.Major == *deviceResource.Major && d.Minor != nil && deviceResource.Minor != nil && *d.Minor == *deviceResource.Minor { + c.devices = append(c.devices[:i], c.devices[i+1:]...) 
+ break + } + } + + if err := c.cgroup.Update(&specs.LinuxResources{ + Devices: c.devices, + }); err != nil { + return err + } + + return nil +} + +func (c *LinuxCgroup) UpdateCpuSet(cpuset, memset string) error { + c.Lock() + defer c.Unlock() + + if len(cpuset) > 0 { + // If we didn't have a cpuset defined, let's create: + if c.cpusets == nil { + c.cpusets = &specs.LinuxCPU{} + } + + c.cpusets.Cpus = cpuset + } + + if len(memset) > 0 { + // If we didn't have a cpuset defined, let's now create: + if c.cpusets == nil { + c.cpusets = &specs.LinuxCPU{} + } + + c.cpusets.Mems = memset + } + + return c.cgroup.Update(&specs.LinuxResources{ + CPU: c.cpusets, + }) +} + +func (c *LinuxCgroup) Path() string { + return c.path +} diff --git a/src/runtime/virtcontainers/pkg/cgroups/utils.go b/src/runtime/virtcontainers/pkg/cgroups/utils_linux.go similarity index 88% rename from src/runtime/virtcontainers/pkg/cgroups/utils.go rename to src/runtime/virtcontainers/pkg/cgroups/utils_linux.go index fd70b880e6..afc095bbac 100644 --- a/src/runtime/virtcontainers/pkg/cgroups/utils.go +++ b/src/runtime/virtcontainers/pkg/cgroups/utils_linux.go @@ -20,26 +20,6 @@ import ( "golang.org/x/sys/unix" ) -// prepend a kata specific string to oci cgroup path to -// form a different cgroup path, thus cAdvisor couldn't -// find kata containers cgroup path on host to prevent it -// from grabbing the stats data. -const CgroupKataPrefix = "kata" - -// DefaultCgroupPath runtime-determined location in the cgroups hierarchy. -const DefaultCgroupPath = "/vc" - -func RenameCgroupPath(path string) (string, error) { - if path == "" { - path = DefaultCgroupPath - } - - cgroupPathDir := filepath.Dir(path) - cgroupPathName := fmt.Sprintf("%s_%s", CgroupKataPrefix, filepath.Base(path)) - return filepath.Join(cgroupPathDir, cgroupPathName), nil - -} - // validCgroupPath returns a valid cgroup path. 
// see https://github.com/opencontainers/runtime-spec/blob/master/config-linux.md#cgroups-path func ValidCgroupPath(path string, systemdCgroup bool) (string, error) { diff --git a/src/runtime/virtcontainers/pkg/cgroups/utils_test.go b/src/runtime/virtcontainers/pkg/cgroups/utils_linux_test.go similarity index 100% rename from src/runtime/virtcontainers/pkg/cgroups/utils_test.go rename to src/runtime/virtcontainers/pkg/cgroups/utils_linux_test.go diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 623ccb8dc2..37a36346d2 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -199,8 +199,8 @@ type Sandbox struct { config *SandboxConfig annotationsLock *sync.RWMutex wg *sync.WaitGroup - sandboxCgroup *cgroups.Cgroup - overheadCgroup *cgroups.Cgroup + sandboxCgroup cgroups.Cgroup + overheadCgroup cgroups.Cgroup cw *consoleWatcher containers map[string]*Container @@ -1752,9 +1752,11 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy span, ctx := katatrace.Trace(ctx, s.Logger(), "HotplugAddDevice", sandboxTracingTags, map[string]string{"sandbox_id": s.id}) defer span.End() - if err := s.sandboxCgroup.AddDevice(device.GetHostPath()); err != nil { - s.Logger().WithError(err).WithField("device", device). - Warn("Could not add device to cgroup") + if s.sandboxCgroup != nil { + if err := s.sandboxCgroup.AddDevice(device.GetHostPath()); err != nil { + s.Logger().WithError(err).WithField("device", device). 
+ Warn("Could not add device to cgroup") + } } switch devType { @@ -1801,10 +1803,12 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy // HotplugRemoveDevice is used for removing a device from sandbox // Sandbox implement DeviceReceiver interface from device/api/interface.go func (s *Sandbox) HotplugRemoveDevice(ctx context.Context, device api.Device, devType config.DeviceType) error { - defer func() { - if err := s.sandboxCgroup.RemoveDevice(device.GetHostPath()); err != nil { - s.Logger().WithError(err).WithField("device", device). - Warn("Could not add device to cgroup") + defer func() { + if s.sandboxCgroup != nil { + if err := s.sandboxCgroup.RemoveDevice(device.GetHostPath()); err != nil { + s.Logger().WithError(err).WithField("device", device). + Warn("Could not remove device from cgroup") + } } }() @@ -2127,7 +2131,7 @@ func (s *Sandbox) cgroupsDelete() error { return nil } - sandboxCgroup, err := cgroups.Load(s.state.SandboxCgroupPath) + sandboxCgroup, err := cgroups.LoadCgroup(s.state.SandboxCgroupPath) if err != nil { return err } @@ -2141,7 +2145,7 @@ func (s *Sandbox) cgroupsDelete() error { } if s.state.OverheadCgroupPath != "" { - overheadCgroup, err := cgroups.Load(s.state.OverheadCgroupPath) + overheadCgroup, err := cgroups.LoadCgroup(s.state.OverheadCgroupPath) if err != nil { return err }