diff --git a/virtcontainers/pkg/cgroups/manager.go b/virtcontainers/pkg/cgroups/manager.go new file mode 100644 index 0000000000..ca029c21c9 --- /dev/null +++ b/virtcontainers/pkg/cgroups/manager.go @@ -0,0 +1,321 @@ +// Copyright (c) 2020 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +package cgroups + +import ( + "bufio" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + + "github.com/kata-containers/runtime/virtcontainers/pkg/rootless" + libcontcgroups "github.com/opencontainers/runc/libcontainer/cgroups" + libcontcgroupsfs "github.com/opencontainers/runc/libcontainer/cgroups/fs" + libcontcgroupssystemd "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/specconv" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +type Config struct { + // Cgroups specifies specific cgroup settings for the various subsystems that the container is + // placed into to limit the resources the container has available + // If nil, New() will create one. + Cgroups *configs.Cgroup + + // CgroupPaths contains paths to all the cgroups setup for a container. Key is cgroup subsystem name + // with the value as the path. 
+ CgroupPaths map[string]string + + // Resources represents the runtime resource constraints + Resources specs.LinuxResources + + // CgroupPath is the OCI spec cgroup path + CgroupPath string +} + +type Manager struct { + sync.Mutex + mgr libcontcgroups.Manager +} + +const ( + // file in the cgroup that contains the pids + cgroupProcs = "cgroup.procs" +) + +var ( + // If set to true, expects cgroupsPath to be of form "slice:prefix:name", otherwise cgroups creation will fail + systemdCgroup *bool + + cgroupsLogger = logrus.WithField("source", "virtcontainers/pkg/cgroups") +) + +func EnableSystemdCgroup() { + systemd := true + systemdCgroup = &systemd +} + +func UseSystemdCgroup() bool { + if systemdCgroup != nil { + return *systemdCgroup + } + return false +} + +// returns the list of devices that a hypervisor may need +func hypervisorDevices() []specs.LinuxDeviceCgroup { + wildcard := int64(-1) + devicemapperMajor := int64(253) + + devices := []specs.LinuxDeviceCgroup{} + + devices = append(devices, + // hypervisor needs access to all devicemapper devices, + // since they can be hotplugged in the VM. + specs.LinuxDeviceCgroup{ + Allow: true, + Type: "b", + Major: &devicemapperMajor, + Minor: &wildcard, + Access: "rwm", + }) + + // Processes running in a device-cgroup are constrained, they have acccess + // only to the devices listed in the devices.list file. + // In order to run Virtual Machines and create virtqueues, hypervisors + // need access to certain character devices in the host, like kvm and vhost-net. 
+ hypervisorDevices := []string{ + "/dev/kvm", // To run virtual machines + "/dev/vhost-net", // To create virtqueues + } + + for _, device := range hypervisorDevices { + var st unix.Stat_t + linuxDevice := specs.LinuxDeviceCgroup{ + Allow: true, + Access: "rwm", + } + + if err := unix.Stat(device, &st); err != nil { + cgroupsLogger.WithError(err).WithField("device", device).Warn("Could not get device information") + continue + } + + switch st.Mode & unix.S_IFMT { + case unix.S_IFCHR: + linuxDevice.Type = "c" + case unix.S_IFBLK: + linuxDevice.Type = "b" + } + + major := int64(unix.Major(st.Rdev)) + minor := int64(unix.Minor(st.Rdev)) + linuxDevice.Major = &major + linuxDevice.Minor = &minor + + devices = append(devices, linuxDevice) + } + + return devices +} + +// New creates a new CgroupManager +func New(config *Config) (*Manager, error) { + var err error + useSystemdCgroup := UseSystemdCgroup() + + devices := make([]specs.LinuxDeviceCgroup, len(config.Resources.Devices)) + copy(devices, config.Resources.Devices) + devices = append(devices, hypervisorDevices()...) 
+ // Do not modify original devices + config.Resources.Devices = devices + + newSpec := specs.Spec{ + Linux: &specs.Linux{ + Resources: &config.Resources, + }, + } + + rootless := rootless.IsRootless() + + cgroups := config.Cgroups + cgroupPaths := config.CgroupPaths + + // Create a new cgroup if the current one is nil + // this cgroups must be saved later + if cgroups == nil { + if config.CgroupPath == "" && !rootless { + cgroupsLogger.Warn("cgroups have not been created and cgroup path is empty") + } + + newSpec.Linux.CgroupsPath, err = ValidCgroupPath(config.CgroupPath, useSystemdCgroup) + if err != nil { + return nil, fmt.Errorf("Invalid cgroup path: %v", err) + } + + if cgroups, err = specconv.CreateCgroupConfig(&specconv.CreateOpts{ + // cgroup name is taken from spec + CgroupName: "", + UseSystemdCgroup: useSystemdCgroup, + Spec: &newSpec, + RootlessCgroups: rootless, + }); err != nil { + return nil, fmt.Errorf("Could not create cgroup config: %v", err) + } + } + + // Set cgroupPaths to nil when the map is empty, it can and will be + // populated by `Manager.Apply()` when the runtime or any other process + // is moved to the cgroup. 
+ if len(cgroupPaths) == 0 { + cgroupPaths = nil + } + + if useSystemdCgroup { + systemdCgroupFunc, err := libcontcgroupssystemd.NewSystemdCgroupsManager() + if err != nil { + return nil, fmt.Errorf("Could not create systemd cgroup manager: %v", err) + } + libcontcgroupssystemd.UseSystemd() + return &Manager{ + mgr: systemdCgroupFunc(cgroups, cgroupPaths), + }, nil + } + + return &Manager{ + mgr: &libcontcgroupsfs.Manager{ + Cgroups: cgroups, + Rootless: rootless, + Paths: cgroupPaths, + }, + }, nil +} + +// read all the pids in cgroupPath +func readPids(cgroupPath string) ([]int, error) { + pids := []int{} + f, err := os.Open(filepath.Join(cgroupPath, cgroupProcs)) + if err != nil { + return nil, err + } + defer f.Close() + buf := bufio.NewScanner(f) + + for buf.Scan() { + if t := buf.Text(); t != "" { + pid, err := strconv.Atoi(t) + if err != nil { + return nil, err + } + pids = append(pids, pid) + } + } + return pids, nil +} + +// write the pids into cgroup.procs +func writePids(pids []int, cgroupPath string) error { + cgroupProcsPath := filepath.Join(cgroupPath, cgroupProcs) + for _, pid := range pids { + if err := ioutil.WriteFile(cgroupProcsPath, + []byte(strconv.Itoa(pid)), + os.FileMode(0), + ); err != nil { + return err + } + } + return nil +} + +func (m *Manager) logger() *logrus.Entry { + return cgroupsLogger.WithField("source", "cgroup-manager") +} + +// move all the processes in the current cgroup to the parent +func (m *Manager) moveToParent() error { + m.Lock() + defer m.Unlock() + for _, cgroupPath := range m.mgr.GetPaths() { + pids, err := readPids(cgroupPath) + if err != nil { + return err + } + + if len(pids) == 0 { + // no pids in this cgroup + continue + } + + cgroupParentPath := filepath.Dir(filepath.Clean(cgroupPath)) + if err = writePids(pids, cgroupParentPath); err != nil { + if !strings.Contains(err.Error(), "no such process") { + return err + } + } + } + return nil +} + +// Add pid to cgroups +func (m *Manager) Add(pid int) error { + if 
rootless.IsRootless() { + m.logger().Debug("Unable to setup add pids to cgroup: running rootless") + return nil + } + + m.Lock() + defer m.Unlock() + return m.mgr.Apply(pid) +} + +// Apply constraints +func (m *Manager) Apply() error { + if rootless.IsRootless() { + m.logger().Debug("Unable to apply constraints: running rootless") + return nil + } + + cgroups, err := m.GetCgroups() + if err != nil { + return err + } + + m.Lock() + defer m.Unlock() + return m.mgr.Set(&configs.Config{ + Cgroups: cgroups, + }) +} + +func (m *Manager) GetCgroups() (*configs.Cgroup, error) { + m.Lock() + defer m.Unlock() + return m.mgr.GetCgroups() +} + +func (m *Manager) GetPaths() map[string]string { + m.Lock() + defer m.Unlock() + return m.mgr.GetPaths() +} + +func (m *Manager) Destroy() error { + // cgroup can't be destroyed if it contains running processes + if err := m.moveToParent(); err != nil { + return fmt.Errorf("Could not move processes into parent cgroup: %v", err) + } + + m.Lock() + defer m.Unlock() + return m.mgr.Destroy() +} diff --git a/virtcontainers/pkg/cgroups/manager_test.go b/virtcontainers/pkg/cgroups/manager_test.go new file mode 100644 index 0000000000..0bf1f0c800 --- /dev/null +++ b/virtcontainers/pkg/cgroups/manager_test.go @@ -0,0 +1,55 @@ +// Copyright (c) 2020 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +package cgroups + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestEnableSystemdCgroup(t *testing.T) { + assert := assert.New(t) + + orgSystemdCgroup := systemdCgroup + defer func() { + systemdCgroup = orgSystemdCgroup + }() + + useSystemdCgroup := UseSystemdCgroup() + if systemdCgroup != nil { + assert.Equal(*systemdCgroup, useSystemdCgroup) + } else { + assert.False(useSystemdCgroup) + } + + EnableSystemdCgroup() + assert.True(UseSystemdCgroup()) +} + +func TestNew(t *testing.T) { + assert := assert.New(t) + useSystemdCgroup := false + orgSystemdCgroup := systemdCgroup + defer func() { + systemdCgroup = 
orgSystemdCgroup + }() + systemdCgroup = &useSystemdCgroup + + c := &Config{ + Cgroups: nil, + CgroupPath: "", + } + + mgr, err := New(c) + assert.NoError(err) + assert.NotNil(mgr.mgr) + + useSystemdCgroup = true + mgr, err = New(c) + assert.Error(err) + assert.Nil(mgr) +} diff --git a/virtcontainers/sandbox.go b/virtcontainers/sandbox.go index 2b1a605551..661861abe0 100644 --- a/virtcontainers/sandbox.go +++ b/virtcontainers/sandbox.go @@ -34,6 +34,7 @@ import ( "github.com/kata-containers/runtime/virtcontainers/persist" persistapi "github.com/kata-containers/runtime/virtcontainers/persist/api" "github.com/kata-containers/runtime/virtcontainers/pkg/annotations" + vccgroups "github.com/kata-containers/runtime/virtcontainers/pkg/cgroups" "github.com/kata-containers/runtime/virtcontainers/pkg/compatoci" "github.com/kata-containers/runtime/virtcontainers/pkg/rootless" vcTypes "github.com/kata-containers/runtime/virtcontainers/pkg/types" @@ -214,6 +215,8 @@ type Sandbox struct { seccompSupported bool disableVMShutdown bool + cgroupMgr *vccgroups.Manager + ctx context.Context } @@ -597,6 +600,10 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor } } + if err := s.createCgroupManager(); err != nil { + return nil, err + } + agentConfig, err := newAgentConfig(sandboxConfig.AgentType, sandboxConfig.AgentConfig) if err != nil { return nil, err @@ -609,6 +616,46 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor return s, nil } +func (s *Sandbox) createCgroupManager() error { + var err error + cgroupPath := "" + + // Do not change current cgroup configuration. 
+ // Create a spec without constraints + resources := specs.LinuxResources{} + + if s.config == nil { + return fmt.Errorf("Could not create cgroup manager: empty sandbox configuration") + } + + spec := s.GetPatchedOCISpec() + if spec != nil { + cgroupPath = spec.Linux.CgroupsPath + + // kata should rely on the cgroup created and configured by + // container engine *only* if actual container was + // marked *explicitly* as sandbox through annotations. + if !s.config.HasCRIContainerType { + resources = *spec.Linux.Resources + } + } + + // Create the cgroup manager, this way it can be used later + // to create or detroy cgroups + if s.cgroupMgr, err = vccgroups.New( + &vccgroups.Config{ + Cgroups: s.config.Cgroups, + CgroupPaths: s.state.CgroupPaths, + Resources: resources, + CgroupPath: cgroupPath, + }, + ); err != nil { + return err + } + + return nil +} + // storeSandbox stores a sandbox config. func (s *Sandbox) storeSandbox() error { span, _ := s.trace("storeSandbox") @@ -1855,15 +1902,13 @@ func (s *Sandbox) cgroupsDelete() error { var cgroupSubsystems cgroups.Hierarchy if s.config.SandboxCgroupOnly { - cgroupSubsystems = cgroups.V1 - path = s.state.CgroupPath - s.Logger().WithField("path", path).Debug("Deleting sandbox cgroups (all subsystems)") - } else { - cgroupSubsystems = V1NoConstraints - path = cgroupNoConstraintsPath(s.state.CgroupPath) - s.Logger().WithField("path", path).Debug("Deleting no constraints cgroup") + return s.cgroupMgr.Destroy() } + cgroupSubsystems = V1NoConstraints + path = cgroupNoConstraintsPath(s.state.CgroupPath) + s.Logger().WithField("path", path).Debug("Deleting no constraints cgroup") + sandboxCgroups, err := cgroupsLoadFunc(cgroupSubsystems, cgroups.StaticPath(path)) if err == cgroups.ErrCgroupDeleted { // cgroup already deleted @@ -2049,60 +2094,27 @@ func (s *Sandbox) setupSandboxCgroup() error { s.Logger().WithField("hasCRIContainerType", s.config.HasCRIContainerType).Debug("Setting sandbox cgroup") - s.state.CgroupPath, err 
= validCgroupPath(spec.Linux.CgroupsPath, s.config.SystemdCgroup) + s.state.CgroupPath, err = vccgroups.ValidCgroupPath(spec.Linux.CgroupsPath, s.config.SystemdCgroup) if err != nil { return fmt.Errorf("Invalid cgroup path: %v", err) } - // Don't modify original resources, create a copy - resources := *spec.Linux.Resources - sandboxSpec := specs.Spec{ - Linux: &specs.Linux{ - Resources: &resources, - }, - } - - // kata should rely on the cgroup created and configured by - // container engine *only* if actual container was - // marked *explicitly* as sandbox through annotations. - if s.config.HasCRIContainerType { - // Do not change current cgroup configuration. - // Create a spec without constraints - sandboxSpec.Linux.Resources = &specs.LinuxResources{} - } - - sandboxSpec.Linux.CgroupsPath = s.state.CgroupPath - - // Remove this to improve device resource management, but first we need to fix some issues: - // - hypervisors will need access to following host's devices: - // * /dev/kvm - // * /dev/vhost-net - // - If devicemapper is the storage driver, hypervisor will need access to devicemapper devices: - // * The list of cgroup devices MUST BE updated when a new container is created in the POD - sandboxSpec.Linux.Resources.Devices = []specs.LinuxDeviceCgroup{} - - cmgr, err := newCgroupManager(s.config.Cgroups, s.state.CgroupPaths, &sandboxSpec) - if err != nil { - return fmt.Errorf("Could not create a new cgroup manager: %v", err) - } - runtimePid := os.Getpid() - // Add the runtime to the Kata sandbox cgroup - if err = cmgr.Apply(runtimePid); err != nil { + if err = s.cgroupMgr.Add(runtimePid); err != nil { return fmt.Errorf("Could not add runtime PID %d to sandbox cgroup: %v", runtimePid, err) } // `Apply` updates manager's Cgroups and CgroupPaths, // they both need to be saved since are used to create // or restore a cgroup managers. 
- if s.config.Cgroups, err = cmgr.GetCgroups(); err != nil { + if s.config.Cgroups, err = s.cgroupMgr.GetCgroups(); err != nil { return fmt.Errorf("Could not get cgroup configuration: %v", err) } - s.state.CgroupPaths = cmgr.GetPaths() + s.state.CgroupPaths = s.cgroupMgr.GetPaths() - if err = cmgr.Set(&configs.Config{Cgroups: s.config.Cgroups}); err != nil { + if err = s.cgroupMgr.Apply(); err != nil { return fmt.Errorf("Could not constrain cgroup: %v", err) } diff --git a/virtcontainers/sandbox_test.go b/virtcontainers/sandbox_test.go index 58d8874cdc..4ec8f3bc42 100644 --- a/virtcontainers/sandbox_test.go +++ b/virtcontainers/sandbox_test.go @@ -1514,6 +1514,7 @@ func TestSandbox_SetupSandboxCgroup(t *testing.T) { } t.Run(tt.name, func(t *testing.T) { + tt.s.createCgroupManager() if err := tt.s.setupSandboxCgroup(); (err != nil) != tt.wantErr { t.Errorf("Sandbox.SetupSandboxCgroupOnly() error = %v, wantErr %v", err, tt.wantErr) }