virtcontainers/pkg/cgroups: implement cgroup manager

The cgroup manager is in charge of creating and setting up cgroups
for virtual containers. For example, it adds /dev/kvm and
/dev/vhost-net to the list of cgroup devices so that virtual
containers can work.

fixes #2438
fixes #2419

Signed-off-by: Julio Montes <julio.montes@intel.com>
This commit is contained in:
Julio Montes 2020-02-05 20:13:08 +00:00
parent 03cdf6c4a9
commit ea82922a54
4 changed files with 434 additions and 45 deletions

View File

@ -0,0 +1,321 @@
// Copyright (c) 2020 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
package cgroups
import (
"bufio"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"github.com/kata-containers/runtime/virtcontainers/pkg/rootless"
libcontcgroups "github.com/opencontainers/runc/libcontainer/cgroups"
libcontcgroupsfs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
libcontcgroupssystemd "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/specconv"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
// Config holds the inputs New uses to build a cgroup Manager.
type Config struct {
// Cgroups specifies specific cgroup settings for the various subsystems that the container is
// placed into to limit the resources the container has available.
// If nil, New() will create one from Resources and CgroupPath.
Cgroups *configs.Cgroup
// CgroupPaths contains paths to all the cgroups setup for a container. Key is cgroup subsystem name
// with the value as the path. New() passes an empty map to the underlying manager as nil.
CgroupPaths map[string]string
// Resources represents the runtime resource constraints.
// New() replaces Resources.Devices with a list that also contains the hypervisor devices.
Resources specs.LinuxResources
// CgroupPath is the OCI spec cgroup path. New() only validates and uses it when Cgroups is nil.
CgroupPath string
}
// Manager wraps a libcontainer cgroup manager. Its methods take the embedded
// mutex before touching the wrapped manager.
type Manager struct {
sync.Mutex
// mgr is the wrapped libcontainer manager; New picks the systemd or the
// filesystem implementation.
mgr libcontcgroups.Manager
}
const (
// name of the file in a cgroup directory that contains the pids;
// readPids parses it one pid per line and writePids writes pids into it
cgroupProcs = "cgroup.procs"
)
var (
// If set to true, expects cgroupsPath to be of form "slice:prefix:name", otherwise cgroups creation will fail.
// Stays nil until EnableSystemdCgroup is called; UseSystemdCgroup reports false for nil.
systemdCgroup *bool
// cgroupsLogger is the base log entry of this package, tagged with a "source" field.
cgroupsLogger = logrus.WithField("source", "virtcontainers/pkg/cgroups")
)
// EnableSystemdCgroup turns on systemd cgroup handling for the package by
// pointing systemdCgroup at a freshly allocated true value, which is what
// UseSystemdCgroup reads.
func EnableSystemdCgroup() {
	enabled := new(bool)
	*enabled = true
	systemdCgroup = enabled
}
// UseSystemdCgroup reports whether systemd cgroup handling is enabled.
// An unset (nil) systemdCgroup counts as disabled.
func UseSystemdCgroup() bool {
	return systemdCgroup != nil && *systemdCgroup
}
// hypervisorDevices returns the list of device cgroup rules that a hypervisor
// may need: "rwm" access to every devicemapper block device, plus "rwm" access
// to the host devices listed below. A host device that cannot be stat'ed, or
// that is neither a character nor a block device, is logged and left out of
// the result instead of producing a rule without a device type.
func hypervisorDevices() []specs.LinuxDeviceCgroup {
	wildcard := int64(-1)
	// NOTE(review): the devicemapper major is hard-coded here; confirm it
	// matches the value the host kernel actually assigned.
	devicemapperMajor := int64(253)

	devices := []specs.LinuxDeviceCgroup{
		// hypervisor needs access to all devicemapper devices,
		// since they can be hotplugged in the VM.
		{
			Allow:  true,
			Type:   "b",
			Major:  &devicemapperMajor,
			Minor:  &wildcard,
			Access: "rwm",
		},
	}

	// Processes running in a device-cgroup are constrained, they have access
	// only to the devices listed in the devices.list file.
	// In order to run Virtual Machines and create virtqueues, hypervisors
	// need access to certain character devices in the host, like kvm and vhost-net.
	hostDevices := []string{
		"/dev/kvm",       // To run virtual machines
		"/dev/vhost-net", // To create virtqueues
	}

	for _, device := range hostDevices {
		var st unix.Stat_t
		if err := unix.Stat(device, &st); err != nil {
			cgroupsLogger.WithError(err).WithField("device", device).Warn("Could not get device information")
			continue
		}

		var devType string
		switch st.Mode & unix.S_IFMT {
		case unix.S_IFCHR:
			devType = "c"
		case unix.S_IFBLK:
			devType = "b"
		default:
			// Not a device node: a rule built from it would have no type.
			cgroupsLogger.WithField("device", device).Warn("Not a character or block device, ignoring it")
			continue
		}

		// Declared inside the loop so every rule points at its own values.
		major := int64(unix.Major(st.Rdev))
		minor := int64(unix.Minor(st.Rdev))

		devices = append(devices, specs.LinuxDeviceCgroup{
			Allow:  true,
			Type:   devType,
			Major:  &major,
			Minor:  &minor,
			Access: "rwm",
		})
	}

	return devices
}
// New creates a new cgroup Manager from config.
//
// The hypervisor devices are appended to a copy of config.Resources.Devices
// and the combined list is stored back into config.Resources.Devices; the
// backing array of the list passed in by the caller is never written to.
//
// When config.Cgroups is nil a new cgroup configuration is created from
// config.Resources and config.CgroupPath; callers are expected to save it
// afterwards. Depending on UseSystemdCgroup the returned Manager wraps either
// the systemd or the filesystem libcontainer manager.
//
// An error is returned when the cgroup path is invalid, when the cgroup
// configuration cannot be created, or when the systemd manager is requested
// but cannot be created.
func New(config *Config) (*Manager, error) {
	var err error
	useSystemdCgroup := UseSystemdCgroup()

	// copy() only fills len(dst) elements, so copying into an empty slice
	// would drop every device rule supplied by the caller. Build a fresh,
	// correctly sized slice instead.
	hvDevices := hypervisorDevices()
	devices := make([]specs.LinuxDeviceCgroup, 0, len(config.Resources.Devices)+len(hvDevices))
	devices = append(devices, config.Resources.Devices...)
	devices = append(devices, hvDevices...)
	config.Resources.Devices = devices

	newSpec := specs.Spec{
		Linux: &specs.Linux{
			Resources: &config.Resources,
		},
	}

	isRootless := rootless.IsRootless()

	cgroups := config.Cgroups
	cgroupPaths := config.CgroupPaths

	// Create a new cgroup if the current one is nil
	// this cgroups must be saved later
	if cgroups == nil {
		if config.CgroupPath == "" && !isRootless {
			cgroupsLogger.Warn("cgroups have not been created and cgroup path is empty")
		}

		newSpec.Linux.CgroupsPath, err = ValidCgroupPath(config.CgroupPath, useSystemdCgroup)
		if err != nil {
			return nil, fmt.Errorf("Invalid cgroup path: %v", err)
		}

		if cgroups, err = specconv.CreateCgroupConfig(&specconv.CreateOpts{
			// cgroup name is taken from spec
			CgroupName:       "",
			UseSystemdCgroup: useSystemdCgroup,
			Spec:             &newSpec,
			RootlessCgroups:  isRootless,
		}); err != nil {
			return nil, fmt.Errorf("Could not create cgroup config: %v", err)
		}
	}

	// Set cgroupPaths to nil when the map is empty, it can and will be
	// populated by `Manager.Apply()` when the runtime or any other process
	// is moved to the cgroup.
	if len(cgroupPaths) == 0 {
		cgroupPaths = nil
	}

	if useSystemdCgroup {
		systemdCgroupFunc, err := libcontcgroupssystemd.NewSystemdCgroupsManager()
		if err != nil {
			return nil, fmt.Errorf("Could not create systemd cgroup manager: %v", err)
		}
		libcontcgroupssystemd.UseSystemd()
		return &Manager{
			mgr: systemdCgroupFunc(cgroups, cgroupPaths),
		}, nil
	}

	return &Manager{
		mgr: &libcontcgroupsfs.Manager{
			Cgroups:  cgroups,
			Rootless: isRootless,
			Paths:    cgroupPaths,
		},
	}, nil
}
// readPids reads all the pids listed in the cgroup.procs file of cgroupPath.
//
// The file is parsed line by line; empty lines are skipped and every other
// line must be a decimal integer. It returns an empty (non-nil) slice when
// the file lists no pids, and a nil slice together with an error when the
// file cannot be opened, a line is not a number, or reading the file fails
// part-way through.
func readPids(cgroupPath string) ([]int, error) {
	f, err := os.Open(filepath.Join(cgroupPath, cgroupProcs))
	if err != nil {
		return nil, err
	}
	defer f.Close()

	pids := []int{}
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		line := scanner.Text()
		if line == "" {
			continue
		}

		pid, err := strconv.Atoi(line)
		if err != nil {
			return nil, err
		}
		pids = append(pids, pid)
	}

	// Scan() returns false both at EOF and on failure; without this check a
	// read error would be reported as a shorter, seemingly valid pid list.
	if err := scanner.Err(); err != nil {
		return nil, err
	}

	return pids, nil
}
// writePids writes the given pids into the cgroup.procs file of cgroupPath,
// issuing one separate write per pid. It stops at the first write that fails
// and returns that error; nil is returned when every pid was written (or
// when pids is empty).
func writePids(pids []int, cgroupPath string) error {
	target := filepath.Join(cgroupPath, cgroupProcs)

	for i := range pids {
		entry := []byte(strconv.Itoa(pids[i]))
		err := ioutil.WriteFile(target, entry, os.FileMode(0))
		if err != nil {
			return err
		}
	}

	return nil
}
// logger returns the log entry used by the Manager methods. It is derived
// from cgroupsLogger with the "source" field set to "cgroup-manager".
func (m *Manager) logger() *logrus.Entry {
	entry := cgroupsLogger.WithField("source", "cgroup-manager")
	return entry
}
// moveToParent moves all the processes in the manager's cgroups to the
// respective parent cgroups.
//
// For every path reported by the wrapped manager, the pids listed in it are
// written one by one into the parent directory's cgroup.procs. A pid whose
// write fails with "no such process" is skipped, and the remaining pids are
// still moved. Any other error aborts the operation and is returned.
func (m *Manager) moveToParent() error {
	m.Lock()
	defer m.Unlock()

	for _, cgroupPath := range m.mgr.GetPaths() {
		pids, err := readPids(cgroupPath)
		if err != nil {
			return err
		}

		cgroupParentPath := filepath.Dir(filepath.Clean(cgroupPath))

		// Write pid by pid: writePids stops at the first failure, so handing
		// it the whole list would leave every pid after a vanished one behind.
		for _, pid := range pids {
			if err := writePids([]int{pid}, cgroupParentPath); err != nil {
				if strings.Contains(err.Error(), "no such process") {
					continue
				}
				return err
			}
		}
	}

	return nil
}
// Add places pid into the manager's cgroups by calling Apply(pid) on the
// wrapped manager while holding the lock. When running rootless nothing is
// done: a debug message is logged and nil is returned.
func (m *Manager) Add(pid int) error {
	if !rootless.IsRootless() {
		m.Lock()
		defer m.Unlock()
		return m.mgr.Apply(pid)
	}

	m.logger().Debug("Unable to setup add pids to cgroup: running rootless")
	return nil
}
// Apply sets the manager's current cgroup configuration on the wrapped
// manager, i.e. it applies the constraints. When running rootless nothing is
// done: a debug message is logged and nil is returned.
func (m *Manager) Apply() error {
	if rootless.IsRootless() {
		m.logger().Debug("Unable to apply constraints: running rootless")
		return nil
	}

	// GetCgroups takes the lock on its own, so call it before locking here.
	current, err := m.GetCgroups()
	if err != nil {
		return err
	}
	cfg := &configs.Config{Cgroups: current}

	m.Lock()
	defer m.Unlock()
	return m.mgr.Set(cfg)
}
// GetCgroups returns the cgroup configuration reported by the wrapped
// manager; the call is made while holding the lock.
func (m *Manager) GetCgroups() (*configs.Cgroup, error) {
	m.Lock()
	defer m.Unlock()

	cg, err := m.mgr.GetCgroups()
	return cg, err
}
// GetPaths returns the cgroup paths reported by the wrapped manager; the
// call is made while holding the lock.
func (m *Manager) GetPaths() map[string]string {
	m.Lock()
	defer m.Unlock()

	paths := m.mgr.GetPaths()
	return paths
}
// Destroy first moves the processes of the manager's cgroups to the parent
// cgroups and then calls Destroy on the wrapped manager. It returns an error
// if the processes cannot be moved or if the wrapped manager's Destroy fails.
func (m *Manager) Destroy() error {
	// cgroup can't be destroyed if it contains running processes
	err := m.moveToParent()
	if err != nil {
		return fmt.Errorf("Could not move processes into parent cgroup: %v", err)
	}

	// moveToParent has released the lock by now, take it again.
	m.Lock()
	defer m.Unlock()
	return m.mgr.Destroy()
}

View File

@ -0,0 +1,55 @@
// Copyright (c) 2020 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
package cgroups
import (
"testing"
"github.com/stretchr/testify/assert"
)
// TestEnableSystemdCgroup checks that UseSystemdCgroup mirrors the current
// systemdCgroup setting and reports true once EnableSystemdCgroup was called.
// The package-level setting is restored when the test ends.
func TestEnableSystemdCgroup(t *testing.T) {
	a := assert.New(t)

	saved := systemdCgroup
	defer func() { systemdCgroup = saved }()

	got := UseSystemdCgroup()
	if systemdCgroup == nil {
		a.False(got)
	} else {
		a.Equal(*systemdCgroup, got)
	}

	EnableSystemdCgroup()
	a.True(UseSystemdCgroup())
}
// TestNew builds a Manager from a Config without cgroups and with an empty
// cgroup path: this is expected to succeed with systemd cgroups disabled and
// to fail with systemd cgroups enabled. The package-level setting is restored
// when the test ends.
func TestNew(t *testing.T) {
	a := assert.New(t)

	saved := systemdCgroup
	defer func() { systemdCgroup = saved }()

	// systemdCgroup points at this variable, so flipping it below changes
	// what New sees.
	systemd := false
	systemdCgroup = &systemd

	cfg := &Config{
		Cgroups:    nil,
		CgroupPath: "",
	}

	m, err := New(cfg)
	a.NoError(err)
	a.NotNil(m.mgr)

	systemd = true
	m, err = New(cfg)
	a.Error(err)
	a.Nil(m)
}

View File

@ -34,6 +34,7 @@ import (
"github.com/kata-containers/runtime/virtcontainers/persist"
persistapi "github.com/kata-containers/runtime/virtcontainers/persist/api"
"github.com/kata-containers/runtime/virtcontainers/pkg/annotations"
vccgroups "github.com/kata-containers/runtime/virtcontainers/pkg/cgroups"
"github.com/kata-containers/runtime/virtcontainers/pkg/compatoci"
"github.com/kata-containers/runtime/virtcontainers/pkg/rootless"
vcTypes "github.com/kata-containers/runtime/virtcontainers/pkg/types"
@ -214,6 +215,8 @@ type Sandbox struct {
seccompSupported bool
disableVMShutdown bool
cgroupMgr *vccgroups.Manager
ctx context.Context
}
@ -597,6 +600,10 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
}
}
if err := s.createCgroupManager(); err != nil {
return nil, err
}
agentConfig, err := newAgentConfig(sandboxConfig.AgentType, sandboxConfig.AgentConfig)
if err != nil {
return nil, err
@ -609,6 +616,46 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
return s, nil
}
// createCgroupManager creates the sandbox cgroup manager and stores it in
// s.cgroupMgr, this way it can be used later to create or destroy cgroups.
//
// The cgroup path is taken from the patched OCI spec. The spec's resource
// constraints are only used when the container was not explicitly marked as
// sandbox through annotations; otherwise, and whenever the spec carries no
// Linux section or no resources, the manager is created without constraints.
//
// It returns an error when the sandbox has no configuration or when the
// manager cannot be created.
func (s *Sandbox) createCgroupManager() error {
	if s.config == nil {
		return fmt.Errorf("Could not create cgroup manager: empty sandbox configuration")
	}

	cgroupPath := ""

	// Do not change current cgroup configuration.
	// Create a spec without constraints
	resources := specs.LinuxResources{}

	// Linux and Linux.Resources are pointers that a spec may leave unset,
	// so check them before dereferencing.
	if spec := s.GetPatchedOCISpec(); spec != nil && spec.Linux != nil {
		cgroupPath = spec.Linux.CgroupsPath

		// kata should rely on the cgroup created and configured by
		// container engine *only* if actual container was
		// marked *explicitly* as sandbox through annotations.
		if !s.config.HasCRIContainerType && spec.Linux.Resources != nil {
			resources = *spec.Linux.Resources
		}
	}

	var err error
	if s.cgroupMgr, err = vccgroups.New(
		&vccgroups.Config{
			Cgroups:     s.config.Cgroups,
			CgroupPaths: s.state.CgroupPaths,
			Resources:   resources,
			CgroupPath:  cgroupPath,
		},
	); err != nil {
		return err
	}

	return nil
}
// storeSandbox stores a sandbox config.
func (s *Sandbox) storeSandbox() error {
span, _ := s.trace("storeSandbox")
@ -1855,15 +1902,13 @@ func (s *Sandbox) cgroupsDelete() error {
var cgroupSubsystems cgroups.Hierarchy
if s.config.SandboxCgroupOnly {
cgroupSubsystems = cgroups.V1
path = s.state.CgroupPath
s.Logger().WithField("path", path).Debug("Deleting sandbox cgroups (all subsystems)")
} else {
cgroupSubsystems = V1NoConstraints
path = cgroupNoConstraintsPath(s.state.CgroupPath)
s.Logger().WithField("path", path).Debug("Deleting no constraints cgroup")
return s.cgroupMgr.Destroy()
}
cgroupSubsystems = V1NoConstraints
path = cgroupNoConstraintsPath(s.state.CgroupPath)
s.Logger().WithField("path", path).Debug("Deleting no constraints cgroup")
sandboxCgroups, err := cgroupsLoadFunc(cgroupSubsystems, cgroups.StaticPath(path))
if err == cgroups.ErrCgroupDeleted {
// cgroup already deleted
@ -2049,60 +2094,27 @@ func (s *Sandbox) setupSandboxCgroup() error {
s.Logger().WithField("hasCRIContainerType", s.config.HasCRIContainerType).Debug("Setting sandbox cgroup")
s.state.CgroupPath, err = validCgroupPath(spec.Linux.CgroupsPath, s.config.SystemdCgroup)
s.state.CgroupPath, err = vccgroups.ValidCgroupPath(spec.Linux.CgroupsPath, s.config.SystemdCgroup)
if err != nil {
return fmt.Errorf("Invalid cgroup path: %v", err)
}
// Don't modify original resources, create a copy
resources := *spec.Linux.Resources
sandboxSpec := specs.Spec{
Linux: &specs.Linux{
Resources: &resources,
},
}
// kata should rely on the cgroup created and configured by
// container engine *only* if actual container was
// marked *explicitly* as sandbox through annotations.
if s.config.HasCRIContainerType {
// Do not change current cgroup configuration.
// Create a spec without constraints
sandboxSpec.Linux.Resources = &specs.LinuxResources{}
}
sandboxSpec.Linux.CgroupsPath = s.state.CgroupPath
// Remove this to improve device resource management, but first we need to fix some issues:
// - hypervisors will need access to following host's devices:
// * /dev/kvm
// * /dev/vhost-net
// - If devicemapper is the storage driver, hypervisor will need access to devicemapper devices:
// * The list of cgroup devices MUST BE updated when a new container is created in the POD
sandboxSpec.Linux.Resources.Devices = []specs.LinuxDeviceCgroup{}
cmgr, err := newCgroupManager(s.config.Cgroups, s.state.CgroupPaths, &sandboxSpec)
if err != nil {
return fmt.Errorf("Could not create a new cgroup manager: %v", err)
}
runtimePid := os.Getpid()
// Add the runtime to the Kata sandbox cgroup
if err = cmgr.Apply(runtimePid); err != nil {
if err = s.cgroupMgr.Add(runtimePid); err != nil {
return fmt.Errorf("Could not add runtime PID %d to sandbox cgroup: %v", runtimePid, err)
}
// `Apply` updates manager's Cgroups and CgroupPaths,
// they both need to be saved since are used to create
// or restore a cgroup managers.
if s.config.Cgroups, err = cmgr.GetCgroups(); err != nil {
if s.config.Cgroups, err = s.cgroupMgr.GetCgroups(); err != nil {
return fmt.Errorf("Could not get cgroup configuration: %v", err)
}
s.state.CgroupPaths = cmgr.GetPaths()
s.state.CgroupPaths = s.cgroupMgr.GetPaths()
if err = cmgr.Set(&configs.Config{Cgroups: s.config.Cgroups}); err != nil {
if err = s.cgroupMgr.Apply(); err != nil {
return fmt.Errorf("Could not constrain cgroup: %v", err)
}

View File

@ -1514,6 +1514,7 @@ func TestSandbox_SetupSandboxCgroup(t *testing.T) {
}
t.Run(tt.name, func(t *testing.T) {
tt.s.createCgroupManager()
if err := tt.s.setupSandboxCgroup(); (err != nil) != tt.wantErr {
t.Errorf("Sandbox.SetupSandboxCgroupOnly() error = %v, wantErr %v", err, tt.wantErr)
}