diff --git a/src/runtime/cli/kata-check.go b/src/runtime/cli/kata-check.go index b720fa74d..076fb0525 100644 --- a/src/runtime/cli/kata-check.go +++ b/src/runtime/cli/kata-check.go @@ -25,7 +25,6 @@ import ( "strings" "syscall" - "github.com/containerd/cgroups" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils" vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/oci" @@ -415,11 +414,6 @@ EXAMPLES: return errors.New("check: cannot determine runtime config") } - // check if cgroup can work use the same logic for creating containers - if _, err := vc.V1Constraints(); err != nil && err == cgroups.ErrMountPointNotExist && !runtimeConfig.SandboxCgroupOnly { - return fmt.Errorf("Cgroup v2 requires the following configuration: `sandbox_cgroup_only=true`.") - } - err := setCPUtype(runtimeConfig.HypervisorType) if err != nil { return err diff --git a/src/runtime/virtcontainers/api_test.go b/src/runtime/virtcontainers/api_test.go index 8e0f0e86f..d3999927f 100644 --- a/src/runtime/virtcontainers/api_test.go +++ b/src/runtime/virtcontainers/api_test.go @@ -130,8 +130,11 @@ func newTestSandboxConfigKataAgent() SandboxConfig { } func TestCreateSandboxNoopAgentSuccessful(t *testing.T) { - defer cleanUp() assert := assert.New(t) + if tc.NotValid(ktu.NeedRoot()) { + t.Skip(testDisabledAsNonRoot) + } + defer cleanUp() config := newTestSandboxConfigNoop() @@ -181,6 +184,9 @@ func TestCreateSandboxKataAgentSuccessful(t *testing.T) { } func TestCreateSandboxFailing(t *testing.T) { + if tc.NotValid(ktu.NeedRoot()) { + t.Skip(testDisabledAsNonRoot) + } defer cleanUp() assert := assert.New(t) @@ -240,6 +246,9 @@ func createAndStartSandbox(ctx context.Context, config SandboxConfig) (sandbox V } func TestReleaseSandbox(t *testing.T) { + if tc.NotValid(ktu.NeedRoot()) { + t.Skip(testDisabledAsNonRoot) + } defer cleanUp() config := newTestSandboxConfigNoop() @@ -254,6 +263,10 @@ func TestReleaseSandbox(t *testing.T) { } func TestCleanupContainer(t *testing.T) { + if tc.NotValid(ktu.NeedRoot()) { + t.Skip(testDisabledAsNonRoot) + } + config := newTestSandboxConfigNoop() assert := assert.New(t) diff --git a/src/runtime/virtcontainers/cgroups.go b/src/runtime/virtcontainers/cgroups.go deleted file mode 100644 index eeaf095da..000000000 --- a/src/runtime/virtcontainers/cgroups.go +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright (c) 2018 Huawei Corporation -// Copyright (c) 2019 Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 -// - -package virtcontainers - -import ( - "bufio" - "fmt" - "os" - "path/filepath" - "strings" - - "github.com/containerd/cgroups" - specs "github.com/opencontainers/runtime-spec/specs-go" -) - -type cgroupPather interface { - cgroups.Subsystem - Path(path string) string -} - -// unconstrained cgroups are placed here. -// for example /sys/fs/cgroup/memory/kata/$CGPATH -// where path is defined by the containers manager -const cgroupKataPath = "/kata/" - -var cgroupsLoadFunc = cgroups.Load -var cgroupsNewFunc = cgroups.New - -// V1Constraints returns the cgroups that are compatible with the VC architecture -// and hypervisor, constraints can be applied to these cgroups. -func V1Constraints() ([]cgroups.Subsystem, error) { - root, err := cgroupV1MountPoint() - if err != nil { - return nil, err - } - subsystems := []cgroups.Subsystem{ - cgroups.NewCpuset(root), - cgroups.NewCpu(root), - cgroups.NewCpuacct(root), - } - return cgroupsSubsystems(subsystems) -} - -// V1NoConstraints returns the cgroups that are *not* compatible with the VC -// architecture and hypervisor, constraints MUST NOT be applied to these cgroups. -func V1NoConstraints() ([]cgroups.Subsystem, error) { - root, err := cgroupV1MountPoint() - if err != nil { - return nil, err - } - subsystems := []cgroups.Subsystem{ - // Some constainers managers, like k8s, take the control of cgroups. - // k8s: the memory cgroup for the dns containers is small to place - // a hypervisor there. - cgroups.NewMemory(root), - } - return cgroupsSubsystems(subsystems) -} - -func cgroupsSubsystems(subsystems []cgroups.Subsystem) ([]cgroups.Subsystem, error) { - var enabled []cgroups.Subsystem - for _, s := range cgroupPathers(subsystems) { - // check and remove the default groups that do not exist - if _, err := os.Lstat(s.Path("/")); err == nil { - enabled = append(enabled, s) - } - } - return enabled, nil -} - -func cgroupPathers(subystems []cgroups.Subsystem) []cgroupPather { - var out []cgroupPather - for _, s := range subystems { - if p, ok := s.(cgroupPather); ok { - out = append(out, p) - } - } - return out -} - -// v1MountPoint returns the mount point where the cgroup -// mountpoints are mounted in a single hiearchy -func cgroupV1MountPoint() (string, error) { - f, err := os.Open("/proc/self/mountinfo") - if err != nil { - return "", err - } - defer f.Close() - scanner := bufio.NewScanner(f) - for scanner.Scan() { - if err := scanner.Err(); err != nil { - return "", err - } - var ( - text = scanner.Text() - fields = strings.Split(text, " ") - // safe as mountinfo encodes mountpoints with spaces as \040. - index = strings.Index(text, " - ") - postSeparatorFields = strings.Fields(text[index+3:]) - numPostFields = len(postSeparatorFields) - ) - // this is an error as we can't detect if the mount is for "cgroup" - if numPostFields == 0 { - return "", fmt.Errorf("Found no fields post '-' in %q", text) - } - if postSeparatorFields[0] == "cgroup" { - // check that the mount is properly formated. - if numPostFields < 3 { - return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) - } - return filepath.Dir(fields[4]), nil - } - } - return "", cgroups.ErrMountPointNotExist -} - -func cgroupNoConstraintsPath(path string) string { - return filepath.Join(cgroupKataPath, path) -} - -// return the parent cgroup for the given path -func parentCgroup(hierarchy cgroups.Hierarchy, path string) (cgroups.Cgroup, error) { - // append '/' just in case CgroupsPath doesn't start with it - parent := filepath.Dir("/" + path) - - parentCgroup, err := cgroupsLoadFunc(hierarchy, - cgroups.StaticPath(parent)) - if err != nil { - return nil, fmt.Errorf("Could not load parent cgroup %v: %v", parent, err) - } - - return parentCgroup, nil -} - -// validCPUResources checks CPU resources coherency -func validCPUResources(cpuSpec *specs.LinuxCPU) *specs.LinuxCPU { - if cpuSpec == nil { - return nil - } - - cpu := *cpuSpec - if cpu.Period != nil && *cpu.Period < 1 { - cpu.Period = nil - } - - if cpu.Quota != nil && *cpu.Quota < 1 { - cpu.Quota = nil - } - - if cpu.Shares != nil && *cpu.Shares < 1 { - cpu.Shares = nil - } - - if cpu.RealtimePeriod != nil && *cpu.RealtimePeriod < 1 { - cpu.RealtimePeriod = nil - } - - if cpu.RealtimeRuntime != nil && *cpu.RealtimeRuntime < 1 { - cpu.RealtimeRuntime = nil - } - - return &cpu -} diff --git a/src/runtime/virtcontainers/cgroups_test.go b/src/runtime/virtcontainers/cgroups_test.go deleted file mode 100644 index 582e93fc9..000000000 --- a/src/runtime/virtcontainers/cgroups_test.go +++ /dev/null @@ -1,207 +0,0 @@ -// Copyright (c) 2018 Huawei Corporation -// -// SPDX-License-Identifier: Apache-2.0 -// - -package virtcontainers - -import ( - "context" - "fmt" - "os" - "os/exec" - "path/filepath" - "testing" - - "github.com/containerd/cgroups" - cgroupsstatsv1 "github.com/containerd/cgroups/stats/v1" - "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" - specs "github.com/opencontainers/runtime-spec/specs-go" - "github.com/stretchr/testify/assert" -) - -type mockCgroup struct { -} - -func (m *mockCgroup) New(string, *specs.LinuxResources) (cgroups.Cgroup, error) { - return &mockCgroup{}, nil -} -func (m *mockCgroup) Add(cgroups.Process) error { - return nil -} - -func (m *mockCgroup) AddTask(cgroups.Process) error { - return nil -} - -func (m *mockCgroup) Delete() error { - return nil -} - -func (m *mockCgroup) MoveTo(cgroups.Cgroup) error { - return nil -} - -func (m *mockCgroup) Stat(...cgroups.ErrorHandler) (*cgroupsstatsv1.Metrics, error) { - return &cgroupsstatsv1.Metrics{}, nil -} - -func (m *mockCgroup) Update(resources *specs.LinuxResources) error { - return nil -} - -func (m *mockCgroup) Processes(cgroups.Name, bool) ([]cgroups.Process, error) { - return nil, nil -} - -func (m *mockCgroup) Freeze() error { - return nil -} - -func (m *mockCgroup) Thaw() error { - return nil -} - -func (m *mockCgroup) OOMEventFD() (uintptr, error) { - return 0, nil -} - -func (m *mockCgroup) RegisterMemoryEvent(event cgroups.MemoryEvent) (uintptr, error) { - return 0, nil -} - -func (m *mockCgroup) State() cgroups.State { - return "" -} - -func (m *mockCgroup) Subsystems() []cgroups.Subsystem { - return nil -} - -func (m *mockCgroup) Tasks(cgroups.Name, bool) ([]cgroups.Task, error) { - return nil, nil -} - -func mockCgroupNew(hierarchy cgroups.Hierarchy, path cgroups.Path, resources *specs.LinuxResources, opts ...cgroups.InitOpts) (cgroups.Cgroup, error) { - return &mockCgroup{}, nil -} - -func mockCgroupLoad(hierarchy cgroups.Hierarchy, path cgroups.Path, opts ...cgroups.InitOpts) (cgroups.Cgroup, error) { - return &mockCgroup{}, nil -} - -func init() { - cgroupsNewFunc = mockCgroupNew - cgroupsLoadFunc = mockCgroupLoad -} - -func TestV1Constraints(t *testing.T) { - assert := assert.New(t) - - systems, err := V1Constraints() - assert.NoError(err) - assert.NotEmpty(systems) -} - -func TestV1NoConstraints(t *testing.T) { - assert := assert.New(t) - - systems, err := V1NoConstraints() - assert.NoError(err) - assert.NotEmpty(systems) -} - -func TestCgroupNoConstraintsPath(t *testing.T) { - assert := assert.New(t) - - cgrouPath := "abc" - expectedPath := filepath.Join(cgroupKataPath, cgrouPath) - path := cgroupNoConstraintsPath(cgrouPath) - assert.Equal(expectedPath, path) -} - -func TestUpdateCgroups(t *testing.T) { - assert := assert.New(t) - - oldCgroupsNew := cgroupsNewFunc - oldCgroupsLoad := cgroupsLoadFunc - cgroupsNewFunc = cgroups.New - cgroupsLoadFunc = cgroups.Load - defer func() { - cgroupsNewFunc = oldCgroupsNew - cgroupsLoadFunc = oldCgroupsLoad - }() - - s := &Sandbox{ - state: types.SandboxState{ - CgroupPath: "", - }, - config: &SandboxConfig{SandboxCgroupOnly: false}, - } - - ctx := context.Background() - - // empty path - err := s.cgroupsUpdate(ctx) - assert.NoError(err) - - // path doesn't exist - s.state.CgroupPath = "/abc/123/rgb" - err = s.cgroupsUpdate(ctx) - assert.Error(err) - - if os.Getuid() != 0 { - return - } - - s.state.CgroupPath = fmt.Sprintf("/kata-tests-%d", os.Getpid()) - testCgroup, err := cgroups.New(cgroups.V1, cgroups.StaticPath(s.state.CgroupPath), &specs.LinuxResources{}) - assert.NoError(err) - defer testCgroup.Delete() - s.hypervisor = &mockHypervisor{mockPid: 0} - - // bad pid - err = s.cgroupsUpdate(ctx) - assert.Error(err) - - // fake workload - cmd := exec.Command("tail", "-f", "/dev/null") - assert.NoError(cmd.Start()) - s.hypervisor = &mockHypervisor{mockPid: cmd.Process.Pid} - - // no containers - err = s.cgroupsUpdate(ctx) - assert.NoError(err) - - s.config = &SandboxConfig{} - s.config.HypervisorConfig.NumVCPUs = 1 - - s.containers = map[string]*Container{ - "abc": { - process: Process{ - Pid: cmd.Process.Pid, - }, - config: &ContainerConfig{ - Annotations: containerAnnotations, - CustomSpec: newEmptySpec(), - }, - }, - "xyz": { - process: Process{ - Pid: cmd.Process.Pid, - }, - config: &ContainerConfig{ - Annotations: containerAnnotations, - CustomSpec: newEmptySpec(), - }, - }, - } - - err = s.cgroupsUpdate(context.Background()) - assert.NoError(err) - - // cleanup - assert.NoError(cmd.Process.Kill()) - err = s.cgroupsDelete() - assert.NoError(err) -} diff --git a/src/runtime/virtcontainers/persist.go b/src/runtime/virtcontainers/persist.go index 203495e82..f72de590f 100644 --- a/src/runtime/virtcontainers/persist.go +++ b/src/runtime/virtcontainers/persist.go @@ -33,8 +33,8 @@ func (s *Sandbox) dumpState(ss *persistapi.SandboxState, cs map[string]persistap ss.GuestMemoryBlockSizeMB = s.state.GuestMemoryBlockSizeMB ss.GuestMemoryHotplugProbe = s.state.GuestMemoryHotplugProbe ss.State = string(s.state.State) - ss.CgroupPath = s.state.CgroupPath - ss.CgroupPaths = s.state.CgroupPaths + ss.SandboxCgroupPath = s.state.SandboxCgroupPath + ss.OverheadCgroupPath = s.state.OverheadCgroupPath for id, cont := range s.containers { state := persistapi.ContainerState{} @@ -188,7 +188,6 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) { SystemdCgroup: sconfig.SystemdCgroup, SandboxCgroupOnly: sconfig.SandboxCgroupOnly, DisableGuestSeccomp: sconfig.DisableGuestSeccomp, - Cgroups: sconfig.Cgroups, } ss.Config.SandboxBindMounts = append(ss.Config.SandboxBindMounts, sconfig.SandboxBindMounts...) @@ -302,8 +301,8 @@ func (s *Sandbox) loadState(ss persistapi.SandboxState) { s.state.GuestMemoryBlockSizeMB = ss.GuestMemoryBlockSizeMB s.state.BlockIndexMap = ss.HypervisorState.BlockIndexMap s.state.State = types.StateString(ss.State) - s.state.CgroupPath = ss.CgroupPath - s.state.CgroupPaths = ss.CgroupPaths + s.state.SandboxCgroupPath = ss.SandboxCgroupPath + s.state.OverheadCgroupPath = ss.OverheadCgroupPath s.state.GuestMemoryHotplugProbe = ss.GuestMemoryHotplugProbe } @@ -459,7 +458,6 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) { SystemdCgroup: savedConf.SystemdCgroup, SandboxCgroupOnly: savedConf.SandboxCgroupOnly, DisableGuestSeccomp: savedConf.DisableGuestSeccomp, - Cgroups: savedConf.Cgroups, } sconfig.SandboxBindMounts = append(sconfig.SandboxBindMounts, savedConf.SandboxBindMounts...) diff --git a/src/runtime/virtcontainers/persist/api/sandbox.go b/src/runtime/virtcontainers/persist/api/sandbox.go index 61b4afe88..1398cc20f 100644 --- a/src/runtime/virtcontainers/persist/api/sandbox.go +++ b/src/runtime/virtcontainers/persist/api/sandbox.go @@ -30,9 +30,12 @@ type SandboxState struct { // SandboxContainer specifies which container is used to start the sandbox/vm SandboxContainer string - // CgroupPath is the cgroup hierarchy where sandbox's processes - // including the hypervisor are placed. - CgroupPath string + // SandboxCgroupPath is the sandbox cgroup path + SandboxCgroupPath string + + // OverheadCgroupPath is the sandbox overhead cgroup path. + // It can be an empty string if sandbox_cgroup_only is set. + OverheadCgroupPath string // HypervisorState saves hypervisor specific data HypervisorState HypervisorState diff --git a/src/runtime/virtcontainers/pkg/cgroups/cgroups.go b/src/runtime/virtcontainers/pkg/cgroups/cgroups.go index a138be092..7d2681b96 100644 --- a/src/runtime/virtcontainers/pkg/cgroups/cgroups.go +++ b/src/runtime/virtcontainers/pkg/cgroups/cgroups.go @@ -26,6 +26,17 @@ type Cgroup struct { sync.Mutex } +var ( + cgroupsLogger = logrus.WithField("source", "virtcontainers/pkg/cgroups") +) + +// SetLogger sets up a logger for this pkg +func SetLogger(logger *logrus.Entry) { + fields := cgroupsLogger.Data + + cgroupsLogger = logger.WithFields(fields) +} + func deviceToDeviceCgroup(device string) (*specs.LinuxDeviceCgroup, error) { var st unix.Stat_t diff --git a/src/runtime/virtcontainers/pkg/cgroups/manager.go b/src/runtime/virtcontainers/pkg/cgroups/manager.go deleted file mode 100644 index dad8bd423..000000000 --- a/src/runtime/virtcontainers/pkg/cgroups/manager.go +++ /dev/null @@ -1,355 +0,0 @@ -// Copyright (c) 2020 Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 -// - -package cgroups - -import ( - "bufio" - "context" - "errors" - "fmt" - "io/ioutil" - "os" - "path/filepath" - "strconv" - "strings" - "sync" - - "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/rootless" - "github.com/opencontainers/runc/libcontainer" - libcontcgroups "github.com/opencontainers/runc/libcontainer/cgroups" - libcontcgroupsfs "github.com/opencontainers/runc/libcontainer/cgroups/fs" - "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/specconv" - "github.com/opencontainers/runtime-spec/specs-go" - "github.com/sirupsen/logrus" -) - -type Config struct { - // Cgroups specifies specific cgroup settings for the various subsystems that the container is - // placed into to limit the resources the container has available - // If nil, New() will create one. - Cgroups *configs.Cgroup - - // CgroupPaths contains paths to all the cgroups setup for a container. Key is cgroup subsystem name - // with the value as the path. - CgroupPaths map[string]string - - // Resources represents the runtime resource constraints - Resources specs.LinuxResources - - // CgroupPath is the OCI spec cgroup path - CgroupPath string -} - -type Manager struct { - mgr libcontcgroups.Manager - sync.Mutex -} - -const ( - // file in the cgroup that contains the pids - cgroupProcs = "cgroup.procs" -) - -var ( - cgroupsLogger = logrus.WithField("source", "virtcontainers/pkg/cgroups") -) - -// SetLogger sets up a logger for this pkg -func SetLogger(logger *logrus.Entry) { - fields := cgroupsLogger.Data - - cgroupsLogger = logger.WithFields(fields) -} - -// returns the list of devices that a hypervisor may need -func hypervisorDevices() []specs.LinuxDeviceCgroup { - devices := []specs.LinuxDeviceCgroup{} - - // Processes running in a device-cgroup are constrained, they have acccess - // only to the devices listed in the devices.list file. - // In order to run Virtual Machines and create virtqueues, hypervisors - // need access to certain character devices in the host, like kvm and vhost-net. - hypervisorDevices := []string{ - "/dev/kvm", // To run virtual machines - "/dev/vhost-net", // To create virtqueues - "/dev/vfio/vfio", // To access VFIO devices - } - - for _, device := range hypervisorDevices { - ldevice, err := DeviceToLinuxDevice(device) - if err != nil { - cgroupsLogger.WithError(err).Warnf("Could not get device information") - continue - } - devices = append(devices, ldevice) - } - - return devices -} - -// New creates a new CgroupManager -func New(config *Config) (*Manager, error) { - var err error - - devices := config.Resources.Devices - devices = append(devices, hypervisorDevices()...) - // Do not modify original devices - config.Resources.Devices = devices - - newSpec := specs.Spec{ - Linux: &specs.Linux{ - Resources: &config.Resources, - }, - } - - rootless := rootless.IsRootless() - - cgroups := config.Cgroups - cgroupPaths := config.CgroupPaths - - // determine if we are utilizing systemd managed cgroups based on the path provided - useSystemdCgroup := IsSystemdCgroup(config.CgroupPath) - - // Create a new cgroup if the current one is nil - // this cgroups must be saved later - if cgroups == nil { - if config.CgroupPath == "" && !rootless { - cgroupsLogger.Warn("cgroups have not been created and cgroup path is empty") - } - - newSpec.Linux.CgroupsPath, err = ValidCgroupPath(config.CgroupPath, useSystemdCgroup) - if err != nil { - return nil, fmt.Errorf("Invalid cgroup path: %v", err) - } - - if cgroups, err = specconv.CreateCgroupConfig(&specconv.CreateOpts{ - // cgroup name is taken from spec - CgroupName: "", - UseSystemdCgroup: useSystemdCgroup, - Spec: &newSpec, - RootlessCgroups: rootless, - }, nil); err != nil { - return nil, fmt.Errorf("Could not create cgroup config: %v", err) - } - } - - // Set cgroupPaths to nil when the map is empty, it can and will be - // populated by `Manager.Apply()` when the runtime or any other process - // is moved to the cgroup. - if len(cgroupPaths) == 0 { - cgroupPaths = nil - } - - if useSystemdCgroup { - factory, err := libcontainer.New("") - if err != nil { - return nil, fmt.Errorf("Could not create linux factory for systemd cgroup manager: %v", err) - } - lfactory, ok := factory.(*libcontainer.LinuxFactory) - if !ok { - return nil, errors.New("expected linux factory returned on linux based systems") - } - - err = libcontainer.SystemdCgroups(lfactory) - - if err != nil { - return nil, fmt.Errorf("Could not create systemd cgroup manager: %v", err) - } - - return &Manager{ - mgr: lfactory.NewCgroupsManager(cgroups, cgroupPaths), - }, nil - } - - return &Manager{ - mgr: libcontcgroupsfs.NewManager(cgroups, cgroupPaths, rootless), - }, nil -} - -// read all the pids in cgroupPath -func readPids(cgroupPath string) ([]int, error) { - pids := []int{} - f, err := os.Open(filepath.Join(cgroupPath, cgroupProcs)) - if err != nil { - return nil, err - } - defer f.Close() - buf := bufio.NewScanner(f) - - for buf.Scan() { - if t := buf.Text(); t != "" { - pid, err := strconv.Atoi(t) - if err != nil { - return nil, err - } - pids = append(pids, pid) - } - } - return pids, nil -} - -// write the pids into cgroup.procs -func writePids(pids []int, cgroupPath string) error { - cgroupProcsPath := filepath.Join(cgroupPath, cgroupProcs) - for _, pid := range pids { - if err := ioutil.WriteFile(cgroupProcsPath, - []byte(strconv.Itoa(pid)), - os.FileMode(0), - ); err != nil { - return err - } - } - return nil -} - -func (m *Manager) logger() *logrus.Entry { - return cgroupsLogger.WithField("source", "cgroup-manager") -} - -// move all the processes in the current cgroup to the parent -func (m *Manager) moveToParent() error { - m.Lock() - defer m.Unlock() - for _, cgroupPath := range m.mgr.GetPaths() { - - pids, err := readPids(cgroupPath) - // possible that the cgroupPath doesn't exist. If so, skip: - if os.IsNotExist(err) { - // The cgroup is not present on the filesystem: no pids to move. The systemd cgroup - // manager lists all of the subsystems, including those that are not actually being managed. - continue - } - if err != nil { - return err - } - - if len(pids) == 0 { - // no pids in this cgroup - continue - } - - cgroupParentPath := filepath.Dir(filepath.Clean(cgroupPath)) - if err = writePids(pids, cgroupParentPath); err != nil { - if !strings.Contains(err.Error(), "no such process") { - return err - } - } - } - return nil -} - -// Add pid to cgroups -func (m *Manager) Add(pid int) error { - if rootless.IsRootless() { - m.logger().Debug("Unable to setup add pids to cgroup: running rootless") - return nil - } - - m.Lock() - defer m.Unlock() - return m.mgr.Apply(pid) -} - -// Apply constraints -func (m *Manager) Apply() error { - if rootless.IsRootless() { - m.logger().Debug("Unable to apply constraints: running rootless") - return nil - } - - cgroups, err := m.GetCgroups() - if err != nil { - return err - } - - m.Lock() - defer m.Unlock() - return m.mgr.Set(cgroups.Resources) -} - -func (m *Manager) GetCgroups() (*configs.Cgroup, error) { - m.Lock() - defer m.Unlock() - return m.mgr.GetCgroups() -} - -func (m *Manager) GetPaths() map[string]string { - m.Lock() - defer m.Unlock() - return m.mgr.GetPaths() -} - -func (m *Manager) Destroy() error { - // cgroup can't be destroyed if it contains running processes - if err := m.moveToParent(); err != nil { - // If the process migration to the parent cgroup fails, then - // we expect the Destroy to fail as well. Let's log an error here - // and attempt to execute the Destroy still to help cleanup the hosts' FS. - m.logger().WithError(err).Error("Could not move processes into parent cgroup") - } - - m.Lock() - defer m.Unlock() - return m.mgr.Destroy() -} - -// AddDevice adds a device to the device cgroup -func (m *Manager) AddDevice(ctx context.Context, device string) error { - cgroups, err := m.GetCgroups() - if err != nil { - return err - } - - ld, err := DeviceToCgroupDeviceRule(device) - if err != nil { - return err - } - - m.Lock() - cgroups.Devices = append(cgroups.Devices, ld) - m.Unlock() - - return m.Apply() -} - -// RemoceDevice removed a device from the device cgroup -func (m *Manager) RemoveDevice(device string) error { - cgroups, err := m.GetCgroups() - if err != nil { - return err - } - - ld, err := DeviceToCgroupDeviceRule(device) - if err != nil { - return err - } - - m.Lock() - for i, d := range cgroups.Devices { - if d.Major == ld.Major && d.Minor == ld.Minor { - cgroups.Devices = append(cgroups.Devices[:i], cgroups.Devices[i+1:]...) - m.Unlock() - return m.Apply() - } - } - m.Unlock() - return fmt.Errorf("device %v not found in the cgroup", device) -} - -func (m *Manager) SetCPUSet(cpuset, memset string) error { - cgroups, err := m.GetCgroups() - if err != nil { - return err - } - - m.Lock() - cgroups.CpusetCpus = cpuset - cgroups.CpusetMems = memset - m.Unlock() - - return m.Apply() -} diff --git a/src/runtime/virtcontainers/pkg/cgroups/manager_test.go b/src/runtime/virtcontainers/pkg/cgroups/manager_test.go deleted file mode 100644 index 1e868cf69..000000000 --- a/src/runtime/virtcontainers/pkg/cgroups/manager_test.go +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2020 Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 -// - -package cgroups - -import ( - "testing" - - "github.com/stretchr/testify/assert" -) - -//very very basic test; should be expanded -func TestNew(t *testing.T) { - assert := assert.New(t) - - // create a cgroupfs cgroup manager - c := &Config{ - Cgroups: nil, - CgroupPath: "", - } - - mgr, err := New(c) - assert.NoError(err) - assert.NotNil(mgr.mgr) - - // create a systemd cgroup manager - s := &Config{ - Cgroups: nil, - CgroupPath: "system.slice:kubepod:container", - } - - mgr, err = New(s) - assert.NoError(err) - assert.NotNil(mgr.mgr) - -} diff --git a/src/runtime/virtcontainers/pkg/cgroups/utils.go b/src/runtime/virtcontainers/pkg/cgroups/utils.go index f5540f173..2915c471c 100644 --- a/src/runtime/virtcontainers/pkg/cgroups/utils.go +++ b/src/runtime/virtcontainers/pkg/cgroups/utils.go @@ -49,13 +49,13 @@ func ValidCgroupPath(path string, systemdCgroup bool) (string, error) { // In the case of an absolute path (starting with /), the runtime MUST // take the path to be relative to the cgroups mount point. if filepath.IsAbs(path) { - return RenameCgroupPath(filepath.Clean(path)) + return filepath.Clean(path), nil } // In the case of a relative path (not starting with /), the runtime MAY // interpret the path relative to a runtime-determined location in the cgroups hierarchy. // clean up path and return a new path relative to DefaultCgroupPath - return RenameCgroupPath(filepath.Join(DefaultCgroupPath, filepath.Clean("/"+path))) + return filepath.Join(DefaultCgroupPath, filepath.Clean("/"+path)), nil } func IsSystemdCgroup(cgroupPath string) bool { diff --git a/src/runtime/virtcontainers/pkg/cgroups/utils_test.go b/src/runtime/virtcontainers/pkg/cgroups/utils_test.go index 27c7aa709..f02623d90 100644 --- a/src/runtime/virtcontainers/pkg/cgroups/utils_test.go +++ b/src/runtime/virtcontainers/pkg/cgroups/utils_test.go @@ -62,6 +62,8 @@ func TestValidCgroupPath(t *testing.T) { {"/../hi", false, false}, {"/../hi/foo", false, false}, {"o / m /../ g", false, false}, + {"/overhead/foobar", false, false}, + {"/sys/fs/cgroup/cpu/sandbox/kata_foobar", false, false}, // invalid systemd paths {"o / m /../ g", true, true}, @@ -93,13 +95,13 @@ func TestValidCgroupPath(t *testing.T) { if filepath.IsAbs(t.path) { cleanPath := filepath.Dir(filepath.Clean(t.path)) assert.True(strings.HasPrefix(path, cleanPath), - "%v should have prefix %v", cleanPath) + "%v should have prefix %v", path, cleanPath) } else if t.systemdCgroup { assert.Equal(t.path, path) } else { - assert.True(strings.HasPrefix(path, "/"+CgroupKataPrefix) || + assert.True( strings.HasPrefix(path, DefaultCgroupPath), - "%v should have prefix /%v or %v", path, CgroupKataPrefix, DefaultCgroupPath) + "%v should have prefix /%v", path, DefaultCgroupPath) } } diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index de113a699..4812d5a51 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -20,9 +20,7 @@ import ( "sync" "syscall" - "github.com/containerd/cgroups" "github.com/containernetworking/plugins/pkg/ns" - "github.com/opencontainers/runc/libcontainer/configs" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" "github.com/sirupsen/logrus" @@ -39,7 +37,7 @@ import ( pbTypes "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations" - vccgroups "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cgroups" + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cgroups" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/compatoci" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/cpuset" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/rootless" @@ -118,10 +116,6 @@ type SandboxConfig struct { // Experimental features enabled Experimental []exp.Feature - // Cgroups specifies specific cgroup settings for the various subsystems that the container is - // placed into to limit the resources the container has available - Cgroups *configs.Cgroup - // Annotations keys must be unique strings and must be name-spaced // with e.g. reverse domain notation (org.clearlinux.key). Annotations map[string]string @@ -188,8 +182,8 @@ type Sandbox struct { config *SandboxConfig annotationsLock *sync.RWMutex wg *sync.WaitGroup - sandboxCgroup *vccgroups.Manager - overheadCgroup *vccgroups.Manager + sandboxCgroup *cgroups.Cgroup + overheadCgroup *cgroups.Cgroup cw *consoleWatcher containers map[string]*Container @@ -592,6 +586,13 @@ func (s *Sandbox) createCgroups() error { // Kata relies on the cgroup parent created and configured by the container // engine by default. The exception is for devices whitelist as well as sandbox-level // CPUSet. + // For the sandbox cgroups we create and manage, rename the base of the cgroup path to + // include "kata_" + cgroupPath, err = cgroups.RenameCgroupPath(cgroupPath) + if err != nil { + return err + } + if spec.Linux.Resources != nil { resources.Devices = spec.Linux.Resources.Devices @@ -637,7 +638,7 @@ func (s *Sandbox) createCgroups() error { if s.devManager != nil { for _, d := range s.devManager.GetAllDevices() { - dev, err := vccgroups.DeviceToLinuxDevice(d.GetHostPath()) + dev, err := cgroups.DeviceToLinuxDevice(d.GetHostPath()) if err != nil { s.Logger().WithError(err).WithField("device", d.GetHostPath()).Warn("Could not add device to sandbox resources") continue @@ -650,22 +651,14 @@ func (s *Sandbox) createCgroups() error { // Depending on the SandboxCgroupOnly value, this cgroup // will either hold all the pod threads (SandboxCgroupOnly is true) // or only the virtual CPU ones (SandboxCgroupOnly is false). - if s.sandboxCgroup, err = vccgroups.New( - &vccgroups.Config{ - Cgroups: s.config.Cgroups, - CgroupPaths: s.state.CgroupPaths, - Resources: resources, - CgroupPath: cgroupPath, - }, - ); err != nil { - return err + s.sandboxCgroup, err = cgroups.NewSandboxCgroup(cgroupPath, &resources) + if err != nil { + return fmt.Errorf("Could not create the sandbox cgroup %v", err) } - // Now that the sandbox cgroup is created, we can set the state cgroup root path. - s.state.CgroupPath, err = vccgroups.ValidCgroupPath(cgroupPath, s.config.SystemdCgroup) - if err != nil { - return fmt.Errorf("Invalid cgroup path: %v", err) - } + // Now that the sandbox cgroup is created, we can set the state cgroup root paths. + s.state.SandboxCgroupPath = s.sandboxCgroup.Path() + s.state.OverheadCgroupPath = "" if s.config.SandboxCgroupOnly { s.overheadCgroup = nil @@ -674,16 +667,12 @@ func (s *Sandbox) createCgroups() error { // into the sandbox cgroup. // We're creating an overhead cgroup, with no constraints. Everything but // the vCPU threads will eventually make it there. - if s.overheadCgroup, err = vccgroups.New( - &vccgroups.Config{ - Cgroups: nil, - CgroupPaths: nil, - Resources: specs.LinuxResources{}, - CgroupPath: cgroupKataOverheadPath, - }, - ); err != nil { + overheadCgroup, err := cgroups.NewCgroup(fmt.Sprintf("/%s/%s", cgroupKataOverheadPath, s.id), &specs.LinuxResources{}) + if err != nil { return err } + s.overheadCgroup = overheadCgroup + s.state.OverheadCgroupPath = s.overheadCgroup.Path() } return nil @@ -1540,33 +1529,15 @@ func (s *Sandbox) StatsContainer(ctx context.Context, containerID string) (Conta // Stats returns the stats of a running sandbox func (s *Sandbox) Stats(ctx context.Context) (SandboxStats, error) { - if s.state.CgroupPath == "" { - return SandboxStats{}, fmt.Errorf("sandbox cgroup path is empty") - } - var path string - var cgroupSubsystems cgroups.Hierarchy - - if s.config.SandboxCgroupOnly { - cgroupSubsystems = cgroups.V1 - path = s.state.CgroupPath - } else { - cgroupSubsystems = V1NoConstraints - path = cgroupNoConstraintsPath(s.state.CgroupPath) - } - - cgroup, err := cgroupsLoadFunc(cgroupSubsystems, cgroups.StaticPath(path)) - if err != nil { - return SandboxStats{}, fmt.Errorf("Could not load sandbox cgroup in %v: %v", s.state.CgroupPath, err) - } - - metrics, err := cgroup.Stat(cgroups.ErrorHandler(cgroups.IgnoreNotExist)) + metrics, err := s.sandboxCgroup.Stat() if err != nil { return SandboxStats{}, err } stats := SandboxStats{} + // TODO Do we want to aggregate the overhead cgroup stats to the sandbox ones? stats.CgroupStats.CPUStats.CPUUsage.TotalUsage = metrics.CPU.Usage.Total stats.CgroupStats.MemoryStats.Usage.Usage = metrics.Memory.Usage.Usage tids, err := s.hypervisor.getThreadIDs(ctx) @@ -1796,15 +1767,9 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy span, ctx := katatrace.Trace(ctx, s.Logger(), "HotplugAddDevice", sandboxTracingTags, map[string]string{"sandbox_id": s.id}) defer span.End() - if s.config.SandboxCgroupOnly { - // We are about to add a device to the hypervisor, - // the device cgroup MUST be updated since the hypervisor - // will need access to such device - hdev := device.GetHostPath() - if err := s.sandboxCgroup.AddDevice(ctx, hdev); err != nil { - s.Logger().WithError(err).WithField("device", hdev). - Warn("Could not add device to cgroup") - } + if err := s.sandboxCgroup.AddDevice(device.GetHostPath()); err != nil { + s.Logger().WithError(err).WithField("device", device). + Warn("Could not add device to cgroup") } switch devType { @@ -1852,14 +1817,9 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy // Sandbox implement DeviceReceiver interface from device/api/interface.go func (s *Sandbox) HotplugRemoveDevice(ctx context.Context, device api.Device, devType config.DeviceType) error { defer func() { - if s.config.SandboxCgroupOnly { - // Remove device from cgroup, the hypervisor - // should not have access to such device anymore. - hdev := device.GetHostPath() - if err := s.sandboxCgroup.RemoveDevice(hdev); err != nil { - s.Logger().WithError(err).WithField("device", hdev). - Warn("Could not remove device from cgroup") - } + if err := s.sandboxCgroup.RemoveDevice(device.GetHostPath()); err != nil { + s.Logger().WithError(err).WithField("device", device). + Warn("Could not add device to cgroup") } }() @@ -2140,7 +2100,7 @@ func (s *Sandbox) cgroupsUpdate(ctx context.Context) error { } // We update the sandbox cgroup with potentially new virtual CPUs. - if err := s.sandboxCgroup.SetCPUSet(cpuset, memset); err != nil { + if err := s.sandboxCgroup.UpdateCpuSet(cpuset, memset); err != nil { return err } @@ -2160,21 +2120,39 @@ func (s *Sandbox) cgroupsUpdate(ctx context.Context) error { // to the parent and then delete the sandbox cgroup func (s *Sandbox) cgroupsDelete() error { s.Logger().Debug("Deleting sandbox cgroup") - if s.state.CgroupPath == "" { - s.Logger().Warnf("sandbox cgroups path is empty") + if s.state.SandboxCgroupPath == "" { + s.Logger().Warnf("sandbox cgroup path is empty") return nil } - if s.overheadCgroup != nil { - if err := s.overheadCgroup.Destroy(); err != nil { - return err - } + sandboxCgroup, err := cgroups.Load(s.state.SandboxCgroupPath) + if err != nil { + return err } - if err := s.sandboxCgroup.Destroy(); err != nil { + if err := sandboxCgroup.MoveToParent(); err != nil { return err } + if err := sandboxCgroup.Delete(); err != nil { + return err + } + + if s.state.OverheadCgroupPath != "" { + overheadCgroup, err := cgroups.Load(s.state.OverheadCgroupPath) + if err != nil { + return err + } + + if err := s.overheadCgroup.MoveToParent(); err != nil { + return err + } + + if err := overheadCgroup.Delete(); err != nil { + return err + } + } + return nil } @@ -2187,7 +2165,7 @@ func (s *Sandbox) constrainHypervisor(ctx context.Context) error { // All vCPU threads move to the sandbox cgroup. for _, i := range tids.vcpus { - if err := s.sandboxCgroup.Add(i); err != nil { + if err := s.sandboxCgroup.AddTask(i); err != nil { return err } } @@ -2198,8 +2176,6 @@ func (s *Sandbox) constrainHypervisor(ctx context.Context) error { // setupCgroups adds the runtime process to either the sandbox cgroup or the overhead one, // depending on the sandbox_cgroup_only configuration setting. func (s *Sandbox) setupCgroups() error { - var err error - vmmCgroup := s.sandboxCgroup if s.overheadCgroup != nil { vmmCgroup = s.overheadCgroup @@ -2211,27 +2187,8 @@ func (s *Sandbox) setupCgroups() error { // then move the vCPU threads between cgroups. runtimePid := os.Getpid() // Add the runtime to the VMM sandbox cgroup - if err := vmmCgroup.Add(runtimePid); err != nil { - return fmt.Errorf("Could not add runtime PID %d to sandbox cgroup: %v", runtimePid, err) - } - - // `Apply` updates the sandbox cgroup Cgroups and CgroupPaths, - // they both need to be saved since they are used to create - // or restore the sandbox cgroup. - if s.config.Cgroups, err = vmmCgroup.GetCgroups(); err != nil { - return fmt.Errorf("Could not get cgroup configuration: %v", err) - } - - s.state.CgroupPaths = vmmCgroup.GetPaths() - - if err := s.sandboxCgroup.Apply(); err != nil { - return fmt.Errorf("Could not constrain cgroup: %v", err) - } - - if s.overheadCgroup != nil { - if err = s.overheadCgroup.Apply(); err != nil { - return fmt.Errorf("Could not constrain cgroup: %v", err) - } + if err := vmmCgroup.AddProcess(runtimePid); err != nil { + return fmt.Errorf("Could not add runtime PID %d to sandbox cgroup: %v", runtimePid, err) } return nil diff --git a/src/runtime/virtcontainers/sandbox_test.go b/src/runtime/virtcontainers/sandbox_test.go index a39f241f4..b445d0893 100644 --- a/src/runtime/virtcontainers/sandbox_test.go +++ b/src/runtime/virtcontainers/sandbox_test.go @@ -52,6 +52,10 @@ func testCreateSandbox(t *testing.T, id string, nconfig NetworkConfig, containers []ContainerConfig, volumes []types.Volume) (*Sandbox, error) { + if tc.NotValid(ktu.NeedRoot()) { + t.Skip(testDisabledAsNonRoot) + } + sconfig := SandboxConfig{ ID: id, HypervisorType: htype, @@ -963,22 +967,6 @@ func TestEnterContainer(t *testing.T) { assert.Nil(t, err, "Enter container failed: %v", err) } -func TestDeleteStoreWhenCreateContainerFail(t *testing.T) { - hypervisorConfig := newHypervisorConfig(nil, nil) - s, err := testCreateSandbox(t, testSandboxID, MockHypervisor, hypervisorConfig, NetworkConfig{}, nil, nil) - if err != nil { - t.Fatal(err) - } - defer cleanUp() - - contID := "999" - contConfig := newTestContainerConfigNoop(contID) - contConfig.RootFs = RootFs{Target: "", Mounted: true} - s.state.CgroupPath = filepath.Join(testDir, "bad-cgroup") - _, err = s.CreateContainer(context.Background(), contConfig) - assert.NotNil(t, err, "Should fail to create container due to wrong cgroup") -} - func TestDeleteStoreWhenNewContainerFail(t *testing.T) { hConfig := newHypervisorConfig(nil, nil) p, err := testCreateSandbox(t, testSandboxID, MockHypervisor, hConfig, NetworkConfig{}, nil, nil) @@ -1451,7 +1439,7 @@ func TestSandboxExperimentalFeature(t *testing.T) { assert.True(t, sconfig.valid()) } -func TestSandbox_SetupSandboxCgroup(t *testing.T) { +func TestSandbox_Cgroups(t *testing.T) { sandboxContainer := ContainerConfig{} sandboxContainer.Annotations = make(map[string]string) sandboxContainer.Annotations[annotations.ContainerTypeKey] = string(PodSandbox) @@ -1486,8 +1474,8 @@ func TestSandbox_SetupSandboxCgroup(t *testing.T) { { "New sandbox, new config", &Sandbox{config: &SandboxConfig{}}, - true, false, + true, }, { "sandbox, container no sandbox type", @@ -1495,8 +1483,8 @@ func TestSandbox_SetupSandboxCgroup(t *testing.T) { config: &SandboxConfig{Containers: []ContainerConfig{ {}, }}}, - true, false, + true, }, { "sandbox, container sandbox type", @@ -1504,8 +1492,8 @@ func TestSandbox_SetupSandboxCgroup(t *testing.T) { config: &SandboxConfig{Containers: []ContainerConfig{ sandboxContainer, }}}, - true, false, + true, }, { "sandbox, empty linux json", @@ -1532,9 +1520,16 @@ func TestSandbox_SetupSandboxCgroup(t *testing.T) { } t.Run(tt.name, func(t *testing.T) { - tt.s.createCgroupManager() - if err := tt.s.setupSandboxCgroup(); (err != nil) != tt.wantErr { - t.Errorf("Sandbox.SetupSandboxCgroupOnly() error = %v, wantErr %v", err, tt.wantErr) + err := tt.s.createCgroups() + t.Logf("create groups error %v", err) + if (err != nil) != tt.wantErr { + t.Errorf("Sandbox.CreateCgroups() error = %v, wantErr %v", err, tt.wantErr) + } + + if err == nil { + if err := tt.s.setupCgroups(); (err != nil) != tt.wantErr { + t.Errorf("Sandbox.SetupCgroups() error = %v, wantErr %v", err, tt.wantErr) + } } }) } diff --git a/src/runtime/virtcontainers/types/sandbox.go b/src/runtime/virtcontainers/types/sandbox.go index 902017a6f..f4fc3e503 100644 --- a/src/runtime/virtcontainers/types/sandbox.go +++ b/src/runtime/virtcontainers/types/sandbox.go @@ -50,9 +50,15 @@ type SandboxState struct { State StateString `json:"state"` - // CgroupPath is the cgroup hierarchy where sandbox's processes - // including the hypervisor are placed. - CgroupPath string `json:"cgroupPath,omitempty"` + // SandboxCgroupPath is the cgroup path for all the sandbox processes, + // when sandbox_cgroup_only is set. When it's not set, part of those + // processes will be living under the overhead cgroup. + SandboxCgroupPath string `json:"sandboxCgroupPath,omitempty"` + + // OverheadCgroupPath is the path to the optional overhead cgroup + // path holding processes that should not be part of the sandbox + // cgroup. + OverheadCgroupPath string `json:"overheadCgroupPath,omitempty"` // PersistVersion indicates current storage api version. // It's also known as ABI version of kata-runtime.