From 34fe3b9d6d69d9fd7b1b5cfbc3c6fced7f17bc38 Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Thu, 13 Sep 2018 20:10:29 +0800 Subject: [PATCH] cgroups: add host cgroup support Fixes #344 Add host cgroup support for kata. This commits only adds cpu.cfs_period and cpu.cfs_quota support. It will create 3-level hierarchy, take "cpu" cgroup as an example: ``` /sys/fs/cgroup |---cpu |---kata |--- |--vcpu |--- ``` * `vc` cgroup is common parent for all kata-container sandbox, it won't be removed after sandbox removed. This cgroup has no limitation. * `` cgroup is the layer for each sandbox, it contains all other qemu threads except for vcpu threads. In future, we can consider putting all shim processes and proxy process here. This cgroup has no limitation yet. * `vcpu` cgroup contains vcpu threads from qemu. Currently cpu quota and period constraint applies to this cgroup. Signed-off-by: Wei Zhang Signed-off-by: Jingxiao Lu --- cli/create.go | 113 ---------------- cli/create_test.go | 122 ----------------- cli/delete.go | 14 +- cli/oci.go | 132 ------------------- cli/oci_test.go | 196 --------------------------- cli/utils.go | 6 - virtcontainers/api.go | 5 + virtcontainers/cgroups.go | 191 +++++++++++++++++++++++++++ virtcontainers/cgroups_test.go | 211 ++++++++++++++++++++++++++++++ virtcontainers/hypervisor.go | 5 + virtcontainers/mock_hypervisor.go | 10 +- virtcontainers/qemu.go | 24 ++++ virtcontainers/sandbox.go | 25 +++- 13 files changed, 468 insertions(+), 586 deletions(-) create mode 100644 virtcontainers/cgroups.go create mode 100644 virtcontainers/cgroups_test.go diff --git a/cli/create.go b/cli/create.go index 0bbc288ca..3cd868b02 100644 --- a/cli/create.go +++ b/cli/create.go @@ -10,9 +10,7 @@ import ( "context" "errors" "fmt" - "io/ioutil" "os" - "path/filepath" "strings" vc "github.com/kata-containers/runtime/virtcontainers" @@ -177,26 +175,6 @@ func create(ctx context.Context, containerID, bundlePath, console, pidFilePath s } } - // config.json provides a cgroups path that has to be used to create "tasks" - // and "cgroups.procs" files. Those files have to be filled with a PID, which - // is shim's in our case. This is mandatory to make sure there is no one - // else (like Docker) trying to create those files on our behalf. We want to - // know those files location so that we can remove them when delete is called. - cgroupsPathList, err := processCgroupsPath(ctx, ociSpec, containerType.IsSandbox()) - if err != nil { - return err - } - - // cgroupsDirPath is CgroupsPath fetch from OCI spec - var cgroupsDirPath string - if ociSpec.Linux != nil { - cgroupsDirPath = ociSpec.Linux.CgroupsPath - } - - if err := createCgroupsFiles(ctx, containerID, cgroupsDirPath, cgroupsPathList, process.Pid); err != nil { - return err - } - // Creation of PID file has to be the last thing done in the create // because containerd considers the create complete after this file // is created. @@ -379,52 +357,6 @@ func createContainer(ctx context.Context, ociSpec oci.CompatOCISpec, containerID return c.Process(), nil } -func createCgroupsFiles(ctx context.Context, containerID string, cgroupsDirPath string, cgroupsPathList []string, pid int) error { - span, _ := trace(ctx, "createCgroupsFiles") - defer span.Finish() - - if len(cgroupsPathList) == 0 { - kataLog.WithField("pid", pid).Info("Cgroups files not created because cgroupsPath was empty") - return nil - } - - for _, cgroupsPath := range cgroupsPathList { - if err := os.MkdirAll(cgroupsPath, cgroupsDirMode); err != nil { - return err - } - - if strings.Contains(cgroupsPath, "cpu") && cgroupsDirPath != "" { - parent := strings.TrimSuffix(cgroupsPath, cgroupsDirPath) - copyParentCPUSet(cgroupsPath, parent) - } - - tasksFilePath := filepath.Join(cgroupsPath, cgroupsTasksFile) - procsFilePath := filepath.Join(cgroupsPath, cgroupsProcsFile) - - pidStr := fmt.Sprintf("%d", pid) - - for _, path := range []string{tasksFilePath, procsFilePath} { - f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, cgroupsFileMode) - if err != nil { - return err - } - defer f.Close() - - n, err := f.WriteString(pidStr) - if err != nil { - return err - } - - if n < len(pidStr) { - return fmt.Errorf("Could not write pid to %q: only %d bytes written out of %d", - path, n, len(pidStr)) - } - } - } - - return nil -} - func createPIDFile(ctx context.Context, pidFilePath string, pid int) error { span, _ := trace(ctx, "createPIDFile") defer span.Finish() @@ -457,48 +389,3 @@ func createPIDFile(ctx context.Context, pidFilePath string, pid int) error { return nil } - -// copyParentCPUSet copies the cpuset.cpus and cpuset.mems from the parent -// directory to the current directory if the file's contents are 0 -func copyParentCPUSet(current, parent string) error { - currentCpus, currentMems, err := getCPUSet(current) - if err != nil { - return err - } - - parentCpus, parentMems, err := getCPUSet(parent) - if err != nil { - return err - } - - if len(parentCpus) < 1 || len(parentMems) < 1 { - return nil - } - - var cgroupsFileMode = os.FileMode(0600) - if isEmptyString(currentCpus) { - if err := writeFile(filepath.Join(current, "cpuset.cpus"), string(parentCpus), cgroupsFileMode); err != nil { - return err - } - } - - if isEmptyString(currentMems) { - if err := writeFile(filepath.Join(current, "cpuset.mems"), string(parentMems), cgroupsFileMode); err != nil { - return err - } - } - - return nil -} - -func getCPUSet(parent string) (cpus []byte, mems []byte, err error) { - if cpus, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.cpus")); err != nil { - return - } - - if mems, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.mems")); err != nil { - return - } - - return cpus, mems, nil -} diff --git a/cli/create_test.go b/cli/create_test.go index 362ec4820..02cd95ddf 100644 --- a/cli/create_test.go +++ b/cli/create_test.go @@ -36,22 +36,6 @@ const ( var testStrPID = fmt.Sprintf("%d", testPID) -func mockCPUSetContent(contents map[string]string) error { - for filePath, data := range contents { - if err := writeFile(filePath, data, testFileMode); err != nil { - return err - } - } - - return nil -} - -func testCreateCgroupsFilesSuccessful(t *testing.T, cgroupsDirPath string, cgroupsPathList []string, pid int) { - if err := createCgroupsFiles(context.Background(), "foo", cgroupsDirPath, cgroupsPathList, pid); err != nil { - t.Fatalf("This test should succeed (cgroupsPath %q, pid %d): %s", cgroupsPathList, pid, err) - } -} - // return the value of the *last* param with the specified key func findLastParam(key string, params []vc.Param) (string, error) { if key == "" { @@ -74,62 +58,6 @@ func findLastParam(key string, params []vc.Param) (string, error) { return "", fmt.Errorf("no param called %q found", name) } -func TestCgroupsFilesEmptyCgroupsPathSuccessful(t *testing.T) { - testCreateCgroupsFilesSuccessful(t, "", []string{}, testPID) -} - -func TestCreateCgroupsFilesFailToWriteFile(t *testing.T) { - if os.Geteuid() == 0 { - // The os.FileMode(0000) trick doesn't work for root. - t.Skip(testDisabledNeedNonRoot) - } - - assert := assert.New(t) - - tmpdir, err := ioutil.TempDir("", "") - assert.NoError(err) - defer os.RemoveAll(tmpdir) - - // create the file as a directory to force an error - file := filepath.Join(tmpdir, "cgroups-file") - err = os.MkdirAll(file, os.FileMode(0000)) - assert.NoError(err) - - files := []string{file} - - err = createCgroupsFiles(context.Background(), "foo", "cgroups-file", files, testPID) - assert.Error(err) -} - -func TestCgroupsFilesNonEmptyCgroupsPathSuccessful(t *testing.T) { - cgroupsPath, err := ioutil.TempDir(testDir, "cgroups-path-") - if err != nil { - t.Fatalf("Could not create temporary cgroups directory: %s", err) - } - - testCreateCgroupsFilesSuccessful(t, "cgroups-path-", []string{cgroupsPath}, testPID) - - defer os.RemoveAll(cgroupsPath) - - tasksPath := filepath.Join(cgroupsPath, cgroupsTasksFile) - procsPath := filepath.Join(cgroupsPath, cgroupsProcsFile) - - for _, path := range []string{tasksPath, procsPath} { - if _, err := os.Stat(path); err != nil { - t.Fatalf("Path %q should have been created: %s", path, err) - } - - fileBytes, err := ioutil.ReadFile(path) - if err != nil { - t.Fatalf("Could not read %q previously created: %s", path, err) - } - - if string(fileBytes) != testStrPID { - t.Fatalf("PID %s read from %q different from expected PID %s", string(fileBytes), path, testStrPID) - } - } -} - func TestCreatePIDFileSuccessful(t *testing.T) { pidDirPath, err := ioutil.TempDir(testDir, "pid-path-") if err != nil { @@ -1087,56 +1015,6 @@ func TestCreateCreateContainer(t *testing.T) { } } -func TestCopyParentCPUSetFail(t *testing.T) { - assert := assert.New(t) - - cgroupsPath, err := ioutil.TempDir(testDir, "cgroups-path-") - assert.NoError(err) - defer os.RemoveAll(cgroupsPath) - - err = copyParentCPUSet(cgroupsPath, testDir) - assert.Error(err) -} - -func TestCopyParentCPUSetSuccessful(t *testing.T) { - assert := assert.New(t) - - cgroupsPath, err := ioutil.TempDir(testDir, "cgroups-path-") - assert.NoError(err) - defer os.RemoveAll(cgroupsPath) - - cgroupsSrcPath := filepath.Join(cgroupsPath, "src") - err = os.Mkdir(cgroupsSrcPath, testDirMode) - assert.NoError(err) - - err = mockCPUSetContent(map[string]string{ - filepath.Join(cgroupsSrcPath, "cpuset.cpus"): "0-1", - filepath.Join(cgroupsSrcPath, "cpuset.mems"): "0-1", - }) - assert.NoError(err) - - cgroupsDstPath := filepath.Join(cgroupsPath, "dst") - err = os.Mkdir(cgroupsDstPath, testDirMode) - assert.NoError(err) - - fd, err := os.Create(filepath.Join(cgroupsDstPath, "cpuset.cpus")) - assert.NoError(err) - fd.Close() - - fd, err = os.Create(filepath.Join(cgroupsDstPath, "cpuset.mems")) - assert.NoError(err) - fd.Close() - - err = copyParentCPUSet(cgroupsDstPath, cgroupsSrcPath) - assert.NoError(err) - - currentCpus, currentMems, err := getCPUSet(cgroupsDstPath) - assert.NoError(err) - - assert.False(isEmptyString(currentCpus)) - assert.False(isEmptyString(currentMems)) -} - func TestSetKernelParams(t *testing.T) { assert := assert.New(t) diff --git a/cli/delete.go b/cli/delete.go index ff939f064..d7104f47d 100644 --- a/cli/delete.go +++ b/cli/delete.go @@ -123,19 +123,7 @@ func delete(ctx context.Context, containerID string, force bool) error { return err } - // In order to prevent any file descriptor leak related to cgroups files - // that have been previously created, we have to remove them before this - // function returns. - cgroupsPathList, err := processCgroupsPath(ctx, ociSpec, containerType.IsSandbox()) - if err != nil { - return err - } - - if err := delContainerIDMapping(ctx, containerID); err != nil { - return err - } - - return removeCgroupsPath(ctx, containerID, cgroupsPathList) + return delContainerIDMapping(ctx, containerID) } func deleteSandbox(ctx context.Context, sandboxID string) error { diff --git a/cli/oci.go b/cli/oci.go index 928ab3e6e..c54e6d60f 100644 --- a/cli/oci.go +++ b/cli/oci.go @@ -8,7 +8,6 @@ package main import ( "bufio" "context" - "errors" "fmt" "io/ioutil" "net" @@ -20,18 +19,11 @@ import ( "github.com/containernetworking/plugins/pkg/ns" vc "github.com/kata-containers/runtime/virtcontainers" - "github.com/kata-containers/runtime/virtcontainers/pkg/oci" "github.com/opencontainers/runc/libcontainer/utils" - specs "github.com/opencontainers/runtime-spec/specs-go" - "github.com/sirupsen/logrus" ) // Contants related to cgroup memory directory const ( - cgroupsTasksFile = "tasks" - cgroupsProcsFile = "cgroup.procs" - cgroupsDirMode = os.FileMode(0750) - cgroupsFileMode = os.FileMode(0640) ctrsMappingDirMode = os.FileMode(0750) // Filesystem type corresponding to CGROUP_SUPER_MAGIC as listed @@ -39,8 +31,6 @@ const ( cgroupFsType = 0x27e0eb ) -var errNeedLinuxResource = errors.New("Linux resource cannot be empty") - var cgroupsDirPath string var procMountInfo = "/proc/self/mountinfo" @@ -125,128 +115,6 @@ func validCreateParams(ctx context.Context, containerID, bundlePath string) (str return resolved, nil } -// processCgroupsPath process the cgroups path as expected from the -// OCI runtime specification. It returns a list of complete paths -// that should be created and used for every specified resource. -func processCgroupsPath(ctx context.Context, ociSpec oci.CompatOCISpec, isSandbox bool) ([]string, error) { - span, _ := trace(ctx, "processCgroupsPath") - defer span.Finish() - - var cgroupsPathList []string - - if ociSpec.Linux.CgroupsPath == "" { - return []string{}, nil - } - - if ociSpec.Linux.Resources == nil { - return []string{}, nil - } - - if ociSpec.Linux.Resources.Memory != nil { - memCgroupsPath, err := processCgroupsPathForResource(ociSpec, "memory", isSandbox) - if err != nil { - return []string{}, err - } - - if memCgroupsPath != "" { - cgroupsPathList = append(cgroupsPathList, memCgroupsPath) - } - } - - if ociSpec.Linux.Resources.CPU != nil { - cpuCgroupsPath, err := processCgroupsPathForResource(ociSpec, "cpu", isSandbox) - if err != nil { - return []string{}, err - } - - if cpuCgroupsPath != "" { - cgroupsPathList = append(cgroupsPathList, cpuCgroupsPath) - } - } - - if ociSpec.Linux.Resources.Pids != nil { - pidsCgroupsPath, err := processCgroupsPathForResource(ociSpec, "pids", isSandbox) - if err != nil { - return []string{}, err - } - - if pidsCgroupsPath != "" { - cgroupsPathList = append(cgroupsPathList, pidsCgroupsPath) - } - } - - if ociSpec.Linux.Resources.BlockIO != nil { - blkIOCgroupsPath, err := processCgroupsPathForResource(ociSpec, "blkio", isSandbox) - if err != nil { - return []string{}, err - } - - if blkIOCgroupsPath != "" { - cgroupsPathList = append(cgroupsPathList, blkIOCgroupsPath) - } - } - - return cgroupsPathList, nil -} - -func processCgroupsPathForResource(ociSpec oci.CompatOCISpec, resource string, isSandbox bool) (string, error) { - if resource == "" { - return "", errNeedLinuxResource - } - - var err error - cgroupsDirPath, err = getCgroupsDirPath(procMountInfo) - if err != nil { - return "", fmt.Errorf("get CgroupsDirPath error: %s", err) - } - - // Relative cgroups path provided. - if filepath.IsAbs(ociSpec.Linux.CgroupsPath) == false { - return filepath.Join(cgroupsDirPath, resource, ociSpec.Linux.CgroupsPath), nil - } - - // Absolute cgroups path provided. - var cgroupMount specs.Mount - cgroupMountFound := false - for _, mount := range ociSpec.Mounts { - if mount.Type == "cgroup" { - cgroupMount = mount - cgroupMountFound = true - break - } - } - - if !cgroupMountFound { - // According to the OCI spec, an absolute path should be - // interpreted as relative to the system cgroup mount point - // when there is no cgroup mount point. - return filepath.Join(cgroupsDirPath, resource, ociSpec.Linux.CgroupsPath), nil - } - - if cgroupMount.Destination == "" { - return "", fmt.Errorf("cgroupsPath is absolute, cgroup mount destination cannot be empty") - } - - cgroupPath := filepath.Join(cgroupMount.Destination, resource) - - // It is not an error to have this cgroup not mounted. It is usually - // due to an old kernel version with missing support for specific - // cgroups. - fields := logrus.Fields{ - "path": cgroupPath, - "type": "cgroup", - } - - if !isCgroupMounted(cgroupPath) { - kataLog.WithFields(fields).Info("path not mounted") - return "", nil - } - - kataLog.WithFields(fields).Info("path mounted") - - return filepath.Join(cgroupPath, ociSpec.Linux.CgroupsPath), nil -} - func isCgroupMounted(cgroupPath string) bool { var statFs syscall.Statfs_t diff --git a/cli/oci_test.go b/cli/oci_test.go index 7ab3c4940..1fd9428d0 100644 --- a/cli/oci_test.go +++ b/cli/oci_test.go @@ -13,17 +13,13 @@ import ( "net" "os" "path/filepath" - "reflect" - "syscall" "testing" "time" vc "github.com/kata-containers/runtime/virtcontainers" vcAnnotations "github.com/kata-containers/runtime/virtcontainers/pkg/annotations" - "github.com/kata-containers/runtime/virtcontainers/pkg/oci" "github.com/kata-containers/runtime/virtcontainers/pkg/vcmock" "github.com/opencontainers/runc/libcontainer/utils" - specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/stretchr/testify/assert" ) @@ -32,38 +28,6 @@ var ( consoleSocketPathTest = "console-socket-test" ) -type cgroupTestDataType struct { - resource string - linuxSpec *specs.LinuxResources -} - -var cgroupTestData = []cgroupTestDataType{ - { - "memory", - &specs.LinuxResources{ - Memory: &specs.LinuxMemory{}, - }, - }, - { - "cpu", - &specs.LinuxResources{ - CPU: &specs.LinuxCPU{}, - }, - }, - { - "pids", - &specs.LinuxResources{ - Pids: &specs.LinuxPids{}, - }, - }, - { - "blkio", - &specs.LinuxResources{ - BlockIO: &specs.LinuxBlockIO{}, - }, - }, -} - func TestGetContainerInfoContainerIDEmptyFailure(t *testing.T) { assert := assert.New(t) status, _, err := getContainerInfo(context.Background(), "") @@ -181,141 +145,6 @@ func TestValidCreateParamsBundleIsAFile(t *testing.T) { assert.False(vcmock.IsMockError(err)) } -func testProcessCgroupsPath(t *testing.T, ociSpec oci.CompatOCISpec, expected []string) { - assert := assert.New(t) - result, err := processCgroupsPath(context.Background(), ociSpec, true) - - assert.NoError(err) - - if reflect.DeepEqual(result, expected) == false { - assert.FailNow("DeepEqual failed", "Result path %q should match the expected one %q", result, expected) - } -} - -func TestProcessCgroupsPathEmptyPathSuccessful(t *testing.T) { - ociSpec := oci.CompatOCISpec{} - - ociSpec.Linux = &specs.Linux{ - CgroupsPath: "", - } - - testProcessCgroupsPath(t, ociSpec, []string{}) -} - -func TestProcessCgroupsPathEmptyResources(t *testing.T) { - ociSpec := oci.CompatOCISpec{} - - ociSpec.Linux = &specs.Linux{ - CgroupsPath: "foo", - } - - testProcessCgroupsPath(t, ociSpec, []string{}) -} - -func TestProcessCgroupsPathRelativePathSuccessful(t *testing.T) { - relativeCgroupsPath := "relative/cgroups/path" - cgroupsDirPath = "/foo/runtime/base" - - ociSpec := oci.CompatOCISpec{} - - ociSpec.Linux = &specs.Linux{ - CgroupsPath: relativeCgroupsPath, - } - - for _, d := range cgroupTestData { - ociSpec.Linux.Resources = d.linuxSpec - - p := filepath.Join(cgroupsDirPath, d.resource, relativeCgroupsPath) - - testProcessCgroupsPath(t, ociSpec, []string{p}) - } -} - -func TestProcessCgroupsPathAbsoluteNoCgroupMountSuccessful(t *testing.T) { - absoluteCgroupsPath := "/absolute/cgroups/path" - cgroupsDirPath = "/foo/runtime/base" - - ociSpec := oci.CompatOCISpec{} - - ociSpec.Linux = &specs.Linux{ - CgroupsPath: absoluteCgroupsPath, - } - - for _, d := range cgroupTestData { - ociSpec.Linux.Resources = d.linuxSpec - - p := filepath.Join(cgroupsDirPath, d.resource, absoluteCgroupsPath) - - testProcessCgroupsPath(t, ociSpec, []string{p}) - } -} - -func TestProcessCgroupsPathAbsoluteNoCgroupMountDestinationFailure(t *testing.T) { - assert := assert.New(t) - absoluteCgroupsPath := "/absolute/cgroups/path" - - ociSpec := oci.CompatOCISpec{} - - ociSpec.Mounts = []specs.Mount{ - { - Type: "cgroup", - }, - } - - ociSpec.Linux = &specs.Linux{ - CgroupsPath: absoluteCgroupsPath, - } - - for _, d := range cgroupTestData { - ociSpec.Linux.Resources = d.linuxSpec - for _, isSandbox := range []bool{true, false} { - _, err := processCgroupsPath(context.Background(), ociSpec, isSandbox) - assert.Error(err, "This test should fail because no cgroup mount destination provided") - } - } -} - -func TestProcessCgroupsPathAbsoluteSuccessful(t *testing.T) { - assert := assert.New(t) - - if os.Geteuid() != 0 { - t.Skip(testDisabledNeedRoot) - } - - memoryResource := "memory" - absoluteCgroupsPath := "/cgroup/mount/destination" - - cgroupMountDest, err := ioutil.TempDir("", "cgroup-memory-") - assert.NoError(err) - defer os.RemoveAll(cgroupMountDest) - - resourceMountPath := filepath.Join(cgroupMountDest, memoryResource) - err = os.MkdirAll(resourceMountPath, cgroupsDirMode) - assert.NoError(err) - - err = syscall.Mount("go-test", resourceMountPath, "cgroup", 0, memoryResource) - assert.NoError(err) - defer syscall.Unmount(resourceMountPath, 0) - - ociSpec := oci.CompatOCISpec{} - - ociSpec.Linux = &specs.Linux{ - Resources: &specs.LinuxResources{ - Memory: &specs.LinuxMemory{}, - }, - CgroupsPath: absoluteCgroupsPath, - } - - ociSpec.Mounts = []specs.Mount{ - { - Type: "cgroup", - Destination: cgroupMountDest, - }, - } - - testProcessCgroupsPath(t, ociSpec, []string{filepath.Join(resourceMountPath, absoluteCgroupsPath)}) -} - func TestSetupConsoleExistingConsolePathSuccessful(t *testing.T) { assert := assert.New(t) console, err := setupConsole(consolePathTest, "") @@ -436,31 +265,6 @@ func TestIsCgroupMounted(t *testing.T) { assert.True(isCgroupMounted(memoryCgroupPath), "%s is a cgroup", memoryCgroupPath) } -func TestProcessCgroupsPathForResource(t *testing.T) { - assert := assert.New(t) - - tmpdir, err := ioutil.TempDir("", "") - assert.NoError(err) - defer os.RemoveAll(tmpdir) - - bundlePath := filepath.Join(tmpdir, "bundle") - - err = makeOCIBundle(bundlePath) - assert.NoError(err) - - ociConfigFile := filepath.Join(bundlePath, specConfig) - assert.True(fileExists(ociConfigFile)) - - spec, err := readOCIConfigFile(ociConfigFile) - assert.NoError(err) - - for _, isSandbox := range []bool{true, false} { - _, err := processCgroupsPathForResource(spec, "", isSandbox) - assert.Error(err) - assert.False(vcmock.IsMockError(err)) - } -} - func TestGetCgroupsDirPath(t *testing.T) { assert := assert.New(t) diff --git a/cli/utils.go b/cli/utils.go index b544e0ba7..b13dff933 100644 --- a/cli/utils.go +++ b/cli/utils.go @@ -6,7 +6,6 @@ package main import ( - "bytes" "fmt" "io/ioutil" "os" @@ -227,11 +226,6 @@ func writeFile(filePath string, data string, fileMode os.FileMode) error { return nil } -// isEmptyString return if string is empty -func isEmptyString(b []byte) bool { - return len(bytes.Trim(b, "\n")) == 0 -} - // fileSize returns the number of bytes in the specified file func fileSize(file string) (int64, error) { st := syscall.Stat_t{} diff --git a/virtcontainers/api.go b/virtcontainers/api.go index 64e65f2c6..dce9e1f0c 100644 --- a/virtcontainers/api.go +++ b/virtcontainers/api.go @@ -126,6 +126,11 @@ func createSandboxFromConfig(ctx context.Context, sandboxConfig SandboxConfig, f return nil, err } + // Setup host cgroups + if err := s.setupCgroups(); err != nil { + return nil, err + } + return s, nil } diff --git a/virtcontainers/cgroups.go b/virtcontainers/cgroups.go new file mode 100644 index 000000000..9ad0e8e2f --- /dev/null +++ b/virtcontainers/cgroups.go @@ -0,0 +1,191 @@ +// Copyright (c) 2018 Huawei Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +package virtcontainers + +import ( + "encoding/json" + "fmt" + + "github.com/containerd/cgroups" + "github.com/kata-containers/runtime/virtcontainers/pkg/annotations" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +const ( + vcpuGroupName = "vcpu" + defaultCgroupParent = "/kata" +) + +type sandboxCgroups struct { + commonParent cgroups.Cgroup + sandboxSub cgroups.Cgroup + vcpuSub cgroups.Cgroup +} + +func (s *Sandbox) newCgroups() error { + // New will still succeed when cgroup exists + // create common parent for all kata-containers + // e.g. /sys/fs/cgroup/cpu/vc + parent, err := cgroups.New(cgroups.V1, + cgroups.StaticPath(defaultCgroupParent), &specs.LinuxResources{}) + if err != nil { + return fmt.Errorf("failed to create cgroup for %q", defaultCgroupParent) + } + + // create sub-cgroup for each sandbox + // e.g. /sys/fs/cgroup/cpu/vc/ + sandboxSub, err := parent.New(s.id, &specs.LinuxResources{}) + if err != nil { + return fmt.Errorf("failed to create cgroup for %s/%s", defaultCgroupParent, s.id) + } + + // create sub-cgroup for vcpu threads + vcpuSub, err := sandboxSub.New(vcpuGroupName, &specs.LinuxResources{}) + if err != nil { + return fmt.Errorf("failed to create cgroup for %s/%s/%s", defaultCgroupParent, s.id, vcpuGroupName) + } + + s.cgroup = &sandboxCgroups{ + commonParent: parent, + sandboxSub: sandboxSub, + vcpuSub: vcpuSub, + } + return nil +} + +func (s *Sandbox) destroyCgroups() error { + if s.cgroup == nil { + s.Logger().Warningf("cgroup is not initialized, no need to destroy") + return nil + } + + // first move all processes in subgroup to parent in case live process blocks + // deletion of cgroup + if err := s.cgroup.sandboxSub.MoveTo(s.cgroup.commonParent); err != nil { + return fmt.Errorf("failed to clear cgroup processes") + } + + return s.cgroup.sandboxSub.Delete() +} + +func (s *Sandbox) setupCgroups() error { + if s.cgroup == nil { + return fmt.Errorf("failed to setup uninitialized cgroup for sandbox") + } + + resource, err := s.mergeSpecResource() + if err != nil { + return err + } + + if err := s.applyCPUCgroup(resource); err != nil { + return err + } + return nil +} + +func (s *Sandbox) applyCPUCgroup(rc *specs.LinuxResources) error { + if s.cgroup == nil { + return fmt.Errorf("failed to setup uninitialized cgroup for sandbox") + } + + // apply cpu constraint to vcpu cgroup + if err := s.cgroup.vcpuSub.Update(rc); err != nil { + return err + } + + // when new container joins, new CPU could be hotplugged, so we + // have to query fresh vcpu info from hypervisor for every time. + tids, err := s.hypervisor.getThreadIDs() + if err != nil || tids == nil { + return fmt.Errorf("failed to get thread ids from hypervisor: %v", err) + } + + // use Add() to add vcpu thread to s.cgroup, it will write thread id to + // `cgroup.procs` which will move all threads in qemu process to this cgroup + // immediately as default behaviour. + if len(tids.vcpus) > 0 { + if err := s.cgroup.sandboxSub.Add(cgroups.Process{ + Pid: tids.vcpus[0], + }); err != nil { + return err + } + } + + for _, i := range tids.vcpus { + if i <= 0 { + continue + } + + // In contrast, AddTask will write thread id to `tasks` + // After this, vcpu threads are in "vcpu" sub-cgroup, other threads in + // qemu will be left in parent cgroup untouched. + if err := s.cgroup.vcpuSub.AddTask(cgroups.Process{ + Pid: i, + }); err != nil { + return err + } + } + + return nil +} + +func (s *Sandbox) mergeSpecResource() (*specs.LinuxResources, error) { + if s.config == nil { + return nil, fmt.Errorf("sandbox config is nil") + } + + resource := &specs.LinuxResources{ + CPU: &specs.LinuxCPU{}, + } + + for _, c := range s.config.Containers { + config, ok := c.Annotations[annotations.ConfigJSONKey] + if !ok { + s.Logger().WithField("container", c.ID).Warningf("failed to find config from container annotations") + continue + } + + var spec specs.Spec + if err := json.Unmarshal([]byte(config), &spec); err != nil { + return nil, err + } + + // TODO: how to handle empty/unlimited resource? + // maybe we should add a default CPU/Memory delta when no + // resource limit is given. -- @WeiZhang555 + if spec.Linux == nil || spec.Linux.Resources == nil { + continue + } + // calculate cpu quota and period + s.mergeCPUResource(resource, spec.Linux.Resources) + } + return resource, nil +} + +func (s *Sandbox) mergeCPUResource(orig, rc *specs.LinuxResources) { + if orig.CPU == nil { + orig.CPU = &specs.LinuxCPU{} + } + + if rc.CPU != nil && rc.CPU.Quota != nil && rc.CPU.Period != nil && + *rc.CPU.Quota > 0 && *rc.CPU.Period > 0 { + if orig.CPU.Period == nil { + orig.CPU.Period = rc.CPU.Period + orig.CPU.Quota = rc.CPU.Quota + } else { + // this is an example to show how it works: + // container A and `orig` has quota: 5000 and period 10000 + // here comes container B with quota 40 and period 100, + // so use previous period 10000 as a baseline, container B + // has proportional resource of quota 4000 and period 10000, calculated as + // delta := 40 / 100 * 10000 = 4000 + // and final `*orig.CPU.Quota` = 5000 + 4000 = 9000 + delta := float64(*rc.CPU.Quota) / float64(*rc.CPU.Period) * float64(*orig.CPU.Period) + *orig.CPU.Quota += int64(delta) + } + } +} diff --git a/virtcontainers/cgroups_test.go b/virtcontainers/cgroups_test.go new file mode 100644 index 000000000..1a89690fb --- /dev/null +++ b/virtcontainers/cgroups_test.go @@ -0,0 +1,211 @@ +// Copyright (c) 2018 Huawei Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +package virtcontainers + +import ( + "bufio" + "encoding/json" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "reflect" + "strings" + "testing" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/stretchr/testify/assert" + + "github.com/kata-containers/runtime/virtcontainers/pkg/annotations" +) + +func getCgroupDestination(subsystem string) (string, error) { + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "", err + } + defer f.Close() + s := bufio.NewScanner(f) + for s.Scan() { + if err := s.Err(); err != nil { + return "", err + } + fields := strings.Fields(s.Text()) + for _, opt := range strings.Split(fields[len(fields)-1], ",") { + if opt == subsystem { + return fields[4], nil + } + } + } + return "", fmt.Errorf("failed to find cgroup mountpoint for %q", subsystem) +} + +func TestMergeSpecResource(t *testing.T) { + s := &Sandbox{ + config: &SandboxConfig{ + Containers: []ContainerConfig{ + { + ID: "containerA", + Annotations: make(map[string]string), + }, + { + ID: "containerA", + Annotations: make(map[string]string), + }, + }, + }, + } + + contA := s.config.Containers[0] + contB := s.config.Containers[1] + + getIntP := func(x int64) *int64 { return &x } + getUintP := func(x uint64) *uint64 { return &x } + + type testData struct { + first *specs.LinuxResources + second *specs.LinuxResources + expected *specs.LinuxResources + } + + for _, testdata := range []testData{ + { + nil, + nil, + &specs.LinuxResources{CPU: &specs.LinuxCPU{}}, + }, + { + nil, + &specs.LinuxResources{}, + &specs.LinuxResources{CPU: &specs.LinuxCPU{}}, + }, + { + &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(0), Period: getUintP(100000)}}, + &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(20000), Period: getUintP(100000)}}, + &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(20000), Period: getUintP(100000)}}, + }, + { + &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(10000), Period: getUintP(0)}}, + &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(20000), Period: getUintP(100000)}}, + &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(20000), Period: getUintP(100000)}}, + }, + { + &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(1000), Period: getUintP(2000)}}, + &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(20000), Period: getUintP(100000)}}, + &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(1400), Period: getUintP(2000)}}, + }, + } { + data, err := json.Marshal(&specs.Spec{ + Linux: &specs.Linux{ + Resources: testdata.first, + }, + }) + assert.Nil(t, err) + contA.Annotations[annotations.ConfigJSONKey] = string(data) + + data, err = json.Marshal(&specs.Spec{ + Linux: &specs.Linux{ + Resources: testdata.second, + }, + }) + assert.Nil(t, err) + contB.Annotations[annotations.ConfigJSONKey] = string(data) + + rc, err := s.mergeSpecResource() + assert.Nil(t, err) + assert.True(t, reflect.DeepEqual(rc, testdata.expected), "should be equal, got: %#v, expected: %#v", rc, testdata.expected) + } +} + +func TestSetupCgroups(t *testing.T) { + if os.Geteuid() != 0 { + t.Skip("Test disabled as requires root privileges") + } + + s := &Sandbox{ + id: "test-sandbox", + hypervisor: &mockHypervisor{}, + config: &SandboxConfig{ + Containers: []ContainerConfig{ + { + ID: "containerA", + Annotations: make(map[string]string), + }, + { + ID: "containerA", + Annotations: make(map[string]string), + }, + }, + }, + } + + contA := s.config.Containers[0] + contB := s.config.Containers[1] + + getIntP := func(x int64) *int64 { return &x } + getUintP := func(x uint64) *uint64 { return &x } + + data, err := json.Marshal(&specs.Spec{ + Linux: &specs.Linux{ + Resources: &specs.LinuxResources{ + CPU: &specs.LinuxCPU{ + Quota: getIntP(5000), + Period: getUintP(10000), + }, + }, + }, + }) + assert.Nil(t, err) + contA.Annotations[annotations.ConfigJSONKey] = string(data) + + data, err = json.Marshal(&specs.Spec{ + Linux: &specs.Linux{ + Resources: &specs.LinuxResources{ + CPU: &specs.LinuxCPU{ + Quota: getIntP(10000), + Period: getUintP(40000), + }, + }, + }, + }) + assert.Nil(t, err) + contB.Annotations[annotations.ConfigJSONKey] = string(data) + + err = s.newCgroups() + assert.Nil(t, err, "failed to create cgroups") + + defer s.destroyCgroups() + + // test if function works without error + err = s.setupCgroups() + assert.Nil(t, err, "setup host cgroup failed") + + // test if the quota and period value are written into cgroup files + cpu, err := getCgroupDestination("cpu") + assert.Nil(t, err, "failed to get cpu cgroup path") + assert.NotEqual(t, "", cpu, "cpu cgroup value can't be empty") + + parentDir := filepath.Join(cpu, defaultCgroupParent, "test-sandbox", "vcpu") + quotaFile := filepath.Join(parentDir, "cpu.cfs_quota_us") + periodFile := filepath.Join(parentDir, "cpu.cfs_period_us") + + expectedQuota := "7500\n" + expectedPeriod := "10000\n" + + fquota, err := os.Open(quotaFile) + assert.Nil(t, err, "open file %q failed", quotaFile) + defer fquota.Close() + data, err = ioutil.ReadAll(fquota) + assert.Nil(t, err) + assert.Equal(t, expectedQuota, string(data), "failed to get expected cfs_quota") + + fperiod, err := os.Open(periodFile) + assert.Nil(t, err, "open file %q failed", periodFile) + defer fperiod.Close() + data, err = ioutil.ReadAll(fperiod) + assert.Nil(t, err) + assert.Equal(t, expectedPeriod, string(data), "failed to get expected cfs_period") +} diff --git a/virtcontainers/hypervisor.go b/virtcontainers/hypervisor.go index d94591ff7..5e143fe46 100644 --- a/virtcontainers/hypervisor.go +++ b/virtcontainers/hypervisor.go @@ -252,6 +252,10 @@ type HypervisorConfig struct { DisableVhostNet bool } +type threadIDs struct { + vcpus []int +} + func (conf *HypervisorConfig) checkTemplateConfig() error { if conf.BootToBeTemplate && conf.BootFromTemplate { return fmt.Errorf("Cannot set both 'to be' and 'from' vm tempate") @@ -571,4 +575,5 @@ type hypervisor interface { disconnect() capabilities() capabilities hypervisorConfig() HypervisorConfig + getThreadIDs() (*threadIDs, error) } diff --git a/virtcontainers/mock_hypervisor.go b/virtcontainers/mock_hypervisor.go index ca9c29d9f..de492f003 100644 --- a/virtcontainers/mock_hypervisor.go +++ b/virtcontainers/mock_hypervisor.go @@ -5,7 +5,10 @@ package virtcontainers -import "context" +import ( + "context" + "os" +) type mockHypervisor struct { } @@ -86,3 +89,8 @@ func (m *mockHypervisor) getSandboxConsole(sandboxID string) (string, error) { func (m *mockHypervisor) disconnect() { } + +func (m *mockHypervisor) getThreadIDs() (*threadIDs, error) { + vcpus := []int{os.Getpid()} + return &threadIDs{vcpus}, nil +} diff --git a/virtcontainers/qemu.go b/virtcontainers/qemu.go index 3c93a4f64..76ac46733 100644 --- a/virtcontainers/qemu.go +++ b/virtcontainers/qemu.go @@ -1293,3 +1293,27 @@ func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8) govmmQemu return memory } + +func (q *qemu) getThreadIDs() (*threadIDs, error) { + span, _ := q.trace("getThreadIDs") + defer span.Finish() + + err := q.qmpSetup() + if err != nil { + return nil, err + } + + cpuInfos, err := q.qmpMonitorCh.qmp.ExecQueryCpus(q.qmpMonitorCh.ctx) + if err != nil { + q.Logger().WithError(err).Error("failed to query cpu infos") + return nil, err + } + + var tid threadIDs + for _, i := range cpuInfos { + if i.ThreadID > 0 { + tid.vcpus = append(tid.vcpus, i.ThreadID) + } + } + return &tid, nil +} diff --git a/virtcontainers/sandbox.go b/virtcontainers/sandbox.go index e72e3f102..7cffa51f6 100644 --- a/virtcontainers/sandbox.go +++ b/virtcontainers/sandbox.go @@ -30,9 +30,11 @@ import ( "github.com/vishvananda/netlink" ) -// vmStartTimeout represents the time in seconds a sandbox can wait before -// to consider the VM starting operation failed. -const vmStartTimeout = 10 +const ( + // vmStartTimeout represents the time in seconds a sandbox can wait before + // to consider the VM starting operation failed. + vmStartTimeout = 10 +) // stateString is a string representing a sandbox state. type stateString string @@ -491,6 +493,8 @@ type Sandbox struct { stateful bool ctx context.Context + + cgroup *sandboxCgroups } // ID returns the sandbox identifier string. @@ -861,6 +865,11 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor return nil, err } + // create new cgroup for sandbox + if err := s.newCgroups(); err != nil { + return nil, err + } + return s, nil } @@ -978,6 +987,12 @@ func (s *Sandbox) Delete() error { } } + // destroy sandbox cgroup + if err := s.destroyCgroups(); err != nil { + // continue the removal process even cgroup failed to destroy + s.Logger().WithError(err).Error("failed to destroy cgroup") + } + globalSandboxList.removeSandbox(s.id) if s.monitor != nil { @@ -1283,6 +1298,10 @@ func (s *Sandbox) CreateContainer(contConfig ContainerConfig) (VCContainer, erro return nil, err } + // Setup host cgroups for new container + if err := s.setupCgroups(); err != nil { + return nil, err + } return c, nil }