From a1c85902f6823768be97b0f3894a97c5c7a10d18 Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Mon, 28 Jan 2019 14:56:23 -0600 Subject: [PATCH 1/5] virtcontainers: add method to get hypervisor PID hypervisor PID can be used to move the whole process and its threads into a new cgroup. Signed-off-by: Julio Montes --- virtcontainers/fc.go | 4 ++++ virtcontainers/hypervisor.go | 1 + virtcontainers/mock_hypervisor.go | 5 +++++ virtcontainers/qemu.go | 24 ++++++++++++++++++++++++ 4 files changed, 34 insertions(+) diff --git a/virtcontainers/fc.go b/virtcontainers/fc.go index 9111fff653..d8bcad9e4f 100644 --- a/virtcontainers/fc.go +++ b/virtcontainers/fc.go @@ -702,3 +702,7 @@ func (fc *firecracker) getThreadIDs() (*threadIDs, error) { func (fc *firecracker) cleanup() error { return nil } + +func (fc *firecracker) pid() int { + return fc.info.PID +} diff --git a/virtcontainers/hypervisor.go b/virtcontainers/hypervisor.go index dd37f8706d..6966b9e782 100644 --- a/virtcontainers/hypervisor.go +++ b/virtcontainers/hypervisor.go @@ -609,4 +609,5 @@ type hypervisor interface { hypervisorConfig() HypervisorConfig getThreadIDs() (*threadIDs, error) cleanup() error + pid() int } diff --git a/virtcontainers/mock_hypervisor.go b/virtcontainers/mock_hypervisor.go index 37dccaf901..fbf1c379bc 100644 --- a/virtcontainers/mock_hypervisor.go +++ b/virtcontainers/mock_hypervisor.go @@ -14,6 +14,7 @@ import ( ) type mockHypervisor struct { + mockPid int } func (m *mockHypervisor) capabilities() types.Capabilities { @@ -100,3 +101,7 @@ func (m *mockHypervisor) getThreadIDs() (*threadIDs, error) { func (m *mockHypervisor) cleanup() error { return nil } + +func (m *mockHypervisor) pid() int { + return m.mockPid +} diff --git a/virtcontainers/qemu.go b/virtcontainers/qemu.go index 2220108385..d8e0ec27a5 100644 --- a/virtcontainers/qemu.go +++ b/virtcontainers/qemu.go @@ -8,6 +8,7 @@ package virtcontainers import ( "context" "fmt" + "io/ioutil" "math" "os" "path/filepath" @@ -500,6 +501,8 @@ 
func (q *qemu) createSandbox(ctx context.Context, id string, hypervisorConfig *H return err } + pidFile := q.pidFile() + qemuConfig := govmmQemu.Config{ Name: fmt.Sprintf("sandbox-%s", q.id), UUID: q.state.UUID, @@ -518,6 +521,7 @@ func (q *qemu) createSandbox(ctx context.Context, id string, hypervisorConfig *H VGA: "none", GlobalParam: "kvm-pit.lost_tick_policy=discard", Bios: firmwarePath, + PidFile: pidFile, } if ioThread != nil { @@ -1567,3 +1571,23 @@ func (q *qemu) cleanup() error { return nil } + +func (q *qemu) pidFile() string { + return filepath.Join(store.RunVMStoragePath, q.id, "pid") +} + +func (q *qemu) pid() int { + data, err := ioutil.ReadFile(q.pidFile()) + if err != nil { + q.Logger().WithError(err).Error("Could not read qemu pid file") + return 0 + } + + pid, err := strconv.Atoi(strings.Trim(string(data), "\n\t ")) + if err != nil { + q.Logger().WithError(err).Error("Could not convert string to int") + return 0 + } + + return pid +} From 9758cdba7c17a9c87a00280e18a4b62d1a8963a0 Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Mon, 28 Jan 2019 15:01:58 -0600 Subject: [PATCH 2/5] virtcontainers: move cpu cgroup implementation cpu cgroups are container specific hence all containers even the sandbox should be able to create, delete and update their cgroups. The cgroup created matches with the cgroup path passed by the containers manager.
fixes #1117 fixes #1118 fixes #1021 Signed-off-by: Julio Montes --- virtcontainers/container.go | 118 ++++++++++++++++++++++++++++++-- virtcontainers/types/sandbox.go | 12 ++++ 2 files changed, 126 insertions(+), 4 deletions(-) diff --git a/virtcontainers/container.go b/virtcontainers/container.go index 7952dfbd61..30f9a78267 100644 --- a/virtcontainers/container.go +++ b/virtcontainers/container.go @@ -9,6 +9,7 @@ package virtcontainers import ( "context" "encoding/hex" + "encoding/json" "fmt" "io" "os" @@ -16,6 +17,7 @@ import ( "syscall" "time" + "github.com/containerd/cgroups" "github.com/kata-containers/runtime/virtcontainers/pkg/annotations" "github.com/kata-containers/runtime/virtcontainers/types" "github.com/kata-containers/runtime/virtcontainers/utils" @@ -759,10 +761,8 @@ func (c *Container) create() (err error) { } c.process = *process - // If this is a sandbox container, store the pid for sandbox - ann := c.GetAnnotations() - if ann[annotations.ContainerTypeKey] == string(PodSandbox) { - c.sandbox.setSandboxPid(c.process.Pid) + if err = c.newCgroups(); err != nil { + return } // Store the container process returned by the agent. 
@@ -788,6 +788,10 @@ func (c *Container) delete() error { return err } + if err := c.deleteCgroups(); err != nil { + return err + } + return c.store.Delete() } @@ -1055,6 +1059,10 @@ func (c *Container) update(resources specs.LinuxResources) error { return err } + if err := c.updateCgroups(resources); err != nil { + return err + } + return c.sandbox.agent.updateContainer(c.sandbox, *c, resources) } @@ -1243,3 +1251,105 @@ func (c *Container) detachDevices() error { } return nil } + +// creates a new cgroup and return the cgroups path +func (c *Container) newCgroups() error { + ann := c.GetAnnotations() + + config, ok := ann[annotations.ConfigJSONKey] + if !ok { + return fmt.Errorf("Could not find json config in annotations") + } + + var spec specs.Spec + if err := json.Unmarshal([]byte(config), &spec); err != nil { + return err + } + + // https://github.com/kata-containers/runtime/issues/168 + resources := specs.LinuxResources{ + CPU: nil, + } + + if spec.Linux != nil && spec.Linux.Resources != nil { + resources.CPU = validCPUResources(spec.Linux.Resources.CPU) + } + + cgroup, err := cgroupsNewFunc(cgroups.V1, + cgroups.StaticPath(spec.Linux.CgroupsPath), &resources) + if err != nil { + return fmt.Errorf("Could not create cgroup for %v: %v", spec.Linux.CgroupsPath, err) + } + + c.state.Resources = resources + c.state.CgroupPath = spec.Linux.CgroupsPath + + // Add shim into cgroup + if c.process.Pid > 0 { + if err := cgroup.Add(cgroups.Process{Pid: c.process.Pid}); err != nil { + return fmt.Errorf("Could not add PID %d to cgroup %v: %v", c.process.Pid, spec.Linux.CgroupsPath, err) + } + } + + return nil +} + +func (c *Container) deleteCgroups() error { + cgroup, err := cgroupsLoadFunc(cgroups.V1, + cgroups.StaticPath(c.state.CgroupPath)) + + if err == cgroups.ErrCgroupDeleted { + // cgroup already deleted + return nil + } + + if err != nil { + return fmt.Errorf("Could not load container cgroup %v: %v", c.state.CgroupPath, err) + } + + // move running process here, 
that way cgroup can be removed + parent, err := parentCgroup(c.state.CgroupPath) + if err != nil { + // parent cgroup doesn't exist, that means there are no process running + // and the container cgroup was removed. + c.Logger().WithError(err).Warn("Container cgroup doesn't exist") + return nil + } + + if err := cgroup.MoveTo(parent); err != nil { + // Don't fail, cgroup can be deleted + c.Logger().WithError(err).Warn("Could not move container process into parent cgroup") + } + + if err := cgroup.Delete(); err != nil { + return fmt.Errorf("Could not delete container cgroup %v: %v", c.state.CgroupPath, err) + } + + return nil +} + +func (c *Container) updateCgroups(resources specs.LinuxResources) error { + cgroup, err := cgroupsLoadFunc(cgroups.V1, + cgroups.StaticPath(c.state.CgroupPath)) + if err != nil { + return fmt.Errorf("Could not load cgroup %v: %v", c.state.CgroupPath, err) + } + + // Issue: https://github.com/kata-containers/runtime/issues/168 + r := specs.LinuxResources{ + CPU: validCPUResources(resources.CPU), + } + + // update cgroup + if err := cgroup.Update(&r); err != nil { + return fmt.Errorf("Could not update cgroup %v: %v", c.state.CgroupPath, err) + } + + // store new resources + c.state.Resources = r + if err := c.store.Store(store.State, c.state); err != nil { + return err + } + + return nil +} diff --git a/virtcontainers/types/sandbox.go b/virtcontainers/types/sandbox.go index 5bf986b834..b197a63333 100644 --- a/virtcontainers/types/sandbox.go +++ b/virtcontainers/types/sandbox.go @@ -8,6 +8,8 @@ package types import ( "fmt" "strings" + + specs "github.com/opencontainers/runtime-spec/specs-go" ) // StateString is a string representing a sandbox state. @@ -44,6 +46,16 @@ type State struct { // GuestMemoryBlockSizeMB is the size of memory block of guestos GuestMemoryBlockSizeMB uint32 `json:"guestMemoryBlockSize"` + + // CgroupPath is the cgroup hierarchy where sandbox's processes + // including the hypervisor are placed. 
+ CgroupPath string `json:"cgroupPath,omitempty"` + + // Resources contains the resources assigned to the container. + // When a container is created resources specified in the config json + // are used, those resources change when a container is updated but + // the config json is not updated. + Resources specs.LinuxResources `json:"resources,omitempty"` } // Valid checks that the sandbox state is valid. From 5201860bb0ad377c7e2263bc9b0c0173900a00d9 Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Mon, 28 Jan 2019 15:22:41 -0600 Subject: [PATCH 3/5] virtcontainers: reimplement sandbox cgroup All containers run in different cgroups even the sandbox, with this new implementation the sandbox cpu cgroup will be equal to the sum of all its containers and the hypervisor process will be placed there impacting the containers running in the sandbox (VM). The default number of vcpus is used when the sandbox has no constraints. For example, if default_vcpus is 2, then quota will be 200000 and period 100000.
**c-ray test** http://www.futuretech.blinkenlights.nl/c-ray.html ``` +=============================================+ | | 6 threads 6cpus | 1 thread 1 cpu | +=============================================+ | current | 40 seconds | 122 seconds | +============================================== | new | 37 seconds | 124 seconds | +============================================== ``` current = current cgroups implementation new = new cgroups implementation **workload** ```yaml apiVersion: v1 kind: Pod metadata: name: c-ray annotations: io.kubernetes.cri.untrusted-workload: "true" spec: restartPolicy: Never containers: - name: c-ray-1 image: docker.io/devimc/c-ray:latest imagePullPolicy: IfNotPresent args: ["-t", "6", "-s", "1600x1200", "-r", "8", "-i", "/c-ray-1.1/sphfract", "-o", "/tmp/output.ppm"] resources: limits: cpu: 6 - name: c-ray-2 image: docker.io/devimc/c-ray:latest imagePullPolicy: IfNotPresent args: ["-t", "1", "-s", "1600x1200", "-r", "8", "-i", "/c-ray-1.1/sphfract", "-o", "/tmp/output.ppm"] resources: limits: cpu: 1 ``` fixes #1153 Signed-off-by: Julio Montes --- virtcontainers/api.go | 5 - virtcontainers/api_test.go | 11 +- virtcontainers/cgroups.go | 390 +++++++++++++++++++++++---------- virtcontainers/cgroups_test.go | 346 ++++++++++++++--------------- virtcontainers/sandbox.go | 42 ++-- virtcontainers/sandbox_test.go | 7 +- 6 files changed, 471 insertions(+), 330 deletions(-) diff --git a/virtcontainers/api.go b/virtcontainers/api.go index 8deded3c1f..007b95ac57 100644 --- a/virtcontainers/api.go +++ b/virtcontainers/api.go @@ -121,11 +121,6 @@ func createSandboxFromConfig(ctx context.Context, sandboxConfig SandboxConfig, f return nil, err } - // Setup host cgroups - if err = s.setupCgroups(); err != nil { - return nil, err - } - return s, nil } diff --git a/virtcontainers/api_test.go b/virtcontainers/api_test.go index ec19e72acc..1d77f812a4 100644 --- a/virtcontainers/api_test.go +++ b/virtcontainers/api_test.go @@ -18,6 +18,7 @@ import ( "testing" 
"github.com/containernetworking/plugins/pkg/ns" + "github.com/kata-containers/runtime/virtcontainers/pkg/annotations" "github.com/kata-containers/runtime/virtcontainers/pkg/mock" vcTypes "github.com/kata-containers/runtime/virtcontainers/pkg/types" "github.com/kata-containers/runtime/virtcontainers/store" @@ -31,13 +32,15 @@ const ( ) var sandboxAnnotations = map[string]string{ - "sandbox.foo": "sandbox.bar", - "sandbox.hello": "sandbox.world", + "sandbox.foo": "sandbox.bar", + "sandbox.hello": "sandbox.world", + annotations.ConfigJSONKey: `{"linux":{"resources":{}}}`, } var containerAnnotations = map[string]string{ - "container.foo": "container.bar", - "container.hello": "container.world", + "container.foo": "container.bar", + "container.hello": "container.world", + annotations.ConfigJSONKey: `{"linux":{"resources":{}}}`, } func newBasicTestCmd() types.Cmd { diff --git a/virtcontainers/cgroups.go b/virtcontainers/cgroups.go index 20fa6b07bc..74c8fce9c6 100644 --- a/virtcontainers/cgroups.go +++ b/virtcontainers/cgroups.go @@ -1,4 +1,5 @@ // Copyright (c) 2018 Huawei Corporation +// Copyright (c) 2019 Intel Corporation // // SPDX-License-Identifier: Apache-2.0 // @@ -6,95 +7,218 @@ package virtcontainers import ( - "encoding/json" + "bufio" "fmt" + "math" + "os" + "path/filepath" + "strings" "github.com/containerd/cgroups" "github.com/kata-containers/runtime/virtcontainers/pkg/annotations" specs "github.com/opencontainers/runtime-spec/specs-go" ) -const ( - vcpuGroupName = "vcpu" - defaultCgroupParent = "/kata" -) - -type sandboxCgroups struct { - commonParent cgroups.Cgroup - sandboxSub cgroups.Cgroup - vcpuSub cgroups.Cgroup +type cgroupPather interface { + cgroups.Subsystem + Path(path string) string } -func (s *Sandbox) newCgroups() error { - // New will still succeed when cgroup exists - // create common parent for all kata-containers - // e.g. 
/sys/fs/cgroup/cpu/vc - parent, err := cgroups.New(cgroups.V1, - cgroups.StaticPath(defaultCgroupParent), &specs.LinuxResources{}) - if err != nil { - return fmt.Errorf("failed to create cgroup for %q", defaultCgroupParent) - } +// unconstrained cgroups are placed here. +// for example /sys/fs/cgroup/memory/kata/$CGPATH +// where path is defined by the containers manager +const cgroupKataPath = "/kata/" - // create sub-cgroup for each sandbox - // e.g. /sys/fs/cgroup/cpu/vc/ - sandboxSub, err := parent.New(s.id, &specs.LinuxResources{}) - if err != nil { - return fmt.Errorf("failed to create cgroup for %s/%s", defaultCgroupParent, s.id) - } +var cgroupsLoadFunc = cgroups.Load +var cgroupsNewFunc = cgroups.New - // create sub-cgroup for vcpu threads - vcpuSub, err := sandboxSub.New(vcpuGroupName, &specs.LinuxResources{}) +// V1Constraints returns the cgroups that are compatible with th VC architecture +// and hypervisor, constraints can be applied to these cgroups. +func V1Constraints() ([]cgroups.Subsystem, error) { + root, err := cgroupV1MountPoint() if err != nil { - return fmt.Errorf("failed to create cgroup for %s/%s/%s", defaultCgroupParent, s.id, vcpuGroupName) + return nil, err } - - s.cgroup = &sandboxCgroups{ - commonParent: parent, - sandboxSub: sandboxSub, - vcpuSub: vcpuSub, + subsystems := []cgroups.Subsystem{ + cgroups.NewCputset(root), + cgroups.NewCpu(root), + cgroups.NewCpuacct(root), } - return nil + return cgroupsSubsystems(subsystems) } -func (s *Sandbox) destroyCgroups() error { - if s.cgroup == nil { - s.Logger().Warningf("cgroup is not initialized, no need to destroy") +// V1NoConstraints returns the cgroups that are *not* compatible with th VC +// architecture and hypervisor, constraints MUST NOT be applied to these cgroups. 
+func V1NoConstraints() ([]cgroups.Subsystem, error) { + root, err := cgroupV1MountPoint() + if err != nil { + return nil, err + } + subsystems := []cgroups.Subsystem{ + // Some constainers managers, like k8s, take the control of cgroups. + // k8s: the memory cgroup for the dns containers is small to place + // a hypervisor there. + cgroups.NewMemory(root), + } + return cgroupsSubsystems(subsystems) +} + +func cgroupsSubsystems(subsystems []cgroups.Subsystem) ([]cgroups.Subsystem, error) { + var enabled []cgroups.Subsystem + for _, s := range cgroupPathers(subsystems) { + // check and remove the default groups that do not exist + if _, err := os.Lstat(s.Path("/")); err == nil { + enabled = append(enabled, s) + } + } + return enabled, nil +} + +func cgroupPathers(subystems []cgroups.Subsystem) []cgroupPather { + var out []cgroupPather + for _, s := range subystems { + if p, ok := s.(cgroupPather); ok { + out = append(out, p) + } + } + return out +} + +// v1MountPoint returns the mount point where the cgroup +// mountpoints are mounted in a single hiearchy +func cgroupV1MountPoint() (string, error) { + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "", err + } + defer f.Close() + scanner := bufio.NewScanner(f) + for scanner.Scan() { + if err := scanner.Err(); err != nil { + return "", err + } + var ( + text = scanner.Text() + fields = strings.Split(text, " ") + // safe as mountinfo encodes mountpoints with spaces as \040. + index = strings.Index(text, " - ") + postSeparatorFields = strings.Fields(text[index+3:]) + numPostFields = len(postSeparatorFields) + ) + // this is an error as we can't detect if the mount is for "cgroup" + if numPostFields == 0 { + return "", fmt.Errorf("Found no fields post '-' in %q", text) + } + if postSeparatorFields[0] == "cgroup" { + // check that the mount is properly formated. 
+ if numPostFields < 3 { + return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) + } + return filepath.Dir(fields[4]), nil + } + } + return "", cgroups.ErrMountPointNotExist +} + +func cgroupNoConstraintsPath(path string) string { + return filepath.Join(cgroupKataPath, path) +} + +// return the parent cgroup for the given path +func parentCgroup(path string) (cgroups.Cgroup, error) { + // append '/' just in case CgroupsPath doesn't start with it + parent := filepath.Dir("/" + path) + + parentCgroup, err := cgroupsLoadFunc(cgroups.V1, + cgroups.StaticPath(parent)) + if err != nil { + return nil, fmt.Errorf("Could not load parent cgroup %v: %v", parent, err) + } + + return parentCgroup, nil +} + +func (s *Sandbox) updateCgroups() error { + if s.state.CgroupPath == "" { + s.Logger().Warn("sandbox's cgroup won't be updated: cgroup path is empty") return nil } - // first move all processes in subgroup to parent in case live process blocks - // deletion of cgroup - if err := s.cgroup.sandboxSub.MoveTo(s.cgroup.commonParent); err != nil { - return fmt.Errorf("failed to clear cgroup processes") + cgroup, err := cgroupsLoadFunc(V1Constraints, cgroups.StaticPath(s.state.CgroupPath)) + if err != nil { + return fmt.Errorf("Could not load cgroup %v: %v", s.state.CgroupPath, err) } - return s.cgroup.sandboxSub.Delete() -} - -func (s *Sandbox) setupCgroups() error { - if s.cgroup == nil { - return fmt.Errorf("failed to setup uninitialized cgroup for sandbox") + if err := s.constrainHypervisor(cgroup); err != nil { + return err } - resource, err := s.mergeSpecResource() + if len(s.containers) <= 1 { + // nothing to update + return nil + } + + resources, err := s.resources() if err != nil { return err } - if err := s.applyCPUCgroup(resource); err != nil { - return err + if err := cgroup.Update(&resources); err != nil { + return fmt.Errorf("Could not update cgroup %v: %v", s.state.CgroupPath, err) } + return nil } -func (s *Sandbox) applyCPUCgroup(rc 
*specs.LinuxResources) error { - if s.cgroup == nil { - return fmt.Errorf("failed to setup uninitialized cgroup for sandbox") +func (s *Sandbox) deleteCgroups() error { + s.Logger().Debug("Deleting sandbox cgroup") + + path := cgroupNoConstraintsPath(s.state.CgroupPath) + s.Logger().WithField("path", path).Debug("Deleting no constraints cgroup") + noConstraintsCgroup, err := cgroupsLoadFunc(V1NoConstraints, cgroups.StaticPath(path)) + if err == cgroups.ErrCgroupDeleted { + // cgroup already deleted + return nil } - // apply cpu constraint to vcpu cgroup - if err := s.cgroup.vcpuSub.Update(rc); err != nil { - return err + if err != nil { + return fmt.Errorf("Could not load cgroup without constraints %v: %v", path, err) + } + + // move running process here, that way cgroup can be removed + parent, err := parentCgroup(path) + if err != nil { + // parent cgroup doesn't exist, that means there are no process running + // and the no constraints cgroup was removed. + s.Logger().WithError(err).Warn("Parent cgroup doesn't exist") + return nil + } + + if err := noConstraintsCgroup.MoveTo(parent); err != nil { + // Don't fail, cgroup can be deleted + s.Logger().WithError(err).Warn("Could not move process from no constraints to parent cgroup") + } + + return noConstraintsCgroup.Delete() +} + +func (s *Sandbox) constrainHypervisor(cgroup cgroups.Cgroup) error { + pid := s.hypervisor.pid() + if pid <= 0 { + return fmt.Errorf("Invalid hypervisor PID: %d", pid) + } + + // Move hypervisor into cgroups without constraints, + // those cgroups are not yet supported. 
+ resources := &specs.LinuxResources{} + path := cgroupNoConstraintsPath(s.state.CgroupPath) + noConstraintsCgroup, err := cgroupsNewFunc(V1NoConstraints, cgroups.StaticPath(path), resources) + if err != nil { + return fmt.Errorf("Could not create cgroup %v: %v", path, err) + } + + if err := noConstraintsCgroup.Add(cgroups.Process{Pid: pid}); err != nil { + return fmt.Errorf("Could not add hypervisor PID %d to cgroup %v: %v", pid, path, err) } // when new container joins, new CPU could be hotplugged, so we @@ -103,33 +227,21 @@ func (s *Sandbox) applyCPUCgroup(rc *specs.LinuxResources) error { if err != nil { return fmt.Errorf("failed to get thread ids from hypervisor: %v", err) } - if tids == nil { + if tids == nil || len(tids.vcpus) == 0 { // If there's no tid returned from the hypervisor, this is not // a bug. It simply means there is nothing to constrain, hence // let's return without any error from here. return nil } - // use Add() to add vcpu thread to s.cgroup, it will write thread id to - // `cgroup.procs` which will move all threads in qemu process to this cgroup - // immediately as default behaviour. - if len(tids.vcpus) > 0 { - if err := s.cgroup.sandboxSub.Add(cgroups.Process{ - Pid: tids.vcpus[0], - }); err != nil { - return err - } - } - + // We are about to move just the vcpus (threads) into cgroups with constraints. + // Move whole hypervisor process whould be easier but the IO/network performance + // whould be impacted. for _, i := range tids.vcpus { - if i <= 0 { - continue - } - // In contrast, AddTask will write thread id to `tasks` // After this, vcpu threads are in "vcpu" sub-cgroup, other threads in // qemu will be left in parent cgroup untouched. 
- if err := s.cgroup.vcpuSub.AddTask(cgroups.Process{ + if err := cgroup.AddTask(cgroups.Process{ Pid: i, }); err != nil { return err @@ -139,59 +251,107 @@ func (s *Sandbox) applyCPUCgroup(rc *specs.LinuxResources) error { return nil } -func (s *Sandbox) mergeSpecResource() (*specs.LinuxResources, error) { - if s.config == nil { - return nil, fmt.Errorf("sandbox config is nil") +func (s *Sandbox) resources() (specs.LinuxResources, error) { + resources := specs.LinuxResources{ + CPU: s.cpuResources(), } - resource := &specs.LinuxResources{ - CPU: &specs.LinuxCPU{}, + return resources, nil +} + +func (s *Sandbox) cpuResources() *specs.LinuxCPU { + quota := int64(0) + period := uint64(0) + shares := uint64(0) + realtimePeriod := uint64(0) + realtimeRuntime := int64(0) + + cpu := &specs.LinuxCPU{ + Quota: "a, + Period: &period, + Shares: &shares, + RealtimePeriod: &realtimePeriod, + RealtimeRuntime: &realtimeRuntime, } - for _, c := range s.config.Containers { - config, ok := c.Annotations[annotations.ConfigJSONKey] - if !ok { - s.Logger().WithField("container", c.ID).Warningf("failed to find config from container annotations") + for _, c := range s.containers { + ann := c.GetAnnotations() + if ann[annotations.ContainerTypeKey] == string(PodSandbox) { + // skip sandbox container continue } - var spec specs.Spec - if err := json.Unmarshal([]byte(config), &spec); err != nil { - return nil, err - } - - // TODO: how to handle empty/unlimited resource? - // maybe we should add a default CPU/Memory delta when no - // resource limit is given. 
-- @WeiZhang555 - if spec.Linux == nil || spec.Linux.Resources == nil { + if c.state.Resources.CPU == nil { continue } - // calculate cpu quota and period - s.mergeCPUResource(resource, spec.Linux.Resources) - } - return resource, nil -} -func (s *Sandbox) mergeCPUResource(orig, rc *specs.LinuxResources) { - if orig.CPU == nil { - orig.CPU = &specs.LinuxCPU{} - } + if c.state.Resources.CPU.Shares != nil { + shares = uint64(math.Max(float64(*c.state.Resources.CPU.Shares), float64(shares))) + } - if rc.CPU != nil && rc.CPU.Quota != nil && rc.CPU.Period != nil && - *rc.CPU.Quota > 0 && *rc.CPU.Period > 0 { - if orig.CPU.Period == nil { - orig.CPU.Period = rc.CPU.Period - orig.CPU.Quota = rc.CPU.Quota - } else { - // this is an example to show how it works: - // container A and `orig` has quota: 5000 and period 10000 - // here comes container B with quota 40 and period 100, - // so use previous period 10000 as a baseline, container B - // has proportional resource of quota 4000 and period 10000, calculated as - // delta := 40 / 100 * 10000 = 4000 - // and final `*orig.CPU.Quota` = 5000 + 4000 = 9000 - delta := float64(*rc.CPU.Quota) / float64(*rc.CPU.Period) * float64(*orig.CPU.Period) - *orig.CPU.Quota += int64(delta) + if c.state.Resources.CPU.Quota != nil { + quota += *c.state.Resources.CPU.Quota + } + + if c.state.Resources.CPU.Period != nil { + period = uint64(math.Max(float64(*c.state.Resources.CPU.Period), float64(period))) + } + + if c.state.Resources.CPU.Cpus != "" { + cpu.Cpus += c.state.Resources.CPU.Cpus + "," + } + + if c.state.Resources.CPU.RealtimeRuntime != nil { + realtimeRuntime += *c.state.Resources.CPU.RealtimeRuntime + } + + if c.state.Resources.CPU.RealtimePeriod != nil { + realtimePeriod += *c.state.Resources.CPU.RealtimePeriod + } + + if c.state.Resources.CPU.Mems != "" { + cpu.Mems += c.state.Resources.CPU.Mems + "," } } + + cpu.Cpus = strings.Trim(cpu.Cpus, " \n\t,") + + // use a default constraint for sandboxes without cpu constraints + if 
period == uint64(0) && quota == int64(0) { + // set a quota and period equal to the default number of vcpus + quota = int64(s.config.HypervisorConfig.NumVCPUs) * 100000 + period = 100000 + } + + return validCPUResources(cpu) +} + +// validCPUResources checks CPU resources coherency +func validCPUResources(cpuSpec *specs.LinuxCPU) *specs.LinuxCPU { + if cpuSpec == nil { + return nil + } + + cpu := *cpuSpec + if cpu.Period != nil && *cpu.Period < 1 { + cpu.Period = nil + } + + if cpu.Quota != nil && *cpu.Quota < 1 { + cpu.Quota = nil + } + + if cpu.Shares != nil && *cpu.Shares < 1 { + cpu.Shares = nil + } + + if cpu.RealtimePeriod != nil && *cpu.RealtimePeriod < 1 { + cpu.RealtimePeriod = nil + } + + if cpu.RealtimeRuntime != nil && *cpu.RealtimeRuntime < 1 { + cpu.RealtimeRuntime = nil + } + + return &cpu } diff --git a/virtcontainers/cgroups_test.go b/virtcontainers/cgroups_test.go index 1a89690fb4..4e93c2dd43 100644 --- a/virtcontainers/cgroups_test.go +++ b/virtcontainers/cgroups_test.go @@ -6,206 +6,188 @@ package virtcontainers import ( - "bufio" - "encoding/json" "fmt" - "io/ioutil" "os" + "os/exec" "path/filepath" - "reflect" - "strings" "testing" + "github.com/containerd/cgroups" + "github.com/kata-containers/runtime/virtcontainers/types" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/stretchr/testify/assert" - - "github.com/kata-containers/runtime/virtcontainers/pkg/annotations" ) -func getCgroupDestination(subsystem string) (string, error) { - f, err := os.Open("/proc/self/mountinfo") - if err != nil { - return "", err - } - defer f.Close() - s := bufio.NewScanner(f) - for s.Scan() { - if err := s.Err(); err != nil { - return "", err - } - fields := strings.Fields(s.Text()) - for _, opt := range strings.Split(fields[len(fields)-1], ",") { - if opt == subsystem { - return fields[4], nil - } - } - } - return "", fmt.Errorf("failed to find cgroup mountpoint for %q", subsystem) +type mockCgroup struct { } -func TestMergeSpecResource(t 
*testing.T) { - s := &Sandbox{ - config: &SandboxConfig{ - Containers: []ContainerConfig{ - { - ID: "containerA", - Annotations: make(map[string]string), - }, - { - ID: "containerA", - Annotations: make(map[string]string), - }, - }, - }, - } - - contA := s.config.Containers[0] - contB := s.config.Containers[1] - - getIntP := func(x int64) *int64 { return &x } - getUintP := func(x uint64) *uint64 { return &x } - - type testData struct { - first *specs.LinuxResources - second *specs.LinuxResources - expected *specs.LinuxResources - } - - for _, testdata := range []testData{ - { - nil, - nil, - &specs.LinuxResources{CPU: &specs.LinuxCPU{}}, - }, - { - nil, - &specs.LinuxResources{}, - &specs.LinuxResources{CPU: &specs.LinuxCPU{}}, - }, - { - &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(0), Period: getUintP(100000)}}, - &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(20000), Period: getUintP(100000)}}, - &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(20000), Period: getUintP(100000)}}, - }, - { - &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(10000), Period: getUintP(0)}}, - &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(20000), Period: getUintP(100000)}}, - &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(20000), Period: getUintP(100000)}}, - }, - { - &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(1000), Period: getUintP(2000)}}, - &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(20000), Period: getUintP(100000)}}, - &specs.LinuxResources{CPU: &specs.LinuxCPU{Quota: getIntP(1400), Period: getUintP(2000)}}, - }, - } { - data, err := json.Marshal(&specs.Spec{ - Linux: &specs.Linux{ - Resources: testdata.first, - }, - }) - assert.Nil(t, err) - contA.Annotations[annotations.ConfigJSONKey] = string(data) - - data, err = json.Marshal(&specs.Spec{ - Linux: &specs.Linux{ - Resources: testdata.second, - }, - }) - assert.Nil(t, err) - contB.Annotations[annotations.ConfigJSONKey] = string(data) 
- - rc, err := s.mergeSpecResource() - assert.Nil(t, err) - assert.True(t, reflect.DeepEqual(rc, testdata.expected), "should be equal, got: %#v, expected: %#v", rc, testdata.expected) - } +func (m *mockCgroup) New(string, *specs.LinuxResources) (cgroups.Cgroup, error) { + return &mockCgroup{}, nil +} +func (m *mockCgroup) Add(cgroups.Process) error { + return nil } -func TestSetupCgroups(t *testing.T) { - if os.Geteuid() != 0 { - t.Skip("Test disabled as requires root privileges") - } +func (m *mockCgroup) AddTask(cgroups.Process) error { + return nil +} + +func (m *mockCgroup) Delete() error { + return nil +} + +func (m *mockCgroup) MoveTo(cgroups.Cgroup) error { + return nil +} + +func (m *mockCgroup) Stat(...cgroups.ErrorHandler) (*cgroups.Metrics, error) { + return &cgroups.Metrics{}, nil +} + +func (m *mockCgroup) Update(resources *specs.LinuxResources) error { + return nil +} + +func (m *mockCgroup) Processes(cgroups.Name, bool) ([]cgroups.Process, error) { + return nil, nil +} + +func (m *mockCgroup) Freeze() error { + return nil +} + +func (m *mockCgroup) Thaw() error { + return nil +} + +func (m *mockCgroup) OOMEventFD() (uintptr, error) { + return 0, nil +} + +func (m *mockCgroup) State() cgroups.State { + return "" +} + +func (m *mockCgroup) Subsystems() []cgroups.Subsystem { + return nil +} + +func mockCgroupNew(hierarchy cgroups.Hierarchy, path cgroups.Path, resources *specs.LinuxResources) (cgroups.Cgroup, error) { + return &mockCgroup{}, nil +} + +func mockCgroupLoad(hierarchy cgroups.Hierarchy, path cgroups.Path) (cgroups.Cgroup, error) { + return &mockCgroup{}, nil +} + +func init() { + cgroupsNewFunc = mockCgroupNew + cgroupsLoadFunc = mockCgroupLoad +} + +func TestV1Constraints(t *testing.T) { + assert := assert.New(t) + + systems, err := V1Constraints() + assert.NoError(err) + assert.NotEmpty(systems) +} + +func TestV1NoConstraints(t *testing.T) { + assert := assert.New(t) + + systems, err := V1NoConstraints() + assert.NoError(err) + 
assert.NotEmpty(systems) +} + +func TestCgroupNoConstraintsPath(t *testing.T) { + assert := assert.New(t) + + cgrouPath := "abc" + expectedPath := filepath.Join(cgroupKataPath, cgrouPath) + path := cgroupNoConstraintsPath(cgrouPath) + assert.Equal(expectedPath, path) +} + +func TestUpdateCgroups(t *testing.T) { + assert := assert.New(t) + + oldCgroupsNew := cgroupsNewFunc + oldCgroupsLoad := cgroupsLoadFunc + cgroupsNewFunc = cgroups.New + cgroupsLoadFunc = cgroups.Load + defer func() { + cgroupsNewFunc = oldCgroupsNew + cgroupsLoadFunc = oldCgroupsLoad + }() s := &Sandbox{ - id: "test-sandbox", - hypervisor: &mockHypervisor{}, - config: &SandboxConfig{ - Containers: []ContainerConfig{ - { - ID: "containerA", - Annotations: make(map[string]string), - }, - { - ID: "containerA", - Annotations: make(map[string]string), - }, + state: types.State{ + CgroupPath: "", + }, + } + + // empty path + err := s.updateCgroups() + assert.NoError(err) + + // path doesn't exist + s.state.CgroupPath = "/abc/123/rgb" + err = s.updateCgroups() + assert.Error(err) + + if os.Getuid() != 0 { + return + } + + s.state.CgroupPath = fmt.Sprintf("/kata-tests-%d", os.Getpid()) + testCgroup, err := cgroups.New(cgroups.V1, cgroups.StaticPath(s.state.CgroupPath), &specs.LinuxResources{}) + assert.NoError(err) + defer testCgroup.Delete() + s.hypervisor = &mockHypervisor{mockPid: 0} + + // bad pid + err = s.updateCgroups() + assert.Error(err) + + // fake workload + cmd := exec.Command("tail", "-f", "/dev/null") + assert.NoError(cmd.Start()) + s.state.Pid = cmd.Process.Pid + s.hypervisor = &mockHypervisor{mockPid: s.state.Pid} + + // no containers + err = s.updateCgroups() + assert.NoError(err) + + s.config = &SandboxConfig{} + s.config.HypervisorConfig.NumVCPUs = 1 + + s.containers = map[string]*Container{ + "abc": { + process: Process{ + Pid: s.state.Pid, + }, + config: &ContainerConfig{ + Annotations: containerAnnotations, + }, + }, + "xyz": { + process: Process{ + Pid: s.state.Pid, + }, + config: 
&ContainerConfig{ + Annotations: containerAnnotations, }, }, } - contA := s.config.Containers[0] - contB := s.config.Containers[1] + err = s.updateCgroups() + assert.NoError(err) - getIntP := func(x int64) *int64 { return &x } - getUintP := func(x uint64) *uint64 { return &x } - - data, err := json.Marshal(&specs.Spec{ - Linux: &specs.Linux{ - Resources: &specs.LinuxResources{ - CPU: &specs.LinuxCPU{ - Quota: getIntP(5000), - Period: getUintP(10000), - }, - }, - }, - }) - assert.Nil(t, err) - contA.Annotations[annotations.ConfigJSONKey] = string(data) - - data, err = json.Marshal(&specs.Spec{ - Linux: &specs.Linux{ - Resources: &specs.LinuxResources{ - CPU: &specs.LinuxCPU{ - Quota: getIntP(10000), - Period: getUintP(40000), - }, - }, - }, - }) - assert.Nil(t, err) - contB.Annotations[annotations.ConfigJSONKey] = string(data) - - err = s.newCgroups() - assert.Nil(t, err, "failed to create cgroups") - - defer s.destroyCgroups() - - // test if function works without error - err = s.setupCgroups() - assert.Nil(t, err, "setup host cgroup failed") - - // test if the quota and period value are written into cgroup files - cpu, err := getCgroupDestination("cpu") - assert.Nil(t, err, "failed to get cpu cgroup path") - assert.NotEqual(t, "", cpu, "cpu cgroup value can't be empty") - - parentDir := filepath.Join(cpu, defaultCgroupParent, "test-sandbox", "vcpu") - quotaFile := filepath.Join(parentDir, "cpu.cfs_quota_us") - periodFile := filepath.Join(parentDir, "cpu.cfs_period_us") - - expectedQuota := "7500\n" - expectedPeriod := "10000\n" - - fquota, err := os.Open(quotaFile) - assert.Nil(t, err, "open file %q failed", quotaFile) - defer fquota.Close() - data, err = ioutil.ReadAll(fquota) - assert.Nil(t, err) - assert.Equal(t, expectedQuota, string(data), "failed to get expected cfs_quota") - - fperiod, err := os.Open(periodFile) - assert.Nil(t, err, "open file %q failed", periodFile) - defer fperiod.Close() - data, err = ioutil.ReadAll(fperiod) - assert.Nil(t, err) - 
assert.Equal(t, expectedPeriod, string(data), "failed to get expected cfs_period") + // cleanup + assert.NoError(cmd.Process.Kill()) + err = s.deleteCgroups() + assert.NoError(err) } diff --git a/virtcontainers/sandbox.go b/virtcontainers/sandbox.go index 1e1e9fc6bf..a72722aa7d 100644 --- a/virtcontainers/sandbox.go +++ b/virtcontainers/sandbox.go @@ -24,6 +24,7 @@ import ( "github.com/kata-containers/runtime/virtcontainers/device/config" "github.com/kata-containers/runtime/virtcontainers/device/drivers" deviceManager "github.com/kata-containers/runtime/virtcontainers/device/manager" + "github.com/kata-containers/runtime/virtcontainers/pkg/annotations" vcTypes "github.com/kata-containers/runtime/virtcontainers/pkg/types" "github.com/kata-containers/runtime/virtcontainers/store" "github.com/kata-containers/runtime/virtcontainers/types" @@ -175,8 +176,6 @@ type Sandbox struct { seccompSupported bool ctx context.Context - - cgroup *sandboxCgroups } // ID returns the sandbox identifier string. 
@@ -541,11 +540,6 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor return nil, err } - // create new cgroup for sandbox - if err := s.newCgroups(); err != nil { - return nil, err - } - return s, nil } @@ -702,10 +696,8 @@ func (s *Sandbox) Delete() error { } } - // destroy sandbox cgroup - if err := s.destroyCgroups(); err != nil { - // continue the removal process even cgroup failed to destroy - s.Logger().WithError(err).Error("failed to destroy cgroup") + if err := s.deleteCgroups(); err != nil { + return err } globalSandboxList.removeSandbox(s.id) @@ -987,6 +979,13 @@ func (s *Sandbox) addContainer(c *Container) error { } s.containers[c.id] = c + ann := c.GetAnnotations() + if ann[annotations.ContainerTypeKey] == string(PodSandbox) { + s.state.Pid = c.process.Pid + s.state.CgroupPath = c.state.CgroupPath + return s.store.Store(store.State, s.state) + } + return nil } @@ -1048,10 +1047,10 @@ func (s *Sandbox) CreateContainer(contConfig ContainerConfig) (VCContainer, erro return nil, err } - // Setup host cgroups for new container - if err := s.setupCgroups(); err != nil { + if err := s.updateCgroups(); err != nil { return nil, err } + return c, nil } @@ -1202,6 +1201,10 @@ func (s *Sandbox) UpdateContainer(containerID string, resources specs.LinuxResou return err } + if err := s.updateCgroups(); err != nil { + return err + } + return c.storeContainer() } @@ -1269,6 +1272,10 @@ func (s *Sandbox) createContainers() error { } } + if err := s.updateCgroups(); err != nil { + return err + } + return nil } @@ -1427,15 +1434,6 @@ func (s *Sandbox) decrementSandboxBlockIndex() error { return nil } -// setSandboxPid sets the Pid of the the shim process belonging to the -// sandbox container as the Pid of the sandbox. 
-func (s *Sandbox) setSandboxPid(pid int) error { - s.state.Pid = pid - - // update on-disk state - return s.store.Store(store.State, s.state) -} - func (s *Sandbox) setContainersState(state types.StateString) error { if state == "" { return errNeedState diff --git a/virtcontainers/sandbox_test.go b/virtcontainers/sandbox_test.go index e5b361b035..c60d123173 100644 --- a/virtcontainers/sandbox_test.go +++ b/virtcontainers/sandbox_test.go @@ -53,6 +53,7 @@ func testCreateSandbox(t *testing.T, id string, NetworkConfig: nconfig, Volumes: volumes, Containers: containers, + Annotations: sandboxAnnotations, } sandbox, err := createSandbox(context.Background(), sconfig, nil) @@ -689,7 +690,8 @@ func TestSandboxGetContainer(t *testing.T) { func TestContainerSetStateBlockIndex(t *testing.T) { containers := []ContainerConfig{ { - ID: "100", + ID: "100", + Annotations: containerAnnotations, }, } @@ -784,7 +786,8 @@ func TestContainerStateSetFstype(t *testing.T) { containers := []ContainerConfig{ { - ID: "100", + ID: "100", + Annotations: containerAnnotations, }, } From 62c393c11900762920d65080a14e9d966c2c5b40 Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Mon, 18 Feb 2019 11:56:10 -0600 Subject: [PATCH 4/5] virtcontainers: change container's state to stop asap container is killed by force, container's state MUST change its state to stop immediately to avoid leaving it in a bad state. fixes #1088 Signed-off-by: Julio Montes --- virtcontainers/container.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/virtcontainers/container.go b/virtcontainers/container.go index 30f9a78267..92a1befc7e 100644 --- a/virtcontainers/container.go +++ b/virtcontainers/container.go @@ -928,6 +928,13 @@ func (c *Container) stop() error { // get failed if the process hasn't exited. 
c.sandbox.agent.waitProcess(c, c.id) + // container was killed by force, container MUST change its state + // as soon as possible just in case one of below operations fail leaving + // the containers in a bad state. + if err := c.setContainerState(types.StateStopped); err != nil { + return err + } + if err := c.sandbox.agent.stopContainer(c.sandbox, *c); err != nil { return err } @@ -940,7 +947,7 @@ func (c *Container) stop() error { return err } - return c.setContainerState(types.StateStopped) + return nil } func (c *Container) enter(cmd types.Cmd) (*Process, error) { From 58d278560e169b63ae301358fb033fec59a3e691 Mon Sep 17 00:00:00 2001 From: Julio Montes Date: Mon, 18 Feb 2019 12:00:55 -0600 Subject: [PATCH 5/5] virtcontainers: don't try to talk with the proxy when it's not running To avoid long timeouts, the runtime shouldn't try to talk with the proxy when it's not running. Signed-off-by: Julio Montes --- virtcontainers/kata_agent.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/virtcontainers/kata_agent.go b/virtcontainers/kata_agent.go index dcf4735d47..f07b891068 100644 --- a/virtcontainers/kata_agent.go +++ b/virtcontainers/kata_agent.go @@ -1569,6 +1569,13 @@ func (k *kataAgent) sendReq(request interface{}) (interface{}, error) { span.SetTag("request", request) defer span.Finish() + if k.state.ProxyPid > 0 { + // check that proxy is running before talk with it avoiding long timeouts + if err := syscall.Kill(k.state.ProxyPid, syscall.Signal(0)); err != nil { + return nil, fmt.Errorf("Proxy is not running: %v", err) + } + } + if err := k.connect(); err != nil { return nil, err }