diff --git a/Makefile b/Makefile index eecc9d4097..879b8a9aa3 100644 --- a/Makefile +++ b/Makefile @@ -183,6 +183,9 @@ DEFDISABLENESTINGCHECKS := false DEFMSIZE9P := 8192 DEFHOTPLUGVFIOONROOTBUS := false +# Default cgroup model +DEFSANDBOXCGROUPONLY ?= false + SED = sed CLI_DIR = cli @@ -424,6 +427,7 @@ USER_VARS += DEFDISABLENESTINGCHECKS USER_VARS += DEFMSIZE9P USER_VARS += DEFHOTPLUGVFIOONROOTBUS USER_VARS += DEFENTROPYSOURCE +USER_VARS += DEFSANDBOXCGROUPONLY USER_VARS += BUILDFLAGS @@ -579,6 +583,7 @@ $(GENERATED_FILES): %: %.in $(MAKEFILE_LIST) VERSION .git-commit -e "s|@DEFMSIZE9P@|$(DEFMSIZE9P)|g" \ -e "s|@DEFHOTPLUGONROOTBUS@|$(DEFHOTPLUGVFIOONROOTBUS)|g" \ -e "s|@DEFENTROPYSOURCE@|$(DEFENTROPYSOURCE)|g" \ + -e "s|@DEFSANDBOXCGROUPONLY@|$(DEFSANDBOXCGROUPONLY)|g" \ $< > $@ generate-config: $(CONFIGS) diff --git a/cli/config/configuration-acrn.toml.in b/cli/config/configuration-acrn.toml.in index b3da087443..b38dd3436c 100644 --- a/cli/config/configuration-acrn.toml.in +++ b/cli/config/configuration-acrn.toml.in @@ -228,6 +228,14 @@ disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@ # (default: false) #disable_new_netns = true +# If enabled, the runtime will add all the kata processes inside one dedicated cgroup. +# The container cgroups in the host are not created, just one single cgroup per sandbox. +# The sandbox cgroup is not constrained by the runtime. +# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox. +# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation. +# See: https://godoc.org/github.com/kata-containers/runtime/virtcontainers#ContainerType +sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@ + # Enabled experimental feature list, format: ["a", "b"]. # Experimental features are features not stable enough for production, # They may break compatibility, and are prepared for a big version bump. diff --git a/cli/config/configuration-fc.toml.in b/cli/config/configuration-fc.toml.in index b9137c114b..520642d914 100644 --- a/cli/config/configuration-fc.toml.in +++ b/cli/config/configuration-fc.toml.in @@ -330,6 +330,14 @@ disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@ # (default: false) #disable_new_netns = true +# If enabled, the runtime will add all the kata processes inside one dedicated cgroup. +# The container cgroups in the host are not created, just one single cgroup per sandbox. +# The sandbox cgroup is not constrained by the runtime. +# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox. +# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation. +# See: https://godoc.org/github.com/kata-containers/runtime/virtcontainers#ContainerType +sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@ + # Enabled experimental feature list, format: ["a", "b"]. # Experimental features are features not stable enough for production, # They may break compatibility, and are prepared for a big version bump. diff --git a/cli/config/configuration-nemu.toml.in b/cli/config/configuration-nemu.toml.in index 9486e2cf4a..3173e364ce 100644 --- a/cli/config/configuration-nemu.toml.in +++ b/cli/config/configuration-nemu.toml.in @@ -404,6 +404,12 @@ disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@ # (default: false) #disable_new_netns = true +# If enabled, the runtime uses the parent cgroup of a container with the PodSandbox annotation.
This +# should be enabled when the caller sets up the parent cgroup of the +# containers running in a sandbox, so that all the resources of the Kata container +# run in the same cgroup and performance isolation is more accurate. +sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@ + # Enabled experimental feature list, format: ["a", "b"]. # Experimental features are features not stable enough for production, # They may break compatibility, and are prepared for a big version bump. diff --git a/cli/config/configuration-qemu.toml.in b/cli/config/configuration-qemu.toml.in index 6d5c84605a..a03ee568fb 100644 --- a/cli/config/configuration-qemu.toml.in +++ b/cli/config/configuration-qemu.toml.in @@ -412,6 +412,14 @@ disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@ # (default: false) #disable_new_netns = true +# If enabled, the runtime will add all the kata processes inside one dedicated cgroup. +# The container cgroups in the host are not created, just one single cgroup per sandbox. +# The sandbox cgroup is not constrained by the runtime. +# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox. +# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation. +# See: https://godoc.org/github.com/kata-containers/runtime/virtcontainers#ContainerType +sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@ + # Enabled experimental feature list, format: ["a", "b"]. # Experimental features are features not stable enough for production, # They may break compatibility, and are prepared for a big version bump. diff --git a/cli/kata-env.go b/cli/kata-env.go index f9c1e9d808..90a73b3c41 100644 --- a/cli/kata-env.go +++ b/cli/kata-env.go @@ -69,6 +69,7 @@ type RuntimeInfo struct { Trace bool DisableGuestSeccomp bool DisableNewNetNs bool + SandboxCgroupOnly bool Experimental []exp.Feature Path string } @@ -187,6 +188,7 @@ func getRuntimeInfo(configFile string, config oci.RuntimeConfig) RuntimeInfo { Config: runtimeConfig, Path: runtimePath, DisableNewNetNs: config.DisableNewNetNs, + SandboxCgroupOnly: config.SandboxCgroupOnly, Experimental: config.Experimental, DisableGuestSeccomp: config.DisableGuestSeccomp, } diff --git a/pkg/katautils/config.go b/pkg/katautils/config.go index 9c3432d2b5..283736abd8 100644 --- a/pkg/katautils/config.go +++ b/pkg/katautils/config.go @@ -133,6 +133,7 @@ type runtime struct { Tracing bool `toml:"enable_tracing"` DisableNewNetNs bool `toml:"disable_new_netns"` DisableGuestSeccomp bool `toml:"disable_guest_seccomp"` + SandboxCgroupOnly bool `toml:"sandbox_cgroup_only"` Experimental []string `toml:"experimental"` InterNetworkModel string `toml:"internetworking_model"` } @@ -1054,6 +1055,7 @@ func LoadConfiguration(configPath string, ignoreLogging, builtIn bool) (resolved config.ProxyConfig = vc.ProxyConfig{Debug: config.Debug} } + config.SandboxCgroupOnly = tomlConf.Runtime.SandboxCgroupOnly config.DisableNewNetNs = tomlConf.Runtime.DisableNewNetNs for _, f := range tomlConf.Runtime.Experimental { feature := exp.Get(f) diff --git a/virtcontainers/api.go b/virtcontainers/api.go index 285d3f6786..769ae660f1 100644 --- a/virtcontainers/api.go +++ b/virtcontainers/api.go @@ -75,6 +75,13 @@ func createSandboxFromConfig(ctx context.Context, sandboxConfig SandboxConfig, f return nil, err } + // Move the runtime to the sandbox cgroup so all processes are created there.
+ if s.config.SandboxCgroupOnly { + if err := s.setupSandboxCgroup(); err != nil { + return nil, err + } + } + // cleanup sandbox resources in case of any failure defer func() { if err != nil { diff --git a/virtcontainers/cgroups.go b/virtcontainers/cgroups.go index 89f91b70ea..ddb0adcdc7 100644 --- a/virtcontainers/cgroups.go +++ b/virtcontainers/cgroups.go @@ -9,13 +9,11 @@ package virtcontainers import ( "bufio" "fmt" - "math" "os" "path/filepath" "strings" "github.com/containerd/cgroups" - "github.com/kata-containers/runtime/virtcontainers/pkg/annotations" specs "github.com/opencontainers/runtime-spec/specs-go" ) @@ -144,194 +142,6 @@ func parentCgroup(hierarchy cgroups.Hierarchy, path string) (cgroups.Cgroup, err return parentCgroup, nil } -func (s *Sandbox) updateCgroups() error { - if s.state.CgroupPath == "" { - s.Logger().Warn("sandbox's cgroup won't be updated: cgroup path is empty") - return nil - } - - cgroup, err := cgroupsLoadFunc(V1Constraints, cgroups.StaticPath(s.state.CgroupPath)) - if err != nil { - return fmt.Errorf("Could not load cgroup %v: %v", s.state.CgroupPath, err) - } - - if err := s.constrainHypervisor(cgroup); err != nil { - return err - } - - if len(s.containers) <= 1 { - // nothing to update - return nil - } - - resources, err := s.resources() - if err != nil { - return err - } - - if err := cgroup.Update(&resources); err != nil { - return fmt.Errorf("Could not update cgroup %v: %v", s.state.CgroupPath, err) - } - - return nil -} - -func (s *Sandbox) deleteCgroups() error { - s.Logger().Debug("Deleting sandbox cgroup") - - path := cgroupNoConstraintsPath(s.state.CgroupPath) - s.Logger().WithField("path", path).Debug("Deleting no constraints cgroup") - noConstraintsCgroup, err := cgroupsLoadFunc(V1NoConstraints, cgroups.StaticPath(path)) - if err == cgroups.ErrCgroupDeleted { - // cgroup already deleted - return nil - } - - if err != nil { - return fmt.Errorf("Could not load cgroup without constraints %v: %v", path, err) - } - - // move running process here, that way cgroup can be removed - parent, err := parentCgroup(V1NoConstraints, path) - if err != nil { - // parent cgroup doesn't exist, that means there are no process running - // and the no constraints cgroup was removed. - s.Logger().WithError(err).Warn("Parent cgroup doesn't exist") - return nil - } - - if err := noConstraintsCgroup.MoveTo(parent); err != nil { - // Don't fail, cgroup can be deleted - s.Logger().WithError(err).Warn("Could not move process from no constraints to parent cgroup") - } - - return noConstraintsCgroup.Delete() -} - -func (s *Sandbox) constrainHypervisor(cgroup cgroups.Cgroup) error { - pids := s.hypervisor.getPids() - if len(pids) == 0 || pids[0] == 0 { - return fmt.Errorf("Invalid hypervisor PID: %+v", pids) - } - - // Move hypervisor into cgroups without constraints, - // those cgroups are not yet supported. 
- resources := &specs.LinuxResources{} - path := cgroupNoConstraintsPath(s.state.CgroupPath) - noConstraintsCgroup, err := cgroupsNewFunc(V1NoConstraints, cgroups.StaticPath(path), resources) - if err != nil { - return fmt.Errorf("Could not create cgroup %v: %v", path, err) - } - for _, pid := range pids { - if pid <= 0 { - s.Logger().Warnf("Invalid hypervisor pid: %d", pid) - continue - } - if err := noConstraintsCgroup.Add(cgroups.Process{Pid: pid}); err != nil { - return fmt.Errorf("Could not add hypervisor PID %d to cgroup %v: %v", pid, path, err) - } - } - - // when new container joins, new CPU could be hotplugged, so we - // have to query fresh vcpu info from hypervisor for every time. - tids, err := s.hypervisor.getThreadIDs() - if err != nil { - return fmt.Errorf("failed to get thread ids from hypervisor: %v", err) - } - if len(tids.vcpus) == 0 { - // If there's no tid returned from the hypervisor, this is not - // a bug. It simply means there is nothing to constrain, hence - // let's return without any error from here. - return nil - } - - // We are about to move just the vcpus (threads) into cgroups with constraints. - // Move whole hypervisor process whould be easier but the IO/network performance - // whould be impacted. - for _, i := range tids.vcpus { - // In contrast, AddTask will write thread id to `tasks` - // After this, vcpu threads are in "vcpu" sub-cgroup, other threads in - // qemu will be left in parent cgroup untouched. - if err := cgroup.AddTask(cgroups.Process{ - Pid: i, - }); err != nil { - return err - } - } - - return nil -} - -func (s *Sandbox) resources() (specs.LinuxResources, error) { - resources := specs.LinuxResources{ - CPU: s.cpuResources(), - } - - return resources, nil -} - -func (s *Sandbox) cpuResources() *specs.LinuxCPU { - // Use default period and quota if they are not specified. - // Container will inherit the constraints from its parent. 
- quota := int64(0) - period := uint64(0) - shares := uint64(0) - realtimePeriod := uint64(0) - realtimeRuntime := int64(0) - - cpu := &specs.LinuxCPU{ - Quota: "a, - Period: &period, - Shares: &shares, - RealtimePeriod: &realtimePeriod, - RealtimeRuntime: &realtimeRuntime, - } - - for _, c := range s.containers { - ann := c.GetAnnotations() - if ann[annotations.ContainerTypeKey] == string(PodSandbox) { - // skip sandbox container - continue - } - - if c.config.Resources.CPU == nil { - continue - } - - if c.config.Resources.CPU.Shares != nil { - shares = uint64(math.Max(float64(*c.config.Resources.CPU.Shares), float64(shares))) - } - - if c.config.Resources.CPU.Quota != nil { - quota += *c.config.Resources.CPU.Quota - } - - if c.config.Resources.CPU.Period != nil { - period = uint64(math.Max(float64(*c.config.Resources.CPU.Period), float64(period))) - } - - if c.config.Resources.CPU.Cpus != "" { - cpu.Cpus += c.config.Resources.CPU.Cpus + "," - } - - if c.config.Resources.CPU.RealtimeRuntime != nil { - realtimeRuntime += *c.config.Resources.CPU.RealtimeRuntime - } - - if c.config.Resources.CPU.RealtimePeriod != nil { - realtimePeriod += *c.config.Resources.CPU.RealtimePeriod - } - - if c.config.Resources.CPU.Mems != "" { - cpu.Mems += c.config.Resources.CPU.Mems + "," - } - } - - cpu.Cpus = strings.Trim(cpu.Cpus, " \n\t,") - - return validCPUResources(cpu) -} - // validCPUResources checks CPU resources coherency func validCPUResources(cpuSpec *specs.LinuxCPU) *specs.LinuxCPU { if cpuSpec == nil { diff --git a/virtcontainers/cgroups_test.go b/virtcontainers/cgroups_test.go index 361cb066a3..9c8dc691fb 100644 --- a/virtcontainers/cgroups_test.go +++ b/virtcontainers/cgroups_test.go @@ -133,12 +133,12 @@ func TestUpdateCgroups(t *testing.T) { } // empty path - err := s.updateCgroups() + err := s.cgroupsUpdate() assert.NoError(err) // path doesn't exist s.state.CgroupPath = "/abc/123/rgb" - err = s.updateCgroups() + err = s.cgroupsUpdate() assert.Error(err) if os.Getuid() != 0 { @@ -152,7 +152,7 @@ func TestUpdateCgroups(t *testing.T) { s.hypervisor = &mockHypervisor{mockPid: 0} // bad pid - err = s.updateCgroups() + err = s.cgroupsUpdate() assert.Error(err) // fake workload @@ -161,7 +161,7 @@ func TestUpdateCgroups(t *testing.T) { s.hypervisor = &mockHypervisor{mockPid: cmd.Process.Pid} // no containers - err = s.updateCgroups() + err = s.cgroupsUpdate() assert.NoError(err) s.config = &SandboxConfig{} @@ -186,11 +186,11 @@ func TestUpdateCgroups(t *testing.T) { }, } - err = s.updateCgroups() + err = s.cgroupsUpdate() assert.NoError(err) // cleanup assert.NoError(cmd.Process.Kill()) - err = s.deleteCgroups() + err = s.cgroupsDelete() assert.NoError(err) } diff --git a/virtcontainers/container.go b/virtcontainers/container.go index 14a37a420d..f273d696e1 100644 --- a/virtcontainers/container.go +++ b/virtcontainers/container.go @@ -887,8 +887,10 @@ func (c *Container) create() (err error) { } c.process = *process - if err = c.newCgroups(); err != nil { - return + if !c.sandbox.config.SandboxCgroupOnly { + if err = c.cgroupsCreate(); err != nil { + return + } } if !c.sandbox.supportNewStore() { @@ -916,8 +918,10 @@ func (c *Container) delete() error { return err } - if err := c.deleteCgroups(); err != nil { - return err + if !c.sandbox.config.SandboxCgroupOnly { + if err := c.cgroupsDelete(); err != nil { + return err + } } return c.store.Delete() @@ -1208,8 +1212,10 @@ func (c *Container) update(resources specs.LinuxResources) error { return err } - if err := c.updateCgroups(resources); err != 
nil { - return err + if !c.sandbox.config.SandboxCgroupOnly { + if err := c.cgroupsUpdate(resources); err != nil { + return err + } } return c.sandbox.agent.updateContainer(c.sandbox, *c, resources) @@ -1430,8 +1436,8 @@ func (c *Container) detachDevices() error { return nil } -// creates a new cgroup and return the cgroups path -func (c *Container) newCgroups() (err error) { +// cgroupsCreate creates cgroups on the host for the associated container +func (c *Container) cgroupsCreate() (err error) { ann := c.GetAnnotations() config, ok := ann[annotations.ConfigJSONKey] @@ -1477,7 +1483,14 @@ func (c *Container) newCgroups() (err error) { return nil } -func (c *Container) deleteCgroups() error { +// cgroupsDelete deletes the cgroups on the host for the associated container +func (c *Container) cgroupsDelete() error { + + if c.state.CgroupPath == "" { + c.Logger().Debug("container does not have host cgroups: nothing to update") + return nil + } + cgroup, err := cgroupsLoadFunc(cgroups.V1, cgroups.StaticPath(c.state.CgroupPath)) @@ -1505,13 +1518,19 @@ func (c *Container) deleteCgroups() error { } if err := cgroup.Delete(); err != nil { - return fmt.Errorf("Could not delete container cgroup %v: %v", c.state.CgroupPath, err) + return fmt.Errorf("Could not delete container cgroup path='%v': error='%v'", c.state.CgroupPath, err) } return nil } -func (c *Container) updateCgroups(resources specs.LinuxResources) error { +// cgroupsUpdate updates cgroups on the host for the associated container +func (c *Container) cgroupsUpdate(resources specs.LinuxResources) error { + + if c.state.CgroupPath == "" { + c.Logger().Debug("container does not have host cgroups: nothing to update") + return nil + } cgroup, err := cgroupsLoadFunc(cgroups.V1, cgroups.StaticPath(c.state.CgroupPath)) if err != nil { @@ -1525,7 +1544,7 @@ func (c *Container) updateCgroups(resources specs.LinuxResources) error { // update cgroup if err := cgroup.Update(&r); err != nil { - return fmt.Errorf("Could not update cgroup %v: %v", c.state.CgroupPath, err) + return fmt.Errorf("Could not update container cgroup path='%v': error='%v'", c.state.CgroupPath, err) } // store new resources diff --git a/virtcontainers/pkg/oci/utils.go b/virtcontainers/pkg/oci/utils.go index c4f26c8dea..0c3c1e61e4 100644 --- a/virtcontainers/pkg/oci/utils.go +++ b/virtcontainers/pkg/oci/utils.go @@ -141,6 +141,9 @@ type RuntimeConfig struct { //Determines if create a netns for hypervisor process DisableNewNetNs bool + //Determines kata processes are managed only in sandbox cgroup + SandboxCgroupOnly bool + //Experimental features enabled Experimental []exp.Feature } @@ -516,6 +519,8 @@ func SandboxConfig(ocispec specs.Spec, runtime RuntimeConfig, bundlePath, cid, c SystemdCgroup: systemdCgroup, + SandboxCgroupOnly: runtime.SandboxCgroupOnly, + DisableGuestSeccomp: runtime.DisableGuestSeccomp, Experimental: runtime.Experimental, diff --git a/virtcontainers/sandbox.go b/virtcontainers/sandbox.go index 575cb2483d..3d04941b5f 100644 --- a/virtcontainers/sandbox.go +++ b/virtcontainers/sandbox.go @@ -7,13 +7,18 @@ package virtcontainers import ( "context" + "encoding/json" "fmt" "io" + "math" "net" "os" + "path/filepath" + "strings" "sync" "syscall" + "github.com/containerd/cgroups" "github.com/containernetworking/plugins/pkg/ns" "github.com/kata-containers/agent/protocols/grpc" "github.com/kata-containers/runtime/virtcontainers/device/api" @@ -101,6 +106,9 @@ type SandboxConfig struct { // SystemdCgroup enables systemd cgroup support SystemdCgroup bool + // 
SandboxCgroupOnly enables cgroup management only at the pod (sandbox) level on the host + SandboxCgroupOnly bool + + DisableGuestSeccomp bool // Experimental features enabled @@ -755,7 +763,7 @@ func (s *Sandbox) Delete() error { } } - if err := s.deleteCgroups(); err != nil { + if err := s.cgroupsDelete(); err != nil { return err } @@ -1073,14 +1081,6 @@ func (s *Sandbox) addContainer(c *Container) error { } s.containers[c.id] = c - ann := c.GetAnnotations() - if ann[annotations.ContainerTypeKey] == string(PodSandbox) { - s.state.CgroupPath = c.state.CgroupPath - if !s.supportNewStore() { - return s.store.Store(store.State, s.state) - } - } - return nil } @@ -1153,7 +1153,7 @@ func (s *Sandbox) CreateContainer(contConfig ContainerConfig) (VCContainer, erro return nil, err } - if err = s.updateCgroups(); err != nil { + if err = s.cgroupsUpdate(); err != nil { return nil, err } @@ -1330,7 +1330,7 @@ func (s *Sandbox) UpdateContainer(containerID string, resources specs.LinuxResou return err } - if err := s.updateCgroups(); err != nil { + if err := s.cgroupsUpdate(); err != nil { return err } @@ -1421,7 +1421,7 @@ func (s *Sandbox) createContainers() error { } } - if err := s.updateCgroups(); err != nil { + if err := s.cgroupsUpdate(); err != nil { return err } if err := s.storeSandbox(); err != nil { @@ -1880,3 +1880,266 @@ func (s *Sandbox) calculateSandboxCPUs() uint32 { func (s *Sandbox) GetHypervisorType() string { return string(s.config.HypervisorType) } + +func (s *Sandbox) cgroupsUpdate() error { + if s.state.CgroupPath == "" { + s.Logger().Warn("sandbox's cgroup won't be updated: cgroup path is empty") + return nil + } + + cgroup, err := cgroupsLoadFunc(V1Constraints, cgroups.StaticPath(s.state.CgroupPath)) + if err != nil { + return fmt.Errorf("Could not load cgroup %v: %v", s.state.CgroupPath, err) + } + + if err := s.constrainHypervisorVCPUs(cgroup); err != nil { + return err + } + + if len(s.containers) <= 1 { + // nothing to update + return nil + } + + resources, err := s.resources() + if err != nil { + return err + } + + if err := cgroup.Update(&resources); err != nil { + return fmt.Errorf("Could not update sandbox cgroup path='%v' error='%v'", s.state.CgroupPath, err) + } + + return nil +} + +func (s *Sandbox) cgroupsDelete() error { + s.Logger().Debug("Deleting sandbox cgroup") + if s.state.CgroupPath == "" { + s.Logger().Warnf("sandbox cgroups path is empty") + return nil + } + + var path string + cgroupSubsystems := V1NoConstraints + + if s.config.SandboxCgroupOnly { + // Override V1NoConstraints, if SandboxCgroupOnly is enabled + cgroupSubsystems = cgroups.V1 + path = s.state.CgroupPath + s.Logger().WithField("path", path).Debug("Deleting sandbox cgroups (all subsystems)") + } else { + path = cgroupNoConstraintsPath(s.state.CgroupPath) + s.Logger().WithField("path", path).Debug("Deleting no constraints cgroup") + } + + sandboxCgroups, err := cgroupsLoadFunc(cgroupSubsystems, cgroups.StaticPath(path)) + if err == cgroups.ErrCgroupDeleted { + // cgroup already deleted + s.Logger().Warnf("cgroup already deleted: '%s'", err) + return nil + } + + if err != nil { + return fmt.Errorf("Could not load cgroups %v: %v", path, err) + } + + // move running process here, that way cgroup can be removed + parent, err := parentCgroup(cgroupSubsystems, path) + if err != nil { + // parent cgroup doesn't exist, that means there are no processes running + // and the no constraints cgroup was removed. + s.Logger().WithError(err).Warn("Parent cgroup doesn't exist") + return nil + } + + if err := sandboxCgroups.MoveTo(parent); err != nil { + // Don't fail, cgroup can be deleted + s.Logger().WithError(err).Warnf("Could not move process from %s to parent cgroup", path) + } + + return sandboxCgroups.Delete() +} + +func (s *Sandbox) constrainHypervisorVCPUs(cgroup cgroups.Cgroup) error { + pids := s.hypervisor.getPids() + if len(pids) == 0 || pids[0] == 0 { + return fmt.Errorf("Invalid hypervisor PID: %+v", pids) + } + + // Move hypervisor into cgroups without constraints, + // those cgroups are not yet supported. + resources := &specs.LinuxResources{} + path := cgroupNoConstraintsPath(s.state.CgroupPath) + noConstraintsCgroup, err := cgroupsNewFunc(V1NoConstraints, cgroups.StaticPath(path), resources) + if err != nil { + return fmt.Errorf("Could not create cgroup %v: %v", path, err) + + } + for _, pid := range pids { + if pid <= 0 { + s.Logger().Warnf("Invalid hypervisor pid: %d", pid) + continue + } + + if err := noConstraintsCgroup.Add(cgroups.Process{Pid: pid}); err != nil { + return fmt.Errorf("Could not add hypervisor PID %d to cgroup %v: %v", pid, path, err) + } + + } + + // when a new container joins, new CPUs could be hotplugged, so we + // have to query fresh vcpu info from the hypervisor every time. + tids, err := s.hypervisor.getThreadIDs() + if err != nil { + return fmt.Errorf("failed to get thread ids from hypervisor: %v", err) + } + if len(tids.vcpus) == 0 { + // If there's no tid returned from the hypervisor, this is not + // a bug. It simply means there is nothing to constrain, hence + // let's return without any error from here. + return nil + } + + // We are about to move just the vcpus (threads) into cgroups with constraints. + // Moving the whole hypervisor process would be easier but the IO/network performance + // would be impacted. + for _, i := range tids.vcpus { + // In contrast, AddTask will write thread id to `tasks` + // After this, vcpu threads are in "vcpu" sub-cgroup, other threads in + // qemu will be left in parent cgroup untouched. + if err := cgroup.AddTask(cgroups.Process{ + Pid: i, + }); err != nil { + return err + } + } + + return nil +} + +func (s *Sandbox) resources() (specs.LinuxResources, error) { + resources := specs.LinuxResources{ + CPU: s.cpuResources(), + } + + return resources, nil +} + +func (s *Sandbox) cpuResources() *specs.LinuxCPU { + // Use default period and quota if they are not specified. + // Container will inherit the constraints from its parent. + quota := int64(0) + period := uint64(0) + shares := uint64(0) + realtimePeriod := uint64(0) + realtimeRuntime := int64(0) + + cpu := &specs.LinuxCPU{ + Quota: &quota, + Period: &period, + Shares: &shares, + RealtimePeriod: &realtimePeriod, + RealtimeRuntime: &realtimeRuntime, + } + + for _, c := range s.containers { + ann := c.GetAnnotations() + if ann[annotations.ContainerTypeKey] == string(PodSandbox) { + // skip sandbox container + continue + } + + if c.config.Resources.CPU == nil { + continue + } + + if c.config.Resources.CPU.Shares != nil { + shares = uint64(math.Max(float64(*c.config.Resources.CPU.Shares), float64(shares))) + } + + if c.config.Resources.CPU.Quota != nil { + quota += *c.config.Resources.CPU.Quota + } + + if c.config.Resources.CPU.Period != nil { + period = uint64(math.Max(float64(*c.config.Resources.CPU.Period), float64(period))) + } + + if c.config.Resources.CPU.Cpus != "" { + cpu.Cpus += c.config.Resources.CPU.Cpus + "," + } + + if c.config.Resources.CPU.RealtimeRuntime != nil { + realtimeRuntime += *c.config.Resources.CPU.RealtimeRuntime + } + + if c.config.Resources.CPU.RealtimePeriod != nil { + realtimePeriod += *c.config.Resources.CPU.RealtimePeriod + } + + if c.config.Resources.CPU.Mems != "" { + cpu.Mems += c.config.Resources.CPU.Mems + "," + } + } + + cpu.Cpus = strings.Trim(cpu.Cpus, " \n\t,") + + return validCPUResources(cpu) +} + +// setupSandboxCgroup creates and joins sandbox cgroups for the sandbox config +func (s *Sandbox) setupSandboxCgroup() error { + var podSandboxConfig *ContainerConfig + + if s.config == nil { + return fmt.Errorf("Sandbox config is nil") + } + + // get the container associated with the PodSandbox annotation. In Kubernetes, this + // represents the pause container. In Docker, this is the container. We derive the + // cgroup path from this container. + for _, cConfig := range s.config.Containers { + if cConfig.Annotations[annotations.ContainerTypeKey] == string(PodSandbox) { + podSandboxConfig = &cConfig + break + } + } + + if podSandboxConfig == nil { + return fmt.Errorf("Failed to find cgroup path for sandbox: Container of type '%s' not found", PodSandbox) + } + + configJSON, ok := podSandboxConfig.Annotations[annotations.ConfigJSONKey] + if !ok { + return fmt.Errorf("Could not find json config in annotations for container '%s'", podSandboxConfig.ID) + } + + var spec specs.Spec + if err := json.Unmarshal([]byte(configJSON), &spec); err != nil { + return err + } + + if spec.Linux == nil { + // Cgroup path is optional, though expected.
If not defined, skip the setup + s.Logger().WithField("sandboxid", podSandboxConfig.ID).Warning("no cgroup path provided for pod sandbox, not creating sandbox cgroup") + return nil + } + validContainerCgroup := utils.ValidCgroupPath(spec.Linux.CgroupsPath) + + // Create a Kata sandbox cgroup with the cgroup of the sandbox container as the parent + s.state.CgroupPath = filepath.Join(filepath.Dir(validContainerCgroup), cgroupKataPrefix+"_"+podSandboxConfig.ID) + cgroup, err := cgroupsNewFunc(cgroups.V1, cgroups.StaticPath(s.state.CgroupPath), &specs.LinuxResources{}) + if err != nil { + return fmt.Errorf("Could not create sandbox cgroup in %v: %v", s.state.CgroupPath, err) + + } + + // Add the runtime to the Kata sandbox cgroup + runtimePid := os.Getpid() + if err := cgroup.Add(cgroups.Process{Pid: runtimePid}); err != nil { + return fmt.Errorf("Could not add runtime PID %d to sandbox cgroup: %v", runtimePid, err) + } + + return nil +} diff --git a/virtcontainers/sandbox_test.go b/virtcontainers/sandbox_test.go index 67c02c4a87..0f3942a6ab 100644 --- a/virtcontainers/sandbox_test.go +++ b/virtcontainers/sandbox_test.go @@ -1516,3 +1516,99 @@ func TestSandboxExperimentalFeature(t *testing.T) { assert.NotNil(t, exp.Get(testFeature.Name)) assert.True(t, sconfig.valid()) } + +/* +func TestSandbox_joinSandboxCgroup(t *testing.T) { + + mockValidCgroup := &Sandbox{} + mockValidCgroup.state.CgroupPath = "/my/cgroup" + + tests := []struct { + name string + s *Sandbox + wantErr bool + }{ + {"New Config", &Sandbox{}, false}, + {"Mock cgroup path", mockValidCgroup, false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if err := tt.s.joinSandboxCgroup(); (err != nil) != tt.wantErr { + t.Errorf("Sandbox.joinSandboxCgroup() error = %v, wantErr %v", err, tt.wantErr) + } + }) + } +} +*/ + +func TestSandbox_SetupSandboxCgroup(t *testing.T) { + sandboxContainer := ContainerConfig{} + sandboxContainer.Annotations = make(map[string]string) + sandboxContainer.Annotations[annotations.ContainerTypeKey] = string(PodSandbox) + + emptyJSONLinux := ContainerConfig{} + emptyJSONLinux.Annotations = make(map[string]string) + emptyJSONLinux.Annotations[annotations.ContainerTypeKey] = string(PodSandbox) + emptyJSONLinux.Annotations[annotations.ConfigJSONKey] = "{}" + + successfulContainer := ContainerConfig{} + successfulContainer.Annotations = make(map[string]string) + successfulContainer.Annotations[annotations.ContainerTypeKey] = string(PodSandbox) + successfulContainer.Annotations[annotations.ConfigJSONKey] = "{\"linux\": { \"cgroupsPath\": \"/myRuntime/myContainer\" }}" + + tests := []struct { + name string + s *Sandbox + wantErr bool + }{ + { + "New sandbox", + &Sandbox{}, + true, + }, + { + "New sandbox, new config", + &Sandbox{config: &SandboxConfig{}}, + true, + }, + { + "sandbox, container no sandbox type", + &Sandbox{ + config: &SandboxConfig{Containers: []ContainerConfig{ + {}, + }}}, + true, + }, + { + "sandbox, container sandbox type", + &Sandbox{ + config: &SandboxConfig{Containers: []ContainerConfig{ + sandboxContainer, + }}}, + true, + }, + { + "sandbox, empty linux json", + &Sandbox{ + config: &SandboxConfig{Containers: []ContainerConfig{ + emptyJSONLinux, + }}}, + false, + }, + { + "sandbox, successful config", + &Sandbox{ + config: &SandboxConfig{Containers: []ContainerConfig{ + successfulContainer, + }}}, + false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if err := tt.s.setupSandboxCgroup(); (err != nil) != tt.wantErr { + 
t.Errorf("Sandbox.SetupSandboxCgroupOnly() error = %v, wantErr %v", err, tt.wantErr) + } + }) + } +}