Merge pull request #2941 from egernst/sandbox-sizing-feature

Sandbox sizing feature
This commit is contained in:
Eric Ernst
2022-01-27 09:37:57 -08:00
committed by GitHub
10 changed files with 380 additions and 24 deletions

View File

@@ -157,6 +157,32 @@ docker run --cpus 4 -ti debian bash -c "nproc; cat /sys/fs/cgroup/cpu,cpuacct/cp
400000 # cfs quota
```
## Virtual CPU handling without hotplug
In some cases, the hardware and/or software architecture being utilized does not support
hotplug. For example, Firecracker VMM does not support CPU or memory hotplug. Similarly,
the current Linux Kernel for aarch64 does not support CPU or memory hotplug. To appropriately
size the virtual machine for the workload within the container or pod, we provide a `static_sandbox_resource_mgmt`
flag within the Kata Containers configuration. When this is set, the runtime will:
- Size the VM based on the workload requirements as well as the `default_vcpus` option specified in the configuration.
- Not resize the virtual machine after it has been launched.
VM size determination varies depending on the type of container being run, and may not always
be available. If workload sizing information is not available, the virtual machine will be started with the
`default_vcpus`.
In the case of a pod, the initial sandbox container (pause container) typically doesn't contain any resource
information in its runtime `spec`. It is possible that the upper layer runtime
(i.e. containerd or CRI-O) may pass sandbox sizing annotations within the pause container's
`spec`. If these are provided, we will use this to appropriately size the VM. In particular,
we'll calculate the number of CPUs required for the workload and augment this by the `default_vcpus`
configuration option, and use this for the virtual machine size.
In the case of a single container (i.e., not a pod), if the container specifies resource requirements,
the container's `spec` will provide the sizing information directly. If these are set, we will
calculate the number of CPUs required for the workload and augment this by the `default_vcpus`
configuration option, and use this for the virtual machine size.
[1]: https://docs.docker.com/config/containers/resource_constraints/#cpu
[2]: https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource

View File

@@ -187,6 +187,8 @@ DEFVFIOMODE := guest-kernel
# Default cgroup model
DEFSANDBOXCGROUPONLY ?= false
DEFSTATICRESOURCEMGMT ?= false
DEFBINDMOUNTS := []
# Features
@@ -279,6 +281,7 @@ ifneq (,$(FCCMD))
# firecracker-specific options (all should be suffixed by "_FC")
DEFBLOCKSTORAGEDRIVER_FC := virtio-mmio
DEFNETWORKMODEL_FC := tcfilter
DEFSTATICRESOURCEMGMT_FC = true
KERNELTYPE_FC = uncompressed
KERNEL_NAME_FC = $(call MAKE_KERNEL_NAME,$(KERNELTYPE_FC))
KERNELPATH_FC = $(KERNELDIR)/$(KERNEL_NAME_FC)
@@ -449,6 +452,8 @@ USER_VARS += DEFMSIZE9P
USER_VARS += DEFENTROPYSOURCE
USER_VARS += DEFVALIDENTROPYSOURCES
USER_VARS += DEFSANDBOXCGROUPONLY
USER_VARS += DEFSTATICRESOURCEMGMT
USER_VARS += DEFSTATICRESOURCEMGMT_FC
USER_VARS += DEFBINDMOUNTS
USER_VARS += DEFVFIOMODE
USER_VARS += FEATURE_SELINUX

View File

@@ -180,13 +180,6 @@ block_device_driver = "virtio-blk"
# the container network interface
# Options:
#
# - bridged (Deprecated)
# Uses a linux bridge to interconnect the container interface to
# the VM. Works for most cases except macvlan and ipvlan.
# ***NOTE: This feature has been deprecated with plans to remove this
# feature in the future. Please use other network models listed below.
#
#
# - macvtap
# Used when the Container network interface can be bridged using
# macvtap.
@@ -224,7 +217,7 @@ disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@
# If enabled, the runtime will not create a network namespace for shim and hypervisor processes.
# This option may have some potential impacts to your host. It should only be used when you know what you're doing.
# `disable_new_netns` conflicts with `internetworking_model=bridged` and `internetworking_model=macvtap`. It works only
# `disable_new_netns` conflicts with `internetworking_model=tcfilter` and `internetworking_model=macvtap`. It works only
# with `internetworking_model=none`. The tap device will be in the host network namespace and can connect to a bridge
# (like OVS) directly.
# (default: false)
@@ -238,6 +231,15 @@ disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@
# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType
sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@
# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In
# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful
# when a hardware architecture or hypervisor solution is utilized which does not support CPU and/or memory hotplug.
# Compatibility for determining appropriate sandbox (VM) size:
# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O
# does not yet support sandbox sizing annotations.
# - When running single containers using a tool like ctr, container sizing information will be available.
static_sandbox_resource_mgmt=@DEFSTATICRESOURCEMGMT@
# If specified, sandbox_bind_mounts identifies host paths to be mounted (ro) into the sandbox's shared path.
# This is only valid if filesystem sharing is utilized. The provided path(s) will be bindmounted into the shared fs directory.
# If defaults are utilized, these mounts should be available in the guest at `/run/kata-containers/shared/containers/sandbox-mounts`

View File

@@ -332,7 +332,7 @@ disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@
# (default: false)
#disable_new_netns = true
# if enable, the runtime will add all the kata processes inside one dedicated cgroup.
# if enabled, the runtime will add all the kata processes inside one dedicated cgroup.
# The container cgroups in the host are not created, just one single cgroup per sandbox.
# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox.
# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation.
@@ -340,6 +340,15 @@ disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@
# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType
sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@
# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In
# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful
# when a hardware architecture or hypervisor solution is utilized which does not support CPU and/or memory hotplug.
# Compatibility for determining appropriate sandbox (VM) size:
# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O
# does not yet support sandbox sizing annotations.
# - When running single containers using a tool like ctr, container sizing information will be available.
static_sandbox_resource_mgmt=@DEFSTATICRESOURCEMGMT_FC@
# Enabled experimental feature list, format: ["a", "b"].
# Experimental features are features not stable enough for production,
# they may break compatibility, and are prepared for a big version bump.

View File

@@ -516,6 +516,15 @@ disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@
# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType
sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@
# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In
# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful
# when a hardware architecture or hypervisor solution is utilized which does not support CPU and/or memory hotplug.
# Compatibility for determining appropriate sandbox (VM) size:
# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O
# does not yet support sandbox sizing annotations.
# - When running single containers using a tool like ctr, container sizing information will be available.
static_sandbox_resource_mgmt=@DEFSTATICRESOURCEMGMT@
# If specified, sandbox_bind_mounts identifies host paths to be mounted (ro) into the sandbox's shared path.
# This is only valid if filesystem sharing is utilized. The provided path(s) will be bindmounted into the shared fs directory.
# If defaults are utilized, these mounts should be available in the guest at `/run/kata-containers/shared/containers/sandbox-mounts`

View File

@@ -103,6 +103,20 @@ func create(ctx context.Context, s *service, r *taskAPI.CreateTaskRequest) (*con
s.ctx = newCtx
defer span.End()
// Sandbox sizing information *may* be provided in two scenarios:
// 1. The upper layer runtime (ie, containerd or crio) provide sandbox sizing information as an annotation
// in the 'sandbox container's' spec. This would typically be a scenario where as part of a create sandbox
// request the upper layer runtime receives this information as part of a pod, and makes it available to us
// for sizing purposes.
// 2. If this is not a sandbox infrastructure container, but instead a standalone single container (analogous to "docker run..."),
// then the container spec itself will contain appropriate sizing information for the entire sandbox (since it is
// a single container.
if containerType == vc.PodSandbox {
s.config.SandboxCPUs, s.config.SandboxMemMB = oci.CalculateSandboxSizing(ociSpec)
} else {
s.config.SandboxCPUs, s.config.SandboxMemMB = oci.CalculateContainerSizing(ociSpec)
}
if rootFs.Mounted, err = checkAndMount(s, r); err != nil {
return nil, err
}

View File

@@ -149,6 +149,7 @@ type runtime struct {
DisableNewNetNs bool `toml:"disable_new_netns"`
DisableGuestSeccomp bool `toml:"disable_guest_seccomp"`
SandboxCgroupOnly bool `toml:"sandbox_cgroup_only"`
StaticSandboxResourceMgmt bool `toml:"static_sandbox_resource_mgmt"`
EnablePprof bool `toml:"enable_pprof"`
}
@@ -1125,6 +1126,7 @@ func LoadConfiguration(configPath string, ignoreLogging bool) (resolvedConfigPat
config.DisableGuestSeccomp = tomlConf.Runtime.DisableGuestSeccomp
config.StaticSandboxResourceMgmt = tomlConf.Runtime.StaticSandboxResourceMgmt
config.SandboxCgroupOnly = tomlConf.Runtime.SandboxCgroupOnly
config.DisableNewNetNs = tomlConf.Runtime.DisableNewNetNs
config.EnablePprof = tomlConf.Runtime.EnablePprof

View File

@@ -24,11 +24,13 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/device/config"
exp "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/experimental"
vcAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations"
dockershimAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations/dockershim"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
vcutils "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
)
type annotationContainerType struct {
@@ -125,7 +127,16 @@ type RuntimeConfig struct {
//Determines if seccomp should be applied inside guest
DisableGuestSeccomp bool
//Determines if create a netns for hypervisor process
// Sandbox sizing information which, if provided, indicates the size of
// the sandbox needed for the workload(s)
SandboxCPUs uint32
SandboxMemMB uint32
// Determines if we should attempt to size the VM at boot time and skip
// any later resource updates.
StaticSandboxResourceMgmt bool
// Determines if create a netns for hypervisor process
DisableNewNetNs bool
//Determines kata processes are managed only in sandbox cgroup
@@ -873,6 +884,13 @@ func SandboxConfig(ocispec specs.Spec, runtime RuntimeConfig, bundlePath, cid, c
vcAnnotations.BundlePathKey: bundlePath,
},
SandboxResources: vc.SandboxResourceSizing{
WorkloadCPUs: runtime.SandboxCPUs,
WorkloadMemMB: runtime.SandboxMemMB,
},
StaticResourceMgmt: runtime.StaticSandboxResourceMgmt,
ShmSize: shmSize,
VfioMode: runtime.VfioMode,
@@ -894,6 +912,25 @@ func SandboxConfig(ocispec specs.Spec, runtime RuntimeConfig, bundlePath, cid, c
return vc.SandboxConfig{}, err
}
// If we are utilizing static resource management for the sandbox, ensure that the hypervisor is started
// with the base number of CPU/memory (which is equal to the default CPU/memory specified for the runtime
// configuration or annotations) as well as any specified workload resources.
if sandboxConfig.StaticResourceMgmt {
sandboxConfig.SandboxResources.BaseCPUs = sandboxConfig.HypervisorConfig.NumVCPUs
sandboxConfig.SandboxResources.BaseMemMB = sandboxConfig.HypervisorConfig.MemorySize
sandboxConfig.HypervisorConfig.NumVCPUs += sandboxConfig.SandboxResources.WorkloadCPUs
sandboxConfig.HypervisorConfig.MemorySize += sandboxConfig.SandboxResources.WorkloadMemMB
ociLog.WithFields(logrus.Fields{
"workload cpu": sandboxConfig.SandboxResources.WorkloadCPUs,
"default cpu": sandboxConfig.SandboxResources.BaseCPUs,
"workload mem in MB": sandboxConfig.SandboxResources.WorkloadMemMB,
"default mem": sandboxConfig.SandboxResources.BaseMemMB,
}).Debugf("static resources set")
}
return sandboxConfig, nil
}
@@ -1046,3 +1083,89 @@ func (a *annotationConfiguration) setUintWithCheck(f func(uint64) error) error {
}
return nil
}
// CalculateSandboxSizing will calculate the number of CPUs and amount of Memory that should
// be added to the VM if sandbox annotations are provided with this sizing details
// CalculateSandboxSizing will calculate the number of CPUs and amount of Memory that should
// be added to the VM if sandbox annotations are provided with these sizing details.
func CalculateSandboxSizing(spec *specs.Spec) (numCPU, memSizeMB uint32) {
	if spec == nil || spec.Annotations == nil {
		return 0, 0
	}

	// Every annotation is optional: a missing or unparsable value is logged as
	// a warning and contributes 0 to the calculation. We expect values like,
	//  Annotations[SandboxMem] = "1048576"
	//  Annotations[SandboxCPUPeriod] = "100000"
	//  Annotations[SandboxCPUQuota] = "220000"
	// ... to result in VM resources of 1 (MB) for memory, and 3 for CPU (2200 mCPU rounded up to 3).
	var (
		period uint64
		quota  int64
		memory int64
	)

	if raw, found := spec.Annotations[ctrAnnotations.SandboxCPUPeriod]; found {
		parsed, err := strconv.ParseUint(raw, 10, 32)
		if err != nil {
			ociLog.Warningf("sandbox-sizing: failure to parse SandboxCPUPeriod: %s", raw)
			parsed = 0
		}
		period = parsed
	}

	if raw, found := spec.Annotations[ctrAnnotations.SandboxCPUQuota]; found {
		parsed, err := strconv.ParseInt(raw, 10, 32)
		if err != nil {
			ociLog.Warningf("sandbox-sizing: failure to parse SandboxCPUQuota: %s", raw)
			parsed = 0
		}
		quota = parsed
	}

	if raw, found := spec.Annotations[ctrAnnotations.SandboxMem]; found {
		parsed, err := strconv.ParseInt(raw, 10, 32)
		if err != nil {
			ociLog.Warningf("sandbox-sizing: failure to parse SandboxMem: %s", raw)
			parsed = 0
		}
		memory = parsed
	}

	return calculateVMResources(period, quota, memory)
}
// CalculateContainerSizing will calculate the number of CPUs and amount of memory that is needed
// based on the provided LinuxResources
// CalculateContainerSizing will calculate the number of CPUs and amount of memory that is needed
// based on the LinuxResources carried in the container's spec.
func CalculateContainerSizing(spec *specs.Spec) (numCPU, memSizeMB uint32) {
	if spec == nil || spec.Linux == nil || spec.Linux.Resources == nil {
		return 0, 0
	}

	var (
		period uint64
		quota  int64
		memory int64
	)

	res := spec.Linux.Resources

	// CPU sizing only makes sense when both quota and period are present.
	if cpu := res.CPU; cpu != nil && cpu.Quota != nil && cpu.Period != nil {
		quota = *cpu.Quota
		period = *cpu.Period
	}

	if mem := res.Memory; mem != nil && mem.Limit != nil {
		memory = *mem.Limit
	}

	return calculateVMResources(period, quota, memory)
}
// calculateVMResources converts a raw CFS quota/period pair and a memory limit
// in bytes into a vCPU count and a memory size in MB for VM sizing.
func calculateVMResources(period uint64, quota int64, memory int64) (numCPU, memSizeMB uint32) {
	numCPU = vcutils.CalculateVCpusFromMilliCpus(vcutils.CalculateMilliCPUs(quota, period))

	if memory < 0 {
		// While spec allows for a negative value to indicate unconstrained, we don't
		// see this in practice. Since we rely only on default memory if the workload
		// is unconstrained, we will treat as 0 for VM resource accounting.
		ociLog.Infof("memory limit provided < 0, treating as 0 MB for VM sizing: %d", memory)
		return numCPU, 0
	}

	return numCPU, uint32(memory / (1024 * 1024))
}

View File

@@ -1061,3 +1061,149 @@ func TestParseAnnotationBoolConfiguration(t *testing.T) {
}
}
}
// getCtrResourceSpec builds a minimal container spec carrying the given memory
// limit, CPU quota and CPU period in its Linux resources section.
func getCtrResourceSpec(memory, quota int64, period uint64) *specs.Spec {
	cpu := &specs.LinuxCPU{
		Quota:  &quota,
		Period: &period,
	}
	mem := &specs.LinuxMemory{
		Limit: &memory,
	}

	return &specs.Spec{
		Linux: &specs.Linux{
			Resources: &specs.LinuxResources{
				CPU:    cpu,
				Memory: mem,
			},
		},
	}
}
// makeSizingAnnotations builds a spec whose annotations carry the given
// sandbox sizing values (all passed as their raw string form).
func makeSizingAnnotations(memory, quota, period string) *specs.Spec {
	return &specs.Spec{
		Annotations: map[string]string{
			ctrAnnotations.SandboxCPUPeriod: period,
			ctrAnnotations.SandboxCPUQuota:  quota,
			ctrAnnotations.SandboxMem:       memory,
		},
	}
}
// TestCalculateContainerSizing verifies CPU/memory sizing derived from a
// container spec's LinuxResources, including nil, empty and negative cases.
func TestCalculateContainerSizing(t *testing.T) {
	assert := assert.New(t)

	for _, tc := range []struct {
		spec    *specs.Spec
		wantCPU uint32
		wantMem uint32
	}{
		// no spec at all
		{nil, 0, 0},
		// spec without Linux resources
		{&specs.Spec{}, 0, 0},
		// resources present but CPU/memory fields unset
		{
			spec: &specs.Spec{
				Linux: &specs.Linux{
					Resources: &specs.LinuxResources{
						CPU:    &specs.LinuxCPU{},
						Memory: &specs.LinuxMemory{},
					},
				},
			},
			wantCPU: 0,
			wantMem: 0,
		},
		{getCtrResourceSpec(1024*1024, 200, 100), 2, 1},
		{getCtrResourceSpec(1024*1024*1024, 200, 1), 200, 1024},
		// negative memory limit is treated as 0 MB
		{getCtrResourceSpec(-1*1024*1024*1024, 200, 1), 200, 0},
		// zero period means no CPU sizing
		{getCtrResourceSpec(0, 10, 0), 0, 0},
		{getCtrResourceSpec(-1, 10, 1), 10, 0},
	} {
		gotCPU, gotMem := CalculateContainerSizing(tc.spec)
		assert.Equal(tc.wantCPU, gotCPU, "unexpected CPU")
		assert.Equal(tc.wantMem, gotMem, "unexpected memory")
	}
}
// TestCalculateSandboxSizing verifies CPU/memory sizing derived from sandbox
// annotations, including missing, unparsable and negative values.
func TestCalculateSandboxSizing(t *testing.T) {
	assert := assert.New(t)

	for _, tc := range []struct {
		spec    *specs.Spec
		wantCPU uint32
		wantMem uint32
	}{
		// no spec at all
		{nil, 0, 0},
		// spec without annotations
		{&specs.Spec{}, 0, 0},
		{makeSizingAnnotations("1048576", "200", "100"), 2, 1},
		{makeSizingAnnotations("1024", "200", "1"), 200, 0},
		// unparsable values are treated as 0
		{makeSizingAnnotations("foobar", "200", "spaghetti"), 0, 0},
		// negative memory and quota are treated as 0
		{makeSizingAnnotations("-1048576", "-100", "1"), 0, 0},
		{makeSizingAnnotations("-1", "100", "1"), 100, 0},
	} {
		gotCPU, gotMem := CalculateSandboxSizing(tc.spec)
		assert.Equal(tc.wantCPU, gotCPU, "unexpected CPU")
		assert.Equal(tc.wantMem, gotMem, "unexpected memory")
	}
}

View File

@@ -99,6 +99,17 @@ type SandboxStats struct {
Cpus int
}
// SandboxResourceSizing holds the CPU and memory sizing information used when
// statically sizing the sandbox (VM): the workload requirements plus the base
// amounts assigned as overhead.
type SandboxResourceSizing struct {
// The number of CPUs required for the sandbox workload(s)
WorkloadCPUs uint32
// The base number of CPUs for the VM that are assigned as overhead
BaseCPUs uint32
// The amount of memory (in MB) required for the sandbox workload(s)
WorkloadMemMB uint32
// The base amount of memory (in MB) required for the VM that is assigned as overhead
BaseMemMB uint32
}
// SandboxConfig is a Sandbox configuration.
type SandboxConfig struct {
// Volumes is a list of shared volumes between the host and the Sandbox.
@@ -132,6 +143,11 @@ type SandboxConfig struct {
HypervisorConfig HypervisorConfig
SandboxResources SandboxResourceSizing
// StaticResourceMgmt indicates if the shim should rely on statically sizing the sandbox (VM)
StaticResourceMgmt bool
ShmSize uint64
VfioMode config.VFIOModeType
@@ -1573,7 +1589,7 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
}
// Update resources after having added containers to the sandbox, since
// container status is requiered to know if more resources should be added.
// container status is required to know if more resources should be added.
if err := s.updateResources(ctx); err != nil {
return err
}
@@ -1909,6 +1925,10 @@ func (s *Sandbox) updateResources(ctx context.Context) error {
return fmt.Errorf("sandbox config is nil")
}
if s.config.StaticResourceMgmt {
s.Logger().Debug("no resources updated: static resource management is set")
return nil
}
sandboxVCPUs, err := s.calculateSandboxCPUs()
if err != nil {
return err