From 8cde54131af6f036e390e7cf8a44c9b8d70d2ff8 Mon Sep 17 00:00:00 2001 From: Eric Ernst Date: Wed, 15 Dec 2021 16:45:58 -0800 Subject: [PATCH] runtime: introduce static sandbox resource management There are software and hardware architectures which do not support dynamically adjusting the CPU and memory resources associated with a sandbox. Today, these rely on "default CPU" and "default memory" configuration options for the runtime, either set by annotation or by the configuration toml on disk. In the case of a single container (launched by ctr, or something like "docker run"), we could allow for sizing the VM correctly, since all of the information is already available to us at creation time. In the sandbox / pod container case, it is possible for the upper layer container runtime (ie, containerd or crio) to send a specific annotation indicating the total workload resource requirements associated with the sandbox creation request. In the case of sizing information not being provided, we will follow the same behavior as today: start the VM with (just) the default CPU/memory. If this information is provided, we'll track this as Workload specific resources, and track default sizing information as Base resources. We will update the hypervisor configuration to utilize Base+Workload resources, thus starting the VM with the appropriate amount of CPU and memory. In this scenario (we start the VM with the "right" amount of CPU/Memory), we do not want to update the VM resources when containers are added, or adjusted in size. This functionality is introduced behind a configuration flag, `static_sandbox_resource_mgmt`. This is defaulted to false for all configurations except Firecracker, which is set to true. This'll greatly improve UX for folks who are utilizing Kata with a VMM or hardware architecture that doesn't support hotplug. Note, users will still be unable to do in-place vertical pod autoscaling or other dynamic container/pod sizing with this enabled.
Fixes: #3264 Signed-off-by: Eric Ernst --- src/runtime/Makefile | 5 + src/runtime/config/configuration-clh.toml.in | 9 ++ src/runtime/config/configuration-fc.toml.in | 9 ++ src/runtime/config/configuration-qemu.toml.in | 9 ++ src/runtime/pkg/containerd-shim-v2/create.go | 14 ++ src/runtime/pkg/katautils/config.go | 28 ++-- src/runtime/pkg/oci/utils.go | 125 ++++++++++++++- src/runtime/pkg/oci/utils_test.go | 146 ++++++++++++++++++ src/runtime/virtcontainers/sandbox.go | 22 ++- 9 files changed, 352 insertions(+), 15 deletions(-) diff --git a/src/runtime/Makefile b/src/runtime/Makefile index e09a999b5d..ad84739d2e 100644 --- a/src/runtime/Makefile +++ b/src/runtime/Makefile @@ -187,6 +187,8 @@ DEFVFIOMODE := guest-kernel # Default cgroup model DEFSANDBOXCGROUPONLY ?= false +DEFSTATICRESOURCEMGMT ?= false + DEFBINDMOUNTS := [] # Features @@ -279,6 +281,7 @@ ifneq (,$(FCCMD)) # firecracker-specific options (all should be suffixed by "_FC") DEFBLOCKSTORAGEDRIVER_FC := virtio-mmio DEFNETWORKMODEL_FC := tcfilter + DEFSTATICRESOURCEMGMT_FC = true KERNELTYPE_FC = uncompressed KERNEL_NAME_FC = $(call MAKE_KERNEL_NAME,$(KERNELTYPE_FC)) KERNELPATH_FC = $(KERNELDIR)/$(KERNEL_NAME_FC) @@ -449,6 +452,8 @@ USER_VARS += DEFMSIZE9P USER_VARS += DEFENTROPYSOURCE USER_VARS += DEFVALIDENTROPYSOURCES USER_VARS += DEFSANDBOXCGROUPONLY +USER_VARS += DEFSTATICRESOURCEMGMT +USER_VARS += DEFSTATICRESOURCEMGMT_FC USER_VARS += DEFBINDMOUNTS USER_VARS += DEFVFIOMODE USER_VARS += FEATURE_SELINUX diff --git a/src/runtime/config/configuration-clh.toml.in b/src/runtime/config/configuration-clh.toml.in index 733a4e2a2f..7f06dd8eb3 100644 --- a/src/runtime/config/configuration-clh.toml.in +++ b/src/runtime/config/configuration-clh.toml.in @@ -231,6 +231,15 @@ disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@ # See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@ +# If enabled, the runtime will attempt to 
determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In +# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful +# when a hardware architecture or hypervisor solution is utilized which does not support CPU and/or memory hotplug. +# Compatibility for determining appropriate sandbox (VM) size: +# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O +# does not yet support sandbox sizing annotations. +# - When running single containers using a tool like ctr, container sizing information will be available. +static_sandbox_resource_mgmt=@DEFSTATICRESOURCEMGMT@ + # If specified, sandbox_bind_mounts identifieds host paths to be mounted (ro) into the sandboxes shared path. # This is only valid if filesystem sharing is utilized. The provided path(s) will be bindmounted into the shared fs directory. # If defaults are utilized, these mounts should be available in the guest at `/run/kata-containers/shared/containers/sandbox-mounts` diff --git a/src/runtime/config/configuration-fc.toml.in b/src/runtime/config/configuration-fc.toml.in index 83176131a3..ad3e5c6721 100644 --- a/src/runtime/config/configuration-fc.toml.in +++ b/src/runtime/config/configuration-fc.toml.in @@ -340,6 +340,15 @@ disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@ # See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@ +# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In +# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful +# when a hardware architecture or hypervisor solution is utilized which does not support CPU and/or memory hotplug.
+# Compatibility for determining appropriate sandbox (VM) size: +# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O +# does not yet support sandbox sizing annotations. +# - When running single containers using a tool like ctr, container sizing information will be available. +static_sandbox_resource_mgmt=@DEFSTATICRESOURCEMGMT_FC@ + # Enabled experimental feature list, format: ["a", "b"]. # Experimental features are features not stable enough for production, # they may break compatibility, and are prepared for a big version bump. diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in index 2f6600fc81..c27e94bb36 100644 --- a/src/runtime/config/configuration-qemu.toml.in +++ b/src/runtime/config/configuration-qemu.toml.in @@ -516,6 +516,15 @@ disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@ # See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@ +# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In +# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful +# when a hardware architecture or hypervisor solution is utilized which does not support CPU and/or memory hotplug. +# Compatibility for determining appropriate sandbox (VM) size: +# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O +# does not yet support sandbox sizing annotations. +# - When running single containers using a tool like ctr, container sizing information will be available. +static_sandbox_resource_mgmt=@DEFSTATICRESOURCEMGMT@ + # If specified, sandbox_bind_mounts identifieds host paths to be mounted (ro) into the sandboxes shared path.
# This is only valid if filesystem sharing is utilized. The provided path(s) will be bindmounted into the shared fs directory. # If defaults are utilized, these mounts should be available in the guest at `/run/kata-containers/shared/containers/sandbox-mounts` diff --git a/src/runtime/pkg/containerd-shim-v2/create.go b/src/runtime/pkg/containerd-shim-v2/create.go index ae47bb249b..ef9374686b 100644 --- a/src/runtime/pkg/containerd-shim-v2/create.go +++ b/src/runtime/pkg/containerd-shim-v2/create.go @@ -103,6 +103,20 @@ func create(ctx context.Context, s *service, r *taskAPI.CreateTaskRequest) (*con s.ctx = newCtx defer span.End() + // Sandbox sizing information *may* be provided in two scenarios: + // 1. The upper layer runtime (ie, containerd or crio) provides sandbox sizing information as an annotation + // in the 'sandbox container's' spec. This would typically be a scenario where as part of a create sandbox + // request the upper layer runtime receives this information as part of a pod, and makes it available to us + // for sizing purposes. + // 2. If this is not a sandbox infrastructure container, but instead a standalone single container (analogous to "docker run..."), + // then the container spec itself will contain appropriate sizing information for the entire sandbox (since it is + // a single container).
+ if containerType == vc.PodSandbox { + s.config.SandboxCPUs, s.config.SandboxMemMB = oci.CalculateSandboxSizing(ociSpec) + } else { + s.config.SandboxCPUs, s.config.SandboxMemMB = oci.CalculateContainerSizing(ociSpec) + } + if rootFs.Mounted, err = checkAndMount(s, r); err != nil { return nil, err } diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 20ed1b695a..67fc685dc1 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -137,19 +137,20 @@ type hypervisor struct { } type runtime struct { - InterNetworkModel string `toml:"internetworking_model"` - JaegerEndpoint string `toml:"jaeger_endpoint"` - JaegerUser string `toml:"jaeger_user"` - JaegerPassword string `toml:"jaeger_password"` - VfioMode string `toml:"vfio_mode"` - SandboxBindMounts []string `toml:"sandbox_bind_mounts"` - Experimental []string `toml:"experimental"` - Debug bool `toml:"enable_debug"` - Tracing bool `toml:"enable_tracing"` - DisableNewNetNs bool `toml:"disable_new_netns"` - DisableGuestSeccomp bool `toml:"disable_guest_seccomp"` - SandboxCgroupOnly bool `toml:"sandbox_cgroup_only"` - EnablePprof bool `toml:"enable_pprof"` + InterNetworkModel string `toml:"internetworking_model"` + JaegerEndpoint string `toml:"jaeger_endpoint"` + JaegerUser string `toml:"jaeger_user"` + JaegerPassword string `toml:"jaeger_password"` + VfioMode string `toml:"vfio_mode"` + SandboxBindMounts []string `toml:"sandbox_bind_mounts"` + Experimental []string `toml:"experimental"` + Debug bool `toml:"enable_debug"` + Tracing bool `toml:"enable_tracing"` + DisableNewNetNs bool `toml:"disable_new_netns"` + DisableGuestSeccomp bool `toml:"disable_guest_seccomp"` + SandboxCgroupOnly bool `toml:"sandbox_cgroup_only"` + StaticSandboxResourceMgmt bool `toml:"static_sandbox_resource_mgmt"` + EnablePprof bool `toml:"enable_pprof"` } type agent struct { @@ -1125,6 +1126,7 @@ func LoadConfiguration(configPath string, ignoreLogging bool) 
(resolvedConfigPat config.DisableGuestSeccomp = tomlConf.Runtime.DisableGuestSeccomp + config.StaticSandboxResourceMgmt = tomlConf.Runtime.StaticSandboxResourceMgmt config.SandboxCgroupOnly = tomlConf.Runtime.SandboxCgroupOnly config.DisableNewNetNs = tomlConf.Runtime.DisableNewNetNs config.EnablePprof = tomlConf.Runtime.EnablePprof diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go index 71a116f45a..63c052caf7 100644 --- a/src/runtime/pkg/oci/utils.go +++ b/src/runtime/pkg/oci/utils.go @@ -24,11 +24,13 @@ import ( "k8s.io/apimachinery/pkg/api/resource" vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/device/config" exp "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/experimental" vcAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations" dockershimAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations/dockershim" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" + vcutils "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" ) type annotationContainerType struct { @@ -125,7 +127,16 @@ type RuntimeConfig struct { //Determines if seccomp should be applied inside guest DisableGuestSeccomp bool - //Determines if create a netns for hypervisor process + // Sandbox sizing information which, if provided, indicates the size of + // the sandbox needed for the workload(s) + SandboxCPUs uint32 + SandboxMemMB uint32 + + // Determines if we should attempt to size the VM at boot time and skip + // any later resource updates. 
+ StaticSandboxResourceMgmt bool + + // Determines if create a netns for hypervisor process DisableNewNetNs bool //Determines kata processes are managed only in sandbox cgroup @@ -873,6 +884,13 @@ func SandboxConfig(ocispec specs.Spec, runtime RuntimeConfig, bundlePath, cid, c vcAnnotations.BundlePathKey: bundlePath, }, + SandboxResources: vc.SandboxResourceSizing{ + WorkloadCPUs: runtime.SandboxCPUs, + WorkloadMemMB: runtime.SandboxMemMB, + }, + + StaticResourceMgmt: runtime.StaticSandboxResourceMgmt, + ShmSize: shmSize, VfioMode: runtime.VfioMode, @@ -894,6 +912,25 @@ func SandboxConfig(ocispec specs.Spec, runtime RuntimeConfig, bundlePath, cid, c return vc.SandboxConfig{}, err } + // If we are utilizing static resource management for the sandbox, ensure that the hypervisor is started + // with the base number of CPU/memory (which is equal to the default CPU/memory specified for the runtime + // configuration or annotations) as well as any specified workload resources. + if sandboxConfig.StaticResourceMgmt { + sandboxConfig.SandboxResources.BaseCPUs = sandboxConfig.HypervisorConfig.NumVCPUs + sandboxConfig.SandboxResources.BaseMemMB = sandboxConfig.HypervisorConfig.MemorySize + + sandboxConfig.HypervisorConfig.NumVCPUs += sandboxConfig.SandboxResources.WorkloadCPUs + sandboxConfig.HypervisorConfig.MemorySize += sandboxConfig.SandboxResources.WorkloadMemMB + + ociLog.WithFields(logrus.Fields{ + "workload cpu": sandboxConfig.SandboxResources.WorkloadCPUs, + "default cpu": sandboxConfig.SandboxResources.BaseCPUs, + "workload mem in MB": sandboxConfig.SandboxResources.WorkloadMemMB, + "default mem": sandboxConfig.SandboxResources.BaseMemMB, + }).Debugf("static resources set") + + } + return sandboxConfig, nil } @@ -1046,3 +1083,89 @@ func (a *annotationConfiguration) setUintWithCheck(f func(uint64) error) error { } return nil } + +// CalculateSandboxSizing will calculate the number of CPUs and amount of Memory that should +// be added to the VM if sandbox annotations 
are provided with these sizing details +func CalculateSandboxSizing(spec *specs.Spec) (numCPU, memSizeMB uint32) { + var memory, quota int64 + var period uint64 + var err error + + if spec == nil || spec.Annotations == nil { + return 0, 0 + } + + // For each annotation, if it isn't defined we use 0; if there's an error in parsing, we'll log + // a warning and continue the calculation with 0 value. Values are parsed with a 64-bit range so that memory limits of 2 GiB or more (given in bytes) are not rejected. We expect values like, + // Annotations[SandboxMem] = "1048576" + // Annotations[SandboxCPUPeriod] = "100000" + // Annotations[SandboxCPUQuota] = "220000" + // ... to result in VM resources of 1 (MB) for memory, and 3 for CPU (2200 mCPU rounded up to 3). + annotation, ok := spec.Annotations[ctrAnnotations.SandboxCPUPeriod] + if ok { + period, err = strconv.ParseUint(annotation, 10, 64) + if err != nil { + ociLog.Warningf("sandbox-sizing: failure to parse SandboxCPUPeriod: %s", annotation) + period = 0 + } + } + + annotation, ok = spec.Annotations[ctrAnnotations.SandboxCPUQuota] + if ok { + quota, err = strconv.ParseInt(annotation, 10, 64) + if err != nil { + ociLog.Warningf("sandbox-sizing: failure to parse SandboxCPUQuota: %s", annotation) + quota = 0 + } + } + + annotation, ok = spec.Annotations[ctrAnnotations.SandboxMem] + if ok { + memory, err = strconv.ParseInt(annotation, 10, 64) + if err != nil { + ociLog.Warningf("sandbox-sizing: failure to parse SandboxMem: %s", annotation) + memory = 0 + } + } + + return calculateVMResources(period, quota, memory) +} + +// CalculateContainerSizing will calculate the number of CPUs and amount of memory that is needed +// based on the provided LinuxResources +func CalculateContainerSizing(spec *specs.Spec) (numCPU, memSizeMB uint32) { + var memory, quota int64 + var period uint64 + + if spec == nil || spec.Linux == nil || spec.Linux.Resources == nil { + return 0, 0 + } + + resources := spec.Linux.Resources + + if resources.CPU != nil && resources.CPU.Quota != nil && resources.CPU.Period != nil { + quota = *resources.CPU.Quota + period =
*resources.CPU.Period + } + + if resources.Memory != nil && resources.Memory.Limit != nil { + memory = *resources.Memory.Limit + } + + return calculateVMResources(period, quota, memory) +} + +func calculateVMResources(period uint64, quota int64, memory int64) (numCPU, memSizeMB uint32) { + numCPU = vcutils.CalculateVCpusFromMilliCpus(vcutils.CalculateMilliCPUs(quota, period)) + + if memory < 0 { + // While spec allows for a negative value to indicate unconstrained, we don't + // see this in practice. Since we rely only on default memory if the workload + // is unconstrained, we will treat as 0 for VM resource accounting. + ociLog.Infof("memory limit provided < 0, treating as 0 MB for VM sizing: %d", memory) + memSizeMB = 0 + } else { + memSizeMB = uint32(memory / 1024 / 1024) + } + return numCPU, memSizeMB +} diff --git a/src/runtime/pkg/oci/utils_test.go b/src/runtime/pkg/oci/utils_test.go index 95e9625425..ea8128ed27 100644 --- a/src/runtime/pkg/oci/utils_test.go +++ b/src/runtime/pkg/oci/utils_test.go @@ -1061,3 +1061,149 @@ func TestParseAnnotationBoolConfiguration(t *testing.T) { } } } + +func getCtrResourceSpec(memory, quota int64, period uint64) *specs.Spec { + return &specs.Spec{ + Linux: &specs.Linux{ + Resources: &specs.LinuxResources{ + CPU: &specs.LinuxCPU{ + Quota: &quota, + Period: &period, + }, + Memory: &specs.LinuxMemory{ + Limit: &memory, + }, + }, + }, + } + +} + +func makeSizingAnnotations(memory, quota, period string) *specs.Spec { + spec := specs.Spec{ + Annotations: make(map[string]string), + } + spec.Annotations[ctrAnnotations.SandboxCPUPeriod] = period + spec.Annotations[ctrAnnotations.SandboxCPUQuota] = quota + spec.Annotations[ctrAnnotations.SandboxMem] = memory + + return &spec +} + +func TestCalculateContainerSizing(t *testing.T) { + assert := assert.New(t) + + testCases := []struct { + spec *specs.Spec + expectedCPU uint32 + expectedMem uint32 + }{ + { + spec: nil, + expectedCPU: 0, + expectedMem: 0, + }, + { + spec: &specs.Spec{}, +
expectedCPU: 0, + expectedMem: 0, + }, + { + spec: &specs.Spec{ + Linux: &specs.Linux{ + Resources: &specs.LinuxResources{ + CPU: &specs.LinuxCPU{}, + Memory: &specs.LinuxMemory{}, + }, + }, + }, + expectedCPU: 0, + expectedMem: 0, + }, + { + spec: getCtrResourceSpec(1024*1024, 200, 100), + expectedCPU: 2, + expectedMem: 1, + }, + { + spec: getCtrResourceSpec(1024*1024*1024, 200, 1), + expectedCPU: 200, + expectedMem: 1024, + }, + { + spec: getCtrResourceSpec(-1*1024*1024*1024, 200, 1), + expectedCPU: 200, + expectedMem: 0, + }, + { + spec: getCtrResourceSpec(0, 10, 0), + expectedCPU: 0, + expectedMem: 0, + }, + { + spec: getCtrResourceSpec(-1, 10, 1), + expectedCPU: 10, + expectedMem: 0, + }, + } + + for _, tt := range testCases { + + cpu, mem := CalculateContainerSizing(tt.spec) + assert.Equal(tt.expectedCPU, cpu, "unexpected CPU") + assert.Equal(tt.expectedMem, mem, "unexpected memory") + } +} + +func TestCalculateSandboxSizing(t *testing.T) { + assert := assert.New(t) + + testCases := []struct { + spec *specs.Spec + expectedCPU uint32 + expectedMem uint32 + }{ + { + spec: nil, + expectedCPU: 0, + expectedMem: 0, + }, + { + spec: &specs.Spec{}, + expectedCPU: 0, + expectedMem: 0, + }, + { + spec: makeSizingAnnotations("1048576", "200", "100"), + expectedCPU: 2, + expectedMem: 1, + }, + { + spec: makeSizingAnnotations("1024", "200", "1"), + expectedCPU: 200, + expectedMem: 0, + }, + { + spec: makeSizingAnnotations("foobar", "200", "spaghetti"), + expectedCPU: 0, + expectedMem: 0, + }, + { + spec: makeSizingAnnotations("-1048576", "-100", "1"), + expectedCPU: 0, + expectedMem: 0, + }, + { + spec: makeSizingAnnotations("-1", "100", "1"), + expectedCPU: 100, + expectedMem: 0, + }, + } + + for _, tt := range testCases { + + cpu, mem := CalculateSandboxSizing(tt.spec) + assert.Equal(tt.expectedCPU, cpu, "unexpected CPU") + assert.Equal(tt.expectedMem, mem, "unexpected memory") + } +} diff --git a/src/runtime/virtcontainers/sandbox.go 
b/src/runtime/virtcontainers/sandbox.go index 3574d28eab..6dabac5e19 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -99,6 +99,17 @@ type SandboxStats struct { Cpus int } +type SandboxResourceSizing struct { + // The number of CPUs required for the sandbox workload(s) + WorkloadCPUs uint32 + // The base number of CPUs for the VM that are assigned as overhead + BaseCPUs uint32 + // The amount of memory required for the sandbox workload(s) + WorkloadMemMB uint32 + // The base amount of memory required for that VM that is assigned as overhead + BaseMemMB uint32 +} + // SandboxConfig is a Sandbox configuration. type SandboxConfig struct { // Volumes is a list of shared volumes between the host and the Sandbox. @@ -132,6 +143,11 @@ type SandboxConfig struct { HypervisorConfig HypervisorConfig + SandboxResources SandboxResourceSizing + + // StaticResourceMgmt indicates if the shim should rely on statically sizing the sandbox (VM) + StaticResourceMgmt bool + ShmSize uint64 VfioMode config.VFIOModeType @@ -1573,7 +1589,7 @@ func (s *Sandbox) createContainers(ctx context.Context) error { } // Update resources after having added containers to the sandbox, since - // container status is requiered to know if more resources should be added. + // container status is required to know if more resources should be added. if err := s.updateResources(ctx); err != nil { return err } @@ -1909,6 +1925,10 @@ func (s *Sandbox) updateResources(ctx context.Context) error { return fmt.Errorf("sandbox config is nil") } + if s.config.StaticResourceMgmt { + s.Logger().Debug("no resources updated: static resource management is set") + return nil + } sandboxVCPUs, err := s.calculateSandboxCPUs() if err != nil { return err