From 317ebb81de1bf97056d932d14d595af9d0965cdb Mon Sep 17 00:00:00 2001 From: Cameron Baird Date: Tue, 29 Jul 2025 21:44:59 +0000 Subject: [PATCH] runtime: Enforce that OCI memory limit exceeds 128MB baseline For our Kata UVM, we know we need at least 128MB of memory to prevent instability in the guest. Enforce this constraint with a descriptive error to prevent users from destabilizing the UVM with faulty k8s configurations. Signed-off-by: Cameron Baird --- src/runtime/Makefile | 6 +++ src/runtime/config/configuration-clh.toml.in | 5 +++ src/runtime/pkg/katautils/config.go | 44 ++++++++++--------- src/runtime/pkg/oci/utils.go | 7 +++ .../node-builder/azure-linux/package_build.sh | 4 +- 5 files changed, 43 insertions(+), 23 deletions(-) diff --git a/src/runtime/Makefile b/src/runtime/Makefile index 49635b15d5..7e6cbf1f12 100644 --- a/src/runtime/Makefile +++ b/src/runtime/Makefile @@ -278,6 +278,11 @@ DEFSTATICRESOURCEMGMT_TEE = true DEFSTATICSANDBOXWORKLOADMEM ?= 2048 DEFSTATICSANDBOXWORKLOADVCPUS ?= 1 +# If set, the runtime will enforce that pods deployed in a sandbox +# explicitly setting memory limits using resources.limits.memory +# allow at least this amount of memory in MiB so that the sandbox can properly start. 
+DEFSANDBOXWORKLOADMEMMIN ?= 128 + DEFDISABLEIMAGENVDIMM ?= false DEFBINDMOUNTS := [] @@ -751,6 +756,7 @@ USER_VARS += DEFSTATICRESOURCEMGMT_FC USER_VARS += DEFSTATICRESOURCEMGMT_STRATOVIRT USER_VARS += DEFSTATICRESOURCEMGMT_TEE USER_VARS += DEFSTATICSANDBOXWORKLOADMEM +USER_VARS += DEFSANDBOXWORKLOADMEMMIN USER_VARS += DEFSTATICSANDBOXWORKLOADVCPUS USER_VARS += DEFBINDMOUNTS USER_VARS += DEFCREATECONTAINERTIMEOUT diff --git a/src/runtime/config/configuration-clh.toml.in b/src/runtime/config/configuration-clh.toml.in index 44d0934225..6206f0a385 100644 --- a/src/runtime/config/configuration-clh.toml.in +++ b/src/runtime/config/configuration-clh.toml.in @@ -442,6 +442,11 @@ static_sandbox_default_workload_mem=@DEFSTATICSANDBOXWORKLOADMEM@ # default amount of vcpus available within the sandbox. static_sandbox_default_workload_vcpus=@DEFSTATICSANDBOXWORKLOADVCPUS@ +# The runtime will enforce that pods deployed in a sandbox +# explicitly setting memory limits using resources.limits.memory +# allow at least this amount of memory in MiB so that the sandbox can properly start. +sandbox_workload_mem_min=@DEFSANDBOXWORKLOADMEMMIN@ + # If specified, sandbox_bind_mounts identifieds host paths to be mounted (ro) into the sandboxes shared path. # This is only valid if filesystem sharing is utilized. The provided path(s) will be bindmounted into the shared fs directory. 
# If defaults are utilized, these mounts should be available in the guest at `/run/kata-containers/shared/containers/sandbox-mounts` diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 357915917b..cd4461e0cc 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -174,28 +174,29 @@ type hypervisor struct { } type runtime struct { - InterNetworkModel string `toml:"internetworking_model"` - JaegerEndpoint string `toml:"jaeger_endpoint"` - JaegerUser string `toml:"jaeger_user"` - JaegerPassword string `toml:"jaeger_password"` - VfioMode string `toml:"vfio_mode"` - GuestSeLinuxLabel string `toml:"guest_selinux_label"` - SandboxBindMounts []string `toml:"sandbox_bind_mounts"` - Experimental []string `toml:"experimental"` - Tracing bool `toml:"enable_tracing"` - DisableNewNetNs bool `toml:"disable_new_netns"` - DisableGuestSeccomp bool `toml:"disable_guest_seccomp"` - EnableVCPUsPinning bool `toml:"enable_vcpus_pinning"` - Debug bool `toml:"enable_debug"` - SandboxCgroupOnly bool `toml:"sandbox_cgroup_only"` - StaticSandboxResourceMgmt bool `toml:"static_sandbox_resource_mgmt"` - StaticSandboxWorkloadDefaultMem uint32 `toml:"static_sandbox_default_workload_mem"` + InterNetworkModel string `toml:"internetworking_model"` + JaegerEndpoint string `toml:"jaeger_endpoint"` + JaegerUser string `toml:"jaeger_user"` + JaegerPassword string `toml:"jaeger_password"` + VfioMode string `toml:"vfio_mode"` + GuestSeLinuxLabel string `toml:"guest_selinux_label"` + SandboxBindMounts []string `toml:"sandbox_bind_mounts"` + Experimental []string `toml:"experimental"` + Tracing bool `toml:"enable_tracing"` + DisableNewNetNs bool `toml:"disable_new_netns"` + DisableGuestSeccomp bool `toml:"disable_guest_seccomp"` + EnableVCPUsPinning bool `toml:"enable_vcpus_pinning"` + Debug bool `toml:"enable_debug"` + SandboxCgroupOnly bool `toml:"sandbox_cgroup_only"` + StaticSandboxResourceMgmt bool 
`toml:"static_sandbox_resource_mgmt"` + StaticSandboxWorkloadDefaultMem uint32 `toml:"static_sandbox_default_workload_mem"` StaticSandboxWorkloadDefaultVcpus float32 `toml:"static_sandbox_default_workload_vcpus"` - EnablePprof bool `toml:"enable_pprof"` - DisableGuestEmptyDir bool `toml:"disable_guest_empty_dir"` - CreateContainerTimeout uint64 `toml:"create_container_timeout"` - DanConf string `toml:"dan_conf"` - ForceGuestPull bool `toml:"experimental_force_guest_pull"` + SandboxWorkloadMemMin uint32 `toml:"sandbox_workload_mem_min"` + EnablePprof bool `toml:"enable_pprof"` + DisableGuestEmptyDir bool `toml:"disable_guest_empty_dir"` + CreateContainerTimeout uint64 `toml:"create_container_timeout"` + DanConf string `toml:"dan_conf"` + ForceGuestPull bool `toml:"experimental_force_guest_pull"` } type agent struct { @@ -1565,6 +1566,7 @@ func LoadConfiguration(configPath string, ignoreLogging bool) (resolvedConfigPat config.GuestSeLinuxLabel = tomlConf.Runtime.GuestSeLinuxLabel config.StaticSandboxResourceMgmt = tomlConf.Runtime.StaticSandboxResourceMgmt config.StaticSandboxWorkloadDefaultMem = tomlConf.Runtime.StaticSandboxWorkloadDefaultMem + config.SandboxWorkloadMemMin = tomlConf.Runtime.SandboxWorkloadMemMin config.StaticSandboxWorkloadDefaultVcpus = tomlConf.Runtime.StaticSandboxWorkloadDefaultVcpus config.SandboxCgroupOnly = tomlConf.Runtime.SandboxCgroupOnly config.DisableNewNetNs = tomlConf.Runtime.DisableNewNetNs diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go index aad04053d2..4ff3231069 100644 --- a/src/runtime/pkg/oci/utils.go +++ b/src/runtime/pkg/oci/utils.go @@ -159,6 +159,9 @@ type RuntimeConfig struct { // vcpus to allocate for workloads within the sandbox when workload vcpus is unspecified StaticSandboxWorkloadDefaultVcpus float32 + // Minimum memory (in MiB) that must be allocated for workloads within the sandbox when workload memory is specified + SandboxWorkloadMemMin uint32 + // Determines if create a netns for
hypervisor process DisableNewNetNs bool @@ -1202,6 +1205,10 @@ func SandboxConfig(ocispec specs.Spec, runtime RuntimeConfig, bundlePath, cid st } + if sandboxConfig.SandboxResources.WorkloadMemMB < runtime.SandboxWorkloadMemMin { + return vc.SandboxConfig{}, fmt.Errorf("pod memory limit too low: minimum %dMiB, got %dMiB", runtime.SandboxWorkloadMemMin, sandboxConfig.SandboxResources.WorkloadMemMB) + } + return sandboxConfig, nil } diff --git a/tools/osbuilder/node-builder/azure-linux/package_build.sh b/tools/osbuilder/node-builder/azure-linux/package_build.sh index fb93eec197..346ba5a9f0 100755 --- a/tools/osbuilder/node-builder/azure-linux/package_build.sh +++ b/tools/osbuilder/node-builder/azure-linux/package_build.sh @@ -29,9 +29,9 @@ runtime_make_flags="SKIP_GO_VERSION_CHECK=1 QEMUCMD= FCCMD= ACRNCMD= STRATOVIRTC # - for ConfPods we explicitly set the cloud-hypervisor path. The path is independent of the PREFIX variable # as we have a single CLH binary for both vanilla Kata and ConfPods if [ "${CONF_PODS}" == "no" ]; then - runtime_make_flags+=" DEFSTATICRESOURCEMGMT_CLH=true KERNELPATH_CLH=${KERNEL_BINARY_LOCATION}" + runtime_make_flags+=" DEFSTATICRESOURCEMGMT_CLH=true KERNELPATH_CLH=${KERNEL_BINARY_LOCATION} DEFSANDBOXWORKLOADMEMMIN=128" else - runtime_make_flags+=" CLHPATH=${CLOUD_HYPERVISOR_LOCATION}" + runtime_make_flags+=" CLHPATH=${CLOUD_HYPERVISOR_LOCATION} DEFSANDBOXWORKLOADMEMMIN=192" fi # On Mariner 3.0 we use cgroupsv2 with a single sandbox cgroup