From a34c74a2d4c802b8475bc9ac5c00ebb3c7d8090d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 24 Jun 2026 18:47:38 +0200 Subject: [PATCH 1/4] runtime-rs: size static sandboxes with overhead values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When static sandbox sizing is enabled, keep configured defaults when workloads do not specify CPU or memory limits. When limits are present, size the VM as requested resources plus overhead_vcpus/overhead_memory values derived from runtime-rs profile defaults. Limit-driven vCPU sizing is clamped to a minimum of one vCPU so a 0.0 result never yields an unbootable VM, and sandbox setup fails early with a clear, actionable error when the computed memory is 0 MiB (pointing at memory limits or non-zero default/overhead memory settings). This keeps static VM sizing predictable across runtime-rs profiles, including NVIDIA ones. Signed-off-by: Fabiano Fidêncio Assisted-by: Cursor --- .../kata-types/src/config/hypervisor/mod.rs | 57 +++++++ src/runtime-rs/Makefile | 28 +++ ...configuration-clh-azure-runtime-rs.toml.in | 17 ++ .../configuration-clh-runtime-rs.toml.in | 17 ++ .../config/configuration-dragonball.toml.in | 17 ++ ...iguration-qemu-coco-dev-runtime-rs.toml.in | 17 ++ ...uration-qemu-nvidia-gpu-runtime-rs.toml.in | 17 ++ ...ion-qemu-nvidia-gpu-snp-runtime-rs.toml.in | 17 ++ ...ion-qemu-nvidia-gpu-tdx-runtime-rs.toml.in | 17 ++ .../configuration-qemu-runtime-rs.toml.in | 17 ++ .../configuration-qemu-se-runtime-rs.toml.in | 17 ++ .../configuration-qemu-snp-runtime-rs.toml.in | 17 ++ .../configuration-qemu-tdx-runtime-rs.toml.in | 17 ++ .../resource/src/cpu_mem/initial_size.rs | 161 ++++++++++++++---- .../pod-guest-pull-in-trusted-storage.yaml.in | 1 + 15 files changed, 401 insertions(+), 33 deletions(-) diff --git a/src/libs/kata-types/src/config/hypervisor/mod.rs b/src/libs/kata-types/src/config/hypervisor/mod.rs index f40bed3b1f..eaab7eb82f 100644 --- 
a/src/libs/kata-types/src/config/hypervisor/mod.rs +++ b/src/libs/kata-types/src/config/hypervisor/mod.rs @@ -641,6 +641,13 @@ pub struct CpuInfo { /// - `> number of physical cores`: Set to actual number of physical cores #[serde(default)] pub default_vcpus: f32, + /// vCPU overhead to be added when sandbox/container resource limits are provided. + /// + /// This value is used by runtime-rs static sandbox sizing as: + /// - if no CPU or memory limits are provided: use `default_vcpus` + /// - if any CPU or memory limit is provided: use `max(overhead_vcpus + workload_vcpus, 1)` + #[serde(default)] + pub overhead_vcpus: f32, /// Default maximum number of vCPUs per SB/VM: /// - Unspecified or `0`: Set to actual number of physical cores or @@ -973,6 +980,14 @@ pub struct MemoryInfo { /// Default memory size in MiB for SB/VM. #[serde(default)] pub default_memory: u32, + /// Memory overhead in MiB to be added when sandbox/container resource + /// limits are provided. + /// + /// This value is used by runtime-rs static sandbox sizing as: + /// - if no CPU or memory limits are provided: use `default_memory` + /// - if any CPU or memory limit is provided: use `overhead_memory + workload_memory` + #[serde(default)] + pub overhead_memory: u32, /// Default maximum memory in MiB per SB/VM: /// - Unspecified or `0`: Set to actual physical RAM @@ -1974,11 +1989,13 @@ mod tests { input: &mut CpuInfo { cpu_features: "".to_string(), default_vcpus: 0.0, + overhead_vcpus: 0.0, default_maxvcpus: 0, }, output: CpuInfo { cpu_features: "".to_string(), default_vcpus, + overhead_vcpus: 0.0, default_maxvcpus: node_cpus as u32, }, }, @@ -1987,11 +2004,13 @@ mod tests { input: &mut CpuInfo { cpu_features: "a,b,c".to_string(), default_vcpus: 9999999.0, + overhead_vcpus: 0.0, default_maxvcpus: 9999999, }, output: CpuInfo { cpu_features: "a,b,c".to_string(), default_vcpus: node_cpus, + overhead_vcpus: 0.0, default_maxvcpus: node_cpus as u32, }, }, @@ -2000,14 +2019,31 @@ mod tests { input: &mut CpuInfo { cpu_features: "a, b ,c".to_string(), 
default_vcpus: -1.0, + overhead_vcpus: 0.0, default_maxvcpus: 1, }, output: CpuInfo { cpu_features: "a,b,c".to_string(), default_vcpus: 1.0, + overhead_vcpus: 0.0, default_maxvcpus: 1, }, }, + TestData { + desc: "overhead_vcpus explicitly set keeps value", + input: &mut CpuInfo { + cpu_features: "x, y".to_string(), + default_vcpus: 0.0, + overhead_vcpus: 0.5, + default_maxvcpus: 2, + }, + output: CpuInfo { + cpu_features: "x,y".to_string(), + default_vcpus, + overhead_vcpus: 0.5, + default_maxvcpus: 2, + }, + }, ]; for tc in tests.iter_mut() { @@ -2029,9 +2065,30 @@ mod tests { "test[{}] default_maxvcpus", tc.desc ); + assert_eq!( + tc.input.overhead_vcpus, tc.output.overhead_vcpus, + "test[{}] overhead_vcpus", + tc.desc + ); } } + #[test] + fn test_memory_info_adjust_config_keeps_explicit_overhead_memory() { + let mut mem = MemoryInfo { + default_memory: 1024, + overhead_memory: 512, + default_maxmemory: 4096, + ..Default::default() + }; + + mem.adjust_config().unwrap(); + + assert_eq!(mem.overhead_memory, 512); + assert_eq!(mem.default_memory, 1024); + assert_eq!(mem.default_maxmemory, 4096); + } + #[cfg(all(target_arch = "powerpc64", target_endian = "little"))] use rstest::rstest; diff --git a/src/runtime-rs/Makefile b/src/runtime-rs/Makefile index edcdcf2ee3..aa79ae33a2 100644 --- a/src/runtime-rs/Makefile +++ b/src/runtime-rs/Makefile @@ -161,6 +161,22 @@ DEFVCPUS := 1 DEFMAXVCPUS := 0 ##VAR DEFMEMSZ= Default memory size in MiB DEFMEMSZ := 2048 +##VAR DEFOVERHEADVCPUS_QEMU= vCPU overhead for qemu runtimes +DEFOVERHEADVCPUS_QEMU := 0.2 +##VAR DEFOVERHEADMEMSZ_QEMU= Memory overhead (MiB) for qemu runtimes +DEFOVERHEADMEMSZ_QEMU := 32 +##VAR DEFOVERHEADVCPUS_CLH= vCPU overhead for clh runtimes +DEFOVERHEADVCPUS_CLH := 0.2 +##VAR DEFOVERHEADMEMSZ_CLH= Memory overhead (MiB) for clh runtimes +DEFOVERHEADMEMSZ_CLH := 32 +##VAR DEFOVERHEADVCPUS_DB= vCPU overhead for dragonball runtimes +DEFOVERHEADVCPUS_DB := 0.2 +##VAR DEFOVERHEADMEMSZ_DB= Memory overhead (MiB) for 
dragonball runtimes +DEFOVERHEADMEMSZ_DB := 32 +##VAR DEFOVERHEADVCPUS_TEE= vCPU overhead for TEE runtimes +DEFOVERHEADVCPUS_TEE := 0.4 +##VAR DEFOVERHEADMEMSZ_TEE= Memory overhead (MiB) for TEE runtimes +DEFOVERHEADMEMSZ_TEE := 128 ##VAR DEFMEMSLOTS= Default memory slots # Cases to consider : # - nvdimm rootfs image @@ -452,6 +468,8 @@ endif KERNELVERITYPARAMS_NV ?= DEFAULTVCPUS_NV := 1 DEFAULTMEMORY_NV := 8192 + DEFOVERHEADVCPUS_NV := 0.5 + DEFOVERHEADMEMSZ_NV := 512 DEFAULTTIMEOUT_NV := 1200 DEFAULTLAUNCHPROCESSTIMEOUT_NV := 15 DEFAULTPCIEROOTPORT_NV := 8 @@ -672,6 +690,14 @@ USER_VARS += SHAREDIR USER_VARS += SYSCONFDIR USER_VARS += DEFVCPUS USER_VARS += DEFVCPUS_QEMU +USER_VARS += DEFOVERHEADVCPUS_QEMU +USER_VARS += DEFOVERHEADMEMSZ_QEMU +USER_VARS += DEFOVERHEADVCPUS_CLH +USER_VARS += DEFOVERHEADMEMSZ_CLH +USER_VARS += DEFOVERHEADVCPUS_TEE +USER_VARS += DEFOVERHEADVCPUS_DB +USER_VARS += DEFOVERHEADMEMSZ_DB +USER_VARS += DEFOVERHEADMEMSZ_TEE USER_VARS += DEFMAXVCPUS USER_VARS += DEFMAXVCPUS_DB USER_VARS += DEFMAXVCPUS_QEMU @@ -760,6 +786,8 @@ USER_VARS += KERNELPARAMS_CONFIDENTIAL_NV USER_VARS += KERNELVERITYPARAMS_NV USER_VARS += DEFAULTVCPUS_NV USER_VARS += DEFAULTMEMORY_NV +USER_VARS += DEFOVERHEADVCPUS_NV +USER_VARS += DEFOVERHEADMEMSZ_NV USER_VARS += DEFAULTTIMEOUT_NV USER_VARS += DEFAULTLAUNCHPROCESSTIMEOUT_NV USER_VARS += DEFAULTPCIEROOTPORT_NV diff --git a/src/runtime-rs/config/configuration-clh-azure-runtime-rs.toml.in b/src/runtime-rs/config/configuration-clh-azure-runtime-rs.toml.in index 308ad7bbd9..668f5e527d 100644 --- a/src/runtime-rs/config/configuration-clh-azure-runtime-rs.toml.in +++ b/src/runtime-rs/config/configuration-clh-azure-runtime-rs.toml.in @@ -65,6 +65,15 @@ kernel_params = "@KERNELPARAMS@" # > number of physical cores --> will be set to the actual number of physical cores default_vcpus = @DEFVCPUS@ +# Guest-side vCPU overhead budget (fractional) used with static_sandbox_resource_mgmt. 
+# When workload limits are present, vm_vcpus = requested_vcpus + overhead_vcpus +# (rounded up at boot). If a workload limit is set on another dimension (for example +# memory) but CPU is missing, requested_vcpus is treated as 0 and vm_vcpus equals +# overhead_vcpus (minimum 1 at boot). When no workload limits are present, +# default_vcpus is used instead. +# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_vcpus = @DEFOVERHEADVCPUS_CLH@ + # Default maximum number of vCPUs per SB/VM: # unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number # of vCPUs supported by KVM if that number is exceeded @@ -85,6 +94,14 @@ default_maxvcpus = @DEFMAXVCPUS@ # If unspecified then it will be set @DEFMEMSZ@ MiB. default_memory = @DEFMEMSZ@ +# Guest-side memory overhead budget (MiB) used with static_sandbox_resource_mgmt. +# When workload limits are present, vm_memory = requested_memory + overhead_memory. +# If a workload limit is set on another dimension (for example CPU) but memory is +# missing, requested_memory is treated as 0, so vm_memory equals overhead_memory. +# When no workload limits are present, default_memory is used instead. +# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_memory = @DEFOVERHEADMEMSZ_CLH@ + # Shared file system type: # - virtio-fs # - virtio-fs-nydus diff --git a/src/runtime-rs/config/configuration-clh-runtime-rs.toml.in b/src/runtime-rs/config/configuration-clh-runtime-rs.toml.in index c34e95b152..842f77fcd9 100644 --- a/src/runtime-rs/config/configuration-clh-runtime-rs.toml.in +++ b/src/runtime-rs/config/configuration-clh-runtime-rs.toml.in @@ -65,6 +65,15 @@ kernel_params = "@KERNELPARAMS@" # > number of physical cores --> will be set to the actual number of physical cores default_vcpus = @DEFVCPUS@ +# Guest-side vCPU overhead budget (fractional) used with static_sandbox_resource_mgmt. 
+# When workload limits are present, vm_vcpus = requested_vcpus + overhead_vcpus +# (rounded up at boot). If a workload limit is set on another dimension (for example +# memory) but CPU is missing, requested_vcpus is treated as 0 and vm_vcpus equals +# overhead_vcpus (minimum 1 at boot). When no workload limits are present, +# default_vcpus is used instead. +# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_vcpus = @DEFOVERHEADVCPUS_CLH@ + # Default maximum number of vCPUs per SB/VM: # unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number # of vCPUs supported by KVM if that number is exceeded @@ -85,6 +94,14 @@ default_maxvcpus = @DEFMAXVCPUS@ # If unspecified then it will be set @DEFMEMSZ@ MiB. default_memory = @DEFMEMSZ@ +# Guest-side memory overhead budget (MiB) used with static_sandbox_resource_mgmt. +# When workload limits are present, vm_memory = requested_memory + overhead_memory. +# If a workload limit is set on another dimension (for example CPU) but memory is +# missing, requested_memory is treated as 0, so vm_memory equals overhead_memory. +# When no workload limits are present, default_memory is used instead. +# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_memory = @DEFOVERHEADMEMSZ_CLH@ + # Shared file system type: # - virtio-fs # - virtio-fs-nydus diff --git a/src/runtime-rs/config/configuration-dragonball.toml.in b/src/runtime-rs/config/configuration-dragonball.toml.in index 44e5c903f2..bc13e8427c 100644 --- a/src/runtime-rs/config/configuration-dragonball.toml.in +++ b/src/runtime-rs/config/configuration-dragonball.toml.in @@ -68,6 +68,15 @@ firmware = "@FIRMWAREPATH@" # > number of physical cores --> will be set to the actual number of physical cores default_vcpus = @DEFVCPUS@ +# Guest-side vCPU overhead budget (fractional) used with static_sandbox_resource_mgmt. 
+# When workload limits are present, vm_vcpus = requested_vcpus + overhead_vcpus +# (rounded up at boot). If a workload limit is set on another dimension (for example +# memory) but CPU is missing, requested_vcpus is treated as 0 and vm_vcpus equals +# overhead_vcpus (minimum 1 at boot). When no workload limits are present, +# default_vcpus is used instead. +# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_vcpus = @DEFOVERHEADVCPUS_DB@ + # Default maximum number of vCPUs per SB/VM: # unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number @@ -112,6 +121,14 @@ reclaim_guest_freed_memory = false # If unspecified then it will be set @DEFMEMSZ@ MiB. default_memory = @DEFMEMSZ@ +# Guest-side memory overhead budget (MiB) used with static_sandbox_resource_mgmt. +# When workload limits are present, vm_memory = requested_memory + overhead_memory. +# If a workload limit is set on another dimension (for example CPU) but memory is +# missing, requested_memory is treated as 0, so vm_memory equals overhead_memory. +# When no workload limits are present, default_memory is used instead. 
+# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_memory = @DEFOVERHEADMEMSZ_DB@ + # Default maximum memory in MiB per SB / VM # unspecified or == 0 --> will be set to the actual amount of physical RAM # > 0 <= amount of physical RAM --> will be set to the specified number diff --git a/src/runtime-rs/config/configuration-qemu-coco-dev-runtime-rs.toml.in b/src/runtime-rs/config/configuration-qemu-coco-dev-runtime-rs.toml.in index 437cd740c4..3002a8254d 100644 --- a/src/runtime-rs/config/configuration-qemu-coco-dev-runtime-rs.toml.in +++ b/src/runtime-rs/config/configuration-qemu-coco-dev-runtime-rs.toml.in @@ -107,6 +107,15 @@ cpu_features = "@CPUFEATURES@" # > number of physical cores --> will be set to the actual number of physical cores default_vcpus = @DEFVCPUS_QEMU@ +# Guest-side vCPU overhead budget (fractional) used with static_sandbox_resource_mgmt. +# When workload limits are present, vm_vcpus = requested_vcpus + overhead_vcpus +# (rounded up at boot). If a workload limit is set on another dimension (for example +# memory) but CPU is missing, requested_vcpus is treated as 0 and vm_vcpus equals +# overhead_vcpus (minimum 1 at boot). When no workload limits are present, +# default_vcpus is used instead. +# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_vcpus = @DEFOVERHEADVCPUS_TEE@ + # Default maximum number of vCPUs per SB/VM: # unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number # of vCPUs supported by KVM if that number is exceeded @@ -149,6 +158,14 @@ reclaim_guest_freed_memory = false # Default memory size in MiB for SB/VM. # If unspecified then it will be set @DEFMEMSZ@ MiB. default_memory = @DEFMEMSZ@ + +# Guest-side memory overhead budget (MiB) used with static_sandbox_resource_mgmt. +# When workload limits are present, vm_memory = requested_memory + overhead_memory. 
+# If a workload limit is set on another dimension (for example CPU) but memory is +# missing, requested_memory is treated as 0, so vm_memory equals overhead_memory. +# When no workload limits are present, default_memory is used instead. +# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_memory = @DEFOVERHEADMEMSZ_TEE@ # # Default memory slots per SB/VM. # If unspecified then it will be set @DEFMEMSLOTS@. diff --git a/src/runtime-rs/config/configuration-qemu-nvidia-gpu-runtime-rs.toml.in b/src/runtime-rs/config/configuration-qemu-nvidia-gpu-runtime-rs.toml.in index 3738301bcd..ff1785ad14 100644 --- a/src/runtime-rs/config/configuration-qemu-nvidia-gpu-runtime-rs.toml.in +++ b/src/runtime-rs/config/configuration-qemu-nvidia-gpu-runtime-rs.toml.in @@ -99,6 +99,15 @@ cpu_features = "@CPUFEATURES@" # > number of physical cores --> will be set to the actual number of physical cores default_vcpus = @DEFAULTVCPUS_NV@ +# Guest-side vCPU overhead budget (fractional) used with static_sandbox_resource_mgmt. +# When workload limits are present, vm_vcpus = requested_vcpus + overhead_vcpus +# (rounded up at boot). If a workload limit is set on another dimension (for example +# memory) but CPU is missing, requested_vcpus is treated as 0 and vm_vcpus equals +# overhead_vcpus (minimum 1 at boot). When no workload limits are present, +# default_vcpus is used instead. +# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_vcpus = @DEFOVERHEADVCPUS_NV@ + # Default maximum number of vCPUs per SB/VM: # unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number # of vCPUs supported by KVM if that number is exceeded @@ -141,6 +150,14 @@ reclaim_guest_freed_memory = false # Default memory size in MiB for SB/VM. # If unspecified then it will be set @DEFMEMSZ@ MiB. default_memory = @DEFAULTMEMORY_NV@ + +# Guest-side memory overhead budget (MiB) used with static_sandbox_resource_mgmt. 
+# When workload limits are present, vm_memory = requested_memory + overhead_memory. +# If a workload limit is set on another dimension (for example CPU) but memory is +# missing, requested_memory is treated as 0, so vm_memory equals overhead_memory. +# When no workload limits are present, default_memory is used instead. +# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_memory = @DEFOVERHEADMEMSZ_NV@ # # Default memory slots per SB/VM. # If unspecified then it will be set @DEFMEMSLOTS@. diff --git a/src/runtime-rs/config/configuration-qemu-nvidia-gpu-snp-runtime-rs.toml.in b/src/runtime-rs/config/configuration-qemu-nvidia-gpu-snp-runtime-rs.toml.in index 06f7a2e9f9..82814e2bf0 100644 --- a/src/runtime-rs/config/configuration-qemu-nvidia-gpu-snp-runtime-rs.toml.in +++ b/src/runtime-rs/config/configuration-qemu-nvidia-gpu-snp-runtime-rs.toml.in @@ -140,6 +140,15 @@ cpu_features = "@CPUFEATURES@" # > number of physical cores --> will be set to the actual number of physical cores default_vcpus = @DEFAULTVCPUS_NV@ +# Guest-side vCPU overhead budget (fractional) used with static_sandbox_resource_mgmt. +# When workload limits are present, vm_vcpus = requested_vcpus + overhead_vcpus +# (rounded up at boot). If a workload limit is set on another dimension (for example +# memory) but CPU is missing, requested_vcpus is treated as 0 and vm_vcpus equals +# overhead_vcpus (minimum 1 at boot). When no workload limits are present, +# default_vcpus is used instead. +# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_vcpus = @DEFOVERHEADVCPUS_NV@ + # Default maximum number of vCPUs per SB/VM: # unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number # of vCPUs supported by KVM if that number is exceeded @@ -182,6 +191,14 @@ reclaim_guest_freed_memory = false # Default memory size in MiB for SB/VM. # If unspecified then it will be set @DEFMEMSZ@ MiB. 
default_memory = @DEFAULTMEMORY_NV@ + +# Guest-side memory overhead budget (MiB) used with static_sandbox_resource_mgmt. +# When workload limits are present, vm_memory = requested_memory + overhead_memory. +# If a workload limit is set on another dimension (for example CPU) but memory is +# missing, requested_memory is treated as 0, so vm_memory equals overhead_memory. +# When no workload limits are present, default_memory is used instead. +# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_memory = @DEFOVERHEADMEMSZ_NV@ # # Default memory slots per SB/VM. # If unspecified then it will be set @DEFMEMSLOTS@. diff --git a/src/runtime-rs/config/configuration-qemu-nvidia-gpu-tdx-runtime-rs.toml.in b/src/runtime-rs/config/configuration-qemu-nvidia-gpu-tdx-runtime-rs.toml.in index 9ae7041cc5..1df79b54b9 100644 --- a/src/runtime-rs/config/configuration-qemu-nvidia-gpu-tdx-runtime-rs.toml.in +++ b/src/runtime-rs/config/configuration-qemu-nvidia-gpu-tdx-runtime-rs.toml.in @@ -116,6 +116,15 @@ cpu_features = "@CPUFEATURES@" # > number of physical cores --> will be set to the actual number of physical cores default_vcpus = @DEFAULTVCPUS_NV@ +# Guest-side vCPU overhead budget (fractional) used with static_sandbox_resource_mgmt. +# When workload limits are present, vm_vcpus = requested_vcpus + overhead_vcpus +# (rounded up at boot). If a workload limit is set on another dimension (for example +# memory) but CPU is missing, requested_vcpus is treated as 0 and vm_vcpus equals +# overhead_vcpus (minimum 1 at boot). When no workload limits are present, +# default_vcpus is used instead. 
+# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_vcpus = @DEFOVERHEADVCPUS_NV@ + # Default maximum number of vCPUs per SB/VM: # unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number # of vCPUs supported by KVM if that number is exceeded @@ -158,6 +167,14 @@ reclaim_guest_freed_memory = false # Default memory size in MiB for SB/VM. # If unspecified then it will be set @DEFMEMSZ@ MiB. default_memory = @DEFAULTMEMORY_NV@ + +# Guest-side memory overhead budget (MiB) used with static_sandbox_resource_mgmt. +# When workload limits are present, vm_memory = requested_memory + overhead_memory. +# If a workload limit is set on another dimension (for example CPU) but memory is +# missing, requested_memory is treated as 0, so vm_memory equals overhead_memory. +# When no workload limits are present, default_memory is used instead. +# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_memory = @DEFOVERHEADMEMSZ_NV@ # # Default memory slots per SB/VM. # If unspecified then it will be set @DEFMEMSLOTS@. diff --git a/src/runtime-rs/config/configuration-qemu-runtime-rs.toml.in b/src/runtime-rs/config/configuration-qemu-runtime-rs.toml.in index 47adda3409..897ae5166d 100644 --- a/src/runtime-rs/config/configuration-qemu-runtime-rs.toml.in +++ b/src/runtime-rs/config/configuration-qemu-runtime-rs.toml.in @@ -86,6 +86,15 @@ cpu_features = "@CPUFEATURES@" # > number of physical cores --> will be set to the actual number of physical cores default_vcpus = @DEFVCPUS_QEMU@ +# Guest-side vCPU overhead budget (fractional) used with static_sandbox_resource_mgmt. +# When workload limits are present, vm_vcpus = requested_vcpus + overhead_vcpus +# (rounded up at boot). If a workload limit is set on another dimension (for example +# memory) but CPU is missing, requested_vcpus is treated as 0 and vm_vcpus equals +# overhead_vcpus (minimum 1 at boot). 
When no workload limits are present, +# default_vcpus is used instead. +# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_vcpus = @DEFOVERHEADVCPUS_QEMU@ + # Default maximum number of vCPUs per SB/VM: # unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number # of vCPUs supported by KVM if that number is exceeded @@ -128,6 +137,14 @@ reclaim_guest_freed_memory = false # Default memory size in MiB for SB/VM. # If unspecified then it will be set @DEFMEMSZ@ MiB. default_memory = @DEFMEMSZ@ + +# Guest-side memory overhead budget (MiB) used with static_sandbox_resource_mgmt. +# When workload limits are present, vm_memory = requested_memory + overhead_memory. +# If a workload limit is set on another dimension (for example CPU) but memory is +# missing, requested_memory is treated as 0, so vm_memory equals overhead_memory. +# When no workload limits are present, default_memory is used instead. +# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_memory = @DEFOVERHEADMEMSZ_QEMU@ # # Default memory slots per SB/VM. # If unspecified then it will be set @DEFMEMSLOTS@. diff --git a/src/runtime-rs/config/configuration-qemu-se-runtime-rs.toml.in b/src/runtime-rs/config/configuration-qemu-se-runtime-rs.toml.in index 7b3d1649e4..21b8d27560 100644 --- a/src/runtime-rs/config/configuration-qemu-se-runtime-rs.toml.in +++ b/src/runtime-rs/config/configuration-qemu-se-runtime-rs.toml.in @@ -95,6 +95,15 @@ cpu_features = "@CPUFEATURES@" # > number of physical cores --> will be set to the actual number of physical cores default_vcpus = @DEFVCPUS_QEMU@ +# Guest-side vCPU overhead budget (fractional) used with static_sandbox_resource_mgmt. +# When workload limits are present, vm_vcpus = requested_vcpus + overhead_vcpus +# (rounded up at boot). 
If a workload limit is set on another dimension (for example +# memory) but CPU is missing, requested_vcpus is treated as 0 and vm_vcpus equals +# overhead_vcpus (minimum 1 at boot). When no workload limits are present, +# default_vcpus is used instead. +# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_vcpus = @DEFOVERHEADVCPUS_TEE@ + # Default maximum number of vCPUs per SB/VM: # unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number # of vCPUs supported by KVM if that number is exceeded @@ -127,6 +136,14 @@ default_bridges = @DEFBRIDGES@ # Default memory size in MiB for SB/VM. # If unspecified then it will be set @DEFMEMSZ@ MiB. default_memory = @DEFMEMSZ@ + +# Guest-side memory overhead budget (MiB) used with static_sandbox_resource_mgmt. +# When workload limits are present, vm_memory = requested_memory + overhead_memory. +# If a workload limit is set on another dimension (for example CPU) but memory is +# missing, requested_memory is treated as 0, so vm_memory equals overhead_memory. +# When no workload limits are present, default_memory is used instead. +# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_memory = @DEFOVERHEADMEMSZ_TEE@ # # Default memory slots per SB/VM. # If unspecified then it will be set @DEFMEMSLOTS@. diff --git a/src/runtime-rs/config/configuration-qemu-snp-runtime-rs.toml.in b/src/runtime-rs/config/configuration-qemu-snp-runtime-rs.toml.in index de39c6a424..2770269b00 100644 --- a/src/runtime-rs/config/configuration-qemu-snp-runtime-rs.toml.in +++ b/src/runtime-rs/config/configuration-qemu-snp-runtime-rs.toml.in @@ -133,6 +133,15 @@ cpu_features = "@CPUFEATURES@" # > number of physical cores --> will be set to the actual number of physical cores default_vcpus = @DEFVCPUS_QEMU@ +# Guest-side vCPU overhead budget (fractional) used with static_sandbox_resource_mgmt. 
+# When workload limits are present, vm_vcpus = requested_vcpus + overhead_vcpus +# (rounded up at boot). If a workload limit is set on another dimension (for example +# memory) but CPU is missing, requested_vcpus is treated as 0 and vm_vcpus equals +# overhead_vcpus (minimum 1 at boot). When no workload limits are present, +# default_vcpus is used instead. +# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_vcpus = @DEFOVERHEADVCPUS_TEE@ + # Default maximum number of vCPUs per SB/VM: # unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number # of vCPUs supported by KVM if that number is exceeded @@ -166,6 +175,14 @@ default_bridges = @DEFBRIDGES@ # If unspecified then it will be set @DEFMEMSZ@ MiB. default_memory = @DEFMEMSZ@ +# Guest-side memory overhead budget (MiB) used with static_sandbox_resource_mgmt. +# When workload limits are present, vm_memory = requested_memory + overhead_memory. +# If a workload limit is set on another dimension (for example CPU) but memory is +# missing, requested_memory is treated as 0, so vm_memory equals overhead_memory. +# When no workload limits are present, default_memory is used instead. +# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_memory = @DEFOVERHEADMEMSZ_TEE@ + # # Default memory slots per SB/VM. # If unspecified then it will be set @DEFMEMSLOTS@. diff --git a/src/runtime-rs/config/configuration-qemu-tdx-runtime-rs.toml.in b/src/runtime-rs/config/configuration-qemu-tdx-runtime-rs.toml.in index ccf5b4da37..d09e7583c4 100644 --- a/src/runtime-rs/config/configuration-qemu-tdx-runtime-rs.toml.in +++ b/src/runtime-rs/config/configuration-qemu-tdx-runtime-rs.toml.in @@ -111,6 +111,15 @@ cpu_features = "@CPUFEATURES@" # > number of physical cores --> will be set to the actual number of physical cores default_vcpus = 1 +# Guest-side vCPU overhead budget (fractional) used with static_sandbox_resource_mgmt. 
+# When workload limits are present, vm_vcpus = requested_vcpus + overhead_vcpus +# (rounded up at boot). If a workload limit is set on another dimension (for example +# memory) but CPU is missing, requested_vcpus is treated as 0 and vm_vcpus equals +# overhead_vcpus (minimum 1 at boot). When no workload limits are present, +# default_vcpus is used instead. +# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_vcpus = @DEFOVERHEADVCPUS_TEE@ + # Default maximum number of vCPUs per SB/VM: # unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number # of vCPUs supported by KVM if that number is exceeded @@ -143,6 +152,14 @@ default_bridges = @DEFBRIDGES@ # Default memory size in MiB for SB/VM. # If unspecified then it will be set @DEFMEMSZ@ MiB. default_memory = @DEFMEMSZ@ + +# Guest-side memory overhead budget (MiB) used with static_sandbox_resource_mgmt. +# When workload limits are present, vm_memory = requested_memory + overhead_memory. +# If a workload limit is set on another dimension (for example CPU) but memory is +# missing, requested_memory is treated as 0, so vm_memory equals overhead_memory. +# When no workload limits are present, default_memory is used instead. +# See docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md +overhead_memory = @DEFOVERHEADMEMSZ_TEE@ # # Default memory slots per SB/VM. # If unspecified then it will be set @DEFMEMSLOTS@. 
diff --git a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs index b886d9faae..934207d37c 100644 --- a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs +++ b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs @@ -6,7 +6,7 @@ use std::{collections::HashMap, convert::TryFrom}; -use anyhow::{Context, Result}; +use anyhow::{ensure, Context, Result}; use kata_types::{ annotations::Annotation, config::TomlConfig, container::ContainerType, cpu::LinuxContainerCpuResources, k8s::container_type, @@ -159,28 +159,36 @@ impl InitialSizeManager { .get_mut(hypervisor_name) .context("failed to get hypervisor config")?; - if self.resource.vcpu > 0.0 { - info!(sl!(), "resource with vcpu {}", self.resource.vcpu); - if config.runtime.static_sandbox_resource_mgmt { - hv.cpu_info.default_vcpus += self.resource.vcpu; - } - } - - if config.runtime.static_sandbox_resource_mgmt { - let new_vcpus_ceil = hv.cpu_info.default_vcpus.ceil() as u32; - hv.cpu_info.default_maxvcpus = new_vcpus_ceil; - } - self.resource.orig_toml_default_mem = hv.memory_info.default_memory; - if self.resource.mem_mb > 0 { - info!(sl!(), "resource with memory {}", self.resource.mem_mb); - if config.runtime.static_sandbox_resource_mgmt { - hv.memory_info.default_memory += self.resource.mem_mb; - if hv.memory_info.default_maxmemory < hv.memory_info.default_memory { - hv.memory_info.default_maxmemory = hv.memory_info.default_memory; - } - } + + // Non-static mode keeps configured defaults unchanged. 
+ if !config.runtime.static_sandbox_resource_mgmt { + validate_non_zero_sandbox_memory(hypervisor_name, hv.memory_info.default_memory)?; + return Ok(()); } + + if self.resource.vcpu > 0.0 || self.resource.mem_mb > 0 { + if self.resource.vcpu > 0.0 { + info!(sl!(), "resource with vcpu {}", self.resource.vcpu); + } + if self.resource.mem_mb > 0 { + info!(sl!(), "resource with memory {}", self.resource.mem_mb); + } + + hv.cpu_info.default_vcpus = + (hv.cpu_info.overhead_vcpus + self.resource.vcpu).max(1.0); + + hv.memory_info.default_memory = + hv.memory_info.overhead_memory + self.resource.mem_mb; + hv.memory_info.default_maxmemory = hv + .memory_info + .default_maxmemory + .max(hv.memory_info.default_memory); + } + + hv.cpu_info.default_maxvcpus = hv.cpu_info.default_vcpus.ceil() as u32; + + validate_non_zero_sandbox_memory(hypervisor_name, hv.memory_info.default_memory)?; Ok(()) } @@ -189,6 +197,15 @@ impl InitialSizeManager { } } +fn validate_non_zero_sandbox_memory(hypervisor_name: &str, memory_mib: u32) -> Result<()> { + ensure!( + memory_mib > 0, + "computed sandbox memory is 0 MiB for hypervisor '{}'; set a non-zero memory limit or configure non-zero default_memory/overhead_memory", + hypervisor_name + ); + Ok(()) +} + fn get_nr_vcpu(resource: &LinuxContainerCpuResources) -> f32 { if let Some(v) = resource.get_vcpus() { v as f32 @@ -227,6 +244,7 @@ mod tests { use super::*; use kata_types::annotations::cri_containerd; use oci_spec::runtime::{LinuxBuilder, LinuxMemory, LinuxMemoryBuilder, LinuxResourcesBuilder}; + use rstest::rstest; use std::collections::HashMap; #[derive(Clone)] struct InputData { @@ -398,8 +416,10 @@ mod tests { fn make_config( default_vcpus: f32, + overhead_vcpus: f32, default_maxvcpus: u32, default_memory: u32, + overhead_memory: u32, default_maxmemory: u32, static_sandbox_resource_mgmt: bool, ) -> TomlConfig { @@ -411,8 +431,10 @@ mod tests { .insert("qemu".to_owned(), Hypervisor::default()); 
config.hypervisor.entry("qemu".to_owned()).and_modify(|hv| { hv.cpu_info.default_vcpus = default_vcpus; + hv.cpu_info.overhead_vcpus = overhead_vcpus; hv.cpu_info.default_maxvcpus = default_maxvcpus; hv.memory_info.default_memory = default_memory; + hv.memory_info.overhead_memory = overhead_memory; hv.memory_info.default_maxmemory = default_maxmemory; }); config.runtime.hypervisor_name = "qemu".to_owned(); @@ -422,7 +444,7 @@ mod tests { #[test] fn test_setup_config_static_applies_vcpu_and_memory() { - let mut config = make_config(1.0, 4, 256, 4096, true); + let mut config = make_config(1.0, 0.5, 4, 256, 128, 4096, true); let mut mgr = InitialSizeManager { resource: InitialSize { vcpu: 1.2, @@ -433,13 +455,13 @@ mod tests { mgr.setup_config(&mut config).unwrap(); let hv = config.hypervisor.get("qemu").unwrap(); - assert_eq!(hv.cpu_info.default_vcpus, 2.2); - assert_eq!(hv.memory_info.default_memory, 768); + assert_eq!(hv.cpu_info.default_vcpus, 1.7); + assert_eq!(hv.memory_info.default_memory, 640); } #[test] fn test_setup_config_non_static_does_not_apply() { - let mut config = make_config(1.0, 4, 256, 4096, false); + let mut config = make_config(1.0, 0.5, 4, 256, 128, 4096, false); let mut mgr = InitialSizeManager { resource: InitialSize { vcpu: 1.2, @@ -456,7 +478,7 @@ mod tests { #[test] fn test_setup_config_clamps_maxvcpus() { - let mut config = make_config(1.0, 2, 256, 4096, true); + let mut config = make_config(1.0, 1.0, 2, 256, 128, 4096, true); let mut mgr = InitialSizeManager { resource: InitialSize { vcpu: 2.5, @@ -473,7 +495,7 @@ mod tests { #[test] fn test_setup_config_static_reduces_maxvcpus_to_static_total() { - let mut config = make_config(1.0, 8, 256, 4096, true); + let mut config = make_config(1.0, 0.5, 8, 256, 128, 4096, true); let mut mgr = InitialSizeManager { resource: InitialSize { vcpu: 1.2, @@ -484,13 +506,13 @@ mod tests { mgr.setup_config(&mut config).unwrap(); let hv = config.hypervisor.get("qemu").unwrap(); - 
assert_eq!(hv.cpu_info.default_vcpus, 2.2); - assert_eq!(hv.cpu_info.default_maxvcpus, 3); + assert_eq!(hv.cpu_info.default_vcpus, 1.7); + assert_eq!(hv.cpu_info.default_maxvcpus, 2); } #[test] fn test_setup_config_clamps_maxmemory() { - let mut config = make_config(1.0, 4, 256, 300, true); + let mut config = make_config(1.0, 0.5, 4, 256, 128, 300, true); let mut mgr = InitialSizeManager { resource: InitialSize { vcpu: 0.0, @@ -501,13 +523,13 @@ mod tests { mgr.setup_config(&mut config).unwrap(); let hv = config.hypervisor.get("qemu").unwrap(); - assert_eq!(hv.memory_info.default_memory, 768); - assert_eq!(hv.memory_info.default_maxmemory, 768); + assert_eq!(hv.memory_info.default_memory, 640); + assert_eq!(hv.memory_info.default_maxmemory, 640); } #[test] fn test_setup_config_preserves_orig_toml_default_mem() { - let mut config = make_config(1.0, 4, 256, 4096, true); + let mut config = make_config(1.0, 0.5, 4, 256, 128, 4096, true); let mut mgr = InitialSizeManager { resource: InitialSize { vcpu: 0.0, @@ -551,4 +573,77 @@ mod tests { assert!((mgr.resource.vcpu - 1.2).abs() < VCPU_TOLERANCE); assert_eq!(mgr.resource.mem_mb, 256); } + + #[test] + fn test_setup_config_static_without_limits_uses_toml_defaults() { + let mut config = make_config(2.0, 0.5, 8, 512, 128, 4096, true); + let mut mgr = InitialSizeManager { + resource: InitialSize { + vcpu: 0.0, + mem_mb: 0, + orig_toml_default_mem: 0, + }, + }; + + mgr.setup_config(&mut config).unwrap(); + let hv = config.hypervisor.get("qemu").unwrap(); + assert_eq!(hv.cpu_info.default_vcpus, 2.0); + assert_eq!(hv.memory_info.default_memory, 512); + } + + #[test] + fn test_setup_config_static_errors_on_zero_memory() { + let mut config = make_config(1.0, 0.5, 8, 1024, 0, 4096, true); + let mut mgr = InitialSizeManager { + resource: InitialSize { + vcpu: 1.0, + mem_mb: 0, + orig_toml_default_mem: 0, + }, + }; + + let err = mgr.setup_config(&mut config).unwrap_err().to_string(); + assert!(err.contains("computed sandbox memory 
is 0 MiB")); + assert!(err.contains("default_memory/overhead_memory")); + } + + #[rstest] + #[case::both_limits(3.0, 0.75, 1024, 256, 1.25, 1024, 2.0, 1280)] + #[case::cpu_only_limit(3.0, 0.5, 1024, 128, 1.5, 0, 2.0, 128)] + #[case::memory_only_limit(3.0, 0.5, 1024, 128, 0.0, 512, 1.0, 640)] + #[case::both_limits_zero_overhead(3.0, 0.0, 1024, 0, 1.25, 1024, 1.25, 1024)] + #[case::memory_only_zero_overhead(3.0, 0.0, 1024, 0, 0.0, 512, 1.0, 512)] + fn test_setup_config_static_requested_vs_defaults( + #[case] default_vcpus: f32, + #[case] overhead_vcpus: f32, + #[case] default_memory: u32, + #[case] overhead_memory: u32, + #[case] requested_vcpus: f32, + #[case] requested_mem_mb: u32, + #[case] expected_default_vcpus: f32, + #[case] expected_default_memory: u32, + ) { + let mut config = make_config( + default_vcpus, + overhead_vcpus, + 8, + default_memory, + overhead_memory, + 4096, + true, + ); + let mut mgr = InitialSizeManager { + resource: InitialSize { + vcpu: requested_vcpus, + mem_mb: requested_mem_mb, + orig_toml_default_mem: 0, + }, + }; + + mgr.setup_config(&mut config).unwrap(); + let hv = config.hypervisor.get("qemu").unwrap(); + + assert_eq!(hv.cpu_info.default_vcpus, expected_default_vcpus); + assert_eq!(hv.memory_info.default_memory, expected_default_memory); + } } diff --git a/tests/integration/kubernetes/runtimeclass_workloads/pod-guest-pull-in-trusted-storage.yaml.in b/tests/integration/kubernetes/runtimeclass_workloads/pod-guest-pull-in-trusted-storage.yaml.in index 8bc921b17f..a1fcb53393 100644 --- a/tests/integration/kubernetes/runtimeclass_workloads/pod-guest-pull-in-trusted-storage.yaml.in +++ b/tests/integration/kubernetes/runtimeclass_workloads/pod-guest-pull-in-trusted-storage.yaml.in @@ -31,6 +31,7 @@ spec: resources: limits: cpu: "2" + memory: "2Gi" volumeDevices: - devicePath: /dev/trusted_store name: trusted-storage From 346a3be9ad3aa491878e280da9b9bb756f74147f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 
24 Jun 2026 18:52:25 +0200 Subject: [PATCH 2/4] docs: document runtime-rs sandbox overhead sizing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a how-to describing how runtime-rs sizes static sandboxes from overhead plus requested CPU/memory, including that fractional vCPU results are rounded up for VMM-visible vCPU counts, and link it from the how-to README. Signed-off-by: Fabiano Fidêncio Assisted-by: Cursor --- docs/how-to/README.md | 1 + ...how-to-size-sandbox-overhead-runtime-rs.md | 367 ++++++++++++++++++ 2 files changed, 368 insertions(+) create mode 100644 docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md diff --git a/docs/how-to/README.md b/docs/how-to/README.md index 134dd35e67..248081b583 100644 --- a/docs/how-to/README.md +++ b/docs/how-to/README.md @@ -51,6 +51,7 @@ - [How to use mem-agent to decrease the memory usage of Kata container](how-to-use-memory-agent.md) - [How to use seccomp with runtime-rs](how-to-use-seccomp-with-runtime-rs.md) - [How to use passthroughfd-IO with runtime-rs and Dragonball](how-to-use-passthroughfd-io-within-runtime-rs.md) +- [How to size sandbox overhead in runtime-rs](how-to-size-sandbox-overhead-runtime-rs.md) - [How to use EROFS snapshotter with Kata Containers](how-to-use-erofs-snapshotter-with-kata.md) - [How to use NUMA with Kata Containers](how-to-use-numa-with-kata.md) diff --git a/docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md b/docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md new file mode 100644 index 0000000000..bc73c905bb --- /dev/null +++ b/docs/how-to/how-to-size-sandbox-overhead-runtime-rs.md @@ -0,0 +1,367 @@ +# How to size `overhead_*` for runtime-rs sandbox sizing + +This document explains how `overhead_vcpus` and `overhead_memory` are expected +to be used in runtime-rs. + +> [!WARNING] +> For runtime-rs, using `static_sandbox_resource_mgmt` is the recommended mode. +> Disabling it is not recommended for production sandbox sizing. 
+ +> [!IMPORTANT] +> For correct and predictable Kata sandbox sizing in Kubernetes, workload CPU +> and memory limits **must** be set. Without limits, runtime-rs falls back to +> `default_vcpus` and `default_memory`, which is a compatibility fallback and +> not the intended production sizing model. + +## Why these fields exist + +In runtime-rs, static sandbox sizing is enabled by default. Kata must pick VM resources before +starting the workload. In Kubernetes, pod limits represent workload resources, +but the VM also needs extra resources for guest/kernel/runtime overhead. + +`overhead_vcpus` and `overhead_memory` represent that extra budget. + +## Sizing model + +With runtime-rs static sandbox sizing, Kata uses: + +- If workload limits are present: + - `vm_vcpus = requested_vcpus + overhead_vcpus` + - `vm_memory = requested_memory + overhead_memory` +- If workload limits are not present: + - `vm_vcpus = default_vcpus` + - `vm_memory = default_memory` + +In other words, `default_*` is the fallback for "no limits", while +`overhead_*` is the additive budget for "limits are set". +For CPU, runtime-rs sums workload and overhead values, and if the computed +result is fractional it is rounded up to the next integer (`ceil`), since VMMs +expose integer vCPU counts. A minimum of `1` vCPU is enforced for the +limit-driven path, including the `0 + 0` edge case. + +## `podFixed` as a sizing function + +Treat `RuntimeClass.overhead.podFixed` as a function of expected VM size: +larger VMs usually need larger overhead budgets, for both static and dynamic +allocation environments. + +Operationally, this usually leads to one of two models: + +- Single runtime class: one conservative `podFixed` value that works across all + expected workload sizes. +- Multiple runtime classes (for example S/M/L/XL): each class has a tailored + `podFixed` and runtime profile for tighter node-level accounting. 
+ +Kata cannot ship a single correct value for this function, because it depends on +a large number of deployment-specific factors, including: + +- the hypervisor in use (each has a different memory/CPU footprint), +- the file-sharing mechanism (`virtio-fs` vs. others), +- the presence of CoCo guest components, +- the VM image in use (our released images, or downstream-modified ones), +- hardware features such as GPUs (or anything else requiring large DMA buffers). + +These factors, the inherent brittleness of overhead measurements, and how much +headroom a cluster owner is willing to "waste" to guarantee stable operation, +all feed into the value. Downstream operators should therefore measure and tune +this function for their own deployments. + +## Recommended operator/admin workflow + +The Kubernetes documentation defines `RuntimeClass.overhead.podFixed` as: + +> podFixed represents the fixed resource overhead associated with running a pod. + +For Kata, that overhead has two parts: the *guest-side* overhead (the extra +CPU/memory the VM needs on top of the workload) and the *host-side* overhead +(the runtime, hypervisor, and helper processes running on the node). `podFixed` +must account for **both**, while Kata `overhead_*` accounts for the guest-side +part only. + +A practical workflow is therefore: + +1. Estimate (or measure) the guest-side overhead. Kata profiles ship with a + starting value, but you should refine it for your environment. +2. Set Kata `overhead_*` per runtime profile to that guest-side value. +3. Estimate (or measure) the host-side overhead. +4. Set `RuntimeClass.overhead.podFixed` to the sum of the guest-side and + host-side overhead. This naturally keeps `podFixed` higher than `overhead_*`. +5. Validate with representative workloads (small/medium/large). 
As rough + starting points for the measurements: + - guest-side overhead: subtract a container's used memory (for example, + `free` inside the container) from the nominal VM size; + - host-side overhead: subtract the nominal VM size from the pod's host + cgroup usage, for example + `cat /sys/fs/cgroup/kubepods.slice/**/memory.current`. + +For production-oriented Kata deployments, assume users provide workload limits. +The no-limits path exists as a compatibility fallback, not as the primary sizing +model. + +Kata profiles initialize `overhead_*` to values derived from Pod Overhead (for +example, 80% for CPU and memory), but this is only a policy input and should be +tuned by downstream operators and admins. + +## Who sets what: admin vs user + +In many environments, the "admin" and the "user" are different personas. In +smaller environments they may be the same person or team. + +- Admin/operator responsibilities: + - Set runtime defaults (`default_*`) and overhead values (`overhead_*`). + - Set and maintain `RuntimeClass.overhead.podFixed`. + - Provide runtime classes that users can select per workload profile. + - Ensure those policies are aligned for each runtime profile. + - Validate behavior with representative workloads and adjust if needed. + +- User/application responsibilities: + - Set pod/container CPU and memory limits for workload intent. + - Use the runtime class provided by admins for the workload profile. + - Avoid relying on default sizing when deterministic resources are required. + +## Example 1: limits set on both CPU and memory + +**Scenario intent:** show the standard production case with explicit workload limits. + +**Consequence:** users get predictable sizing plus admin-defined overhead budget. + +**`RuntimeClass.overhead.podFixed` relationship:** `podFixed` should be higher than +`overhead_*`, since `podFixed` must include host-side runtime components. 
+ +Given the runtime profile: + +- `default_vcpus = 2` +- `default_memory = 1024` +- `overhead_vcpus = 0.5` +- `overhead_memory = 128` + +And the matching `RuntimeClass.overhead.podFixed`: + +- `cpu = 600m` (`0.6`) +- `memory = 160Mi` + +Workload limits: + +- CPU quota/period equivalent to `1.5 vCPUs` +- memory limit `600 MiB` + +Kata VM sizing (guest side): + +- `vm_vcpus = 1.5 + 0.5 = 2.0` +- `vm_memory = 600 + 128 = 728 MiB` + +Kubernetes accounting for the whole pod (`sum(limits) + podFixed`): + +- `pod_cpu = 1.5 + 0.6 = 2.1` +- `pod_memory = 600 + 160 = 760 MiB` + +Note that `podFixed` (`160Mi`) is higher than `overhead_memory` (`128`), since it +must also cover the host-side runtime components that live outside the VM. + +## Example 2: partial limits (split by dimension) + +**Scenario intent:** show what happens when only one limit is provided. + +**Consequence:** once any limit exists, overhead logic applies to both dimensions. + +**`RuntimeClass.overhead.podFixed` relationship:** same rule as Example 1; +`podFixed` should remain higher than `overhead_*`. + +Given: + +- `default_vcpus = 2` +- `default_memory = 1024` +- `overhead_vcpus = 0.5` +- `overhead_memory = 128` + +### 2A. Memory limit only + +Workload sets: + +- memory limit = `512 MiB` +- no CPU limit + +Result: + +- CPU is rounded up for boot: `vm_vcpus = ceil(0 + 0.5) = 1` +- Memory uses overhead formula: `vm_memory = 512 + 128 = 640 MiB` + +### 2B. CPU limit only + +Workload sets: + +- CPU quota/period equivalent to `1.5 vCPUs` +- no memory limit + +Result: + +- CPU uses overhead formula: `vm_vcpus = 1.5 + 0.5 = 2.0` +- Memory still uses overhead baseline: `vm_memory = 0 + 128 = 128 MiB` + +This is the reason workload memory limits **must** be set (see the note at the +top of this document): with a CPU limit but no memory limit, the VM is sized +with `overhead_memory` only, which is almost certainly too small to run a real +workload. 
It is the explicit overhead baseline, not a default fallback to +`default_memory`. As a safety net, if the computed sandbox memory would be `0` +(for example, a CPU-only workload with `overhead_memory = 0`), runtime-rs fails +early with an actionable error instead of booting an unusable VM. + +This mirrors runtime-rs behavior: once limits are present for a sandbox, overhead +is applied on both dimensions, and any missing dimension uses `0 + overhead_*` +(with fractional CPU results rounded up). + +## Example 3: `overhead_* = 0` (zero-overhead model) + +**Scenario intent:** user-driven exact workload sizing by setting `overhead_* = 0`. + +**Consequence:** users get exactly requested VM sizes when limits are set, but they +are responsible for accounting for workload-related overhead in those limits. + +**`RuntimeClass.overhead.podFixed` relationship:** `podFixed` is still required to +cover host-side resource usage (not guest-side), and should be tuned +independently. + +Some deployments may choose to set: + +- `overhead_vcpus = 0` +- `overhead_memory = 0` + +With: + +- `default_vcpus = 2` +- `default_memory = 1024` + +### 3A. Limits set on both dimensions + +Workload limits: + +- CPU = `1.5 vCPUs` +- memory = `600 MiB` + +Result: + +- `vm_vcpus = 1.5 + 0 = 1.5` +- `vm_memory = 600 + 0 = 600 MiB` + +### 3B. No limits + +Result: + +- `vm_vcpus = default_vcpus = 2` +- `vm_memory = default_memory = 1024 MiB` + +This keeps defaults as fallback only, while limit-driven sizing becomes purely +workload-based. + +## Example 4: no limits (fallback path) + +**Scenario intent:** show compatibility fallback behavior when users do not +provide limits. + +**Consequence:** VM sizing comes from admin-defined defaults. This is acceptable +for basic workloads and testing, **but not the intended production sizing +posture**.
+ +**`RuntimeClass.overhead.podFixed` relationship:** in this case, `podFixed` +should be higher than the effective default baseline (`default_*`) to account +for host-side components as well. Kubernetes does not know Kata `default_*` +values; if `podFixed` is too low, host-side usage can exceed the pod budget and +the pod may be killed. + +Given: + +- `default_vcpus = 2` +- `default_memory = 1024` (MiB) +- `overhead_vcpus = 0.5` +- `overhead_memory = 128` (MiB) + +Pod/container limits are not set. + +Result: + +- VM boots with `2 vCPUs` and `1024 MiB`. +- `overhead_*` is not used in this case. + +## Runtime profile snippet + +```toml +[hypervisor.qemu] +default_vcpus = 2 +default_memory = 1024 +overhead_vcpus = 0.5 +overhead_memory = 128 +``` + +## Helm examples + +With kata-deploy Helm, the recommended pattern is to set `overhead_*` in a +runtime `dropIn` and set the corresponding `RuntimeClass.overhead.podFixed` +to a higher value in the same values file. + +For runtime-rs, `static_sandbox_resource_mgmt` is already enabled by default, so +these examples focus on `overhead_*` and related policy values. + +### Example A: custom runtime profile + +```yaml +customRuntimes: + enabled: true + runtimes: + my-qemu-runtime-rs: + baseConfig: "qemu" + dropIn: | + [hypervisor.qemu] + default_vcpus = 2 + default_memory = 1024 + overhead_vcpus = 0.5 + overhead_memory = 128 + runtimeClass: | + kind: RuntimeClass + apiVersion: node.k8s.io/v1 + metadata: + name: kata-my-qemu-runtime-rs + labels: + app.kubernetes.io/managed-by: kata-deploy + handler: kata-my-qemu-runtime-rs + overhead: + podFixed: + cpu: "600m" + memory: "160Mi" + scheduling: + nodeSelector: + katacontainers.io/kata-runtime: "true" +``` + +In this example: + +- Kata overhead used for VM sizing is `0.5 vCPU` and `128Mi`. +- Kubernetes scheduler/accounting overhead is `600m` and `160Mi`. +- The gap (`podFixed` > `overhead_*`) leaves extra budget for components outside + the guest workload cgroup model. 
+ +### Example B: override a default shim with `shims.<shim-name>.dropIn` + +If you do not need a new runtime class, you can patch an existing runtime-rs +shim directly: + +```yaml +shims: + qemu: + enabled: true + dropIn: | + [hypervisor.qemu] + overhead_vcpus = 0.5 + overhead_memory = 128 +``` + +This updates Kata sizing behavior for that shim. If you also control the +runtime class YAML externally, keep `podFixed` greater than `overhead_*` under +the same sizing policy. + +## Kubernetes alignment notes + +- `RuntimeClass.overhead.podFixed` and Kata `overhead_*` should be managed by + the same operator/admin policy, with `podFixed` set higher than `overhead_*`. +- Mismatched values can produce surprising behavior under pressure. +- Upstream runtime-rs does not auto-fetch RuntimeClass overhead from Kubernetes; + the configured `overhead_*` values are the source used for VM sizing. From b2f7314d314bebc899de9194064440c379b0c198 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 24 Jun 2026 18:52:25 +0200 Subject: [PATCH 3/4] tests: harden sandbox sizing manifests for k8s cpu workloads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Route runtime-rs tests to dedicated manifests/templates and ensure the CPU allocation workloads always carry explicit memory limits, avoiding Dragonball sandbox startup failures from InvalidMemorySize(0).
Signed-off-by: Fabiano Fidêncio Assisted-by: Cursor --- .../kubernetes/k8s-sandbox-vcpus-allocation.bats | 10 ++++++++-- .../runtimeclass_workloads/pod-cpu-defaults.yaml | 5 +++++ .../kubernetes/runtimeclass_workloads/pod-cpu.yaml | 2 ++ .../runtimeclass_workloads/pod-number-cpu.yaml | 2 ++ .../pod-sandbox-vcpus-allocation.yaml | 5 +++++ tests/integration/kubernetes/tests_common.sh | 2 +- 6 files changed, 23 insertions(+), 3 deletions(-) diff --git a/tests/integration/kubernetes/k8s-sandbox-vcpus-allocation.bats b/tests/integration/kubernetes/k8s-sandbox-vcpus-allocation.bats index e24f88b4a8..47118ee1c1 100644 --- a/tests/integration/kubernetes/k8s-sandbox-vcpus-allocation.bats +++ b/tests/integration/kubernetes/k8s-sandbox-vcpus-allocation.bats @@ -31,8 +31,14 @@ setup() { # Create the pods kubectl create -f "${yaml_file}" - # Wait for completion - kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=$timeout pod --all + # Wait for each test container to terminate successfully. Using container + # termination state is more robust than pod phase checks, which can lag. 
+ for pod in "${pods[@]}"; do + kubectl wait \ + --for=jsonpath='{.status.containerStatuses[0].state.terminated.reason}'=Completed \ + --timeout=$timeout \ + "pod/${pod}" + done # Check the pods for i in {0..2}; do diff --git a/tests/integration/kubernetes/runtimeclass_workloads/pod-cpu-defaults.yaml b/tests/integration/kubernetes/runtimeclass_workloads/pod-cpu-defaults.yaml index 7973ea1bbc..ee227af14c 100644 --- a/tests/integration/kubernetes/runtimeclass_workloads/pod-cpu-defaults.yaml +++ b/tests/integration/kubernetes/runtimeclass_workloads/pod-cpu-defaults.yaml @@ -13,3 +13,8 @@ spec: - name: default-cpu-demo-ctr image: quay.io/prometheus/busybox:latest command: ["tail", "-f", "/dev/null"] + resources: + limits: + memory: "128Mi" + requests: + memory: "64Mi" diff --git a/tests/integration/kubernetes/runtimeclass_workloads/pod-cpu.yaml b/tests/integration/kubernetes/runtimeclass_workloads/pod-cpu.yaml index 976bf44236..0ca927ecd2 100644 --- a/tests/integration/kubernetes/runtimeclass_workloads/pod-cpu.yaml +++ b/tests/integration/kubernetes/runtimeclass_workloads/pod-cpu.yaml @@ -18,5 +18,7 @@ spec: resources: limits: cpu: "1" + memory: "128Mi" requests: cpu: "500m" + memory: "64Mi" diff --git a/tests/integration/kubernetes/runtimeclass_workloads/pod-number-cpu.yaml b/tests/integration/kubernetes/runtimeclass_workloads/pod-number-cpu.yaml index acbc89fce7..e17bd61ce9 100644 --- a/tests/integration/kubernetes/runtimeclass_workloads/pod-number-cpu.yaml +++ b/tests/integration/kubernetes/runtimeclass_workloads/pod-number-cpu.yaml @@ -16,6 +16,7 @@ spec: resources: limits: cpu: "500m" + memory: "128Mi" - name: c2 image: quay.io/prometheus/busybox:latest command: @@ -24,3 +25,4 @@ spec: resources: limits: cpu: "500m" + memory: "128Mi" diff --git a/tests/integration/kubernetes/runtimeclass_workloads/pod-sandbox-vcpus-allocation.yaml b/tests/integration/kubernetes/runtimeclass_workloads/pod-sandbox-vcpus-allocation.yaml index d5b1aaff6e..1ceb19973d 100644 --- 
a/tests/integration/kubernetes/runtimeclass_workloads/pod-sandbox-vcpus-allocation.yaml +++ b/tests/integration/kubernetes/runtimeclass_workloads/pod-sandbox-vcpus-allocation.yaml @@ -15,6 +15,9 @@ spec: containers: - name: vcpus-less-than-one-with-no-limits image: quay.io/prometheus/busybox:latest + resources: + limits: + memory: "128Mi" command: ['nproc', '--all'] restartPolicy: Never --- @@ -32,6 +35,7 @@ spec: resources: limits: cpu: "0.25" + memory: "128Mi" command: ['nproc', '--all'] restartPolicy: Never --- @@ -49,5 +53,6 @@ spec: resources: limits: cpu: "1.2" + memory: "128Mi" command: ['nproc', '--all'] restartPolicy: Never diff --git a/tests/integration/kubernetes/tests_common.sh b/tests/integration/kubernetes/tests_common.sh index edc1ee09a7..f76566c07a 100644 --- a/tests/integration/kubernetes/tests_common.sh +++ b/tests/integration/kubernetes/tests_common.sh @@ -223,7 +223,7 @@ remove_kata_runtime_config_dropin_file() { } is_runtime_rs() { - [[ "${KATA_HYPERVISOR}" == *-runtime-rs ]] + [[ "${KATA_HYPERVISOR}" == *-runtime-rs ]] || [[ "${KATA_HYPERVISOR}" == "dragonball" ]] } # Copy the right combination of drop-ins from drop-in-examples/ into From a6645950847b7014a214f55d9e70d9185c3d01ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 24 Jun 2026 18:52:25 +0200 Subject: [PATCH 4/4] kata-deploy: bump qemu RuntimeClass overhead for the aarch64 VMM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With sandbox_cgroup_only the shim, QEMU and virtiofsd run inside the pod's memory cgroup, whose limit is the workload limit plus the RuntimeClass pod overhead. 
On aarch64 the VMM host footprint is much larger than on x86 (QEMU's own anon RSS is ~160Mi+ before any guest RAM, on top of the shmem-backed guest memory), so the 160Mi overhead is too small: small-memory-limit pods get their qemu-system process OOM-killed by the pod cgroup (CONSTRAINT_MEMCG), and the agent vsock never comes up (ENODEV), so the sandbox fails to start. Raise the pod overhead to 320Mi for the qemu shims that run on aarch64 (qemu, qemu-runtime-rs, qemu-coco-dev-runtime-rs). The value is applied on all architectures for simplicity; x86 is over-provisioned by ~160Mi, which is acceptable. The TEE/GPU shims already carry far larger overhead and amd64-only shims (clh*, dragonball, fc) are unaffected. Signed-off-by: Fabiano Fidêncio Assisted-by: Cursor --- .../kata-deploy/templates/runtimeclasses.yaml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/runtimeclasses.yaml b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/runtimeclasses.yaml index 0a47eea7b4..c03e582834 100644 --- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/runtimeclasses.yaml +++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/runtimeclasses.yaml @@ -1,5 +1,12 @@ {{- /* Common RuntimeClass overhead defaults keyed by shim/baseConfig. + + NOTE: the QEMU shims use 320Mi rather than 160Mi. On aarch64 the VMM host + footprint is larger (QEMU's own anon RSS is ~160Mi+ before any guest RAM), + and with sandbox_cgroup_only the VMM runs inside the pod cgroup, so a 160Mi + overhead lets the VMM get OOM-killed for small-memory-limit pods. 320Mi keeps + it comfortably within the cgroup. It is applied on all arches for simplicity + (x86 is over-provisioned by ~160Mi, which is acceptable). 
*/ -}} {{- define "kata-deploy.runtimeClassConfigs" -}} {{- toYaml (dict @@ -9,10 +16,10 @@ "clh-azure-runtime-rs" (dict "memory" "130Mi" "cpu" "250m") "dragonball" (dict "memory" "130Mi" "cpu" "250m") "fc" (dict "memory" "130Mi" "cpu" "250m") - "qemu" (dict "memory" "160Mi" "cpu" "250m") + "qemu" (dict "memory" "320Mi" "cpu" "250m") "qemu-coco-dev" (dict "memory" "160Mi" "cpu" "250m") - "qemu-coco-dev-runtime-rs" (dict "memory" "160Mi" "cpu" "250m") - "qemu-runtime-rs" (dict "memory" "160Mi" "cpu" "250m") + "qemu-coco-dev-runtime-rs" (dict "memory" "320Mi" "cpu" "250m") + "qemu-runtime-rs" (dict "memory" "320Mi" "cpu" "250m") "qemu-se-runtime-rs" (dict "memory" "1024Mi" "cpu" "1.0") "qemu-se" (dict "memory" "1024Mi" "cpu" "1.0") "qemu-snp" (dict "memory" "2048Mi" "cpu" "1.0")