From e93558e810d5bec2188727a66aa5b9759c1919e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Thu, 4 Jun 2026 22:48:28 +0200 Subject: [PATCH 1/4] runtime-rs: default static sizing-related config flags to true MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flip the existing per-hypervisor runtime-rs Makefile defaults that previously disabled these paths from false to true: `DEFSTATICRESOURCEMGMT_DB`, `DEFSTATICRESOURCEMGMT_CLH`, `DEFSTATICRESOURCEMGMT_QEMU`, `DEFSANDBOXCGROUPONLY_QEMU` and `DEFSANDBOXCGROUPONLY_REMOTE`. This aligns runtime-rs defaults with static sandbox resource management, which sizes sandbox memory up front instead of relying on memory hotplug, helping avoid architecture-specific hotplug limitations. Signed-off-by: Fabiano Fidêncio --- src/runtime-rs/Makefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/runtime-rs/Makefile b/src/runtime-rs/Makefile index 64a1be32e1..de0cc88878 100644 --- a/src/runtime-rs/Makefile +++ b/src/runtime-rs/Makefile @@ -269,7 +269,7 @@ ifneq (,$(DBCMD)) KERNEL_NAME_DB = $(call MAKE_KERNEL_NAME_DB,$(KERNELTYPE_DB)) KERNELPATH_DB = $(KERNELDIR)/$(KERNEL_NAME_DB) DEFSANDBOXCGROUPONLY_DB := true - DEFSTATICRESOURCEMGMT_DB := false + DEFSTATICRESOURCEMGMT_DB := true RUNTIMENAME := virt_container PIPESIZE := 1 DBSHAREDFS := inline-virtio-fs @@ -315,7 +315,7 @@ ifneq (,$(CLHCMD)) VMROOTFSDRIVER_CLH := virtio-blk-pci DEFSANDBOXCGROUPONLY_CLH := true - DEFSTATICRESOURCEMGMT_CLH := false + DEFSTATICRESOURCEMGMT_CLH := true endif ifneq (,$(QEMUCMD)) @@ -489,10 +489,10 @@ endif KERNELPATH_COCO = $(KERNELDIR)/$(KERNEL_NAME_COCO) # overriding options - DEFSTATICRESOURCEMGMT_QEMU := false + DEFSTATICRESOURCEMGMT_QEMU := true # qemu-specific options - DEFSANDBOXCGROUPONLY_QEMU := false + DEFSANDBOXCGROUPONLY_QEMU := true DEFENABLEVCPUSPINNING_QEMU := false ifeq ($(ARCH), s390x) VMROOTFSDRIVER_QEMU := virtio-blk-ccw @@ -562,7 +562,7 @@ ifneq (,$(REMOTE)) SYSCONFIG_PATHS += $(SYSCONFDIR_REMOTE) CONFIGS += 
$(CONFIG_REMOTE) # remote-specific options (all should be suffixed by "_REMOTE") - DEFSANDBOXCGROUPONLY_REMOTE := false + DEFSANDBOXCGROUPONLY_REMOTE := true endif ifeq ($(HYPERVISOR),$(HYPERVISOR_DB)) From ed34d7811dd0540380a7b102e85a8af10629ea50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Sun, 7 Jun 2026 13:04:04 +0200 Subject: [PATCH 2/4] runtime-rs: supplement static sizing from sandbox annotations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When static sandbox resource management is enabled, CRI CPU/memory sizing may live only in sandbox annotations and be missing from the OCI spec. Let's fill missing sizing fields from annotations before applying static VM sizing so runtime-rs follows the expected Kubernetes behavior for constrained pods. Signed-off-by: Fabiano Fidêncio Assisted-by: Codex --- .../resource/src/cpu_mem/initial_size.rs | 48 +++++++++++++++++++ src/runtime-rs/crates/runtimes/src/manager.rs | 7 +++ 2 files changed, 55 insertions(+) diff --git a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs index 4c35bc09f2..d2531b6933 100644 --- a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs +++ b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs @@ -132,6 +132,22 @@ impl InitialSizeManager { }) } + // Merge sizing values from sandbox annotations when the current source + // (typically the OCI spec) does not carry CRI sandbox sizing keys. 
+ pub fn supplement_from_annotations(&mut self, annotation: &HashMap<String, String>) -> Result<()> { + let from_annotation = + InitialSize::try_from(annotation).context("failed to construct static resource")?; + + if self.resource.vcpu == 0.0 { + self.resource.vcpu = from_annotation.vcpu; + } + if self.resource.mem_mb == 0 { + self.resource.mem_mb = from_annotation.mem_mb; + } + + Ok(()) + } + pub fn setup_config(&mut self, config: &mut TomlConfig) -> Result<()> { // update this data to the hypervisor config for later use by hypervisor let hypervisor_name = &config.runtime.hypervisor_name; @@ -500,4 +516,36 @@ mod tests { mgr.setup_config(&mut config).unwrap(); assert_eq!(mgr.get_orig_toml_default_mem(), 256); } + + #[test] + fn test_supplement_from_annotations_fills_missing_spec_sizing() { + let mut mgr = InitialSizeManager { + resource: InitialSize { + vcpu: 0.0, + mem_mb: 0, + orig_toml_default_mem: 0, + }, + }; + + let ann = HashMap::from([ + ( + cri_containerd::SANDBOX_CPU_PERIOD_KEY.to_string(), + "100000".to_string(), + ), + ( + cri_containerd::SANDBOX_CPU_QUOTA_KEY.to_string(), + "120000".to_string(), + ), + ( + cri_containerd::SANDBOX_MEM_KEY.to_string(), + (256 * MIB).to_string(), + ), + ]); + + mgr.supplement_from_annotations(&ann).unwrap(); + + const VCPU_TOLERANCE: f32 = 0.0001; + assert!((mgr.resource.vcpu - 1.2).abs() < VCPU_TOLERANCE); + assert_eq!(mgr.resource.mem_mb, 256); + } } diff --git a/src/runtime-rs/crates/runtimes/src/manager.rs b/src/runtime-rs/crates/runtimes/src/manager.rs index a4ec1b8c45..d5b59cbce1 100644 --- a/src/runtime-rs/crates/runtimes/src/manager.rs +++ b/src/runtime-rs/crates/runtimes/src/manager.rs @@ -216,6 +216,13 @@ impl RuntimeHandlerManagerInner { .context("failed to construct static resource manager")? }; + // For CRI sandboxes, sizing annotations are carried in PodSandboxConfig + // and may be absent from the OCI sandbox spec. Fill any missing sizing + // values from sandbox annotations before applying static sizing. 
+ initial_size_manager + .supplement_from_annotations(&sandbox_config.annotations) + .context("failed to supplement static resource manager from annotations")?; + initial_size_manager .setup_config(&mut config) .context("failed to setup static resource mgmt config")?; From 4d569c22b44d0bceca2a68045650a2c29fe114ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Sun, 7 Jun 2026 13:04:20 +0200 Subject: [PATCH 3/4] runtime-rs: enforce a minimum vsock reconnect window MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Low-CPU sandboxes can take longer than a few seconds to complete guest boot and start the agent. Let's clamp the reconnect timeout to a safe minimum so sandbox startup does not fail early with transient vsock ECONNRESET. Signed-off-by: Fabiano Fidêncio Assisted-by: Codex --- src/runtime-rs/crates/agent/src/sock/mod.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/runtime-rs/crates/agent/src/sock/mod.rs b/src/runtime-rs/crates/agent/src/sock/mod.rs index a82da79072..54549660fb 100644 --- a/src/runtime-rs/crates/agent/src/sock/mod.rs +++ b/src/runtime-rs/crates/agent/src/sock/mod.rs @@ -90,9 +90,14 @@ pub struct ConnectConfig { impl ConnectConfig { pub fn new(dial_timeout_ms: u64, reconnect_timeout_ms: u64) -> Self { + // With static sandbox resource sizing enabled by default, tiny CPU + // allocations can make early guest boot/agent startup exceed 3s on + // loaded nodes. Keep a reasonable lower bound to avoid premature + // sandbox teardown during agent bring-up. 
+ const MIN_RECONNECT_TIMEOUT_MS: u64 = 10_000; Self { dial_timeout_ms, - reconnect_timeout_ms, + reconnect_timeout_ms: reconnect_timeout_ms.max(MIN_RECONNECT_TIMEOUT_MS), } } } From 4dc288401e6b739c2b52222080144a07cf98c653 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Sun, 7 Jun 2026 13:38:13 +0200 Subject: [PATCH 4/4] runtime-rs: make sandbox cgroup runtime attach idempotent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dragonball nerdctl CI job can race when creating and attaching the runtime process to the sandbox cgroup, surfacing an os error 17 (AlreadyExists) during shim task creation. Let's retry add_proc once on this pre-existing cgroup condition so startup remains robust. Signed-off-by: Fabiano Fidêncio Assisted-by: Codex --- .../resource/src/cgroups/resource_inner.rs | 47 ++++++++++++++++--- 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs b/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs index b6088bf8d0..bd56240b39 100644 --- a/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs +++ b/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs @@ -5,6 +5,7 @@ // use std::collections::{HashMap, HashSet}; +use std::error::Error as _; use std::process; use std::str::FromStr; use std::time::Duration; @@ -41,6 +42,36 @@ pub(crate) struct CgroupsResourceInner { } impl CgroupsResourceInner { + fn is_already_exists_error(err: &cgroups_rs::manager::Error) -> bool { + let mut source = err.source(); + + while let Some(inner_err) = source { + if let Some(io_err) = inner_err.downcast_ref::<std::io::Error>() { + if io_err.kind() == std::io::ErrorKind::AlreadyExists { + return true; + } + } + + source = inner_err.source(); + } + + false + } + + fn add_proc_with_existing_retry( + cgroup: &mut CgroupManager, + pid: CgroupPid, + context: &str, + ) -> Result<()> { + match cgroup.add_proc(pid) { + Ok(_) => Ok(()), + 
Err(err) if Self::is_already_exists_error(&err) => cgroup + .add_proc(pid) + .with_context(|| format!("{context} (retry after pre-existing cgroup)")), + Err(err) => Err(err).context(context.to_string()), + } + } + /// Create cgroup managers according to the cgroup configuration. /// /// # Returns @@ -90,13 +121,17 @@ impl CgroupsResourceInner { // The runtime is prioritized to be added to the overhead cgroup. let pid = CgroupPid::from(process::id() as u64); if let Some(overhead_cgroup) = overhead_cgroup.as_mut() { - overhead_cgroup - .add_proc(pid) - .context("add runtime to overhead cgroup")?; + Self::add_proc_with_existing_retry( + overhead_cgroup, + pid, + "add runtime to overhead cgroup", + )?; } else { - sandbox_cgroup - .add_proc(pid) - .context("add runtime to sandbox cgroup")?; + Self::add_proc_with_existing_retry( + &mut sandbox_cgroup, + pid, + "add runtime to sandbox cgroup", + )?; } Ok(Self {