diff --git a/src/runtime-rs/Makefile b/src/runtime-rs/Makefile index 64a1be32e1..de0cc88878 100644 --- a/src/runtime-rs/Makefile +++ b/src/runtime-rs/Makefile @@ -269,7 +269,7 @@ ifneq (,$(DBCMD)) KERNEL_NAME_DB = $(call MAKE_KERNEL_NAME_DB,$(KERNELTYPE_DB)) KERNELPATH_DB = $(KERNELDIR)/$(KERNEL_NAME_DB) DEFSANDBOXCGROUPONLY_DB := true - DEFSTATICRESOURCEMGMT_DB := false + DEFSTATICRESOURCEMGMT_DB := true RUNTIMENAME := virt_container PIPESIZE := 1 DBSHAREDFS := inline-virtio-fs @@ -315,7 +315,7 @@ ifneq (,$(CLHCMD)) VMROOTFSDRIVER_CLH := virtio-blk-pci DEFSANDBOXCGROUPONLY_CLH := true - DEFSTATICRESOURCEMGMT_CLH := false + DEFSTATICRESOURCEMGMT_CLH := true endif ifneq (,$(QEMUCMD)) @@ -489,10 +489,10 @@ endif KERNELPATH_COCO = $(KERNELDIR)/$(KERNEL_NAME_COCO) # overriding options - DEFSTATICRESOURCEMGMT_QEMU := false + DEFSTATICRESOURCEMGMT_QEMU := true # qemu-specific options - DEFSANDBOXCGROUPONLY_QEMU := false + DEFSANDBOXCGROUPONLY_QEMU := true DEFENABLEVCPUSPINNING_QEMU := false ifeq ($(ARCH), s390x) VMROOTFSDRIVER_QEMU := virtio-blk-ccw @@ -562,7 +562,7 @@ ifneq (,$(REMOTE)) SYSCONFIG_PATHS += $(SYSCONFDIR_REMOTE) CONFIGS += $(CONFIG_REMOTE) # remote-specific options (all should be suffixed by "_REMOTE") - DEFSANDBOXCGROUPONLY_REMOTE := false + DEFSANDBOXCGROUPONLY_REMOTE := true endif ifeq ($(HYPERVISOR),$(HYPERVISOR_DB)) diff --git a/src/runtime-rs/crates/agent/src/sock/mod.rs b/src/runtime-rs/crates/agent/src/sock/mod.rs index a82da79072..54549660fb 100644 --- a/src/runtime-rs/crates/agent/src/sock/mod.rs +++ b/src/runtime-rs/crates/agent/src/sock/mod.rs @@ -90,9 +90,14 @@ pub struct ConnectConfig { impl ConnectConfig { pub fn new(dial_timeout_ms: u64, reconnect_timeout_ms: u64) -> Self { + // With static sandbox resource sizing enabled by default, tiny CPU + // allocations can make early guest boot/agent startup exceed 3s on + // loaded nodes. Keep a reasonable lower bound to avoid premature + // sandbox teardown during agent bring-up. 
+ const MIN_RECONNECT_TIMEOUT_MS: u64 = 10_000; Self { dial_timeout_ms, - reconnect_timeout_ms, + reconnect_timeout_ms: reconnect_timeout_ms.max(MIN_RECONNECT_TIMEOUT_MS), } } } diff --git a/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs b/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs index b6088bf8d0..bd56240b39 100644 --- a/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs +++ b/src/runtime-rs/crates/resource/src/cgroups/resource_inner.rs @@ -5,6 +5,7 @@ // use std::collections::{HashMap, HashSet}; +use std::error::Error as _; use std::process; use std::str::FromStr; use std::time::Duration; @@ -41,6 +42,36 @@ pub(crate) struct CgroupsResourceInner { } impl CgroupsResourceInner { + fn is_already_exists_error(err: &cgroups_rs::manager::Error) -> bool { + let mut source = err.source(); + + while let Some(inner_err) = source { + if let Some(io_err) = inner_err.downcast_ref::<std::io::Error>() { + if io_err.kind() == std::io::ErrorKind::AlreadyExists { + return true; + } + } + + source = inner_err.source(); + } + + false + } + + fn add_proc_with_existing_retry( + cgroup: &mut CgroupManager, + pid: CgroupPid, + context: &str, + ) -> Result<()> { + match cgroup.add_proc(pid) { + Ok(_) => Ok(()), + Err(err) if Self::is_already_exists_error(&err) => cgroup + .add_proc(pid) + .with_context(|| format!("{context} (retry after pre-existing cgroup)")), + Err(err) => Err(err).context(context.to_string()), + } + } + /// Create cgroup managers according to the cgroup configuration. /// /// # Returns @@ -90,13 +121,17 @@ impl CgroupsResourceInner { // The runtime is prioritized to be added to the overhead cgroup. 
let pid = CgroupPid::from(process::id() as u64); if let Some(overhead_cgroup) = overhead_cgroup.as_mut() { - overhead_cgroup - .add_proc(pid) - .context("add runtime to overhead cgroup")?; + Self::add_proc_with_existing_retry( + overhead_cgroup, + pid, + "add runtime to overhead cgroup", + )?; } else { - sandbox_cgroup - .add_proc(pid) - .context("add runtime to sandbox cgroup")?; + Self::add_proc_with_existing_retry( + &mut sandbox_cgroup, + pid, + "add runtime to sandbox cgroup", + )?; } Ok(Self { diff --git a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs index 4c35bc09f2..d2531b6933 100644 --- a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs +++ b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs @@ -132,6 +132,22 @@ impl InitialSizeManager { }) } + // Merge sizing values from sandbox annotations when the current source + // (typically the OCI spec) does not carry CRI sandbox sizing keys. + pub fn supplement_from_annotations(&mut self, annotation: &HashMap<String, String>) -> Result<()> { + let from_annotation = + InitialSize::try_from(annotation).context("failed to construct static resource")?; + + if self.resource.vcpu == 0.0 { + self.resource.vcpu = from_annotation.vcpu; + } + if self.resource.mem_mb == 0 { + self.resource.mem_mb = from_annotation.mem_mb; + } + + Ok(()) + } + pub fn setup_config(&mut self, config: &mut TomlConfig) -> Result<()> { // update this data to the hypervisor config for later use by hypervisor let hypervisor_name = &config.runtime.hypervisor_name; @@ -500,4 +516,36 @@ mod tests { mgr.setup_config(&mut config).unwrap(); assert_eq!(mgr.get_orig_toml_default_mem(), 256); } + + #[test] + fn test_supplement_from_annotations_fills_missing_spec_sizing() { + let mut mgr = InitialSizeManager { + resource: InitialSize { + vcpu: 0.0, + mem_mb: 0, + orig_toml_default_mem: 0, + }, + }; + + let ann = HashMap::from([ + ( + 
cri_containerd::SANDBOX_CPU_PERIOD_KEY.to_string(), + "100000".to_string(), + ), + ( + cri_containerd::SANDBOX_CPU_QUOTA_KEY.to_string(), + "120000".to_string(), + ), + ( + cri_containerd::SANDBOX_MEM_KEY.to_string(), + (256 * MIB).to_string(), + ), + ]); + + mgr.supplement_from_annotations(&ann).unwrap(); + + const VCPU_TOLERANCE: f32 = 0.0001; + assert!((mgr.resource.vcpu - 1.2).abs() < VCPU_TOLERANCE); + assert_eq!(mgr.resource.mem_mb, 256); + } } diff --git a/src/runtime-rs/crates/runtimes/src/manager.rs b/src/runtime-rs/crates/runtimes/src/manager.rs index a4ec1b8c45..d5b59cbce1 100644 --- a/src/runtime-rs/crates/runtimes/src/manager.rs +++ b/src/runtime-rs/crates/runtimes/src/manager.rs @@ -216,6 +216,13 @@ impl RuntimeHandlerManagerInner { .context("failed to construct static resource manager")? }; + // For CRI sandboxes, sizing annotations are carried in PodSandboxConfig + // and may be absent from the OCI sandbox spec. Fill any missing sizing + // values from sandbox annotations before applying static sizing. + initial_size_manager + .supplement_from_annotations(&sandbox_config.annotations) + .context("failed to supplement static resource manager from annotations")?; + initial_size_manager .setup_config(&mut config) .context("failed to setup static resource mgmt config")?;