diff --git a/src/libs/kata-types/src/config/runtime.rs b/src/libs/kata-types/src/config/runtime.rs index 1d73643686..067ff6776a 100644 --- a/src/libs/kata-types/src/config/runtime.rs +++ b/src/libs/kata-types/src/config/runtime.rs @@ -130,6 +130,12 @@ pub struct Runtime { /// Vendor customized runtime configuration. #[serde(default, flatten)] pub vendor: RuntimeVendor, + + /// If keep_abnormal is enabled, it means that 1) if the runtime exits abnormally, the cleanup process + /// will be skipped, and 2) the runtime will not exit even if the health check fails. + /// This option is typically used to retain abnormal information for debugging. + #[serde(default)] + pub keep_abnormal: bool, } impl ConfigOps for Runtime { diff --git a/src/runtime-rs/config/configuration-dragonball.toml.in b/src/runtime-rs/config/configuration-dragonball.toml.in index 174f270e7a..4c7d3db053 100644 --- a/src/runtime-rs/config/configuration-dragonball.toml.in +++ b/src/runtime-rs/config/configuration-dragonball.toml.in @@ -214,7 +214,14 @@ dial_timeout = 45 # system log # (default: disabled) #enable_debug = true -# + +# If enabled, it means that 1) if the runtime exits abnormally, +# the cleanup process will be skipped, and 2) the runtime will not exit +# even if the health check fails. +# This option is typically used to retain abnormal information for debugging. 
+# (default: false) +#keep_abnormal = true + # Internetworking model # Determines how the VM should be connected to the # the container network interface diff --git a/src/runtime-rs/crates/runtimes/src/manager.rs b/src/runtime-rs/crates/runtimes/src/manager.rs index f97861f23b..b32c367735 100644 --- a/src/runtime-rs/crates/runtimes/src/manager.rs +++ b/src/runtime-rs/crates/runtimes/src/manager.rs @@ -14,6 +14,7 @@ use common::{ RuntimeHandler, RuntimeInstance, Sandbox, SandboxNetworkEnv, }; use hypervisor::Param; +use kata_sys_util::spec::load_oci_spec; use kata_types::{ annotations::Annotation, config::default::DEFAULT_GUEST_DNS_FILE, config::TomlConfig, }; @@ -190,9 +191,16 @@ impl RuntimeHandlerManager { let sender = inner.msg_sender.clone(); let sandbox_state = persist::from_disk::(&inner.id) .context("failed to load the sandbox state")?; + + let config = if let Ok(spec) = load_oci_spec() { + load_config(&spec, &None).context("load config")? + } else { + TomlConfig::default() + }; + let sandbox_args = SandboxRestoreArgs { sid: inner.id.clone(), - toml_config: TomlConfig::default(), + toml_config: config, sender, }; match sandbox_state.sandbox_type.clone() { @@ -208,6 +216,10 @@ impl RuntimeHandlerManager { } #[cfg(feature = "virt")] name if name == VirtContainer::name() => { + if sandbox_args.toml_config.runtime.keep_abnormal { + info!(sl!(), "skip cleanup for keep_abnormal"); + return Ok(()); + } let sandbox = VirtSandbox::restore(sandbox_args, sandbox_state) .await .context("failed to restore the sandbox")?; diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/health_check.rs b/src/runtime-rs/crates/runtimes/virt_container/src/health_check.rs index f6d60c4c41..81fb3d58b5 100644 --- a/src/runtime-rs/crates/runtimes/virt_container/src/health_check.rs +++ b/src/runtime-rs/crates/runtimes/virt_container/src/health_check.rs @@ -21,17 +21,17 @@ const HEALTH_CHECK_STOP_CHANNEL_BUFFER_SIZE: usize = 1; pub struct HealthCheck { pub keep_alive: bool, - 
keep_vm: bool, + keep_abnormal: bool, stop_tx: mpsc::Sender<()>, stop_rx: Arc>>, } impl HealthCheck { - pub fn new(keep_alive: bool, keep_vm: bool) -> HealthCheck { + pub fn new(keep_alive: bool, keep_abnormal: bool) -> HealthCheck { let (tx, rx) = mpsc::channel(HEALTH_CHECK_STOP_CHANNEL_BUFFER_SIZE); HealthCheck { keep_alive, - keep_vm, + keep_abnormal, stop_tx: tx, stop_rx: Arc::new(Mutex::new(rx)), } @@ -46,7 +46,7 @@ impl HealthCheck { info!(sl!(), "start runtime keep alive"); let stop_rx = self.stop_rx.clone(); - let keep_vm = self.keep_vm; + let keep_abnormal = self.keep_abnormal; let _ = tokio::spawn(async move { let mut version_check_threshold_count = 0; @@ -87,7 +87,7 @@ impl HealthCheck { error!(sl!(), "failed to do {} agent health check: {}", id, e); if let Err(mpsc::error::TryRecvError::Empty) = stop_rx.try_recv() { error!(sl!(), "failed to receive stop monitor signal"); - if !keep_vm { + if !keep_abnormal { ::std::process::exit(1); } } else { diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs index 881b5f78b8..c5ec38e463 100644 --- a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs +++ b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs @@ -75,6 +75,8 @@ impl VirtSandbox { hypervisor: Arc, resource_manager: Arc, ) -> Result { + let config = resource_manager.config().await; + let keep_abnormal = config.runtime.keep_abnormal; Ok(Self { sid: sid.to_string(), msg_sender: Arc::new(Mutex::new(msg_sender)), @@ -82,7 +84,7 @@ impl VirtSandbox { agent, hypervisor, resource_manager, - monitor: Arc::new(HealthCheck::new(true, false)), + monitor: Arc::new(HealthCheck::new(true, keep_abnormal)), }) } @@ -440,6 +442,7 @@ impl Persist for VirtSandbox { }?; let agent = Arc::new(KataAgent::new(kata_types::config::Agent::default())); let sid = sandbox_args.sid; + let keep_abnormal = config.runtime.keep_abnormal; let args = ManagerArgs { sid: sid.clone(), agent: 
agent.clone(), @@ -454,7 +457,7 @@ impl Persist for VirtSandbox { agent, hypervisor, resource_manager, - monitor: Arc::new(HealthCheck::new(true, false)), + monitor: Arc::new(HealthCheck::new(true, keep_abnormal)), }) } }