From cc8ea3232ef8ad574abf56a19d3fb5e5fa6377ee Mon Sep 17 00:00:00 2001 From: mengze Date: Mon, 17 Apr 2023 16:41:01 +0800 Subject: [PATCH] runtime-rs: support keep_abnormal in toml config This patch adds keep_abnormal in runtime config. If keep_abnormal = true, it means that 1) if the runtime exits abnormally, the cleanup process will be skipped, and 2) the runtime will not exit even if the health check fails. This option is typically used to retain abnormal information for debugging and should NOT be enabled by default. Fixes: #6717 Signed-off-by: mengze Signed-off-by: quanweiZhou --- src/libs/kata-types/src/config/runtime.rs | 6 ++++++ .../config/configuration-dragonball.toml.in | 9 ++++++++- src/runtime-rs/crates/runtimes/src/manager.rs | 14 +++++++++++++- .../runtimes/virt_container/src/health_check.rs | 10 +++++----- .../crates/runtimes/virt_container/src/sandbox.rs | 7 +++++-- 5 files changed, 37 insertions(+), 9 deletions(-) diff --git a/src/libs/kata-types/src/config/runtime.rs b/src/libs/kata-types/src/config/runtime.rs index 1d73643686..067ff6776a 100644 --- a/src/libs/kata-types/src/config/runtime.rs +++ b/src/libs/kata-types/src/config/runtime.rs @@ -130,6 +130,12 @@ pub struct Runtime { /// Vendor customized runtime configuration. #[serde(default, flatten)] pub vendor: RuntimeVendor, + + /// If keep_abnormal is enabled, it means that 1) if the runtime exits abnormally, the cleanup process + /// will be skipped, and 2) the runtime will not exit even if the health check fails. + /// This option is typically used to retain abnormal information for debugging. 
+ #[serde(default)] + pub keep_abnormal: bool, } impl ConfigOps for Runtime { diff --git a/src/runtime-rs/config/configuration-dragonball.toml.in b/src/runtime-rs/config/configuration-dragonball.toml.in index 174f270e7a..4c7d3db053 100644 --- a/src/runtime-rs/config/configuration-dragonball.toml.in +++ b/src/runtime-rs/config/configuration-dragonball.toml.in @@ -214,7 +214,14 @@ dial_timeout = 45 # system log # (default: disabled) #enable_debug = true -# + +# If enabled, it means that 1) if the runtime exits abnormally, +# the cleanup process will be skipped, and 2) the runtime will not exit +# even if the health check fails. +# This option is typically used to retain abnormal information for debugging. +# (default: false) +#keep_abnormal = true + # Internetworking model # Determines how the VM should be connected to the # the container network interface diff --git a/src/runtime-rs/crates/runtimes/src/manager.rs b/src/runtime-rs/crates/runtimes/src/manager.rs index f97861f23b..b32c367735 100644 --- a/src/runtime-rs/crates/runtimes/src/manager.rs +++ b/src/runtime-rs/crates/runtimes/src/manager.rs @@ -14,6 +14,7 @@ use common::{ RuntimeHandler, RuntimeInstance, Sandbox, SandboxNetworkEnv, }; use hypervisor::Param; +use kata_sys_util::spec::load_oci_spec; use kata_types::{ annotations::Annotation, config::default::DEFAULT_GUEST_DNS_FILE, config::TomlConfig, }; @@ -190,9 +191,16 @@ impl RuntimeHandlerManager { let sender = inner.msg_sender.clone(); let sandbox_state = persist::from_disk::(&inner.id) .context("failed to load the sandbox state")?; + + let config = if let Ok(spec) = load_oci_spec() { + load_config(&spec, &None).context("load config")? 
+ } else { + TomlConfig::default() + }; + let sandbox_args = SandboxRestoreArgs { sid: inner.id.clone(), - toml_config: TomlConfig::default(), + toml_config: config, sender, }; match sandbox_state.sandbox_type.clone() { @@ -208,6 +216,10 @@ impl RuntimeHandlerManager { } #[cfg(feature = "virt")] name if name == VirtContainer::name() => { + if sandbox_args.toml_config.runtime.keep_abnormal { + info!(sl!(), "skip cleanup for keep_abnormal"); + return Ok(()); + } let sandbox = VirtSandbox::restore(sandbox_args, sandbox_state) .await .context("failed to restore the sandbox")?; diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/health_check.rs b/src/runtime-rs/crates/runtimes/virt_container/src/health_check.rs index f6d60c4c41..81fb3d58b5 100644 --- a/src/runtime-rs/crates/runtimes/virt_container/src/health_check.rs +++ b/src/runtime-rs/crates/runtimes/virt_container/src/health_check.rs @@ -21,17 +21,17 @@ const HEALTH_CHECK_STOP_CHANNEL_BUFFER_SIZE: usize = 1; pub struct HealthCheck { pub keep_alive: bool, - keep_vm: bool, + keep_abnormal: bool, stop_tx: mpsc::Sender<()>, stop_rx: Arc>>, } impl HealthCheck { - pub fn new(keep_alive: bool, keep_vm: bool) -> HealthCheck { + pub fn new(keep_alive: bool, keep_abnormal: bool) -> HealthCheck { let (tx, rx) = mpsc::channel(HEALTH_CHECK_STOP_CHANNEL_BUFFER_SIZE); HealthCheck { keep_alive, - keep_vm, + keep_abnormal, stop_tx: tx, stop_rx: Arc::new(Mutex::new(rx)), } @@ -46,7 +46,7 @@ impl HealthCheck { info!(sl!(), "start runtime keep alive"); let stop_rx = self.stop_rx.clone(); - let keep_vm = self.keep_vm; + let keep_abnormal = self.keep_abnormal; let _ = tokio::spawn(async move { let mut version_check_threshold_count = 0; @@ -87,7 +87,7 @@ impl HealthCheck { error!(sl!(), "failed to do {} agent health check: {}", id, e); if let Err(mpsc::error::TryRecvError::Empty) = stop_rx.try_recv() { error!(sl!(), "failed to receive stop monitor signal"); - if !keep_vm { + if !keep_abnormal { ::std::process::exit(1); } } else 
{ diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs index 881b5f78b8..c5ec38e463 100644 --- a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs +++ b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs @@ -75,6 +75,8 @@ impl VirtSandbox { hypervisor: Arc, resource_manager: Arc, ) -> Result { + let config = resource_manager.config().await; + let keep_abnormal = config.runtime.keep_abnormal; Ok(Self { sid: sid.to_string(), msg_sender: Arc::new(Mutex::new(msg_sender)), @@ -82,7 +84,7 @@ impl VirtSandbox { agent, hypervisor, resource_manager, - monitor: Arc::new(HealthCheck::new(true, false)), + monitor: Arc::new(HealthCheck::new(true, keep_abnormal)), }) } @@ -440,6 +442,7 @@ impl Persist for VirtSandbox { }?; let agent = Arc::new(KataAgent::new(kata_types::config::Agent::default())); let sid = sandbox_args.sid; + let keep_abnormal = config.runtime.keep_abnormal; let args = ManagerArgs { sid: sid.clone(), agent: agent.clone(), @@ -454,7 +457,7 @@ impl Persist for VirtSandbox { agent, hypervisor, resource_manager, - monitor: Arc::new(HealthCheck::new(true, false)), + monitor: Arc::new(HealthCheck::new(true, keep_abnormal)), }) } }