runtime-rs: support keep_abnormal in toml config

This patch adds a keep_abnormal option to the runtime config. When
keep_abnormal = true, 1) the cleanup process is skipped if the runtime
exits abnormally, and 2) the runtime does not exit even if the health
check fails.

This option is typically used to retain abnormal information for
debugging and should NOT be enabled by default.

Fixes: #6717

Signed-off-by: mengze <mengze@linux.alibaba.com>
Signed-off-by: quanweiZhou <quanweiZhou@linux.alibaba.com>
This commit is contained in:
mengze 2023-04-17 16:41:01 +08:00
parent 97291d88e9
commit cc8ea3232e
5 changed files with 37 additions and 9 deletions

View File

@ -130,6 +130,12 @@ pub struct Runtime {
/// Vendor customized runtime configuration. /// Vendor customized runtime configuration.
#[serde(default, flatten)] #[serde(default, flatten)]
pub vendor: RuntimeVendor, pub vendor: RuntimeVendor,
/// If keep_abnormal is enabled, it means that 1) if the runtime exits abnormally, the cleanup process
/// will be skipped, and 2) the runtime will not exit even if the health check fails.
/// This option is typically used to retain abnormal information for debugging.
#[serde(default)]
pub keep_abnormal: bool,
} }
impl ConfigOps for Runtime { impl ConfigOps for Runtime {

View File

@ -214,7 +214,14 @@ dial_timeout = 45
# system log # system log
# (default: disabled) # (default: disabled)
#enable_debug = true #enable_debug = true
#
# If enabled, it means that 1) if the runtime exits abnormally,
# the cleanup process will be skipped, and 2) the runtime will not exit
# even if the health check fails.
# This option is typically used to retain abnormal information for debugging.
# (default: false)
#keep_abnormal = true
# Internetworking model # Internetworking model
# Determines how the VM should be connected to the # Determines how the VM should be connected to the
# container network interface # container network interface

View File

@ -14,6 +14,7 @@ use common::{
RuntimeHandler, RuntimeInstance, Sandbox, SandboxNetworkEnv, RuntimeHandler, RuntimeInstance, Sandbox, SandboxNetworkEnv,
}; };
use hypervisor::Param; use hypervisor::Param;
use kata_sys_util::spec::load_oci_spec;
use kata_types::{ use kata_types::{
annotations::Annotation, config::default::DEFAULT_GUEST_DNS_FILE, config::TomlConfig, annotations::Annotation, config::default::DEFAULT_GUEST_DNS_FILE, config::TomlConfig,
}; };
@ -190,9 +191,16 @@ impl RuntimeHandlerManager {
let sender = inner.msg_sender.clone(); let sender = inner.msg_sender.clone();
let sandbox_state = persist::from_disk::<SandboxState>(&inner.id) let sandbox_state = persist::from_disk::<SandboxState>(&inner.id)
.context("failed to load the sandbox state")?; .context("failed to load the sandbox state")?;
let config = if let Ok(spec) = load_oci_spec() {
load_config(&spec, &None).context("load config")?
} else {
TomlConfig::default()
};
let sandbox_args = SandboxRestoreArgs { let sandbox_args = SandboxRestoreArgs {
sid: inner.id.clone(), sid: inner.id.clone(),
toml_config: TomlConfig::default(), toml_config: config,
sender, sender,
}; };
match sandbox_state.sandbox_type.clone() { match sandbox_state.sandbox_type.clone() {
@ -208,6 +216,10 @@ impl RuntimeHandlerManager {
} }
#[cfg(feature = "virt")] #[cfg(feature = "virt")]
name if name == VirtContainer::name() => { name if name == VirtContainer::name() => {
if sandbox_args.toml_config.runtime.keep_abnormal {
info!(sl!(), "skip cleanup for keep_abnormal");
return Ok(());
}
let sandbox = VirtSandbox::restore(sandbox_args, sandbox_state) let sandbox = VirtSandbox::restore(sandbox_args, sandbox_state)
.await .await
.context("failed to restore the sandbox")?; .context("failed to restore the sandbox")?;

View File

@ -21,17 +21,17 @@ const HEALTH_CHECK_STOP_CHANNEL_BUFFER_SIZE: usize = 1;
pub struct HealthCheck { pub struct HealthCheck {
pub keep_alive: bool, pub keep_alive: bool,
keep_vm: bool, keep_abnormal: bool,
stop_tx: mpsc::Sender<()>, stop_tx: mpsc::Sender<()>,
stop_rx: Arc<Mutex<mpsc::Receiver<()>>>, stop_rx: Arc<Mutex<mpsc::Receiver<()>>>,
} }
impl HealthCheck { impl HealthCheck {
pub fn new(keep_alive: bool, keep_vm: bool) -> HealthCheck { pub fn new(keep_alive: bool, keep_abnormal: bool) -> HealthCheck {
let (tx, rx) = mpsc::channel(HEALTH_CHECK_STOP_CHANNEL_BUFFER_SIZE); let (tx, rx) = mpsc::channel(HEALTH_CHECK_STOP_CHANNEL_BUFFER_SIZE);
HealthCheck { HealthCheck {
keep_alive, keep_alive,
keep_vm, keep_abnormal,
stop_tx: tx, stop_tx: tx,
stop_rx: Arc::new(Mutex::new(rx)), stop_rx: Arc::new(Mutex::new(rx)),
} }
@ -46,7 +46,7 @@ impl HealthCheck {
info!(sl!(), "start runtime keep alive"); info!(sl!(), "start runtime keep alive");
let stop_rx = self.stop_rx.clone(); let stop_rx = self.stop_rx.clone();
let keep_vm = self.keep_vm; let keep_abnormal = self.keep_abnormal;
let _ = tokio::spawn(async move { let _ = tokio::spawn(async move {
let mut version_check_threshold_count = 0; let mut version_check_threshold_count = 0;
@ -87,7 +87,7 @@ impl HealthCheck {
error!(sl!(), "failed to do {} agent health check: {}", id, e); error!(sl!(), "failed to do {} agent health check: {}", id, e);
if let Err(mpsc::error::TryRecvError::Empty) = stop_rx.try_recv() { if let Err(mpsc::error::TryRecvError::Empty) = stop_rx.try_recv() {
error!(sl!(), "failed to receive stop monitor signal"); error!(sl!(), "failed to receive stop monitor signal");
if !keep_vm { if !keep_abnormal {
::std::process::exit(1); ::std::process::exit(1);
} }
} else { } else {

View File

@ -75,6 +75,8 @@ impl VirtSandbox {
hypervisor: Arc<dyn Hypervisor>, hypervisor: Arc<dyn Hypervisor>,
resource_manager: Arc<ResourceManager>, resource_manager: Arc<ResourceManager>,
) -> Result<Self> { ) -> Result<Self> {
let config = resource_manager.config().await;
let keep_abnormal = config.runtime.keep_abnormal;
Ok(Self { Ok(Self {
sid: sid.to_string(), sid: sid.to_string(),
msg_sender: Arc::new(Mutex::new(msg_sender)), msg_sender: Arc::new(Mutex::new(msg_sender)),
@ -82,7 +84,7 @@ impl VirtSandbox {
agent, agent,
hypervisor, hypervisor,
resource_manager, resource_manager,
monitor: Arc::new(HealthCheck::new(true, false)), monitor: Arc::new(HealthCheck::new(true, keep_abnormal)),
}) })
} }
@ -440,6 +442,7 @@ impl Persist for VirtSandbox {
}?; }?;
let agent = Arc::new(KataAgent::new(kata_types::config::Agent::default())); let agent = Arc::new(KataAgent::new(kata_types::config::Agent::default()));
let sid = sandbox_args.sid; let sid = sandbox_args.sid;
let keep_abnormal = config.runtime.keep_abnormal;
let args = ManagerArgs { let args = ManagerArgs {
sid: sid.clone(), sid: sid.clone(),
agent: agent.clone(), agent: agent.clone(),
@ -454,7 +457,7 @@ impl Persist for VirtSandbox {
agent, agent,
hypervisor, hypervisor,
resource_manager, resource_manager,
monitor: Arc::new(HealthCheck::new(true, false)), monitor: Arc::new(HealthCheck::new(true, keep_abnormal)),
}) })
} }
} }