runtime-rs: support keep_abnormal in toml config

This patch adds keep_abnormal in runtime config. If keep_abnormal =
true, it means that 1) if the runtime exits abnormally, the cleanup
process will be skipped, and 2) the runtime will not exit even if the
health check fails.

This option is typically used to retain abnormal information for
debugging and should NOT be enabled by default.

Fixes: #6717

Signed-off-by: mengze <mengze@linux.alibaba.com>
Signed-off-by: quanweiZhou <quanweiZhou@linux.alibaba.com>
This commit is contained in:
mengze 2023-04-17 16:41:01 +08:00
parent 97291d88e9
commit cc8ea3232e
5 changed files with 37 additions and 9 deletions

View File

@ -130,6 +130,12 @@ pub struct Runtime {
/// Vendor customized runtime configuration.
#[serde(default, flatten)]
pub vendor: RuntimeVendor,
/// If keep_abnormal is enabled, it means that 1) if the runtime exits abnormally, the cleanup process
/// will be skipped, and 2) the runtime will not exit even if the health check fails.
/// This option is typically used to retain abnormal information for debugging.
#[serde(default)]
pub keep_abnormal: bool,
}
impl ConfigOps for Runtime {

View File

@ -214,7 +214,14 @@ dial_timeout = 45
# system log
# (default: disabled)
#enable_debug = true
#
# If enabled, it means that 1) if the runtime exits abnormally,
# the cleanup process will be skipped, and 2) the runtime will not exit
# even if the health check fails.
# This option is typically used to retain abnormal information for debugging.
# (default: false)
#keep_abnormal = true
# Internetworking model
# Determines how the VM should be connected to the
# container network interface

View File

@ -14,6 +14,7 @@ use common::{
RuntimeHandler, RuntimeInstance, Sandbox, SandboxNetworkEnv,
};
use hypervisor::Param;
use kata_sys_util::spec::load_oci_spec;
use kata_types::{
annotations::Annotation, config::default::DEFAULT_GUEST_DNS_FILE, config::TomlConfig,
};
@ -190,9 +191,16 @@ impl RuntimeHandlerManager {
let sender = inner.msg_sender.clone();
let sandbox_state = persist::from_disk::<SandboxState>(&inner.id)
.context("failed to load the sandbox state")?;
let config = if let Ok(spec) = load_oci_spec() {
load_config(&spec, &None).context("load config")?
} else {
TomlConfig::default()
};
let sandbox_args = SandboxRestoreArgs {
sid: inner.id.clone(),
toml_config: TomlConfig::default(),
toml_config: config,
sender,
};
match sandbox_state.sandbox_type.clone() {
@ -208,6 +216,10 @@ impl RuntimeHandlerManager {
}
#[cfg(feature = "virt")]
name if name == VirtContainer::name() => {
if sandbox_args.toml_config.runtime.keep_abnormal {
info!(sl!(), "skip cleanup for keep_abnormal");
return Ok(());
}
let sandbox = VirtSandbox::restore(sandbox_args, sandbox_state)
.await
.context("failed to restore the sandbox")?;

View File

@ -21,17 +21,17 @@ const HEALTH_CHECK_STOP_CHANNEL_BUFFER_SIZE: usize = 1;
pub struct HealthCheck {
pub keep_alive: bool,
keep_vm: bool,
keep_abnormal: bool,
stop_tx: mpsc::Sender<()>,
stop_rx: Arc<Mutex<mpsc::Receiver<()>>>,
}
impl HealthCheck {
pub fn new(keep_alive: bool, keep_vm: bool) -> HealthCheck {
pub fn new(keep_alive: bool, keep_abnormal: bool) -> HealthCheck {
let (tx, rx) = mpsc::channel(HEALTH_CHECK_STOP_CHANNEL_BUFFER_SIZE);
HealthCheck {
keep_alive,
keep_vm,
keep_abnormal,
stop_tx: tx,
stop_rx: Arc::new(Mutex::new(rx)),
}
@ -46,7 +46,7 @@ impl HealthCheck {
info!(sl!(), "start runtime keep alive");
let stop_rx = self.stop_rx.clone();
let keep_vm = self.keep_vm;
let keep_abnormal = self.keep_abnormal;
let _ = tokio::spawn(async move {
let mut version_check_threshold_count = 0;
@ -87,7 +87,7 @@ impl HealthCheck {
error!(sl!(), "failed to do {} agent health check: {}", id, e);
if let Err(mpsc::error::TryRecvError::Empty) = stop_rx.try_recv() {
error!(sl!(), "failed to receive stop monitor signal");
if !keep_vm {
if !keep_abnormal {
::std::process::exit(1);
}
} else {

View File

@ -75,6 +75,8 @@ impl VirtSandbox {
hypervisor: Arc<dyn Hypervisor>,
resource_manager: Arc<ResourceManager>,
) -> Result<Self> {
let config = resource_manager.config().await;
let keep_abnormal = config.runtime.keep_abnormal;
Ok(Self {
sid: sid.to_string(),
msg_sender: Arc::new(Mutex::new(msg_sender)),
@ -82,7 +84,7 @@ impl VirtSandbox {
agent,
hypervisor,
resource_manager,
monitor: Arc::new(HealthCheck::new(true, false)),
monitor: Arc::new(HealthCheck::new(true, keep_abnormal)),
})
}
@ -440,6 +442,7 @@ impl Persist for VirtSandbox {
}?;
let agent = Arc::new(KataAgent::new(kata_types::config::Agent::default()));
let sid = sandbox_args.sid;
let keep_abnormal = config.runtime.keep_abnormal;
let args = ManagerArgs {
sid: sid.clone(),
agent: agent.clone(),
@ -454,7 +457,7 @@ impl Persist for VirtSandbox {
agent,
hypervisor,
resource_manager,
monitor: Arc::new(HealthCheck::new(true, false)),
monitor: Arc::new(HealthCheck::new(true, keep_abnormal)),
})
}
}