runtime-rs: support keep_abnormal in toml config
This patch adds keep_abnormal to the runtime config. If keep_abnormal = true,
it means that 1) if the runtime exits abnormally, the cleanup process will be
skipped, and 2) the runtime will not exit even if the health check fails.
This option is typically used to retain abnormal information for debugging
and should NOT be enabled by default.

Fixes: #6717

Signed-off-by: mengze <mengze@linux.alibaba.com>
Signed-off-by: quanweiZhou <quanweiZhou@linux.alibaba.com>
commit cc8ea3232e (parent 97291d88e9)
@@ -130,6 +130,12 @@ pub struct Runtime {
     /// Vendor customized runtime configuration.
     #[serde(default, flatten)]
     pub vendor: RuntimeVendor,
+
+    /// If keep_abnormal is enabled, it means that 1) if the runtime exits abnormally, the cleanup process
+    /// will be skipped, and 2) the runtime will not exit even if the health check fails.
+    /// This option is typically used to retain abnormal information for debugging.
+    #[serde(default)]
+    pub keep_abnormal: bool,
 }
 
 impl ConfigOps for Runtime {
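For context on the #[serde(default)] attribute above: when keep_abnormal is absent from the TOML, serde falls back to bool::default(), i.e. false, so existing configurations keep today's behavior. A minimal, self-contained sketch of that effect (assuming the serde and toml crates; this is not the actual kata-types Runtime struct):

    use serde::Deserialize;

    // Trimmed stand-in for the [runtime] section; only keep_abnormal matters here.
    #[derive(Debug, Default, Deserialize)]
    struct Runtime {
        #[serde(default)]
        keep_abnormal: bool,
    }

    fn main() {
        // Key absent: deserializes to false, preserving the old behavior.
        let old: Runtime = toml::from_str("").unwrap();
        assert!(!old.keep_abnormal);

        // Key present: a debugging setup opts in explicitly.
        let dbg: Runtime = toml::from_str("keep_abnormal = true").unwrap();
        assert!(dbg.keep_abnormal);
        println!("old: {:?}, debug: {:?}", old, dbg);
    }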
@@ -214,7 +214,14 @@ dial_timeout = 45
 # system log
 # (default: disabled)
 #enable_debug = true
 #
+# If enabled, it means that 1) if the runtime exits abnormally,
+# the cleanup process will be skipped, and 2) the runtime will not exit
+# even if the health check fails.
+# This option is typically used to retain abnormal information for debugging.
+# (default: false)
+#keep_abnormal = true
+
 # Internetworking model
 # Determines how the VM should be connected to the
 # the container network interface
@@ -14,6 +14,7 @@ use common::{
     RuntimeHandler, RuntimeInstance, Sandbox, SandboxNetworkEnv,
 };
 use hypervisor::Param;
+use kata_sys_util::spec::load_oci_spec;
 use kata_types::{
     annotations::Annotation, config::default::DEFAULT_GUEST_DNS_FILE, config::TomlConfig,
 };
@@ -190,9 +191,16 @@ impl RuntimeHandlerManager {
         let sender = inner.msg_sender.clone();
         let sandbox_state = persist::from_disk::<SandboxState>(&inner.id)
             .context("failed to load the sandbox state")?;
+
+        let config = if let Ok(spec) = load_oci_spec() {
+            load_config(&spec, &None).context("load config")?
+        } else {
+            TomlConfig::default()
+        };
+
         let sandbox_args = SandboxRestoreArgs {
             sid: inner.id.clone(),
-            toml_config: TomlConfig::default(),
+            toml_config: config,
             sender,
         };
         match sandbox_state.sandbox_type.clone() {
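Before this change the cleanup path restored the sandbox with TomlConfig::default(), so a configured keep_abnormal could never take effect there. The patched path prefers a config derived from the persisted OCI spec and only falls back to defaults. A rough sketch of that pattern with the kata helpers stubbed out (load_oci_spec and load_config below are placeholders, not the real APIs):

    // Placeholder types/functions; only the "use the spec's config, else default"
    // control flow is the point.
    #[derive(Debug, Default)]
    struct TomlConfig {
        keep_abnormal: bool,
    }

    struct Spec; // stand-in for an OCI spec loaded from disk

    fn load_oci_spec() -> Result<Spec, ()> {
        Ok(Spec) // pretend a spec was persisted for this sandbox
    }

    fn load_config(_spec: &Spec) -> TomlConfig {
        TomlConfig { keep_abnormal: true } // pretend the config enables it
    }

    fn main() {
        // Same shape as the patched cleanup path.
        let config = if let Ok(spec) = load_oci_spec() {
            load_config(&spec)
        } else {
            TomlConfig::default()
        };
        println!("keep_abnormal = {}", config.keep_abnormal);
    }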
@@ -208,6 +216,10 @@ impl RuntimeHandlerManager {
             }
             #[cfg(feature = "virt")]
             name if name == VirtContainer::name() => {
+                if sandbox_args.toml_config.runtime.keep_abnormal {
+                    info!(sl!(), "skip cleanup for keep_abnormal");
+                    return Ok(());
+                }
                 let sandbox = VirtSandbox::restore(sandbox_args, sandbox_state)
                     .await
                     .context("failed to restore the sandbox")?;
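The guard itself is a straightforward early return: when keep_abnormal is set, the manager logs and bails out before restoring and tearing the sandbox down, leaving the abnormal state in place for inspection. A toy version (hypothetical cleanup function, not the real manager code):

    // Returns true if cleanup actually ran; with keep_abnormal it becomes a no-op.
    fn cleanup(keep_abnormal: bool) -> bool {
        if keep_abnormal {
            println!("skip cleanup for keep_abnormal");
            return false;
        }
        // ... restore the sandbox and tear it down here ...
        true
    }

    fn main() {
        assert!(cleanup(false)); // default: cleanup proceeds
        assert!(!cleanup(true)); // debugging: abnormal sandbox is preserved
    }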
@@ -21,17 +21,17 @@ const HEALTH_CHECK_STOP_CHANNEL_BUFFER_SIZE: usize = 1;
 
 pub struct HealthCheck {
     pub keep_alive: bool,
-    keep_vm: bool,
+    keep_abnormal: bool,
     stop_tx: mpsc::Sender<()>,
     stop_rx: Arc<Mutex<mpsc::Receiver<()>>>,
 }
 
 impl HealthCheck {
-    pub fn new(keep_alive: bool, keep_vm: bool) -> HealthCheck {
+    pub fn new(keep_alive: bool, keep_abnormal: bool) -> HealthCheck {
         let (tx, rx) = mpsc::channel(HEALTH_CHECK_STOP_CHANNEL_BUFFER_SIZE);
         HealthCheck {
             keep_alive,
-            keep_vm,
+            keep_abnormal,
            stop_tx: tx,
            stop_rx: Arc::new(Mutex::new(rx)),
         }
@@ -46,7 +46,7 @@ impl HealthCheck {
         info!(sl!(), "start runtime keep alive");
 
         let stop_rx = self.stop_rx.clone();
-        let keep_vm = self.keep_vm;
+        let keep_abnormal = self.keep_abnormal;
         let _ = tokio::spawn(async move {
             let mut version_check_threshold_count = 0;
 
@@ -87,7 +87,7 @@ impl HealthCheck {
                         error!(sl!(), "failed to do {} agent health check: {}", id, e);
                         if let Err(mpsc::error::TryRecvError::Empty) = stop_rx.try_recv() {
                             error!(sl!(), "failed to receive stop monitor signal");
-                            if !keep_vm {
+                            if !keep_abnormal {
                                 ::std::process::exit(1);
                             }
                         } else {
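In the health-check loop the decision that used to hinge on keep_vm now hinges on keep_abnormal: a failed agent check only terminates the runtime when the flag is off. A self-contained sketch of that branch, assuming the tokio crate and with the agent call and slog logging stubbed out:

    use std::time::Duration;

    // Stub: pretend the agent has stopped responding.
    async fn agent_health_check() -> Result<(), String> {
        Err("agent not responding".into())
    }

    async fn monitor(keep_abnormal: bool) {
        for _ in 0..3 {
            if let Err(e) = agent_health_check().await {
                eprintln!("health check failed: {e}");
                if !keep_abnormal {
                    // Default behavior: give up and terminate the runtime.
                    std::process::exit(1);
                }
                // keep_abnormal = true: stay alive so the broken sandbox
                // can be inspected.
            }
            tokio::time::sleep(Duration::from_secs(1)).await;
        }
    }

    #[tokio::main]
    async fn main() {
        monitor(true).await;
    }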
@@ -75,6 +75,8 @@ impl VirtSandbox {
         hypervisor: Arc<dyn Hypervisor>,
         resource_manager: Arc<ResourceManager>,
     ) -> Result<Self> {
+        let config = resource_manager.config().await;
+        let keep_abnormal = config.runtime.keep_abnormal;
         Ok(Self {
             sid: sid.to_string(),
             msg_sender: Arc::new(Mutex::new(msg_sender)),
@@ -82,7 +84,7 @@
             agent,
             hypervisor,
             resource_manager,
-            monitor: Arc::new(HealthCheck::new(true, false)),
+            monitor: Arc::new(HealthCheck::new(true, keep_abnormal)),
         })
     }
 
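Both VirtSandbox construction paths now thread the configured value into the monitor instead of the hard-coded false that HealthCheck::new used to receive. A toy version of that wiring (all types below are stand-ins, not the real kata-containers ones):

    struct RuntimeConfig { keep_abnormal: bool }
    struct TomlConfig { runtime: RuntimeConfig }

    struct HealthCheck { keep_alive: bool, keep_abnormal: bool }

    impl HealthCheck {
        fn new(keep_alive: bool, keep_abnormal: bool) -> Self {
            HealthCheck { keep_alive, keep_abnormal }
        }
    }

    struct VirtSandbox { monitor: HealthCheck }

    impl VirtSandbox {
        fn new(config: &TomlConfig) -> Self {
            // Previously HealthCheck::new(true, false); now the config decides.
            let keep_abnormal = config.runtime.keep_abnormal;
            VirtSandbox { monitor: HealthCheck::new(true, keep_abnormal) }
        }
    }

    fn main() {
        let cfg = TomlConfig { runtime: RuntimeConfig { keep_abnormal: true } };
        let sandbox = VirtSandbox::new(&cfg);
        println!(
            "keep_alive={}, keep_abnormal={}",
            sandbox.monitor.keep_alive, sandbox.monitor.keep_abnormal
        );
    }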
@@ -440,6 +442,7 @@ impl Persist for VirtSandbox {
         }?;
         let agent = Arc::new(KataAgent::new(kata_types::config::Agent::default()));
         let sid = sandbox_args.sid;
+        let keep_abnormal = config.runtime.keep_abnormal;
         let args = ManagerArgs {
             sid: sid.clone(),
             agent: agent.clone(),
@@ -454,7 +457,7 @@
             agent,
             hypervisor,
             resource_manager,
-            monitor: Arc::new(HealthCheck::new(true, false)),
+            monitor: Arc::new(HealthCheck::new(true, keep_abnormal)),
         })
     }
 }