mirror of
https://github.com/kata-containers/kata-containers.git
synced 2025-06-26 15:32:30 +00:00
runtime-rs: support keep_abnormal in toml config
This patch adds keep_abnormal in runtime config. If keep_abnormal = true, it means that 1) if the runtime exits abnormally, the cleanup process will be skipped, and 2) the runtime will not exit even if the health check fails. This option is typically used to retain abnormal information for debugging and should NOT be enabled by default. Fixes: #6717 Signed-off-by: mengze <mengze@linux.alibaba.com> Signed-off-by: quanweiZhou <quanweiZhou@linux.alibaba.com>
This commit is contained in:
parent
97291d88e9
commit
cc8ea3232e
@ -130,6 +130,12 @@ pub struct Runtime {
|
|||||||
/// Vendor customized runtime configuration.
|
/// Vendor customized runtime configuration.
|
||||||
#[serde(default, flatten)]
|
#[serde(default, flatten)]
|
||||||
pub vendor: RuntimeVendor,
|
pub vendor: RuntimeVendor,
|
||||||
|
|
||||||
|
/// If keep_abnormal is enabled, it means that 1) if the runtime exits abnormally, the cleanup process
|
||||||
|
/// will be skipped, and 2) the runtime will not exit even if the health check fails.
|
||||||
|
/// This option is typically used to retain abnormal information for debugging.
|
||||||
|
#[serde(default)]
|
||||||
|
pub keep_abnormal: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ConfigOps for Runtime {
|
impl ConfigOps for Runtime {
|
||||||
|
@ -214,7 +214,14 @@ dial_timeout = 45
|
|||||||
# system log
|
# system log
|
||||||
# (default: disabled)
|
# (default: disabled)
|
||||||
#enable_debug = true
|
#enable_debug = true
|
||||||
#
|
|
||||||
|
# If enabled, it means that 1) if the runtime exits abnormally,
|
||||||
|
# the cleanup process will be skipped, and 2) the runtime will not exit
|
||||||
|
# even if the health check fails.
|
||||||
|
# This option is typically used to retain abnormal information for debugging.
|
||||||
|
# (default: false)
|
||||||
|
#keep_abnormal = true
|
||||||
|
|
||||||
# Internetworking model
|
# Internetworking model
|
||||||
# Determines how the VM should be connected to the
|
# Determines how the VM should be connected to the
|
||||||
# container network interface
|
# container network interface
|
||||||
|
@ -14,6 +14,7 @@ use common::{
|
|||||||
RuntimeHandler, RuntimeInstance, Sandbox, SandboxNetworkEnv,
|
RuntimeHandler, RuntimeInstance, Sandbox, SandboxNetworkEnv,
|
||||||
};
|
};
|
||||||
use hypervisor::Param;
|
use hypervisor::Param;
|
||||||
|
use kata_sys_util::spec::load_oci_spec;
|
||||||
use kata_types::{
|
use kata_types::{
|
||||||
annotations::Annotation, config::default::DEFAULT_GUEST_DNS_FILE, config::TomlConfig,
|
annotations::Annotation, config::default::DEFAULT_GUEST_DNS_FILE, config::TomlConfig,
|
||||||
};
|
};
|
||||||
@ -190,9 +191,16 @@ impl RuntimeHandlerManager {
|
|||||||
let sender = inner.msg_sender.clone();
|
let sender = inner.msg_sender.clone();
|
||||||
let sandbox_state = persist::from_disk::<SandboxState>(&inner.id)
|
let sandbox_state = persist::from_disk::<SandboxState>(&inner.id)
|
||||||
.context("failed to load the sandbox state")?;
|
.context("failed to load the sandbox state")?;
|
||||||
|
|
||||||
|
let config = if let Ok(spec) = load_oci_spec() {
|
||||||
|
load_config(&spec, &None).context("load config")?
|
||||||
|
} else {
|
||||||
|
TomlConfig::default()
|
||||||
|
};
|
||||||
|
|
||||||
let sandbox_args = SandboxRestoreArgs {
|
let sandbox_args = SandboxRestoreArgs {
|
||||||
sid: inner.id.clone(),
|
sid: inner.id.clone(),
|
||||||
toml_config: TomlConfig::default(),
|
toml_config: config,
|
||||||
sender,
|
sender,
|
||||||
};
|
};
|
||||||
match sandbox_state.sandbox_type.clone() {
|
match sandbox_state.sandbox_type.clone() {
|
||||||
@ -208,6 +216,10 @@ impl RuntimeHandlerManager {
|
|||||||
}
|
}
|
||||||
#[cfg(feature = "virt")]
|
#[cfg(feature = "virt")]
|
||||||
name if name == VirtContainer::name() => {
|
name if name == VirtContainer::name() => {
|
||||||
|
if sandbox_args.toml_config.runtime.keep_abnormal {
|
||||||
|
info!(sl!(), "skip cleanup for keep_abnormal");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
let sandbox = VirtSandbox::restore(sandbox_args, sandbox_state)
|
let sandbox = VirtSandbox::restore(sandbox_args, sandbox_state)
|
||||||
.await
|
.await
|
||||||
.context("failed to restore the sandbox")?;
|
.context("failed to restore the sandbox")?;
|
||||||
|
@ -21,17 +21,17 @@ const HEALTH_CHECK_STOP_CHANNEL_BUFFER_SIZE: usize = 1;
|
|||||||
|
|
||||||
pub struct HealthCheck {
|
pub struct HealthCheck {
|
||||||
pub keep_alive: bool,
|
pub keep_alive: bool,
|
||||||
keep_vm: bool,
|
keep_abnormal: bool,
|
||||||
stop_tx: mpsc::Sender<()>,
|
stop_tx: mpsc::Sender<()>,
|
||||||
stop_rx: Arc<Mutex<mpsc::Receiver<()>>>,
|
stop_rx: Arc<Mutex<mpsc::Receiver<()>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl HealthCheck {
|
impl HealthCheck {
|
||||||
pub fn new(keep_alive: bool, keep_vm: bool) -> HealthCheck {
|
pub fn new(keep_alive: bool, keep_abnormal: bool) -> HealthCheck {
|
||||||
let (tx, rx) = mpsc::channel(HEALTH_CHECK_STOP_CHANNEL_BUFFER_SIZE);
|
let (tx, rx) = mpsc::channel(HEALTH_CHECK_STOP_CHANNEL_BUFFER_SIZE);
|
||||||
HealthCheck {
|
HealthCheck {
|
||||||
keep_alive,
|
keep_alive,
|
||||||
keep_vm,
|
keep_abnormal,
|
||||||
stop_tx: tx,
|
stop_tx: tx,
|
||||||
stop_rx: Arc::new(Mutex::new(rx)),
|
stop_rx: Arc::new(Mutex::new(rx)),
|
||||||
}
|
}
|
||||||
@ -46,7 +46,7 @@ impl HealthCheck {
|
|||||||
info!(sl!(), "start runtime keep alive");
|
info!(sl!(), "start runtime keep alive");
|
||||||
|
|
||||||
let stop_rx = self.stop_rx.clone();
|
let stop_rx = self.stop_rx.clone();
|
||||||
let keep_vm = self.keep_vm;
|
let keep_abnormal = self.keep_abnormal;
|
||||||
let _ = tokio::spawn(async move {
|
let _ = tokio::spawn(async move {
|
||||||
let mut version_check_threshold_count = 0;
|
let mut version_check_threshold_count = 0;
|
||||||
|
|
||||||
@ -87,7 +87,7 @@ impl HealthCheck {
|
|||||||
error!(sl!(), "failed to do {} agent health check: {}", id, e);
|
error!(sl!(), "failed to do {} agent health check: {}", id, e);
|
||||||
if let Err(mpsc::error::TryRecvError::Empty) = stop_rx.try_recv() {
|
if let Err(mpsc::error::TryRecvError::Empty) = stop_rx.try_recv() {
|
||||||
error!(sl!(), "failed to receive stop monitor signal");
|
error!(sl!(), "failed to receive stop monitor signal");
|
||||||
if !keep_vm {
|
if !keep_abnormal {
|
||||||
::std::process::exit(1);
|
::std::process::exit(1);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -75,6 +75,8 @@ impl VirtSandbox {
|
|||||||
hypervisor: Arc<dyn Hypervisor>,
|
hypervisor: Arc<dyn Hypervisor>,
|
||||||
resource_manager: Arc<ResourceManager>,
|
resource_manager: Arc<ResourceManager>,
|
||||||
) -> Result<Self> {
|
) -> Result<Self> {
|
||||||
|
let config = resource_manager.config().await;
|
||||||
|
let keep_abnormal = config.runtime.keep_abnormal;
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
sid: sid.to_string(),
|
sid: sid.to_string(),
|
||||||
msg_sender: Arc::new(Mutex::new(msg_sender)),
|
msg_sender: Arc::new(Mutex::new(msg_sender)),
|
||||||
@ -82,7 +84,7 @@ impl VirtSandbox {
|
|||||||
agent,
|
agent,
|
||||||
hypervisor,
|
hypervisor,
|
||||||
resource_manager,
|
resource_manager,
|
||||||
monitor: Arc::new(HealthCheck::new(true, false)),
|
monitor: Arc::new(HealthCheck::new(true, keep_abnormal)),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -440,6 +442,7 @@ impl Persist for VirtSandbox {
|
|||||||
}?;
|
}?;
|
||||||
let agent = Arc::new(KataAgent::new(kata_types::config::Agent::default()));
|
let agent = Arc::new(KataAgent::new(kata_types::config::Agent::default()));
|
||||||
let sid = sandbox_args.sid;
|
let sid = sandbox_args.sid;
|
||||||
|
let keep_abnormal = config.runtime.keep_abnormal;
|
||||||
let args = ManagerArgs {
|
let args = ManagerArgs {
|
||||||
sid: sid.clone(),
|
sid: sid.clone(),
|
||||||
agent: agent.clone(),
|
agent: agent.clone(),
|
||||||
@ -454,7 +457,7 @@ impl Persist for VirtSandbox {
|
|||||||
agent,
|
agent,
|
||||||
hypervisor,
|
hypervisor,
|
||||||
resource_manager,
|
resource_manager,
|
||||||
monitor: Arc::new(HealthCheck::new(true, false)),
|
monitor: Arc::new(HealthCheck::new(true, keep_abnormal)),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user