Merge pull request #6718 from openanolis/mengze/keep_abnormal

runtime-rs: support keep_abnormal in toml config
This commit is contained in:
Bin Liu 2023-04-26 12:36:52 +08:00 committed by GitHub
commit 509bc8b6c8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 37 additions and 9 deletions

View File

@ -130,6 +130,12 @@ pub struct Runtime {
/// Vendor customized runtime configuration. /// Vendor customized runtime configuration.
#[serde(default, flatten)] #[serde(default, flatten)]
pub vendor: RuntimeVendor, pub vendor: RuntimeVendor,
/// If keep_abnormal is enabled, it means that 1) if the runtime exits abnormally, the cleanup process
/// will be skipped, and 2) the runtime will not exit even if the health check fails.
/// This option is typically used to retain abnormal information for debugging.
#[serde(default)]
pub keep_abnormal: bool,
} }
impl ConfigOps for Runtime { impl ConfigOps for Runtime {

View File

@ -214,7 +214,14 @@ dial_timeout = 45
# system log # system log
# (default: disabled) # (default: disabled)
#enable_debug = true #enable_debug = true
#
# If enabled, it means that 1) if the runtime exits abnormally,
# the cleanup process will be skipped, and 2) the runtime will not exit
# even if the health check fails.
# This option is typically used to retain abnormal information for debugging.
# (default: false)
#keep_abnormal = true
# Internetworking model # Internetworking model
# Determines how the VM should be connected to the # Determines how the VM should be connected to the
# container network interface # container network interface

View File

@ -14,6 +14,7 @@ use common::{
RuntimeHandler, RuntimeInstance, Sandbox, SandboxNetworkEnv, RuntimeHandler, RuntimeInstance, Sandbox, SandboxNetworkEnv,
}; };
use hypervisor::Param; use hypervisor::Param;
use kata_sys_util::spec::load_oci_spec;
use kata_types::{ use kata_types::{
annotations::Annotation, config::default::DEFAULT_GUEST_DNS_FILE, config::TomlConfig, annotations::Annotation, config::default::DEFAULT_GUEST_DNS_FILE, config::TomlConfig,
}; };
@ -190,9 +191,16 @@ impl RuntimeHandlerManager {
let sender = inner.msg_sender.clone(); let sender = inner.msg_sender.clone();
let sandbox_state = persist::from_disk::<SandboxState>(&inner.id) let sandbox_state = persist::from_disk::<SandboxState>(&inner.id)
.context("failed to load the sandbox state")?; .context("failed to load the sandbox state")?;
let config = if let Ok(spec) = load_oci_spec() {
load_config(&spec, &None).context("load config")?
} else {
TomlConfig::default()
};
let sandbox_args = SandboxRestoreArgs { let sandbox_args = SandboxRestoreArgs {
sid: inner.id.clone(), sid: inner.id.clone(),
toml_config: TomlConfig::default(), toml_config: config,
sender, sender,
}; };
match sandbox_state.sandbox_type.clone() { match sandbox_state.sandbox_type.clone() {
@ -208,6 +216,10 @@ impl RuntimeHandlerManager {
} }
#[cfg(feature = "virt")] #[cfg(feature = "virt")]
name if name == VirtContainer::name() => { name if name == VirtContainer::name() => {
if sandbox_args.toml_config.runtime.keep_abnormal {
info!(sl!(), "skip cleanup for keep_abnormal");
return Ok(());
}
let sandbox = VirtSandbox::restore(sandbox_args, sandbox_state) let sandbox = VirtSandbox::restore(sandbox_args, sandbox_state)
.await .await
.context("failed to restore the sandbox")?; .context("failed to restore the sandbox")?;

View File

@ -21,17 +21,17 @@ const HEALTH_CHECK_STOP_CHANNEL_BUFFER_SIZE: usize = 1;
pub struct HealthCheck { pub struct HealthCheck {
pub keep_alive: bool, pub keep_alive: bool,
keep_vm: bool, keep_abnormal: bool,
stop_tx: mpsc::Sender<()>, stop_tx: mpsc::Sender<()>,
stop_rx: Arc<Mutex<mpsc::Receiver<()>>>, stop_rx: Arc<Mutex<mpsc::Receiver<()>>>,
} }
impl HealthCheck { impl HealthCheck {
pub fn new(keep_alive: bool, keep_vm: bool) -> HealthCheck { pub fn new(keep_alive: bool, keep_abnormal: bool) -> HealthCheck {
let (tx, rx) = mpsc::channel(HEALTH_CHECK_STOP_CHANNEL_BUFFER_SIZE); let (tx, rx) = mpsc::channel(HEALTH_CHECK_STOP_CHANNEL_BUFFER_SIZE);
HealthCheck { HealthCheck {
keep_alive, keep_alive,
keep_vm, keep_abnormal,
stop_tx: tx, stop_tx: tx,
stop_rx: Arc::new(Mutex::new(rx)), stop_rx: Arc::new(Mutex::new(rx)),
} }
@ -46,7 +46,7 @@ impl HealthCheck {
info!(sl!(), "start runtime keep alive"); info!(sl!(), "start runtime keep alive");
let stop_rx = self.stop_rx.clone(); let stop_rx = self.stop_rx.clone();
let keep_vm = self.keep_vm; let keep_abnormal = self.keep_abnormal;
let _ = tokio::spawn(async move { let _ = tokio::spawn(async move {
let mut version_check_threshold_count = 0; let mut version_check_threshold_count = 0;
@ -87,7 +87,7 @@ impl HealthCheck {
error!(sl!(), "failed to do {} agent health check: {}", id, e); error!(sl!(), "failed to do {} agent health check: {}", id, e);
if let Err(mpsc::error::TryRecvError::Empty) = stop_rx.try_recv() { if let Err(mpsc::error::TryRecvError::Empty) = stop_rx.try_recv() {
error!(sl!(), "failed to receive stop monitor signal"); error!(sl!(), "failed to receive stop monitor signal");
if !keep_vm { if !keep_abnormal {
::std::process::exit(1); ::std::process::exit(1);
} }
} else { } else {

View File

@ -75,6 +75,8 @@ impl VirtSandbox {
hypervisor: Arc<dyn Hypervisor>, hypervisor: Arc<dyn Hypervisor>,
resource_manager: Arc<ResourceManager>, resource_manager: Arc<ResourceManager>,
) -> Result<Self> { ) -> Result<Self> {
let config = resource_manager.config().await;
let keep_abnormal = config.runtime.keep_abnormal;
Ok(Self { Ok(Self {
sid: sid.to_string(), sid: sid.to_string(),
msg_sender: Arc::new(Mutex::new(msg_sender)), msg_sender: Arc::new(Mutex::new(msg_sender)),
@ -82,7 +84,7 @@ impl VirtSandbox {
agent, agent,
hypervisor, hypervisor,
resource_manager, resource_manager,
monitor: Arc::new(HealthCheck::new(true, false)), monitor: Arc::new(HealthCheck::new(true, keep_abnormal)),
}) })
} }
@ -440,6 +442,7 @@ impl Persist for VirtSandbox {
}?; }?;
let agent = Arc::new(KataAgent::new(kata_types::config::Agent::default())); let agent = Arc::new(KataAgent::new(kata_types::config::Agent::default()));
let sid = sandbox_args.sid; let sid = sandbox_args.sid;
let keep_abnormal = config.runtime.keep_abnormal;
let args = ManagerArgs { let args = ManagerArgs {
sid: sid.clone(), sid: sid.clone(),
agent: agent.clone(), agent: agent.clone(),
@ -454,7 +457,7 @@ impl Persist for VirtSandbox {
agent, agent,
hypervisor, hypervisor,
resource_manager, resource_manager,
monitor: Arc::new(HealthCheck::new(true, false)), monitor: Arc::new(HealthCheck::new(true, keep_abnormal)),
}) })
} }
} }