diff --git a/src/runtime-rs/crates/resource/src/cgroups/mod.rs b/src/runtime-rs/crates/resource/src/cgroups/mod.rs index b7f515d7f0..d32fa479d3 100644 --- a/src/runtime-rs/crates/resource/src/cgroups/mod.rs +++ b/src/runtime-rs/crates/resource/src/cgroups/mod.rs @@ -26,6 +26,8 @@ use oci::LinuxResources; use persist::sandbox_persist::Persist; use tokio::sync::RwLock; +use crate::ResourceUpdateOp; + const OS_ERROR_NO_SUCH_PROCESS: i32 = 3; pub struct CgroupArgs { @@ -149,29 +151,51 @@ impl CgroupsResource { &self, cid: &str, linux_resources: Option<&LinuxResources>, + op: ResourceUpdateOp, h: &dyn Hypervisor, ) -> Result<()> { - let resource = self.calc_resource(linux_resources); - let changed = self.update_resources(cid, resource).await; + let new_resources = self.calc_resource(linux_resources); + let old_resources = self.update_resources(cid, new_resources.clone(), op).await; - if !changed { - return Ok(()); - } - - self.do_update_cgroups(h).await - } - - async fn update_resources(&self, cid: &str, new_resource: Resources) -> bool { - let mut resources = self.resources.write().await; - let old_resource = resources.insert(cid.to_owned(), new_resource.clone()); - - if let Some(old_resource) = old_resource { - if old_resource == new_resource { - return false; + if let Some(old_resource) = old_resources.clone() { + if old_resource == new_resources { + return Ok(()); } } - true + match self.do_update_cgroups(h).await { + Err(e) => { + // if update failed, we should roll back the records in resources + let mut resources = self.resources.write().await; + match op { + ResourceUpdateOp::Add => { + resources.remove(cid); + } + ResourceUpdateOp::Update | ResourceUpdateOp::Del => { + if let Some(old_resource) = old_resources { + resources.insert(cid.to_owned(), old_resource.clone()); + } + } + } + Err(e) + } + Ok(()) => Ok(()), + } + } + + async fn update_resources( + &self, + cid: &str, + new_resource: Resources, + op: ResourceUpdateOp, + ) -> Option { + let mut resources = self.resources.write().await; + match op { + ResourceUpdateOp::Add | ResourceUpdateOp::Update => { + resources.insert(cid.to_owned(), new_resource.clone()) + } + ResourceUpdateOp::Del => resources.remove(cid), + } } async fn do_update_cgroups(&self, h: &dyn Hypervisor) -> Result<()> { diff --git a/src/runtime-rs/crates/resource/src/lib.rs b/src/runtime-rs/crates/resource/src/lib.rs index e008c357e9..4e4aae9e87 100644 --- a/src/runtime-rs/crates/resource/src/lib.rs +++ b/src/runtime-rs/crates/resource/src/lib.rs @@ -32,7 +32,7 @@ pub enum ResourceConfig { ShareFs(SharedFsInfo), } -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq)] pub enum ResourceUpdateOp { Add, Del, diff --git a/src/runtime-rs/crates/resource/src/manager.rs b/src/runtime-rs/crates/resource/src/manager.rs index 78c3d8e22e..6345dbff64 100644 --- a/src/runtime-rs/crates/resource/src/manager.rs +++ b/src/runtime-rs/crates/resource/src/manager.rs @@ -113,15 +113,6 @@ impl ResourceManager { inner.dump().await } - pub async fn update_cgroups( - &self, - cid: &str, - linux_resources: Option<&LinuxResources>, - ) -> Result<()> { - let inner = self.inner.read().await; - inner.update_cgroups(cid, linux_resources).await - } - pub async fn update_linux_resource( &self, cid: &str, diff --git a/src/runtime-rs/crates/resource/src/manager_inner.rs b/src/runtime-rs/crates/resource/src/manager_inner.rs index 3eced83400..d565368eba 100644 --- a/src/runtime-rs/crates/resource/src/manager_inner.rs +++ b/src/runtime-rs/crates/resource/src/manager_inner.rs @@ -317,17 +317,7 @@ impl ResourceManagerInner { sb_bindmnt.cleanup_sandbox_bind_mounts() } } - - pub async fn update_cgroups( - &self, - cid: &str, - linux_resources: Option<&LinuxResources>, - ) -> Result<()> { - self.cgroups_resource - .update_cgroups(cid, linux_resources, self.hypervisor.as_ref()) - .await - } - + pub async fn cleanup(&self) -> Result<()> { // clean up cgroup self.cgroups_resource @@ -365,15 +355,24 @@ impl ResourceManagerInner { ) -> Result<()> { let linux_cpus = || -> Option<&LinuxCpu> { linux_resources.as_ref()?.cpu.as_ref() }(); - self.cpu_resource - .update_cpu_resources( - cid, - linux_cpus, - op, - self.hypervisor.as_ref(), - self.agent.as_ref(), - ) + // if static_sandbox_resource_mgmt, we will not have to update sandbox's cpu or mem resource + if !self.toml_config.runtime.static_sandbox_resource_mgmt { + self.cpu_resource + .update_cpu_resources( + cid, + linux_cpus, + op, + self.hypervisor.as_ref(), + self.agent.as_ref(), + ) + .await?; + } + + // we should firstly update the vcpus and mems, and then update the host cgroups + self.cgroups_resource + .update_cgroups(cid, linux_resources, op, self.hypervisor.as_ref()) .await?; + Ok(()) } } diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/container.rs b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/container.rs index 165cda63e3..cbde460a57 100644 --- a/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/container.rs +++ b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/container.rs @@ -19,7 +19,7 @@ use common::{ use kata_sys_util::k8s::update_ephemeral_storage_type; use oci::{LinuxResources, Process as OCIProcess}; -use resource::ResourceManager; +use resource::{ResourceManager, ResourceUpdateOp}; use tokio::sync::RwLock; use super::{ @@ -155,11 +155,12 @@ impl Container { // update cgroups self.resource_manager - .update_cgroups( + .update_linux_resource( &config.container_id, spec.linux .as_ref() .and_then(|linux| linux.resources.as_ref()), + ResourceUpdateOp::Add, ) .await?; @@ -402,7 +403,11 @@ impl Container { pub async fn update(&self, resources: &LinuxResources) -> Result<()> { self.resource_manager - .update_cgroups(&self.config.container_id, Some(resources)) + .update_linux_resource( + &self.config.container_id, + Some(resources), + ResourceUpdateOp::Update, + ) .await?; let req = agent::UpdateContainerRequest {