feat(runtime-rs): merge the update_cgroups in update_linux_resources

Updating vCPU resources and memory resources of the sandbox and updating cgroups on the host will always happening together, and they are all updated based on the linux resources declarations of all the containers. So we merge update_cgroups into the update_linux_resources, so we can better manage the resources allocated to one pod in the host. Fixes: #5030 Signed-off-by: Yushuo <y-shuo@linux.alibaba.com> Signed-off-by: Ji-Xinyou <jerryji0414@outlook.com>
2025-08-20 00:48:04 +00:00 · 2023-05-06 17:14:36 +08:00 · 2023-05-06 17:14:36 +08:00 · a39e1e6cd1
commit a39e1e6cd1
parent fa6dff9f70
5 changed files with 68 additions and 49 deletions
--- a/src/runtime-rs/crates/resource/src/cgroups/mod.rs
+++ b/src/runtime-rs/crates/resource/src/cgroups/mod.rs
@ -26,6 +26,8 @@ use oci::LinuxResources;
 use persist::sandbox_persist::Persist;
 use tokio::sync::RwLock;

+use crate::ResourceUpdateOp;
+
 const OS_ERROR_NO_SUCH_PROCESS: i32 = 3;

 pub struct CgroupArgs {
@ -149,29 +151,51 @@ impl CgroupsResource {
        &self,
        cid: &str,
        linux_resources: Option<&LinuxResources>,
+        op: ResourceUpdateOp,
        h: &dyn Hypervisor,
    ) -> Result<()> {
-        let resource = self.calc_resource(linux_resources);
-        let changed = self.update_resources(cid, resource).await;
+        let new_resources = self.calc_resource(linux_resources);
+        let old_resources = self.update_resources(cid, new_resources.clone(), op).await;

-        if !changed {
-            return Ok(());
-        }
-
-        self.do_update_cgroups(h).await
-    }
-
-    async fn update_resources(&self, cid: &str, new_resource: Resources) -> bool {
-        let mut resources = self.resources.write().await;
-        let old_resource = resources.insert(cid.to_owned(), new_resource.clone());
-
-        if let Some(old_resource) = old_resource {
-            if old_resource == new_resource {
-                return false;
+        if let Some(old_resource) = old_resources.clone() {
+            if old_resource == new_resources {
+                return Ok(());
            }
        }

-        true
+        match self.do_update_cgroups(h).await {
+            Err(e) => {
+                // if update failed, we should roll back the records in resources
+                let mut resources = self.resources.write().await;
+                match op {
+                    ResourceUpdateOp::Add => {
+                        resources.remove(cid);
+                    }
+                    ResourceUpdateOp::Update | ResourceUpdateOp::Del => {
+                        if let Some(old_resource) = old_resources {
+                            resources.insert(cid.to_owned(), old_resource.clone());
+                        }
+                    }
+                }
+                Err(e)
+            }
+            Ok(()) => Ok(()),
+        }
+    }
+
+    async fn update_resources(
+        &self,
+        cid: &str,
+        new_resource: Resources,
+        op: ResourceUpdateOp,
+    ) -> Option<Resources> {
+        let mut resources = self.resources.write().await;
+        match op {
+            ResourceUpdateOp::Add | ResourceUpdateOp::Update => {
+                resources.insert(cid.to_owned(), new_resource.clone())
+            }
+            ResourceUpdateOp::Del => resources.remove(cid),
+        }
    }

    async fn do_update_cgroups(&self, h: &dyn Hypervisor) -> Result<()> {
--- a/src/runtime-rs/crates/resource/src/lib.rs
+++ b/src/runtime-rs/crates/resource/src/lib.rs
@ -32,7 +32,7 @@ pub enum ResourceConfig {
    ShareFs(SharedFsInfo),
 }

-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[derive(Debug, Clone, Copy, PartialEq)]
 pub enum ResourceUpdateOp {
    Add,
    Del,
--- a/src/runtime-rs/crates/resource/src/manager.rs
+++ b/src/runtime-rs/crates/resource/src/manager.rs
@ -113,15 +113,6 @@ impl ResourceManager {
        inner.dump().await
    }

-    pub async fn update_cgroups(
-        &self,
-        cid: &str,
-        linux_resources: Option<&LinuxResources>,
-    ) -> Result<()> {
-        let inner = self.inner.read().await;
-        inner.update_cgroups(cid, linux_resources).await
-    }
-
    pub async fn update_linux_resource(
        &self,
        cid: &str,
--- a/src/runtime-rs/crates/resource/src/manager_inner.rs
+++ b/src/runtime-rs/crates/resource/src/manager_inner.rs
@ -317,17 +317,7 @@ impl ResourceManagerInner {
            sb_bindmnt.cleanup_sandbox_bind_mounts()
        }
    }
-
-    pub async fn update_cgroups(
-        &self,
-        cid: &str,
-        linux_resources: Option<&LinuxResources>,
-    ) -> Result<()> {
-        self.cgroups_resource
-            .update_cgroups(cid, linux_resources, self.hypervisor.as_ref())
-            .await
-    }
-
+    
    pub async fn cleanup(&self) -> Result<()> {
        // clean up cgroup
        self.cgroups_resource
@ -365,15 +355,24 @@ impl ResourceManagerInner {
    ) -> Result<()> {
        let linux_cpus = || -> Option<&LinuxCpu> { linux_resources.as_ref()?.cpu.as_ref() }();

-        self.cpu_resource
-            .update_cpu_resources(
-                cid,
-                linux_cpus,
-                op,
-                self.hypervisor.as_ref(),
-                self.agent.as_ref(),
-            )
+        // if static_sandbox_resource_mgmt, we will not have to update sandbox's cpu or mem resource
+        if !self.toml_config.runtime.static_sandbox_resource_mgmt {
+            self.cpu_resource
+                .update_cpu_resources(
+                    cid,
+                    linux_cpus,
+                    op,
+                    self.hypervisor.as_ref(),
+                    self.agent.as_ref(),
+                )
+                .await?;
+        }
+
+        // we should firstly update the vcpus and mems, and then update the host cgroups
+        self.cgroups_resource
+            .update_cgroups(cid, linux_resources, op, self.hypervisor.as_ref())
            .await?;
+
        Ok(())
    }
 }
--- a/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/container.rs
+++ b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/container.rs
@ -19,7 +19,7 @@ use common::{
 use kata_sys_util::k8s::update_ephemeral_storage_type;

 use oci::{LinuxResources, Process as OCIProcess};
-use resource::ResourceManager;
+use resource::{ResourceManager, ResourceUpdateOp};
 use tokio::sync::RwLock;

 use super::{
@ -155,11 +155,12 @@ impl Container {

        // update cgroups
        self.resource_manager
-            .update_cgroups(
+            .update_linux_resource(
                &config.container_id,
                spec.linux
                    .as_ref()
                    .and_then(|linux| linux.resources.as_ref()),
+                ResourceUpdateOp::Add,
            )
            .await?;

@ -402,7 +403,11 @@ impl Container {

    pub async fn update(&self, resources: &LinuxResources) -> Result<()> {
        self.resource_manager
-            .update_cgroups(&self.config.container_id, Some(resources))
+            .update_linux_resource(
+                &self.config.container_id,
+                Some(resources),
+                ResourceUpdateOp::Update,
+            )
            .await?;

        let req = agent::UpdateContainerRequest {