feat(runtime-rs): modify onlineCpuMemRequest

Some vmms, such as dragonball, will actively help us perform online cpu operations when doing cpu hotplug. Under the old onlineCpuMem interface, it is difficult to adapt to this situation. So we modify the semantics of nb_cpus in onlineCpuMemRequest. In the original semantics, nb_cpus represents the number of newly added CPUs that need to be online. The modified semantics become that the number of online CPUs in the guest needs to be guaranteed. Fixes: #5030 Signed-off-by: Yushuo <y-shuo@linux.alibaba.com> Signed-off-by: Ji-Xinyou <jerryji0414@outlook.com>
2025-10-08 18:13:28 +00:00 · 2023-05-09 14:35:44 +08:00
parent d66f7572dd
commit aaa96c749b
8 changed files with 44 additions and 21 deletions
--- a/src/agent/src/linux_abi.rs
+++ b/src/agent/src/linux_abi.rs
@@ -81,7 +81,8 @@ cfg_if! {
 // sysfs as directories in the subtree under /sys/devices/LNXSYSTM:00
 pub const ACPI_DEV_PATH: &str = "/devices/LNXSYSTM";

-pub const SYSFS_CPU_ONLINE_PATH: &str = "/sys/devices/system/cpu";
+pub const SYSFS_CPU_PATH: &str = "/sys/devices/system/cpu";
+pub const SYSFS_CPU_ONLINE_PATH: &str = "/sys/devices/system/cpu/online";

 pub const SYSFS_MEMORY_BLOCK_SIZE_PATH: &str = "/sys/devices/system/memory/block_size_bytes";
 pub const SYSFS_MEMORY_HOTPLUG_PROBE_PATH: &str = "/sys/devices/system/memory/probe";
--- a/src/agent/src/sandbox.rs
+++ b/src/agent/src/sandbox.rs
@@ -12,6 +12,7 @@ use crate::pci;
 use crate::uevent::{Uevent, UeventMatcher};
 use crate::watcher::BindWatcher;
 use anyhow::{anyhow, Context, Result};
+use kata_types::cpu::CpuSet;
 use libc::pid_t;
 use oci::{Hook, Hooks};
 use protocols::agent::OnlineCPUMemRequest;
@@ -25,6 +26,7 @@ use std::collections::HashMap;
 use std::fs;
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
+use std::str::FromStr;
 use std::sync::Arc;
 use std::{thread, time};
 use tokio::sync::mpsc::{channel, Receiver, Sender};
@@ -263,12 +265,12 @@ impl Sandbox {
    pub fn online_cpu_memory(&self, req: &OnlineCPUMemRequest) -> Result<()> {
        if req.nb_cpus > 0 {
            // online cpus
-            online_cpus(&self.logger, req.nb_cpus as i32)?;
+            online_cpus(&self.logger, req.nb_cpus as i32).context("online cpus")?;
        }

        if !req.cpu_only {
            // online memory
-            online_memory(&self.logger)?;
+            online_memory(&self.logger).context("online memory")?;
        }

        if req.nb_cpus == 0 {
@@ -432,23 +434,33 @@ fn online_resources(logger: &Logger, path: &str, pattern: &str, num: i32) -> Res

 // max wait for all CPUs to online will use 50 * 100 = 5 seconds.
 const ONLINE_CPUMEM_WATI_MILLIS: u64 = 50;
-const ONLINE_CPUMEM_MAX_RETRIES: u32 = 100;
+const ONLINE_CPUMEM_MAX_RETRIES: i32 = 100;

 #[instrument]
 fn online_cpus(logger: &Logger, num: i32) -> Result<i32> {
-    let mut onlined_count: i32 = 0;
+    let mut onlined_cpu_count = onlined_cpus().context("onlined cpu count")?;
+    // for some vmms, like dragonball, they will online cpus for us
+    // so check first whether agent need to do the online operation
+    if onlined_cpu_count >= num {
+        return Ok(num);
+    }

    for i in 0..ONLINE_CPUMEM_MAX_RETRIES {
-        let r = online_resources(
+        // online num resources
+        online_resources(
            logger,
-            SYSFS_CPU_ONLINE_PATH,
+            SYSFS_CPU_PATH,
            r"cpu[0-9]+",
-            num - onlined_count,
-        );
+            num - onlined_cpu_count,
+        )
+        .context("online cpu resource")?;

-        onlined_count += r?;
-        if onlined_count == num {
-            info!(logger, "online {} CPU(s) after {} retries", num, i);
+        onlined_cpu_count = onlined_cpus().context("onlined cpu count")?;
+        if onlined_cpu_count >= num {
+            info!(
+                logger,
+                "Currently {} onlined CPU(s) after {} retries", onlined_cpu_count, i
+            );
            return Ok(num);
        }
        thread::sleep(time::Duration::from_millis(ONLINE_CPUMEM_WATI_MILLIS));
@@ -463,10 +475,18 @@ fn online_cpus(logger: &Logger, num: i32) -> Result<i32> {

 #[instrument]
 fn online_memory(logger: &Logger) -> Result<()> {
-    online_resources(logger, SYSFS_MEMORY_ONLINE_PATH, r"memory[0-9]+", -1)?;
+    online_resources(logger, SYSFS_MEMORY_ONLINE_PATH, r"memory[0-9]+", -1)
+        .context("online memory resource")?;
    Ok(())
 }

+fn onlined_cpus() -> Result<i32> {
+    let content =
+        fs::read_to_string(SYSFS_CPU_ONLINE_PATH).context("read sysfs cpu online file")?;
+    let online_cpu_set = CpuSet::from_str(content.trim())?;
+    Ok(online_cpu_set.len() as i32)
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/src/libs/protocols/protos/agent.proto
+++ b/src/libs/protocols/protos/agent.proto
@@ -366,7 +366,8 @@ message OnlineCPUMemRequest {
 	// resources are connected asynchronously and the agent returns immediately.
 	bool wait = 1;

-	// NbCpus specifies the number of CPUs that were added and the agent has to online.
+	// NbCpus specifies the number of CPUs that should be onlined in the guest.
+	// Special value 0 means agent will skip this check.
 	uint32 nb_cpus = 2;

 	// CpuOnly specifies whether only online CPU or not.
--- a/src/runtime-rs/crates/resource/src/cpu_mem/cpu.rs
+++ b/src/runtime-rs/crates/resource/src/cpu_mem/cpu.rs
@@ -176,7 +176,7 @@ impl CpuResource {
            agent
                .online_cpu_mem(OnlineCPUMemRequest {
                    wait: false,
-                    nb_cpus: add,
+                    nb_cpus: new,
                    cpu_only: true,
                })
                .await
--- a/src/runtime/virtcontainers/agent.go
+++ b/src/runtime/virtcontainers/agent.go
@@ -10,6 +10,7 @@ import (
 	"time"

 	"context"
+
 	persistapi "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/persist/api"
 	pbTypes "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols"
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc"
@@ -119,7 +120,7 @@ type agent interface {

 	// onlineCPUMem will online CPUs and Memory inside the Sandbox.
 	// This function should be called after hot adding vCPUs or Memory.
-	// cpus specifies the number of CPUs that were added and the agent should online
+	// cpus specifies the number of CPUs that should be onlined in the guest, and special value 0 means agent will skip this check.
 	// cpuOnly specifies that we should online cpu or online memory or both
 	onlineCPUMem(ctx context.Context, cpus uint32, cpuOnly bool) error

--- a/src/runtime/virtcontainers/pkg/agent/protocols/grpc/agent.pb.go
+++ b/src/runtime/virtcontainers/pkg/agent/protocols/grpc/agent.pb.go
@@ -1924,7 +1924,8 @@ type OnlineCPUMemRequest struct {
 	// If true the agent returns once all resources have been connected, otherwise all
 	// resources are connected asynchronously and the agent returns immediately.
 	Wait bool `protobuf:"varint,1,opt,name=wait,proto3" json:"wait,omitempty"`
-	// NbCpus specifies the number of CPUs that were added and the agent has to online.
+	// NbCpus specifies the number of CPUs that should be onlined in the guest.
+	// Special value 0 means agent will skip this check.
 	NbCpus uint32 `protobuf:"varint,2,opt,name=nb_cpus,json=nbCpus,proto3" json:"nb_cpus,omitempty"`
 	// CpuOnly specifies whether only online CPU or not.
 	CpuOnly              bool     `protobuf:"varint,3,opt,name=cpu_only,json=cpuOnly,proto3" json:"cpu_only,omitempty"`
--- a/src/runtime/virtcontainers/sandbox.go
+++ b/src/runtime/virtcontainers/sandbox.go
@@ -2117,9 +2117,8 @@ func (s *Sandbox) updateResources(ctx context.Context) error {
 	s.Logger().Debugf("Request to hypervisor to update oldCPUs/newCPUs: %d/%d", oldCPUs, newCPUs)
 	// If the CPUs were increased, ask agent to online them
 	if oldCPUs < newCPUs {
-		vcpusAdded := newCPUs - oldCPUs
-		s.Logger().Debugf("Request to onlineCPUMem with %d CPUs", vcpusAdded)
-		if err := s.agent.onlineCPUMem(ctx, vcpusAdded, true); err != nil {
+		s.Logger().Debugf("Request to onlineCPUMem with %d CPUs", newCPUs)
+		if err := s.agent.onlineCPUMem(ctx, newCPUs, true); err != nil {
 			return err
 		}
 	}
--- a/src/runtime/virtcontainers/vm.go
+++ b/src/runtime/virtcontainers/vm.go
@@ -293,7 +293,7 @@ func (v *VM) AddMemory(ctx context.Context, numMB uint32) error {
 // OnlineCPUMemory puts the hotplugged CPU and memory online.
 func (v *VM) OnlineCPUMemory(ctx context.Context) error {
 	v.logger().Infof("online CPU %d and memory", v.cpuDelta)
-	err := v.agent.onlineCPUMem(ctx, v.cpuDelta, false)
+	err := v.agent.onlineCPUMem(ctx, v.cpu, false)
 	if err == nil {
 		v.cpuDelta = 0
 	}