diff --git a/src/agent/src/linux_abi.rs b/src/agent/src/linux_abi.rs index 042acd0aed..de131faf02 100644 --- a/src/agent/src/linux_abi.rs +++ b/src/agent/src/linux_abi.rs @@ -81,7 +81,8 @@ cfg_if! { // sysfs as directories in the subtree under /sys/devices/LNXSYSTM:00 pub const ACPI_DEV_PATH: &str = "/devices/LNXSYSTM"; -pub const SYSFS_CPU_ONLINE_PATH: &str = "/sys/devices/system/cpu"; +pub const SYSFS_CPU_PATH: &str = "/sys/devices/system/cpu"; +pub const SYSFS_CPU_ONLINE_PATH: &str = "/sys/devices/system/cpu/online"; pub const SYSFS_MEMORY_BLOCK_SIZE_PATH: &str = "/sys/devices/system/memory/block_size_bytes"; pub const SYSFS_MEMORY_HOTPLUG_PROBE_PATH: &str = "/sys/devices/system/memory/probe"; diff --git a/src/agent/src/sandbox.rs b/src/agent/src/sandbox.rs index 34275cc411..175304ed14 100644 --- a/src/agent/src/sandbox.rs +++ b/src/agent/src/sandbox.rs @@ -12,6 +12,7 @@ use crate::pci; use crate::uevent::{Uevent, UeventMatcher}; use crate::watcher::BindWatcher; use anyhow::{anyhow, Context, Result}; +use kata_types::cpu::CpuSet; use libc::pid_t; use oci::{Hook, Hooks}; use protocols::agent::OnlineCPUMemRequest; @@ -25,6 +26,7 @@ use std::collections::HashMap; use std::fs; use std::os::unix::fs::PermissionsExt; use std::path::Path; +use std::str::FromStr; use std::sync::Arc; use std::{thread, time}; use tokio::sync::mpsc::{channel, Receiver, Sender}; @@ -263,12 +265,12 @@ impl Sandbox { pub fn online_cpu_memory(&self, req: &OnlineCPUMemRequest) -> Result<()> { if req.nb_cpus > 0 { // online cpus - online_cpus(&self.logger, req.nb_cpus as i32)?; + online_cpus(&self.logger, req.nb_cpus as i32).context("online cpus")?; } if !req.cpu_only { // online memory - online_memory(&self.logger)?; + online_memory(&self.logger).context("online memory")?; } if req.nb_cpus == 0 { @@ -432,23 +434,33 @@ fn online_resources(logger: &Logger, path: &str, pattern: &str, num: i32) -> Res // max wait for all CPUs to online will use 50 * 100 = 5 seconds. const ONLINE_CPUMEM_WATI_MILLIS: u64 = 50; -const ONLINE_CPUMEM_MAX_RETRIES: u32 = 100; +const ONLINE_CPUMEM_MAX_RETRIES: i32 = 100; #[instrument] fn online_cpus(logger: &Logger, num: i32) -> Result { - let mut onlined_count: i32 = 0; + let mut onlined_cpu_count = onlined_cpus().context("onlined cpu count")?; + // for some vmms, like dragonball, they will online cpus for us + // so check first whether agent need to do the online operation + if onlined_cpu_count >= num { + return Ok(num); + } for i in 0..ONLINE_CPUMEM_MAX_RETRIES { - let r = online_resources( + // online num resources + online_resources( logger, - SYSFS_CPU_ONLINE_PATH, + SYSFS_CPU_PATH, r"cpu[0-9]+", - num - onlined_count, - ); + num - onlined_cpu_count, + ) + .context("online cpu resource")?; - onlined_count += r?; - if onlined_count == num { - info!(logger, "online {} CPU(s) after {} retries", num, i); + onlined_cpu_count = onlined_cpus().context("onlined cpu count")?; + if onlined_cpu_count >= num { + info!( + logger, + "Currently {} onlined CPU(s) after {} retries", onlined_cpu_count, i + ); return Ok(num); } thread::sleep(time::Duration::from_millis(ONLINE_CPUMEM_WATI_MILLIS)); @@ -463,10 +475,18 @@ fn online_cpus(logger: &Logger, num: i32) -> Result { #[instrument] fn online_memory(logger: &Logger) -> Result<()> { - online_resources(logger, SYSFS_MEMORY_ONLINE_PATH, r"memory[0-9]+", -1)?; + online_resources(logger, SYSFS_MEMORY_ONLINE_PATH, r"memory[0-9]+", -1) + .context("online memory resource")?; Ok(()) } +fn onlined_cpus() -> Result { + let content = + fs::read_to_string(SYSFS_CPU_ONLINE_PATH).context("read sysfs cpu online file")?; + let online_cpu_set = CpuSet::from_str(content.trim())?; + Ok(online_cpu_set.len() as i32) +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/libs/protocols/protos/agent.proto b/src/libs/protocols/protos/agent.proto index 3ad755256c..9ed7c0a049 100644 --- a/src/libs/protocols/protos/agent.proto +++ b/src/libs/protocols/protos/agent.proto @@ -366,7 +366,8 @@ message OnlineCPUMemRequest { // resources are connected asynchronously and the agent returns immediately. bool wait = 1; - // NbCpus specifies the number of CPUs that were added and the agent has to online. + // NbCpus specifies the number of CPUs that should be onlined in the guest. + // Special value 0 means agent will skip this check. uint32 nb_cpus = 2; // CpuOnly specifies whether only online CPU or not. diff --git a/src/runtime-rs/crates/resource/src/cpu_mem/cpu.rs b/src/runtime-rs/crates/resource/src/cpu_mem/cpu.rs index 7892c014eb..661e1a5e3a 100644 --- a/src/runtime-rs/crates/resource/src/cpu_mem/cpu.rs +++ b/src/runtime-rs/crates/resource/src/cpu_mem/cpu.rs @@ -176,7 +176,7 @@ impl CpuResource { agent .online_cpu_mem(OnlineCPUMemRequest { wait: false, - nb_cpus: add, + nb_cpus: new, cpu_only: true, }) .await diff --git a/src/runtime/virtcontainers/agent.go b/src/runtime/virtcontainers/agent.go index ddf11d9ce2..8f0aca26ee 100644 --- a/src/runtime/virtcontainers/agent.go +++ b/src/runtime/virtcontainers/agent.go @@ -10,6 +10,7 @@ import ( "time" "context" + persistapi "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/persist/api" pbTypes "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc" @@ -119,7 +120,7 @@ type agent interface { // onlineCPUMem will online CPUs and Memory inside the Sandbox. // This function should be called after hot adding vCPUs or Memory. - // cpus specifies the number of CPUs that were added and the agent should online + // cpus specifies the number of CPUs that should be onlined in the guest, and special value 0 means agent will skip this check. // cpuOnly specifies that we should online cpu or online memory or both onlineCPUMem(ctx context.Context, cpus uint32, cpuOnly bool) error diff --git a/src/runtime/virtcontainers/pkg/agent/protocols/grpc/agent.pb.go b/src/runtime/virtcontainers/pkg/agent/protocols/grpc/agent.pb.go index a8dd81104a..3a1b7c607f 100644 --- a/src/runtime/virtcontainers/pkg/agent/protocols/grpc/agent.pb.go +++ b/src/runtime/virtcontainers/pkg/agent/protocols/grpc/agent.pb.go @@ -1924,7 +1924,8 @@ type OnlineCPUMemRequest struct { // If true the agent returns once all resources have been connected, otherwise all // resources are connected asynchronously and the agent returns immediately. Wait bool `protobuf:"varint,1,opt,name=wait,proto3" json:"wait,omitempty"` - // NbCpus specifies the number of CPUs that were added and the agent has to online. + // NbCpus specifies the number of CPUs that should be onlined in the guest. + // Special value 0 means agent will skip this check. NbCpus uint32 `protobuf:"varint,2,opt,name=nb_cpus,json=nbCpus,proto3" json:"nb_cpus,omitempty"` // CpuOnly specifies whether only online CPU or not. CpuOnly bool `protobuf:"varint,3,opt,name=cpu_only,json=cpuOnly,proto3" json:"cpu_only,omitempty"` diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index b0697fd840..9bd5f402eb 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -2117,9 +2117,8 @@ func (s *Sandbox) updateResources(ctx context.Context) error { s.Logger().Debugf("Request to hypervisor to update oldCPUs/newCPUs: %d/%d", oldCPUs, newCPUs) // If the CPUs were increased, ask agent to online them if oldCPUs < newCPUs { - vcpusAdded := newCPUs - oldCPUs - s.Logger().Debugf("Request to onlineCPUMem with %d CPUs", vcpusAdded) - if err := s.agent.onlineCPUMem(ctx, vcpusAdded, true); err != nil { + s.Logger().Debugf("Request to onlineCPUMem with %d CPUs", newCPUs) + if err := s.agent.onlineCPUMem(ctx, newCPUs, true); err != nil { return err } } diff --git a/src/runtime/virtcontainers/vm.go b/src/runtime/virtcontainers/vm.go index b5dec99122..72afa9581c 100644 --- a/src/runtime/virtcontainers/vm.go +++ b/src/runtime/virtcontainers/vm.go @@ -293,7 +293,7 @@ func (v *VM) AddMemory(ctx context.Context, numMB uint32) error { // OnlineCPUMemory puts the hotplugged CPU and memory online. func (v *VM) OnlineCPUMemory(ctx context.Context) error { v.logger().Infof("online CPU %d and memory", v.cpuDelta) - err := v.agent.onlineCPUMem(ctx, v.cpuDelta, false) + err := v.agent.onlineCPUMem(ctx, v.cpu, false) if err == nil { v.cpuDelta = 0 }