feat(runtime-rs): modify onlineCpuMemRequest

Some VMMs, such as Dragonball, online the hotplugged CPUs themselves
as part of the CPU hotplug operation.
The old onlineCpuMem interface is difficult to adapt to this
situation.

So we modify the semantics of nb_cpus in onlineCpuMemRequest.
Originally, nb_cpus was the number of newly added CPUs that
needed to be onlined. With this change, nb_cpus is the total
number of CPUs that must be online in the guest once the
request completes.

Fixes: #5030

Signed-off-by: Yushuo <y-shuo@linux.alibaba.com>
Signed-off-by: Ji-Xinyou <jerryji0414@outlook.com>
This commit is contained in:
Yushuo 2023-05-09 14:35:44 +08:00
parent d66f7572dd
commit aaa96c749b
8 changed files with 44 additions and 21 deletions

View File

@ -81,7 +81,8 @@ cfg_if! {
// sysfs as directories in the subtree under /sys/devices/LNXSYSTM:00
pub const ACPI_DEV_PATH: &str = "/devices/LNXSYSTM";
pub const SYSFS_CPU_ONLINE_PATH: &str = "/sys/devices/system/cpu";
pub const SYSFS_CPU_PATH: &str = "/sys/devices/system/cpu";
pub const SYSFS_CPU_ONLINE_PATH: &str = "/sys/devices/system/cpu/online";
pub const SYSFS_MEMORY_BLOCK_SIZE_PATH: &str = "/sys/devices/system/memory/block_size_bytes";
pub const SYSFS_MEMORY_HOTPLUG_PROBE_PATH: &str = "/sys/devices/system/memory/probe";

View File

@ -12,6 +12,7 @@ use crate::pci;
use crate::uevent::{Uevent, UeventMatcher};
use crate::watcher::BindWatcher;
use anyhow::{anyhow, Context, Result};
use kata_types::cpu::CpuSet;
use libc::pid_t;
use oci::{Hook, Hooks};
use protocols::agent::OnlineCPUMemRequest;
@ -25,6 +26,7 @@ use std::collections::HashMap;
use std::fs;
use std::os::unix::fs::PermissionsExt;
use std::path::Path;
use std::str::FromStr;
use std::sync::Arc;
use std::{thread, time};
use tokio::sync::mpsc::{channel, Receiver, Sender};
@ -263,12 +265,12 @@ impl Sandbox {
pub fn online_cpu_memory(&self, req: &OnlineCPUMemRequest) -> Result<()> {
if req.nb_cpus > 0 {
// online cpus
online_cpus(&self.logger, req.nb_cpus as i32)?;
online_cpus(&self.logger, req.nb_cpus as i32).context("online cpus")?;
}
if !req.cpu_only {
// online memory
online_memory(&self.logger)?;
online_memory(&self.logger).context("online memory")?;
}
if req.nb_cpus == 0 {
@ -432,23 +434,33 @@ fn online_resources(logger: &Logger, path: &str, pattern: &str, num: i32) -> Res
// The maximum wait for all CPUs to come online is 50 ms * 100 retries = 5 seconds.
const ONLINE_CPUMEM_WATI_MILLIS: u64 = 50;
const ONLINE_CPUMEM_MAX_RETRIES: u32 = 100;
const ONLINE_CPUMEM_MAX_RETRIES: i32 = 100;
#[instrument]
fn online_cpus(logger: &Logger, num: i32) -> Result<i32> {
let mut onlined_count: i32 = 0;
let mut onlined_cpu_count = onlined_cpus().context("onlined cpu count")?;
// Some VMMs, such as Dragonball, online the CPUs for us,
// so first check whether the agent needs to do the online operation at all.
if onlined_cpu_count >= num {
return Ok(num);
}
for i in 0..ONLINE_CPUMEM_MAX_RETRIES {
let r = online_resources(
// online num resources
online_resources(
logger,
SYSFS_CPU_ONLINE_PATH,
SYSFS_CPU_PATH,
r"cpu[0-9]+",
num - onlined_count,
);
num - onlined_cpu_count,
)
.context("online cpu resource")?;
onlined_count += r?;
if onlined_count == num {
info!(logger, "online {} CPU(s) after {} retries", num, i);
onlined_cpu_count = onlined_cpus().context("onlined cpu count")?;
if onlined_cpu_count >= num {
info!(
logger,
"Currently {} onlined CPU(s) after {} retries", onlined_cpu_count, i
);
return Ok(num);
}
thread::sleep(time::Duration::from_millis(ONLINE_CPUMEM_WATI_MILLIS));
@ -463,10 +475,18 @@ fn online_cpus(logger: &Logger, num: i32) -> Result<i32> {
#[instrument]
fn online_memory(logger: &Logger) -> Result<()> {
online_resources(logger, SYSFS_MEMORY_ONLINE_PATH, r"memory[0-9]+", -1)?;
online_resources(logger, SYSFS_MEMORY_ONLINE_PATH, r"memory[0-9]+", -1)
.context("online memory resource")?;
Ok(())
}
/// Returns the number of CPUs listed in the file at `SYSFS_CPU_ONLINE_PATH`.
///
/// The file content is read as a string, trimmed, parsed into a `CpuSet`,
/// and the length of that set is returned.
///
/// # Errors
///
/// Fails if the sysfs file cannot be read, or if its trimmed content cannot
/// be parsed as a `CpuSet`. The parse error carries the offending text so the
/// failure can be diagnosed from the log alone.
fn onlined_cpus() -> Result<i32> {
    let content =
        fs::read_to_string(SYSFS_CPU_ONLINE_PATH).context("read sysfs cpu online file")?;
    // Surrounding whitespace (e.g. a trailing newline) is dropped before parsing.
    let cpu_list = content.trim();
    let online_cpu_set = CpuSet::from_str(cpu_list)
        .with_context(|| format!("parse online cpu list {:?}", cpu_list))?;
    Ok(online_cpu_set.len() as i32)
}
#[cfg(test)]
mod tests {
use super::*;

View File

@ -366,7 +366,8 @@ message OnlineCPUMemRequest {
// resources are connected asynchronously and the agent returns immediately.
bool wait = 1;
// NbCpus specifies the number of CPUs that were added and the agent has to online.
// NbCpus specifies the number of CPUs that should be onlined in the guest.
// Special value 0 means agent will skip this check.
uint32 nb_cpus = 2;
// CpuOnly specifies whether only online CPU or not.

View File

@ -176,7 +176,7 @@ impl CpuResource {
agent
.online_cpu_mem(OnlineCPUMemRequest {
wait: false,
nb_cpus: add,
nb_cpus: new,
cpu_only: true,
})
.await

View File

@ -10,6 +10,7 @@ import (
"time"
"context"
persistapi "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/persist/api"
pbTypes "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc"
@ -119,7 +120,7 @@ type agent interface {
// onlineCPUMem will online CPUs and Memory inside the Sandbox.
// This function should be called after hot adding vCPUs or Memory.
// cpus specifies the number of CPUs that were added and the agent should online
// cpus specifies the number of CPUs that should be onlined in the guest, and special value 0 means agent will skip this check.
// cpuOnly specifies that we should online cpu or online memory or both
onlineCPUMem(ctx context.Context, cpus uint32, cpuOnly bool) error

View File

@ -1924,7 +1924,8 @@ type OnlineCPUMemRequest struct {
// If true the agent returns once all resources have been connected, otherwise all
// resources are connected asynchronously and the agent returns immediately.
Wait bool `protobuf:"varint,1,opt,name=wait,proto3" json:"wait,omitempty"`
// NbCpus specifies the number of CPUs that were added and the agent has to online.
// NbCpus specifies the number of CPUs that should be onlined in the guest.
// Special value 0 means agent will skip this check.
NbCpus uint32 `protobuf:"varint,2,opt,name=nb_cpus,json=nbCpus,proto3" json:"nb_cpus,omitempty"`
// CpuOnly specifies whether only online CPU or not.
CpuOnly bool `protobuf:"varint,3,opt,name=cpu_only,json=cpuOnly,proto3" json:"cpu_only,omitempty"`

View File

@ -2117,9 +2117,8 @@ func (s *Sandbox) updateResources(ctx context.Context) error {
s.Logger().Debugf("Request to hypervisor to update oldCPUs/newCPUs: %d/%d", oldCPUs, newCPUs)
// If the CPUs were increased, ask agent to online them
if oldCPUs < newCPUs {
vcpusAdded := newCPUs - oldCPUs
s.Logger().Debugf("Request to onlineCPUMem with %d CPUs", vcpusAdded)
if err := s.agent.onlineCPUMem(ctx, vcpusAdded, true); err != nil {
s.Logger().Debugf("Request to onlineCPUMem with %d CPUs", newCPUs)
if err := s.agent.onlineCPUMem(ctx, newCPUs, true); err != nil {
return err
}
}

View File

@ -293,7 +293,7 @@ func (v *VM) AddMemory(ctx context.Context, numMB uint32) error {
// OnlineCPUMemory puts the hotplugged CPU and memory online.
func (v *VM) OnlineCPUMemory(ctx context.Context) error {
v.logger().Infof("online CPU %d and memory", v.cpuDelta)
err := v.agent.onlineCPUMem(ctx, v.cpuDelta, false)
err := v.agent.onlineCPUMem(ctx, v.cpu, false)
if err == nil {
v.cpuDelta = 0
}