mirror of
https://github.com/kata-containers/kata-containers.git
synced 2025-06-28 08:17:37 +00:00
agent: add retry between hotplugging CPUs and bringing them online.
Sometimes the runtime fails while onlining CPUs, because when the runtime calls QMP `device_add`, QEMU doesn't allocate all vCPUs immediately. Fixes: #665 Signed-off-by: bin liu <bin@hyper.sh>
This commit is contained in:
parent
7f20587433
commit
d86e74674c
@ -1179,8 +1179,6 @@ impl protocols::agent_ttrpc::AgentService for agentService {
|
|||||||
_ctx: &ttrpc::TtrpcContext,
|
_ctx: &ttrpc::TtrpcContext,
|
||||||
req: protocols::agent::OnlineCPUMemRequest,
|
req: protocols::agent::OnlineCPUMemRequest,
|
||||||
) -> ttrpc::Result<Empty> {
|
) -> ttrpc::Result<Empty> {
|
||||||
// sleep 5 seconds for debug
|
|
||||||
// thread::sleep(Duration::new(5, 0));
|
|
||||||
let s = Arc::clone(&self.sandbox);
|
let s = Arc::clone(&self.sandbox);
|
||||||
let sandbox = s.lock().unwrap();
|
let sandbox = s.lock().unwrap();
|
||||||
|
|
||||||
|
@ -25,6 +25,7 @@ use std::fs;
|
|||||||
use std::os::unix::fs::PermissionsExt;
|
use std::os::unix::fs::PermissionsExt;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::sync::mpsc::Sender;
|
use std::sync::mpsc::Sender;
|
||||||
|
use std::{thread, time};
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct Sandbox {
|
pub struct Sandbox {
|
||||||
@ -316,10 +317,18 @@ fn online_resources(logger: &Logger, path: &str, pattern: &str, num: i32) -> Res
|
|||||||
if re.is_match(name) {
|
if re.is_match(name) {
|
||||||
let file = format!("{}/{}", p.to_str().unwrap(), SYSFS_ONLINE_FILE);
|
let file = format!("{}/{}", p.to_str().unwrap(), SYSFS_ONLINE_FILE);
|
||||||
info!(logger, "{}", file.as_str());
|
info!(logger, "{}", file.as_str());
|
||||||
let c = fs::read_to_string(file.as_str())?;
|
|
||||||
|
let c = fs::read_to_string(file.as_str());
|
||||||
|
if c.is_err() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let c = c.unwrap();
|
||||||
|
|
||||||
if c.trim().contains("0") {
|
if c.trim().contains("0") {
|
||||||
fs::write(file.as_str(), "1")?;
|
let r = fs::write(file.as_str(), "1");
|
||||||
|
if r.is_err() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
count += 1;
|
count += 1;
|
||||||
|
|
||||||
if num > 0 && count == num {
|
if num > 0 && count == num {
|
||||||
@ -336,8 +345,37 @@ fn online_resources(logger: &Logger, path: &str, pattern: &str, num: i32) -> Res
|
|||||||
Ok(0)
|
Ok(0)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Maximum wait for all CPUs to come online: 50 ms * 100 retries = 5 seconds.
|
||||||
|
const ONLINE_CPUMEM_WATI_MILLIS: u64 = 50;
|
||||||
|
const ONLINE_CPUMEM_MAX_RETRIES: u32 = 100;
|
||||||
|
|
||||||
fn online_cpus(logger: &Logger, num: i32) -> Result<i32> {
|
fn online_cpus(logger: &Logger, num: i32) -> Result<i32> {
|
||||||
online_resources(logger, SYSFS_CPU_ONLINE_PATH, r"cpu[0-9]+", num)
|
let mut onlined_count: i32 = 0;
|
||||||
|
|
||||||
|
for i in 0..ONLINE_CPUMEM_MAX_RETRIES {
|
||||||
|
let r = online_resources(
|
||||||
|
logger,
|
||||||
|
SYSFS_CPU_ONLINE_PATH,
|
||||||
|
r"cpu[0-9]+",
|
||||||
|
(num - onlined_count),
|
||||||
|
);
|
||||||
|
if r.is_err() {
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
onlined_count += r.unwrap();
|
||||||
|
if onlined_count == num {
|
||||||
|
info!(logger, "online {} CPU(s) after {} retries", num, i);
|
||||||
|
return Ok(num);
|
||||||
|
}
|
||||||
|
thread::sleep(time::Duration::from_millis(ONLINE_CPUMEM_WATI_MILLIS));
|
||||||
|
}
|
||||||
|
|
||||||
|
Err(anyhow!(
|
||||||
|
"failed to online {} CPU(s) after {} retries",
|
||||||
|
num,
|
||||||
|
ONLINE_CPUMEM_MAX_RETRIES
|
||||||
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn online_memory(logger: &Logger) -> Result<()> {
|
fn online_memory(logger: &Logger) -> Result<()> {
|
||||||
|
Loading…
Reference in New Issue
Block a user