From d86e74674cd76754a3e720d01bbc6d27a48211f3 Mon Sep 17 00:00:00 2001 From: bin liu Date: Wed, 9 Sep 2020 23:09:37 +0800 Subject: [PATCH] agent: add retry between doing CPU hotplug and make it online. Sometimes the runtime fails while onlining CPUs, because when it calls QMP `device_add`, QEMU doesn't allocate all vCPUs immediately. Fixes: #665 Signed-off-by: bin liu --- src/agent/src/rpc.rs | 2 -- src/agent/src/sandbox.rs | 44 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index befc5557ba..4e801e166c 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -1179,8 +1179,6 @@ impl protocols::agent_ttrpc::AgentService for agentService { _ctx: &ttrpc::TtrpcContext, req: protocols::agent::OnlineCPUMemRequest, ) -> ttrpc::Result { - // sleep 5 seconds for debug - // thread::sleep(Duration::new(5, 0)); let s = Arc::clone(&self.sandbox); let sandbox = s.lock().unwrap(); diff --git a/src/agent/src/sandbox.rs b/src/agent/src/sandbox.rs index c34c3cdd74..26b6554fad 100644 --- a/src/agent/src/sandbox.rs +++ b/src/agent/src/sandbox.rs @@ -25,6 +25,7 @@ use std::fs; use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::sync::mpsc::Sender; +use std::{thread, time}; #[derive(Debug)] pub struct Sandbox { @@ -316,10 +317,18 @@ fn online_resources(logger: &Logger, path: &str, pattern: &str, num: i32) -> Res if re.is_match(name) { let file = format!("{}/{}", p.to_str().unwrap(), SYSFS_ONLINE_FILE); info!(logger, "{}", file.as_str()); - let c = fs::read_to_string(file.as_str())?; + + let c = fs::read_to_string(file.as_str()); + if c.is_err() { + continue; + } + let c = c.unwrap(); if c.trim().contains("0") { - fs::write(file.as_str(), "1")?; + let r = fs::write(file.as_str(), "1"); + if r.is_err() { + continue; + } count += 1; if num > 0 && count == num { @@ -336,8 +345,37 @@ fn online_resources(logger: &Logger, path: &str, pattern: &str, num: 
i32) -> Res Ok(0) } +// Maximum wait for all CPUs to come online: 50 ms * 100 retries = 5 seconds. +const ONLINE_CPUMEM_WATI_MILLIS: u64 = 50; +const ONLINE_CPUMEM_MAX_RETRIES: u32 = 100; + fn online_cpus(logger: &Logger, num: i32) -> Result { - online_resources(logger, SYSFS_CPU_ONLINE_PATH, r"cpu[0-9]+", num) + let mut onlined_count: i32 = 0; + + for i in 0..ONLINE_CPUMEM_MAX_RETRIES { + let r = online_resources( + logger, + SYSFS_CPU_ONLINE_PATH, + r"cpu[0-9]+", + (num - onlined_count), + ); + if r.is_err() { + return r; + } + + onlined_count += r.unwrap(); + if onlined_count == num { + info!(logger, "online {} CPU(s) after {} retries", num, i); + return Ok(num); + } + thread::sleep(time::Duration::from_millis(ONLINE_CPUMEM_WATI_MILLIS)); + } + + Err(anyhow!( + "failed to online {} CPU(s) after {} retries", + num, + ONLINE_CPUMEM_MAX_RETRIES + )) } fn online_memory(logger: &Logger) -> Result<()> {