runtime-rs: support cgroup resource

Fixes: #3785
Signed-off-by: Tim Zhang <tim@hyper.sh>
Authored by Tim Zhang on 2022-03-30 10:52:47 +08:00
Committed by Fupan Li
parent 75e282b4c1
commit 234d7bca04
9 changed files with 408 additions and 746 deletions

Cargo.lock

@@ -93,9 +93,9 @@ dependencies = [
[[package]]
name = "bitflags"
-version = "1.3.2"
+version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
+checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
[[package]]
name = "block-buffer"
@@ -157,7 +157,7 @@ checksum = "cdae996d9638ba03253ffa1c93345a585974a97abbdeab9176c77922f3efc1e8"
dependencies = [
"libc",
"log",
"nix",
"nix 0.23.1",
"regex",
]
@@ -184,7 +184,7 @@ dependencies = [
"kata-sys-util",
"kata-types",
"lazy_static",
"nix",
"nix 0.23.1",
"oci",
"protobuf",
"serde_json",
@@ -602,7 +602,7 @@ dependencies = [
"kata-types",
"lazy_static",
"libc",
"nix",
"nix 0.23.1",
"oci",
"once_cell",
"serde_json",
@@ -764,6 +764,19 @@ version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
+[[package]]
+name = "nix"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dd0eaf8df8bab402257e0a5c17a254e4cc1f72a93588a1ddfb5d356c801aa7cb"
+dependencies = [
+ "bitflags",
+ "cc",
+ "cfg-if 0.1.10",
+ "libc",
+ "void",
+]
[[package]]
name = "nix"
version = "0.23.1"
@@ -1118,6 +1131,26 @@ dependencies = [
"winapi",
]
+[[package]]
+name = "resource"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-trait",
+ "cgroups-rs",
+ "hypervisor",
+ "kata-sys-util",
+ "kata-types",
+ "lazy_static",
+ "libc",
+ "log",
+ "logging",
+ "nix 0.16.1",
+ "oci",
+ "slog",
+ "tokio",
+]
[[package]]
name = "runtimes"
version = "0.1.0"
@@ -1270,7 +1303,7 @@ dependencies = [
"libc",
"log",
"logging",
"nix",
"nix 0.23.1",
"oci",
"protobuf",
"rand",
@@ -1570,7 +1603,7 @@ dependencies = [
"futures 0.3.21",
"libc",
"log",
"nix",
"nix 0.23.1",
"protobuf",
"protobuf-codegen-pure",
"thiserror",
@@ -1699,6 +1732,12 @@ dependencies = [
"tokio",
]
+[[package]]
+name = "void"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d"
[[package]]
name = "vsock"
version = "0.2.6"
@@ -1706,7 +1745,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e32675ee2b3ce5df274c0ab52d19b28789632406277ca26bffee79a8e27dc133"
dependencies = [
"libc",
"nix",
"nix 0.23.1",
]
[[package]]

Cargo.toml

@@ -3,6 +3,6 @@ members = [
"crates/shim",
# TODO: current only for check, delete after use the agent crate
"crates/agent",
-# TODO: current only for check, delete after use the hypervisor crate
-"crates/hypervisor",
+# TODO: current only for check, delete after use the resource crate
+"crates/resource",
]

crates/resource/Cargo.toml

@@ -0,0 +1,24 @@
[package]
name = "resource"
version = "0.1.0"
authors = ["The Kata Containers community <kata-dev@lists.katacontainers.io>"]
edition = "2018"

[dependencies]
anyhow = "^1.0"
async-trait = "0.1.48"
cgroups-rs = "0.2.9"
lazy_static = "1.4.0"
libc = ">=0.2.39"
log = "^0.4.0"
nix = "0.16.0"
slog = "2.5.2"
tokio = { version = "1.8.0", features = ["sync"] }

hypervisor = { path = "../hypervisor" }
kata-types = { path = "../../../libs/kata-types" }
kata-sys-util = { path = "../../../libs/kata-sys-util" }
logging = { path = "../../../libs/logging" }
oci = { path = "../../../libs/oci" }

[features]
crates/resource/src/cgroups/mod.rs

@@ -0,0 +1,217 @@
// Copyright (c) 2019-2022 Alibaba Cloud
// Copyright (c) 2019-2022 Ant Group
//
// SPDX-License-Identifier: Apache-2.0
//

mod utils;

use std::{
    collections::{HashMap, HashSet},
    iter::FromIterator,
    sync::Arc,
};

use anyhow::{anyhow, Context, Result};
use cgroups_rs::{cgroup_builder::CgroupBuilder, Cgroup, CgroupPid, CpuResources, Resources};
use hypervisor::Hypervisor;
use kata_sys_util::spec::load_oci_spec;
use kata_types::config::TomlConfig;
use oci::LinuxResources;
use tokio::sync::RwLock;

pub struct CgroupConfig {
    pub path: String,
    pub overhead_path: String,
    pub sandbox_cgroup_only: bool,
}

impl CgroupConfig {
    fn new(sid: &str, toml_config: &TomlConfig) -> Result<Self> {
        let overhead_path = utils::gen_overhead_path(sid);
        let spec = load_oci_spec()?;
        let path = spec
            .linux
            // The trim of '/' is important, because cgroup_path is a relative path.
            .map(|linux| linux.cgroups_path.trim_start_matches('/').to_string())
            .unwrap_or_default();
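        // For instance, a cgroups_path of "/kubepods/pod-abc" (hypothetical
        // value) becomes "kubepods/pod-abc", which cgroups-rs then treats as
        // relative to the hierarchy root.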
        Ok(Self {
            path,
            overhead_path,
            sandbox_cgroup_only: toml_config.runtime.sandbox_cgroup_only,
        })
    }
}

pub struct CgroupsResource {
    resources: Arc<RwLock<HashMap<String, Resources>>>,
    cgroup_manager: Cgroup,
    overhead_cgroup_manager: Option<Cgroup>,
}

impl CgroupsResource {
    pub fn new(sid: &str, toml_config: &TomlConfig) -> Result<Self> {
        let config = CgroupConfig::new(sid, toml_config)?;

        // Create the sandbox cgroups manager (cgroups on Linux).
        // Depending on the sandbox_cgroup_only value, this cgroup
        // will either hold all the pod threads (sandbox_cgroup_only is true)
        // or only the virtual CPU ones (sandbox_cgroup_only is false).
        let hier = cgroups_rs::hierarchies::auto();
        let cgroup_manager = CgroupBuilder::new(&config.path).build(hier);

        // The shim configuration is requesting that we do not put all threads
        // into the sandbox resource controller.
        // We're creating an overhead controller, with no constraints. Everything but
        // the vCPU threads will eventually make it there.
        let overhead_cgroup_manager = if !config.sandbox_cgroup_only {
            let hier = cgroups_rs::hierarchies::auto();
            Some(CgroupBuilder::new(&config.overhead_path).build(hier))
        } else {
            None
        };

        // Add the runtime to the VMM sandbox resource controller.
        // By adding the runtime process to either the sandbox or overhead controller, we are making
        // sure that any child process of the runtime (i.e. *all* processes serving a Kata pod)
        // will initially live in this controller. Depending on the sandbox_cgroup_only setting, we will
        // then move the vCPU threads between resource controllers.
        let pid = CgroupPid { pid: 0 };
        if let Some(manager) = overhead_cgroup_manager.as_ref() {
            manager.add_task_by_tgid(pid).context("add task by tgid")?;
        } else {
            cgroup_manager
                .add_task_by_tgid(pid)
                .context("add task by tgid with sandbox only")?;
        }

        Ok(Self {
            cgroup_manager,
            resources: Arc::new(RwLock::new(HashMap::new())),
            overhead_cgroup_manager,
        })
    }

    /// delete will move the running processes in the cgroup_manager and
    /// overhead_cgroup_manager to the parent and then delete the cgroups.
    pub async fn delete(&self) -> Result<()> {
        for cg_pid in self.cgroup_manager.tasks() {
            self.cgroup_manager.remove_task(cg_pid);
        }
        self.cgroup_manager.delete()?;

        if let Some(overhead) = self.overhead_cgroup_manager.as_ref() {
            for cg_pid in overhead.tasks() {
                overhead.remove_task(cg_pid);
            }
            overhead.delete()?;
        }

        Ok(())
    }

    pub async fn update_cgroups(
        &self,
        cid: &str,
        linux_resources: Option<&LinuxResources>,
        h: &dyn Hypervisor,
    ) -> Result<()> {
        let resource = self.calc_resource(linux_resources);
        let changed = self.update_resources(cid, resource).await;
        if !changed {
            return Ok(());
        }

        self.do_update_cgroups(h).await
    }

    async fn update_resources(&self, cid: &str, new_resource: Resources) -> bool {
        let mut resources = self.resources.write().await;
        let old_resource = resources.insert(cid.to_owned(), new_resource.clone());
        if let Some(old_resource) = old_resource {
            if old_resource == new_resource {
                return false;
            }
        }
        true
    }
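    // For example: two successive update_cgroups calls for container "c1"
    // (hypothetical id) with identical resources: the second insert returns
    // an equal old value, so no change is reported and do_update_cgroups
    // is skipped.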
    async fn do_update_cgroups(&self, h: &dyn Hypervisor) -> Result<()> {
        let merged_resources = self.merge_resources().await;
        self.cgroup_manager
            .apply(&merged_resources)
            .map_err(|e| anyhow!(e))?;

        if self.overhead_cgroup_manager.is_some() {
            // If we have an overhead controller, new vCPU threads would start there,
            // since they are children of the VMM process.
            // We need to constrain them by moving them into the sandbox controller.
            self.constrain_hypervisor(h).await?
        }

        Ok(())
    }

    /// constrain_hypervisor will place the VMM and vCPU threads into resource controllers (cgroups on Linux).
    async fn constrain_hypervisor(&self, h: &dyn Hypervisor) -> Result<()> {
        let tids = h.get_thread_ids().await?;
        let tids = tids.vcpus.values();

        // All vCPU threads move to the sandbox controller.
        for tid in tids {
            self.cgroup_manager
                .add_task_by_tgid(CgroupPid { pid: *tid as u64 })?
        }

        Ok(())
    }

    async fn merge_resources(&self) -> Resources {
        let resources = self.resources.read().await;

        let mut cpu_list: HashSet<String> = HashSet::new();
        let mut mem_list: HashSet<String> = HashSet::new();

        resources.values().for_each(|r| {
            if let Some(cpus) = &r.cpu.cpus {
                cpu_list.insert(cpus.clone());
            }
            if let Some(mems) = &r.cpu.mems {
                mem_list.insert(mems.clone());
            }
        });

        let cpu_resource = CpuResources {
            cpus: Some(Vec::from_iter(cpu_list.into_iter()).join(",")),
            mems: Some(Vec::from_iter(mem_list.into_iter()).join(",")),
            ..Default::default()
        };

        Resources {
            cpu: cpu_resource,
            ..Default::default()
        }
    }
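    // For example (hypothetical values): one container pinned to cpuset "0-1"
    // and another to "2" merge into a sandbox-wide cpus string of "0-1,2"
    // (HashSet iteration order is unspecified); mems merge the same way.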
    fn calc_cpu_resources(&self, linux_resources: Option<&LinuxResources>) -> CpuResources {
        let cpu = || -> Option<oci::LinuxCpu> { linux_resources.as_ref()?.cpu.clone() }();

        CpuResources {
            cpus: cpu.clone().map(|cpu| cpu.cpus),
            mems: cpu.map(|cpu| cpu.mems),
            ..Default::default()
        }
    }

    fn calc_resource(&self, linux_resources: Option<&LinuxResources>) -> Resources {
        Resources {
            cpu: self.calc_cpu_resources(linux_resources),
            ..Default::default()
        }
    }
}
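
The flow these pieces implement: new() drops the runtime (and therefore the VMM it spawns) into the unconstrained overhead cgroup, and constrain_hypervisor() later pulls just the vCPU threads into the constrained sandbox cgroup. Below is a minimal standalone sketch of that placement, using only the cgroups-rs calls that appear in this diff; the cgroup names and vCPU thread ids are hypothetical, and running it requires root on a Linux host with cgroups mounted.

use cgroups_rs::{cgroup_builder::CgroupBuilder, CgroupPid};

fn main() {
    // One constrained sandbox cgroup, one unconstrained overhead cgroup,
    // mirroring CgroupsResource::new() (hypothetical paths).
    let sandbox = CgroupBuilder::new("kata_sandbox_demo").build(cgroups_rs::hierarchies::auto());
    let overhead = CgroupBuilder::new("kata_overhead/demo").build(cgroups_rs::hierarchies::auto());

    // Step 1: the runtime joins the overhead cgroup (pid 0 stands for the
    // calling process), so every child it spawns, including the VMM, starts
    // out unconstrained.
    overhead
        .add_task_by_tgid(CgroupPid { pid: 0 })
        .expect("join overhead cgroup");

    // Step 2: once the hypervisor reports its vCPU thread ids (hypothetical
    // values here), they are moved into the constrained sandbox cgroup, as
    // constrain_hypervisor() does after every resource update.
    for tid in [12345u64, 12346] {
        sandbox
            .add_task_by_tgid(CgroupPid { pid: tid })
            .expect("constrain vCPU thread");
    }

    // Cleanup mirrors CgroupsResource::delete().
    sandbox.delete().expect("delete sandbox cgroup");
    overhead.delete().expect("delete overhead cgroup");
}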

crates/resource/src/cgroups/utils.rs

@@ -0,0 +1,16 @@
// Copyright (c) 2019-2022 Alibaba Cloud
// Copyright (c) 2019-2022 Ant Group
//
// SPDX-License-Identifier: Apache-2.0
//

// When the Kata overhead threads (I/O, VMM, etc.) are not placed in the
// sandbox resource controller (a cgroup on Linux), they are moved to a
// specific, unconstrained resource controller. On Linux, assuming the
// cgroup mount point is /sys/fs/cgroup and the system uses cgroup v1, the
// Kata overhead memory cgroup will be
// /sys/fs/cgroup/memory/kata_overhead/$CGPATH, where $CGPATH is defined
// by the orchestrator.
pub(crate) fn gen_overhead_path(path: &str) -> String {
    format!("/kata_overhead/{}", path.trim_start_matches('/'))
}
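
As a quick check of the helper's behavior (the sandbox id is hypothetical, and the one-liner is inlined here because the real function is pub(crate)):

fn gen_overhead_path(path: &str) -> String {
    format!("/kata_overhead/{}", path.trim_start_matches('/'))
}

fn main() {
    // The leading '/' is stripped, so both forms land under the same
    // kata_overhead parent instead of escaping to the hierarchy root.
    assert_eq!(gen_overhead_path("sandbox-abc"), "/kata_overhead/sandbox-abc");
    assert_eq!(gen_overhead_path("/sandbox-abc"), "/kata_overhead/sandbox-abc");
}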

crates/resource/src/lib.rs

@@ -0,0 +1,7 @@
// Copyright (c) 2019-2022 Alibaba Cloud
// Copyright (c) 2019-2022 Ant Group
//
// SPDX-License-Identifier: Apache-2.0
//

pub mod cgroups;