mirror of
https://github.com/kata-containers/kata-containers.git
synced 2025-09-17 14:58:16 +00:00
runtime-rs: support cgroup resource
Fixes: #3785 Signed-off-by: Tim Zhang <tim@hyper.sh>
This commit is contained in:
55
src/runtime-rs/Cargo.lock
generated
55
src/runtime-rs/Cargo.lock
generated
@@ -93,9 +93,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "1.3.2"
|
||||
version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
||||
checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
|
||||
|
||||
[[package]]
|
||||
name = "block-buffer"
|
||||
@@ -157,7 +157,7 @@ checksum = "cdae996d9638ba03253ffa1c93345a585974a97abbdeab9176c77922f3efc1e8"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"log",
|
||||
"nix",
|
||||
"nix 0.23.1",
|
||||
"regex",
|
||||
]
|
||||
|
||||
@@ -184,7 +184,7 @@ dependencies = [
|
||||
"kata-sys-util",
|
||||
"kata-types",
|
||||
"lazy_static",
|
||||
"nix",
|
||||
"nix 0.23.1",
|
||||
"oci",
|
||||
"protobuf",
|
||||
"serde_json",
|
||||
@@ -602,7 +602,7 @@ dependencies = [
|
||||
"kata-types",
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"nix",
|
||||
"nix 0.23.1",
|
||||
"oci",
|
||||
"once_cell",
|
||||
"serde_json",
|
||||
@@ -764,6 +764,19 @@ version = "0.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
|
||||
|
||||
[[package]]
|
||||
name = "nix"
|
||||
version = "0.16.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dd0eaf8df8bab402257e0a5c17a254e4cc1f72a93588a1ddfb5d356c801aa7cb"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"cc",
|
||||
"cfg-if 0.1.10",
|
||||
"libc",
|
||||
"void",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nix"
|
||||
version = "0.23.1"
|
||||
@@ -1118,6 +1131,26 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "resource"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"cgroups-rs",
|
||||
"hypervisor",
|
||||
"kata-sys-util",
|
||||
"kata-types",
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"log",
|
||||
"logging",
|
||||
"nix 0.16.1",
|
||||
"oci",
|
||||
"slog",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "runtimes"
|
||||
version = "0.1.0"
|
||||
@@ -1270,7 +1303,7 @@ dependencies = [
|
||||
"libc",
|
||||
"log",
|
||||
"logging",
|
||||
"nix",
|
||||
"nix 0.23.1",
|
||||
"oci",
|
||||
"protobuf",
|
||||
"rand",
|
||||
@@ -1570,7 +1603,7 @@ dependencies = [
|
||||
"futures 0.3.21",
|
||||
"libc",
|
||||
"log",
|
||||
"nix",
|
||||
"nix 0.23.1",
|
||||
"protobuf",
|
||||
"protobuf-codegen-pure",
|
||||
"thiserror",
|
||||
@@ -1699,6 +1732,12 @@ dependencies = [
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "void"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d"
|
||||
|
||||
[[package]]
|
||||
name = "vsock"
|
||||
version = "0.2.6"
|
||||
@@ -1706,7 +1745,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e32675ee2b3ce5df274c0ab52d19b28789632406277ca26bffee79a8e27dc133"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"nix",
|
||||
"nix 0.23.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@@ -3,6 +3,6 @@ members = [
|
||||
"crates/shim",
|
||||
# TODO: currently listed only so that it gets checked; delete once the agent crate is in use
|
||||
"crates/agent",
|
||||
# TODO: currently listed only so that it gets checked; delete once the hypervisor crate is in use
|
||||
"crates/hypervisor",
|
||||
# TODO: currently listed only so that it gets checked; delete once the resource crate is in use
|
||||
"crates/resource",
|
||||
]
|
||||
|
24
src/runtime-rs/crates/resource/Cargo.toml
Normal file
24
src/runtime-rs/crates/resource/Cargo.toml
Normal file
@@ -0,0 +1,24 @@
|
||||
[package]
|
||||
name = "resource"
|
||||
version = "0.1.0"
|
||||
authors = ["The Kata Containers community <kata-dev@lists.katacontainers.io>"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "^1.0"
|
||||
async-trait = "0.1.48"
|
||||
cgroups-rs = "0.2.9"
|
||||
lazy_static = "1.4.0"
|
||||
libc = ">=0.2.39"
|
||||
log = "^0.4.0"
|
||||
nix = "0.16.0"
|
||||
slog = "2.5.2"
|
||||
tokio = { version = "1.8.0", features = ["sync"] }
|
||||
|
||||
hypervisor = { path = "../hypervisor" }
|
||||
kata-types = { path = "../../../libs/kata-types" }
|
||||
kata-sys-util = { path = "../../../libs/kata-sys-util" }
|
||||
logging = { path = "../../../libs/logging" }
|
||||
oci = { path = "../../../libs/oci" }
|
||||
|
||||
[features]
|
217
src/runtime-rs/crates/resource/src/cgroups/mod.rs
Normal file
217
src/runtime-rs/crates/resource/src/cgroups/mod.rs
Normal file
@@ -0,0 +1,217 @@
|
||||
// Copyright (c) 2019-2022 Alibaba Cloud
|
||||
// Copyright (c) 2019-2022 Ant Group
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
mod utils;
|
||||
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
iter::FromIterator,
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use cgroups_rs::{cgroup_builder::CgroupBuilder, Cgroup, CgroupPid, CpuResources, Resources};
|
||||
use hypervisor::Hypervisor;
|
||||
use kata_sys_util::spec::load_oci_spec;
|
||||
use kata_types::config::TomlConfig;
|
||||
use oci::LinuxResources;
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
/// Cgroup placement settings for a single sandbox.
///
/// `Debug` and `Clone` are derived so the configuration can be logged and
/// copied; every field is a plain owned value.
#[derive(Debug, Clone)]
pub struct CgroupConfig {
    /// Sandbox cgroup path taken from the OCI spec, with leading '/' removed
    /// (empty when the spec has no `linux` section).
    pub path: String,
    /// Path of the overhead cgroup, as produced by `utils::gen_overhead_path`.
    pub overhead_path: String,
    /// Copy of `runtime.sandbox_cgroup_only` from the TOML configuration.
    pub sandbox_cgroup_only: bool,
}
|
||||
|
||||
impl CgroupConfig {
|
||||
fn new(sid: &str, toml_config: &TomlConfig) -> Result<Self> {
|
||||
let overhead_path = utils::gen_overhead_path(sid);
|
||||
let spec = load_oci_spec()?;
|
||||
let path = spec
|
||||
.linux
|
||||
// The trim of '/' is important, because cgroup_path is a relative path.
|
||||
.map(|linux| linux.cgroups_path.trim_start_matches('/').to_string())
|
||||
.unwrap_or_default();
|
||||
|
||||
Ok(Self {
|
||||
path,
|
||||
overhead_path,
|
||||
sandbox_cgroup_only: toml_config.runtime.sandbox_cgroup_only,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub struct CgroupsResource {
|
||||
resources: Arc<RwLock<HashMap<String, Resources>>>,
|
||||
cgroup_manager: Cgroup,
|
||||
overhead_cgroup_manager: Option<Cgroup>,
|
||||
}
|
||||
|
||||
impl CgroupsResource {
|
||||
pub fn new(sid: &str, toml_config: &TomlConfig) -> Result<Self> {
|
||||
let config = CgroupConfig::new(sid, toml_config)?;
|
||||
|
||||
// Create the sandbox cgroups manager (cgroups on Linux).
|
||||
// Depending on the sandbox_cgroup_only value, this cgroup
|
||||
// will either hold all the pod threads (sandbox_cgroup_only is true)
|
||||
// or only the virtual CPU ones (sandbox_cgroup_only is false).
|
||||
let hier = cgroups_rs::hierarchies::auto();
|
||||
let cgroup_manager = CgroupBuilder::new(&config.path).build(hier);
|
||||
|
||||
// The shim configuration is requesting that we do not put all threads
|
||||
// into the sandbox resource controller.
|
||||
// We're creating an overhead controller, with no constraints. Everything but
|
||||
// the vCPU threads will eventually make it there.
|
||||
let overhead_cgroup_manager = if !config.sandbox_cgroup_only {
|
||||
let hier = cgroups_rs::hierarchies::auto();
|
||||
Some(CgroupBuilder::new(&config.overhead_path).build(hier))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Add the runtime to the VMM sandbox resource controller
|
||||
|
||||
// By adding the runtime process to either the sandbox or overhead controller, we are making
|
||||
// sure that any child process of the runtime (i.e. *all* processes serving a Kata pod)
|
||||
// will initially live in this controller. Depending on the sandbox_cgroup_only settings, we will
|
||||
// then move the vCPU threads between resource controllers.
|
||||
let pid = CgroupPid { pid: 0 };
|
||||
if let Some(manager) = overhead_cgroup_manager.as_ref() {
|
||||
manager.add_task_by_tgid(pid).context("add task by tgid")?;
|
||||
} else {
|
||||
cgroup_manager
|
||||
.add_task_by_tgid(pid)
|
||||
.context("add task by tgid with sandbox only")?;
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
cgroup_manager,
|
||||
resources: Arc::new(RwLock::new(HashMap::new())),
|
||||
overhead_cgroup_manager,
|
||||
})
|
||||
}
|
||||
|
||||
/// delete will move the running processes in the cgroup_manager and
|
||||
/// overhead_cgroup_manager to the parent and then delete the cgroups.
|
||||
pub async fn delete(&self) -> Result<()> {
|
||||
for cg_pid in self.cgroup_manager.tasks() {
|
||||
self.cgroup_manager.remove_task(cg_pid);
|
||||
}
|
||||
self.cgroup_manager.delete()?;
|
||||
|
||||
if let Some(overhead) = self.overhead_cgroup_manager.as_ref() {
|
||||
for cg_pid in overhead.tasks() {
|
||||
overhead.remove_task(cg_pid);
|
||||
}
|
||||
overhead.delete()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn update_cgroups(
|
||||
&self,
|
||||
cid: &str,
|
||||
linux_resources: Option<&LinuxResources>,
|
||||
h: &dyn Hypervisor,
|
||||
) -> Result<()> {
|
||||
let resource = self.calc_resource(linux_resources);
|
||||
let changed = self.update_resources(cid, resource).await;
|
||||
|
||||
if !changed {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
self.do_update_cgroups(h).await
|
||||
}
|
||||
|
||||
/// Stores `new_resource` under `cid` and reports whether that changed anything.
///
/// Returns `false` only when an entry for `cid` already existed and was equal
/// to `new_resource`; a first-time insert counts as a change.
async fn update_resources(&self, cid: &str, new_resource: Resources) -> bool {
    let mut table = self.resources.write().await;

    match table.insert(cid.to_owned(), new_resource.clone()) {
        Some(previous) => previous != new_resource,
        None => true,
    }
}
|
||||
|
||||
async fn do_update_cgroups(&self, h: &dyn Hypervisor) -> Result<()> {
|
||||
let merged_resources = self.merge_resources().await;
|
||||
self.cgroup_manager
|
||||
.apply(&merged_resources)
|
||||
.map_err(|e| anyhow!(e))?;
|
||||
|
||||
if self.overhead_cgroup_manager.is_some() {
|
||||
// If we have an overhead controller, new vCPU threads would start there,
|
||||
// as being children of the VMM PID.
|
||||
// We need to constrain them by moving them into the sandbox controller.
|
||||
self.constrain_hypervisor(h).await?
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// constrain_hypervisor will place the VMM and vCPU threads into resource controllers (cgroups on Linux).
|
||||
async fn constrain_hypervisor(&self, h: &dyn Hypervisor) -> Result<()> {
|
||||
let tids = h.get_thread_ids().await?;
|
||||
let tids = tids.vcpus.values();
|
||||
|
||||
// All vCPU threads move to the sandbox controller.
|
||||
for tid in tids {
|
||||
self.cgroup_manager
|
||||
.add_task_by_tgid(CgroupPid { pid: *tid as u64 })?
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn merge_resources(&self) -> Resources {
|
||||
let resources = self.resources.read().await;
|
||||
|
||||
let mut cpu_list: HashSet<String> = HashSet::new();
|
||||
let mut mem_list: HashSet<String> = HashSet::new();
|
||||
|
||||
resources.values().for_each(|r| {
|
||||
if let Some(cpus) = &r.cpu.cpus {
|
||||
cpu_list.insert(cpus.clone());
|
||||
}
|
||||
if let Some(mems) = &r.cpu.mems {
|
||||
mem_list.insert(mems.clone());
|
||||
}
|
||||
});
|
||||
|
||||
let cpu_resource = CpuResources {
|
||||
cpus: Some(Vec::from_iter(cpu_list.into_iter()).join(",")),
|
||||
mems: Some(Vec::from_iter(mem_list.into_iter()).join(",")),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
Resources {
|
||||
cpu: cpu_resource,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
fn calc_cpu_resources(&self, linux_resources: Option<&LinuxResources>) -> CpuResources {
|
||||
let cpu = || -> Option<oci::LinuxCpu> { linux_resources.as_ref()?.cpu.clone() }();
|
||||
|
||||
CpuResources {
|
||||
cpus: cpu.clone().map(|cpu| cpu.cpus),
|
||||
mems: cpu.map(|cpu| cpu.mems),
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
fn calc_resource(&self, linux_resources: Option<&LinuxResources>) -> Resources {
|
||||
Resources {
|
||||
cpu: self.calc_cpu_resources(linux_resources),
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
}
|
16
src/runtime-rs/crates/resource/src/cgroups/utils.rs
Normal file
16
src/runtime-rs/crates/resource/src/cgroups/utils.rs
Normal file
@@ -0,0 +1,16 @@
|
||||
// Copyright (c) 2019-2022 Alibaba Cloud
|
||||
// Copyright (c) 2019-2022 Ant Group
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
// Kata overhead threads (I/O, VMM, etc) that are not placed in the sandbox
// resource controller (a cgroup on Linux) are moved to a dedicated,
// unconstrained resource controller.
// On Linux, assuming the cgroup mount point is /sys/fs/cgroup/, a cgroup v1
// system has the Kata overhead memory cgroup at
// /sys/fs/cgroup/memory/kata_overhead/$CGPATH, where $CGPATH is defined by
// the orchestrator.
//
// Returns "/kata_overhead/" followed by `path` with all leading '/' removed.
pub(crate) fn gen_overhead_path(path: &str) -> String {
    let relative = path.trim_start_matches('/');

    let mut overhead = String::from("/kata_overhead/");
    overhead.push_str(relative);
    overhead
}
|
7
src/runtime-rs/crates/resource/src/lib.rs
Normal file
7
src/runtime-rs/crates/resource/src/lib.rs
Normal file
@@ -0,0 +1,7 @@
|
||||
// Copyright (c) 2019-2022 Alibaba Cloud
|
||||
// Copyright (c) 2019-2022 Ant Group
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
pub mod cgroups;
|
Reference in New Issue
Block a user