runtime-rs: enable container hugepage

Enable the use of hugepages in containers.

Fixes: #5560
Signed-off-by: Zhongtao Hu <zhongtaohu.tim@linux.alibaba.com>
This commit is contained in:
Zhongtao Hu 2022-12-02 14:16:39 +08:00
parent fc4a67eec3
commit afaf17f423
11 changed files with 279 additions and 12 deletions

8
src/libs/Cargo.lock generated
View File

@ -40,6 +40,12 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "base64"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
[[package]]
name = "bitflags"
version = "1.2.1"
@ -420,6 +426,8 @@ dependencies = [
name = "kata-types"
version = "0.1.0"
dependencies = [
"anyhow",
"base64",
"bitmask-enum",
"byte-unit",
"glob",

View File

@ -62,7 +62,7 @@ use crate::sl;
/// Default permission for directories created for mountpoint.
const MOUNT_PERM: u32 = 0o755;
const PROC_MOUNTS_FILE: &str = "/proc/mounts";
pub const PROC_MOUNTS_FILE: &str = "/proc/mounts";
const PROC_FIELDS_PER_LINE: usize = 6;
const PROC_DEVICE_INDEX: usize = 0;
const PROC_PATH_INDEX: usize = 1;

View File

@ -329,6 +329,16 @@ version = "3.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "415301c9de11005d4b92193c0eb7ac7adc37e5a49e0ac9bed0a42343512744b8"
[[package]]
name = "byte-unit"
version = "4.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "581ad4b3d627b0c09a0ccb2912148f839acaca0b93cf54cbe42b6c674e86079c"
dependencies = [
"serde",
"utf8-width",
]
[[package]]
name = "byteorder"
version = "1.4.3"
@ -1361,7 +1371,7 @@ dependencies = [
"anyhow",
"base64",
"bitmask-enum",
"byte-unit",
"byte-unit 3.1.4",
"glob",
"lazy_static",
"num_cpus",
@ -2279,6 +2289,7 @@ dependencies = [
"anyhow",
"async-trait",
"bitflags",
"byte-unit 4.0.17",
"cgroups-rs",
"futures 0.3.21",
"hypervisor",
@ -2299,6 +2310,7 @@ dependencies = [
"serde_json",
"slog",
"slog-scope",
"tempfile",
"test-utils",
"tokio",
"uuid",
@ -2998,6 +3010,12 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "utf8-width"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5190c9442dcdaf0ddd50f37420417d219ae5261bbf5db120d0f9bab996c9cba1"
[[package]]
name = "uuid"
version = "0.4.0"

View File

@ -7,11 +7,13 @@ license = "Apache-2.0"
[dev-dependencies]
test-utils = { path = "../../../libs/test-utils" }
tempfile = "3.2.0"
[dependencies]
anyhow = "^1.0"
async-trait = "0.1.48"
bitflags = "1.2.1"
byte-unit = "4.0.14"
cgroups-rs = "0.2.9"
futures = "0.3.11"
lazy_static = "1.4.0"

View File

@ -78,10 +78,10 @@ impl ResourceManager {
pub async fn handler_volumes(
&self,
cid: &str,
oci_mounts: &[oci::Mount],
spec: &oci::Spec,
) -> Result<Vec<Arc<dyn Volume>>> {
let inner = self.inner.read().await;
inner.handler_volumes(cid, oci_mounts).await
inner.handler_volumes(cid, spec).await
}
pub async fn dump(&self) {

View File

@ -214,10 +214,10 @@ impl ResourceManagerInner {
pub async fn handler_volumes(
&self,
cid: &str,
oci_mounts: &[oci::Mount],
spec: &oci::Spec,
) -> Result<Vec<Arc<dyn Volume>>> {
self.volume_resource
.handler_volumes(&self.share_fs, cid, oci_mounts)
.handler_volumes(&self.share_fs, cid, spec)
.await
}

View File

@ -15,6 +15,7 @@ use tokio::sync::Mutex;
pub use utils::{do_get_guest_path, do_get_guest_share_path, get_host_rw_shared_path};
mod virtio_fs_share_mount;
use virtio_fs_share_mount::VirtiofsShareMount;
pub use virtio_fs_share_mount::EPHEMERAL_PATH;
use std::{collections::HashMap, fmt::Debug, path::PathBuf, sync::Arc};

View File

@ -17,7 +17,7 @@ use std::path::Path;
const WATCHABLE_PATH_NAME: &str = "watchable";
const WATCHABLE_BIND_DEV_TYPE: &str = "watchable-bind";
const EPHEMERAL_PATH: &str = "/run/kata-containers/sandbox/ephemeral";
pub const EPHEMERAL_PATH: &str = "/run/kata-containers/sandbox/ephemeral";
use super::{
utils::{self, do_get_host_path},

View File

@ -0,0 +1,223 @@
// Copyright (c) 2019-2022 Alibaba Cloud
// Copyright (c) 2019-2022 Ant Group
//
// SPDX-License-Identifier: Apache-2.0
//
use std::{
collections::HashMap,
fs::File,
io::{BufRead, BufReader},
};
use crate::share_fs::EPHEMERAL_PATH;
use agent::Storage;
use anyhow::{anyhow, Context, Ok, Result};
use async_trait::async_trait;
use byte_unit::Byte;
use hypervisor::HUGETLBFS;
use kata_sys_util::{fs::get_base_name, mount::PROC_MOUNTS_FILE};
use kata_types::mount::KATA_EPHEMERAL_VOLUME_TYPE;
use super::{Volume, BIND};
/// Size of a hugepage; used as the key of the hugepage limits map.
type PageSize = Byte;
/// Hugepage limit value, taken from the `limit` field of an OCI hugepage limit entry.
type Limit = u64;
/// Source string used for the hugetlbfs storage handed to the agent
/// (and as the mount source in the unit tests).
const NODEV: &str = "nodev";
/// Container hugepage volume.
///
/// Built by `Hugepage::new` from an OCI mount whose source is a hugetlbfs
/// mount on the host.
pub(crate) struct Hugepage {
/// Storage description returned by `get_storage`; `new` fills it with a
/// hugetlbfs storage whose mount point is the rewritten mount source.
storage: Option<Storage>,
/// The OCI mount after `new` rewrote its source (to a path under
/// `EPHEMERAL_PATH`) and its type (to `BIND`).
mount: oci::Mount,
}
// handle hugepage
impl Hugepage {
    /// Builds a hugepage volume from an OCI mount.
    ///
    /// * `mount` - the original OCI mount; only its source base name is reused.
    /// * `hugepage_limits_map` - hugepage limits keyed by page size.
    /// * `fs_options` - mount options of the hugetlbfs mount; they must
    ///   contain a `pagesize=` entry.
    ///
    /// # Errors
    ///
    /// Fails when the page size cannot be extracted from `fs_options`, when
    /// no limit is registered for that page size, or when the base name of
    /// the mount source cannot be obtained or converted to a `String`.
    pub(crate) fn new(
        mount: &oci::Mount,
        hugepage_limits_map: HashMap<PageSize, Limit>,
        fs_options: Vec<String>,
    ) -> Result<Self> {
        // Assemble the option string for the hugetlbfs storage.
        let page_size = get_page_size(fs_options).context("failed to get page size")?;
        let limit = hugepage_limits_map
            .get(&page_size)
            .context("failed to get hugepage option")?;
        let option = format!("pagesize={},size={}", page_size.get_bytes(), limit);

        let base_name = get_base_name(mount.source.clone())?
            .into_string()
            .map_err(|e| anyhow!("failed to convert to string{:?}", e))?;

        // Path inside the VM that replaces the original mount source.
        let guest_path = format!("{}/{}", EPHEMERAL_PATH, base_name);

        // Storage description so that the kata agent is able to create a
        // hugetlbfs backed volume inside the VM.
        let storage = Storage {
            driver: KATA_EPHEMERAL_VOLUME_TYPE.to_string(),
            source: NODEV.to_string(),
            fs_type: HUGETLBFS.to_string(),
            mount_point: guest_path.clone(),
            options: vec![option],
            ..Default::default()
        };

        // The container sees the volume as a bind mount of the guest path.
        let mut mount = mount.clone();
        mount.source = guest_path;
        mount.r#type = BIND.to_string();

        Ok(Self {
            storage: Some(storage),
            mount,
        })
    }
}
#[async_trait]
impl Volume for Hugepage {
    /// Returns a one-element list holding a copy of the rewritten OCI mount.
    fn get_volume_mount(&self) -> Result<Vec<oci::Mount>> {
        let mounts = vec![self.mount.clone()];
        Ok(mounts)
    }

    /// Returns a copy of the stored storage as a one-element list, or an
    /// empty list when no storage is set.
    fn get_storage(&self) -> Result<Vec<agent::Storage>> {
        let storages: Vec<agent::Storage> = self.storage.iter().cloned().collect();
        Ok(storages)
    }

    /// Nothing to release for this volume; always succeeds.
    async fn cleanup(&self) -> Result<()> {
        Ok(())
    }
}
/// Looks up the source of `m` in the mount table (`PROC_MOUNTS_FILE`) and,
/// when it is the mount point of a hugetlbfs filesystem, returns that
/// filesystem's mount options split on `,`.
///
/// Returns `Ok(None)` when no hugetlbfs entry is mounted at `m.source`.
///
/// # Errors
///
/// Fails when `m.source` is empty, when the mount table cannot be opened, or
/// when reading it fails with an I/O error other than an undecodable line.
pub(crate) fn get_huge_page_option(m: &oci::Mount) -> Result<Option<Vec<String>>> {
    if m.source.is_empty() {
        return Err(anyhow!("empty mount source"));
    }

    let file = File::open(PROC_MOUNTS_FILE).context("failed open file")?;
    for line in BufReader::new(file).lines() {
        // `Ok` is imported from anyhow in this file (a function, not a
        // pattern), so the std variant is spelled out in full here.
        let line = match line {
            std::result::Result::Ok(line) => line,
            // A line that is not valid UTF-8 can never equal `m.source`
            // (a `String`), so it is safe to skip it.
            Err(e) if e.kind() == std::io::ErrorKind::InvalidData => continue,
            // Any other read error is reported instead of being silently
            // dropped (which could also loop forever on a persistent error).
            Err(e) => return Err(anyhow!(e).context("failed to read mount table")),
        };

        // Fields are: device, mount point, filesystem type, options, ...
        // Entries with fewer than four fields are ignored rather than indexed.
        let mut fields = line.split(' ');
        if let (Some(mount_point), Some(fs_type), Some(options)) =
            (fields.nth(1), fields.next(), fields.next())
        {
            if mount_point == m.source && fs_type == HUGETLBFS {
                return Ok(Some(options.split(',').map(String::from).collect()));
            }
        }
    }

    Ok(None)
}
// TODO add hugepage limit to sandbox memory once memory hotplug is enabled
// https://github.com/kata-containers/kata-containers/issues/5880
/// Collects the hugepage limits of an OCI spec into a map keyed by page size.
///
/// The map is empty when the spec has no `linux` section, no `resources`
/// section, or no hugepage limits.
///
/// # Errors
///
/// Fails when a page size string cannot be parsed into a `Byte` value.
pub(crate) fn get_huge_page_limits_map(spec: &oci::Spec) -> Result<HashMap<PageSize, Limit>> {
    let mut hugepage_limits_map: HashMap<PageSize, Limit> = HashMap::new();

    let resources = spec
        .linux
        .as_ref()
        .and_then(|linux| linux.resources.as_ref());

    for entry in resources.into_iter().flat_map(|r| r.hugepage_limits.iter()) {
        // The page size sent from the OCI spec is in MB or GB; rewrite the
        // unit to Mi or Gi before parsing.
        let binary_unit_size = entry.page_size.replace('B', "i");
        let page_size = Byte::from_str(binary_unit_size)
            .context("failed to create Byte object from String")?;
        hugepage_limits_map.insert(page_size, entry.limit);
    }

    Ok(hugepage_limits_map)
}
/// Extracts the page size from a list of hugetlbfs mount options.
///
/// The first option starting with `pagesize=` is used. Its value is expected
/// to carry an `M` or `G` unit; an `i` is appended so that it is parsed as
/// `Mi` / `Gi`.
///
/// # Errors
///
/// Fails when no `pagesize=` option is present, or when its value cannot be
/// parsed into a `Byte`; the parser's own error is kept as the cause.
fn get_page_size(fs_options: Vec<String>) -> Result<Byte> {
    let raw_size = fs_options
        .iter()
        .find_map(|fs_option| fs_option.strip_prefix("pagesize="))
        .ok_or_else(|| anyhow!("failed to get page size"))?;

    // the parameters passed are in unit M or G, append i to be Mi and Gi
    let page_size = format!("{}i", raw_size);

    Byte::from_str(page_size).context("failed to convert string to byte")
}
// Unit tests for the hugepage volume helpers.
#[cfg(test)]
mod tests {
use std::{collections::HashMap, fs};
use crate::volume::hugepage::{get_page_size, HUGETLBFS, NODEV};
use super::{get_huge_page_limits_map, get_huge_page_option};
use byte_unit::Byte;
use nix::mount::{mount, umount, MsFlags};
use oci::{Linux, LinuxHugepageLimit, LinuxResources};
use test_utils::skip_if_not_root;
// NOTE(review): despite its name, this test exercises
// `get_huge_page_limits_map`: it checks that the "1GB" / "2MB" page sizes of
// an OCI spec end up as "1Gi" / "2Mi" keys in the resulting map.
#[test]
fn test_get_huge_page_option() {
let format_sizes = ["1GB", "2MB"];
let mut huge_page_limits: Vec<LinuxHugepageLimit> = vec![];
for format_size in format_sizes {
huge_page_limits.push(LinuxHugepageLimit {
page_size: format_size.to_string(),
limit: 100000,
});
}
// Minimal spec carrying only the hugepage limits built above.
let spec = oci::Spec {
linux: Some(Linux {
resources: Some(LinuxResources {
hugepage_limits: huge_page_limits,
..Default::default()
}),
..Default::default()
}),
..Default::default()
};
assert!(get_huge_page_limits_map(&spec).is_ok());
let mut expect_res = HashMap::new();
expect_res.insert(Byte::from_str("1Gi").ok().unwrap(), 100000);
expect_res.insert(Byte::from_str("2Mi").ok().unwrap(), 100000);
assert_eq!(get_huge_page_limits_map(&spec).unwrap(), expect_res);
}
// Mounts a hugetlbfs filesystem on a temporary directory for each page size
// and checks that `get_huge_page_option` followed by `get_page_size` yields
// that page size.
#[test]
fn test_get_huge_page_size() {
// The test calls mount/umount; presumably this macro returns early when
// the test is not running as root.
skip_if_not_root!();
let format_sizes = ["1Gi", "2Mi"];
for format_size in format_sizes {
let dir = tempfile::tempdir().unwrap();
let dst = dir.path().join(format!("hugepages-{}", format_size));
fs::create_dir_all(&dst).unwrap();
mount(
Some(NODEV),
&dst,
Some(HUGETLBFS),
MsFlags::MS_NODEV,
Some(format!("pagesize={}", format_size).as_str()),
)
.unwrap();
// Only the mount source is needed for the lookup in the mount table.
let mount = oci::Mount {
source: dst.to_str().unwrap().to_string(),
..Default::default()
};
let option = get_huge_page_option(&mount).unwrap().unwrap();
let page_size = get_page_size(option).unwrap();
assert_eq!(page_size, Byte::from_str(format_size).unwrap());
// Undo the mount and remove the mount point before the next iteration.
umount(&dst).unwrap();
fs::remove_dir(&dst).unwrap();
}
}
}

View File

@ -6,17 +6,20 @@
mod block_volume;
mod default_volume;
pub mod hugepage;
mod share_fs_volume;
mod shm_volume;
use async_trait::async_trait;
use std::{sync::Arc, vec::Vec};
use anyhow::{Context, Result};
use std::{sync::Arc, vec::Vec};
use tokio::sync::RwLock;
use crate::share_fs::ShareFs;
use self::hugepage::{get_huge_page_limits_map, get_huge_page_option};
const BIND: &str = "bind";
#[async_trait]
pub trait Volume: Send + Sync {
fn get_volume_mount(&self) -> Result<Vec<oci::Mount>>;
@ -43,9 +46,11 @@ impl VolumeResource {
&self,
share_fs: &Option<Arc<dyn ShareFs>>,
cid: &str,
oci_mounts: &[oci::Mount],
spec: &oci::Spec,
) -> Result<Vec<Arc<dyn Volume>>> {
let mut volumes: Vec<Arc<dyn Volume>> = vec![];
let oci_mounts = &spec.mounts;
// handle mounts
for m in oci_mounts {
let volume: Arc<dyn Volume> = if shm_volume::is_shim_volume(m) {
let shm_size = shm_volume::DEFAULT_SHM_SIZE;
@ -59,6 +64,17 @@ impl VolumeResource {
.await
.with_context(|| format!("new share fs volume {:?}", m))?,
)
} else if let Some(options) =
get_huge_page_option(m).context("failed to check huge page")?
{
// get hugepage limits from oci
let hugepage_limits =
get_huge_page_limits_map(spec).context("get huge page option")?;
// handle container hugepage
Arc::new(
hugepage::Hugepage::new(m, hugepage_limits, options)
.with_context(|| format!("handle hugepages {:?}", m))?,
)
} else if block_volume::is_block_volume(m) {
Arc::new(
block_volume::BlockVolume::new(m)

View File

@ -110,7 +110,7 @@ impl Container {
// handler volumes
let volumes = self
.resource_manager
.handler_volumes(&config.container_id, &spec.mounts)
.handler_volumes(&config.container_id, &spec)
.await
.context("handler volumes")?;
let mut oci_mounts = vec![];
@ -394,7 +394,6 @@ fn amend_spec(spec: &mut oci::Spec, disable_guest_seccomp: bool) -> Result<()> {
resource.devices = Vec::new();
resource.pids = None;
resource.block_io = None;
resource.hugepage_limits = Vec::new();
resource.network = None;
}