diff --git a/src/libs/Cargo.lock b/src/libs/Cargo.lock index 66090c19c5..90423e0023 100644 --- a/src/libs/Cargo.lock +++ b/src/libs/Cargo.lock @@ -40,6 +40,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "bitflags" version = "1.2.1" @@ -420,6 +426,8 @@ dependencies = [ name = "kata-types" version = "0.1.0" dependencies = [ + "anyhow", + "base64", "bitmask-enum", "byte-unit", "glob", diff --git a/src/libs/kata-sys-util/src/mount.rs b/src/libs/kata-sys-util/src/mount.rs index 61a80f150a..d9064935bb 100644 --- a/src/libs/kata-sys-util/src/mount.rs +++ b/src/libs/kata-sys-util/src/mount.rs @@ -62,7 +62,7 @@ use crate::sl; /// Default permission for directories created for mountpoint. 
const MOUNT_PERM: u32 = 0o755; -const PROC_MOUNTS_FILE: &str = "/proc/mounts"; +pub const PROC_MOUNTS_FILE: &str = "/proc/mounts"; const PROC_FIELDS_PER_LINE: usize = 6; const PROC_DEVICE_INDEX: usize = 0; const PROC_PATH_INDEX: usize = 1; diff --git a/src/runtime-rs/Cargo.lock b/src/runtime-rs/Cargo.lock index 5c436ca535..7fecc459f4 100644 --- a/src/runtime-rs/Cargo.lock +++ b/src/runtime-rs/Cargo.lock @@ -329,6 +329,16 @@ version = "3.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "415301c9de11005d4b92193c0eb7ac7adc37e5a49e0ac9bed0a42343512744b8" +[[package]] +name = "byte-unit" +version = "4.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581ad4b3d627b0c09a0ccb2912148f839acaca0b93cf54cbe42b6c674e86079c" +dependencies = [ + "serde", + "utf8-width", +] + [[package]] name = "byteorder" version = "1.4.3" @@ -1361,7 +1371,7 @@ dependencies = [ "anyhow", "base64", "bitmask-enum", - "byte-unit", + "byte-unit 3.1.4", "glob", "lazy_static", "num_cpus", @@ -2279,6 +2289,7 @@ dependencies = [ "anyhow", "async-trait", "bitflags", + "byte-unit 4.0.17", "cgroups-rs", "futures 0.3.21", "hypervisor", @@ -2299,6 +2310,7 @@ dependencies = [ "serde_json", "slog", "slog-scope", + "tempfile", "test-utils", "tokio", "uuid", @@ -2998,6 +3010,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "utf8-width" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5190c9442dcdaf0ddd50f37420417d219ae5261bbf5db120d0f9bab996c9cba1" + [[package]] name = "uuid" version = "0.4.0" diff --git a/src/runtime-rs/crates/resource/Cargo.toml b/src/runtime-rs/crates/resource/Cargo.toml index e39169f0e1..f1957755c9 100644 --- a/src/runtime-rs/crates/resource/Cargo.toml +++ b/src/runtime-rs/crates/resource/Cargo.toml @@ -7,11 +7,13 @@ license = "Apache-2.0" [dev-dependencies] test-utils = { path = "../../../libs/test-utils" } +tempfile = "3.2.0" [dependencies] anyhow = "^1.0" 
async-trait = "0.1.48" bitflags = "1.2.1" +byte-unit = "4.0.14" cgroups-rs = "0.2.9" futures = "0.3.11" lazy_static = "1.4.0" diff --git a/src/runtime-rs/crates/resource/src/manager.rs b/src/runtime-rs/crates/resource/src/manager.rs index 78b40380ff..ef14a0e4d0 100644 --- a/src/runtime-rs/crates/resource/src/manager.rs +++ b/src/runtime-rs/crates/resource/src/manager.rs @@ -78,10 +78,10 @@ impl ResourceManager { pub async fn handler_volumes( &self, cid: &str, - oci_mounts: &[oci::Mount], + spec: &oci::Spec, ) -> Result>> { let inner = self.inner.read().await; - inner.handler_volumes(cid, oci_mounts).await + inner.handler_volumes(cid, spec).await } pub async fn dump(&self) { diff --git a/src/runtime-rs/crates/resource/src/manager_inner.rs b/src/runtime-rs/crates/resource/src/manager_inner.rs index 791d76a375..d9e3816cbc 100644 --- a/src/runtime-rs/crates/resource/src/manager_inner.rs +++ b/src/runtime-rs/crates/resource/src/manager_inner.rs @@ -214,10 +214,10 @@ impl ResourceManagerInner { pub async fn handler_volumes( &self, cid: &str, - oci_mounts: &[oci::Mount], + spec: &oci::Spec, ) -> Result>> { self.volume_resource - .handler_volumes(&self.share_fs, cid, oci_mounts) + .handler_volumes(&self.share_fs, cid, spec) .await } diff --git a/src/runtime-rs/crates/resource/src/share_fs/mod.rs b/src/runtime-rs/crates/resource/src/share_fs/mod.rs index 96f6dc32f1..83942288c9 100644 --- a/src/runtime-rs/crates/resource/src/share_fs/mod.rs +++ b/src/runtime-rs/crates/resource/src/share_fs/mod.rs @@ -15,6 +15,7 @@ use tokio::sync::Mutex; pub use utils::{do_get_guest_path, do_get_guest_share_path, get_host_rw_shared_path}; mod virtio_fs_share_mount; use virtio_fs_share_mount::VirtiofsShareMount; +pub use virtio_fs_share_mount::EPHEMERAL_PATH; use std::{collections::HashMap, fmt::Debug, path::PathBuf, sync::Arc}; diff --git a/src/runtime-rs/crates/resource/src/share_fs/virtio_fs_share_mount.rs b/src/runtime-rs/crates/resource/src/share_fs/virtio_fs_share_mount.rs index 
c1d999cfb0..8627150a54 100644 --- a/src/runtime-rs/crates/resource/src/share_fs/virtio_fs_share_mount.rs +++ b/src/runtime-rs/crates/resource/src/share_fs/virtio_fs_share_mount.rs @@ -17,7 +17,7 @@ use std::path::Path; const WATCHABLE_PATH_NAME: &str = "watchable"; const WATCHABLE_BIND_DEV_TYPE: &str = "watchable-bind"; -const EPHEMERAL_PATH: &str = "/run/kata-containers/sandbox/ephemeral"; +pub const EPHEMERAL_PATH: &str = "/run/kata-containers/sandbox/ephemeral"; use super::{ utils::{self, do_get_host_path}, diff --git a/src/runtime-rs/crates/resource/src/volume/hugepage.rs b/src/runtime-rs/crates/resource/src/volume/hugepage.rs new file mode 100644 index 0000000000..a827b26579 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/volume/hugepage.rs @@ -0,0 +1,223 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{ + collections::HashMap, + fs::File, + io::{BufRead, BufReader}, +}; + +use crate::share_fs::EPHEMERAL_PATH; +use agent::Storage; +use anyhow::{anyhow, Context, Ok, Result}; +use async_trait::async_trait; +use byte_unit::Byte; +use hypervisor::HUGETLBFS; +use kata_sys_util::{fs::get_base_name, mount::PROC_MOUNTS_FILE}; +use kata_types::mount::KATA_EPHEMERAL_VOLUME_TYPE; + +use super::{Volume, BIND}; + +type PageSize = Byte; +type Limit = u64; + +const NODEV: &str = "nodev"; + +// container hugepage volume: a guest-side storage plus the rewritten OCI mount +pub(crate) struct Hugepage { + // storage the agent uses to create the hugetlbfs volume inside the VM + storage: Option<Storage>, + // bind mount whose source points at the path inside the VM + mount: oci::Mount, +} + +// build a Hugepage from a host hugetlbfs mount, its mount options and the OCI hugepage limits +impl Hugepage { + pub(crate) fn new( + mount: &oci::Mount, + hugepage_limits_map: HashMap<PageSize, Limit>, + fs_options: Vec<String>, + ) -> Result<Self> { + // Create mount option string + let page_size = get_page_size(fs_options).context("failed to get page size")?; + let option = hugepage_limits_map + .get(&page_size) + .map(|limit| format!("pagesize={},size={}", page_size.get_bytes(), limit)) + .context("failed to get hugepage option")?; + let base_name = 
get_base_name(mount.source.clone())? + .into_string() + .map_err(|e| anyhow!("failed to convert to string{:?}", e))?; + let mut mount = mount.clone(); + // Set the mount source path to a path that resides inside the VM + mount.source = format!("{}{}{}", EPHEMERAL_PATH, "/", base_name); + // Set the mount type to "bind" + mount.r#type = BIND.to_string(); + + // Create a storage struct so that kata agent is able to create + // hugetlbfs backed volume inside the VM + let storage = Storage { + driver: KATA_EPHEMERAL_VOLUME_TYPE.to_string(), + source: NODEV.to_string(), + fs_type: HUGETLBFS.to_string(), + mount_point: mount.source.clone(), + options: vec![option], + ..Default::default() + }; + Ok(Self { + storage: Some(storage), + mount, + }) + } +} + +#[async_trait] +impl Volume for Hugepage { + fn get_volume_mount(&self) -> Result<Vec<oci::Mount>> { + Ok(vec![self.mount.clone()]) + } + + fn get_storage(&self) -> Result<Vec<Storage>> { + let s = if let Some(s) = self.storage.as_ref() { + vec![s.clone()] + } else { + vec![] + }; + Ok(s) + } + + async fn cleanup(&self) -> Result<()> { + Ok(()) + } +} + +pub(crate) fn get_huge_page_option(m: &oci::Mount) -> Result<Option<Vec<String>>> { + if m.source.is_empty() { + return Err(anyhow!("empty mount source")); + } + let file = File::open(PROC_MOUNTS_FILE).context("failed open file")?; + let reader = BufReader::new(file); + for line in reader.lines().flatten() { + let items: Vec<&str> = line.split(' ').collect(); + if items.len() > 3 && m.source == items[1] && items[2] == HUGETLBFS { + let fs_options: Vec<&str> = items[3].split(',').collect(); + return Ok(Some( + fs_options + .iter() + .map(|&s| s.to_string()) + .collect::<Vec<String>>(), + )); + } + } + Ok(None) +} + +// TODO add hugepage limit to sandbox memory once memory hotplug is enabled +// https://github.com/kata-containers/kata-containers/issues/5880 +pub(crate) fn get_huge_page_limits_map(spec: &oci::Spec) -> Result<HashMap<PageSize, Limit>> { + let mut hugepage_limits_map: HashMap<PageSize, Limit> = HashMap::new(); + if let Some(l) = &spec.linux { + if let Some(r) = &l.resources { + 
let hugepage_limits = &r.hugepage_limits; + for hugepage_limit in hugepage_limits { + // the page size sent from the OCI spec is in MB or GB; convert it to Mi and Gi + let page_size = hugepage_limit.page_size.replace('B', "i"); + let page_size = Byte::from_str(page_size) + .context("failed to create Byte object from String")?; + hugepage_limits_map.insert(page_size, hugepage_limit.limit); + } + return Ok(hugepage_limits_map); + } + return Ok(hugepage_limits_map); + } + Ok(hugepage_limits_map) +} + +fn get_page_size(fs_options: Vec<String>) -> Result<Byte> { + for fs_option in fs_options { + if fs_option.starts_with("pagesize=") { + let page_size = fs_option + .strip_prefix("pagesize=") + // the parameters passed are in unit M or G, append i to be Mi and Gi + .map(|s| format!("{}i", s)) + .context("failed to strip prefix pagesize")?; + return Byte::from_str(page_size) + .context("failed to convert string to byte"); + } + } + Err(anyhow!("failed to get page size")) +} + +#[cfg(test)] +mod tests { + + use std::{collections::HashMap, fs}; + + use crate::volume::hugepage::{get_page_size, HUGETLBFS, NODEV}; + + use super::{get_huge_page_limits_map, get_huge_page_option}; + use byte_unit::Byte; + use nix::mount::{mount, umount, MsFlags}; + use oci::{Linux, LinuxHugepageLimit, LinuxResources}; + use test_utils::skip_if_not_root; + + #[test] + fn test_get_huge_page_limits_map() { + let format_sizes = ["1GB", "2MB"]; + let mut huge_page_limits: Vec<LinuxHugepageLimit> = vec![]; + for format_size in format_sizes { + huge_page_limits.push(LinuxHugepageLimit { + page_size: format_size.to_string(), + limit: 100000, + }); + } + + let spec = oci::Spec { + linux: Some(Linux { + resources: Some(LinuxResources { + hugepage_limits: huge_page_limits, + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }; + + assert!(get_huge_page_limits_map(&spec).is_ok()); + + let mut expect_res = HashMap::new(); + expect_res.insert(Byte::from_str("1Gi").ok().unwrap(), 100000); + 
expect_res.insert(Byte::from_str("2Mi").ok().unwrap(), 100000); + assert_eq!(get_huge_page_limits_map(&spec).unwrap(), expect_res); + } + + #[test] + fn test_get_huge_page_size() { + skip_if_not_root!(); + let format_sizes = ["1Gi", "2Mi"]; + for format_size in format_sizes { + let dir = tempfile::tempdir().unwrap(); + let dst = dir.path().join(format!("hugepages-{}", format_size)); + fs::create_dir_all(&dst).unwrap(); + mount( + Some(NODEV), + &dst, + Some(HUGETLBFS), + MsFlags::MS_NODEV, + Some(format!("pagesize={}", format_size).as_str()), + ) + .unwrap(); + let mount = oci::Mount { + source: dst.to_str().unwrap().to_string(), + ..Default::default() + }; + let option = get_huge_page_option(&mount).unwrap().unwrap(); + let page_size = get_page_size(option).unwrap(); + assert_eq!(page_size, Byte::from_str(format_size).unwrap()); + umount(&dst).unwrap(); + fs::remove_dir(&dst).unwrap(); + } + } +} diff --git a/src/runtime-rs/crates/resource/src/volume/mod.rs b/src/runtime-rs/crates/resource/src/volume/mod.rs index 684b76431b..7a603c601f 100644 --- a/src/runtime-rs/crates/resource/src/volume/mod.rs +++ b/src/runtime-rs/crates/resource/src/volume/mod.rs @@ -6,17 +6,20 @@ mod block_volume; mod default_volume; +pub mod hugepage; mod share_fs_volume; mod shm_volume; use async_trait::async_trait; -use std::{sync::Arc, vec::Vec}; - use anyhow::{Context, Result}; +use std::{sync::Arc, vec::Vec}; use tokio::sync::RwLock; use crate::share_fs::ShareFs; +use self::hugepage::{get_huge_page_limits_map, get_huge_page_option}; + +const BIND: &str = "bind"; #[async_trait] pub trait Volume: Send + Sync { fn get_volume_mount(&self) -> Result>; @@ -43,9 +46,11 @@ impl VolumeResource { &self, share_fs: &Option>, cid: &str, - oci_mounts: &[oci::Mount], + spec: &oci::Spec, ) -> Result>> { let mut volumes: Vec> = vec![]; + let oci_mounts = &spec.mounts; + // handle mounts for m in oci_mounts { let volume: Arc = if shm_volume::is_shim_volume(m) { let shm_size = 
shm_volume::DEFAULT_SHM_SIZE; @@ -59,6 +64,17 @@ impl VolumeResource { .await .with_context(|| format!("new share fs volume {:?}", m))?, ) + } else if let Some(options) = + get_huge_page_option(m).context("failed to check huge page")? + { + // get hugepage limits from oci + let hugepage_limits = + get_huge_page_limits_map(spec).context("get huge page option")?; + // handle container hugepage + Arc::new( + hugepage::Hugepage::new(m, hugepage_limits, options) + .with_context(|| format!("handle hugepages {:?}", m))?, + ) } else if block_volume::is_block_volume(m) { Arc::new( block_volume::BlockVolume::new(m) diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/container.rs b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/container.rs index ded8f0a45a..54aa0ebbf9 100644 --- a/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/container.rs +++ b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/container.rs @@ -110,7 +110,7 @@ impl Container { // handler volumes let volumes = self .resource_manager - .handler_volumes(&config.container_id, &spec.mounts) + .handler_volumes(&config.container_id, &spec) .await .context("handler volumes")?; let mut oci_mounts = vec![]; @@ -394,7 +394,6 @@ fn amend_spec(spec: &mut oci::Spec, disable_guest_seccomp: bool) -> Result<()> { resource.devices = Vec::new(); resource.pids = None; resource.block_io = None; - resource.hugepage_limits = Vec::new(); resource.network = None; }