diff --git a/docs/how-to/how-to-run-kata-containers-with-kinds-of-Block-Volumes.md b/docs/how-to/how-to-run-kata-containers-with-kinds-of-Block-Volumes.md index 43ca4f4b0..e53a3b534 100644 --- a/docs/how-to/how-to-run-kata-containers-with-kinds-of-Block-Volumes.md +++ b/docs/how-to/how-to-run-kata-containers-with-kinds-of-Block-Volumes.md @@ -25,8 +25,8 @@ Finally, when running a Kata Containers with `ctr run --mount type=X, src=Y, dst Now, supported types: - `directvol` for direct volume -- `spdkvol` for SPDK volume (TBD) - `vfiovol` for VFIO device based volume +- `spdkvol` for SPDK/vhost-user based volume ## Setup Device and Run a Kata-Containers @@ -147,6 +147,80 @@ $ sudo ctr run -t --rm --runtime io.containerd.kata.v2 --mount type=vfiovol,src= ``` -### SPDK Device Based Volume +### SPDK Device Based Block Volume -TBD +For SPDK vhost-user devices in runtime-rs, unlike in runtime (the golang version), there is no need to `mknod` a device node under `/dev/` any more. +Just using `kata-ctl direct-volume add ..` to make a mount info config is enough. + +#### Run SPDK vhost target and Expose vhost block device + +Run an SPDK vhost target and get a vhost-user block controller as an example: + +First, run the SPDK vhost target: + +> **Tips:** If the driver `vfio-pci` is supported, you can run SPDK with `DRIVER_OVERRIDE=vfio-pci`. +> Otherwise, just run without it: `sudo HUGEMEM=4096 ./scripts/setup.sh`. 
+ ```bash +$ SPDK_DEVEL=/xx/spdk +$ VHU_UDS_PATH=/tmp/vhu-targets +$ RAW_DISKS=/xx/rawdisks +$ # Reset first +$ ${SPDK_DEVEL}/scripts/setup.sh reset +$ sudo sysctl -w vm.nr_hugepages=2048 +$ #4G Huge Memory for spdk +$ sudo HUGEMEM=4096 DRIVER_OVERRIDE=vfio-pci ${SPDK_DEVEL}/scripts/setup.sh +$ sudo ${SPDK_DEVEL}/build/bin/spdk_tgt -S $VHU_UDS_PATH -s 1024 -m 0x3 & +``` + +Second, create a vhost controller: + +```bash +$ sudo dd if=/dev/zero of=${RAW_DISKS}/rawdisk01.20g bs=1M count=20480 +$ sudo ${SPDK_DEVEL}/scripts/rpc.py bdev_aio_create ${RAW_DISKS}/rawdisk01.20g vhu-rawdisk01.20g 512 +$ sudo ${SPDK_DEVEL}/scripts/rpc.py vhost_create_blk_controller vhost-blk-rawdisk01.sock vhu-rawdisk01.20g +``` + +Here, a vhost controller `vhost-blk-rawdisk01.sock` is created, and the controller will +be passed to the hypervisor, such as Dragonball, Cloud-Hypervisor, Firecracker or QEMU. + + +#### Setup vhost-user block device for kata-containers + + +First, `mkdir` a sub-path `kubelet/kata-test-vol-001/` under `/run/kata-containers/shared/direct-volumes/`. + +Second, fill in the fields of `mountinfo.json`; it looks like this: +```json +{ + "device": "/tmp/vhu-targets/vhost-blk-rawdisk01.sock", + "volume_type": "spdkvol", + "fs_type": "ext4", + "metadata":"{}", + "options": [] +} +``` + +Third, use `kata-ctl direct-volume` to add the block device, which generates `mountinfo.json`, and then run a Kata container with `--mount`. 
+ ```bash +$ # kata-ctl direct-volume add +$ sudo kata-ctl direct-volume add /kubelet/kata-test-vol-001/volume001 "{\"device\": \"/tmp/vhu-targets/vhost-blk-rawdisk01.sock\", \"volume_type\":\"spdkvol\", \"fs_type\": \"ext4\", \"metadata\":"{}", \"options\": []}" +$ # /kubelet/kata-test-vol-001/volume001 <==> /run/kata-containers/shared/direct-volumes/L2t1YmVsZXQva2F0YS10ZXN0LXZvbC0wMDEvdm9sdW1lMDAx +$ cat L2t1YmVsZXQva2F0YS10ZXN0LXZvbC0wMDEvdm9sdW1lMDAx/mountInfo.json +$ {"volume_type":"spdkvol","device":"/tmp/vhu-targets/vhost-blk-rawdisk01.sock","fs_type":"ext4","metadata":{},"options":[]} +``` + +As `/run/kata-containers/shared/direct-volumes/` is a fixed path, we will be able to run a kata pod with `--mount` and set the +`src` sub-path. And the `--mount` argument looks like: `--mount type=spdkvol,src=/kubelet/kata-test-vol-001/volume001,dst=/disk001`. + + +#### Run a Kata container with SPDK vhost-user block device + + +In this case, `ctr run --mount type=X, src=source, dst=dest`, the X will be set to `spdkvol`, which is a proprietary type specifically designed for SPDK volumes. + +```bash +$ # ctr run with --mount type=spdkvol,src=/kubelet/kata-test-vol-001/volume001,dst=/disk001 +$ sudo ctr run -t --rm --runtime io.containerd.kata.v2 --mount type=spdkvol,src=/kubelet/kata-test-vol-001/volume001,dst=/disk001,options=rbind:rw "$image" kata-spdk-vol-xx0530 /bin/bash +``` diff --git a/src/dragonball/src/device_manager/blk_dev_mgr.rs b/src/dragonball/src/device_manager/blk_dev_mgr.rs index 0fe10cf24..854edfc09 100644 --- a/src/dragonball/src/device_manager/blk_dev_mgr.rs +++ b/src/dragonball/src/device_manager/blk_dev_mgr.rs @@ -114,6 +114,8 @@ pub enum BlockDeviceType { /// SPOOL is a reliable NVMe virtualization system for the cloud environment. /// You could learn more SPOOL here: https://www.usenix.org/conference/atc20/presentation/xue Spool, + /// The standard vhost-user-blk based device such as Spdk device. + Spdk, /// Local disk/file based low level device. 
RawBlock, } @@ -124,6 +126,8 @@ impl BlockDeviceType { // SPOOL path should be started with "spool", e.g. "spool:/device1" if path.starts_with("spool:/") { BlockDeviceType::Spool + } else if path.starts_with("spdk:/") { + BlockDeviceType::Spdk } else { BlockDeviceType::RawBlock } @@ -400,6 +404,10 @@ impl BlockDeviceMgr { BlockDeviceError::DeviceManager(e) }) } + BlockDeviceType::Spool | BlockDeviceType::Spdk => { + // TBD + todo!() + } _ => Err(BlockDeviceError::InvalidBlockDeviceType), } } diff --git a/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs b/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs index 74adaac0d..b490dc94a 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs @@ -11,8 +11,8 @@ use kata_sys_util::rand::RandomBytes; use tokio::sync::{Mutex, RwLock}; use crate::{ - BlockConfig, BlockDevice, Hypervisor, VfioDevice, KATA_BLK_DEV_TYPE, KATA_MMIO_BLK_DEV_TYPE, - VIRTIO_BLOCK_MMIO, VIRTIO_BLOCK_PCI, + device::VhostUserBlkDevice, BlockConfig, BlockDevice, Hypervisor, VfioDevice, VhostUserConfig, + KATA_BLK_DEV_TYPE, KATA_MMIO_BLK_DEV_TYPE, VIRTIO_BLOCK_MMIO, VIRTIO_BLOCK_PCI, }; use super::{ @@ -25,17 +25,34 @@ pub type ArcMutexDevice = Arc>; /// block_index and released_block_index are used to search an available block index /// in Sandbox. /// +/// @block_driver to be used for block device; /// @block_index generally default is 1 for ; /// @released_block_index for blk devices removed and indexes will released at the same time. 
#[derive(Clone, Debug, Default)] struct SharedInfo { + block_driver: String, block_index: u64, released_block_index: Vec, } impl SharedInfo { - fn new() -> Self { + async fn new(hypervisor: Arc) -> Self { + // get hypervisor block driver + let block_driver = match hypervisor + .hypervisor_config() + .await + .blockdev_info + .block_device_driver + .as_str() + { + // convert the block driver to kata type + VIRTIO_BLOCK_MMIO => KATA_MMIO_BLK_DEV_TYPE.to_string(), + VIRTIO_BLOCK_PCI => KATA_BLK_DEV_TYPE.to_string(), + _ => "".to_string(), + }; + SharedInfo { + block_driver, block_index: 1, released_block_index: vec![], } @@ -67,26 +84,24 @@ pub struct DeviceManager { } impl DeviceManager { - pub fn new(hypervisor: Arc) -> Result { + pub async fn new(hypervisor: Arc) -> Result { let devices = HashMap::::new(); Ok(DeviceManager { devices, - hypervisor, - shared_info: SharedInfo::new(), + hypervisor: hypervisor.clone(), + shared_info: SharedInfo::new(hypervisor.clone()).await, }) } - async fn try_add_device(&mut self, device_id: &str) -> Result<()> { + pub async fn try_add_device(&mut self, device_id: &str) -> Result<()> { // find the device let device = self .devices .get(device_id) .context("failed to find device")?; - - // attach device let mut device_guard = device.lock().await; + // attach device let result = device_guard.attach(self.hypervisor.as_ref()).await; - // handle attach error if let Err(e) = result { match device_guard.get_device_info().await { @@ -102,6 +117,9 @@ impl DeviceManager { .release_device_index(device.config.virt_path.unwrap().0); } } + DeviceType::VhostUserBlk(device) => { + self.shared_info.release_device_index(device.config.index); + } _ => { debug!(sl!(), "no need to do release device index."); } @@ -109,6 +127,7 @@ impl DeviceManager { drop(device_guard); self.devices.remove(device_id); + return Err(e); } @@ -168,6 +187,11 @@ impl DeviceManager { return Some(device_id.to_string()); } } + DeviceType::VhostUserBlk(device) => { + if 
device.config.socket_path == host_path { + return Some(device_id.to_string()); + } + } _ => { // TODO: support find other device type continue; @@ -225,6 +249,23 @@ impl DeviceManager { &vfio_dev_config, ))) } + DeviceConfig::VhostUserBlkCfg(config) => { + // try to find the device, found and just return id. + if let Some(dev_id_matched) = self.find_device(config.socket_path.clone()).await { + info!( + sl!(), + "vhost blk device with path:{:?} found. just return device id: {:?}", + config.socket_path.clone(), + dev_id_matched + ); + + return Ok(dev_id_matched); + } + + self.create_vhost_blk_device(config, device_id.clone()) + .await + .context("failed to create vhost blk device")? + } _ => { return Err(anyhow!("invliad device type")); } @@ -236,27 +277,34 @@ impl DeviceManager { Ok(device_id) } + async fn create_vhost_blk_device( + &mut self, + config: &VhostUserConfig, + device_id: String, + ) -> Result { + let mut vhu_blk_config = config.clone(); + vhu_blk_config.driver_option = self.shared_info.block_driver.clone(); + + // generate block device index and virt path + // safe here, Block device always has virt_path. + if let Some(virt_path) = self.get_dev_virt_path(DEVICE_TYPE_BLOCK)? 
{ + vhu_blk_config.index = virt_path.0; + vhu_blk_config.virt_path = virt_path.1; + } + + Ok(Arc::new(Mutex::new(VhostUserBlkDevice::new( + device_id, + vhu_blk_config, + )))) + } + async fn create_block_device( &mut self, config: &BlockConfig, device_id: String, ) -> Result { let mut block_config = config.clone(); - // get hypervisor block driver - let block_driver = match self - .hypervisor - .hypervisor_config() - .await - .blockdev_info - .block_device_driver - .as_str() - { - // convert the block driver to kata type - VIRTIO_BLOCK_MMIO => KATA_MMIO_BLK_DEV_TYPE.to_string(), - VIRTIO_BLOCK_PCI => KATA_BLK_DEV_TYPE.to_string(), - _ => "".to_string(), - }; - block_config.driver_option = block_driver; + block_config.driver_option = self.shared_info.block_driver.clone(); // generate virt path if let Some(virt_path) = self.get_dev_virt_path(DEVICE_TYPE_BLOCK)? { diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs index eabf9b5f1..7b676fd56 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs @@ -1,5 +1,5 @@ -// Copyright (c) 2019-2022 Alibaba Cloud -// Copyright (c) 2019-2022 Ant Group +// Copyright (c) 2019-2023 Alibaba Cloud +// Copyright (c) 2019-2023 Ant Group // // SPDX-License-Identifier: Apache-2.0 // @@ -26,6 +26,9 @@ pub use virtio_fs::{ pub use virtio_net::{Address, NetworkConfig, NetworkDevice}; pub use virtio_vsock::{HybridVsockConfig, HybridVsockDevice, VsockConfig, VsockDevice}; +pub mod vhost_user_blk; +pub use vhost_user::{VhostUserConfig, VhostUserDevice, VhostUserType}; + use anyhow::{anyhow, Context, Result}; // Tips: diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user.rs index a105672d5..53258821c 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user.rs +++ 
b/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user.rs @@ -1,34 +1,69 @@ -// Copyright (c) 2019-2023 Alibaba Cloud -// Copyright (c) 2019-2023 Ant Group +// Copyright (c) 2022-2023 Alibaba Cloud +// Copyright (c) 2022-2023 Ant Group // // SPDX-License-Identifier: Apache-2.0 // -use crate::device::Device; -use crate::device::DeviceType; -use crate::Hypervisor as hypervisor; -use anyhow::Result; -use async_trait::async_trait; +#[derive(Debug, Clone)] +pub enum VhostUserType { + /// Blk - represents a block vhostuser device type + /// "vhost-user-blk-pci" + Blk(String), + + /// SCSI - represents SCSI based vhost-user type + /// "vhost-user-scsi-pci" + SCSI(String), + + /// Net - represents Net based vhost-user type + /// "virtio-net-pci" + Net(String), + + /// FS - represents a virtio-fs vhostuser device type + /// "vhost-user-fs-pci" + FS(String), +} + +impl Default for VhostUserType { + fn default() -> Self { + VhostUserType::Blk("vhost-user-blk-pci".to_owned()) + } +} #[derive(Debug, Clone, Default)] /// VhostUserConfig represents data shared by most vhost-user devices pub struct VhostUserConfig { - /// Device id + /// device id pub dev_id: String, - /// Socket path + /// socket path pub socket_path: String, - /// Mac_address is only meaningful for vhost user net device + /// mac_address is only meaningful for vhost user net device pub mac_address: String, - /// These are only meaningful for vhost user fs devices + + /// vhost-user-fs is only meaningful for vhost-user-fs device pub tag: String, - pub cache: String, - pub device_type: String, - /// Pci_addr is the PCI address used to identify the slot at which the drive is attached. 
- pub pci_addr: Option, - /// Block index of the device if assigned - pub index: u8, + /// vhost-user-fs cache mode + pub cache_mode: String, + /// vhost-user-fs cache size in MB pub cache_size: u32, - pub queue_siez: u32, + + /// vhost user device type + pub device_type: VhostUserType, + /// guest block driver + pub driver_option: String, + /// pci_addr is the PCI address used to identify the slot at which the drive is attached. + pub pci_addr: Option, + + /// Block index of the device if assigned + /// type u64 is not OK + pub index: u64, + + /// Virtio queue size. Size: byte + pub queue_size: u32, + /// Block device multi-queue + pub num_queues: usize, + + /// device path in guest + pub virt_path: String, } #[derive(Debug, Clone, Default)] @@ -36,26 +71,3 @@ pub struct VhostUserDevice { pub device_id: String, pub config: VhostUserConfig, } - -#[async_trait] -impl Device for VhostUserConfig { - async fn attach(&mut self, _h: &dyn hypervisor) -> Result<()> { - todo!() - } - - async fn detach(&mut self, _h: &dyn hypervisor) -> Result> { - todo!() - } - - async fn get_device_info(&self) -> DeviceType { - todo!() - } - - async fn increase_attach_count(&mut self) -> Result { - todo!() - } - - async fn decrease_attach_count(&mut self) -> Result { - todo!() - } -} diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user_blk.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user_blk.rs new file mode 100644 index 000000000..0912f89f1 --- /dev/null +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user_blk.rs @@ -0,0 +1,122 @@ +// Copyright (c) 2023 Alibaba Cloud +// Copyright (c) 2023 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::{anyhow, Context, Result}; +use async_trait::async_trait; + +use super::VhostUserConfig; +use crate::{ + device::{Device, DeviceType}, + Hypervisor as hypervisor, +}; + +#[derive(Debug, Clone, Default)] +pub struct VhostUserBlkDevice { + pub device_id: String, + + /// If set to 
true, the drive is opened in read-only mode. Otherwise, the + /// drive is opened as read-write. + pub is_readonly: bool, + + /// Don't close `path_on_host` file when dropping the device. + pub no_drop: bool, + + /// driver type for block device + pub driver_option: String, + + pub attach_count: u64, + pub config: VhostUserConfig, +} + +impl VhostUserBlkDevice { + // new creates a new VhostUserBlkDevice + pub fn new(device_id: String, config: VhostUserConfig) -> Self { + VhostUserBlkDevice { + device_id, + attach_count: 0, + config, + ..Default::default() + } + } +} + +#[async_trait] +impl Device for VhostUserBlkDevice { + async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> { + // increase attach count, skip attach the device if the device is already attached + if self + .increase_attach_count() + .await + .context("failed to increase attach count")? + { + return Ok(()); + } + + if let Err(e) = h.add_device(DeviceType::VhostUserBlk(self.clone())).await { + self.decrease_attach_count().await?; + + return Err(e); + } + + return Ok(()); + } + + async fn detach(&mut self, h: &dyn hypervisor) -> Result> { + // get the count of device detached, and detach once it reaches 0 + if self + .decrease_attach_count() + .await + .context("failed to decrease attach count")? 
+ { + return Ok(None); + } + + if let Err(e) = h + .remove_device(DeviceType::VhostUserBlk(self.clone())) + .await + { + self.increase_attach_count().await?; + + return Err(e); + } + + Ok(Some(self.config.index)) + } + + async fn get_device_info(&self) -> DeviceType { + DeviceType::VhostUserBlk(self.clone()) + } + + async fn increase_attach_count(&mut self) -> Result { + match self.attach_count { + 0 => { + // do real attach + self.attach_count += 1; + Ok(false) + } + std::u64::MAX => Err(anyhow!("device was attached too many times")), + _ => { + self.attach_count += 1; + Ok(true) + } + } + } + + async fn decrease_attach_count(&mut self) -> Result { + match self.attach_count { + 0 => Err(anyhow!("detaching a device that wasn't attached")), + 1 => { + // do real wrok + self.attach_count -= 1; + Ok(false) + } + _ => { + self.attach_count -= 1; + Ok(true) + } + } + } +} diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_blk.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_blk.rs index da5d50ea7..5091b688e 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_blk.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_blk.rs @@ -1,17 +1,18 @@ -// Copyright (c) 2019-2022 Alibaba Cloud -// Copyright (c) 2019-2022 Ant Group +// Copyright (c) 2022-2023 Alibaba Cloud +// Copyright (c) 2022-2023 Ant Group // // SPDX-License-Identifier: Apache-2.0 // -pub const VIRTIO_BLOCK_MMIO: &str = "virtio-blk-mmio"; use crate::device::Device; use crate::device::DeviceType; use crate::Hypervisor as hypervisor; use anyhow::{anyhow, Context, Result}; use async_trait::async_trait; + /// VIRTIO_BLOCK_PCI indicates block driver is virtio-pci based pub const VIRTIO_BLOCK_PCI: &str = "virtio-blk-pci"; +pub const VIRTIO_BLOCK_MMIO: &str = "virtio-blk-mmio"; pub const KATA_MMIO_BLK_DEV_TYPE: &str = "mmioblk"; pub const KATA_BLK_DEV_TYPE: &str = "blk"; diff --git a/src/runtime-rs/crates/hypervisor/src/device/mod.rs 
b/src/runtime-rs/crates/hypervisor/src/device/mod.rs index d4996a3e6..4da2218e4 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/mod.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/mod.rs @@ -6,10 +6,11 @@ use std::fmt; +use crate::device::driver::vhost_user_blk::VhostUserBlkDevice; use crate::{ BlockConfig, BlockDevice, HybridVsockConfig, HybridVsockDevice, Hypervisor as hypervisor, NetworkConfig, NetworkDevice, ShareFsDevice, ShareFsDeviceConfig, ShareFsMountConfig, - ShareFsMountDevice, VfioConfig, VfioDevice, VsockConfig, VsockDevice, + ShareFsMountDevice, VfioConfig, VfioDevice, VhostUserConfig, VsockConfig, VsockDevice, }; use anyhow::Result; use async_trait::async_trait; @@ -21,6 +22,7 @@ pub mod util; #[derive(Debug)] pub enum DeviceConfig { BlockCfg(BlockConfig), + VhostUserBlkCfg(VhostUserConfig), NetworkCfg(NetworkConfig), ShareFsCfg(ShareFsDeviceConfig), VfioCfg(VfioConfig), @@ -32,6 +34,7 @@ pub enum DeviceConfig { #[derive(Debug)] pub enum DeviceType { Block(BlockDevice), + VhostUserBlk(VhostUserBlkDevice), Vfio(VfioDevice), Network(NetworkDevice), ShareFs(ShareFsDevice), diff --git a/src/runtime-rs/crates/hypervisor/src/dragonball/inner_device.rs b/src/runtime-rs/crates/hypervisor/src/dragonball/inner_device.rs index 80b4f33d3..1bd8f2ecc 100644 --- a/src/runtime-rs/crates/hypervisor/src/dragonball/inner_device.rs +++ b/src/runtime-rs/crates/hypervisor/src/dragonball/inner_device.rs @@ -8,9 +8,12 @@ use std::path::PathBuf; use anyhow::{anyhow, Context, Result}; use dbs_utils::net::MacAddr; -use dragonball::api::v1::{ - BlockDeviceConfigInfo, FsDeviceConfigInfo, FsMountConfigInfo, VirtioNetDeviceConfigInfo, - VsockDeviceConfigInfo, +use dragonball::{ + api::v1::{ + BlockDeviceConfigInfo, FsDeviceConfigInfo, FsMountConfigInfo, VirtioNetDeviceConfigInfo, + VsockDeviceConfigInfo, + }, + device_manager::blk_dev_mgr::BlockDeviceType, }; use super::DragonballInner; @@ -56,6 +59,14 @@ impl DragonballInner { block.config.no_drop, ) .context("add 
block device"), + DeviceType::VhostUserBlk(block) => self + .add_block_device( + block.config.socket_path.as_str(), + block.device_id.as_str(), + block.is_readonly, + block.no_drop, + ) + .context("add vhost user based block device"), DeviceType::HybridVsock(hvsock) => self.add_hvsock(&hvsock.config).context("add vsock"), DeviceType::ShareFs(sharefs) => self .add_share_fs_device(&sharefs.config) @@ -161,6 +172,7 @@ impl DragonballInner { let blk_cfg = BlockDeviceConfigInfo { drive_id: id.to_string(), + device_type: BlockDeviceType::get_type(path), path_on_host: PathBuf::from(jailed_drive), is_direct: self.config.blockdev_info.block_device_cache_direct, no_drop, diff --git a/src/runtime-rs/crates/resource/src/manager.rs b/src/runtime-rs/crates/resource/src/manager.rs index d79de40cd..58a42db0f 100644 --- a/src/runtime-rs/crates/resource/src/manager.rs +++ b/src/runtime-rs/crates/resource/src/manager.rs @@ -35,19 +35,16 @@ pub struct ResourceManager { } impl ResourceManager { - pub fn new( + pub async fn new( sid: &str, agent: Arc, hypervisor: Arc, toml_config: Arc, ) -> Result { Ok(Self { - inner: Arc::new(RwLock::new(ResourceManagerInner::new( - sid, - agent, - hypervisor, - toml_config, - )?)), + inner: Arc::new(RwLock::new( + ResourceManagerInner::new(sid, agent, hypervisor, toml_config).await?, + )), }) } diff --git a/src/runtime-rs/crates/resource/src/manager_inner.rs b/src/runtime-rs/crates/resource/src/manager_inner.rs index 4744da40f..23426ccd0 100644 --- a/src/runtime-rs/crates/resource/src/manager_inner.rs +++ b/src/runtime-rs/crates/resource/src/manager_inner.rs @@ -51,15 +51,16 @@ pub(crate) struct ResourceManagerInner { } impl ResourceManagerInner { - pub(crate) fn new( + pub(crate) async fn new( sid: &str, agent: Arc, hypervisor: Arc, toml_config: Arc, ) -> Result { // create device manager - let dev_manager = - DeviceManager::new(hypervisor.clone()).context("failed to create device manager")?; + let dev_manager = DeviceManager::new(hypervisor.clone()) 
+ .await + .context("failed to create device manager")?; let cgroups_resource = CgroupsResource::new(sid, &toml_config)?; let cpu_resource = CpuResource::new(toml_config.clone())?; @@ -473,7 +474,9 @@ impl Persist for ResourceManagerInner { sid: resource_args.sid, agent: resource_args.agent, hypervisor: resource_args.hypervisor.clone(), - device_manager: Arc::new(RwLock::new(DeviceManager::new(resource_args.hypervisor)?)), + device_manager: Arc::new(RwLock::new( + DeviceManager::new(resource_args.hypervisor).await?, + )), network: None, share_fs: None, rootfs_resource: RootFsResource::new(), diff --git a/src/runtime-rs/crates/resource/src/volume/mod.rs b/src/runtime-rs/crates/resource/src/volume/mod.rs index 5a92c2db0..230b7098b 100644 --- a/src/runtime-rs/crates/resource/src/volume/mod.rs +++ b/src/runtime-rs/crates/resource/src/volume/mod.rs @@ -14,6 +14,9 @@ pub mod utils; pub mod vfio_volume; use vfio_volume::is_vfio_volume; +pub mod spdk_volume; +use spdk_volume::is_spdk_volume; + use std::{sync::Arc, vec::Vec}; use anyhow::{Context, Result}; @@ -84,6 +87,12 @@ impl VolumeResource { .await .with_context(|| format!("new vfio volume {:?}", m))?, ) + } else if is_spdk_volume(m) { + Arc::new( + spdk_volume::SPDKVolume::new(d, m, read_only, cid, sid) + .await + .with_context(|| format!("create spdk volume {:?}", m))?, + ) } else if let Some(options) = get_huge_page_option(m).context("failed to check huge page")? 
{ diff --git a/src/runtime-rs/crates/resource/src/volume/spdk_volume.rs b/src/runtime-rs/crates/resource/src/volume/spdk_volume.rs new file mode 100644 index 000000000..31076b8a3 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/volume/spdk_volume.rs @@ -0,0 +1,189 @@ +// Copyright (c) 2023 Alibaba Cloud +// Copyright (c) 2023 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::{anyhow, Context, Result}; +use async_trait::async_trait; +use nix::sys::{stat, stat::SFlag}; +use tokio::sync::RwLock; + +use super::Volume; +use crate::volume::utils::{ + generate_shared_path, volume_mount_info, DEFAULT_VOLUME_FS_TYPE, KATA_SPDK_VOLUME_TYPE, + KATA_SPOOL_VOLUME_TYPE, +}; +use hypervisor::{ + device::{ + device_manager::{do_handle_device, DeviceManager}, + DeviceConfig, DeviceType, + }, + VhostUserConfig, VhostUserType, +}; + +/// SPDKVolume: spdk block device volume +#[derive(Clone)] +pub(crate) struct SPDKVolume { + storage: Option, + mount: oci::Mount, + device_id: String, +} + +impl SPDKVolume { + pub(crate) async fn new( + d: &RwLock, + m: &oci::Mount, + read_only: bool, + cid: &str, + sid: &str, + ) -> Result { + let mnt_src: &str = &m.source; + + // deserde Information from mountinfo.json + let v = volume_mount_info(mnt_src).context("deserde information from mountinfo.json")?; + let device = match v.volume_type.as_str() { + KATA_SPDK_VOLUME_TYPE => { + if v.device.starts_with("spdk://") { + v.device.clone() + } else { + format!("spdk://{}", v.device.as_str()) + } + } + KATA_SPOOL_VOLUME_TYPE => { + if v.device.starts_with("spool://") { + v.device.clone() + } else { + format!("spool://{}", v.device.as_str()) + } + } + _ => return Err(anyhow!("mountinfo.json is invalid")), + }; + + // device format: X:///x/y/z.sock,so just unwrap it. + // if file is not S_IFSOCK, return error. 
+ { + // device tokens: (Type, Socket) + let device_tokens = device.split_once("://").unwrap(); + + let fstat = stat::stat(device_tokens.1).context("stat socket failed")?; + let s_flag = SFlag::from_bits_truncate(fstat.st_mode); + if s_flag != SFlag::S_IFSOCK { + return Err(anyhow!("device {:?} is not valid", device)); + } + } + + let mut vhu_blk_config = &mut VhostUserConfig { + socket_path: device, + device_type: VhostUserType::Blk("vhost-user-blk-pci".to_owned()), + ..Default::default() + }; + + if let Some(num) = v.metadata.get("num_queues") { + vhu_blk_config.num_queues = num + .parse::() + .context("num queues parse usize failed.")?; + } + if let Some(size) = v.metadata.get("queue_size") { + vhu_blk_config.queue_size = size + .parse::() + .context("num queues parse u32 failed.")?; + } + + // create and insert block device into Kata VM + let device_info = + do_handle_device(d, &DeviceConfig::VhostUserBlkCfg(vhu_blk_config.clone())) + .await + .context("do handle device failed.")?; + + // generate host guest shared path + let guest_path = generate_shared_path(m.destination.clone(), read_only, cid, sid) + .await + .context("generate host-guest shared path failed")?; + + // storage + let mut storage = agent::Storage { + mount_point: guest_path.clone(), + ..Default::default() + }; + + storage.options = if read_only { + vec!["ro".to_string()] + } else { + Vec::new() + }; + + let mut device_id = String::new(); + if let DeviceType::VhostUserBlk(device) = device_info { + // blk, mmioblk + storage.driver = device.config.driver_option; + // /dev/vdX + storage.source = device.config.virt_path; + device_id = device.device_id; + } + + if m.r#type != "bind" { + storage.fs_type = v.fs_type.clone(); + } else { + storage.fs_type = DEFAULT_VOLUME_FS_TYPE.to_string(); + } + + if m.destination.clone().starts_with("/dev") { + storage.fs_type = "bind".to_string(); + storage.options.append(&mut m.options.clone()); + } + + storage.fs_group = None; + let mount = oci::Mount { + 
destination: m.destination.clone(), + r#type: storage.fs_type.clone(), + source: guest_path, + options: m.options.clone(), + }; + + Ok(Self { + storage: Some(storage), + mount, + device_id, + }) + } +} + +#[async_trait] +impl Volume for SPDKVolume { + fn get_volume_mount(&self) -> Result> { + Ok(vec![self.mount.clone()]) + } + + fn get_storage(&self) -> Result> { + let s = if let Some(s) = self.storage.as_ref() { + vec![s.clone()] + } else { + vec![] + }; + + Ok(s) + } + + async fn cleanup(&self, device_manager: &RwLock) -> Result<()> { + device_manager + .write() + .await + .try_remove_device(&self.device_id) + .await + } + + fn get_device_id(&self) -> Result> { + Ok(Some(self.device_id.clone())) + } +} + +pub(crate) fn is_spdk_volume(m: &oci::Mount) -> bool { + // spdkvol or spoolvol will share the same implementation + let vol_types = vec![KATA_SPDK_VOLUME_TYPE, KATA_SPOOL_VOLUME_TYPE]; + if vol_types.contains(&m.r#type.as_str()) { + return true; + } + + false +} diff --git a/src/runtime-rs/crates/resource/src/volume/utils.rs b/src/runtime-rs/crates/resource/src/volume/utils.rs index 892724e1e..cffb839c8 100644 --- a/src/runtime-rs/crates/resource/src/volume/utils.rs +++ b/src/runtime-rs/crates/resource/src/volume/utils.rs @@ -20,6 +20,7 @@ pub const KATA_MOUNT_BIND_TYPE: &str = "bind"; pub const KATA_DIRECT_VOLUME_TYPE: &str = "directvol"; pub const KATA_VFIO_VOLUME_TYPE: &str = "vfiovol"; pub const KATA_SPDK_VOLUME_TYPE: &str = "spdkvol"; +pub const KATA_SPOOL_VOLUME_TYPE: &str = "spoolvol"; // volume mount info load infomation from mountinfo.json pub fn volume_mount_info(volume_path: &str) -> Result { diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/lib.rs b/src/runtime-rs/crates/runtimes/virt_container/src/lib.rs index 3d1381c71..9bfc72f5e 100644 --- a/src/runtime-rs/crates/runtimes/virt_container/src/lib.rs +++ b/src/runtime-rs/crates/runtimes/virt_container/src/lib.rs @@ -74,12 +74,8 @@ impl RuntimeHandler for VirtContainer { // get uds from 
hypervisor and get config from toml_config let agent = new_agent(&config).context("new agent")?; - let resource_manager = Arc::new(ResourceManager::new( - sid, - agent.clone(), - hypervisor.clone(), - config, - )?); + let resource_manager = + Arc::new(ResourceManager::new(sid, agent.clone(), hypervisor.clone(), config).await?); let pid = std::process::id(); let sandbox = sandbox::VirtSandbox::new(