diff --git a/docs/how-to/how-to-run-kata-containers-with-kinds-of-Block-Volumes.md b/docs/how-to/how-to-run-kata-containers-with-kinds-of-Block-Volumes.md index d4fd048ec9..43ca4f4b07 100644 --- a/docs/how-to/how-to-run-kata-containers-with-kinds-of-Block-Volumes.md +++ b/docs/how-to/how-to-run-kata-containers-with-kinds-of-Block-Volumes.md @@ -20,13 +20,13 @@ The JSON file `mountinfo.json` placed in a sub-path `/kubelet/kata-test-vol-001/ And the full path looks like: `/run/kata-containers/shared/direct-volumes/kubelet/kata-test-vol-001/volume001`, But for some security reasons. it is encoded as `/run/kata-containers/shared/direct-volumes/L2t1YmVsZXQva2F0YS10ZXN0LXZvbC0wMDEvdm9sdW1lMDAx`. -Finally, when running a Kata Containers witch `ctr run --mount type=X, src=Y, dst=Z,,options=rbind:rw`, the `type=X` should be specified a proprietary type specifically designed for some kind of volume. +Finally, when running a Kata container with `ctr run --mount type=X,src=Y,dst=Z,options=rbind:rw`, `type=X` must be set to the proprietary type designed for that kind of volume. 
Now, supported types: - `directvol` for direct volume - `spdkvol` for SPDK volume (TBD) -- `vfiovol` for VFIO device based volume (TBD) +- `vfiovol` for VFIO device based volume ## Setup Device and Run a Kata-Containers @@ -55,7 +55,7 @@ $ sudo mkfs.ext4 /tmp/stor/rawdisk01.20g ``` ```bash -$ sudo ./kata-ctl direct-volume add /kubelet/kata-direct-vol-002/directvol002 "{\"device\": \"/tmp/stor/rawdisk01.20g\", \"volume_type\": \"directvol\", \"fs_type\": \"ext4\", \"metadata\":"{}", \"options\": []}" +$ sudo kata-ctl direct-volume add /kubelet/kata-direct-vol-002/directvol002 "{\"device\": \"/tmp/stor/rawdisk01.20g\", \"volume_type\": \"directvol\", \"fs_type\": \"ext4\", \"metadata\":"{}", \"options\": []}" $# /kubelet/kata-direct-vol-002/directvol002 <==> /run/kata-containers/shared/direct-volumes/W1lMa2F0ZXQva2F0YS10a2F0DAxvbC0wMDEvdm9sdW1lMDAx $ cat W1lMa2F0ZXQva2F0YS10a2F0DAxvbC0wMDEvdm9sdW1lMDAx/mountInfo.json {"volume_type":"directvol","device":"/tmp/stor/rawdisk01.20g","fs_type":"ext4","metadata":{},"options":[]} @@ -65,14 +65,88 @@ $ cat W1lMa2F0ZXQva2F0YS10a2F0DAxvbC0wMDEvdm9sdW1lMDAx/mountInfo.json ```bash $ # type=disrectvol,src=/kubelet/kata-direct-vol-002/directvol002,dst=/disk002,options=rbind:rw -$sudo ctr run -t --rm --runtime io.containerd.kata.v2 --mount type=directvol,src=/kubelet/kata-direct-vol-002/directvol002,dst=/disk002,options=rbind:rw "$image" kata-direct-vol-xx05302045 /bin/bash +$ sudo ctr run -t --rm --runtime io.containerd.kata.v2 --mount type=directvol,src=/kubelet/kata-direct-vol-002/directvol002,dst=/disk002,options=rbind:rw "$image" kata-direct-vol-xx05302045 /bin/bash +``` + + +### VFIO Device Based Block Volume + +#### Create VFIO device based backend storage + +> **Tip:** Only `vfio-pci` based PCI device passthrough mode is supported. + +In this scenario, the device's host kernel driver is replaced by `vfio-pci`, and an IOMMU group ID is generated for it. 
+Either the device's BDF or its VFIO IOMMU group path under `/dev/vfio/` can be used as the `device` value in `mountinfo.json`. + +```bash +$ lspci -nn -k -s 45:00.1 +45:00.1 SCSI storage controller +... +Kernel driver in use: vfio-pci +... +$ ls /dev/vfio/110 +/dev/vfio/110 +$ ls /sys/kernel/iommu_groups/110/devices/ +0000:45:00.1 +``` + +#### Set up VFIO device for Kata Containers + +First, configure the `mountinfo.json`, as below: + +- (1) device with `BB:DD.F` + +```json +{ + "device": "45:00.1", + "volume_type": "vfiovol", + "fs_type": "ext4", + "metadata":"{}", + "options": [] +} +``` + +- (2) device with `DDDD:BB:DD.F` + +```json +{ + "device": "0000:45:00.1", + "volume_type": "vfiovol", + "fs_type": "ext4", + "metadata":"{}", + "options": [] +} +``` + +- (3) device with `/dev/vfio/X` + +```json +{ + "device": "/dev/vfio/110", + "volume_type": "vfiovol", + "fs_type": "ext4", + "metadata":"{}", + "options": [] +} +``` + +Second, register the volume with `kata-ctl`, using the device `/dev/vfio/110` as an example: + +```bash +$ sudo kata-ctl direct-volume add /kubelet/kata-vfio-vol-003/vfiovol003 "{\"device\": \"/dev/vfio/110\", \"volume_type\": \"vfiovol\", \"fs_type\": \"ext4\", \"metadata\":"{}", \"options\": []}" +$ # /kubelet/kata-vfio-vol-003/vfiovol003 <==> /run/kata-containers/shared/direct-volumes/F0va22F0ZvaS12F0YS10a2F0DAxvbC0F0ZXvdm9sdF0Z0YSx +$ cat F0va22F0ZvaS12F0YS10a2F0DAxvbC0F0ZXvdm9sdF0Z0YSx/mountInfo.json +{"volume_type":"vfiovol","device":"/dev/vfio/110","fs_type":"ext4","metadata":{},"options":[]} +``` + +#### Run a Kata container with VFIO block device based volume + +```bash +$ # type=vfiovol,src=/kubelet/kata-vfio-vol-003/vfiovol003,dst=/disk003,options=rbind:rw +$ sudo ctr run -t --rm --runtime io.containerd.kata.v2 --mount type=vfiovol,src=/kubelet/kata-vfio-vol-003/vfiovol003,dst=/disk003,options=rbind:rw "$image" kata-vfio-vol-xx05302245 /bin/bash ``` ### SPDK Device Based Volume TBD - -### VFIO Device Based Volume - -TBD \ No newline at end of file diff --git 
a/src/runtime-rs/crates/resource/src/volume/mod.rs b/src/runtime-rs/crates/resource/src/volume/mod.rs index 6fc6e3bc81..5a92c2db07 100644 --- a/src/runtime-rs/crates/resource/src/volume/mod.rs +++ b/src/runtime-rs/crates/resource/src/volume/mod.rs @@ -11,6 +11,9 @@ mod share_fs_volume; mod shm_volume; pub mod utils; +pub mod vfio_volume; +use vfio_volume::is_vfio_volume; + use std::{sync::Arc, vec::Vec}; use anyhow::{Context, Result}; @@ -75,6 +78,12 @@ impl VolumeResource { .await .with_context(|| format!("new share fs volume {:?}", m))?, ) + } else if is_vfio_volume(m) { + Arc::new( + vfio_volume::VfioVolume::new(d, m, read_only, cid, sid) + .await + .with_context(|| format!("new vfio volume {:?}", m))?, + ) } else if let Some(options) = get_huge_page_option(m).context("failed to check huge page")? { diff --git a/src/runtime-rs/crates/resource/src/volume/vfio_volume.rs b/src/runtime-rs/crates/resource/src/volume/vfio_volume.rs new file mode 100644 index 0000000000..dab8c7dae2 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/volume/vfio_volume.rs @@ -0,0 +1,141 @@ +// Copyright (c) 2023 Alibaba Cloud +// Copyright (c) 2023 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::{anyhow, Context, Result}; +use async_trait::async_trait; +use tokio::sync::RwLock; + +use super::Volume; +use crate::volume::utils::{ + generate_shared_path, volume_mount_info, DEFAULT_VOLUME_FS_TYPE, KATA_VFIO_VOLUME_TYPE, +}; +use hypervisor::{ + device::{ + device_manager::{do_handle_device, DeviceManager}, + DeviceConfig, DeviceType, + }, + get_vfio_device, VfioConfig, +}; + +pub(crate) struct VfioVolume { + storage: Option, + mount: oci::Mount, + device_id: String, +} + +// VfioVolume: a block volume backed by a VFIO device; new() hands the device to the device manager and builds the agent storage and OCI mount for it +impl VfioVolume { + pub(crate) async fn new( + d: &RwLock, + m: &oci::Mount, + read_only: bool, + cid: &str, + sid: &str, + ) -> Result { + let mnt_src: &str = &m.source; + + // deserialize the volume description from mountinfo.json, looked up by the mount source, and reject anything that is not a vfio volume + let v = 
volume_mount_info(mnt_src).context("deserde information from mountinfo.json")?; + if v.volume_type != KATA_VFIO_VOLUME_TYPE { + return Err(anyhow!("volume type is invalid")); + } + + // the device may be given as /dev/vfio/X, BB:DD.F or DDDD:BB:DD.F + let vfio_device = get_vfio_device(v.device).context("get vfio device failed.")?; + let vfio_dev_config = &mut VfioConfig { + host_path: vfio_device.clone(), + dev_type: "b".to_string(), + hostdev_prefix: "vfio_vol".to_owned(), + ..Default::default() + }; + + // create the device through the device manager and insert it into the Kata VM as a block device + let device_info = do_handle_device(d, &DeviceConfig::VfioCfg(vfio_dev_config.clone())) + .await + .context("do handle device failed.")?; + + // generate the host-guest shared path; it is used as the storage mount point and as the OCI mount source + let guest_path = generate_shared_path(m.destination.clone(), read_only, cid, sid) + .await + .context("generate host-guest shared path failed")?; + + let storage_options = if read_only { + vec!["ro".to_string()] + } else { + Vec::new() + }; + + let mut storage = agent::Storage { + options: storage_options, + mount_point: guest_path.clone(), + ..Default::default() + }; + + let mut device_id = String::new(); + if let DeviceType::Vfio(device) = device_info { + device_id = device.device_id; + storage.driver = device.driver_type; + // NOTE(review): unwrap() panics if virt_path is None; presumably do_handle_device always sets it for a Vfio device - confirm. Also, a non-Vfio device_info skips this branch and leaves device_id empty. 
+ storage.source = device.config.virt_path.unwrap().1; + } + + if m.r#type != "bind" { + storage.fs_type = v.fs_type.clone(); + } else { + storage.fs_type = DEFAULT_VOLUME_FS_TYPE.to_string(); + } + + let mount = oci::Mount { + destination: m.destination.clone(), + r#type: v.fs_type, + source: guest_path, + options: m.options.clone(), + }; + + Ok(Self { + storage: Some(storage), + mount, + device_id, + }) + } +} + +#[async_trait] +impl Volume for VfioVolume { + fn get_volume_mount(&self) -> Result> { + Ok(vec![self.mount.clone()]) + } + + fn get_storage(&self) -> Result> { + let s = if let Some(s) = self.storage.as_ref() { + vec![s.clone()] + } else { + vec![] + }; + + Ok(s) + } + + async fn cleanup(&self, device_manager: &RwLock) -> Result<()> { + device_manager + .write() + .await + .try_remove_device(&self.device_id) + .await + } + + fn get_device_id(&self) -> Result> { + Ok(Some(self.device_id.clone())) + } +} + +pub(crate) fn is_vfio_volume(m: &oci::Mount) -> bool { + if m.r#type == KATA_VFIO_VOLUME_TYPE { + return true; + } + + false +}