runtime-rs: add support vfio device based volume

Add a new option of using a VFIO device based volume for kata-containers.
With the help of kata-ctl direct-volume, users are able to add a
device specified by either its BDF or its IOMMU group ID.

To help users use it smoothly, a how-to document is added in
docs/how-to/how-to-run-kata-containers-with-kinds-of-Block-Volumes.

Fixes: #6525

Signed-off-by: alex.lyn <alex.lyn@antgroup.com>
This commit is contained in:
alex.lyn 2023-06-18 14:07:05 +08:00
parent 1e3b372bbb
commit 59510cfee0
3 changed files with 232 additions and 8 deletions

View File

@ -20,13 +20,13 @@ The JSON file `mountinfo.json` placed in a sub-path `/kubelet/kata-test-vol-001/
And the full path looks like: `/run/kata-containers/shared/direct-volumes/kubelet/kata-test-vol-001/volume001`, But for some security reasons. it is And the full path looks like: `/run/kata-containers/shared/direct-volumes/kubelet/kata-test-vol-001/volume001`, But for some security reasons. it is
encoded as `/run/kata-containers/shared/direct-volumes/L2t1YmVsZXQva2F0YS10ZXN0LXZvbC0wMDEvdm9sdW1lMDAx`. encoded as `/run/kata-containers/shared/direct-volumes/L2t1YmVsZXQva2F0YS10ZXN0LXZvbC0wMDEvdm9sdW1lMDAx`.
Finally, when running a Kata Containers witch `ctr run --mount type=X, src=Y, dst=Z,,options=rbind:rw`, the `type=X` should be specified a proprietary type specifically designed for some kind of volume. Finally, when running a Kata Containers with `ctr run --mount type=X, src=Y, dst=Z,,options=rbind:rw`, the `type=X` should be specified a proprietary type specifically designed for some kind of volume.
Now, supported types: Now, supported types:
- `directvol` for direct volume - `directvol` for direct volume
- `spdkvol` for SPDK volume (TBD) - `spdkvol` for SPDK volume (TBD)
- `vfiovol` for VFIO device based volume (TBD) - `vfiovol` for VFIO device based volume
## Setup Device and Run a Kata-Containers ## Setup Device and Run a Kata-Containers
@ -55,7 +55,7 @@ $ sudo mkfs.ext4 /tmp/stor/rawdisk01.20g
``` ```
```bash ```bash
$ sudo ./kata-ctl direct-volume add /kubelet/kata-direct-vol-002/directvol002 "{\"device\": \"/tmp/stor/rawdisk01.20g\", \"volume_type\": \"directvol\", \"fs_type\": \"ext4\", \"metadata\":"{}", \"options\": []}" $ sudo kata-ctl direct-volume add /kubelet/kata-direct-vol-002/directvol002 "{\"device\": \"/tmp/stor/rawdisk01.20g\", \"volume_type\": \"directvol\", \"fs_type\": \"ext4\", \"metadata\":"{}", \"options\": []}"
$# /kubelet/kata-direct-vol-002/directvol002 <==> /run/kata-containers/shared/direct-volumes/W1lMa2F0ZXQva2F0YS10a2F0DAxvbC0wMDEvdm9sdW1lMDAx $# /kubelet/kata-direct-vol-002/directvol002 <==> /run/kata-containers/shared/direct-volumes/W1lMa2F0ZXQva2F0YS10a2F0DAxvbC0wMDEvdm9sdW1lMDAx
$ cat W1lMa2F0ZXQva2F0YS10a2F0DAxvbC0wMDEvdm9sdW1lMDAx/mountInfo.json $ cat W1lMa2F0ZXQva2F0YS10a2F0DAxvbC0wMDEvdm9sdW1lMDAx/mountInfo.json
{"volume_type":"directvol","device":"/tmp/stor/rawdisk01.20g","fs_type":"ext4","metadata":{},"options":[]} {"volume_type":"directvol","device":"/tmp/stor/rawdisk01.20g","fs_type":"ext4","metadata":{},"options":[]}
@ -69,10 +69,84 @@ $sudo ctr run -t --rm --runtime io.containerd.kata.v2 --mount type=directvol,src
``` ```
### VFIO Device Based Block Volume
#### create VFIO device based backend storage
> **Tip:** It only supports `vfio-pci` based PCI device passthrough mode.
In this scenario, the device's host kernel driver will be replaced by `vfio-pci`, and an IOMMU group ID will be generated.
And either device's BDF or its VFIO IOMMU group ID in `/dev/vfio/` is fine for "device" in `mountinfo.json`.
```bash
$ lspci -nn -k -s 45:00.1
45:00.1 SCSI storage controller
...
Kernel driver in use: vfio-pci
...
$ ls /dev/vfio/110
/dev/vfio/110
$ ls /sys/kernel/iommu_groups/110/devices/
0000:45:00.1
```
#### setup VFIO device for kata-containers
First, configure the `mountinfo.json`, as below:
- (1) device with `BB:DD.F`
```json
{
"device": "45:00.1",
"volume_type": "vfiovol",
"fs_type": "ext4",
"metadata":"{}",
"options": []
}
```
- (2) device with `DDDD:BB:DD.F`
```json
{
"device": "0000:45:00.1",
"volume_type": "vfiovol",
"fs_type": "ext4",
"metadata":"{}",
"options": []
}
```
- (3) device with `/dev/vfio/X`
```json
{
"device": "/dev/vfio/110",
"volume_type": "vfiovol",
"fs_type": "ext4",
"metadata":"{}",
"options": []
}
```
Second, add the direct volume with `kata-ctl`, taking the device `/dev/vfio/110` as an example:
```bash
$ sudo kata-ctl direct-volume add /kubelet/kata-vfio-vol-003/vfiovol003 "{\"device\": \"/dev/vfio/110\", \"volume_type\": \"vfiovol\", \"fs_type\": \"ext4\", \"metadata\":"{}", \"options\": []}"
$ # /kubelet/kata-vfio-vol-003/vfiovol003 <==> /run/kata-containers/shared/direct-volumes/F0va22F0ZvaS12F0YS10a2F0DAxvbC0F0ZXvdm9sdF0Z0YSx
$ cat F0va22F0ZvaS12F0YS10a2F0DAxvbC0F0ZXvdm9sdF0Z0YSx/mountInfo.json
{"volume_type":"vfiovol","device":"/dev/vfio/110","fs_type":"ext4","metadata":{},"options":[]}
```
#### Run a Kata container with VFIO block device based volume
```bash
$ # type=vfiovol,src=/kubelet/kata-vfio-vol-003/vfiovol003,dst=/disk003,options=rbind:rw
$ sudo ctr run -t --rm --runtime io.containerd.kata.v2 --mount type=vfiovol,src=/kubelet/kata-vfio-vol-003/vfiovol003,dst=/disk003,options=rbind:rw "$image" kata-vfio-vol-xx05302245 /bin/bash
```
### SPDK Device Based Volume ### SPDK Device Based Volume
TBD TBD
### VFIO Device Based Volume
TBD

View File

@ -11,6 +11,9 @@ mod share_fs_volume;
mod shm_volume; mod shm_volume;
pub mod utils; pub mod utils;
pub mod vfio_volume;
use vfio_volume::is_vfio_volume;
use std::{sync::Arc, vec::Vec}; use std::{sync::Arc, vec::Vec};
use anyhow::{Context, Result}; use anyhow::{Context, Result};
@ -75,6 +78,12 @@ impl VolumeResource {
.await .await
.with_context(|| format!("new share fs volume {:?}", m))?, .with_context(|| format!("new share fs volume {:?}", m))?,
) )
} else if is_vfio_volume(m) {
Arc::new(
vfio_volume::VfioVolume::new(d, m, read_only, cid, sid)
.await
.with_context(|| format!("new vfio volume {:?}", m))?,
)
} else if let Some(options) = } else if let Some(options) =
get_huge_page_option(m).context("failed to check huge page")? get_huge_page_option(m).context("failed to check huge page")?
{ {

View File

@ -0,0 +1,141 @@
// Copyright (c) 2023 Alibaba Cloud
// Copyright (c) 2023 Ant Group
//
// SPDX-License-Identifier: Apache-2.0
//
use anyhow::{anyhow, Context, Result};
use async_trait::async_trait;
use tokio::sync::RwLock;
use super::Volume;
use crate::volume::utils::{
generate_shared_path, volume_mount_info, DEFAULT_VOLUME_FS_TYPE, KATA_VFIO_VOLUME_TYPE,
};
use hypervisor::{
device::{
device_manager::{do_handle_device, DeviceManager},
DeviceConfig, DeviceType,
},
get_vfio_device, VfioConfig,
};
/// A volume backed by a VFIO device that is handed to the device manager.
pub(crate) struct VfioVolume {
// Storage description returned by `get_storage`; `new` always fills it in.
storage: Option<agent::Storage>,
// OCI mount returned by `get_volume_mount`.
mount: oci::Mount,
// Identifier of the handled device; `cleanup` passes it to `try_remove_device`.
device_id: String,
}
// VfioVolume: vfio device based block volume
impl VfioVolume {
    /// Builds a VFIO device based block volume for the mount `m`.
    ///
    /// Loads the volume description referenced by `m.source`, resolves the VFIO
    /// device it names, hands that device to the device manager `d`, and prepares
    /// the agent storage and the OCI mount that describe the volume.
    ///
    /// # Errors
    ///
    /// Returns an error when:
    /// - the mount info cannot be loaded,
    /// - its volume type is not `KATA_VFIO_VOLUME_TYPE`,
    /// - the VFIO device cannot be resolved or handled by the device manager,
    /// - the shared path cannot be generated,
    /// - the handled VFIO device carries no guest virt path.
    pub(crate) async fn new(
        d: &RwLock<DeviceManager>,
        m: &oci::Mount,
        read_only: bool,
        cid: &str,
        sid: &str,
    ) -> Result<Self> {
        let mnt_src: &str = &m.source;

        // deserialize information from mountinfo.json
        let v = volume_mount_info(mnt_src).context("deserde information from mountinfo.json")?;
        if v.volume_type != KATA_VFIO_VOLUME_TYPE {
            return Err(anyhow!("volume type is invalid"));
        }

        // support both /dev/vfio/X and BDF<DDDD:BB:DD.F> or BDF<BB:DD.F>
        let vfio_device = get_vfio_device(v.device).context("get vfio device failed.")?;
        // The config is only needed once, so it is built by value and moved into
        // the device request instead of being borrowed mutably and cloned.
        let vfio_dev_config = VfioConfig {
            host_path: vfio_device,
            dev_type: "b".to_string(),
            hostdev_prefix: "vfio_vol".to_owned(),
            ..Default::default()
        };

        // create and insert block device into Kata VM
        let device_info = do_handle_device(d, &DeviceConfig::VfioCfg(vfio_dev_config))
            .await
            .context("do handle device failed.")?;

        // generate host guest shared path
        let guest_path = generate_shared_path(m.destination.clone(), read_only, cid, sid)
            .await
            .context("generate host-guest shared path failed")?;

        let storage_options = if read_only {
            vec!["ro".to_string()]
        } else {
            Vec::new()
        };

        let mut storage = agent::Storage {
            options: storage_options,
            mount_point: guest_path.clone(),
            ..Default::default()
        };

        // NOTE(review): any other device type leaves `device_id` and the storage
        // source empty — confirm whether that case should be reported as an error.
        let mut device_id = String::new();
        if let DeviceType::Vfio(device) = device_info {
            device_id = device.device_id;
            storage.driver = device.driver_type;
            // A missing virt path is reported as an error instead of panicking.
            storage.source = device
                .config
                .virt_path
                .context("vfio device has no guest virt path")?
                .1;
        }

        storage.fs_type = if m.r#type != "bind" {
            v.fs_type.clone()
        } else {
            DEFAULT_VOLUME_FS_TYPE.to_string()
        };

        let mount = oci::Mount {
            destination: m.destination.clone(),
            r#type: v.fs_type,
            source: guest_path,
            options: m.options.clone(),
        };

        Ok(Self {
            storage: Some(storage),
            mount,
            device_id,
        })
    }
}
#[async_trait]
impl Volume for VfioVolume {
    /// Returns the single OCI mount prepared for this volume.
    fn get_volume_mount(&self) -> Result<Vec<oci::Mount>> {
        let mounts = vec![self.mount.clone()];
        Ok(mounts)
    }

    /// Returns the agent storage of this volume, or an empty list when none is set.
    fn get_storage(&self) -> Result<Vec<agent::Storage>> {
        // An `Option` iterates over zero or one element, which maps directly
        // onto the empty / single-entry result.
        Ok(self.storage.iter().cloned().collect())
    }

    /// Asks the device manager, under its write lock, to remove this volume's device.
    async fn cleanup(&self, device_manager: &RwLock<DeviceManager>) -> Result<()> {
        device_manager
            .write()
            .await
            .try_remove_device(&self.device_id)
            .await
    }

    /// Returns a copy of the identifier of the device backing this volume.
    fn get_device_id(&self) -> Result<Option<String>> {
        let id = self.device_id.clone();
        Ok(Some(id))
    }
}
/// Reports whether the mount `m` is declared with the VFIO volume type.
pub(crate) fn is_vfio_volume(m: &oci::Mount) -> bool {
    m.r#type == KATA_VFIO_VOLUME_TYPE
}