Merge pull request #10146 from Apokleos/intro-cdi

Introduce cdi in runtime-rs
This commit is contained in:
Alex Lyn 2024-09-23 21:45:42 +08:00 committed by GitHub
commit 6b94cc47a8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 593 additions and 22 deletions

View File

@ -42,7 +42,7 @@ pub struct StringUser {
pub additional_gids: Vec<String>,
}
#[derive(PartialEq, Clone, Default)]
#[derive(PartialEq, Clone, Debug, Default)]
pub struct Device {
pub id: String,
pub field_type: String,

View File

@ -115,12 +115,12 @@ pub enum VfioDeviceType {
Mediated,
}
// DeviceVendor represents a PCI device's device id and vendor id
// DeviceVendor: (device, vendor)
// DeviceVendorClass represents a PCI device's deviceID, vendorID and classID
// DeviceVendorClass: (device, vendor, class)
#[derive(Clone, Debug)]
pub struct DeviceVendor(String, String);
pub struct DeviceVendorClass(String, String, String);
impl DeviceVendor {
impl DeviceVendorClass {
pub fn get_device_vendor(&self) -> Result<(u32, u32)> {
// default value is 0 when vendor_id or device_id is empty
if self.0.is_empty() || self.1.is_empty() {
@ -142,6 +142,10 @@ impl DeviceVendor {
Ok((device, vendor))
}
pub fn get_vendor_class_id(&self) -> Result<(&str, &str)> {
Ok((&self.1, &self.2))
}
pub fn get_device_vendor_id(&self) -> Result<u32> {
let (device, vendor) = self
.get_device_vendor()
@ -163,8 +167,8 @@ pub struct HostDevice {
/// PCI device information (BDF): "bus:slot:function"
pub bus_slot_func: String,
/// device_vendor: device id and vendor id
pub device_vendor: Option<DeviceVendor>,
/// device_vendor_class: (device, vendor, class)
pub device_vendor_class: Option<DeviceVendorClass>,
/// type of vfio device
pub vfio_type: VfioDeviceType,
@ -336,13 +340,14 @@ impl VfioDevice {
}
// read device, vendor and class from /sys/bus/pci/devices/BDF/X
fn get_vfio_device_vendor(&self, bdf: &str) -> Result<DeviceVendor> {
fn get_vfio_device_vendor_class(&self, bdf: &str) -> Result<DeviceVendorClass> {
let device =
get_device_property(bdf, "device").context("get device from syspath failed")?;
let vendor =
get_device_property(bdf, "vendor").context("get vendor from syspath failed")?;
let class = get_device_property(bdf, "class").context("get class from syspath failed")?;
Ok(DeviceVendor(device, vendor))
Ok(DeviceVendorClass(device, vendor, class))
}
fn set_vfio_config(
@ -356,13 +361,13 @@ impl VfioDevice {
// It's safe as BDF really exists.
let dev_bdf = vfio_dev_details.0.unwrap();
let dev_vendor = self
.get_vfio_device_vendor(&dev_bdf)
let dev_vendor_class = self
.get_vfio_device_vendor_class(&dev_bdf)
.context("get property device and vendor failed")?;
let vfio_dev = HostDevice {
bus_slot_func: dev_bdf.clone(),
device_vendor: Some(dev_vendor),
device_vendor_class: Some(dev_vendor_class),
sysfs_path: vfio_dev_details.1,
vfio_type: vfio_dev_details.2,
..Default::default()

View File

@ -147,8 +147,8 @@ impl DragonballInner {
// And the first one is Primary device.
// safe here, devices is not empty.
let primary_device = device.devices.first_mut().unwrap();
let vendor_device_id = if let Some(vd) = primary_device.device_vendor.as_ref() {
vd.get_device_vendor_id()?
let vendor_device_id = if let Some(vdc) = primary_device.device_vendor_class.as_ref() {
vdc.get_device_vendor_id()?
} else {
0
};

View File

@ -0,0 +1,475 @@
//
// Copyright (c) 2024 Ant Group
//
// SPDX-License-Identifier: Apache-2.0
//
use std::collections::HashMap;
use std::path::Path;
use anyhow::Result;
use oci_spec::runtime::Spec;
use super::{resolve_cdi_device_kind, ContainerDevice};
use agent::types::Device;
const CDI_PREFIX: &str = "cdi.k8s.io";
// Sort the devices based on the first element's PCI_Guest_Path in the PCI bus according to options.
fn sort_devices_by_guest_pcipath(devices: &mut [ContainerDevice]) {
// Extract first guest_pcipath from device_options
let extract_first_guest_pcipath = |options: &[String]| -> Option<String> {
options
.first()
.and_then(|option| option.split('=').nth(1))
.map(|path| path.to_string())
};
devices.sort_by(|a, b| {
let guest_path_a = extract_first_guest_pcipath(&a.device.options);
let guest_path_b = extract_first_guest_pcipath(&b.device.options);
guest_path_a.cmp(&guest_path_b)
});
}
/// Annotates the OCI spec with CDI entries for the devices of a container.
///
/// Every element of `container_devices` contributes a clone of its agent [`Device`] to the
/// returned list, in input order. Elements that carry `device_info` and whose vendor/class
/// pair resolves to a CDI kind through `resolve_cdi_device_kind` additionally get one
/// annotation per entry of `device.options`:
///
/// * key: `cdi.k8s.io/vfio<X>.<y>` — `<X>` is the file name of `device.container_path`
///   (the code treats it as the IOMMU group id) and `<y>` is the position of the option
///   within the device;
/// * value: `<kind>=<n>` — `<n>` is the number of options held by the devices of the same
///   kind that sort before this one (see `sort_devices_by_guest_pcipath`).
///
/// Annotations already present in `spec` are kept; if `spec` has none, an empty map is
/// installed first. Devices whose `container_path` has no UTF-8 file name get no annotation.
///
/// # Errors
///
/// The current implementation always returns `Ok`.
pub fn annotate_container_devices(
    spec: &mut Spec,
    container_devices: Vec<ContainerDevice>,
) -> Result<Vec<Device>> {
    // Make sure that annotations is Some() so that entries can be inserted below.
    if spec.annotations().is_none() {
        spec.set_annotations(Some(HashMap::new()));
    }

    // Step 1: every device is handed to the agent, whether it gets annotated or not.
    let devices_agent: Vec<Device> = container_devices
        .iter()
        .map(|container_device| container_device.device.clone())
        .collect();

    // Step 2: group the devices that resolve to a CDI kind by that kind. Devices without
    // device_info, or with an unknown vendor/class pair, are left out.
    let mut grouped_devices: HashMap<String, Vec<ContainerDevice>> = HashMap::new();
    for device in container_devices {
        let kind = device
            .device_info
            .as_ref()
            .and_then(|info| resolve_cdi_device_kind(&info.vendor_id, &info.class_id))
            .map(str::to_owned);
        if let Some(kind) = kind {
            grouped_devices.entry(kind).or_default().push(device);
        }
    }

    // Step 3: within each kind, sort by guest_pcipath and emit the annotations.
    for (vendor_class, group) in grouped_devices.iter_mut() {
        sort_devices_by_guest_pcipath(group);

        // Number of options seen so far in this kind. It is the value written for every
        // option of the current device and grows by the device's option count afterwards.
        // Counting upwards (rather than adding `len - 1` to an enumerate index) keeps a
        // device with an empty option list from underflowing the counter.
        let mut next_index: usize = 0;
        for container_device in group.iter() {
            let total_of = container_device.device.options.len();
            let iommu_grpid = Path::new(&container_device.device.container_path)
                .file_name()
                .and_then(|name| name.to_str());

            if let Some(iommu_grpid) = iommu_grpid {
                let annotations = spec
                    .annotations_mut()
                    .as_mut()
                    .expect("annotations are initialised at the top of this function");
                for index in 0..total_of {
                    annotations.insert(
                        format!("{}/vfio{}.{}", CDI_PREFIX, iommu_grpid, index), // cdi.k8s.io/vfioX.y
                        format!("{}={}", vendor_class, next_index), // vendor/class=name
                    );
                }
            }

            next_index += total_of;
        }
    }

    Ok(devices_agent)
}
// Unit tests for the guest-PCI-path sort and for the CDI annotation of OCI specs.
#[cfg(test)]
mod tests {
use std::path::PathBuf;
use crate::cdi_devices::DeviceInfo;
use agent::types::Device;
use oci_spec::runtime::SpecBuilder;
use super::*;
// Three devices whose first option carries the paths ...03, ...01, ...02 must come out
// ordered by that path, which is checked through their host_path (device1, device2, device3).
#[test]
fn test_sort_devices_by_guest_pcipath() {
let mut devices = vec![
ContainerDevice {
device_info: Some(DeviceInfo {
vendor_id: "0xffff".to_string(),
class_id: "0x030x".to_string(),
host_path: PathBuf::from("/dev/device3"),
}),
device: Device {
options: vec!["pci_host_path03=BB:DD03.F03".to_string()],
..Default::default()
},
},
ContainerDevice {
device_info: Some(DeviceInfo {
vendor_id: "0xffff".to_string(),
class_id: "0x030x".to_string(),
host_path: PathBuf::from("/dev/device1"),
}),
device: Device {
options: vec!["pci_host_path01=BB:DD01.F01".to_string()],
..Default::default()
},
},
ContainerDevice {
device_info: Some(DeviceInfo {
vendor_id: "0xffff".to_string(),
class_id: "0x030x".to_string(),
host_path: PathBuf::from("/dev/device2"),
}),
device: Device {
options: vec!["pci_host_path02=BB:DD02.F02".to_string()],
..Default::default()
},
},
];
sort_devices_by_guest_pcipath(&mut devices);
let expected_devices_order = vec![
"/dev/device1".to_string(),
"/dev/device2".to_string(),
"/dev/device3".to_string(),
];
let actual_devices_order: Vec<String> = devices
.iter()
.map(|cd| {
cd.device_info
.as_ref()
.unwrap()
.host_path
.display()
.to_string()
})
.collect();
assert_eq!(actual_devices_order, expected_devices_order);
}
// Sorting must cope with a device whose option list is empty. The assertion only
// collects the paths of devices that have a first option, so it checks that the sort
// completes and the single path survives, not where the option-less device ends up.
#[test]
fn test_sort_devices_with_empty_options() {
let mut devices = vec![
ContainerDevice {
device_info: Some(DeviceInfo {
vendor_id: "0xffff".to_string(),
class_id: "0x030x".to_string(),
host_path: PathBuf::from("/dev/device1"),
}),
device: Device {
options: vec![], // empty
..Default::default()
},
},
ContainerDevice {
device_info: Some(DeviceInfo {
vendor_id: "0xffff".to_string(),
class_id: "0x030x".to_string(),
host_path: PathBuf::from("/dev/device2"),
}),
device: Device {
options: vec!["pci_host_path02=BB:DD02.F02".to_string()],
..Default::default()
},
},
];
sort_devices_by_guest_pcipath(&mut devices);
// As the first device has no options, ignore it.
let expected_devices_order = vec!["BB:DD02.F02".to_string()];
let actual_devices_order: Vec<String> = devices
.iter()
.filter_map(|d| d.device.options.first())
.map(|option| option.split('=').nth(1).unwrap_or("").to_string())
.collect();
assert_eq!(actual_devices_order, expected_devices_order);
}
// Two block devices without device_info surround three devices with vendor 0x1002 and
// class 0x0302. Only the latter are annotated, as "amd.com/gpu", and they are numbered
// 0..2 in the order of their option paths (device1, device2, device3).
#[test]
fn test_annotate_container_devices() {
let devices = vec![
ContainerDevice {
device_info: None,
device: Device {
id: "test0000x".to_string(),
container_path: "/dev/xvdx".to_string(),
field_type: "virtio-blk".to_string(),
vm_path: "/dev/vdx".to_string(),
options: vec![],
},
},
ContainerDevice {
device_info: Some(DeviceInfo {
vendor_id: "0x1002".to_string(),
class_id: "0x0302".to_string(),
host_path: PathBuf::from("/dev/device2"),
}),
device: Device {
container_path: "/dev/device2".to_string(),
options: vec!["pci_host_path02=BB:DD02.F02".to_string()],
..Default::default()
},
},
ContainerDevice {
device_info: Some(DeviceInfo {
vendor_id: "0x1002".to_string(),
class_id: "0x0302".to_string(),
host_path: PathBuf::from("/dev/device3"),
}),
device: Device {
container_path: "/dev/device3".to_string(),
options: vec!["pci_host_path03=BB:DD03.F03".to_string()],
..Default::default()
},
},
ContainerDevice {
device_info: Some(DeviceInfo {
vendor_id: "0x1002".to_string(),
class_id: "0x0302".to_string(),
host_path: PathBuf::from("/dev/device1"),
}),
device: Device {
container_path: "/dev/device1".to_string(),
options: vec!["pci_host_path01=BB:DD01.F01".to_string()],
..Default::default()
},
},
ContainerDevice {
device_info: None,
device: Device {
id: "test0000yx".to_string(),
container_path: "/dev/xvdyx".to_string(),
field_type: "virtio-blk".to_string(),
vm_path: "/dev/vdyx".to_string(),
options: vec![],
},
},
];
// Start from a spec whose annotation map exists but is empty.
let annotations = HashMap::new();
let mut spec = SpecBuilder::default()
.annotations(annotations)
.build()
.unwrap();
// do annotate container devices
let _devices = annotate_container_devices(&mut spec, devices);
// Keys use the file name of container_path plus the option position (always 0 here).
let expected_annotations: HashMap<String, String> = vec![
(
"cdi.k8s.io/vfiodevice3.0".to_owned(),
"amd.com/gpu=2".to_owned(),
),
(
"cdi.k8s.io/vfiodevice1.0".to_owned(),
"amd.com/gpu=0".to_owned(),
),
(
"cdi.k8s.io/vfiodevice2.0".to_owned(),
"amd.com/gpu=1".to_owned(),
),
]
.into_iter()
.collect();
assert_eq!(Some(expected_annotations), spec.annotations().clone());
}
// Devices of two vendors (0x10de and 0x8086) are annotated as "nvidia.com/gpu" and
// "intel.com/gpu" respectively, and each kind is numbered independently starting at 0.
#[test]
fn test_annotate_container_multi_vendor_devices() {
let devices = vec![
ContainerDevice {
device_info: None,
device: Device {
id: "test0000x".to_string(),
container_path: "/dev/xvdx".to_string(),
field_type: "virtio-blk".to_string(),
vm_path: "/dev/vdx".to_string(),
options: vec![],
},
},
ContainerDevice {
device_info: Some(DeviceInfo {
vendor_id: "0x10de".to_string(),
class_id: "0x0302".to_string(),
host_path: PathBuf::from("/dev/device2"),
}),
device: Device {
container_path: "/dev/device2".to_string(),
options: vec!["pci_host_path02=BB:DD02.F02".to_string()],
..Default::default()
},
},
ContainerDevice {
device_info: Some(DeviceInfo {
vendor_id: "0x10de".to_string(),
class_id: "0x0302".to_string(),
host_path: PathBuf::from("/dev/device3"),
}),
device: Device {
container_path: "/dev/device3".to_string(),
options: vec!["pci_host_path03=BB:DD03.F03".to_string()],
..Default::default()
},
},
ContainerDevice {
device_info: Some(DeviceInfo {
vendor_id: "0x8086".to_string(),
class_id: "0x0302".to_string(),
host_path: PathBuf::from("/dev/device1"),
}),
device: Device {
container_path: "/dev/device1".to_string(),
options: vec!["pci_host_path01=BB:DD01.F01".to_string()],
..Default::default()
},
},
ContainerDevice {
device_info: Some(DeviceInfo {
vendor_id: "0x8086".to_string(),
class_id: "0x0302".to_string(),
host_path: PathBuf::from("/dev/device4"),
}),
device: Device {
container_path: "/dev/device4".to_string(),
options: vec!["pci_host_path04=BB:DD01.F04".to_string()],
..Default::default()
},
},
ContainerDevice {
device_info: None,
device: Device {
id: "test0000yx".to_string(),
container_path: "/dev/xvdyx".to_string(),
field_type: "virtio-blk".to_string(),
vm_path: "/dev/vdyx".to_string(),
options: vec![],
},
},
];
// Start from a spec whose annotation map exists but is empty.
let annotations = HashMap::new();
let mut spec = SpecBuilder::default()
.annotations(annotations)
.build()
.unwrap();
let _devices = annotate_container_devices(&mut spec, devices);
// Within each kind the index follows the option path order: device1 < device4 for
// intel, device2 < device3 for nvidia.
let expected_annotations: HashMap<String, String> = vec![
(
"cdi.k8s.io/vfiodevice1.0".to_owned(),
"intel.com/gpu=0".to_owned(),
),
(
"cdi.k8s.io/vfiodevice2.0".to_owned(),
"nvidia.com/gpu=0".to_owned(),
),
(
"cdi.k8s.io/vfiodevice3.0".to_owned(),
"nvidia.com/gpu=1".to_owned(),
),
(
"cdi.k8s.io/vfiodevice4.0".to_owned(),
"intel.com/gpu=1".to_owned(),
),
]
.into_iter()
.collect();
assert_eq!(Some(expected_annotations), spec.annotations().clone());
}
// With only devices that have no device_info, the pre-existing annotation must be left
// untouched and the returned agent devices must equal the input devices, in order.
#[test]
fn test_annotate_container_without_vfio_devices() {
let devices = vec![
ContainerDevice {
device_info: None,
device: Device {
id: "test0000x".to_string(),
container_path: "/dev/xvdx".to_string(),
field_type: "virtio-blk".to_string(),
vm_path: "/dev/vdx".to_string(),
options: vec![],
},
},
ContainerDevice {
device_info: None,
device: Device {
id: "test0000y".to_string(),
container_path: "/dev/yvdy".to_string(),
field_type: "virtio-blk".to_string(),
vm_path: "/dev/vdy".to_string(),
options: vec![],
},
},
ContainerDevice {
device_info: None,
device: Device {
id: "test0000z".to_string(),
container_path: "/dev/zvdz".to_string(),
field_type: "virtio-blk".to_string(),
vm_path: "/dev/zvdz".to_string(),
options: vec![],
},
},
];
// Seed the spec with one unrelated annotation that must survive the call.
let annotations = HashMap::from([(
"cdi.k8s.io/vfiodeviceX".to_owned(),
"katacontainer.com/device=Y".to_owned(),
)]);
let mut spec = SpecBuilder::default()
.annotations(annotations)
.build()
.unwrap();
// do annotate container devices
let annotated_devices = annotate_container_devices(&mut spec, devices.clone()).unwrap();
let actual_devices = devices
.iter()
.map(|d| d.device.clone())
.collect::<Vec<Device>>();
let expected_annotations: HashMap<String, String> = HashMap::from([(
"cdi.k8s.io/vfiodeviceX".to_owned(),
"katacontainer.com/device=Y".to_owned(),
)]);
assert_eq!(Some(expected_annotations), spec.annotations().clone());
assert_eq!(annotated_devices, actual_devices);
}
}

View File

@ -0,0 +1,63 @@
//
// Copyright (c) 2024 Ant Group
//
// SPDX-License-Identifier: Apache-2.0
//
pub mod container_device;
use agent::types::Device;
use std::collections::HashMap;
use std::path::PathBuf;
/// Host-side identification of a device that is passed through to a container.
#[derive(Clone, Debug, Default)]
pub struct DeviceInfo {
    /// PCI class id string of the device; used together with `vendor_id` to resolve
    /// the CDI kind.
    pub class_id: String,
    /// PCI vendor id string of the device.
    pub vendor_id: String,
    /// Path of the device on the host.
    pub host_path: PathBuf,
}

/// A device handed to the agent, optionally paired with the host information needed to
/// annotate it for CDI.
#[derive(Clone, Debug, Default)]
pub struct ContainerDevice {
    /// Host-side vendor/class/path information; `None` for devices that are not annotated.
    pub device_info: Option<DeviceInfo>,
    /// The device description that is sent to the agent.
    pub device: Device,
}
lazy_static! {
    // *CDI_DEVICE_KIND_TABLE* is a static hash map from a "<vendor id>-<class id prefix>" key
    // to the matching CDI "<vendor>/<class>" kind string. It is the lookup table used to pick
    // the CDI kind for a device from its vendor and class information.
    // Note: the list is not exhaustive; entries are added as new devices need to be supported.
    pub static ref CDI_DEVICE_KIND_TABLE: HashMap<&'static str, &'static str> = HashMap::from([
        ("0x10de-0x030", "nvidia.com/gpu"),
        ("0x8086-0x030", "intel.com/gpu"),
        ("0x1002-0x030", "amd.com/gpu"),
        ("0x15b3-0x020", "nvidia.com/nic"),
        // TODO: it will be updated as required.
    ]);
}
/// Returns `device_options` ordered by the PCI path each entry carries.
///
/// The path of an entry is the text between its first and second `=`
/// (for `"key=BB:DD.F"` that is `"BB:DD.F"`). Entries without `=` have no path and are
/// placed before all entries that have one. The sort is stable.
pub fn sort_options_by_pcipath(mut device_options: Vec<String>) -> Vec<String> {
    // Compare on slices borrowed from the entries themselves.
    fn pcipath(option: &str) -> Option<&str> {
        option.split('=').nth(1)
    }

    device_options.sort_by(|lhs, rhs| pcipath(lhs).cmp(&pcipath(rhs)));
    device_options
}
/// Resolves the CDI kind (for example `"nvidia.com/gpu"`) for a device.
///
/// `vendor_id` and `class_id` are joined as `"<vendor_id>-<class_id>"` and the first
/// 12 bytes of that string (for example `"0x10de-0x030"`) are looked up in
/// `CDI_DEVICE_KIND_TABLE`.
///
/// Returns `None` when the table has no entry for the key, and also when the joined string
/// is shorter than 12 bytes or cannot be cut at byte 12 (for instance when either id is
/// empty), instead of panicking on the slice.
pub fn resolve_cdi_device_kind<'a>(vendor_id: &'a str, class_id: &'a str) -> Option<&'a str> {
    // Length of a table key: "0x" + 4 hex digits, '-', "0x" + 3 hex digits.
    const KIND_KEY_LEN: usize = 12;

    let vendor_class = format!("{}-{}", vendor_id, class_id);
    // `str::get` yields None for an out-of-range or non-char-boundary cut.
    vendor_class
        .get(..KIND_KEY_LEN)
        .and_then(|key| CDI_DEVICE_KIND_TABLE.get(key).copied())
}

View File

@ -23,6 +23,7 @@ pub mod rootfs;
pub mod share_fs;
pub mod volume;
pub use manager::ResourceManager;
pub mod cdi_devices;
pub mod cpu_mem;
use kata_types::config::hypervisor::SharedFsInfo;

View File

@ -6,7 +6,6 @@
use std::sync::Arc;
use agent::types::Device;
use agent::{Agent, Storage};
use anyhow::Result;
use async_trait::async_trait;
@ -20,6 +19,7 @@ use persist::sandbox_persist::Persist;
use tokio::sync::RwLock;
use tracing::instrument;
use crate::cdi_devices::ContainerDevice;
use crate::cpu_mem::initial_size::InitialSizeManager;
use crate::network::NetworkConfig;
use crate::resource_persist::ResourceState;
@ -116,7 +116,7 @@ impl ResourceManager {
inner.handler_volumes(cid, spec).await
}
pub async fn handler_devices(&self, cid: &str, linux: &Linux) -> Result<Vec<Device>> {
pub async fn handler_devices(&self, cid: &str, linux: &Linux) -> Result<Vec<ContainerDevice>> {
let inner = self.inner.read().await;
inner.handler_devices(cid, linux).await
}

View File

@ -25,6 +25,7 @@ use persist::sandbox_persist::Persist;
use tokio::{runtime, sync::RwLock};
use crate::{
cdi_devices::{sort_options_by_pcipath, ContainerDevice, DeviceInfo},
cgroups::{CgroupArgs, CgroupsResource},
cpu_mem::{cpu::CpuResource, initial_size::InitialSizeManager, mem::MemResource},
manager::ManagerArgs,
@ -292,7 +293,7 @@ impl ResourceManagerInner {
.await
}
pub async fn handler_devices(&self, _cid: &str, linux: &Linux) -> Result<Vec<Device>> {
pub async fn handler_devices(&self, _cid: &str, linux: &Linux) -> Result<Vec<ContainerDevice>> {
let mut devices = vec![];
let linux_devices = linux.devices().clone().unwrap_or_default();
@ -329,7 +330,10 @@ impl ResourceManagerInner {
vm_path: device.config.virt_path,
..Default::default()
};
devices.push(agent_device);
devices.push(ContainerDevice {
device_info: None,
device: agent_device,
});
}
}
LinuxDeviceType::C => {
@ -361,14 +365,33 @@ impl ResourceManagerInner {
// create agent device
if let DeviceType::Vfio(device) = device_info {
let device_options = sort_options_by_pcipath(device.device_options);
let agent_device = Device {
id: device.device_id, // just for kata-agent
container_path: d.path().display().to_string().clone(),
field_type: vfio_mode,
options: device.device_options,
options: device_options,
..Default::default()
};
devices.push(agent_device);
let vendor_class = device
.devices
.first()
.unwrap()
.device_vendor_class
.as_ref()
.unwrap()
.get_vendor_class_id()
.context("get vendor class failed")?;
let device_info = Some(DeviceInfo {
vendor_id: vendor_class.0.to_owned(),
class_id: vendor_class.1.to_owned(),
host_path: d.path().clone(),
});
devices.push(ContainerDevice {
device_info,
device: agent_device,
});
}
}
_ => {

View File

@ -21,7 +21,9 @@ use kata_types::k8s;
use oci_spec::runtime as oci;
use oci::{LinuxResources, Process as OCIProcess};
use resource::{ResourceManager, ResourceUpdateOp};
use resource::{
cdi_devices::container_device::annotate_container_devices, ResourceManager, ResourceUpdateOp,
};
use tokio::sync::RwLock;
use super::{
@ -174,10 +176,12 @@ impl Container {
.as_ref()
.context("OCI spec missing linux field")?;
let devices_agent = self
let container_devices = self
.resource_manager
.handler_devices(&config.container_id, linux)
.await?;
let devices_agent = annotate_container_devices(&mut spec, container_devices)
.context("annotate container devices failed")?;
// update vcpus, mems and host cgroups
let resources = self