diff --git a/src/libs/kata-types/src/config/hypervisor/mod.rs b/src/libs/kata-types/src/config/hypervisor/mod.rs
index d00d2533db..b571e26c6f 100644
--- a/src/libs/kata-types/src/config/hypervisor/mod.rs
+++ b/src/libs/kata-types/src/config/hypervisor/mod.rs
@@ -492,6 +492,38 @@ impl DeviceInfo {
     }
 }
 
+/// Virtual machine PCIe Topology configuration.
+#[derive(Clone, Debug, Default)]
+pub struct TopologyConfigInfo {
+    /// Hypervisor name
+    pub hypervisor_name: String,
+    /// Device Info
+    pub device_info: DeviceInfo,
+}
+
+impl TopologyConfigInfo {
+    /// Initialize the topology config info from the toml config
+    pub fn new(toml_config: &TomlConfig) -> Option<Self> {
+        // Firecracker does not support PCIe devices, so we should not initialize a PCIe topology for it.
+        // If the firecracker case is hit, just return None.
+        let hypervisor_names = [
+            HYPERVISOR_NAME_QEMU,
+            HYPERVISOR_NAME_CH,
+            HYPERVISOR_NAME_DRAGONBALL,
+        ];
+        let hypervisor_name = toml_config.runtime.hypervisor_name.as_str();
+        if !hypervisor_names.contains(&hypervisor_name) {
+            return None;
+        }
+
+        let hv = toml_config.hypervisor.get(hypervisor_name)?;
+        Some(Self {
+            hypervisor_name: hypervisor_name.to_string(),
+            device_info: hv.device_info.clone(),
+        })
+    }
+}
+
 /// Configuration information for virtual machine.
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct MachineInfo {
diff --git a/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs b/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs
index 76eaba4e1d..e5ec6276f5 100644
--- a/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs
+++ b/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs
@@ -8,6 +8,7 @@ use std::{collections::HashMap, sync::Arc};
 
 use anyhow::{anyhow, Context, Result};
 use kata_sys_util::rand::RandomBytes;
+use kata_types::config::hypervisor::TopologyConfigInfo;
 use tokio::sync::{Mutex, RwLock};
 
 use crate::{
@@ -18,6 +19,7 @@ use crate::{
 };
 
 use super::{
+    topology::PCIeTopology,
     util::{get_host_path, get_virt_drive_name, DEVICE_TYPE_BLOCK},
     Device, DeviceConfig, DeviceType,
 };
@@ -93,15 +95,20 @@ pub struct DeviceManager {
     devices: HashMap<String, ArcMutexDevice>,
     hypervisor: Arc<dyn Hypervisor>,
     shared_info: SharedInfo,
+    pcie_topology: Option<PCIeTopology>,
 }
 
 impl DeviceManager {
-    pub async fn new(hypervisor: Arc<dyn Hypervisor>) -> Result<Self> {
+    pub async fn new(
+        hypervisor: Arc<dyn Hypervisor>,
+        topo_config: Option<&TopologyConfigInfo>,
+    ) -> Result<Self> {
         let devices = HashMap::<String, ArcMutexDevice>::new();
         Ok(DeviceManager {
            devices,
            hypervisor,
            shared_info: SharedInfo::new().await,
+           pcie_topology: PCIeTopology::new(topo_config),
        })
    }
 
@@ -119,9 +126,12 @@ impl DeviceManager {
             .devices
             .get(device_id)
             .context("failed to find device")?;
+
         let mut device_guard = device.lock().await;
         // attach device
-        let result = device_guard.attach(self.hypervisor.as_ref()).await;
+        let result = device_guard
+            .attach(&mut self.pcie_topology.as_mut(), self.hypervisor.as_ref())
+            .await;
         // handle attach error
         if let Err(e) = result {
             match device_guard.get_device_info().await {
@@ -161,7 +171,10 @@ impl DeviceManager {
     pub async fn try_remove_device(&mut self, device_id: &str) -> Result<()> {
         if let Some(dev) = self.devices.get(device_id) {
             let mut device_guard = dev.lock().await;
-            let result = match device_guard.detach(self.hypervisor.as_ref()).await {
+            let result = match device_guard
+                .detach(&mut self.pcie_topology.as_mut(), self.hypervisor.as_ref())
+                .await
+            {
                 Ok(index) => {
                     if let Some(i) = index {
                         // release the declared device index
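
The constructor change above is easiest to read from a call site. A minimal sketch, assuming a qemu-style `TomlConfig` and an already-built hypervisor handle; the helper name `build_device_manager` is illustrative only and not part of the patch:

```rust
use std::sync::Arc;

use anyhow::Result;
use kata_types::config::{hypervisor::TopologyConfigInfo, TomlConfig};

// `Hypervisor` and `DeviceManager` are the crate's own types touched by this patch.
async fn build_device_manager(
    hypervisor: Arc<dyn Hypervisor>,
    toml_config: &TomlConfig,
) -> Result<DeviceManager> {
    // Returns None for VMMs without PCIe support (e.g. firecracker); DeviceManager
    // then simply carries `pcie_topology: None` and the topology macros become no-ops.
    let topo_config = TopologyConfigInfo::new(toml_config);
    DeviceManager::new(hypervisor, topo_config.as_ref()).await
}
```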
@@ -599,6 +612,7 @@ mod tests {
         BlockConfig, KATA_BLK_DEV_TYPE,
     };
     use anyhow::{anyhow, Context, Result};
+    use kata_types::config::hypervisor::TopologyConfigInfo;
     use std::sync::Arc;
     use tests_utils::load_test_config;
     use tokio::sync::RwLock;
@@ -606,6 +620,7 @@ async fn new_device_manager() -> Result<Arc<RwLock<DeviceManager>>> {
         let hypervisor_name: &str = "qemu";
         let toml_config = load_test_config(hypervisor_name.to_owned())?;
+        let topo_config = TopologyConfigInfo::new(&toml_config);
         let hypervisor_config = toml_config
             .hypervisor
             .get(hypervisor_name)
@@ -617,7 +632,7 @@
             .await;
 
         let dm = Arc::new(RwLock::new(
-            DeviceManager::new(Arc::new(hypervisor))
+            DeviceManager::new(Arc::new(hypervisor), topo_config.as_ref())
                 .await
                 .context("device manager")?,
         ));
diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio.rs
index aac879a594..7d8abd2e09 100644
--- a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio.rs
+++ b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio.rs
@@ -5,27 +5,24 @@
 //
 
 use std::{
-    collections::HashMap,
     fs,
     path::{Path, PathBuf},
     process::Command,
-    sync::{
-        atomic::{AtomicU8, Ordering},
-        Arc, RwLock,
-    },
 };
 
 use anyhow::{anyhow, Context, Result};
 use async_trait::async_trait;
-use lazy_static::lazy_static;
 use path_clean::PathClean;
 
 use kata_sys_util::fs::get_base_name;
 
-use crate::device::{
-    hypervisor,
-    pci_path::{PciPath, PciSlot},
-    Device, DeviceType,
+use crate::{
+    device::{
+        pci_path::PciPath,
+        topology::{do_add_pcie_endpoint, PCIeTopology},
+        Device, DeviceType, PCIeDevice,
+    },
+    register_pcie_device, unregister_pcie_device, update_pcie_device, Hypervisor as hypervisor,
 };
 
 pub const SYS_BUS_PCI_DRIVER_PROBE: &str = "/sys/bus/pci/drivers_probe";
@@ -43,35 +40,6 @@ const INTEL_IOMMU_PREFIX: &str = "dmar";
 const AMD_IOMMU_PREFIX: &str = "ivhd";
 const ARM_IOMMU_PREFIX: &str = "smmu";
 
-lazy_static! {
-    static ref GUEST_DEVICE_ID: Arc<AtomicU8> = Arc::new(AtomicU8::new(0_u8));
-    static ref HOST_GUEST_MAP: Arc<RwLock<HashMap<String, String>>> =
-        Arc::new(RwLock::new(HashMap::new()));
-}
-
-// map host/guest bdf and the mapping saved into `HOST_GUEST_MAP`,
-// and return PciPath.
-pub fn generate_guest_pci_path(bdf: String) -> Result<PciPath> {
-    let hg_map = HOST_GUEST_MAP.clone();
-    let current_id = GUEST_DEVICE_ID.clone();
-
-    current_id.fetch_add(1, Ordering::SeqCst);
-    let slot = current_id.load(Ordering::SeqCst);
-
-    // In some Hypervisors, dragonball, cloud-hypervisor or firecracker,
-    // the device is directly connected to the bus without intermediary bus.
-    // FIXME: Qemu's pci path needs to be implemented;
-    let host_bdf = normalize_device_bdf(bdf.as_str());
-    let guest_bdf = format!("0000:00:{:02x}.0", slot);
-
-    // safe, just do unwrap as `HOST_GUEST_MAP` is always valid.
-    hg_map.write().unwrap().insert(host_bdf, guest_bdf);
-
-    Ok(PciPath {
-        slots: vec![PciSlot::new(slot)],
-    })
-}
-
 pub fn do_check_iommu_on() -> Result<bool> {
     let element = std::fs::read_dir(SYS_CLASS_IOMMU)?
         .filter_map(|e| e.ok())
@@ -476,7 +444,13 @@ impl VfioDevice {
 
 #[async_trait]
 impl Device for VfioDevice {
-    async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> {
+    async fn attach(
+        &mut self,
+        pcie_topo: &mut Option<&mut PCIeTopology>,
+        h: &dyn hypervisor,
+    ) -> Result<()> {
+        register_pcie_device!(self, pcie_topo)?;
+
         if self
             .increase_attach_count()
             .await
@@ -494,37 +468,23 @@ impl Device for VfioDevice {
                     self.devices = vfio.devices;
                 }
 
-                if self.bus_mode == VfioBusMode::PCI {
-                    for hostdev in self.devices.iter_mut() {
-                        if hostdev.guest_pci_path.is_none() {
-                            // guest_pci_path may be empty for certain hypervisors such as
-                            // dragonball
-                            hostdev.guest_pci_path = Some(
-                                generate_guest_pci_path(hostdev.bus_slot_func.clone())
-                                    .map_err(|e| anyhow!("generate pci path failed: {:?}", e))?,
-                            );
-                        }
-
-                        // Safe to call unwrap here because of previous assignment.
-                        let pci_path = hostdev.guest_pci_path.clone().unwrap();
-                        self.device_options.push(format!(
-                            "0000:{}={}",
-                            hostdev.bus_slot_func.clone(),
-                            pci_path.to_string()
-                        ));
-                    }
-                }
+                update_pcie_device!(self, pcie_topo)?;
 
                 Ok(())
             }
             Err(e) => {
                 self.decrease_attach_count().await?;
+                unregister_pcie_device!(self, pcie_topo)?;
                 return Err(e);
             }
         }
     }
 
-    async fn detach(&mut self, h: &dyn hypervisor) -> Result<Option<u64>> {
+    async fn detach(
+        &mut self,
+        pcie_topo: &mut Option<&mut PCIeTopology>,
+        h: &dyn hypervisor,
+    ) -> Result<Option<u64>> {
         if self
             .decrease_attach_count()
             .await
@@ -545,6 +505,8 @@ impl Device for VfioDevice {
             None
         };
 
+        unregister_pcie_device!(self, pcie_topo)?;
+
         Ok(device_index)
     }
 
@@ -588,6 +550,48 @@ impl Device for VfioDevice {
     }
 }
 
+#[async_trait]
+impl PCIeDevice for VfioDevice {
+    async fn register(&mut self, pcie_topo: &mut PCIeTopology) -> Result<()> {
+        if self.bus_mode != VfioBusMode::PCI {
+            return Ok(());
+        }
+
+        self.device_options.clear();
+        for hostdev in self.devices.iter_mut() {
+            let pci_path = do_add_pcie_endpoint(
+                self.device_id.clone(),
+                hostdev.guest_pci_path.clone(),
+                pcie_topo,
+            )
+            .context(format!(
+                "add pcie endpoint for host device {:?} in PCIe Topology failed",
+                self.device_id
+            ))?;
+            hostdev.guest_pci_path = Some(pci_path.clone());
+
+            self.device_options.push(format!(
+                "0000:{}={}",
+                hostdev.bus_slot_func,
+                pci_path.to_string()
+            ));
+        }
+
+        Ok(())
+    }
+
+    async fn unregister(&mut self, pcie_topo: &mut PCIeTopology) -> Result<()> {
+        if let Some(_slot) = pcie_topo.remove_device(&self.device_id.clone()) {
+            Ok(())
+        } else {
+            Err(anyhow!(
+                "vfio device with {:?} not found.",
+                self.device_id.clone()
+            ))
+        }
+    }
+}
+
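A hedged sketch of the register/unregister lifecycle this impl enables (not part of the patch; it assumes a qemu-style `TopologyConfigInfo` and an already-configured `VfioDevice`, and drives the trait methods directly the way the `register_pcie_device!`/`unregister_pcie_device!` macros do during attach/detach):

```rust
use anyhow::Result;
use kata_types::config::hypervisor::TopologyConfigInfo;

use crate::device::topology::PCIeTopology;

async fn vfio_topology_round_trip(
    mut dev: VfioDevice,
    topo_config: &TopologyConfigInfo,
) -> Result<()> {
    if let Some(mut topo) = PCIeTopology::new(Some(topo_config)) {
        // Allocates a root-bus slot per host function and records
        // "0000:<host bus:slot.func>=<guest pci path>" entries in device_options.
        dev.register(&mut topo).await?;

        // ... hot-plug the device into the VM, run the workload ...

        // Frees the slot so a later hotplug can reuse it.
        dev.unregister(&mut topo).await?;
    }
    Ok(())
}
```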
 // binds the device to vfio driver after unbinding from host.
 // Will be called by a network interface or a generic pcie device.
 pub fn bind_device_to_vfio(bdf: &str, host_driver: &str, _vendor_device_id: &str) -> Result<()> {
diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user_blk.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user_blk.rs
index 5150f19563..b2a1d90f92 100644
--- a/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user_blk.rs
+++ b/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user_blk.rs
@@ -9,7 +9,7 @@ use async_trait::async_trait;
 
 use super::VhostUserConfig;
 use crate::{
-    device::{Device, DeviceType},
+    device::{topology::PCIeTopology, Device, DeviceType},
     Hypervisor as hypervisor,
 };
 
@@ -45,7 +45,11 @@ impl VhostUserBlkDevice {
 
 #[async_trait]
 impl Device for VhostUserBlkDevice {
-    async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> {
+    async fn attach(
+        &mut self,
+        _pcie_topo: &mut Option<&mut PCIeTopology>,
+        h: &dyn hypervisor,
+    ) -> Result<()> {
         // increase attach count, skip attach the device if the device is already attached
         if self
             .increase_attach_count()
@@ -64,7 +68,11 @@ impl Device for VhostUserBlkDevice {
         return Ok(());
     }
 
-    async fn detach(&mut self, h: &dyn hypervisor) -> Result<Option<u64>> {
+    async fn detach(
+        &mut self,
+        _pcie_topo: &mut Option<&mut PCIeTopology>,
+        h: &dyn hypervisor,
+    ) -> Result<Option<u64>> {
         // get the count of device detached, and detach once it reaches 0
         if self
             .decrease_attach_count()
diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user_net.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user_net.rs
index dd31da8697..a28f969852 100644
--- a/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user_net.rs
+++ b/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user_net.rs
@@ -4,6 +4,7 @@
 use anyhow::{Context, Result};
 use async_trait::async_trait;
 
+use crate::device::topology::PCIeTopology;
 use crate::device::{Device, DeviceType};
 use crate::{Hypervisor, VhostUserConfig};
 
@@ -22,14 +23,22 @@ impl VhostUserNetDevice {
 
 #[async_trait]
 impl Device for VhostUserNetDevice {
-    async fn attach(&mut self, h: &dyn Hypervisor) -> Result<()> {
+    async fn attach(
+        &mut self,
+        _pcie_topo: &mut Option<&mut PCIeTopology>,
+        h: &dyn Hypervisor,
+    ) -> Result<()> {
         h.add_device(DeviceType::VhostUserNetwork(self.clone()))
             .await
             .context("add vhost-user-net device to hypervisor")?;
         Ok(())
     }
 
-    async fn detach(&mut self, h: &dyn Hypervisor) -> Result<Option<u64>> {
+    async fn detach(
+        &mut self,
+        _pcie_topo: &mut Option<&mut PCIeTopology>,
+        h: &dyn Hypervisor,
+    ) -> Result<Option<u64>> {
         h.remove_device(DeviceType::VhostUserNetwork(self.clone()))
             .await
             .context("remove vhost-user-net device from hypervisor")?;
diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_blk.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_blk.rs
index f16fe8eb04..50d8179200 100644
--- a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_blk.rs
+++ b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_blk.rs
@@ -5,6 +5,7 @@
 //
 
 use crate::device::pci_path::PciPath;
+use crate::device::topology::PCIeTopology;
 use crate::device::Device;
 use crate::device::DeviceType;
 use crate::Hypervisor as hypervisor;
@@ -73,7 +74,11 @@ impl BlockDevice {
 
 #[async_trait]
 impl Device for BlockDevice {
-    async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> {
+    async fn attach(
+        &mut self,
+        _pcie_topo: &mut Option<&mut PCIeTopology>,
+        h: &dyn hypervisor,
+    ) -> Result<()> {
         // increase attach count, skip attach the device if the device is already attached
         if self
             .increase_attach_count()
@@ -98,7 +103,11 @@ impl Device for BlockDevice {
         }
     }
 
-    async fn detach(&mut self, h: &dyn hypervisor) -> Result<Option<u64>> {
+    async fn detach(
+        &mut self,
+        _pcie_topo: &mut Option<&mut PCIeTopology>,
+        h: &dyn hypervisor,
+    ) -> Result<Option<u64>> {
         // get the count of device detached, skip detach once it reaches the 0
         if self
             .decrease_attach_count()
diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_fs.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_fs.rs
index e968606de3..c06498b24c 100644
--- a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_fs.rs
+++ b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_fs.rs
@@ -7,7 +7,7 @@
 use anyhow::{Context, Result};
 use async_trait::async_trait;
 
-use crate::device::{hypervisor, Device, DeviceType};
+use crate::device::{hypervisor, topology::PCIeTopology, Device, DeviceType};
 
 #[derive(Copy, Clone, Debug, Default)]
 pub enum ShareFsMountOperation {
@@ -99,7 +99,11 @@ impl ShareFsDevice {
 
 #[async_trait]
 impl Device for ShareFsDevice {
-    async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> {
+    async fn attach(
+        &mut self,
+        _pcie_topo: &mut Option<&mut PCIeTopology>,
+        h: &dyn hypervisor,
+    ) -> Result<()> {
         h.add_device(DeviceType::ShareFs(self.clone()))
             .await
             .context("add share-fs device.")?;
@@ -107,7 +111,11 @@ impl Device for ShareFsDevice {
         Ok(())
     }
 
-    async fn detach(&mut self, _h: &dyn hypervisor) -> Result<Option<u64>> {
+    async fn detach(
+        &mut self,
+        _pcie_topo: &mut Option<&mut PCIeTopology>,
+        _h: &dyn hypervisor,
+    ) -> Result<Option<u64>> {
         // no need to detach share-fs device
 
         Ok(None)
diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_net.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_net.rs
index bc17e3f21a..4462761bed 100644
--- a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_net.rs
+++ b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_net.rs
@@ -9,6 +9,7 @@ use std::fmt;
 
 use anyhow::{Context, Result};
 use async_trait::async_trait;
 
+use crate::device::topology::PCIeTopology;
 use crate::device::{Device, DeviceType};
 use crate::Hypervisor as hypervisor;
 
@@ -70,7 +71,11 @@ impl NetworkDevice {
 
 #[async_trait]
 impl Device for NetworkDevice {
-    async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> {
+    async fn attach(
+        &mut self,
+        _pcie_topo: &mut Option<&mut PCIeTopology>,
+        h: &dyn hypervisor,
+    ) -> Result<()> {
         h.add_device(DeviceType::Network(self.clone()))
             .await
             .context("add network device.")?;
@@ -78,7 +83,11 @@ impl Device for NetworkDevice {
         return Ok(());
     }
 
-    async fn detach(&mut self, h: &dyn hypervisor) -> Result<Option<u64>> {
+    async fn detach(
+        &mut self,
+        _pcie_topo: &mut Option<&mut PCIeTopology>,
+        h: &dyn hypervisor,
+    ) -> Result<Option<u64>> {
         h.remove_device(DeviceType::Network(self.clone()))
             .await
             .context("remove network device.")?;
diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_vsock.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_vsock.rs
index 0c37fb18dd..efafd191fd 100644
--- a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_vsock.rs
+++ b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_vsock.rs
@@ -12,7 +12,7 @@ use tokio::fs::{File, OpenOptions};
 use async_trait::async_trait;
 
 use crate::{
-    device::{Device, DeviceType},
+    device::{topology::PCIeTopology, Device, DeviceType},
     Hypervisor as hypervisor,
 };
 
@@ -49,7 +49,11 @@ impl HybridVsockDevice {
 
 #[async_trait]
 impl Device for HybridVsockDevice {
-    async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> {
+    async fn attach(
+        &mut self,
+        _pcie_topo: &mut Option<&mut PCIeTopology>,
+        h: &dyn hypervisor,
+    ) -> Result<()> {
         h.add_device(DeviceType::HybridVsock(self.clone()))
             .await
             .context("add hybrid vsock device.")?;
@@ -57,7 +61,11 @@ impl Device for HybridVsockDevice {
         return Ok(());
     }
 
-    async fn detach(&mut self, _h: &dyn hypervisor) -> Result<Option<u64>> {
+    async fn detach(
+        &mut self,
+        _pcie_topo: &mut Option<&mut PCIeTopology>,
+        _h: &dyn hypervisor,
+    ) -> Result<Option<u64>> {
         // no need to do detach, just return Ok(None)
         Ok(None)
     }
@@ -135,7 +143,11 @@ impl VsockDevice {
 
 #[async_trait]
 impl Device for VsockDevice {
-    async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> {
+    async fn attach(
+        &mut self,
+        _pcie_topo: &mut Option<&mut PCIeTopology>,
+        h: &dyn hypervisor,
+    ) -> Result<()> {
         h.add_device(DeviceType::Vsock(self.clone()))
             .await
             .context("add vsock device.")?;
@@ -143,7 +155,11 @@ impl Device for VsockDevice {
         return Ok(());
     }
 
-    async fn detach(&mut self, _h: &dyn hypervisor) -> Result<Option<u64>> {
+    async fn detach(
+        &mut self,
+        _pcie_topo: &mut Option<&mut PCIeTopology>,
+        _h: &dyn hypervisor,
+    ) -> Result<Option<u64>> {
         // no need to do detach, just return Ok(None)
         Ok(None)
     }
diff --git a/src/runtime-rs/crates/hypervisor/src/device/mod.rs b/src/runtime-rs/crates/hypervisor/src/device/mod.rs
index 4b170de52d..49b1ff844a 100644
--- a/src/runtime-rs/crates/hypervisor/src/device/mod.rs
+++ b/src/runtime-rs/crates/hypervisor/src/device/mod.rs
@@ -15,9 +15,12 @@ use crate::{
 };
 use anyhow::Result;
 use async_trait::async_trait;
+use self::topology::PCIeTopology;
+
 pub mod device_manager;
 pub mod driver;
 pub mod pci_path;
+pub mod topology;
 pub mod util;
 
 #[derive(Debug)]
@@ -53,9 +56,17 @@ impl fmt::Display for DeviceType {
 #[async_trait]
 pub trait Device: std::fmt::Debug + Send + Sync {
     // attach is to plug device into VM
-    async fn attach(&mut self, h: &dyn hypervisor) -> Result<()>;
+    async fn attach(
+        &mut self,
+        pcie_topo: &mut Option<&mut PCIeTopology>,
+        h: &dyn hypervisor,
+    ) -> Result<()>;
     // detach is to unplug device from VM
-    async fn detach(&mut self, h: &dyn hypervisor) -> Result<Option<u64>>;
+    async fn detach(
+        &mut self,
+        pcie_topo: &mut Option<&mut PCIeTopology>,
+        h: &dyn hypervisor,
+    ) -> Result<Option<u64>>;
     // update is to do update for some device
     async fn update(&mut self, h: &dyn hypervisor) -> Result<()>;
     // get_device_info returns device config
@@ -71,3 +82,11 @@ pub trait Device: std::fmt::Debug + Send + Sync {
     // * err error: error while do decrease attach count
     async fn decrease_attach_count(&mut self) -> Result<bool>;
 }
+
+#[async_trait]
+pub trait PCIeDevice: std::fmt::Debug + Send + Sync {
+    // register pcie device into PCIe Topology for virtio-pci device or PCI/PCIe device.
+    async fn register(&mut self, topology: &mut PCIeTopology) -> Result<()>;
+    // unregister pcie device from PCIe Topology
+    async fn unregister(&mut self, topology: &mut PCIeTopology) -> Result<()>;
+}
diff --git a/src/runtime-rs/crates/hypervisor/src/device/topology.rs b/src/runtime-rs/crates/hypervisor/src/device/topology.rs
new file mode 100644
index 0000000000..a1ab607f40
--- /dev/null
+++ b/src/runtime-rs/crates/hypervisor/src/device/topology.rs
@@ -0,0 +1,366 @@
+//
+// Copyright (c) 2019-2023 Ant Group
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/*
+The design originates from https://github.com/qemu/qemu/blob/master/docs/pcie.txt
+
+In order to better support the PCIe topologies of different VMMs, we adopt a layered approach.
+The first layer is the base layer (the flattened PCIe topology), which consists mainly of the root bus.
+It is used by VMMs that only support attaching devices directly to the root bus.
+However, not all VMMs have such simple PCIe topologies. For example, Qemu, which can fully simulate
+the PCIe topology of the host, has a complex PCIe topology. In this case, we need to add PCIe Root Port,
+PCIe Switch, and PCIe-PCI Bridge or pxb-pcie on top of the base layer; this is the complex PCIe topology.
+
+The design graphs are as below:
+
+(1) The flattened PCIe Topology
+pcie.0 bus (Root Complex)
+----------------------------------------------------------------------------
+|    |    |    |    |    |    |    |    |    |    |    |    |    |    | .. |
+--|--------------------|------------------|-------------------------|-------
+  |                    |                  |                         |
+  V                    V                  V                         V
+-----------        -----------        -----------        -----------
+| PCI Dev |        | PCI Dev |        | PCI Dev |        | PCI Dev |
+-----------        -----------        -----------        -----------
+
+(2) The Complex PCIe Topology (to be implemented when Qemu is ready in runtime-rs)
+pcie.0 bus (Root Complex)
+----------------------------------------------------------------------------
+|    |    |    |    |    |    |    |    |    |    |    |    |    |    | .. |
+------|----------------|--------------------------------------|-------------
+      |                |                                      |
+      V                V                                      V
+ -------------    -------------                          -------------
+ | Root Port |    | Root Port |                          | Root Port |
+ -------------    -------------                          -------------
+       |                                                       |
+       |                       --------------------------------|-----------------
+ ------------                  |            -----------------                   |
+ | PCIe Dev |                  | PCI Express| Upstream Port |                   |
+ ------------                  | Switch     -----------------                   |
+                               |            |               |                   |
+                               | -------------------  -------------------       |
+                               | | Downstream Port |  | Downstream Port |       |
+                               | -------------------  -------------------       |
+                               --------------|-----------------------|-----------
+                                        ------------
+                                        | PCIe Dev |
+                                        ------------
+*/
+
+use std::collections::{hash_map::Entry, HashMap};
+
+use anyhow::{anyhow, Result};
+
+use crate::device::pci_path::PciSlot;
+use kata_types::config::hypervisor::TopologyConfigInfo;
+
+use super::pci_path::PciPath;
+
+const DEFAULT_PCIE_ROOT_BUS: &str = "pcie.0";
+// Currently, CLH and Dragonball support device attachment solely on the root bus.
+const DEFAULT_PCIE_ROOT_BUS_ADDRESS: &str = "0000:00";
+pub const PCIE_ROOT_BUS_SLOTS_CAPACITY: u32 = 32;
+
+// register_pcie_device: pre-registers a device into the PCIe Topology. It is called
+// in a device driver's attach, before the device is actually attached to the VM,
+// and allocates one available PCI path for the device.
+// register_pcie_device can be expanded as below:
+// register_pcie_device {
+//     match pcie_topology {
+//         Some(topology) => self.register(topology).await,
+//         None => Ok(())
+//     }
+// }
+#[macro_export]
+macro_rules! register_pcie_device {
+    ($self:ident, $opt:expr) => {
+        match $opt {
+            Some(topology) => $self.register(topology).await,
+            None => Ok(()),
+        }
+    };
+}
+
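For reference, at the VFIO call site above (`register_pcie_device!(self, pcie_topo)?` in `VfioDevice::attach`), the macro expands to roughly the following; this mirrors the pseudo-expansion in the comment and is shown only for illustration:

```rust
// pcie_topo: &mut Option<&mut PCIeTopology>
match pcie_topo {
    // A topology exists (qemu/CH/dragonball): allocate or refresh the guest PCI path.
    Some(topology) => self.register(topology).await,
    // No topology (e.g. firecracker): the call degrades to a no-op returning Ok(()).
    None => Ok(()),
}?;
```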
+// update_pcie_device: updates the device info. Some VMMs return device info that
+// contains a guest PCI path which differs from the one allocated in the runtime,
+// so we need to compare the two PCI paths and update the topology accordingly.
+// update_pcie_device can be expanded as below:
+// update_pcie_device {
+//     match pcie_topology {
+//         Some(topology) => self.register(topology).await,
+//         None => Ok(())
+//     }
+// }
+#[macro_export]
+macro_rules! update_pcie_device {
+    ($self:ident, $opt:expr) => {
+        match $opt {
+            Some(topology) => $self.register(topology).await,
+            None => Ok(()),
+        }
+    };
+}
+
+// unregister_pcie_device: unregisters a device from the PCIe topology.
+// unregister_pcie_device can be expanded as below:
+// unregister_pcie_device {
+//     match pcie_topology {
+//         Some(topology) => self.unregister(topology).await,
+//         None => Ok(())
+//     }
+// }
+#[macro_export]
+macro_rules! unregister_pcie_device {
+    ($self:ident, $opt:expr) => {
+        match $opt {
+            Some(topology) => $self.unregister(topology).await,
+            None => Ok(()),
+        }
+    };
+}
+
+pub trait PCIeDevice: Send + Sync {
+    fn device_id(&self) -> &str;
+}
+
+#[derive(Clone, Debug, Default)]
+pub struct PCIeEndpoint {
+    // device_id for device in device manager
+    pub device_id: String,
+    // device's PCI Path in Guest
+    pub pci_path: PciPath,
+    // root_port for PCIe Device
+    pub root_port: Option<PCIeRootPort>,
+
+    // device_type indicates whether the device is virtio-pci, PCI or PCIe
+    pub device_type: String,
+}
+
+impl PCIeDevice for PCIeEndpoint {
+    fn device_id(&self) -> &str {
+        self.device_id.as_str()
+    }
+}
+
+// reserved resource
+#[derive(Clone, Debug, Default)]
+pub struct ResourceReserved {
+    // For this to work, QEMU needs patches.
+    // The PCIE-PCI bridge can be hot-plugged only into a pcie-root-port that has the
+    // 'bus-reserve' property set, to provide a secondary bus for the hot-plugged bridge.
+    pub bus_reserve: String,
+
+    // reserve prefetched MMIO aperture, 64-bit
+    pub pref64_reserve: String,
+    // reserve prefetched MMIO aperture, 32-bit
+    pub pref32_reserve: String,
+    // reserve non-prefetched MMIO aperture, 32-bit *only*
+    pub memory_reserve: String,
+
+    // IO reservation
+    pub io_reserve: String,
+}
+
+// PCIe Root Port
+#[derive(Clone, Debug, Default)]
+pub struct PCIeRootPort {
+    // format: rp{n}, n>=0
+    pub id: String,
+
+    // default is pcie.0
+    pub bus: String,
+    // >=0, default is 0x00
+    pub address: String,
+
+    // (slot, chassis) pair is mandatory and must be unique for each pcie-root-port,
+    // chassis >=0, default is 0x00
+    pub chassis: u8,
+    // slot >=0, default is 0x00
+    pub slot: u8,
+
+    // multi_function is for PCIe Device passthrough
+    // true => "on", false => "off", default is off
+    pub multi_function: bool,
+
+    // reserved resource for some VMMs, such as Qemu.
+    pub resource_reserved: ResourceReserved,
+
+    // romfile specifies the ROM file being used for this device.
+    pub romfile: String,
+}
+
+// PCIe Root Complex
+#[derive(Clone, Debug, Default)]
+pub struct PCIeRootComplex {
+    pub root_bus: String,
+    pub root_bus_address: String,
+    pub root_bus_devices: HashMap<String, PCIeEndpoint>,
+}
+
+#[derive(Debug, Default)]
+pub struct PCIeTopology {
+    pub hypervisor_name: String,
+    pub root_complex: PCIeRootComplex,
+
+    pub bridges: u32,
+    pub pcie_root_ports: u32,
+    pub hotplug_vfio_on_root_bus: bool,
+}
+
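Before the methods below, a behavioural sketch of how the root complex is expected to hand out slots (illustrative only; the endpoint values are made up and a qemu-style `TopologyConfigInfo` named `topo_config` is assumed):

```rust
let mut topo = PCIeTopology::new(Some(&topo_config)).expect("qemu supports PCIe");

let mut ep = PCIeEndpoint {
    device_id: "vfio-example".to_string(),
    device_type: "PCIe".to_string(),
    ..Default::default()
};

// insert_device picks the first free slot on pcie.0 and returns its PciPath;
// the same path is also written back into ep.pci_path.
let allocated = topo.insert_device(&mut ep);
assert!(topo.find_device("vfio-example"));

// remove_device frees the slot again (returning the slot key it occupied),
// so a later hotplug can reuse it.
let freed = topo.remove_device("vfio-example");
```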
+impl PCIeTopology {
+    // Some configurations do not support PCIe devices, so there is no need to build a PCIe Topology for them.
+    pub fn new(config_info: Option<&TopologyConfigInfo>) -> Option<Self> {
+        // if config_info is None, it will return None.
+        let topo_config = config_info?;
+
+        let root_complex = PCIeRootComplex {
+            root_bus: DEFAULT_PCIE_ROOT_BUS.to_owned(),
+            root_bus_address: DEFAULT_PCIE_ROOT_BUS_ADDRESS.to_owned(),
+            root_bus_devices: HashMap::with_capacity(PCIE_ROOT_BUS_SLOTS_CAPACITY as usize),
+        };
+
+        Some(Self {
+            hypervisor_name: topo_config.hypervisor_name.to_owned(),
+            root_complex,
+            bridges: topo_config.device_info.default_bridges,
+            pcie_root_ports: topo_config.device_info.pcie_root_port,
+            hotplug_vfio_on_root_bus: topo_config.device_info.hotplug_vfio_on_root_bus,
+        })
+    }
+
+    pub fn insert_device(&mut self, ep: &mut PCIeEndpoint) -> Option<PciPath> {
+        let to_pcipath = |v: u32| -> PciPath {
+            PciPath {
+                slots: vec![PciSlot(v as u8)],
+            }
+        };
+
+        let to_string = |v: u32| -> String { to_pcipath(v).to_string() };
+
+        // find the first available index as the allocated slot.
+        let allocated_slot = (0..PCIE_ROOT_BUS_SLOTS_CAPACITY).find(|&i| {
+            !self
+                .root_complex
+                .root_bus_devices
+                .contains_key(&to_string(i))
+        })?;
+
+        let pcipath = to_string(allocated_slot);
+
+        // update pci_path in Endpoint
+        ep.pci_path = to_pcipath(allocated_slot);
+        // convert the allocated slot to pci path and then insert it with ep
+        self.root_complex
+            .root_bus_devices
+            .insert(pcipath, ep.clone());
+
+        Some(to_pcipath(allocated_slot))
+    }
+
+    pub fn remove_device(&mut self, device_id: &str) -> Option<String> {
+        let mut target_device: Option<String> = None;
+        self.root_complex.root_bus_devices.retain(|k, v| {
+            if v.device_id() != device_id {
+                true
+            } else {
+                target_device = Some((*k).to_string());
+                false
+            }
+        });
+
+        target_device
+    }
+
+    pub fn update_device(&mut self, ep: &PCIeEndpoint) -> Option<PciPath> {
+        let pci_addr = ep.pci_path.clone();
+
+        // First, find the PCIe Endpoint corresponding to the endpoint in the HashMap based on the PCI path.
+        // If found, it means that we do not need to update the device's position in the HashMap.
+        // If not found, it means that the PCI Path corresponding to the device has changed, and the device's
+        // position in the HashMap needs to be updated.
+        match self
+            .root_complex
+            .root_bus_devices
+            .entry(pci_addr.to_string())
+        {
+            Entry::Occupied(_) => None,
+            Entry::Vacant(_entry) => {
+                self.remove_device(&ep.device_id);
+                self.root_complex
+                    .root_bus_devices
+                    .insert(pci_addr.to_string(), ep.clone());
+
+                Some(pci_addr)
+            }
+        }
+    }
+
+    pub fn find_device(&mut self, device_id: &str) -> bool {
+        for v in self.root_complex.root_bus_devices.values() {
+            info!(
+                sl!(),
+                "find_device with: {:?}, {:?}.",
+                &device_id,
+                v.device_id()
+            );
+            if v.device_id() == device_id {
+                return true;
+            }
+        }
+
+        false
+    }
+
+    pub fn do_insert_or_update(&mut self, pciep: &mut PCIeEndpoint) -> Result<PciPath> {
+        // Try to check whether the device is present in the PCIe Topology.
+        // If the device doesn't exist, it proceeds to register it within the topology.
+        let pci_path = if !self.find_device(&pciep.device_id) {
+            // Register a device within the PCIe topology, allocating and assigning it an available PCI Path.
+            // Upon successful insertion, it updates the pci_path in PCIeEndpoint and returns it.
+            // Finally, update both the guest_pci_path and devices_options with the allocated PciPath.
+            if let Some(pci_addr) = self.insert_device(pciep) {
+                pci_addr
+            } else {
+                return Err(anyhow!("pci path allocation failed."));
+            }
+        } else {
+            // If the device exists, it proceeds to update its pcipath within
+            // the topology and the device's guest_pci_path and device_options.
+            if let Some(pci_addr) = self.update_device(pciep) {
+                pci_addr
+            } else {
+                return Ok(pciep.pci_path.clone());
+            }
+        };
+
+        Ok(pci_path)
+    }
+}
+
+// do_add_pcie_endpoint adds a device into the PCIe topology as a PCIe endpoint.
+// device_id: the device's unique ID in the Device Manager.
+// allocated_pcipath: the PCI path allocated before add_device.
+// topology: the PCIe Topology used to build the guest's PCIe topology.
+pub fn do_add_pcie_endpoint(
+    device_id: String,
+    allocated_pcipath: Option<PciPath>,
+    topology: &mut PCIeTopology,
+) -> Result<PciPath> {
+    let pcie_endpoint = &mut PCIeEndpoint {
+        device_type: "PCIe".to_string(),
+        device_id,
+        ..Default::default()
+    };
+
+    if let Some(pci_path) = allocated_pcipath {
+        pcie_endpoint.pci_path = pci_path;
+    }
+
+    topology.do_insert_or_update(pcie_endpoint)
+}
diff --git a/src/runtime-rs/crates/resource/src/manager_inner.rs b/src/runtime-rs/crates/resource/src/manager_inner.rs
index 2b70fe90c0..284612453d 100644
--- a/src/runtime-rs/crates/resource/src/manager_inner.rs
+++ b/src/runtime-rs/crates/resource/src/manager_inner.rs
@@ -17,7 +17,7 @@ use hypervisor::{
     },
     BlockConfig, Hypervisor, VfioConfig,
 };
-use kata_types::config::TomlConfig;
+use kata_types::config::{hypervisor::TopologyConfigInfo, TomlConfig};
 use kata_types::mount::Mount;
 use oci::{Linux, LinuxCpu, LinuxResources};
 use persist::sandbox_persist::Persist;
@@ -59,8 +59,9 @@ impl ResourceManagerInner {
         toml_config: Arc<TomlConfig>,
         init_size_manager: InitialSizeManager,
     ) -> Result<Self> {
+        let topo_config = TopologyConfigInfo::new(&toml_config);
         // create device manager
-        let dev_manager = DeviceManager::new(hypervisor.clone())
+        let dev_manager = DeviceManager::new(hypervisor.clone(), topo_config.as_ref())
             .await
             .context("failed to create device manager")?;
 
@@ -510,12 +511,14 @@ impl Persist for ResourceManagerInner {
             sid: resource_args.sid.clone(),
             config: resource_args.config,
         };
+        let topo_config = TopologyConfigInfo::new(&args.config);
+
         Ok(Self {
             sid: resource_args.sid,
             agent: resource_args.agent,
             hypervisor: resource_args.hypervisor.clone(),
             device_manager: Arc::new(RwLock::new(
-                DeviceManager::new(resource_args.hypervisor).await?,
+                DeviceManager::new(resource_args.hypervisor, topo_config.as_ref()).await?,
             )),
             network: None,
             share_fs: None,
diff --git a/src/runtime-rs/crates/resource/src/network/endpoint/endpoints_test.rs b/src/runtime-rs/crates/resource/src/network/endpoint/endpoints_test.rs
index 7bfb429621..83db7fe9ec 100644
--- a/src/runtime-rs/crates/resource/src/network/endpoint/endpoints_test.rs
+++ b/src/runtime-rs/crates/resource/src/network/endpoint/endpoints_test.rs
@@ -9,6 +9,7 @@ mod tests {
     use std::sync::Arc;
 
     use anyhow::{anyhow, Context, Result};
+    use kata_types::config::hypervisor::TopologyConfigInfo;
     use netlink_packet_route::MACVLAN_MODE_PRIVATE;
     use scopeguard::defer;
     use tests_utils::load_test_config;
@@ -29,6 +30,7 @@ mod tests {
     async fn get_device_manager() -> Result<Arc<RwLock<DeviceManager>>> {
         let hypervisor_name: &str = "qemu";
         let toml_config = load_test_config(hypervisor_name.to_owned())?;
+        let topo_config = TopologyConfigInfo::new(&toml_config);
         let hypervisor_config = toml_config
             .hypervisor
             .get(hypervisor_name)
@@ -40,7 +42,7 @@
             .await;
 
         let dm = Arc::new(RwLock::new(
-            DeviceManager::new(Arc::new(hypervisor))
+            DeviceManager::new(Arc::new(hypervisor), topo_config.as_ref())
                 .await
                 .context("device manager")?,
        ));
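
Taken together, the patch threads an optional PCIe topology from the resource manager down to every `Device::attach`/`detach` call. A condensed, hedged sketch of the end-to-end flow (the function name and local variables here are placeholders, not code from the patch):

```rust
use std::sync::Arc;

use anyhow::Result;
use kata_types::config::{hypervisor::TopologyConfigInfo, TomlConfig};

async fn sketch_end_to_end(hypervisor: Arc<dyn Hypervisor>, cfg: &TomlConfig) -> Result<()> {
    // 1. ResourceManagerInner::new derives the optional topology config from the toml config.
    let topo_config = TopologyConfigInfo::new(cfg);

    // 2. DeviceManager::new builds an Option<PCIeTopology> from it and owns it.
    let _dm = DeviceManager::new(hypervisor, topo_config.as_ref()).await?;

    // 3. When a device is attached, DeviceManager passes
    //    `&mut self.pcie_topology.as_mut()` into Device::attach; VFIO devices call
    //    register_pcie_device!/update_pcie_device! to obtain their guest PciPath,
    //    while the other drivers simply ignore the extra `_pcie_topo` argument.
    Ok(())
}
```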