From 88839026b9dae14b6465aa3b481e85634cca0000 Mon Sep 17 00:00:00 2001 From: "alex.lyn" Date: Wed, 27 Dec 2023 15:21:53 +0800 Subject: [PATCH 1/9] runtime-rs: introduce TopologyConfigInfo to initialize pcie topology A TopologyConfigInfo added to store device config info for PCIe/PCI devices in the VM from Hypervisor DeviceInfo. And TopologyConfigInfo::new will be the entry to initialize PCIe Topology for each VM. Fixes: #7218 Signed-off-by: alex.lyn --- .../kata-types/src/config/hypervisor/mod.rs | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/libs/kata-types/src/config/hypervisor/mod.rs b/src/libs/kata-types/src/config/hypervisor/mod.rs index d00d2533db..b571e26c6f 100644 --- a/src/libs/kata-types/src/config/hypervisor/mod.rs +++ b/src/libs/kata-types/src/config/hypervisor/mod.rs @@ -492,6 +492,38 @@ impl DeviceInfo { } } +/// Virtual machine PCIe Topology configuration. +#[derive(Clone, Debug, Default)] +pub struct TopologyConfigInfo { + /// Hypervisor name + pub hypervisor_name: String, + /// Device Info + pub device_info: DeviceInfo, +} + +impl TopologyConfigInfo { + /// Initialize the topology config info from toml config + pub fn new(toml_config: &TomlConfig) -> Option { + // Firecracker does not support PCIe Devices, so we should not initialize such a PCIe topology for it. + // If the case of fc hit, just return None. + let hypervisor_names = [ + HYPERVISOR_NAME_QEMU, + HYPERVISOR_NAME_CH, + HYPERVISOR_NAME_DRAGONBALL, + ]; + let hypervisor_name = toml_config.runtime.hypervisor_name.as_str(); + if !hypervisor_names.contains(&hypervisor_name) { + return None; + } + + let hv = toml_config.hypervisor.get(hypervisor_name)?; + Some(Self { + hypervisor_name: hypervisor_name.to_string(), + device_info: hv.device_info.clone(), + }) + } +} + /// Configuration information for virtual machine. 
#[derive(Clone, Debug, Default, Deserialize, Serialize)] pub struct MachineInfo { From 6ebc4884faae455bc0973fbdb78c19d61433b861 Mon Sep 17 00:00:00 2001 From: "alex.lyn" Date: Wed, 27 Dec 2023 15:22:19 +0800 Subject: [PATCH 2/9] runtime-rs: introduce PCIe Topology framework for pcie/pci devices Due to different ways that different VMMs handle PCI devices, we expect to provide a general PCIe topology processing framework that is as compatible as possible with VMMs such as dragonball, qemu, clh(Though it has its own management method, no conflict). Currently,it's mainly developed for kinds of PCIe/PCI devices in dragonball/clh which are attached on the pci/pcie root bus directly. More will be added when Qemu is ready in runtime-rs. Fixes: #7218 Signed-off-by: alex.lyn --- .../crates/hypervisor/src/device/mod.rs | 1 + .../crates/hypervisor/src/device/topology.rs | 307 ++++++++++++++++++ 2 files changed, 308 insertions(+) create mode 100644 src/runtime-rs/crates/hypervisor/src/device/topology.rs diff --git a/src/runtime-rs/crates/hypervisor/src/device/mod.rs b/src/runtime-rs/crates/hypervisor/src/device/mod.rs index 4b170de52d..d9fbd538b7 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/mod.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/mod.rs @@ -18,6 +18,7 @@ use async_trait::async_trait; pub mod device_manager; pub mod driver; pub mod pci_path; +pub mod topology; pub mod util; #[derive(Debug)] diff --git a/src/runtime-rs/crates/hypervisor/src/device/topology.rs b/src/runtime-rs/crates/hypervisor/src/device/topology.rs new file mode 100644 index 0000000000..993d7b0eaf --- /dev/null +++ b/src/runtime-rs/crates/hypervisor/src/device/topology.rs @@ -0,0 +1,307 @@ +// +// Copyright (c) 2019-2023 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +/* +The design origins from https://github.com/qemu/qemu/blob/master/docs/pcie.txt + +In order to better support the PCIe topologies of different VMMs, we adopt a layered approach. 
+The first layer is the base layer(the flatten PCIe topology), which mainly consists of the root bus, +which is mainly used by VMMs that only support devices being directly attached to the root bus. +However, not all VMMs have such simple PCIe topologies. For example, Qemu, which can fully simulate +the PCIe topology of the host, has a complex PCIe topology. In this case, we need to add PCIe RootPort, +PCIe Switch, and PCIe-PCI Bridge or pxb-pcie on top of the base layer, which is The Complex PCIe Topology. + +The design graphs are as below: + +(1) The flatten PCIe Topology +pcie.0 bus (Root Complex) +---------------------------------------------------------------------------- +| | | | | | | | | | | | | | | .. | +--|--------------------|------------------|-------------------------|------- + | | | | + V V V V +----------- ----------- ----------- ----------- +| PCI Dev | | PCI Dev | | PCI Dev | | PCI Dev | +----------- ----------- ----------- ----------- + +(2) The Complex PCIe Topology(It'll be implemented when Qemu is ready in runtime-rs) +pcie.0 bus (Root Complex) +---------------------------------------------------------------------------- +| | | | | | | | | | | | | | | .. 
| +------|----------------|--------------------------------------|------------- + | | | + V V V + ------------- ------------- ------------- + | Root Port | | Root Port | | Root Port | + ------------- ------------- ------------- + | | + | -------------------------|----------------------- +------------ | ----------------- | +| PCIe Dev | | PCI Express | Upstream Port | | +------------ | Switch ----------------- | + | | | | + | ------------------- ------------------- | + | | Downstream Port | | Downstream Port | | + | ------------------- ------------------- | + -------------|-----------------------|----------- + ------------ + | PCIe Dev | + ------------ +*/ + +use std::collections::{hash_map::Entry, HashMap}; + +use anyhow::{anyhow, Result}; + +use crate::device::pci_path::PciSlot; +use kata_types::config::hypervisor::TopologyConfigInfo; + +use super::pci_path::PciPath; + +const DEFAULT_PCIE_ROOT_BUS: &str = "pcie.0"; +// Currently, CLH and Dragonball support device attachment solely on the root bus. +const DEFAULT_PCIE_ROOT_BUS_ADDRESS: &str = "0000:00"; +pub const PCIE_ROOT_BUS_SLOTS_CAPACITY: u32 = 32; + +pub trait PCIeDevice: Send + Sync { + fn device_id(&self) -> &str; +} + +#[derive(Clone, Debug, Default)] +pub struct PCIeEndpoint { + // device_id for device in device manager + pub device_id: String, + // device's PCI Path in Guest + pub pci_path: PciPath, + // root_port for PCIe Device + pub root_port: Option, + + // device_type is for device virtio-pci/PCI or PCIe + pub device_type: String, +} + +impl PCIeDevice for PCIeEndpoint { + fn device_id(&self) -> &str { + self.device_id.as_str() + } +} + +// reserved resource +#[derive(Clone, Debug, Default)] +pub struct ResourceReserved { + // This to work needs patches to QEMU + // The PCIE-PCI bridge can be hot-plugged only into pcie-root-port that has 'bus-reserve' + // property value to provide secondary bus for the hot-plugged bridge. 
+ pub bus_reserve: String, + + // reserve prefetched MMIO aperture, 64-bit + pub pref64_reserve: String, + // reserve prefetched MMIO aperture, 32-bit + pub pref32_reserve: String, + // reserve non-prefetched MMIO aperture, 32-bit *only* + pub memory_reserve: String, + + // IO reservation + pub io_reserve: String, +} + +// PCIe Root Port +#[derive(Clone, Debug, Default)] +pub struct PCIeRootPort { + // format: rp{n}, n>=0 + pub id: String, + + // default is pcie.0 + pub bus: String, + // >=0, default is 0x00 + pub address: String, + + // (slot, chassis) pair is mandatory and must be unique for each pcie-root-port, + // chassis >=0, default is 0x00 + pub chassis: u8, + // slot >=0, default is 0x00 + pub slot: u8, + + // multi_function is for PCIe Device passthrough + // true => "on", false => "off", default is off + pub multi_function: bool, + + // reserved resource for some VMM, such as Qemu. + pub resource_reserved: ResourceReserved, + + // romfile specifies the ROM file being used for this device. + pub romfile: String, +} + +// PCIe Root Complex +#[derive(Clone, Debug, Default)] +pub struct PCIeRootComplex { + pub root_bus: String, + pub root_bus_address: String, + pub root_bus_devices: HashMap, +} + +#[derive(Debug, Default)] +pub struct PCIeTopology { + pub hypervisor_name: String, + pub root_complex: PCIeRootComplex, + + pub bridges: u32, + pub pcie_root_ports: u32, + pub hotplug_vfio_on_root_bus: bool, +} + +impl PCIeTopology { + // As some special case doesn't support PCIe devices, there's no need to build a PCIe Topology. + pub fn new(config_info: Option<&TopologyConfigInfo>) -> Option { + // if config_info is None, it will return None. 
+ let topo_config = config_info?; + + let root_complex = PCIeRootComplex { + root_bus: DEFAULT_PCIE_ROOT_BUS.to_owned(), + root_bus_address: DEFAULT_PCIE_ROOT_BUS_ADDRESS.to_owned(), + root_bus_devices: HashMap::with_capacity(PCIE_ROOT_BUS_SLOTS_CAPACITY as usize), + }; + + Some(Self { + hypervisor_name: topo_config.hypervisor_name.to_owned(), + root_complex, + bridges: topo_config.device_info.default_bridges, + pcie_root_ports: topo_config.device_info.pcie_root_port, + hotplug_vfio_on_root_bus: topo_config.device_info.hotplug_vfio_on_root_bus, + }) + } + + pub fn insert_device(&mut self, ep: &mut PCIeEndpoint) -> Option { + let to_pcipath = |v: u32| -> PciPath { + PciPath { + slots: vec![PciSlot(v as u8)], + } + }; + + let to_string = |v: u32| -> String { to_pcipath(v).to_string() }; + + // find the first available index as the allocated slot. + let allocated_slot = (0..PCIE_ROOT_BUS_SLOTS_CAPACITY).find(|&i| { + !self + .root_complex + .root_bus_devices + .contains_key(&to_string(i)) + })?; + + let pcipath = to_string(allocated_slot); + + // update pci_path in Endpoint + ep.pci_path = to_pcipath(allocated_slot); + // convert the allocated slot to pci path and then insert it with ep + self.root_complex + .root_bus_devices + .insert(pcipath, ep.clone()); + + Some(to_pcipath(allocated_slot)) + } + + pub fn remove_device(&mut self, device_id: &str) -> Option { + let mut target_device: Option = None; + self.root_complex.root_bus_devices.retain(|k, v| { + if v.device_id() != device_id { + true + } else { + target_device = Some((*k).to_string()); + false + } + }); + + target_device + } + + pub fn update_device(&mut self, ep: &PCIeEndpoint) -> Option { + let pci_addr = ep.pci_path.clone(); + + // First, find the PCIe Endpoint corresponding to the endpoint in the Hash Map based on the PCI path. + // If found, it means that we do not need to update the device's position in the Hash Map. 
+ // If not found, it means that the PCI Path corresponding to the device has changed, and the device's + // position in the Hash Map needs to be updated. + match self + .root_complex + .root_bus_devices + .entry(pci_addr.to_string()) + { + Entry::Occupied(_) => None, + Entry::Vacant(_entry) => { + self.remove_device(&ep.device_id); + self.root_complex + .root_bus_devices + .insert(pci_addr.to_string(), ep.clone()); + + Some(pci_addr) + } + } + } + + pub fn find_device(&mut self, device_id: &str) -> bool { + for v in self.root_complex.root_bus_devices.values() { + info!( + sl!(), + "find_device with: {:?}, {:?}.", + &device_id, + v.device_id() + ); + if v.device_id() == device_id { + return true; + } + } + + false + } + + pub fn do_insert_or_update(&mut self, pciep: &mut PCIeEndpoint) -> Result { + // Try to check whether the device is present in the PCIe Topology. + // If the device dosen't exist, it proceeds to register it within the topology + let pci_path = if !self.find_device(&pciep.device_id) { + // Register a device within the PCIe topology, allocating and assigning it an available PCI Path. + // Upon successful insertion, it updates the pci_path in PCIeEndpoint and returns it. + // Finally, update both the guest_pci_path and devices_options with the allocated PciPath. + if let Some(pci_addr) = self.insert_device(pciep) { + pci_addr + } else { + return Err(anyhow!("pci path allocated failed.")); + } + } else { + // If the device exists, it proceeds to update its pcipath within + // the topology and the device's guest_pci_path and device_options. + if let Some(pci_addr) = self.update_device(pciep) { + pci_addr + } else { + return Ok(pciep.pci_path.clone()); + } + }; + + Ok(pci_path) + } +} + +// do_add_pcie_endpoint do add a device into PCIe topology with pcie endpoint +// device_id: device's Unique ID in Device Manager. +// allocated_pcipath: allocated pcipath before add_device +// topology: PCIe Topology for devices to build a PCIe Topology in Guest. 
+pub fn do_add_pcie_endpoint( + device_id: String, + allocated_pcipath: Option, + topology: &mut PCIeTopology, +) -> Result { + let pcie_endpoint = &mut PCIeEndpoint { + device_type: "PCIe".to_string(), + device_id, + ..Default::default() + }; + + if let Some(pci_path) = allocated_pcipath { + pcie_endpoint.pci_path = pci_path; + } + + topology.do_insert_or_update(pcie_endpoint) +} From 87e39cd1f60e7828725515216728249931f069b5 Mon Sep 17 00:00:00 2001 From: "alex.lyn" Date: Wed, 27 Dec 2023 15:25:09 +0800 Subject: [PATCH 3/9] runtime-rs: introduce Trait PCIeDevice to do [un]register device Introduce Trait PCIeDevice with register/unregister, which are used to register or unregister pcie device within the PCIe topology. Fixes: #7218 Signed-off-by: alex.lyn --- src/runtime-rs/crates/hypervisor/src/device/mod.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/runtime-rs/crates/hypervisor/src/device/mod.rs b/src/runtime-rs/crates/hypervisor/src/device/mod.rs index d9fbd538b7..7b18d2f867 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/mod.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/mod.rs @@ -15,6 +15,8 @@ use crate::{ use anyhow::Result; use async_trait::async_trait; +use self::topology::PCIeTopology; + pub mod device_manager; pub mod driver; pub mod pci_path; @@ -72,3 +74,11 @@ pub trait Device: std::fmt::Debug + Send + Sync { // * err error: error while do decrease attach count async fn decrease_attach_count(&mut self) -> Result; } + +#[async_trait] +pub trait PCIeDevice: std::fmt::Debug + Send + Sync { + // register pcie device into PCIe Topology for virtio-pci device or PCI/PCIe device. 
+ async fn register(&mut self, topology: &mut PCIeTopology) -> Result<()>; + // unregister pcie device from PCIe Topology + async fn unregister(&mut self, topology: &mut PCIeTopology) -> Result<()>; +} From b425de61059db0cd315553d91fb0409ab9ef6a35 Mon Sep 17 00:00:00 2001 From: "alex.lyn" Date: Wed, 27 Dec 2023 15:33:08 +0800 Subject: [PATCH 4/9] runtime-rs: implement Trait PCIeDevice for pcie/pci device Implement Trait PCIeDevice register/unregister for pcie/pci device, such as vfio device which needs set/get device's pci path for kata agent's device handler. Fixes: #7218 Signed-off-by: alex.lyn --- .../hypervisor/src/device/driver/vfio.rs | 45 ++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio.rs index aac879a594..958081d8eb 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio.rs @@ -25,7 +25,8 @@ use kata_sys_util::fs::get_base_name; use crate::device::{ hypervisor, pci_path::{PciPath, PciSlot}, - Device, DeviceType, + topology::{do_add_pcie_endpoint, PCIeTopology}, + Device, DeviceType, PCIeDevice, }; pub const SYS_BUS_PCI_DRIVER_PROBE: &str = "/sys/bus/pci/drivers_probe"; @@ -588,6 +589,48 @@ impl Device for VfioDevice { } } +#[async_trait] +impl PCIeDevice for VfioDevice { + async fn register(&mut self, pcie_topo: &mut PCIeTopology) -> Result<()> { + if self.bus_mode != VfioBusMode::PCI { + return Ok(()); + } + + self.device_options.clear(); + for hostdev in self.devices.iter_mut() { + let pci_path = do_add_pcie_endpoint( + self.device_id.clone(), + hostdev.guest_pci_path.clone(), + pcie_topo, + ) + .context(format!( + "add pcie endpoint for host device {:?} in PCIe Topology failed", + self.device_id + ))?; + hostdev.guest_pci_path = Some(pci_path.clone()); + + self.device_options.push(format!( + "0000:{}={}", + hostdev.bus_slot_func, 
+ pci_path.to_string() + )); + } + + Ok(()) + } + + async fn unregister(&mut self, pcie_topo: &mut PCIeTopology) -> Result<()> { + if let Some(_slot) = pcie_topo.remove_device(&self.device_id.clone()) { + Ok(()) + } else { + Err(anyhow!( + "vfio device with {:?} not found.", + self.device_id.clone() + )) + } + } +} + // binds the device to vfio driver after unbinding from host. // Will be called by a network interface or a generic pcie device. pub fn bind_device_to_vfio(bdf: &str, host_driver: &str, _vendor_device_id: &str) -> Result<()> { From 0d4992b24dcf3e7b601f80fd95dd1a61984caccd Mon Sep 17 00:00:00 2001 From: "alex.lyn" Date: Wed, 27 Dec 2023 15:40:01 +0800 Subject: [PATCH 5/9] runtime-rs: add one more argument in Device attach/detach Add one more argument with type &mut Option<&mut PCIeTopology> in attach and detach to introduce methods within PCIe Topology. Fixes: #7218 Signed-off-by: alex.lyn --- .../hypervisor/src/device/device_manager.rs | 10 +++++-- .../hypervisor/src/device/driver/vfio.rs | 12 +++++++-- .../src/device/driver/vhost_user_blk.rs | 14 +++++++--- .../src/device/driver/vhost_user_net.rs | 13 ++++++++-- .../src/device/driver/virtio_blk.rs | 13 ++++++++-- .../hypervisor/src/device/driver/virtio_fs.rs | 14 +++++++--- .../src/device/driver/virtio_net.rs | 13 ++++++++-- .../src/device/driver/virtio_vsock.rs | 26 +++++++++++++++---- .../crates/hypervisor/src/device/mod.rs | 12 +++++++-- 9 files changed, 104 insertions(+), 23 deletions(-) diff --git a/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs b/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs index 76eaba4e1d..376ed671d1 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs @@ -18,6 +18,7 @@ use crate::{ }; use super::{ + topology::PCIeTopology, util::{get_host_path, get_virt_drive_name, DEVICE_TYPE_BLOCK}, Device, DeviceConfig, DeviceType, }; @@ -121,7 +122,9 @@ impl 
DeviceManager { .context("failed to find device")?; let mut device_guard = device.lock().await; // attach device - let result = device_guard.attach(self.hypervisor.as_ref()).await; + let result = device_guard + .attach(&mut None::<&mut PCIeTopology>, self.hypervisor.as_ref()) + .await; // handle attach error if let Err(e) = result { match device_guard.get_device_info().await { @@ -161,7 +164,10 @@ impl DeviceManager { pub async fn try_remove_device(&mut self, device_id: &str) -> Result<()> { if let Some(dev) = self.devices.get(device_id) { let mut device_guard = dev.lock().await; - let result = match device_guard.detach(self.hypervisor.as_ref()).await { + let result = match device_guard + .detach(&mut None::<&mut PCIeTopology>, self.hypervisor.as_ref()) + .await + { Ok(index) => { if let Some(i) = index { // release the declared device index diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio.rs index 958081d8eb..4c1f89e450 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio.rs @@ -477,7 +477,11 @@ impl VfioDevice { #[async_trait] impl Device for VfioDevice { - async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> { + async fn attach( + &mut self, + _pcie_topo: &mut Option<&mut PCIeTopology>, + h: &dyn hypervisor, + ) -> Result<()> { if self .increase_attach_count() .await @@ -525,7 +529,11 @@ impl Device for VfioDevice { } } - async fn detach(&mut self, h: &dyn hypervisor) -> Result> { + async fn detach( + &mut self, + _pcie_topo: &mut Option<&mut PCIeTopology>, + h: &dyn hypervisor, + ) -> Result> { if self .decrease_attach_count() .await diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user_blk.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user_blk.rs index 5150f19563..b2a1d90f92 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user_blk.rs 
+++ b/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user_blk.rs @@ -9,7 +9,7 @@ use async_trait::async_trait; use super::VhostUserConfig; use crate::{ - device::{Device, DeviceType}, + device::{topology::PCIeTopology, Device, DeviceType}, Hypervisor as hypervisor, }; @@ -45,7 +45,11 @@ impl VhostUserBlkDevice { #[async_trait] impl Device for VhostUserBlkDevice { - async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> { + async fn attach( + &mut self, + _pcie_topo: &mut Option<&mut PCIeTopology>, + h: &dyn hypervisor, + ) -> Result<()> { // increase attach count, skip attach the device if the device is already attached if self .increase_attach_count() @@ -64,7 +68,11 @@ impl Device for VhostUserBlkDevice { return Ok(()); } - async fn detach(&mut self, h: &dyn hypervisor) -> Result> { + async fn detach( + &mut self, + _pcie_topo: &mut Option<&mut PCIeTopology>, + h: &dyn hypervisor, + ) -> Result> { // get the count of device detached, and detach once it reaches 0 if self .decrease_attach_count() diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user_net.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user_net.rs index dd31da8697..a28f969852 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user_net.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/vhost_user_net.rs @@ -4,6 +4,7 @@ use anyhow::{Context, Result}; use async_trait::async_trait; +use crate::device::topology::PCIeTopology; use crate::device::{Device, DeviceType}; use crate::{Hypervisor, VhostUserConfig}; @@ -22,14 +23,22 @@ impl VhostUserNetDevice { #[async_trait] impl Device for VhostUserNetDevice { - async fn attach(&mut self, h: &dyn Hypervisor) -> Result<()> { + async fn attach( + &mut self, + _pcie_topo: &mut Option<&mut PCIeTopology>, + h: &dyn Hypervisor, + ) -> Result<()> { h.add_device(DeviceType::VhostUserNetwork(self.clone())) .await .context("add vhost-user-net device to hypervisor")?; Ok(()) } - async fn 
detach(&mut self, h: &dyn Hypervisor) -> Result> { + async fn detach( + &mut self, + _pcie_topo: &mut Option<&mut PCIeTopology>, + h: &dyn Hypervisor, + ) -> Result> { h.remove_device(DeviceType::VhostUserNetwork(self.clone())) .await .context("remove vhost-user-net device from hypervisor")?; diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_blk.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_blk.rs index f16fe8eb04..50d8179200 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_blk.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_blk.rs @@ -5,6 +5,7 @@ // use crate::device::pci_path::PciPath; +use crate::device::topology::PCIeTopology; use crate::device::Device; use crate::device::DeviceType; use crate::Hypervisor as hypervisor; @@ -73,7 +74,11 @@ impl BlockDevice { #[async_trait] impl Device for BlockDevice { - async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> { + async fn attach( + &mut self, + _pcie_topo: &mut Option<&mut PCIeTopology>, + h: &dyn hypervisor, + ) -> Result<()> { // increase attach count, skip attach the device if the device is already attached if self .increase_attach_count() @@ -98,7 +103,11 @@ impl Device for BlockDevice { } } - async fn detach(&mut self, h: &dyn hypervisor) -> Result> { + async fn detach( + &mut self, + _pcie_topo: &mut Option<&mut PCIeTopology>, + h: &dyn hypervisor, + ) -> Result> { // get the count of device detached, skip detach once it reaches the 0 if self .decrease_attach_count() diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_fs.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_fs.rs index e968606de3..c06498b24c 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_fs.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_fs.rs @@ -7,7 +7,7 @@ use anyhow::{Context, Result}; use async_trait::async_trait; -use crate::device::{hypervisor, Device, DeviceType}; +use 
crate::device::{hypervisor, topology::PCIeTopology, Device, DeviceType}; #[derive(Copy, Clone, Debug, Default)] pub enum ShareFsMountOperation { @@ -99,7 +99,11 @@ impl ShareFsDevice { #[async_trait] impl Device for ShareFsDevice { - async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> { + async fn attach( + &mut self, + _pcie_topo: &mut Option<&mut PCIeTopology>, + h: &dyn hypervisor, + ) -> Result<()> { h.add_device(DeviceType::ShareFs(self.clone())) .await .context("add share-fs device.")?; @@ -107,7 +111,11 @@ impl Device for ShareFsDevice { Ok(()) } - async fn detach(&mut self, _h: &dyn hypervisor) -> Result> { + async fn detach( + &mut self, + _pcie_topo: &mut Option<&mut PCIeTopology>, + _h: &dyn hypervisor, + ) -> Result> { // no need to detach share-fs device Ok(None) diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_net.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_net.rs index bc17e3f21a..4462761bed 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_net.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_net.rs @@ -9,6 +9,7 @@ use std::fmt; use anyhow::{Context, Result}; use async_trait::async_trait; +use crate::device::topology::PCIeTopology; use crate::device::{Device, DeviceType}; use crate::Hypervisor as hypervisor; @@ -70,7 +71,11 @@ impl NetworkDevice { #[async_trait] impl Device for NetworkDevice { - async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> { + async fn attach( + &mut self, + _pcie_topo: &mut Option<&mut PCIeTopology>, + h: &dyn hypervisor, + ) -> Result<()> { h.add_device(DeviceType::Network(self.clone())) .await .context("add network device.")?; @@ -78,7 +83,11 @@ impl Device for NetworkDevice { return Ok(()); } - async fn detach(&mut self, h: &dyn hypervisor) -> Result> { + async fn detach( + &mut self, + _pcie_topo: &mut Option<&mut PCIeTopology>, + h: &dyn hypervisor, + ) -> Result> { h.remove_device(DeviceType::Network(self.clone())) .await 
.context("remove network device.")?; diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_vsock.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_vsock.rs index 0c37fb18dd..efafd191fd 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_vsock.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_vsock.rs @@ -12,7 +12,7 @@ use tokio::fs::{File, OpenOptions}; use async_trait::async_trait; use crate::{ - device::{Device, DeviceType}, + device::{topology::PCIeTopology, Device, DeviceType}, Hypervisor as hypervisor, }; @@ -49,7 +49,11 @@ impl HybridVsockDevice { #[async_trait] impl Device for HybridVsockDevice { - async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> { + async fn attach( + &mut self, + _pcie_topo: &mut Option<&mut PCIeTopology>, + h: &dyn hypervisor, + ) -> Result<()> { h.add_device(DeviceType::HybridVsock(self.clone())) .await .context("add hybrid vsock device.")?; @@ -57,7 +61,11 @@ impl Device for HybridVsockDevice { return Ok(()); } - async fn detach(&mut self, _h: &dyn hypervisor) -> Result> { + async fn detach( + &mut self, + _pcie_topo: &mut Option<&mut PCIeTopology>, + _h: &dyn hypervisor, + ) -> Result> { // no need to do detach, just return Ok(None) Ok(None) } @@ -135,7 +143,11 @@ impl VsockDevice { #[async_trait] impl Device for VsockDevice { - async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> { + async fn attach( + &mut self, + _pcie_topo: &mut Option<&mut PCIeTopology>, + h: &dyn hypervisor, + ) -> Result<()> { h.add_device(DeviceType::Vsock(self.clone())) .await .context("add vsock device.")?; @@ -143,7 +155,11 @@ impl Device for VsockDevice { return Ok(()); } - async fn detach(&mut self, _h: &dyn hypervisor) -> Result> { + async fn detach( + &mut self, + _pcie_topo: &mut Option<&mut PCIeTopology>, + _h: &dyn hypervisor, + ) -> Result> { // no need to do detach, just return Ok(None) Ok(None) } diff --git a/src/runtime-rs/crates/hypervisor/src/device/mod.rs 
b/src/runtime-rs/crates/hypervisor/src/device/mod.rs index 7b18d2f867..49b1ff844a 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/mod.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/mod.rs @@ -56,9 +56,17 @@ impl fmt::Display for DeviceType { #[async_trait] pub trait Device: std::fmt::Debug + Send + Sync { // attach is to plug device into VM - async fn attach(&mut self, h: &dyn hypervisor) -> Result<()>; + async fn attach( + &mut self, + pcie_topo: &mut Option<&mut PCIeTopology>, + h: &dyn hypervisor, + ) -> Result<()>; // detach is to unplug device from VM - async fn detach(&mut self, h: &dyn hypervisor) -> Result>; + async fn detach( + &mut self, + pcie_topo: &mut Option<&mut PCIeTopology>, + h: &dyn hypervisor, + ) -> Result>; // update is to do update for some device async fn update(&mut self, h: &dyn hypervisor) -> Result<()>; // get_device_info returns device config From ce7d363695d20ef5103f9fcf40294a29d248c432 Mon Sep 17 00:00:00 2001 From: "alex.lyn" Date: Wed, 27 Dec 2023 15:43:58 +0800 Subject: [PATCH 6/9] runtime-rs: Introduce helper macros to simplify PCIe device ops Introduce helper macros to simplify PCIe device register/unregister and update, which provides a convenient way to handle devices in topology. 
Fixes: #7218 Signed-off-by: alex.lyn --- .../crates/hypervisor/src/device/topology.rs | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/src/runtime-rs/crates/hypervisor/src/device/topology.rs b/src/runtime-rs/crates/hypervisor/src/device/topology.rs index 993d7b0eaf..a1ab607f40 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/topology.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/topology.rs @@ -66,6 +66,65 @@ const DEFAULT_PCIE_ROOT_BUS: &str = "pcie.0"; const DEFAULT_PCIE_ROOT_BUS_ADDRESS: &str = "0000:00"; pub const PCIE_ROOT_BUS_SLOTS_CAPACITY: u32 = 32; +// register_pcie_device: pre-register the device into the PCIe Topology. It is +// called in the device driver's attach before the device is really attached into +// the VM. It'll allocate one available PCI path for the device. +// register_pcie_device can be expanded as below: +// register_pcie_device { +// match pcie_topology { +// Some(topology) => self.register(topology).await, +// None => Ok(()) +// } +// } +#[macro_export] +macro_rules! register_pcie_device { + ($self:ident, $opt:expr) => { + match $opt { + Some(topology) => $self.register(topology).await, + None => Ok(()), + } + }; +} + +// update_pcie_device: do update device info, as some VMMs will be able to +// return the device info containing guest PCI path which differs from the one allocated +// in runtime. So we need to compare the two PCI paths, and finally update it or not +// based on the difference between them. +// update_pcie_device can be expanded as below: +// update_pcie_device { +// match pcie_topology { +// Some(topology) => self.register(topology).await, +// None => Ok(()) +// } +// } +#[macro_export] +macro_rules! update_pcie_device { + ($self:ident, $opt:expr) => { + match $opt { + Some(topology) => $self.register(topology).await, + None => Ok(()), + } + }; +} + +// unregister_pcie_device: do unregister device from pcie topology. 
+// unregister_pcie_device can be expanded as below: +// unregister_pcie_device { +// match pcie_topology { +// Some(topology) => self.unregister(topology).await, +// None => Ok(()) +// } +// } +#[macro_export] +macro_rules! unregister_pcie_device { + ($self:ident, $opt:expr) => { + match $opt { + Some(topology) => $self.unregister(topology).await, + None => Ok(()), + } + }; +} + pub trait PCIeDevice: Send + Sync { fn device_id(&self) -> &str; } From 0f0b6d13c93f5538dbe79b063689cd7634b245ed Mon Sep 17 00:00:00 2001 From: "alex.lyn" Date: Wed, 27 Dec 2023 15:49:18 +0800 Subject: [PATCH 7/9] runtime-rs: do register/update device in Trait Device/attach Before calling the device driver to attach a device, register the device to PCIe topology and allocate a PciPath for it. However, for some hypervisor such as CLH, the allocation is invalid when plugging devices to VM, they have the ability to return DeviceInfo containing PciPath. It'll update the PciPath with the returned pci path in the PCIe topology for them to prevent the inferred pcipath from being different from the actual value returned. But the update will not be executed if the pcipath value doesn't change. 
Fixes: #7218 Signed-off-by: alex.lyn --- .../hypervisor/src/device/driver/vfio.rs | 73 +++---------------- 1 file changed, 12 insertions(+), 61 deletions(-) diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio.rs index 4c1f89e450..292d53b206 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio.rs @@ -5,28 +5,24 @@ // use std::{ - collections::HashMap, fs, path::{Path, PathBuf}, process::Command, - sync::{ - atomic::{AtomicU8, Ordering}, - Arc, RwLock, - }, }; use anyhow::{anyhow, Context, Result}; use async_trait::async_trait; -use lazy_static::lazy_static; use path_clean::PathClean; use kata_sys_util::fs::get_base_name; -use crate::device::{ - hypervisor, - pci_path::{PciPath, PciSlot}, - topology::{do_add_pcie_endpoint, PCIeTopology}, - Device, DeviceType, PCIeDevice, +use crate::{ + device::{ + pci_path::PciPath, + topology::{do_add_pcie_endpoint, PCIeTopology}, + Device, DeviceType, PCIeDevice, + }, + register_pcie_device, unregister_pcie_device, update_pcie_device, Hypervisor as hypervisor, }; pub const SYS_BUS_PCI_DRIVER_PROBE: &str = "/sys/bus/pci/drivers_probe"; @@ -44,35 +40,6 @@ const INTEL_IOMMU_PREFIX: &str = "dmar"; const AMD_IOMMU_PREFIX: &str = "ivhd"; const ARM_IOMMU_PREFIX: &str = "smmu"; -lazy_static! { - static ref GUEST_DEVICE_ID: Arc = Arc::new(AtomicU8::new(0_u8)); - static ref HOST_GUEST_MAP: Arc>> = - Arc::new(RwLock::new(HashMap::new())); -} - -// map host/guest bdf and the mapping saved into `HOST_GUEST_MAP`, -// and return PciPath. 
-pub fn generate_guest_pci_path(bdf: String) -> Result { - let hg_map = HOST_GUEST_MAP.clone(); - let current_id = GUEST_DEVICE_ID.clone(); - - current_id.fetch_add(1, Ordering::SeqCst); - let slot = current_id.load(Ordering::SeqCst); - - // In some Hypervisors, dragonball, cloud-hypervisor or firecracker, - // the device is directly connected to the bus without intermediary bus. - // FIXME: Qemu's pci path needs to be implemented; - let host_bdf = normalize_device_bdf(bdf.as_str()); - let guest_bdf = format!("0000:00:{:02x}.0", slot); - - // safe, just do unwrap as `HOST_GUEST_MAP` is always valid. - hg_map.write().unwrap().insert(host_bdf, guest_bdf); - - Ok(PciPath { - slots: vec![PciSlot::new(slot)], - }) -} - pub fn do_check_iommu_on() -> Result { let element = std::fs::read_dir(SYS_CLASS_IOMMU)? .filter_map(|e| e.ok()) @@ -479,9 +446,11 @@ impl VfioDevice { impl Device for VfioDevice { async fn attach( &mut self, - _pcie_topo: &mut Option<&mut PCIeTopology>, + pcie_topo: &mut Option<&mut PCIeTopology>, h: &dyn hypervisor, ) -> Result<()> { + register_pcie_device!(self, pcie_topo)?; + if self .increase_attach_count() .await @@ -499,31 +468,13 @@ impl Device for VfioDevice { self.devices = vfio.devices; } - if self.bus_mode == VfioBusMode::PCI { - for hostdev in self.devices.iter_mut() { - if hostdev.guest_pci_path.is_none() { - // guest_pci_path may be empty for certain hypervisors such as - // dragonball - hostdev.guest_pci_path = Some( - generate_guest_pci_path(hostdev.bus_slot_func.clone()) - .map_err(|e| anyhow!("generate pci path failed: {:?}", e))?, - ); - } - - // Safe to call unwrap here because of previous assignment. 
- let pci_path = hostdev.guest_pci_path.clone().unwrap(); - self.device_options.push(format!( - "0000:{}={}", - hostdev.bus_slot_func.clone(), - pci_path.to_string() - )); - } - } + update_pcie_device!(self, pcie_topo)?; Ok(()) } Err(e) => { self.decrease_attach_count().await?; + unregister_pcie_device!(self, pcie_topo)?; return Err(e); } } From b42548b8e1bb855227d12e3329ce8755a5139e77 Mon Sep 17 00:00:00 2001 From: "alex.lyn" Date: Wed, 27 Dec 2023 15:52:26 +0800 Subject: [PATCH 8/9] runtime-rs: do unregister device in Trait Device/detach Fixes: #7218 Signed-off-by: alex.lyn --- src/runtime-rs/crates/hypervisor/src/device/driver/vfio.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio.rs index 292d53b206..7d8abd2e09 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio.rs @@ -482,7 +482,7 @@ impl Device for VfioDevice { async fn detach( &mut self, - _pcie_topo: &mut Option<&mut PCIeTopology>, + pcie_topo: &mut Option<&mut PCIeTopology>, h: &dyn hypervisor, ) -> Result> { if self @@ -505,6 +505,8 @@ impl Device for VfioDevice { None }; + unregister_pcie_device!(self, pcie_topo)?; + Ok(device_index) } From ea69c170088c4820cea8899eb0758dc75132900d Mon Sep 17 00:00:00 2001 From: "alex.lyn" Date: Wed, 27 Dec 2023 15:57:23 +0800 Subject: [PATCH 9/9] runtime-rs: initialize pcie topology in Device Manager Add a pcie_topology field to DeviceManager and initialize pcie_topology when ResourceManager calls DeviceManager's new() with TopologyConfigInfo. 
Fixes: #7218 Signed-off-by: alex.lyn --- .../hypervisor/src/device/device_manager.rs | 17 +++++++++++++---- .../crates/resource/src/manager_inner.rs | 9 ++++++--- .../src/network/endpoint/endpoints_test.rs | 4 +++- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs b/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs index 376ed671d1..e5ec6276f5 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs @@ -8,6 +8,7 @@ use std::{collections::HashMap, sync::Arc}; use anyhow::{anyhow, Context, Result}; use kata_sys_util::rand::RandomBytes; +use kata_types::config::hypervisor::TopologyConfigInfo; use tokio::sync::{Mutex, RwLock}; use crate::{ @@ -94,15 +95,20 @@ pub struct DeviceManager { devices: HashMap, hypervisor: Arc, shared_info: SharedInfo, + pcie_topology: Option, } impl DeviceManager { - pub async fn new(hypervisor: Arc) -> Result { + pub async fn new( + hypervisor: Arc, + topo_config: Option<&TopologyConfigInfo>, + ) -> Result { let devices = HashMap::::new(); Ok(DeviceManager { devices, hypervisor, shared_info: SharedInfo::new().await, + pcie_topology: PCIeTopology::new(topo_config), }) } @@ -120,10 +126,11 @@ impl DeviceManager { .devices .get(device_id) .context("failed to find device")?; + let mut device_guard = device.lock().await; // attach device let result = device_guard - .attach(&mut None::<&mut PCIeTopology>, self.hypervisor.as_ref()) + .attach(&mut self.pcie_topology.as_mut(), self.hypervisor.as_ref()) .await; // handle attach error if let Err(e) = result { @@ -165,7 +172,7 @@ impl DeviceManager { if let Some(dev) = self.devices.get(device_id) { let mut device_guard = dev.lock().await; let result = match device_guard - .detach(&mut None::<&mut PCIeTopology>, self.hypervisor.as_ref()) + .detach(&mut self.pcie_topology.as_mut(), self.hypervisor.as_ref()) .await { Ok(index) => { @@ 
-605,6 +612,7 @@ mod tests { BlockConfig, KATA_BLK_DEV_TYPE, }; use anyhow::{anyhow, Context, Result}; + use kata_types::config::hypervisor::TopologyConfigInfo; use std::sync::Arc; use tests_utils::load_test_config; use tokio::sync::RwLock; @@ -612,6 +620,7 @@ mod tests { async fn new_device_manager() -> Result>> { let hypervisor_name: &str = "qemu"; let toml_config = load_test_config(hypervisor_name.to_owned())?; + let topo_config = TopologyConfigInfo::new(&toml_config); let hypervisor_config = toml_config .hypervisor .get(hypervisor_name) @@ -623,7 +632,7 @@ mod tests { .await; let dm = Arc::new(RwLock::new( - DeviceManager::new(Arc::new(hypervisor)) + DeviceManager::new(Arc::new(hypervisor), topo_config.as_ref()) .await .context("device manager")?, )); diff --git a/src/runtime-rs/crates/resource/src/manager_inner.rs b/src/runtime-rs/crates/resource/src/manager_inner.rs index 2b70fe90c0..284612453d 100644 --- a/src/runtime-rs/crates/resource/src/manager_inner.rs +++ b/src/runtime-rs/crates/resource/src/manager_inner.rs @@ -17,7 +17,7 @@ use hypervisor::{ }, BlockConfig, Hypervisor, VfioConfig, }; -use kata_types::config::TomlConfig; +use kata_types::config::{hypervisor::TopologyConfigInfo, TomlConfig}; use kata_types::mount::Mount; use oci::{Linux, LinuxCpu, LinuxResources}; use persist::sandbox_persist::Persist; @@ -59,8 +59,9 @@ impl ResourceManagerInner { toml_config: Arc, init_size_manager: InitialSizeManager, ) -> Result { + let topo_config = TopologyConfigInfo::new(&toml_config); // create device manager - let dev_manager = DeviceManager::new(hypervisor.clone()) + let dev_manager = DeviceManager::new(hypervisor.clone(), topo_config.as_ref()) .await .context("failed to create device manager")?; @@ -510,12 +511,14 @@ impl Persist for ResourceManagerInner { sid: resource_args.sid.clone(), config: resource_args.config, }; + let topo_config = TopologyConfigInfo::new(&args.config); + Ok(Self { sid: resource_args.sid, agent: resource_args.agent, hypervisor: 
resource_args.hypervisor.clone(), device_manager: Arc::new(RwLock::new( - DeviceManager::new(resource_args.hypervisor).await?, + DeviceManager::new(resource_args.hypervisor, topo_config.as_ref()).await?, )), network: None, share_fs: None, diff --git a/src/runtime-rs/crates/resource/src/network/endpoint/endpoints_test.rs b/src/runtime-rs/crates/resource/src/network/endpoint/endpoints_test.rs index 7bfb429621..83db7fe9ec 100644 --- a/src/runtime-rs/crates/resource/src/network/endpoint/endpoints_test.rs +++ b/src/runtime-rs/crates/resource/src/network/endpoint/endpoints_test.rs @@ -9,6 +9,7 @@ mod tests { use std::sync::Arc; use anyhow::{anyhow, Context, Result}; + use kata_types::config::hypervisor::TopologyConfigInfo; use netlink_packet_route::MACVLAN_MODE_PRIVATE; use scopeguard::defer; use tests_utils::load_test_config; @@ -29,6 +30,7 @@ mod tests { async fn get_device_manager() -> Result>> { let hypervisor_name: &str = "qemu"; let toml_config = load_test_config(hypervisor_name.to_owned())?; + let topo_config = TopologyConfigInfo::new(&toml_config); let hypervisor_config = toml_config .hypervisor .get(hypervisor_name) @@ -40,7 +42,7 @@ mod tests { .await; let dm = Arc::new(RwLock::new( - DeviceManager::new(Arc::new(hypervisor)) + DeviceManager::new(Arc::new(hypervisor), topo_config.as_ref()) .await .context("device manager")?, ));