mirror of
https://github.com/kata-containers/kata-containers.git
synced 2025-07-31 23:36:12 +00:00
Merge pull request #7489 from Apokleos/pci_path
runtime-rs: add pci topology for pci devices
This commit is contained in:
commit
cbd4481bc1
@ -492,6 +492,38 @@ impl DeviceInfo {
|
||||
}
|
||||
}
|
||||
|
||||
/// Virtual machine PCIe Topology configuration.
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct TopologyConfigInfo {
|
||||
/// Hypervisor name
|
||||
pub hypervisor_name: String,
|
||||
/// Device Info
|
||||
pub device_info: DeviceInfo,
|
||||
}
|
||||
|
||||
impl TopologyConfigInfo {
|
||||
/// Initialize the topology config info from toml config
|
||||
pub fn new(toml_config: &TomlConfig) -> Option<Self> {
|
||||
// Firecracker does not support PCIe Devices, so we should not initialize such a PCIe topology for it.
|
||||
// If the case of fc hit, just return None.
|
||||
let hypervisor_names = [
|
||||
HYPERVISOR_NAME_QEMU,
|
||||
HYPERVISOR_NAME_CH,
|
||||
HYPERVISOR_NAME_DRAGONBALL,
|
||||
];
|
||||
let hypervisor_name = toml_config.runtime.hypervisor_name.as_str();
|
||||
if !hypervisor_names.contains(&hypervisor_name) {
|
||||
return None;
|
||||
}
|
||||
|
||||
let hv = toml_config.hypervisor.get(hypervisor_name)?;
|
||||
Some(Self {
|
||||
hypervisor_name: hypervisor_name.to_string(),
|
||||
device_info: hv.device_info.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Configuration information for virtual machine.
|
||||
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
|
||||
pub struct MachineInfo {
|
||||
|
@ -8,6 +8,7 @@ use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use kata_sys_util::rand::RandomBytes;
|
||||
use kata_types::config::hypervisor::TopologyConfigInfo;
|
||||
use tokio::sync::{Mutex, RwLock};
|
||||
|
||||
use crate::{
|
||||
@ -18,6 +19,7 @@ use crate::{
|
||||
};
|
||||
|
||||
use super::{
|
||||
topology::PCIeTopology,
|
||||
util::{get_host_path, get_virt_drive_name, DEVICE_TYPE_BLOCK},
|
||||
Device, DeviceConfig, DeviceType,
|
||||
};
|
||||
@ -93,15 +95,20 @@ pub struct DeviceManager {
|
||||
devices: HashMap<String, ArcMutexDevice>,
|
||||
hypervisor: Arc<dyn Hypervisor>,
|
||||
shared_info: SharedInfo,
|
||||
pcie_topology: Option<PCIeTopology>,
|
||||
}
|
||||
|
||||
impl DeviceManager {
|
||||
pub async fn new(hypervisor: Arc<dyn Hypervisor>) -> Result<Self> {
|
||||
pub async fn new(
|
||||
hypervisor: Arc<dyn Hypervisor>,
|
||||
topo_config: Option<&TopologyConfigInfo>,
|
||||
) -> Result<Self> {
|
||||
let devices = HashMap::<String, ArcMutexDevice>::new();
|
||||
Ok(DeviceManager {
|
||||
devices,
|
||||
hypervisor,
|
||||
shared_info: SharedInfo::new().await,
|
||||
pcie_topology: PCIeTopology::new(topo_config),
|
||||
})
|
||||
}
|
||||
|
||||
@ -119,9 +126,12 @@ impl DeviceManager {
|
||||
.devices
|
||||
.get(device_id)
|
||||
.context("failed to find device")?;
|
||||
|
||||
let mut device_guard = device.lock().await;
|
||||
// attach device
|
||||
let result = device_guard.attach(self.hypervisor.as_ref()).await;
|
||||
let result = device_guard
|
||||
.attach(&mut self.pcie_topology.as_mut(), self.hypervisor.as_ref())
|
||||
.await;
|
||||
// handle attach error
|
||||
if let Err(e) = result {
|
||||
match device_guard.get_device_info().await {
|
||||
@ -161,7 +171,10 @@ impl DeviceManager {
|
||||
pub async fn try_remove_device(&mut self, device_id: &str) -> Result<()> {
|
||||
if let Some(dev) = self.devices.get(device_id) {
|
||||
let mut device_guard = dev.lock().await;
|
||||
let result = match device_guard.detach(self.hypervisor.as_ref()).await {
|
||||
let result = match device_guard
|
||||
.detach(&mut self.pcie_topology.as_mut(), self.hypervisor.as_ref())
|
||||
.await
|
||||
{
|
||||
Ok(index) => {
|
||||
if let Some(i) = index {
|
||||
// release the declared device index
|
||||
@ -599,6 +612,7 @@ mod tests {
|
||||
BlockConfig, KATA_BLK_DEV_TYPE,
|
||||
};
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use kata_types::config::hypervisor::TopologyConfigInfo;
|
||||
use std::sync::Arc;
|
||||
use tests_utils::load_test_config;
|
||||
use tokio::sync::RwLock;
|
||||
@ -606,6 +620,7 @@ mod tests {
|
||||
async fn new_device_manager() -> Result<Arc<RwLock<DeviceManager>>> {
|
||||
let hypervisor_name: &str = "qemu";
|
||||
let toml_config = load_test_config(hypervisor_name.to_owned())?;
|
||||
let topo_config = TopologyConfigInfo::new(&toml_config);
|
||||
let hypervisor_config = toml_config
|
||||
.hypervisor
|
||||
.get(hypervisor_name)
|
||||
@ -617,7 +632,7 @@ mod tests {
|
||||
.await;
|
||||
|
||||
let dm = Arc::new(RwLock::new(
|
||||
DeviceManager::new(Arc::new(hypervisor))
|
||||
DeviceManager::new(Arc::new(hypervisor), topo_config.as_ref())
|
||||
.await
|
||||
.context("device manager")?,
|
||||
));
|
||||
|
@ -5,27 +5,24 @@
|
||||
//
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
fs,
|
||||
path::{Path, PathBuf},
|
||||
process::Command,
|
||||
sync::{
|
||||
atomic::{AtomicU8, Ordering},
|
||||
Arc, RwLock,
|
||||
},
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use async_trait::async_trait;
|
||||
use lazy_static::lazy_static;
|
||||
use path_clean::PathClean;
|
||||
|
||||
use kata_sys_util::fs::get_base_name;
|
||||
|
||||
use crate::device::{
|
||||
hypervisor,
|
||||
pci_path::{PciPath, PciSlot},
|
||||
Device, DeviceType,
|
||||
use crate::{
|
||||
device::{
|
||||
pci_path::PciPath,
|
||||
topology::{do_add_pcie_endpoint, PCIeTopology},
|
||||
Device, DeviceType, PCIeDevice,
|
||||
},
|
||||
register_pcie_device, unregister_pcie_device, update_pcie_device, Hypervisor as hypervisor,
|
||||
};
|
||||
|
||||
pub const SYS_BUS_PCI_DRIVER_PROBE: &str = "/sys/bus/pci/drivers_probe";
|
||||
@ -43,35 +40,6 @@ const INTEL_IOMMU_PREFIX: &str = "dmar";
|
||||
const AMD_IOMMU_PREFIX: &str = "ivhd";
|
||||
const ARM_IOMMU_PREFIX: &str = "smmu";
|
||||
|
||||
lazy_static! {
|
||||
static ref GUEST_DEVICE_ID: Arc<AtomicU8> = Arc::new(AtomicU8::new(0_u8));
|
||||
static ref HOST_GUEST_MAP: Arc<RwLock<HashMap<String, String>>> =
|
||||
Arc::new(RwLock::new(HashMap::new()));
|
||||
}
|
||||
|
||||
// map host/guest bdf and the mapping saved into `HOST_GUEST_MAP`,
|
||||
// and return PciPath.
|
||||
pub fn generate_guest_pci_path(bdf: String) -> Result<PciPath> {
|
||||
let hg_map = HOST_GUEST_MAP.clone();
|
||||
let current_id = GUEST_DEVICE_ID.clone();
|
||||
|
||||
current_id.fetch_add(1, Ordering::SeqCst);
|
||||
let slot = current_id.load(Ordering::SeqCst);
|
||||
|
||||
// In some Hypervisors, dragonball, cloud-hypervisor or firecracker,
|
||||
// the device is directly connected to the bus without intermediary bus.
|
||||
// FIXME: Qemu's pci path needs to be implemented;
|
||||
let host_bdf = normalize_device_bdf(bdf.as_str());
|
||||
let guest_bdf = format!("0000:00:{:02x}.0", slot);
|
||||
|
||||
// safe, just do unwrap as `HOST_GUEST_MAP` is always valid.
|
||||
hg_map.write().unwrap().insert(host_bdf, guest_bdf);
|
||||
|
||||
Ok(PciPath {
|
||||
slots: vec![PciSlot::new(slot)],
|
||||
})
|
||||
}
|
||||
|
||||
pub fn do_check_iommu_on() -> Result<bool> {
|
||||
let element = std::fs::read_dir(SYS_CLASS_IOMMU)?
|
||||
.filter_map(|e| e.ok())
|
||||
@ -476,7 +444,13 @@ impl VfioDevice {
|
||||
|
||||
#[async_trait]
|
||||
impl Device for VfioDevice {
|
||||
async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> {
|
||||
async fn attach(
|
||||
&mut self,
|
||||
pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
h: &dyn hypervisor,
|
||||
) -> Result<()> {
|
||||
register_pcie_device!(self, pcie_topo)?;
|
||||
|
||||
if self
|
||||
.increase_attach_count()
|
||||
.await
|
||||
@ -494,37 +468,23 @@ impl Device for VfioDevice {
|
||||
self.devices = vfio.devices;
|
||||
}
|
||||
|
||||
if self.bus_mode == VfioBusMode::PCI {
|
||||
for hostdev in self.devices.iter_mut() {
|
||||
if hostdev.guest_pci_path.is_none() {
|
||||
// guest_pci_path may be empty for certain hypervisors such as
|
||||
// dragonball
|
||||
hostdev.guest_pci_path = Some(
|
||||
generate_guest_pci_path(hostdev.bus_slot_func.clone())
|
||||
.map_err(|e| anyhow!("generate pci path failed: {:?}", e))?,
|
||||
);
|
||||
}
|
||||
|
||||
// Safe to call unwrap here because of previous assignment.
|
||||
let pci_path = hostdev.guest_pci_path.clone().unwrap();
|
||||
self.device_options.push(format!(
|
||||
"0000:{}={}",
|
||||
hostdev.bus_slot_func.clone(),
|
||||
pci_path.to_string()
|
||||
));
|
||||
}
|
||||
}
|
||||
update_pcie_device!(self, pcie_topo)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
self.decrease_attach_count().await?;
|
||||
unregister_pcie_device!(self, pcie_topo)?;
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn detach(&mut self, h: &dyn hypervisor) -> Result<Option<u64>> {
|
||||
async fn detach(
|
||||
&mut self,
|
||||
pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
h: &dyn hypervisor,
|
||||
) -> Result<Option<u64>> {
|
||||
if self
|
||||
.decrease_attach_count()
|
||||
.await
|
||||
@ -545,6 +505,8 @@ impl Device for VfioDevice {
|
||||
None
|
||||
};
|
||||
|
||||
unregister_pcie_device!(self, pcie_topo)?;
|
||||
|
||||
Ok(device_index)
|
||||
}
|
||||
|
||||
@ -588,6 +550,48 @@ impl Device for VfioDevice {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl PCIeDevice for VfioDevice {
|
||||
async fn register(&mut self, pcie_topo: &mut PCIeTopology) -> Result<()> {
|
||||
if self.bus_mode != VfioBusMode::PCI {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
self.device_options.clear();
|
||||
for hostdev in self.devices.iter_mut() {
|
||||
let pci_path = do_add_pcie_endpoint(
|
||||
self.device_id.clone(),
|
||||
hostdev.guest_pci_path.clone(),
|
||||
pcie_topo,
|
||||
)
|
||||
.context(format!(
|
||||
"add pcie endpoint for host device {:?} in PCIe Topology failed",
|
||||
self.device_id
|
||||
))?;
|
||||
hostdev.guest_pci_path = Some(pci_path.clone());
|
||||
|
||||
self.device_options.push(format!(
|
||||
"0000:{}={}",
|
||||
hostdev.bus_slot_func,
|
||||
pci_path.to_string()
|
||||
));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn unregister(&mut self, pcie_topo: &mut PCIeTopology) -> Result<()> {
|
||||
if let Some(_slot) = pcie_topo.remove_device(&self.device_id.clone()) {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(anyhow!(
|
||||
"vfio device with {:?} not found.",
|
||||
self.device_id.clone()
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// binds the device to vfio driver after unbinding from host.
|
||||
// Will be called by a network interface or a generic pcie device.
|
||||
pub fn bind_device_to_vfio(bdf: &str, host_driver: &str, _vendor_device_id: &str) -> Result<()> {
|
||||
|
@ -9,7 +9,7 @@ use async_trait::async_trait;
|
||||
|
||||
use super::VhostUserConfig;
|
||||
use crate::{
|
||||
device::{Device, DeviceType},
|
||||
device::{topology::PCIeTopology, Device, DeviceType},
|
||||
Hypervisor as hypervisor,
|
||||
};
|
||||
|
||||
@ -45,7 +45,11 @@ impl VhostUserBlkDevice {
|
||||
|
||||
#[async_trait]
|
||||
impl Device for VhostUserBlkDevice {
|
||||
async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> {
|
||||
async fn attach(
|
||||
&mut self,
|
||||
_pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
h: &dyn hypervisor,
|
||||
) -> Result<()> {
|
||||
// increase attach count, skip attach the device if the device is already attached
|
||||
if self
|
||||
.increase_attach_count()
|
||||
@ -64,7 +68,11 @@ impl Device for VhostUserBlkDevice {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
async fn detach(&mut self, h: &dyn hypervisor) -> Result<Option<u64>> {
|
||||
async fn detach(
|
||||
&mut self,
|
||||
_pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
h: &dyn hypervisor,
|
||||
) -> Result<Option<u64>> {
|
||||
// get the count of device detached, and detach once it reaches 0
|
||||
if self
|
||||
.decrease_attach_count()
|
||||
|
@ -4,6 +4,7 @@
|
||||
use anyhow::{Context, Result};
|
||||
use async_trait::async_trait;
|
||||
|
||||
use crate::device::topology::PCIeTopology;
|
||||
use crate::device::{Device, DeviceType};
|
||||
use crate::{Hypervisor, VhostUserConfig};
|
||||
|
||||
@ -22,14 +23,22 @@ impl VhostUserNetDevice {
|
||||
|
||||
#[async_trait]
|
||||
impl Device for VhostUserNetDevice {
|
||||
async fn attach(&mut self, h: &dyn Hypervisor) -> Result<()> {
|
||||
async fn attach(
|
||||
&mut self,
|
||||
_pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
h: &dyn Hypervisor,
|
||||
) -> Result<()> {
|
||||
h.add_device(DeviceType::VhostUserNetwork(self.clone()))
|
||||
.await
|
||||
.context("add vhost-user-net device to hypervisor")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn detach(&mut self, h: &dyn Hypervisor) -> Result<Option<u64>> {
|
||||
async fn detach(
|
||||
&mut self,
|
||||
_pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
h: &dyn Hypervisor,
|
||||
) -> Result<Option<u64>> {
|
||||
h.remove_device(DeviceType::VhostUserNetwork(self.clone()))
|
||||
.await
|
||||
.context("remove vhost-user-net device from hypervisor")?;
|
||||
|
@ -5,6 +5,7 @@
|
||||
//
|
||||
|
||||
use crate::device::pci_path::PciPath;
|
||||
use crate::device::topology::PCIeTopology;
|
||||
use crate::device::Device;
|
||||
use crate::device::DeviceType;
|
||||
use crate::Hypervisor as hypervisor;
|
||||
@ -73,7 +74,11 @@ impl BlockDevice {
|
||||
|
||||
#[async_trait]
|
||||
impl Device for BlockDevice {
|
||||
async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> {
|
||||
async fn attach(
|
||||
&mut self,
|
||||
_pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
h: &dyn hypervisor,
|
||||
) -> Result<()> {
|
||||
// increase attach count, skip attach the device if the device is already attached
|
||||
if self
|
||||
.increase_attach_count()
|
||||
@ -98,7 +103,11 @@ impl Device for BlockDevice {
|
||||
}
|
||||
}
|
||||
|
||||
async fn detach(&mut self, h: &dyn hypervisor) -> Result<Option<u64>> {
|
||||
async fn detach(
|
||||
&mut self,
|
||||
_pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
h: &dyn hypervisor,
|
||||
) -> Result<Option<u64>> {
|
||||
// get the count of device detached, skip detach once it reaches the 0
|
||||
if self
|
||||
.decrease_attach_count()
|
||||
|
@ -7,7 +7,7 @@
|
||||
use anyhow::{Context, Result};
|
||||
use async_trait::async_trait;
|
||||
|
||||
use crate::device::{hypervisor, Device, DeviceType};
|
||||
use crate::device::{hypervisor, topology::PCIeTopology, Device, DeviceType};
|
||||
|
||||
#[derive(Copy, Clone, Debug, Default)]
|
||||
pub enum ShareFsMountOperation {
|
||||
@ -99,7 +99,11 @@ impl ShareFsDevice {
|
||||
|
||||
#[async_trait]
|
||||
impl Device for ShareFsDevice {
|
||||
async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> {
|
||||
async fn attach(
|
||||
&mut self,
|
||||
_pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
h: &dyn hypervisor,
|
||||
) -> Result<()> {
|
||||
h.add_device(DeviceType::ShareFs(self.clone()))
|
||||
.await
|
||||
.context("add share-fs device.")?;
|
||||
@ -107,7 +111,11 @@ impl Device for ShareFsDevice {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn detach(&mut self, _h: &dyn hypervisor) -> Result<Option<u64>> {
|
||||
async fn detach(
|
||||
&mut self,
|
||||
_pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
_h: &dyn hypervisor,
|
||||
) -> Result<Option<u64>> {
|
||||
// no need to detach share-fs device
|
||||
|
||||
Ok(None)
|
||||
|
@ -9,6 +9,7 @@ use std::fmt;
|
||||
use anyhow::{Context, Result};
|
||||
use async_trait::async_trait;
|
||||
|
||||
use crate::device::topology::PCIeTopology;
|
||||
use crate::device::{Device, DeviceType};
|
||||
use crate::Hypervisor as hypervisor;
|
||||
|
||||
@ -70,7 +71,11 @@ impl NetworkDevice {
|
||||
|
||||
#[async_trait]
|
||||
impl Device for NetworkDevice {
|
||||
async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> {
|
||||
async fn attach(
|
||||
&mut self,
|
||||
_pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
h: &dyn hypervisor,
|
||||
) -> Result<()> {
|
||||
h.add_device(DeviceType::Network(self.clone()))
|
||||
.await
|
||||
.context("add network device.")?;
|
||||
@ -78,7 +83,11 @@ impl Device for NetworkDevice {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
async fn detach(&mut self, h: &dyn hypervisor) -> Result<Option<u64>> {
|
||||
async fn detach(
|
||||
&mut self,
|
||||
_pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
h: &dyn hypervisor,
|
||||
) -> Result<Option<u64>> {
|
||||
h.remove_device(DeviceType::Network(self.clone()))
|
||||
.await
|
||||
.context("remove network device.")?;
|
||||
|
@ -12,7 +12,7 @@ use tokio::fs::{File, OpenOptions};
|
||||
use async_trait::async_trait;
|
||||
|
||||
use crate::{
|
||||
device::{Device, DeviceType},
|
||||
device::{topology::PCIeTopology, Device, DeviceType},
|
||||
Hypervisor as hypervisor,
|
||||
};
|
||||
|
||||
@ -49,7 +49,11 @@ impl HybridVsockDevice {
|
||||
|
||||
#[async_trait]
|
||||
impl Device for HybridVsockDevice {
|
||||
async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> {
|
||||
async fn attach(
|
||||
&mut self,
|
||||
_pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
h: &dyn hypervisor,
|
||||
) -> Result<()> {
|
||||
h.add_device(DeviceType::HybridVsock(self.clone()))
|
||||
.await
|
||||
.context("add hybrid vsock device.")?;
|
||||
@ -57,7 +61,11 @@ impl Device for HybridVsockDevice {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
async fn detach(&mut self, _h: &dyn hypervisor) -> Result<Option<u64>> {
|
||||
async fn detach(
|
||||
&mut self,
|
||||
_pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
_h: &dyn hypervisor,
|
||||
) -> Result<Option<u64>> {
|
||||
// no need to do detach, just return Ok(None)
|
||||
Ok(None)
|
||||
}
|
||||
@ -135,7 +143,11 @@ impl VsockDevice {
|
||||
|
||||
#[async_trait]
|
||||
impl Device for VsockDevice {
|
||||
async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> {
|
||||
async fn attach(
|
||||
&mut self,
|
||||
_pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
h: &dyn hypervisor,
|
||||
) -> Result<()> {
|
||||
h.add_device(DeviceType::Vsock(self.clone()))
|
||||
.await
|
||||
.context("add vsock device.")?;
|
||||
@ -143,7 +155,11 @@ impl Device for VsockDevice {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
async fn detach(&mut self, _h: &dyn hypervisor) -> Result<Option<u64>> {
|
||||
async fn detach(
|
||||
&mut self,
|
||||
_pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
_h: &dyn hypervisor,
|
||||
) -> Result<Option<u64>> {
|
||||
// no need to do detach, just return Ok(None)
|
||||
Ok(None)
|
||||
}
|
||||
|
@ -15,9 +15,12 @@ use crate::{
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
|
||||
use self::topology::PCIeTopology;
|
||||
|
||||
pub mod device_manager;
|
||||
pub mod driver;
|
||||
pub mod pci_path;
|
||||
pub mod topology;
|
||||
pub mod util;
|
||||
|
||||
#[derive(Debug)]
|
||||
@ -53,9 +56,17 @@ impl fmt::Display for DeviceType {
|
||||
#[async_trait]
|
||||
pub trait Device: std::fmt::Debug + Send + Sync {
|
||||
// attach is to plug device into VM
|
||||
async fn attach(&mut self, h: &dyn hypervisor) -> Result<()>;
|
||||
async fn attach(
|
||||
&mut self,
|
||||
pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
h: &dyn hypervisor,
|
||||
) -> Result<()>;
|
||||
// detach is to unplug device from VM
|
||||
async fn detach(&mut self, h: &dyn hypervisor) -> Result<Option<u64>>;
|
||||
async fn detach(
|
||||
&mut self,
|
||||
pcie_topo: &mut Option<&mut PCIeTopology>,
|
||||
h: &dyn hypervisor,
|
||||
) -> Result<Option<u64>>;
|
||||
// update is to do update for some device
|
||||
async fn update(&mut self, h: &dyn hypervisor) -> Result<()>;
|
||||
// get_device_info returns device config
|
||||
@ -71,3 +82,11 @@ pub trait Device: std::fmt::Debug + Send + Sync {
|
||||
// * err error: error while do decrease attach count
|
||||
async fn decrease_attach_count(&mut self) -> Result<bool>;
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
pub trait PCIeDevice: std::fmt::Debug + Send + Sync {
|
||||
// register pcie device into PCIe Topology for virtio-pci device or PCI/PCIe device.
|
||||
async fn register(&mut self, topology: &mut PCIeTopology) -> Result<()>;
|
||||
// unregister pcie device from PCIe Topology
|
||||
async fn unregister(&mut self, topology: &mut PCIeTopology) -> Result<()>;
|
||||
}
|
||||
|
366
src/runtime-rs/crates/hypervisor/src/device/topology.rs
Normal file
366
src/runtime-rs/crates/hypervisor/src/device/topology.rs
Normal file
@ -0,0 +1,366 @@
|
||||
//
|
||||
// Copyright (c) 2019-2023 Ant Group
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
/*
|
||||
The design originates from https://github.com/qemu/qemu/blob/master/docs/pcie.txt
|
||||
|
||||
In order to better support the PCIe topologies of different VMMs, we adopt a layered approach.
|
||||
The first layer is the base layer (the flatten PCIe topology). It consists mainly of the root bus
|
||||
and is used by VMMs that only support attaching devices directly to the root bus.
|
||||
However, not all VMMs have such simple PCIe topologies. For example, Qemu, which can fully simulate
|
||||
the PCIe topology of the host, has a complex PCIe topology. In this case, we need to add PCIe RootPort,
|
||||
PCIe Switch, and PCIe-PCI Bridge or pxb-pcie on top of the base layer, which is The Complex PCIe Topology.
|
||||
|
||||
The design graphs are shown below:
|
||||
|
||||
(1) The flatten PCIe Topology
|
||||
pcie.0 bus (Root Complex)
|
||||
----------------------------------------------------------------------------
|
||||
| | | | | | | | | | | | | | | .. |
|
||||
--|--------------------|------------------|-------------------------|-------
|
||||
| | | |
|
||||
V V V V
|
||||
----------- ----------- ----------- -----------
|
||||
| PCI Dev | | PCI Dev | | PCI Dev | | PCI Dev |
|
||||
----------- ----------- ----------- -----------
|
||||
|
||||
(2) The Complex PCIe Topology(It'll be implemented when Qemu is ready in runtime-rs)
|
||||
pcie.0 bus (Root Complex)
|
||||
----------------------------------------------------------------------------
|
||||
| | | | | | | | | | | | | | | .. |
|
||||
------|----------------|--------------------------------------|-------------
|
||||
| | |
|
||||
V V V
|
||||
------------- ------------- -------------
|
||||
| Root Port | | Root Port | | Root Port |
|
||||
------------- ------------- -------------
|
||||
| |
|
||||
| -------------------------|-----------------------
|
||||
------------ | ----------------- |
|
||||
| PCIe Dev | | PCI Express | Upstream Port | |
|
||||
------------ | Switch ----------------- |
|
||||
| | | |
|
||||
| ------------------- ------------------- |
|
||||
| | Downstream Port | | Downstream Port | |
|
||||
| ------------------- ------------------- |
|
||||
-------------|-----------------------|-----------
|
||||
------------
|
||||
| PCIe Dev |
|
||||
------------
|
||||
*/
|
||||
|
||||
use std::collections::{hash_map::Entry, HashMap};
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
|
||||
use crate::device::pci_path::PciSlot;
|
||||
use kata_types::config::hypervisor::TopologyConfigInfo;
|
||||
|
||||
use super::pci_path::PciPath;
|
||||
|
||||
const DEFAULT_PCIE_ROOT_BUS: &str = "pcie.0";
|
||||
// Currently, CLH and Dragonball support device attachment solely on the root bus.
|
||||
const DEFAULT_PCIE_ROOT_BUS_ADDRESS: &str = "0000:00";
|
||||
pub const PCIE_ROOT_BUS_SLOTS_CAPACITY: u32 = 32;
|
||||
|
||||
/// Pre-registers a device in the PCIe topology, when there is one.
///
/// Intended to be called from a device driver's `attach` before the device is really
/// attached to the VM, so that an available PCI path gets allocated for the device.
///
/// * `$self` - the device; it must offer an async `register(topology)` method.
/// * `$opt` - an expression that matches `Some(topology)` or `None`.
///
/// The expansion uses `.await`, so the macro can only be used in an async context.
/// It is equivalent to:
///
/// ```text
/// match pcie_topology {
///     Some(topology) => self.register(topology).await,
///     None => Ok(()),
/// }
/// ```
#[macro_export]
macro_rules! register_pcie_device {
    ($self:ident, $opt:expr) => {
        if let Some(topology) = $opt {
            $self.register(topology).await
        } else {
            Ok(())
        }
    };
}
|
||||
|
||||
/// Updates a device's entry in the PCIe topology, when there is one.
///
/// Some VMMs return device info whose guest PCI path differs from the one allocated in
/// the runtime, so the device info has to be brought up to date after the attach.
/// The macro itself performs no comparison: it simply calls the device's `register`
/// method again and leaves any reconciliation to that implementation.
///
/// * `$self` - the device; it must offer an async `register(topology)` method.
/// * `$opt` - an expression that matches `Some(topology)` or `None`.
///
/// The expansion uses `.await`, so the macro can only be used in an async context.
/// It is equivalent to:
///
/// ```text
/// match pcie_topology {
///     Some(topology) => self.register(topology).await,
///     None => Ok(()),
/// }
/// ```
#[macro_export]
macro_rules! update_pcie_device {
    ($self:ident, $opt:expr) => {
        if let Some(topology) = $opt {
            $self.register(topology).await
        } else {
            Ok(())
        }
    };
}
|
||||
|
||||
/// Unregisters a device from the PCIe topology, when there is one.
///
/// * `$self` - the device; it must offer an async `unregister(topology)` method.
/// * `$opt` - an expression that matches `Some(topology)` or `None`.
///
/// The expansion uses `.await`, so the macro can only be used in an async context.
/// It is equivalent to:
///
/// ```text
/// match pcie_topology {
///     Some(topology) => self.unregister(topology).await,
///     None => Ok(()),
/// }
/// ```
#[macro_export]
macro_rules! unregister_pcie_device {
    ($self:ident, $opt:expr) => {
        if let Some(topology) = $opt {
            $self.unregister(topology).await
        } else {
            Ok(())
        }
    };
}
|
||||
|
||||
/// Something that can be placed in the PCIe topology and identified by a device id.
pub trait PCIeDevice: Send + Sync {
    /// Returns the id that identifies this device.
    fn device_id(&self) -> &str;
}
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct PCIeEndpoint {
|
||||
// device_id for device in device manager
|
||||
pub device_id: String,
|
||||
// device's PCI Path in Guest
|
||||
pub pci_path: PciPath,
|
||||
// root_port for PCIe Device
|
||||
pub root_port: Option<PCIeRootPort>,
|
||||
|
||||
// device_type is for device virtio-pci/PCI or PCIe
|
||||
pub device_type: String,
|
||||
}
|
||||
|
||||
impl PCIeDevice for PCIeEndpoint {
    /// Returns the endpoint's `device_id` field as a string slice.
    fn device_id(&self) -> &str {
        &self.device_id
    }
}
|
||||
|
||||
/// Resources reserved on a PCIe root port.
#[derive(Clone, Debug, Default)]
pub struct ResourceReserved {
    /// Bus reservation. This needs patches to QEMU to work: the PCIE-PCI bridge can be
    /// hot-plugged only into a pcie-root-port whose 'bus-reserve' property value provides
    /// a secondary bus for the hot-plugged bridge.
    pub bus_reserve: String,

    /// Reserved prefetched MMIO aperture, 64-bit.
    pub pref64_reserve: String,
    /// Reserved prefetched MMIO aperture, 32-bit.
    pub pref32_reserve: String,
    /// Reserved non-prefetched MMIO aperture, 32-bit *only*.
    pub memory_reserve: String,

    /// IO reservation.
    pub io_reserve: String,
}
|
||||
|
||||
// PCIe Root Port
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct PCIeRootPort {
|
||||
// format: rp{n}, n>=0
|
||||
pub id: String,
|
||||
|
||||
// default is pcie.0
|
||||
pub bus: String,
|
||||
// >=0, default is 0x00
|
||||
pub address: String,
|
||||
|
||||
// (slot, chassis) pair is mandatory and must be unique for each pcie-root-port,
|
||||
// chassis >=0, default is 0x00
|
||||
pub chassis: u8,
|
||||
// slot >=0, default is 0x00
|
||||
pub slot: u8,
|
||||
|
||||
// multi_function is for PCIe Device passthrough
|
||||
// true => "on", false => "off", default is off
|
||||
pub multi_function: bool,
|
||||
|
||||
// reserved resource for some VMM, such as Qemu.
|
||||
pub resource_reserved: ResourceReserved,
|
||||
|
||||
// romfile specifies the ROM file being used for this device.
|
||||
pub romfile: String,
|
||||
}
|
||||
|
||||
// PCIe Root Complex
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct PCIeRootComplex {
|
||||
pub root_bus: String,
|
||||
pub root_bus_address: String,
|
||||
pub root_bus_devices: HashMap<String, PCIeEndpoint>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct PCIeTopology {
|
||||
pub hypervisor_name: String,
|
||||
pub root_complex: PCIeRootComplex,
|
||||
|
||||
pub bridges: u32,
|
||||
pub pcie_root_ports: u32,
|
||||
pub hotplug_vfio_on_root_bus: bool,
|
||||
}
|
||||
|
||||
impl PCIeTopology {
|
||||
// As some special case doesn't support PCIe devices, there's no need to build a PCIe Topology.
|
||||
pub fn new(config_info: Option<&TopologyConfigInfo>) -> Option<Self> {
|
||||
// if config_info is None, it will return None.
|
||||
let topo_config = config_info?;
|
||||
|
||||
let root_complex = PCIeRootComplex {
|
||||
root_bus: DEFAULT_PCIE_ROOT_BUS.to_owned(),
|
||||
root_bus_address: DEFAULT_PCIE_ROOT_BUS_ADDRESS.to_owned(),
|
||||
root_bus_devices: HashMap::with_capacity(PCIE_ROOT_BUS_SLOTS_CAPACITY as usize),
|
||||
};
|
||||
|
||||
Some(Self {
|
||||
hypervisor_name: topo_config.hypervisor_name.to_owned(),
|
||||
root_complex,
|
||||
bridges: topo_config.device_info.default_bridges,
|
||||
pcie_root_ports: topo_config.device_info.pcie_root_port,
|
||||
hotplug_vfio_on_root_bus: topo_config.device_info.hotplug_vfio_on_root_bus,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn insert_device(&mut self, ep: &mut PCIeEndpoint) -> Option<PciPath> {
|
||||
let to_pcipath = |v: u32| -> PciPath {
|
||||
PciPath {
|
||||
slots: vec![PciSlot(v as u8)],
|
||||
}
|
||||
};
|
||||
|
||||
let to_string = |v: u32| -> String { to_pcipath(v).to_string() };
|
||||
|
||||
// find the first available index as the allocated slot.
|
||||
let allocated_slot = (0..PCIE_ROOT_BUS_SLOTS_CAPACITY).find(|&i| {
|
||||
!self
|
||||
.root_complex
|
||||
.root_bus_devices
|
||||
.contains_key(&to_string(i))
|
||||
})?;
|
||||
|
||||
let pcipath = to_string(allocated_slot);
|
||||
|
||||
// update pci_path in Endpoint
|
||||
ep.pci_path = to_pcipath(allocated_slot);
|
||||
// convert the allocated slot to pci path and then insert it with ep
|
||||
self.root_complex
|
||||
.root_bus_devices
|
||||
.insert(pcipath, ep.clone());
|
||||
|
||||
Some(to_pcipath(allocated_slot))
|
||||
}
|
||||
|
||||
pub fn remove_device(&mut self, device_id: &str) -> Option<String> {
|
||||
let mut target_device: Option<String> = None;
|
||||
self.root_complex.root_bus_devices.retain(|k, v| {
|
||||
if v.device_id() != device_id {
|
||||
true
|
||||
} else {
|
||||
target_device = Some((*k).to_string());
|
||||
false
|
||||
}
|
||||
});
|
||||
|
||||
target_device
|
||||
}
|
||||
|
||||
pub fn update_device(&mut self, ep: &PCIeEndpoint) -> Option<PciPath> {
|
||||
let pci_addr = ep.pci_path.clone();
|
||||
|
||||
// First, find the PCIe Endpoint corresponding to the endpoint in the Hash Map based on the PCI path.
|
||||
// If found, it means that we do not need to update the device's position in the Hash Map.
|
||||
// If not found, it means that the PCI Path corresponding to the device has changed, and the device's
|
||||
// position in the Hash Map needs to be updated.
|
||||
match self
|
||||
.root_complex
|
||||
.root_bus_devices
|
||||
.entry(pci_addr.to_string())
|
||||
{
|
||||
Entry::Occupied(_) => None,
|
||||
Entry::Vacant(_entry) => {
|
||||
self.remove_device(&ep.device_id);
|
||||
self.root_complex
|
||||
.root_bus_devices
|
||||
.insert(pci_addr.to_string(), ep.clone());
|
||||
|
||||
Some(pci_addr)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn find_device(&mut self, device_id: &str) -> bool {
|
||||
for v in self.root_complex.root_bus_devices.values() {
|
||||
info!(
|
||||
sl!(),
|
||||
"find_device with: {:?}, {:?}.",
|
||||
&device_id,
|
||||
v.device_id()
|
||||
);
|
||||
if v.device_id() == device_id {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
pub fn do_insert_or_update(&mut self, pciep: &mut PCIeEndpoint) -> Result<PciPath> {
|
||||
// Try to check whether the device is present in the PCIe Topology.
|
||||
// If the device dosen't exist, it proceeds to register it within the topology
|
||||
let pci_path = if !self.find_device(&pciep.device_id) {
|
||||
// Register a device within the PCIe topology, allocating and assigning it an available PCI Path.
|
||||
// Upon successful insertion, it updates the pci_path in PCIeEndpoint and returns it.
|
||||
// Finally, update both the guest_pci_path and devices_options with the allocated PciPath.
|
||||
if let Some(pci_addr) = self.insert_device(pciep) {
|
||||
pci_addr
|
||||
} else {
|
||||
return Err(anyhow!("pci path allocated failed."));
|
||||
}
|
||||
} else {
|
||||
// If the device exists, it proceeds to update its pcipath within
|
||||
// the topology and the device's guest_pci_path and device_options.
|
||||
if let Some(pci_addr) = self.update_device(pciep) {
|
||||
pci_addr
|
||||
} else {
|
||||
return Ok(pciep.pci_path.clone());
|
||||
}
|
||||
};
|
||||
|
||||
Ok(pci_path)
|
||||
}
|
||||
}
|
||||
|
||||
// do_add_pcie_endpoint do add a device into PCIe topology with pcie endpoint
|
||||
// device_id: device's Unique ID in Device Manager.
|
||||
// allocated_pcipath: allocated pcipath before add_device
|
||||
// topology: PCIe Topology for devices to build a PCIe Topology in Guest.
|
||||
pub fn do_add_pcie_endpoint(
|
||||
device_id: String,
|
||||
allocated_pcipath: Option<PciPath>,
|
||||
topology: &mut PCIeTopology,
|
||||
) -> Result<PciPath> {
|
||||
let pcie_endpoint = &mut PCIeEndpoint {
|
||||
device_type: "PCIe".to_string(),
|
||||
device_id,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
if let Some(pci_path) = allocated_pcipath {
|
||||
pcie_endpoint.pci_path = pci_path;
|
||||
}
|
||||
|
||||
topology.do_insert_or_update(pcie_endpoint)
|
||||
}
|
@ -17,7 +17,7 @@ use hypervisor::{
|
||||
},
|
||||
BlockConfig, Hypervisor, VfioConfig,
|
||||
};
|
||||
use kata_types::config::TomlConfig;
|
||||
use kata_types::config::{hypervisor::TopologyConfigInfo, TomlConfig};
|
||||
use kata_types::mount::Mount;
|
||||
use oci::{Linux, LinuxCpu, LinuxResources};
|
||||
use persist::sandbox_persist::Persist;
|
||||
@ -59,8 +59,9 @@ impl ResourceManagerInner {
|
||||
toml_config: Arc<TomlConfig>,
|
||||
init_size_manager: InitialSizeManager,
|
||||
) -> Result<Self> {
|
||||
let topo_config = TopologyConfigInfo::new(&toml_config);
|
||||
// create device manager
|
||||
let dev_manager = DeviceManager::new(hypervisor.clone())
|
||||
let dev_manager = DeviceManager::new(hypervisor.clone(), topo_config.as_ref())
|
||||
.await
|
||||
.context("failed to create device manager")?;
|
||||
|
||||
@ -510,12 +511,14 @@ impl Persist for ResourceManagerInner {
|
||||
sid: resource_args.sid.clone(),
|
||||
config: resource_args.config,
|
||||
};
|
||||
let topo_config = TopologyConfigInfo::new(&args.config);
|
||||
|
||||
Ok(Self {
|
||||
sid: resource_args.sid,
|
||||
agent: resource_args.agent,
|
||||
hypervisor: resource_args.hypervisor.clone(),
|
||||
device_manager: Arc::new(RwLock::new(
|
||||
DeviceManager::new(resource_args.hypervisor).await?,
|
||||
DeviceManager::new(resource_args.hypervisor, topo_config.as_ref()).await?,
|
||||
)),
|
||||
network: None,
|
||||
share_fs: None,
|
||||
|
@ -9,6 +9,7 @@ mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use kata_types::config::hypervisor::TopologyConfigInfo;
|
||||
use netlink_packet_route::MACVLAN_MODE_PRIVATE;
|
||||
use scopeguard::defer;
|
||||
use tests_utils::load_test_config;
|
||||
@ -29,6 +30,7 @@ mod tests {
|
||||
async fn get_device_manager() -> Result<Arc<RwLock<DeviceManager>>> {
|
||||
let hypervisor_name: &str = "qemu";
|
||||
let toml_config = load_test_config(hypervisor_name.to_owned())?;
|
||||
let topo_config = TopologyConfigInfo::new(&toml_config);
|
||||
let hypervisor_config = toml_config
|
||||
.hypervisor
|
||||
.get(hypervisor_name)
|
||||
@ -40,7 +42,7 @@ mod tests {
|
||||
.await;
|
||||
|
||||
let dm = Arc::new(RwLock::new(
|
||||
DeviceManager::new(Arc::new(hypervisor))
|
||||
DeviceManager::new(Arc::new(hypervisor), topo_config.as_ref())
|
||||
.await
|
||||
.context("device manager")?,
|
||||
));
|
||||
|
Loading…
Reference in New Issue
Block a user