diff --git a/Cargo.lock b/Cargo.lock index 577b56385b..07962d9ead 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2937,6 +2937,7 @@ dependencies = [ "libc", "logging", "nix 0.26.4", + "once_cell", "path-clean", "persist", "protocols", @@ -2944,6 +2945,7 @@ dependencies = [ "qapi-qmp", "qapi-spec", "rand 0.8.5", + "regex", "rust-ini", "safe-path 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "seccompiler", diff --git a/src/runtime-rs/crates/hypervisor/Cargo.toml b/src/runtime-rs/crates/hypervisor/Cargo.toml index 41d409537e..f450c599ef 100644 --- a/src/runtime-rs/crates/hypervisor/Cargo.toml +++ b/src/runtime-rs/crates/hypervisor/Cargo.toml @@ -36,7 +36,8 @@ qapi-spec = "0.3.2" qapi-qmp = "0.15.0" hyperlocal = { workspace = true } hyper = { workspace = true, features = ["client"] } - +regex = "1" +once_cell = "1.21.3" # Local dependencies kata-sys-util = { workspace = true } kata-types = { workspace = true } diff --git a/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs b/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs index fd07744e42..e2e16fd9fe 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs @@ -254,7 +254,8 @@ impl DeviceManager { DeviceType::HybridVsock(_) | DeviceType::Vsock(_) | DeviceType::Protection(_) - | DeviceType::PortDevice(_) => { + | DeviceType::PortDevice(_) + | DeviceType::VfioModern(_) => { continue; } } diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs index a108a9a90f..277a4420a4 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs @@ -7,6 +7,7 @@ mod port_device; mod protection_device; mod vfio; +pub mod vfio_device; mod vhost_user; pub mod vhost_user_blk; mod vhost_user_net; @@ -21,6 +22,7 @@ pub use vfio::{ bind_device_to_host, bind_device_to_vfio, 
get_vfio_device, HostDevice, VfioBusMode, VfioConfig, VfioDevice, }; +pub use vfio_device::{VfioDeviceBase, VfioDeviceModern, VfioDeviceModernHandle}; pub use vhost_user::{VhostUserConfig, VhostUserDevice, VhostUserType}; pub use vhost_user_net::VhostUserNetDevice; pub use virtio_blk::{ diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/core.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/core.rs new file mode 100644 index 0000000000..6bfa41a1ba --- /dev/null +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/core.rs @@ -0,0 +1,767 @@ +// Copyright (c) 2026 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::{anyhow, Context, Result}; +use regex::Regex; +use serde::{Deserialize, Serialize}; + +use std::collections::BTreeMap; +use std::fs; +use std::os::unix::fs::{FileTypeExt, MetadataExt}; +use std::path::{Path, PathBuf}; + +/// Path constants for VFIO and IOMMU sysfs/dev interfaces +const DEV_VFIO: &str = "/dev/vfio"; +const SYS_IOMMU_GROUPS: &str = "/sys/kernel/iommu_groups"; +const SYS_PCI_DEVS: &str = "/sys/bus/pci/devices"; +const DEV_IOMMU: &str = "/dev/iommu"; +const DEV_VFIO_DEVICES: &str = "/dev/vfio/devices"; +const SYS_CLASS_VFIO_DEV: &str = "/sys/class/vfio-dev"; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VfioIommufdBackend { + /// Host global IOMMUFD device node (/dev/iommu) + pub iommufd_dev: PathBuf, + /// The per-device VFIO cdev nodes required for this assignment + pub cdevs: Vec, +} + +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +pub struct VfioDevice { + pub id: String, + pub device_type: VfioDeviceType, + pub bus_mode: VfioBusMode, + + /// Metadata for Legacy VFIO backend + pub iommu_group: Option, + pub iommu_group_id: Option, + + /// Metadata for IOMMUFD backend + pub iommufd: Option, + + /// Common device information + pub devices: Vec, + /// The representative primary device for this assignment unit + pub primary: DeviceInfo, 
+ pub labels: BTreeMap, + pub health: Health, +} + +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] +pub enum VfioDeviceType { + #[default] + Normal, + MediatedPci, + MediatedAp, + Error, +} + +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] +pub enum VfioBusMode { + #[default] + Mmio, + Pci, + Ccw, +} + +/// PCI Bus-Device-Function (BDF) Address representation +#[derive(Debug, Default, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct BdfAddress { + pub domain: u16, + pub bus: u8, + pub device: u8, + pub function: u8, +} + +impl BdfAddress { + pub fn new(domain: u16, bus: u8, device: u8, function: u8) -> Self { + Self { + domain, + bus, + device, + function, + } + } + + /// Parses a BDF string in formats like "0000:01:00.0" or "01:00.0" + pub fn parse(s: &str) -> Result { + let parts: Vec<&str> = s.split(':').collect(); + + let (domain, bus_str, bus_dev_func) = match parts.len() { + 2 => (0u16, parts[0], parts[1]), + 3 => { + let domain = u16::from_str_radix(parts[0], 16).context("Invalid domain hex")?; + (domain, parts[1], parts[2]) + } + _ => return Err(anyhow!("Invalid BDF format: {}", s)), + }; + + let bus = u8::from_str_radix(bus_str, 16).context("Invalid bus hex")?; + + let dev_func: Vec<&str> = bus_dev_func.split('.').collect(); + if dev_func.len() != 2 { + return Err(anyhow!("Invalid device.function format")); + } + + let device = u8::from_str_radix(dev_func[0], 16).context("Invalid device hex")?; + let function = u8::from_str_radix(dev_func[1], 16).context("Invalid function hex")?; + + Ok(Self { + domain, + bus, + device, + function, + }) + } + + pub fn to_short_string(&self) -> String { + format!("{:02x}:{:02x}.{:x}", self.bus, self.device, self.function) + } +} + +impl std::fmt::Display for BdfAddress { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{:04x}:{:02x}:{:02x}.{:x}", + self.domain, self.bus, self.device, self.function + ) + } +} + 
+#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum DeviceAddress { + Pci(BdfAddress), + Ccw(String), + Mmio(String), + MdevUuid(String), +} + +impl std::fmt::Display for DeviceAddress { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + DeviceAddress::Pci(bdf) => write!(f, "{bdf}"), + DeviceAddress::Ccw(s) => write!(f, "{s}"), + DeviceAddress::Mmio(s) => write!(f, "{s}"), + DeviceAddress::MdevUuid(s) => write!(f, "{s}"), + } + } +} + +impl Default for DeviceAddress { + fn default() -> Self { + DeviceAddress::Pci(BdfAddress::default()) + } +} + +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +pub struct DeviceInfo { + /// Logical address on the specific bus + pub addr: DeviceAddress, + + /// Hardware identification (may be missing for non-PCI/mdev) + pub vendor_id: Option, + pub device_id: Option, + pub class_code: Option, + + /// Active kernel driver (e.g., "vfio-pci") + pub driver: Option, + + /// Parent IOMMU group (critical for legacy passthrough) + pub iommu_group_id: Option, + + /// Proximity to CPU/Memory (sysfs reports -1 for no specific node) + pub numa_node: Option, + + /// Canonical path in sysfs + pub sysfs_path: PathBuf, + + /// VFIO character device node (e.g., /dev/vfio/devices/vfio0) + /// Only populated if the kernel/hardware supports device-centric VFIO + pub vfio_cdev: Option, +} + +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum Health { + #[default] + Healthy, + Unhealthy, + Unknown, +} + +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +pub struct VfioGroup { + pub group_id: u32, + pub devnode: PathBuf, + pub vfio_ctl: PathBuf, + /// Aggregated VFIO cdev nodes for all devices within this group + pub vfio_cdevs: Vec, + pub devices: Vec, + // primary device used for labeling and identification + pub primary: DeviceInfo, + pub labels: BTreeMap, + pub is_viable: bool, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub 
struct VfioCdev { + /// Instance name (e.g., "vfio0") + pub name: String, + /// Device node path (/dev/vfio/devices/vfio0) + pub devnode: PathBuf, + /// Character device major number + pub major: Option, + /// Character device minor number + pub minor: Option, + pub sysfs_path: PathBuf, + /// Associated PCI BDF if applicable + pub bdf: Option, + pub group_id: Option, +} + +fn read_trim(path: impl AsRef) -> Option { + fs::read_to_string(path.as_ref()) + .ok() + .map(|s| s.trim().to_string()) +} + +fn parse_i32(path: impl AsRef) -> Option { + read_trim(path).and_then(|s| s.parse::().ok()) +} + +fn driver_name(pci_dev_path: &Path) -> Option { + let link = fs::read_link(pci_dev_path.join("driver")).ok()?; + link.file_name().map(|n| n.to_string_lossy().to_string()) +} + +fn parse_bdf_str(s: &str) -> Result { + // Standard format: "0000:65:00.0" + let re = Regex::new( + r"^(?P[0-9a-fA-F]{4}):(?P[0-9a-fA-F]{2}):(?P[0-9a-fA-F]{2})\.(?P[0-7])$", + ) + .unwrap(); + let cap = re + .captures(s) + .ok_or_else(|| anyhow!("invalid BDF format: {s}"))?; + Ok(BdfAddress { + domain: u16::from_str_radix(&cap["d"], 16)?, + bus: u8::from_str_radix(&cap["b"], 16)?, + device: u8::from_str_radix(&cap["dev"], 16)?, + function: (cap["f"]).parse::()?, + }) +} + +/// Scans sysfs to find all PCI devices belonging to a specific IOMMU group +fn discover_group_devices(group_id: u32) -> Result> { + let mut out = vec![]; + let group_dir = Path::new(SYS_IOMMU_GROUPS) + .join(group_id.to_string()) + .join("devices"); + + for ent in + fs::read_dir(&group_dir).context(format!("Failed to read {}", group_dir.display()))? 
+ { + let ent = ent?; + let bdf_str = ent.file_name().to_string_lossy().to_string(); + let pci_path = Path::new(SYS_PCI_DEVS).join(&bdf_str); + + if !pci_path.exists() { + continue; + } + + let bdf = parse_bdf_str(&bdf_str)?; + let vendor_id = read_trim(pci_path.join("vendor")); + let device_id = read_trim(pci_path.join("device")); + let class_code = read_trim(pci_path.join("class")) + .as_deref() + .and_then(parse_class_code_u32); + let driver = driver_name(&pci_path); + + let numa_node = + parse_i32(pci_path.join("numa_node")).and_then(|n| if n < 0 { None } else { Some(n) }); + + out.push(DeviceInfo { + addr: DeviceAddress::Pci(bdf), + vendor_id, + device_id, + class_code, + driver, + iommu_group_id: Some(group_id), + numa_node, + sysfs_path: pci_path, + vfio_cdev: None, // Populated later + }); + } + + // Ensure deterministic ordering + out.sort_by(|a, b| a.sysfs_path.cmp(&b.sysfs_path)); + Ok(out) +} + +/// Generates descriptive labels for an IOMMU group (e.g., identifying GPUs) +fn build_group_labels(devs: &[DeviceInfo]) -> BTreeMap { + let mut labels = BTreeMap::new(); + let mut gpu = false; + let mut vendor: Option = None; + + for d in devs { + if vendor.is_none() { + vendor = d.vendor_id.clone(); + } + + // PCI Class Code layout: 0xBBSSPP (Base Class, Sub Class, Programming Interface) + if let Some(class_code) = d.class_code { + let base = ((class_code >> 16) & 0xff) as u8; + let sub = ((class_code >> 8) & 0xff) as u8; + + // Base 0x03 = Display controller + // Sub 0x00 = VGA compatible, 0x02 = 3D controller (NVIDIA/AMD) + if base == 0x03 && (sub == 0x00 || sub == 0x02) { + gpu = true; + } + } + } + + if let Some(v) = vendor { + labels.insert("vendor".into(), v); + } + labels.insert("gpu".into(), gpu.to_string()); + labels +} + +/// Validates that an IOMMU group can be safely passed through. +/// Note: Bridges and Host Controllers in the group are ignored as they cannot be passed to guests. 
+fn validate_group_basic(devices: &[DeviceInfo]) -> bool { + // Current minimal check: group must not be empty. + // Production logic may include blacklisting specific device classes. + for device in devices.iter() { + if let DeviceAddress::Pci(bdf) = &device.addr { + // filter host or PCI bridge + let bdf_str = bdf.to_string(); + // Filter out host or PCI bridges (cannot be passed through) + if filter_bridge_device(&bdf_str, 0x0600).is_some() { + continue; + } + } + } + + !devices.is_empty() +} + +fn get_device_property(device_bdf: &str, property: &str) -> Result { + let dev_sys_path = Path::new(SYS_PCI_DEVS).join(device_bdf); + let cfg_path = fs::read_to_string(dev_sys_path.join(property)).with_context(|| { + format!( + "failed to read property {} for device {}", + property, device_bdf + ) + })?; + + Ok(cfg_path.trim().to_string()) +} + +/// Filters for Host or PCI bridges within an IOMMU group. +/// PCI Bridge: Class 0x0604, Host Bridge: Class 0x0600. +fn filter_bridge_device(bdf: &str, bitmask: u64) -> Option { + let device_class = get_device_property(bdf, "class").unwrap_or_default(); + + if device_class.is_empty() { + return None; + } + + match device_class.parse::() { + Ok(cid_u32) => { + // PCI class code is 24 bits, shift right 8 to get base+sub class + let class_code = u64::from(cid_u32) >> 8; + if class_code & bitmask == bitmask { + Some(class_code) + } else { + None + } + } + _ => None, + } +} + +fn parse_class_code_u32(s: &str) -> Option { + let t = s.trim().strip_prefix("0x").unwrap_or(s.trim()); + u32::from_str_radix(t, 16).ok() +} + +/// Determines device priority for selection as the 'Primary' device of a group. +/// GPUs take precedence, followed by Network and Storage controllers. 
+fn class_priority(class_code: Option) -> u8 { + let Some(c) = class_code else { return 255 }; + let base = ((c >> 16) & 0xff) as u8; + let sub = ((c >> 8) & 0xff) as u8; + + match (base, sub) { + (0x03, 0x00) | (0x03, 0x02) => 0, // VGA/3D GPU + (0x02, _) => 10, // Network controller + (0x01, _) => 20, // Mass storage + _ => 100, // Other + } +} + +/// Picks the most significant device in a group to act as the primary identifier. +fn select_primary_device(devs: &[DeviceInfo]) -> DeviceInfo { + assert!(!devs.is_empty()); + + devs.iter() + .min_by(|a, b| { + let pa = class_priority(a.class_code); + let pb = class_priority(b.class_code); + if pa != pb { + return pa.cmp(&pb); + } + + // Fallback to function number if classes are identical + let fa = match &a.addr { + DeviceAddress::Pci(bdf) => bdf.function, + _ => u8::MAX, + }; + let fb = match &b.addr { + DeviceAddress::Pci(bdf) => bdf.function, + _ => u8::MAX, + }; + fa.cmp(&fb) + }) + .cloned() + .unwrap() +} + +fn is_char_dev(p: &Path) -> bool { + fs::metadata(p) + .map(|m| m.file_type().is_char_device()) + .unwrap_or(false) +} + +/// Extracts the IOMMU group ID from a PCI device's sysfs link. +fn vfio_group_id_from_pci(bdf: &str) -> Option { + let link = fs::read_link(Path::new(SYS_PCI_DEVS).join(bdf).join("iommu_group")).ok()?; + link.file_name()?.to_string_lossy().parse::().ok() +} + +/// Locates the VFIO character device (cdev) for a given PCI BDF. +/// Path: /sys/bus/pci/devices//vfio-dev/vfioX +fn discover_vfio_cdev_for_pci(bdf: &str, gid: u32) -> Option { + let pci_path = Path::new(SYS_PCI_DEVS).join(bdf); + let vfio_dev_dir = pci_path.join("vfio-dev"); + let rd = fs::read_dir(&vfio_dev_dir).ok()?; + for e in rd.flatten() { + let name = e.file_name().to_string_lossy().to_string(); + if !name.starts_with("vfio") { + continue; + } + return discover_vfio_cdev_by_name(&name, Some(bdf.to_string()), Some(gid)); + } + None +} + +/// Extracts major/minor device numbers from a file's metadata. 
+fn stat_major_minor(path: &Path) -> Option<(u32, u32)> { + let md = fs::metadata(path).ok()?; + let rdev = md.rdev(); + Some((linux_major(rdev), linux_minor(rdev))) +} + +fn discover_vfio_cdev_by_name( + vfio_name: &str, + bdf: Option, + gid: Option, +) -> Option { + let devnode = Path::new(DEV_VFIO_DEVICES).join(vfio_name); + if !is_char_dev(&devnode) { + return None; + } + let (major, minor) = stat_major_minor(&devnode).unwrap_or((0, 0)); + Some(VfioCdev { + name: vfio_name.to_string(), + devnode, + major: if major == 0 && minor == 0 { + None + } else { + Some(major) + }, + minor: if major == 0 && minor == 0 { + None + } else { + Some(minor) + }, + sysfs_path: Path::new(SYS_CLASS_VFIO_DEV).join(vfio_name), + bdf, + group_id: gid, + }) +} + +/// Discovers the VFIO device context based on a /dev/vfio/devices/vfio path. +pub fn discover_vfio_device(vfio_device: &Path) -> Result { + if vfio_device.exists() && is_char_dev(vfio_device) { + let vfio_name = vfio_device + .file_name() + .ok_or_else(|| anyhow!("Invalid vfio device path"))? + .to_string_lossy() + .to_string(); + + // Resolve VFIO name to BDF via sysfs symlink + let dev_link = fs::read_link( + Path::new(SYS_CLASS_VFIO_DEV) + .join(&vfio_name) + .join("device"), + ) + .with_context(|| format!("failed to read sysfs device link for {}", vfio_name))?; + + let bdf = dev_link + .file_name() + .ok_or_else(|| anyhow!("Malformed vfio-dev symlink for {}", vfio_name))? + .to_string_lossy() + .to_string(); + + // Resolve BDF to IOMMU group. On iommufd-first hosts there is often no legacy + // /dev/vfio/ node — only /dev/vfio/devices/vfioX cdevs exist — so use the + // cdev we were given as the group char dev when legacy is absent. 
+ let gid = vfio_group_id_from_pci(&bdf) + .ok_or_else(|| anyhow!("could not resolve IOMMU group for {}", bdf))?; + let legacy = Path::new(DEV_VFIO).join(gid.to_string()); + let group_devnode = if legacy.exists() && is_char_dev(&legacy) { + legacy + } else { + vfio_device.to_path_buf() + }; + discover_vfio_device_for_iommu_group(gid, group_devnode) + } else { + Err(anyhow!("vfio device {} not found", vfio_device.display())) + } +} + +fn parse_dev_vfio_group_id(s: &str) -> Option { + // Extracts numeric ID from "/dev/vfio/12" or just "12" + let base = Path::new(s).file_name()?.to_string_lossy(); + base.parse::().ok() +} + +/// Per-device cdev under iommufd (`/dev/vfio/devices/vfioN`). Matches Go +/// `strings.HasPrefix(HostPath, IommufdDevPath)` in `pkg/device/drivers/vfio.go`, not only +/// [`Path::starts_with`]: component-wise path prefix can disagree with string prefix for some +/// `OsStr` forms, so we use the same string rule as the Go runtime. +fn is_iommufd_devices_cdev_path(path: &Path) -> bool { + let s = path.to_string_lossy(); + if !s.starts_with(DEV_VFIO_DEVICES) { + return false; + } + match s.as_bytes().get(DEV_VFIO_DEVICES.len()) { + None => true, + Some(b'/') => true, + Some(_) => false, + } +} + +/// Main entry point: Discovers a VFIO device unit based on an IOMMU group path (/dev/vfio/) +/// +/// CDI / device plugins often pass the per-device cdev (`/dev/vfio/devices/vfioX`) as the only +/// host path; that is stored as `iommu_group_devnode` without setting `iommu_device_node`. +/// Treat those like [`discover_vfio_device`]. +pub fn discover_vfio_group_device(host_path: PathBuf) -> Result { + if is_iommufd_devices_cdev_path(&host_path) { + return discover_vfio_device(&host_path); + } + let gid = parse_dev_vfio_group_id(&host_path.to_string_lossy()) + .ok_or_else(|| anyhow!("Invalid VFIO group path: {}", host_path.display()))?; + discover_vfio_device_for_iommu_group(gid, host_path) +} + +/// Builds [`VfioDevice`] for IOMMU group `gid`. 
+/// +/// `group_devnode` is the char device used to represent the group for metadata/health: +/// typically `/dev/vfio/` (legacy) or `/dev/vfio/devices/vfioX` when legacy nodes are absent. +fn discover_vfio_device_for_iommu_group(gid: u32, group_devnode: PathBuf) -> Result { + let vfio_ctl = Path::new(DEV_VFIO).join("vfio"); + if !vfio_ctl.exists() { + return Err(anyhow!("VFIO control node missing: {}", vfio_ctl.display())); + } + + let devnode = group_devnode; + let mut devices = discover_group_devices(gid)?; + if devices.is_empty() { + return Err(anyhow!("IOMMU group {} contains no PCI devices", gid)); + } + + // Populate per-device VFIO cdevs (required for IOMMUFD backend) + for d in devices.iter_mut() { + if let DeviceAddress::Pci(bdf) = &d.addr { + d.vfio_cdev = discover_vfio_cdev_for_pci(&bdf.to_string(), gid); + } + } + + let labels = build_group_labels(&devices); + let is_viable = validate_group_basic(&devices); + let primary_device = select_primary_device(&devices); + + let group = VfioGroup { + group_id: gid, + devnode: devnode.clone(), + vfio_ctl: vfio_ctl.clone(), + devices: devices.clone(), + primary: primary_device.clone(), + labels: labels.clone(), + is_viable, + vfio_cdevs: devices + .iter() + .filter_map(|d| d.vfio_cdev.as_ref().map(|c| c.devnode.clone())) + .collect(), + }; + + // Construct IOMMUFD backend context (Best-effort discovery) + let iommufd_backend = { + let iommu_dev = PathBuf::from(DEV_IOMMU); + if is_char_dev(&iommu_dev) { + let mut cdevs: Vec = + devices.iter().filter_map(|d| d.vfio_cdev.clone()).collect(); + cdevs.sort_by(|a, b| a.devnode.cmp(&b.devnode)); + cdevs.dedup_by(|a, b| a.devnode == b.devnode); + if !cdevs.is_empty() { + Some(VfioIommufdBackend { + iommufd_dev: iommu_dev, + cdevs, + }) + } else { + None + } + } else { + None + } + }; + + let health = if is_viable && devnode.exists() && is_char_dev(&devnode) { + Health::Healthy + } else { + Health::Unhealthy + }; + + Ok(VfioDevice { + id: format!("vfio-group-{}", gid), + 
device_type: VfioDeviceType::Normal, + bus_mode: VfioBusMode::Pci, + iommu_group: Some(group), + iommu_group_id: Some(gid), + iommufd: iommufd_backend, + devices, + primary: primary_device, + labels, + health, + }) +} + +/// Resolves an IOMMUFD-style VFIO device cdev (/dev/vfio/devices/vfioX) +/// back to its PCI BDF and IOMMU group ID. +#[allow(dead_code)] +pub fn vfio_cdev_to_bdf_and_group(vfio_cdev: impl AsRef) -> Result<(String, u32)> { + let vfio_cdev = vfio_cdev.as_ref(); + + let (major, minor) = major_minor_from_char_device(vfio_cdev).context(format!( + "Failed to get major/minor for {}", + vfio_cdev.display() + ))?; + + // Map char device to its sysfs entry + let sys_dev_char = PathBuf::from(format!("/sys/dev/char/{major}:{minor}")); + let resolved = fs::canonicalize(&sys_dev_char) + .context(format!("failed to canonicalize {}", sys_dev_char.display()))?; + + // Parse the sysfs path to find the associated PCI device + let bdf = extract_last_pci_bdf(&resolved) + .context(format!("no PCI BDF found in path {}", resolved.display()))?; + + // Get IOMMU group, with a fallback to manual path scanning if the symlink is missing + let group_id = iommu_group_id_for_bdf(&bdf).or_else(|primary_err| { + group_id_from_path(&resolved).map_err(|fallback_err| { + anyhow!( + "failed to resolve group for BDF {bdf}: {primary_err}; fallback scan also failed: {fallback_err}" + ) + }) + })?; + + Ok((bdf, group_id)) +} + +/// Extract (major, minor) from a char device node. +/// Uses Linux's encoding macros (same logic as gnu libc major()/minor()). +fn major_minor_from_char_device(p: &Path) -> Result<(u32, u32)> { + let md = fs::metadata(p).context(format!("stat failed for {}", p.display()))?; + if !md.file_type().is_char_device() { + return Err(anyhow!("{} is not a character device", p.display())); + } + + let rdev = md.rdev(); + Ok((linux_major(rdev), linux_minor(rdev))) +} + +/// Linux device number encoding (glibc-compatible). 
+#[inline] +fn linux_major(dev: u64) -> u32 { + (((dev >> 8) & 0xfff) | ((dev >> 32) & 0xfffff000)) as u32 +} + +/// Linux device number encoding (glibc-compatible). +#[inline] +fn linux_minor(dev: u64) -> u32 { + ((dev & 0xff) | ((dev >> 12) & 0xfffff00)) as u32 +} + +/// Extracts the final PCI BDF in a sysfs path string. +/// Handles nested bridge paths like: .../pci0000:00/0000:00:01.0/0000:01:00.0/vfio-dev/... +fn extract_last_pci_bdf(p: &Path) -> Result { + static RE: once_cell::sync::Lazy = once_cell::sync::Lazy::new(|| { + Regex::new(r"(?i)\b[0-9a-f]{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-7]\b").unwrap() + }); + + let s = p.to_string_lossy(); + RE.find_iter(&s) + .last() + .map(|m| m.as_str().to_owned()) + .ok_or_else(|| anyhow!("no PCI BDF found in path: {}", s)) +} + +/// Resolve iommu group id from `/sys/bus/pci/devices//iommu_group`. +fn iommu_group_id_for_bdf(bdf: &str) -> Result { + let iommu_link = PathBuf::from(format!("/sys/bus/pci/devices/{bdf}/iommu_group")); + let target = fs::read_link(&iommu_link).context("failed to read iommu_group symlink")?; + + target + .file_name() + .ok_or_else(|| anyhow!("link target {} invalid", target.display()))? 
+ .to_string_lossy() + .parse::() + .context("failed to parse group ID from filename") +} + +fn group_id_from_path(p: &Path) -> Result { + static RE: once_cell::sync::Lazy = + once_cell::sync::Lazy::new(|| Regex::new(r"/iommu_groups/(\d+)(/|$)").unwrap()); + + let s = p.to_string_lossy(); + let caps = RE + .captures(&s) + .ok_or_else(|| anyhow!("no iommu_groups component in path"))?; + + caps.get(1) + .unwrap() + .as_str() + .parse::() + .context("parse group id") +} + +#[allow(dead_code)] +pub fn is_dev_vfio_group_path(host_path: &str) -> bool { + let s = host_path.trim_end_matches('/'); + const PREFIX: &str = "/dev/vfio/"; + let rest = match s.strip_prefix(PREFIX) { + Some(r) => r, + None => return false, + }; + + // Valid if remainder is non-empty and contains only digits + !rest.is_empty() && rest.bytes().all(|b| b.is_ascii_digit()) +} diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/device.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/device.rs new file mode 100644 index 0000000000..e7db73679a --- /dev/null +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/device.rs @@ -0,0 +1,314 @@ +// Copyright (c) 2026 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::{Context, Result}; +use async_trait::async_trait; +use std::path::PathBuf; +use std::sync::Arc; +use tokio::sync::Mutex; + +use crate::device::pci_path::PciPath; +use crate::device::topology::{PCIePort, PCIeTopology}; +use crate::device::util::{do_decrease_count, do_increase_count}; +use crate::device::{Device, DeviceType, PCIeDevice}; +use crate::vfio_device::core::{discover_vfio_device, discover_vfio_group_device, VfioDevice}; +use crate::Hypervisor; + +/// Identifies a specific port on a PCI bus: (bus_name, bus_slot, port_id) +/// bus_name = rp +pub type BusPortId = (String, u32, u32); + +#[derive(Debug, Default, Clone)] +pub struct VfioDeviceBase { + /// Host device path, typically /dev/vfio/N (legacy) + pub 
host_path: String, + + /// Primary PCI Bus-Device-Function (BDF) address + pub host_bdf: String, + + /// All BDFs belonging to the same logical device or IOMMU group + pub host_bdfs: Vec, + + /// The bus and port ID to which the device is attached (e.g., ("pci.1", 2)) + pub bus_port_id: BusPortId, + + /// Specifies the PCIe port type (e.g., Root Port, Downstream Port) + pub port: PCIePort, + + /// Character device node for the IOMMU group (/dev/vfio/X) + pub iommu_group_devnode: PathBuf, + + /// Character device node for the specific VFIO device (/dev/vfio/devices/vfioX) + pub iommu_device_node: Option, + + /// The guest-side PCI path representing the device's BDF address in the VM + pub guest_pci_path: Option, + + /// Device classification: "block" or "char" + pub dev_type: String, + + /// Underlying bus architecture: "pci" or "ccw" + pub bus_type: String, + + /// Represents the device's path as it appears inside the VM guest, + /// independent of the host container's mount namespace. + /// format: Option<(device_index, path_name)> + pub virt_path: Option<(u64, String)>, + + /// Prefix used for host device identification. Examples: + /// - Physical Endpoint: "physical_nic_" + /// - Mediated Device: "vfio_mdev_" + /// - PCI Passthrough: "vfio_device_" + /// - VFIO Volume: "vfio_vol_" + /// - VFIO NVMe: "vfio_nvme_" + pub hostdev_prefix: String, +} + +#[derive(Debug, Default, Clone)] +pub struct VfioDeviceModern { + pub device_id: String, + pub device: VfioDevice, + pub config: VfioDeviceBase, + + /// Configuration options passed to the vfio-pci handler in kata-agent + pub device_options: Vec, + + /// Indicates if the host device has been allocated to a specific guest + pub is_allocated: bool, + + /// Reference count for active attachments + pub attach_count: u64, +} + +/// Path used for [`discover_vfio_group_device`] when `iommu_device_node` is unset. 
+/// CDI cold-plug often only fills `host_path`; `iommu_group_devnode` may still be empty until +/// device_manager copies `host_path` — treat those as the same node. +fn vfio_modern_group_discovery_path(base: &VfioDeviceBase) -> PathBuf { + if !base.iommu_group_devnode.as_os_str().is_empty() { + base.iommu_group_devnode.clone() + } else { + PathBuf::from(base.host_path.trim()) + } +} + +impl VfioDeviceModern { + pub fn new(device_id: String, base: &VfioDeviceBase) -> Result { + // For modern VFIO devices, we require the specific device cdev path to be provided in the configuration. + // This allows us to directly discover the device context without needing to resolve group devices. + // If the device node is not provided, we can optionally fallback to group device discovery, + // but this is less efficient and may not be supported in all environments. + let device = if let Some(ref node) = base.iommu_device_node { + if !node.as_os_str().is_empty() { + discover_vfio_device(node)? + } else { + discover_vfio_group_device(vfio_modern_group_discovery_path(base))? + } + } else { + discover_vfio_group_device(vfio_modern_group_discovery_path(base))? + }; + Ok(Self { + device_id, + device, + config: base.clone(), + device_options: Vec::new(), + is_allocated: false, + attach_count: 0, + }) + } +} + +/// Thread-safe handle for managing modern VFIO devices using asynchronous locking. +#[derive(Clone, Debug)] +pub struct VfioDeviceModernHandle { + pub inner: Arc>, +} + +impl VfioDeviceModernHandle { + pub fn new(device_id: String, base: &VfioDeviceBase) -> Result { + let vfio_device = VfioDeviceModern::new(device_id, base)?; + Ok(Self { + inner: Arc::new(Mutex::new(vfio_device)), + }) + } + + pub fn arc(&self) -> Arc> { + self.inner.clone() + } + + /// Scoped read access: Executes a closure within the device lock. 
+    pub async fn with<R>(&self, f: impl FnOnce(&VfioDeviceModern) -> R) -> R {
+        let guard = self.inner.lock().await;
+        f(&guard)
+    }
+
+    /// Scoped write access: Executes a mutating closure within the device lock.
+    pub async fn with_mut<R>(&self, f: impl FnOnce(&mut VfioDeviceModern) -> R) -> R {
+        let mut guard = self.inner.lock().await;
+        f(&mut guard)
+    }
+
+    /// Returns a copy of the device identifier.
+    pub async fn device_id(&self) -> String {
+        self.inner.lock().await.device_id.clone()
+    }
+
+    /// Returns a copy of the current device configuration.
+    pub async fn vfio_config(&self) -> VfioDeviceBase {
+        self.inner.lock().await.config.clone()
+    }
+
+    /// Returns a copy of the stored VFIO device description.
+    pub async fn vfio_device(&self) -> VfioDevice {
+        self.inner.lock().await.device.clone()
+    }
+
+    /// Returns the current attach reference count.
+    pub async fn attach_count(&self) -> u64 {
+        self.inner.lock().await.attach_count
+    }
+
+    /// Overwrites the `is_allocated` flag.
+    pub async fn set_allocated(&self, allocated: bool) {
+        self.inner.lock().await.is_allocated = allocated;
+    }
+
+    /// Replaces the stored configuration with `cfg`.
+    pub async fn update_config(&self, cfg: VfioDeviceBase) {
+        self.inner.lock().await.config = cfg;
+    }
+}
+
+#[async_trait]
+impl Device for VfioDeviceModernHandle {
+    /// Attaches the VFIO device to the hypervisor and registers it in the PCIe topology.
+    ///
+    /// When the device is already attached only the reference count is bumped.
+    ///
+    /// # Errors
+    ///
+    /// Fails when no PCIe topology is supplied, when the topology cannot
+    /// reserve a port, or when the hypervisor rejects the device. On every
+    /// failure the reference taken by this call is released again and the
+    /// original error is returned; rollback problems are only logged.
+    async fn attach(
+        &mut self,
+        pcie_topo: &mut Option<&mut PCIeTopology>,
+        h: &dyn Hypervisor,
+    ) -> Result<()> {
+        // Check if device is already attached
+        if self
+            .increase_attach_count()
+            .await
+            .context("failed to increase attach count")?
+        {
+            warn!(
+                sl!(),
+                "The device {:?} is already attached; multi-attach is not allowed.",
+                self.device_id().await
+            );
+            return Ok(());
+        }
+
+        // From here on the reference taken above must be released on every
+        // failure path; otherwise a later attach would be mistaken for a
+        // multi-attach and silently skipped.
+        let topo = match pcie_topo.as_deref_mut() {
+            Some(topo) => topo,
+            None => {
+                if let Err(rb) = self.decrease_attach_count().await {
+                    error!(sl!(), "failed to roll back attach count: {:?}", rb);
+                }
+                return Err(anyhow::anyhow!(
+                    "VFIO device requires a PCIe topology but none was provided"
+                ));
+            }
+        };
+
+        // Register the device in the virtual PCIe topology
+        if let Err(e) = self.register(topo).await {
+            if let Err(rb) = self.decrease_attach_count().await {
+                error!(sl!(), "failed to roll back attach count: {:?}", rb);
+            }
+            return Err(e);
+        }
+
+        // Request Hypervisor to perform the actual hardware passthrough
+        if let Err(e) = h.add_device(DeviceType::VfioModern(self.arc())).await {
+            error!(sl!(), "failed to attach vfio device: {:?}", e);
+
+            // Best-effort rollback: run both steps even if one fails, and keep
+            // the hypervisor error as the one reported to the caller.
+            if let Err(rb) = self.decrease_attach_count().await {
+                error!(sl!(), "failed to roll back attach count: {:?}", rb);
+            }
+            if let Err(rb) = self.unregister(topo).await {
+                error!(sl!(), "failed to roll back PCIe registration: {:?}", rb);
+            }
+            return Err(e);
+        }
+        info!(
+            sl!(),
+            "vfio device {:?} attached successfully",
+            self.device_id().await
+        );
+        Ok(())
+    }
+
+    /// Detaches the VFIO device from the hypervisor and releases topology resources.
+    ///
+    /// Returns `Ok(None)` while other attach references remain; otherwise the
+    /// index taken from `config.virt_path`, if one is set.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the reference count cannot be decreased, when the hypervisor
+    /// refuses to remove the device (the reference is restored first), or when
+    /// the topology release fails.
+    // NOTE(review): the generic arguments of the return type were lost in this
+    // copy of the patch; `Option<u64>` is assumed to match the `Device` trait.
+    async fn detach(
+        &mut self,
+        pcie_topo: &mut Option<&mut PCIeTopology>,
+        h: &dyn Hypervisor,
+    ) -> Result<Option<u64>> {
+        // Only proceed with detachment if reference count reaches zero
+        if self
+            .decrease_attach_count()
+            .await
+            .context("failed to decrease attach count")?
+        {
+            return Ok(None);
+        }
+
+        if let Err(e) = h
+            .remove_device(DeviceType::VfioModern(self.inner.clone()))
+            .await
+        {
+            // Rollback: restore the reference, but never let a rollback
+            // failure replace the hypervisor error.
+            if let Err(rb) = self.increase_attach_count().await {
+                error!(sl!(), "failed to restore attach count: {:?}", rb);
+            }
+            return Err(e);
+        }
+
+        // Retrieve device index if a virtual path exists
+        let virt = self.with(|d| d.config.virt_path.clone()).await;
+        let device_index = virt.map(|(idx, _)| idx);
+
+        // Unregister from PCIe topology
+        if let Some(topo) = pcie_topo {
+            self.unregister(topo).await?;
+        }
+
+        Ok(device_index)
+    }
+
+    /// No-op: nothing is updated for this device type.
+    async fn update(&mut self, _h: &dyn Hypervisor) -> Result<()> {
+        // Updates are typically not required for VFIO passthrough devices
+        Ok(())
+    }
+
+    /// Bumps the attach count under the device lock via `do_increase_count`.
+    async fn increase_attach_count(&mut self) -> Result<bool> {
+        let mut guard = self.inner.lock().await;
+        do_increase_count(&mut guard.attach_count)
+    }
+
+    /// Drops one attach reference under the device lock via `do_decrease_count`.
+    async fn decrease_attach_count(&mut self) -> Result<bool> {
+        let mut guard = self.inner.lock().await;
+        do_decrease_count(&mut guard.attach_count)
+    }
+
+    /// Wraps the shared device state in `DeviceType::VfioModern`.
+    async fn get_device_info(&self) -> DeviceType {
+        DeviceType::VfioModern(self.arc())
+    }
+}
+
+#[async_trait]
+impl PCIeDevice for VfioDeviceModernHandle {
+    /// Reserves a bus and port in the PCIe topology for this device.
+    ///
+    /// On success the reserved id is stored in `config.bus_port_id` and
+    /// `is_allocated` is set.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the topology reports an error or has no port to hand out.
+    async fn register(&mut self, topo: &mut PCIeTopology) -> Result<()> {
+        // Read both inputs under a single lock acquisition.
+        let (device_id, port_type) = self
+            .with(|d| (d.device_id.clone(), d.config.port))
+            .await;
+
+        // Reserve the bus based on the specified port type
+        let bus_port_id = topo
+            .reserve_bus_for_device(&device_id, port_type)?
+            .ok_or_else(|| anyhow::anyhow!("can not get bus port"))?;
+
+        self.with_mut(|d| {
+            d.config.bus_port_id = bus_port_id;
+            d.is_allocated = true;
+        })
+        .await;
+
+        Ok(())
+    }
+
+    /// Releases the reserved PCIe resources and resets attachment state.
+    async fn unregister(&mut self, topo: &mut PCIeTopology) -> Result<()> {
+        let device_id = self.device_id().await;
+        topo.release_bus_for_device(&device_id)?;
+
+        self.with_mut(|d| {
+            d.is_allocated = false;
+            d.config.bus_port_id.0.clear();
+            d.config.guest_pci_path = None;
+        })
+        .await;
+
+        Ok(())
+    }
+}
diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/mod.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/mod.rs
new file mode 100644
index 0000000000..0758e3d43d
--- /dev/null
+++ b/src/runtime-rs/crates/hypervisor/src/device/driver/vfio_device/mod.rs
@@ -0,0 +1,87 @@
+// Copyright (c) 2026 Ant Group
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+mod core;
+mod device;
+
+pub use core::{discover_vfio_group_device, VfioDevice};
+pub use device::VfioDeviceBase;
+pub use device::VfioDeviceModern;
+pub use device::VfioDeviceModernHandle;
+
+use std::fs;
+use std::path::Path;
+
+use anyhow::Result;
+
+const DEV_VFIO_CTL: &str = "/dev/vfio/vfio";
+const DEV_IOMMU: &str = "/dev/iommu";
+const DEV_VFIO_DEVICES: &str = "/dev/vfio/devices";
+
+/// VFIO backend selected for device passthrough on this host.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum VfioBackendChoice {
+    /// legacy VFIO group/container: /dev/vfio/vfio + /dev/vfio/<group>
+    LegacyGroup,
+    /// iommufd backend: /dev/iommu + /dev/vfio/devices/vfioX
+    Iommufd,
+}
+
+/// Snapshot of the VFIO-related device nodes found on the host.
+#[derive(Debug, Default, Clone)]
+pub struct VfioHostCaps {
+    /// `/dev/vfio/vfio` exists
+    pub has_vfio_ctl: bool,
+    /// `/dev/iommu` exists
+    pub has_iommufd: bool,
+    /// `/dev/vfio/devices` exists and contains at least one `vfio*` entry
+    pub has_vfio_cdev: bool,
+}
+
+/// Probes the host for the device nodes each VFIO backend relies on.
+///
+/// Never fails: a directory that cannot be read is reported as "no cdev".
+pub fn detect_vfio_host_caps() -> VfioHostCaps {
+    let has_vfio_ctl = Path::new(DEV_VFIO_CTL).exists();
+    let has_iommufd = Path::new(DEV_IOMMU).exists();
+
+    // Individual unreadable entries are skipped by `flatten()`.
+    let has_vfio_cdev = match fs::read_dir(DEV_VFIO_DEVICES) {
+        Ok(rd) => rd
+            .flatten()
+            .any(|e| e.file_name().to_string_lossy().starts_with("vfio")),
+        Err(_) => false,
+    };
+
+    VfioHostCaps {
+        has_vfio_ctl,
+        has_iommufd,
+        has_vfio_cdev,
+    }
+}
+
+/// Picks the VFIO backend to use given the detected host capabilities.
+///
+/// # Errors
+///
+/// Returns an error when neither the iommufd nor the legacy prerequisites
+/// are present.
+pub fn choose_vfio_backend(caps: &VfioHostCaps) -> Result<VfioBackendChoice> {
+    // Prefer iommufd when fully supported
+    if caps.has_iommufd && caps.has_vfio_cdev {
+        return Ok(VfioBackendChoice::Iommufd);
+    }
+
+    // Fallback to legacy VFIO container/group
+    if caps.has_vfio_ctl {
+        return Ok(VfioBackendChoice::LegacyGroup);
+    }
+
+    Err(anyhow::anyhow!(
+        "No usable VFIO backend: caps={:?}. Need (/dev/iommu + /dev/vfio/devices/vfio*) \
+         for iommufd, or /dev/vfio/vfio for legacy.",
+        caps
+    ))
+}
diff --git a/src/runtime-rs/crates/hypervisor/src/device/mod.rs b/src/runtime-rs/crates/hypervisor/src/device/mod.rs
index 12fb2948b8..bd819e02be 100644
--- a/src/runtime-rs/crates/hypervisor/src/device/mod.rs
+++ b/src/runtime-rs/crates/hypervisor/src/device/mod.rs
@@ -5,8 +5,12 @@
 //
 
 use std::fmt;
+use std::sync::Arc;
+
+use tokio::sync::Mutex;
 
 use crate::device::driver::vhost_user_blk::VhostUserBlkDevice;
+use crate::device::driver::vfio_device::VfioDeviceModern;
 use crate::{
     BlockConfig, BlockDevice, HybridVsockConfig, HybridVsockDevice, Hypervisor as hypervisor,
     NetworkConfig, NetworkDevice, PCIePortDevice, PortDeviceConfig, ProtectionDevice,
@@ -52,6 +56,7 @@ pub enum DeviceType {
     Vsock(VsockDevice),
     Protection(ProtectionDevice),
     PortDevice(PCIePortDevice),
+    VfioModern(Arc<Mutex<VfioDeviceModern>>),
 }
 
 impl fmt::Display for DeviceType {
diff --git a/src/runtime-rs/crates/hypervisor/src/device/topology.rs b/src/runtime-rs/crates/hypervisor/src/device/topology.rs
index f565169a46..6792226485 100644
--- a/src/runtime-rs/crates/hypervisor/src/device/topology.rs
+++ b/src/runtime-rs/crates/hypervisor/src/device/topology.rs
@@ -680,6 +680,26 @@ impl PCIeTopology {
         // No available node found
         None
     }
+
+    /// Reserve a PCIe bus/port for a cold-plugged device.
+    ///
+    /// Full implementation is added alongside cold-plug port management;
+    /// this stub satisfies the VfioDeviceModern driver's PCIeDevice trait impl.
+    pub fn reserve_bus_for_device(
+        &mut self,
+        _device_id: &str,
+        _port: PCIePort,
+    ) -> Result<Option<(String, u32)>> {
+        // NOTE(review): payload type inferred from `bus_port_id.0.clear()` in the driver; confirm.
+        Err(anyhow::anyhow!("PCIe cold-plug port reservation not yet wired"))
+    }
+
+    /// Release a previously reserved PCIe bus/port for a cold-plugged device.
+    ///
+    /// Stub: always returns an error until cold-plug port management is wired in.
+    pub fn release_bus_for_device(&mut self, _device_id: &str) -> Result<()> {
+        Err(anyhow::anyhow!("PCIe cold-plug port release not yet wired"))
+    }
 }
 
 // do_add_pcie_endpoint do add a device into PCIe topology with pcie endpoint