From ff84b5f8ca40580c881e28c1ef7d153d516d8d67 Mon Sep 17 00:00:00 2001 From: Alex Lyn Date: Sat, 4 Apr 2026 11:44:44 +0200 Subject: [PATCH] agent: Update VFIO device handling for GPU cold-plug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the in-guest agent's VFIO device handler to support the cold-plug flow. When the runtime cold-plugs a GPU before the VM boots, the agent needs to bind the device to the vfio-pci driver inside the guest and set up the correct /dev/vfio/ group nodes so the workload can access the GPU. This updates the device discovery logic to handle the PCI topology that QEMU presents for cold-plugged vfio-pci devices and ensures the IOMMU group is properly resolved from the guest's sysfs. Signed-off-by: Alex Lyn Signed-off-by: Fabiano FidĂȘncio --- src/agent/src/device/mod.rs | 162 +++++++++++++++++--- src/agent/src/device/vfio_device_handler.rs | 49 +++++- src/agent/src/rpc.rs | 4 +- 3 files changed, 189 insertions(+), 26 deletions(-) diff --git a/src/agent/src/device/mod.rs b/src/agent/src/device/mod.rs index e267280a06..cc278aff49 100644 --- a/src/agent/src/device/mod.rs +++ b/src/agent/src/device/mod.rs @@ -250,6 +250,21 @@ pub async fn add_devices( update_spec_devices(logger, spec, dev_updates) } +pub fn dump_nvidia_cdi_yaml(logger: &Logger) -> Result<()> { + let file_path = "/var/run/cdi/nvidia.yaml"; + let path = PathBuf::from(file_path); + + if !path.exists() { + error!(logger, "file does not exist: {}", file_path); + return Ok(()); + } + + let content = fs::read_to_string(path)?; + info!(logger, "===== cdi filepath at {:?} with content: ===== \n {:?}", file_path, content); + + Ok(()) +} + #[instrument] pub async fn handle_cdi_devices( logger: &Logger, @@ -308,9 +323,11 @@ pub async fn handle_cdi_devices( cdi_timeout.as_secs(), e ); + time::sleep(Duration::from_secs(1)).await; + return Ok(()); } } - time::sleep(Duration::from_secs(1)).await; + // time::sleep(Duration::from_secs(1)).await; } Err(anyhow!( "failed to inject devices after CDI timeout of {} seconds", @@ -561,6 +578,104 @@ fn update_spec_devices( Ok(()) } +fn parse_pci_bdf_name(name: &str) -> Option { + pci::Address::from_str(name).ok() +} + +fn bus_of_addr(addr: &pci::Address) -> Result { + // addr.to_string() format: "0000:01:00.0" + let s = addr.to_string(); + let mut parts = s.split(':'); + + let domain = parts + .next() + .ok_or_else(|| anyhow!("bad pci address {}", s))?; + let bus = parts + .next() + .ok_or_else(|| anyhow!("bad pci address {}", s))?; + + Ok(format!("{domain}:{bus}")) +} + +fn unique_bus_from_pci_addresses(addrs: &[pci::Address]) -> Result { + let mut buses = addrs.iter().map(bus_of_addr).collect::>>()?; + + buses.sort(); + buses.dedup(); + + match buses.len() { + 1 => Ok(buses[0].clone()), + 0 => Err(anyhow!("no downstream PCI devices found")), + _ => Err(anyhow!("multiple downstream buses found: {:?}", buses)), + } +} + +fn read_single_bus_from_pci_bus_dir(bridgebuspath: &PathBuf) -> Result { + let mut files = Vec::new(); + + for entry in fs::read_dir(bridgebuspath)? { + files.push(entry?); + } + + if files.len() != 1 { + return Err(anyhow!( + "expected exactly one PCI bus in {:?}, got {}", + bridgebuspath, + files.len() + )); + } + + files[0] + .file_name() + .into_string() + .map_err(|e| anyhow!("bad filename under {:?}: {:?}", bridgebuspath, e)) +} + +fn infer_bus_from_child_devices(devpath: &PathBuf) -> Result { + let mut child_pci_addrs = Vec::new(); + + for entry in fs::read_dir(devpath)? { + let entry = entry?; + let file_type = entry.file_type()?; + + if !file_type.is_dir() { + continue; + } + + let name = entry.file_name(); + let name = name + .to_str() + .ok_or_else(|| anyhow!("non-utf8 filename under {:?}: {:?}", devpath, name))?; + + if let Some(addr) = parse_pci_bdf_name(name) { + child_pci_addrs.push(addr); + } + } + + unique_bus_from_pci_addresses(&child_pci_addrs).with_context(|| { + format!( + "failed to infer downstream bus from child PCI devices under {:?}", + devpath + ) + }) +} + +fn get_next_bus_from_bridge(devpath: &PathBuf) -> Result { + let bridgebuspath = devpath.join("pci_bus"); + + if bridgebuspath.exists() { + return read_single_bus_from_pci_bus_dir(&bridgebuspath) + .with_context(|| format!("failed to read downstream bus from {:?}", bridgebuspath)); + } + + infer_bus_from_child_devices(devpath).with_context(|| { + format!( + "bridge {:?} has no pci_bus directory; fallback to child device scan failed", + devpath + ) + }) +} + // pcipath_to_sysfs fetches the sysfs path for a PCI path, relative to // the sysfs path for the PCI host bridge, based on the PCI path // provided. @@ -569,6 +684,10 @@ pub fn pcipath_to_sysfs(root_bus_sysfs: &str, pcipath: &pci::Path) -> Result Result = fs::read_dir(&bridgebuspath)?.collect(); + let devpath = PathBuf::from(root_bus_sysfs).join(relpath.trim_start_matches('/')); - match files.pop() { - Some(busfile) if files.is_empty() => { - bus = busfile? - .file_name() - .into_string() - .map_err(|e| anyhow!("Bad filename under {}: {:?}", &bridgebuspath, e))?; - } - _ => { - return Err(anyhow!( - "Expected exactly one PCI bus in {}, got {} instead", - bridgebuspath, - // Adjust to original value as we've already popped - files.len() + 1 - )); - } - }; + bus = get_next_bus_from_bridge(&devpath).with_context(|| { + format!( + "failed to resolve next bus for PCI path element {} (device {}) under root {}", + i, bdf, root_bus_sysfs + ) + })?; } Ok(relpath) @@ -1150,6 +1257,21 @@ mod tests { assert_eq!(relpath.unwrap(), "/0000:00:02.0/0000:01:03.0/0000:02:04.0"); } + #[test] + fn test_pcipath_to_sysfs_fallback_child_device_scan() { + let testdir = tempdir().expect("failed to create tmpdir"); + let rootbuspath = testdir.path().to_str().unwrap(); + + let path23 = pci::Path::from_str("02/03").unwrap(); + let bridge2path = format!("{}{}", rootbuspath, "/0000:00:02.0"); + let child_device_path = format!("{bridge2path}/0000:01:03.0"); + + fs::create_dir_all(child_device_path).unwrap(); + + let relpath = pcipath_to_sysfs(rootbuspath, &path23); + assert_eq!(relpath.unwrap(), "/0000:00:02.0/0000:01:03.0"); + } + // We use device specific variants of this for real cases, but // they have some complications that make them troublesome to unit // test diff --git a/src/agent/src/device/vfio_device_handler.rs b/src/agent/src/device/vfio_device_handler.rs index 01342865b1..6ed098c4ca 100644 --- a/src/agent/src/device/vfio_device_handler.rs +++ b/src/agent/src/device/vfio_device_handler.rs @@ -69,7 +69,8 @@ impl DeviceHandler for VfioPciDeviceHandler { let (root_complex, pcipath) = pcipath_from_dev_tree_path(pcipath)?; - let guestdev = wait_for_pci_device(ctx.sandbox, root_complex, &pcipath).await?; + let guestdev = + wait_for_pci_device(ctx.logger, ctx.sandbox, root_complex, &pcipath).await?; if vfio_in_guest { pci_driver_override(ctx.logger, SYSFS_BUS_PCI_PATH, guestdev, "vfio-pci")?; @@ -301,24 +302,62 @@ async fn associate_ap_device(apqn: &Apqn, mkvp: &str) -> Result<()> { Ok(apqn.set_associate_state(AssocState::Associated(secret_idx))?) } +fn pci_addr_from_sysfs_path(sysfs_abs: &Path) -> Result { + // sysfs_abs like: /sys/devices/pci0000:00/0000:00:06.0/0000:02:00.0 + let name = sysfs_abs + .file_name() + .ok_or_else(|| anyhow!("bad sysfs path (no file_name): {:?}", sysfs_abs))? + .to_str() + .ok_or_else(|| anyhow!("bad sysfs path (non-utf8): {:?}", sysfs_abs))?; + + pci::Address::from_str(name) + .map_err(|e| anyhow!("failed to parse pci bdf from sysfs '{}': {e}", name)) +} + pub async fn wait_for_pci_device( + logger: &Logger, sandbox: &Arc>, root_complex: &str, pcipath: &pci::Path, ) -> Result { - let root_bus_sysfs = format!("{}{}", SYSFS_DIR, create_pci_root_bus_path(root_complex)); - let sysfs_rel_path = pcipath_to_sysfs(&root_bus_sysfs, pcipath)?; + info!(logger, "Xwait_for_pci_device at {}", pcipath); + let root_bus_rel = create_pci_root_bus_path(root_complex); // "/devices/pci0000:00" + let root_bus_sysfs = format!("{}{}", SYSFS_DIR, &root_bus_rel); // "/sys/devices/pci0000:00" + info!( + logger, + "wait_for_pci_device: root_bus_sysfs {} pcipath {}", &root_bus_sysfs, pcipath + ); + let sysfs_rel_path = pcipath_to_sysfs(&root_bus_sysfs, pcipath)?; // "/0000:00:06.0/0000:02:00.0" + + // "/sys/devices/pci0000:00/0000:00:06.0/0000:02:00.0" + let sysfs_abs = format!("{root_bus_sysfs}{sysfs_rel_path}"); + let sysfs_abs_path = std::path::PathBuf::from(&sysfs_abs); + + if tokio::fs::metadata(&sysfs_abs_path).await.is_ok() { + info!( + logger, + "wait_for_pci_device: PCI device {} already exists at {}", pcipath, sysfs_abs + ); + return pci_addr_from_sysfs_path(&sysfs_abs_path); + } else { + info!( + logger, + "wait_for_pci_device: Waiting uevent for PCI device {} at {}", pcipath, sysfs_abs + ); + } + let matcher = PciMatcher::new(&sysfs_rel_path, root_complex)?; let uev = wait_for_uevent(sandbox, matcher).await?; + // uev.devpath like "/devices/pci0000:00/0000:00:06.0/0000:02:00.0" let addr = uev .devpath .rsplit('/') .next() .ok_or_else(|| anyhow!("Bad device path {:?} in uevent", &uev.devpath))?; - let addr = pci::Address::from_str(addr)?; - Ok(addr) + + pci::Address::from_str(addr) } // Represents an IOMMU group diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index b85e948e49..2e5dfb907a 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -65,7 +65,7 @@ use crate::device::block_device_handler::get_virtio_blk_pci_device_name; use crate::device::network_device_handler::wait_for_ccw_net_interface; #[cfg(not(target_arch = "s390x"))] use crate::device::network_device_handler::wait_for_pci_net_interface; -use crate::device::{add_devices, handle_cdi_devices, update_env_pci}; +use crate::device::{add_devices, handle_cdi_devices, dump_nvidia_cdi_yaml, update_env_pci}; use crate::features::get_build_features; use crate::metrics::get_metrics; use crate::mount::baremount; @@ -244,6 +244,8 @@ impl AgentService { // or other entities for a specifc device. // In Kata we only consider the directory "/var/run/cdi", "/etc" may be // readonly + info!(sl(), "dump_nvidia_cdi_yaml at path: /var/run/cdi"); + dump_nvidia_cdi_yaml(&sl())?; handle_cdi_devices(&sl(), &mut oci, "/var/run/cdi", AGENT_CONFIG.cdi_timeout).await?; // Handle trusted storage configuration before mounting any storage