agent: Update VFIO device handling for GPU cold-plug

Extend the in-guest agent's VFIO device handler to support the cold-plug
flow. When the runtime cold-plugs a GPU before the VM boots, the agent
needs to bind the device to the vfio-pci driver inside the guest and
set up the correct /dev/vfio/ group nodes so the workload can access
the GPU.

This updates the device discovery logic to handle the PCI topology that
QEMU presents for cold-plugged vfio-pci devices and ensures the IOMMU
group is properly resolved from the guest's sysfs.

Signed-off-by: Alex Lyn <alex.lyn@antgroup.com>
Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
This commit is contained in:
Alex Lyn
2026-04-04 11:44:44 +02:00
committed by Fabiano Fidêncio
parent 504101d77a
commit ff84b5f8ca
3 changed files with 189 additions and 26 deletions

View File

@@ -250,6 +250,21 @@ pub async fn add_devices(
update_spec_devices(logger, spec, dev_updates)
}
pub fn dump_nvidia_cdi_yaml(logger: &Logger) -> Result<()> {
let file_path = "/var/run/cdi/nvidia.yaml";
let path = PathBuf::from(file_path);
if !path.exists() {
error!(logger, "file does not exist: {}", file_path);
return Ok(());
}
let content = fs::read_to_string(path)?;
info!(logger, "===== cdi filepath at {:?} with content: ===== \n {:?}", file_path, content);
Ok(())
}
#[instrument]
pub async fn handle_cdi_devices(
logger: &Logger,
@@ -308,9 +323,11 @@ pub async fn handle_cdi_devices(
cdi_timeout.as_secs(),
e
);
time::sleep(Duration::from_secs(1)).await;
return Ok(());
}
}
time::sleep(Duration::from_secs(1)).await;
// time::sleep(Duration::from_secs(1)).await;
}
Err(anyhow!(
"failed to inject devices after CDI timeout of {} seconds",
@@ -561,6 +578,104 @@ fn update_spec_devices(
Ok(())
}
fn parse_pci_bdf_name(name: &str) -> Option<pci::Address> {
pci::Address::from_str(name).ok()
}
fn bus_of_addr(addr: &pci::Address) -> Result<String> {
// addr.to_string() format: "0000:01:00.0"
let s = addr.to_string();
let mut parts = s.split(':');
let domain = parts
.next()
.ok_or_else(|| anyhow!("bad pci address {}", s))?;
let bus = parts
.next()
.ok_or_else(|| anyhow!("bad pci address {}", s))?;
Ok(format!("{domain}:{bus}"))
}
fn unique_bus_from_pci_addresses(addrs: &[pci::Address]) -> Result<String> {
let mut buses = addrs.iter().map(bus_of_addr).collect::<Result<Vec<_>>>()?;
buses.sort();
buses.dedup();
match buses.len() {
1 => Ok(buses[0].clone()),
0 => Err(anyhow!("no downstream PCI devices found")),
_ => Err(anyhow!("multiple downstream buses found: {:?}", buses)),
}
}
fn read_single_bus_from_pci_bus_dir(bridgebuspath: &PathBuf) -> Result<String> {
let mut files = Vec::new();
for entry in fs::read_dir(bridgebuspath)? {
files.push(entry?);
}
if files.len() != 1 {
return Err(anyhow!(
"expected exactly one PCI bus in {:?}, got {}",
bridgebuspath,
files.len()
));
}
files[0]
.file_name()
.into_string()
.map_err(|e| anyhow!("bad filename under {:?}: {:?}", bridgebuspath, e))
}
fn infer_bus_from_child_devices(devpath: &PathBuf) -> Result<String> {
let mut child_pci_addrs = Vec::new();
for entry in fs::read_dir(devpath)? {
let entry = entry?;
let file_type = entry.file_type()?;
if !file_type.is_dir() {
continue;
}
let name = entry.file_name();
let name = name
.to_str()
.ok_or_else(|| anyhow!("non-utf8 filename under {:?}: {:?}", devpath, name))?;
if let Some(addr) = parse_pci_bdf_name(name) {
child_pci_addrs.push(addr);
}
}
unique_bus_from_pci_addresses(&child_pci_addrs).with_context(|| {
format!(
"failed to infer downstream bus from child PCI devices under {:?}",
devpath
)
})
}
fn get_next_bus_from_bridge(devpath: &PathBuf) -> Result<String> {
let bridgebuspath = devpath.join("pci_bus");
if bridgebuspath.exists() {
return read_single_bus_from_pci_bus_dir(&bridgebuspath)
.with_context(|| format!("failed to read downstream bus from {:?}", bridgebuspath));
}
infer_bus_from_child_devices(devpath).with_context(|| {
format!(
"bridge {:?} has no pci_bus directory; fallback to child device scan failed",
devpath
)
})
}
// pcipath_to_sysfs fetches the sysfs path for a PCI path, relative to
// the sysfs path for the PCI host bridge, based on the PCI path
// provided.
@@ -569,6 +684,10 @@ pub fn pcipath_to_sysfs(root_bus_sysfs: &str, pcipath: &pci::Path) -> Result<Str
let mut bus = "0000:00".to_string();
let mut relpath = String::new();
if pcipath.is_empty() {
return Err(anyhow!("empty PCI path"));
}
for i in 0..pcipath.len() {
let bdf = format!("{}:{}", bus, pcipath[i]);
@@ -579,26 +698,14 @@ pub fn pcipath_to_sysfs(root_bus_sysfs: &str, pcipath: &pci::Path) -> Result<Str
break;
}
// Find out the bus exposed by bridge
let bridgebuspath = format!("{root_bus_sysfs}{relpath}/pci_bus");
let mut files: Vec<_> = fs::read_dir(&bridgebuspath)?.collect();
let devpath = PathBuf::from(root_bus_sysfs).join(relpath.trim_start_matches('/'));
match files.pop() {
Some(busfile) if files.is_empty() => {
bus = busfile?
.file_name()
.into_string()
.map_err(|e| anyhow!("Bad filename under {}: {:?}", &bridgebuspath, e))?;
}
_ => {
return Err(anyhow!(
"Expected exactly one PCI bus in {}, got {} instead",
bridgebuspath,
// Adjust to original value as we've already popped
files.len() + 1
));
}
};
bus = get_next_bus_from_bridge(&devpath).with_context(|| {
format!(
"failed to resolve next bus for PCI path element {} (device {}) under root {}",
i, bdf, root_bus_sysfs
)
})?;
}
Ok(relpath)
@@ -1150,6 +1257,21 @@ mod tests {
assert_eq!(relpath.unwrap(), "/0000:00:02.0/0000:01:03.0/0000:02:04.0");
}
#[test]
fn test_pcipath_to_sysfs_fallback_child_device_scan() {
let testdir = tempdir().expect("failed to create tmpdir");
let rootbuspath = testdir.path().to_str().unwrap();
let path23 = pci::Path::from_str("02/03").unwrap();
let bridge2path = format!("{}{}", rootbuspath, "/0000:00:02.0");
let child_device_path = format!("{bridge2path}/0000:01:03.0");
fs::create_dir_all(child_device_path).unwrap();
let relpath = pcipath_to_sysfs(rootbuspath, &path23);
assert_eq!(relpath.unwrap(), "/0000:00:02.0/0000:01:03.0");
}
// We use device specific variants of this for real cases, but
// they have some complications that make them troublesome to unit
// test

View File

@@ -69,7 +69,8 @@ impl DeviceHandler for VfioPciDeviceHandler {
let (root_complex, pcipath) = pcipath_from_dev_tree_path(pcipath)?;
let guestdev = wait_for_pci_device(ctx.sandbox, root_complex, &pcipath).await?;
let guestdev =
wait_for_pci_device(ctx.logger, ctx.sandbox, root_complex, &pcipath).await?;
if vfio_in_guest {
pci_driver_override(ctx.logger, SYSFS_BUS_PCI_PATH, guestdev, "vfio-pci")?;
@@ -301,24 +302,62 @@ async fn associate_ap_device(apqn: &Apqn, mkvp: &str) -> Result<()> {
Ok(apqn.set_associate_state(AssocState::Associated(secret_idx))?)
}
fn pci_addr_from_sysfs_path(sysfs_abs: &Path) -> Result<pci::Address> {
// sysfs_abs like: /sys/devices/pci0000:00/0000:00:06.0/0000:02:00.0
let name = sysfs_abs
.file_name()
.ok_or_else(|| anyhow!("bad sysfs path (no file_name): {:?}", sysfs_abs))?
.to_str()
.ok_or_else(|| anyhow!("bad sysfs path (non-utf8): {:?}", sysfs_abs))?;
pci::Address::from_str(name)
.map_err(|e| anyhow!("failed to parse pci bdf from sysfs '{}': {e}", name))
}
pub async fn wait_for_pci_device(
logger: &Logger,
sandbox: &Arc<Mutex<Sandbox>>,
root_complex: &str,
pcipath: &pci::Path,
) -> Result<pci::Address> {
let root_bus_sysfs = format!("{}{}", SYSFS_DIR, create_pci_root_bus_path(root_complex));
let sysfs_rel_path = pcipath_to_sysfs(&root_bus_sysfs, pcipath)?;
info!(logger, "Xwait_for_pci_device at {}", pcipath);
let root_bus_rel = create_pci_root_bus_path(root_complex); // "/devices/pci0000:00"
let root_bus_sysfs = format!("{}{}", SYSFS_DIR, &root_bus_rel); // "/sys/devices/pci0000:00"
info!(
logger,
"wait_for_pci_device: root_bus_sysfs {} pcipath {}", &root_bus_sysfs, pcipath
);
let sysfs_rel_path = pcipath_to_sysfs(&root_bus_sysfs, pcipath)?; // "/0000:00:06.0/0000:02:00.0"
// "/sys/devices/pci0000:00/0000:00:06.0/0000:02:00.0"
let sysfs_abs = format!("{root_bus_sysfs}{sysfs_rel_path}");
let sysfs_abs_path = std::path::PathBuf::from(&sysfs_abs);
if tokio::fs::metadata(&sysfs_abs_path).await.is_ok() {
info!(
logger,
"wait_for_pci_device: PCI device {} already exists at {}", pcipath, sysfs_abs
);
return pci_addr_from_sysfs_path(&sysfs_abs_path);
} else {
info!(
logger,
"wait_for_pci_device: Waiting uevent for PCI device {} at {}", pcipath, sysfs_abs
);
}
let matcher = PciMatcher::new(&sysfs_rel_path, root_complex)?;
let uev = wait_for_uevent(sandbox, matcher).await?;
// uev.devpath like "/devices/pci0000:00/0000:00:06.0/0000:02:00.0"
let addr = uev
.devpath
.rsplit('/')
.next()
.ok_or_else(|| anyhow!("Bad device path {:?} in uevent", &uev.devpath))?;
let addr = pci::Address::from_str(addr)?;
Ok(addr)
pci::Address::from_str(addr)
}
// Represents an IOMMU group

View File

@@ -65,7 +65,7 @@ use crate::device::block_device_handler::get_virtio_blk_pci_device_name;
use crate::device::network_device_handler::wait_for_ccw_net_interface;
#[cfg(not(target_arch = "s390x"))]
use crate::device::network_device_handler::wait_for_pci_net_interface;
use crate::device::{add_devices, handle_cdi_devices, update_env_pci};
use crate::device::{add_devices, handle_cdi_devices, dump_nvidia_cdi_yaml, update_env_pci};
use crate::features::get_build_features;
use crate::metrics::get_metrics;
use crate::mount::baremount;
@@ -244,6 +244,8 @@ impl AgentService {
// or other entities for a specifc device.
// In Kata we only consider the directory "/var/run/cdi", "/etc" may be
// readonly
info!(sl(), "dump_nvidia_cdi_yaml at path: /var/run/cdi");
dump_nvidia_cdi_yaml(&sl())?;
handle_cdi_devices(&sl(), &mut oci, "/var/run/cdi", AGENT_CONFIG.cdi_timeout).await?;
// Handle trusted storage configuration before mounting any storage