mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-04-10 14:02:59 +00:00
agent: Update VFIO device handling for GPU cold-plug
Extend the in-guest agent's VFIO device handler to support the cold-plug flow. When the runtime cold-plugs a GPU before the VM boots, the agent needs to bind the device to the vfio-pci driver inside the guest and set up the correct /dev/vfio/ group nodes so the workload can access the GPU. This updates the device discovery logic to handle the PCI topology that QEMU presents for cold-plugged vfio-pci devices and ensures the IOMMU group is properly resolved from the guest's sysfs. Signed-off-by: Alex Lyn <alex.lyn@antgroup.com> Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
This commit is contained in:
committed by
Fabiano Fidêncio
parent
504101d77a
commit
ff84b5f8ca
@@ -250,6 +250,21 @@ pub async fn add_devices(
|
||||
update_spec_devices(logger, spec, dev_updates)
|
||||
}
|
||||
|
||||
pub fn dump_nvidia_cdi_yaml(logger: &Logger) -> Result<()> {
|
||||
let file_path = "/var/run/cdi/nvidia.yaml";
|
||||
let path = PathBuf::from(file_path);
|
||||
|
||||
if !path.exists() {
|
||||
error!(logger, "file does not exist: {}", file_path);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let content = fs::read_to_string(path)?;
|
||||
info!(logger, "===== cdi filepath at {:?} with content: ===== \n {:?}", file_path, content);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument]
|
||||
pub async fn handle_cdi_devices(
|
||||
logger: &Logger,
|
||||
@@ -308,9 +323,11 @@ pub async fn handle_cdi_devices(
|
||||
cdi_timeout.as_secs(),
|
||||
e
|
||||
);
|
||||
time::sleep(Duration::from_secs(1)).await;
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
time::sleep(Duration::from_secs(1)).await;
|
||||
// time::sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
Err(anyhow!(
|
||||
"failed to inject devices after CDI timeout of {} seconds",
|
||||
@@ -561,6 +578,104 @@ fn update_spec_devices(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Interprets `name` as a PCI address, returning `None` when it does not
/// parse as one.
fn parse_pci_bdf_name(name: &str) -> Option<pci::Address> {
    match pci::Address::from_str(name) {
        Ok(addr) => Some(addr),
        // The parse error itself is of no interest to callers
        Err(_) => None,
    }
}
|
||||
|
||||
fn bus_of_addr(addr: &pci::Address) -> Result<String> {
|
||||
// addr.to_string() format: "0000:01:00.0"
|
||||
let s = addr.to_string();
|
||||
let mut parts = s.split(':');
|
||||
|
||||
let domain = parts
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("bad pci address {}", s))?;
|
||||
let bus = parts
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("bad pci address {}", s))?;
|
||||
|
||||
Ok(format!("{domain}:{bus}"))
|
||||
}
|
||||
|
||||
fn unique_bus_from_pci_addresses(addrs: &[pci::Address]) -> Result<String> {
|
||||
let mut buses = addrs.iter().map(bus_of_addr).collect::<Result<Vec<_>>>()?;
|
||||
|
||||
buses.sort();
|
||||
buses.dedup();
|
||||
|
||||
match buses.len() {
|
||||
1 => Ok(buses[0].clone()),
|
||||
0 => Err(anyhow!("no downstream PCI devices found")),
|
||||
_ => Err(anyhow!("multiple downstream buses found: {:?}", buses)),
|
||||
}
|
||||
}
|
||||
|
||||
fn read_single_bus_from_pci_bus_dir(bridgebuspath: &PathBuf) -> Result<String> {
|
||||
let mut files = Vec::new();
|
||||
|
||||
for entry in fs::read_dir(bridgebuspath)? {
|
||||
files.push(entry?);
|
||||
}
|
||||
|
||||
if files.len() != 1 {
|
||||
return Err(anyhow!(
|
||||
"expected exactly one PCI bus in {:?}, got {}",
|
||||
bridgebuspath,
|
||||
files.len()
|
||||
));
|
||||
}
|
||||
|
||||
files[0]
|
||||
.file_name()
|
||||
.into_string()
|
||||
.map_err(|e| anyhow!("bad filename under {:?}: {:?}", bridgebuspath, e))
|
||||
}
|
||||
|
||||
fn infer_bus_from_child_devices(devpath: &PathBuf) -> Result<String> {
|
||||
let mut child_pci_addrs = Vec::new();
|
||||
|
||||
for entry in fs::read_dir(devpath)? {
|
||||
let entry = entry?;
|
||||
let file_type = entry.file_type()?;
|
||||
|
||||
if !file_type.is_dir() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let name = entry.file_name();
|
||||
let name = name
|
||||
.to_str()
|
||||
.ok_or_else(|| anyhow!("non-utf8 filename under {:?}: {:?}", devpath, name))?;
|
||||
|
||||
if let Some(addr) = parse_pci_bdf_name(name) {
|
||||
child_pci_addrs.push(addr);
|
||||
}
|
||||
}
|
||||
|
||||
unique_bus_from_pci_addresses(&child_pci_addrs).with_context(|| {
|
||||
format!(
|
||||
"failed to infer downstream bus from child PCI devices under {:?}",
|
||||
devpath
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
fn get_next_bus_from_bridge(devpath: &PathBuf) -> Result<String> {
|
||||
let bridgebuspath = devpath.join("pci_bus");
|
||||
|
||||
if bridgebuspath.exists() {
|
||||
return read_single_bus_from_pci_bus_dir(&bridgebuspath)
|
||||
.with_context(|| format!("failed to read downstream bus from {:?}", bridgebuspath));
|
||||
}
|
||||
|
||||
infer_bus_from_child_devices(devpath).with_context(|| {
|
||||
format!(
|
||||
"bridge {:?} has no pci_bus directory; fallback to child device scan failed",
|
||||
devpath
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
// pcipath_to_sysfs fetches the sysfs path for a PCI path, relative to
|
||||
// the sysfs path for the PCI host bridge, based on the PCI path
|
||||
// provided.
|
||||
@@ -569,6 +684,10 @@ pub fn pcipath_to_sysfs(root_bus_sysfs: &str, pcipath: &pci::Path) -> Result<Str
|
||||
let mut bus = "0000:00".to_string();
|
||||
let mut relpath = String::new();
|
||||
|
||||
if pcipath.is_empty() {
|
||||
return Err(anyhow!("empty PCI path"));
|
||||
}
|
||||
|
||||
for i in 0..pcipath.len() {
|
||||
let bdf = format!("{}:{}", bus, pcipath[i]);
|
||||
|
||||
@@ -579,26 +698,14 @@ pub fn pcipath_to_sysfs(root_bus_sysfs: &str, pcipath: &pci::Path) -> Result<Str
|
||||
break;
|
||||
}
|
||||
|
||||
// Find out the bus exposed by bridge
|
||||
let bridgebuspath = format!("{root_bus_sysfs}{relpath}/pci_bus");
|
||||
let mut files: Vec<_> = fs::read_dir(&bridgebuspath)?.collect();
|
||||
let devpath = PathBuf::from(root_bus_sysfs).join(relpath.trim_start_matches('/'));
|
||||
|
||||
match files.pop() {
|
||||
Some(busfile) if files.is_empty() => {
|
||||
bus = busfile?
|
||||
.file_name()
|
||||
.into_string()
|
||||
.map_err(|e| anyhow!("Bad filename under {}: {:?}", &bridgebuspath, e))?;
|
||||
}
|
||||
_ => {
|
||||
return Err(anyhow!(
|
||||
"Expected exactly one PCI bus in {}, got {} instead",
|
||||
bridgebuspath,
|
||||
// Adjust to original value as we've already popped
|
||||
files.len() + 1
|
||||
));
|
||||
}
|
||||
};
|
||||
bus = get_next_bus_from_bridge(&devpath).with_context(|| {
|
||||
format!(
|
||||
"failed to resolve next bus for PCI path element {} (device {}) under root {}",
|
||||
i, bdf, root_bus_sysfs
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(relpath)
|
||||
@@ -1150,6 +1257,21 @@ mod tests {
|
||||
assert_eq!(relpath.unwrap(), "/0000:00:02.0/0000:01:03.0/0000:02:04.0");
|
||||
}
|
||||
|
||||
#[test]
fn test_pcipath_to_sysfs_fallback_child_device_scan() {
    let tmp = tempdir().expect("failed to create tmpdir");
    let root = tmp.path().to_str().unwrap();

    // The bridge at 0000:00:02.0 gets a child device directory but no
    // pci_bus directory, so the bus has to come from the child's name.
    fs::create_dir_all(format!("{root}/0000:00:02.0/0000:01:03.0")).unwrap();

    let pcipath = pci::Path::from_str("02/03").unwrap();
    let resolved = pcipath_to_sysfs(root, &pcipath).unwrap();
    assert_eq!(resolved, "/0000:00:02.0/0000:01:03.0");
}
|
||||
|
||||
// We use device specific variants of this for real cases, but
|
||||
// they have some complications that make them troublesome to unit
|
||||
// test
|
||||
|
||||
@@ -69,7 +69,8 @@ impl DeviceHandler for VfioPciDeviceHandler {
|
||||
|
||||
let (root_complex, pcipath) = pcipath_from_dev_tree_path(pcipath)?;
|
||||
|
||||
let guestdev = wait_for_pci_device(ctx.sandbox, root_complex, &pcipath).await?;
|
||||
let guestdev =
|
||||
wait_for_pci_device(ctx.logger, ctx.sandbox, root_complex, &pcipath).await?;
|
||||
if vfio_in_guest {
|
||||
pci_driver_override(ctx.logger, SYSFS_BUS_PCI_PATH, guestdev, "vfio-pci")?;
|
||||
|
||||
@@ -301,24 +302,62 @@ async fn associate_ap_device(apqn: &Apqn, mkvp: &str) -> Result<()> {
|
||||
Ok(apqn.set_associate_state(AssocState::Associated(secret_idx))?)
|
||||
}
|
||||
|
||||
fn pci_addr_from_sysfs_path(sysfs_abs: &Path) -> Result<pci::Address> {
|
||||
// sysfs_abs like: /sys/devices/pci0000:00/0000:00:06.0/0000:02:00.0
|
||||
let name = sysfs_abs
|
||||
.file_name()
|
||||
.ok_or_else(|| anyhow!("bad sysfs path (no file_name): {:?}", sysfs_abs))?
|
||||
.to_str()
|
||||
.ok_or_else(|| anyhow!("bad sysfs path (non-utf8): {:?}", sysfs_abs))?;
|
||||
|
||||
pci::Address::from_str(name)
|
||||
.map_err(|e| anyhow!("failed to parse pci bdf from sysfs '{}': {e}", name))
|
||||
}
|
||||
|
||||
pub async fn wait_for_pci_device(
|
||||
logger: &Logger,
|
||||
sandbox: &Arc<Mutex<Sandbox>>,
|
||||
root_complex: &str,
|
||||
pcipath: &pci::Path,
|
||||
) -> Result<pci::Address> {
|
||||
let root_bus_sysfs = format!("{}{}", SYSFS_DIR, create_pci_root_bus_path(root_complex));
|
||||
let sysfs_rel_path = pcipath_to_sysfs(&root_bus_sysfs, pcipath)?;
|
||||
info!(logger, "Xwait_for_pci_device at {}", pcipath);
|
||||
let root_bus_rel = create_pci_root_bus_path(root_complex); // "/devices/pci0000:00"
|
||||
let root_bus_sysfs = format!("{}{}", SYSFS_DIR, &root_bus_rel); // "/sys/devices/pci0000:00"
|
||||
info!(
|
||||
logger,
|
||||
"wait_for_pci_device: root_bus_sysfs {} pcipath {}", &root_bus_sysfs, pcipath
|
||||
);
|
||||
let sysfs_rel_path = pcipath_to_sysfs(&root_bus_sysfs, pcipath)?; // "/0000:00:06.0/0000:02:00.0"
|
||||
|
||||
// "/sys/devices/pci0000:00/0000:00:06.0/0000:02:00.0"
|
||||
let sysfs_abs = format!("{root_bus_sysfs}{sysfs_rel_path}");
|
||||
let sysfs_abs_path = std::path::PathBuf::from(&sysfs_abs);
|
||||
|
||||
if tokio::fs::metadata(&sysfs_abs_path).await.is_ok() {
|
||||
info!(
|
||||
logger,
|
||||
"wait_for_pci_device: PCI device {} already exists at {}", pcipath, sysfs_abs
|
||||
);
|
||||
return pci_addr_from_sysfs_path(&sysfs_abs_path);
|
||||
} else {
|
||||
info!(
|
||||
logger,
|
||||
"wait_for_pci_device: Waiting uevent for PCI device {} at {}", pcipath, sysfs_abs
|
||||
);
|
||||
}
|
||||
|
||||
let matcher = PciMatcher::new(&sysfs_rel_path, root_complex)?;
|
||||
|
||||
let uev = wait_for_uevent(sandbox, matcher).await?;
|
||||
|
||||
// uev.devpath like "/devices/pci0000:00/0000:00:06.0/0000:02:00.0"
|
||||
let addr = uev
|
||||
.devpath
|
||||
.rsplit('/')
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("Bad device path {:?} in uevent", &uev.devpath))?;
|
||||
let addr = pci::Address::from_str(addr)?;
|
||||
Ok(addr)
|
||||
|
||||
pci::Address::from_str(addr)
|
||||
}
|
||||
|
||||
// Represents an IOMMU group
|
||||
|
||||
@@ -65,7 +65,7 @@ use crate::device::block_device_handler::get_virtio_blk_pci_device_name;
|
||||
use crate::device::network_device_handler::wait_for_ccw_net_interface;
|
||||
#[cfg(not(target_arch = "s390x"))]
|
||||
use crate::device::network_device_handler::wait_for_pci_net_interface;
|
||||
use crate::device::{add_devices, handle_cdi_devices, update_env_pci};
|
||||
use crate::device::{add_devices, handle_cdi_devices, dump_nvidia_cdi_yaml, update_env_pci};
|
||||
use crate::features::get_build_features;
|
||||
use crate::metrics::get_metrics;
|
||||
use crate::mount::baremount;
|
||||
@@ -244,6 +244,8 @@ impl AgentService {
|
||||
// or other entities for a specific device.
|
||||
// In Kata we only consider the directory "/var/run/cdi", "/etc" may be
|
||||
// readonly
|
||||
info!(sl(), "dump_nvidia_cdi_yaml at path: /var/run/cdi");
|
||||
dump_nvidia_cdi_yaml(&sl())?;
|
||||
handle_cdi_devices(&sl(), &mut oci, "/var/run/cdi", AGENT_CONFIG.cdi_timeout).await?;
|
||||
|
||||
// Handle trusted storage configuration before mounting any storage
|
||||
|
||||
Reference in New Issue
Block a user