mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-07-01 22:50:54 +00:00
runtime-rs: resolve cold-plug VFIO guest PCI path via QMP
The PCIe topology pre-computes a wrong path for cold-plugged physical-
endpoint VFs because the root port has no explicit addr and QEMU auto-
assigns its slot. The pre-computed PciPath { slots: [PciSlot(0)] }
resolves to 0000:00:00.0 (the Q35 MCH), causing
wait_for_pci_net_interface to time out looking for a netdev there.
Add resolve_vfio_device_pci_path(hostdev_id) to the Hypervisor trait.
Implement it in QemuInner using qmp.get_device_by_qdev_id(), which
issues the QMP query-pci command and searches the result by QEMU device
ID to find the full guest PCIe path (e.g. "05/00" = slot 5 on pcie.0 /
slot 0 on the root port bus).
Store the QEMU device ID (hostdev_id) in PhysicalEndpoint during
attach(). Add vfio_hostdev_id() and set_guest_pci_path() to the
Endpoint trait and add an endpoints() accessor to the Network trait.
In setup_after_start_vm(), call resolve_physical_endpoint_pci_paths()
before apply_network_to_agent() to populate the correct path from QMP
into each PhysicalEndpoint's guest_pci_path field. The field is then
consumed by network_with_netns::interfaces() to fill Interface.device_path
before update_interface is sent to the agent.
This is the runtime-rs counterpart of the Go runtime's
ResolveColdPlugVFIOGuestPciPaths / qomGetPciPath.
Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
Assisted-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -13,6 +13,7 @@ pub mod device;
|
||||
pub mod hypervisor_persist;
|
||||
pub use device::driver::*;
|
||||
use device::DeviceType;
|
||||
pub use device::pci_path::PciPath;
|
||||
#[cfg(all(
|
||||
feature = "dragonball",
|
||||
any(target_arch = "x86_64", target_arch = "aarch64")
|
||||
@@ -161,4 +162,15 @@ pub trait Hypervisor: std::fmt::Debug + Send + Sync {
|
||||
async fn set_guest_memory_block_size(&self, size: u32);
|
||||
async fn guest_memory_block_size(&self) -> u32;
|
||||
async fn get_passfd_listener_addr(&self) -> Result<(String, u32)>;
|
||||
|
||||
/// Resolve the in-guest PCIe path for a cold-plugged physical-endpoint VF
|
||||
/// by querying QMP (query-pci + device search by QEMU device ID).
|
||||
/// Only meaningful after the VM has started and QMP is initialised.
|
||||
/// Default: Err (non-QEMU hypervisors do not support this).
|
||||
async fn resolve_vfio_device_pci_path(&self, hostdev_id: &str) -> Result<PciPath> {
|
||||
Err(anyhow::anyhow!(
|
||||
"resolve_vfio_device_pci_path not supported for this hypervisor (device: {})",
|
||||
hostdev_id
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1154,6 +1154,18 @@ impl QemuInner {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Resolve the in-guest PCIe path for a cold-plugged physical-endpoint VF
|
||||
/// via QMP query-pci. Must be called after the VM has started and QMP is
|
||||
/// initialised. This is the runtime-rs pair of the Go runtime's
|
||||
/// `ResolveColdPlugVFIOGuestPciPaths` / `qomGetPciPath` call.
|
||||
pub(crate) fn resolve_vfio_device_pci_path(&mut self, hostdev_id: &str) -> Result<PciPath> {
|
||||
let qmp = self
|
||||
.qmp
|
||||
.as_mut()
|
||||
.ok_or_else(|| anyhow!("QMP not initialised; cannot resolve PCI path for {}", hostdev_id))?;
|
||||
qmp.get_device_by_qdev_id(hostdev_id)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
|
||||
@@ -7,6 +7,7 @@ mod cmdline_generator;
|
||||
mod inner;
|
||||
mod qmp;
|
||||
|
||||
use crate::device::pci_path::PciPath;
|
||||
use crate::device::DeviceType;
|
||||
use crate::hypervisor_persist::HypervisorState;
|
||||
use crate::{Hypervisor, MemoryConfig};
|
||||
@@ -212,6 +213,13 @@ impl Hypervisor for Qemu {
|
||||
async fn get_passfd_listener_addr(&self) -> Result<(String, u32)> {
|
||||
Err(anyhow::anyhow!("Not yet supported"))
|
||||
}
|
||||
|
||||
/// Resolve the guest PCI path for `hostdev_id` by forwarding to the inner
/// QEMU state.
///
/// The write lock is taken because the inner method needs `&mut self`.
async fn resolve_vfio_device_pci_path(&self, hostdev_id: &str) -> Result<PciPath> {
    let mut inner = self.inner.write().await;
    inner.resolve_vfio_device_pci_path(hostdev_id)
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
|
||||
@@ -345,6 +345,17 @@ impl ResourceManagerInner {
|
||||
}
|
||||
|
||||
if let Some(network) = self.network.as_ref() {
|
||||
// For cold-plugged physical-endpoint VFs, the PCIe topology
|
||||
// pre-computes a wrong path because the root port has no explicit
|
||||
// addr and QEMU auto-assigns its slot. Resolve the actual path
|
||||
// via QMP (query-pci + device search) before sending
|
||||
// update_interface to the agent.
|
||||
resolve_physical_endpoint_pci_paths(
|
||||
network.as_ref(),
|
||||
self.hypervisor.as_ref(),
|
||||
)
|
||||
.await;
|
||||
|
||||
self.apply_network_to_agent(network.as_ref()).await?;
|
||||
}
|
||||
|
||||
@@ -899,3 +910,41 @@ impl Persist for ResourceManagerInner {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// For each physical-endpoint VF in the network, resolve the actual in-guest
|
||||
/// PCIe path via QMP (query-pci) and update the endpoint's `guest_pci_path`.
|
||||
///
|
||||
/// This must be called after the VM has started (QMP is initialised) and
|
||||
/// before `apply_network_to_agent`, because the PCIe topology pre-computes
|
||||
/// a wrong path (root port has no explicit addr → QEMU auto-assigns its slot;
|
||||
/// only QMP can reveal the actual assignment).
|
||||
async fn resolve_physical_endpoint_pci_paths(
|
||||
network: &dyn crate::network::Network,
|
||||
hypervisor: &dyn hypervisor::Hypervisor,
|
||||
) {
|
||||
for endpoint in network.endpoints().await {
|
||||
if let Some(hostdev_id) = endpoint.vfio_hostdev_id().await {
|
||||
match hypervisor.resolve_vfio_device_pci_path(&hostdev_id).await {
|
||||
Ok(pci_path) => {
|
||||
let path_str = pci_path.to_string();
|
||||
info!(
|
||||
sl!(),
|
||||
"resolved physical endpoint guest PCI path: \
|
||||
hostdev_id={} path={}",
|
||||
hostdev_id,
|
||||
path_str
|
||||
);
|
||||
endpoint.set_guest_pci_path(path_str).await;
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(
|
||||
sl!(),
|
||||
"failed to resolve guest PCI path for hostdev {}: {}",
|
||||
hostdev_id,
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -42,4 +42,12 @@ pub trait Endpoint: std::fmt::Debug + Send + Sync {
|
||||
async fn guest_pci_path(&self) -> Option<String> {
|
||||
None
|
||||
}
|
||||
/// Returns the QEMU device ID for the cold-plugged VF (e.g.
|
||||
/// `"physical_nic__346_0"`), or `None` for non-physical endpoints.
|
||||
/// Used to resolve the actual guest PCI path via QMP after VM start.
|
||||
async fn vfio_hostdev_id(&self) -> Option<String> {
|
||||
None
|
||||
}
|
||||
/// Update the guest PCI path (called after QMP resolution).
|
||||
async fn set_guest_pci_path(&self, _path: String) {}
|
||||
}
|
||||
|
||||
@@ -62,10 +62,14 @@ pub struct PhysicalEndpoint {
|
||||
driver: String,
|
||||
vendor_device_id: VendorDevice,
|
||||
d: Arc<RwLock<DeviceManager>>,
|
||||
/// Guest PCI path computed by do_add_pcie_endpoint() at attach() time.
|
||||
/// Populated after attach() succeeds; used to set device_path in the
|
||||
/// agent's update_interface request for IB/RoCE GID table population.
|
||||
/// Guest PCI path — populated after QMP resolution in setup_after_start_vm.
|
||||
/// The pre-computed topology path from attach() is WRONG for physical
|
||||
/// endpoints because the root port has no explicit addr; the correct path
|
||||
/// requires QMP query-pci after VM boots.
|
||||
guest_pci_path: std::sync::Mutex<Option<String>>,
|
||||
/// QEMU device ID for the cold-plugged VF (e.g. "physical_nic__346_0").
|
||||
/// Stored during attach() for use in QMP-based path resolution.
|
||||
hostdev_id: std::sync::Mutex<Option<String>>,
|
||||
}
|
||||
|
||||
impl PhysicalEndpoint {
|
||||
@@ -98,6 +102,7 @@ impl PhysicalEndpoint {
|
||||
bdf,
|
||||
d,
|
||||
guest_pci_path: std::sync::Mutex::new(None),
|
||||
hostdev_id: std::sync::Mutex::new(None),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -163,14 +168,14 @@ impl Endpoint for PhysicalEndpoint {
|
||||
.await
|
||||
.context("do handle device failed.")?;
|
||||
|
||||
// Extract and cache the guest PCI path so guest_pci_path() can
|
||||
// expose it to handle_interfaces() for device_path in update_interface.
|
||||
// Store the QEMU hostdev_id for later QMP-based PCI path resolution.
|
||||
// The topology-computed guest_pci_path from do_add_pcie_endpoint() is
|
||||
// WRONG for physical endpoints (root port has no explicit addr so QEMU
|
||||
// auto-assigns its slot; the correct path requires QMP after VM boot).
|
||||
if let hypervisor::device::DeviceType::Vfio(vfio_dev) = device_type {
|
||||
if let Some(hostdev) = vfio_dev.devices.first() {
|
||||
if let Some(pci_path) = &hostdev.guest_pci_path {
|
||||
if let Ok(mut guard) = self.guest_pci_path.lock() {
|
||||
*guard = Some(pci_path.to_string());
|
||||
}
|
||||
if let Ok(mut guard) = self.hostdev_id.lock() {
|
||||
*guard = Some(hostdev.hostdev_id.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -217,6 +222,16 @@ impl Endpoint for PhysicalEndpoint {
|
||||
async fn guest_pci_path(&self) -> Option<String> {
|
||||
self.guest_pci_path.lock().ok()?.clone()
|
||||
}
|
||||
|
||||
async fn vfio_hostdev_id(&self) -> Option<String> {
|
||||
self.hostdev_id.lock().ok()?.clone()
|
||||
}
|
||||
|
||||
async fn set_guest_pci_path(&self, path: String) {
|
||||
if let Ok(mut guard) = self.guest_pci_path.lock() {
|
||||
*guard = Some(path);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
@@ -44,6 +44,11 @@ pub trait Network: Send + Sync {
|
||||
async fn neighs(&self) -> Result<Vec<agent::ARPNeighbor>>;
|
||||
async fn save(&self) -> Option<Vec<EndpointState>>;
|
||||
async fn remove(&self, h: &dyn Hypervisor) -> Result<()>;
|
||||
/// Returns the list of network endpoints. Used to resolve PCI paths
|
||||
/// via QMP before sending update_interface to the agent.
|
||||
async fn endpoints(&self) -> Vec<std::sync::Arc<dyn endpoint::Endpoint>> {
|
||||
vec![]
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn new(
|
||||
|
||||
@@ -168,6 +168,15 @@ impl Network for NetworkWithNetns {
|
||||
fs::remove_dir_all(inner.netns_path.clone()).context("failed to remove netns path")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Collect a shared handle to the endpoint of every entity in
/// `entity_list`, in list order.
///
/// The inner state is only read-locked while the handles are cloned.
async fn endpoints(&self) -> Vec<std::sync::Arc<dyn crate::network::endpoint::Endpoint>> {
    let inner = self.inner.read().await;
    let mut endpoints = Vec::with_capacity(inner.entity_list.len());
    for entity in inner.entity_list.iter() {
        endpoints.push(entity.endpoint.clone());
    }
    endpoints
}
|
||||
}
|
||||
|
||||
/// Lightweight probe: enter the netns and check whether any non-loopback
|
||||
|
||||
Reference in New Issue
Block a user