Merge pull request #13153 from BbolroC/vfio-ap-passthrough-coldplug-runtime-rs

runtime-rs: VFIO-AP cold-plug support on s390x
This commit is contained in:
Hyounggyu Choi
2026-06-27 15:35:32 +02:00
committed by GitHub
13 changed files with 544 additions and 114 deletions

View File

@@ -381,6 +381,12 @@ disable_image_nvdimm = false
# Default false
hotplug_vfio_on_root_bus = false
# Enable cold-plugging of VFIO devices to a PCIe port type.
# Accepted values: "no-port" (default, disabled), "root-port".
# When set to "root-port", devices discovered via CDI / Pod Resources
# are cold-plugged before VM boot.
cold_plug_vfio = "no-port"
# Before hot plugging a PCIe device, you need to add a pcie_root_port device.
# Use this parameter when using devices with large PCI BARs, such as NVIDIA GPUs.
# The value is the number of pcie_root_port devices.

View File

@@ -365,6 +365,12 @@ disable_image_nvdimm = true
# Default false
hotplug_vfio_on_root_bus = false
# Enable cold-plugging of VFIO devices to a PCIe port type.
# Accepted values: "no-port" (disabled), "root-port".
# When set to "root-port", devices discovered via CDI / Pod Resources
# are cold-plugged before VM boot.
cold_plug_vfio = "root-port"
# Before hot plugging a PCIe device, you need to add a pcie_root_port device.
# Use this parameter when using devices with large PCI BARs, such as NVIDIA GPUs.
# The value is the number of pcie_root_port devices.

View File

@@ -18,12 +18,13 @@ use kata_types::config::hypervisor::{
use tokio::sync::{Mutex, RwLock};
use crate::{
vfio_device::VfioDeviceModernHandle, vhost_user_blk::VhostUserBlkDevice, BlockConfig,
BlockConfigModern, BlockDevice, BlockDeviceModernHandle, HybridVsockDevice, Hypervisor,
NetworkDevice, PCIePortDevice, ProtectionDevice, ShareFsDevice, VfioDevice, VhostUserConfig,
VhostUserNetDevice, VsockDevice, KATA_BLK_DEV_TYPE, KATA_CCW_DEV_TYPE, KATA_MMIO_BLK_DEV_TYPE,
KATA_NVDIMM_DEV_TYPE, KATA_SCSI_DEV_TYPE, VIRTIO_BLOCK_CCW, VIRTIO_BLOCK_MMIO,
VIRTIO_BLOCK_PCI, VIRTIO_PMEM,
vfio_device::{VfioDeviceModernHandle, VfioDeviceType},
vhost_user_blk::VhostUserBlkDevice,
BlockConfig, BlockConfigModern, BlockDevice, BlockDeviceModernHandle, HybridVsockDevice,
Hypervisor, NetworkDevice, PCIePortDevice, ProtectionDevice, ShareFsDevice, VfioDevice,
VhostUserConfig, VhostUserNetDevice, VsockDevice, KATA_BLK_DEV_TYPE, KATA_CCW_DEV_TYPE,
KATA_MMIO_BLK_DEV_TYPE, KATA_NVDIMM_DEV_TYPE, KATA_SCSI_DEV_TYPE, VIRTIO_BLOCK_CCW,
VIRTIO_BLOCK_MMIO, VIRTIO_BLOCK_PCI, VIRTIO_PMEM,
};
use super::{
@@ -724,6 +725,36 @@ pub async fn get_shared_fs_info(d: &RwLock<DeviceManager>) -> SharedFsInfo {
d.read().await.get_shared_fs_info().await
}
/// Returns the APQN list of the cold-plugged VFIO-AP device whose
/// `iommu_group_devnode` equals `host_path`, or `None` when the device manager
/// holds no such device.
///
/// Used by `handler_devices` to bypass `do_handle_device` for VFIO-AP devices
/// that were cold-plugged before VM boot. VFIO-AP devices have no PCIe BDF so
/// the BDF-keyed `cold_plug_bdfs` map cannot catch them; this lookup fills that
/// gap without touching reference counts or the QMP hot-plug path.
///
/// # Arguments
/// * `d` - the shared device manager to search.
/// * `host_path` - host VFIO group node to match against each device's
///   `config.iommu_group_devnode`.
///
/// # Returns
/// A clone of the matching device's `config.ap_devices`, or `None` if no
/// `VfioModern` device of type `MediatedAp` has that group node.
pub async fn find_cold_plugged_vfio_ap(
    d: &RwLock<DeviceManager>,
    host_path: &str,
) -> Option<Vec<String>> {
    // Snapshot the device handles so the DeviceManager read-lock is released
    // before any per-device `.await` below.
    let devices: Vec<ArcMutexDevice> = {
        let dm = d.read().await;
        dm.devices.values().cloned().collect()
    };
    let wanted = Path::new(host_path);

    for dev in devices {
        // Bind the device info to a local: the per-device guard is a temporary of
        // this statement and is dropped at the `;`, so it is no longer held while
        // the inner VFIO lock is awaited.
        let info = dev.lock().await.get_device_info().await;
        if let DeviceType::VfioModern(inner) = info {
            let guard = inner.lock().await;
            if guard.device.device_type == VfioDeviceType::MediatedAp
                && guard.config.iommu_group_devnode == wanted
            {
                return Some(guard.config.ap_devices.clone());
            }
        }
    }
    None
}
#[cfg(test)]
mod tests {
use super::DeviceManager;

View File

@@ -20,10 +20,10 @@ mod virtio_vsock;
pub use port_device::{PCIePortDevice, PortDeviceConfig};
pub use protection_device::{ProtectionDevice, ProtectionDeviceConfig, SevSnpConfig, TdxConfig};
pub use vfio::{
bind_device_to_host, bind_device_to_vfio, get_vfio_device, HostDevice, VfioBusMode, VfioConfig,
VfioDevice,
bind_device_to_host, bind_device_to_vfio, get_vfio_device, HostDevice, VfioBusMode,
VfioConfig, VfioDevice, VfioDeviceType,
};
pub use vfio_device::{VfioDeviceBase, VfioDeviceModern, VfioDeviceModernHandle};
pub use vfio_device::{is_vfio_ap_device, VfioDeviceBase, VfioDeviceModern, VfioDeviceModernHandle};
pub use vhost_user::{VhostUserConfig, VhostUserDevice, VhostUserType};
pub use vhost_user_net::VhostUserNetDevice;
pub use virtio_blk::{

View File

@@ -19,6 +19,7 @@ const SYS_PCI_DEVS: &str = "/sys/bus/pci/devices";
const DEV_IOMMU: &str = "/dev/iommu";
const DEV_VFIO_DEVICES: &str = "/dev/vfio/devices";
const SYS_CLASS_VFIO_DEV: &str = "/sys/class/vfio-dev";
const SYS_VFIO_AP: &str = "/sys/devices/vfio_ap";
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VfioIommufdBackend {
@@ -47,6 +48,10 @@ pub struct VfioDevice {
pub primary: DeviceInfo,
pub labels: BTreeMap<String, String>,
pub health: Health,
/// APQNs (Adjunct Processor Queue Numbers) for MediatedAp devices, e.g. ["0a.0001", "0b.0002"].
/// Populated by discover_vfio_ap_device(); empty for all non-AP device types.
pub ap_devices: Vec<String>,
}
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
@@ -656,6 +661,7 @@ fn discover_vfio_device_for_iommu_group(gid: u32, group_devnode: PathBuf) -> Res
primary: primary_device,
labels,
health,
ap_devices: Vec::new(),
})
}
@@ -770,3 +776,96 @@ pub fn is_dev_vfio_group_path(host_path: &str) -> bool {
// Valid if remainder is non-empty and contains only digits
!rest.is_empty() && rest.bytes().all(|b| b.is_ascii_digit())
}
/// Reports whether the VFIO group node `group_devnode` (e.g. `/dev/vfio/N`) holds
/// an s390x AP mediated device.
///
/// The group id is parsed from the node path, then every entry of
/// `<SYS_IOMMU_GROUPS>/<N>/devices` is canonicalized; the group counts as AP when
/// any resolved path lies under `SYS_VFIO_AP` (`/sys/devices/vfio_ap`).
///
/// Returns `false` when the path is not a VFIO group node, when the group's
/// devices directory cannot be read, or when no entry resolves under
/// `SYS_VFIO_AP`. Entries that fail to enumerate or canonicalize are ignored.
pub fn is_vfio_ap_device(group_devnode: &Path) -> bool {
    let devices_dir = match parse_dev_vfio_group_id(&group_devnode.to_string_lossy()) {
        Some(gid) => Path::new(SYS_IOMMU_GROUPS)
            .join(gid.to_string())
            .join("devices"),
        None => return false,
    };

    fs::read_dir(&devices_dir)
        .map(|entries| {
            // `any` stops at the first entry that resolves under the AP sysfs root.
            entries.flatten().any(|entry| {
                fs::canonicalize(devices_dir.join(entry.file_name()))
                    .map(|target| target.starts_with(SYS_VFIO_AP))
                    .unwrap_or(false)
            })
        })
        .unwrap_or(false)
}
/// Discovers an s390x VFIO-AP mediated device from its VFIO group path (`/dev/vfio/N`).
///
/// Reads the `matrix` file from the mdev's sysfs path to obtain the list of APQNs
/// (Adjunct Processor Queue Numbers) assigned to this matrix device.
pub fn discover_vfio_ap_device(group_devnode: &Path) -> Result<VfioDevice> {
let gid = parse_dev_vfio_group_id(&group_devnode.to_string_lossy())
.ok_or_else(|| anyhow!("Invalid VFIO group path: {}", group_devnode.display()))?;
let group_devices_dir = Path::new(SYS_IOMMU_GROUPS)
.join(gid.to_string())
.join("devices");
// Enumerate IOMMU group entries and find the AP mdev symlink.
let mut ap_sysfs_path: Option<PathBuf> = None;
for ent in fs::read_dir(&group_devices_dir)
.with_context(|| format!("Failed to read {}", group_devices_dir.display()))?
.flatten()
{
let link_path = group_devices_dir.join(ent.file_name());
if let Ok(resolved) = fs::canonicalize(&link_path) {
if resolved.starts_with(SYS_VFIO_AP) {
ap_sysfs_path = Some(resolved);
break;
}
}
}
let sysfs_dev = ap_sysfs_path
.ok_or_else(|| anyhow!("No VFIO-AP device found in IOMMU group {}", gid))?;
// Read APQNs from the `matrix` sysfs attribute (one APQN per line, e.g. "0a.0001").
let matrix_path = sysfs_dev.join("matrix");
let matrix_raw = fs::read_to_string(&matrix_path)
.with_context(|| format!("Failed to read {}", matrix_path.display()))?;
let ap_devices: Vec<String> = matrix_raw
.lines()
.map(|l| l.trim().to_string())
.filter(|l| !l.is_empty())
.collect();
let primary = DeviceInfo {
addr: DeviceAddress::MdevUuid(
sysfs_dev
.file_name()
.map(|n| n.to_string_lossy().to_string())
.unwrap_or_default(),
),
sysfs_path: sysfs_dev.clone(),
..Default::default()
};
Ok(VfioDevice {
id: format!("vfio-ap-{}", gid),
device_type: VfioDeviceType::MediatedAp,
bus_mode: VfioBusMode::Ccw,
iommu_group: None,
iommu_group_id: Some(gid),
iommufd: None,
devices: vec![primary.clone()],
primary,
labels: BTreeMap::new(),
health: Health::Healthy,
ap_devices,
})
}

View File

@@ -13,7 +13,10 @@ use crate::device::pci_path::PciPath;
use crate::device::topology::{PCIePort, PCIeTopology};
use crate::device::util::{do_decrease_count, do_increase_count};
use crate::device::{Device, DeviceType, PCIeDevice};
use crate::vfio_device::core::{discover_vfio_device, discover_vfio_group_device, VfioDevice};
use crate::vfio_device::core::{
discover_vfio_ap_device, discover_vfio_device, discover_vfio_group_device, is_vfio_ap_device,
VfioDevice, VfioDeviceType,
};
use crate::Hypervisor;
/// Identifies a specific port on a PCI bus: (bus_name, bus_slot, port_id)
@@ -64,6 +67,11 @@ pub struct VfioDeviceBase {
/// - VFIO Volume: "vfio_vol_"
/// - VFIO NVMe: "vfio_nvme_"
pub hostdev_prefix: String,
/// APQNs assigned to this device (s390x VFIO-AP only).
/// Each entry is a string like "0a.0001" read from the mdev matrix sysfs file.
/// Empty for all non-AP device types.
pub ap_devices: Vec<String>,
}
#[derive(Debug, Default, Clone)]
@@ -95,18 +103,33 @@ fn vfio_modern_group_discovery_path(base: &VfioDeviceBase) -> PathBuf {
impl VfioDeviceModern {
pub fn new(device_id: String, base: &VfioDeviceBase) -> Result<Self> {
// For modern VFIO devices, we require the specific device cdev path to be provided in the configuration.
// This allows us to directly discover the device context without needing to resolve group devices.
// If the device node is not provided, we can optionally fallback to group device discovery,
// but this is less efficient and may not be supported in all environments.
let group_path = vfio_modern_group_discovery_path(base);
// s390x VFIO-AP: mediated AP devices have no PCI BDF; discover them separately.
if is_vfio_ap_device(&group_path) {
let device = discover_vfio_ap_device(&group_path)?;
let mut config = base.clone();
config.ap_devices = device.ap_devices.clone();
return Ok(Self {
device_id,
device,
config,
device_options: Vec::new(),
is_allocated: false,
attach_count: 0,
});
}
// PCI / iommufd path: use the iommu_device_node cdev when available, otherwise
// fall back to group-device discovery.
let device = if let Some(ref node) = base.iommu_device_node {
if !node.as_os_str().is_empty() {
discover_vfio_device(node)?
} else {
discover_vfio_group_device(vfio_modern_group_discovery_path(base))?
discover_vfio_group_device(group_path)?
}
} else {
discover_vfio_group_device(vfio_modern_group_discovery_path(base))?
discover_vfio_group_device(group_path)?
};
Ok(Self {
device_id,
@@ -196,21 +219,31 @@ impl Device for VfioDeviceModernHandle {
return Ok(());
}
// Register the device in the virtual PCIe topology
let topo = pcie_topo.as_deref_mut().ok_or_else(|| {
anyhow::anyhow!("VFIO device requires a PCIe topology but none was provided")
})?;
self.register(topo).await?;
let is_ap = self
.with(|d| d.device.device_type == VfioDeviceType::MediatedAp)
.await;
// Request Hypervisor to perform the actual hardware passthrough
if let Err(e) = h.add_device(DeviceType::VfioModern(self.arc())).await {
error!(sl!(), "failed to attach vfio device: {:?}", e);
// Rollback state on failure
self.decrease_attach_count().await?;
self.unregister(topo).await?;
return Err(e);
if is_ap {
// AP devices have no PCIe topology; call the hypervisor directly.
if let Err(e) = h.add_device(DeviceType::VfioModern(self.arc())).await {
error!(sl!(), "failed to attach vfio-ap device: {:?}", e);
self.decrease_attach_count().await?;
return Err(e);
}
} else {
// PCI devices must be registered in the topology first.
let topo = pcie_topo.as_deref_mut().ok_or_else(|| {
anyhow::anyhow!("VFIO device requires a PCIe topology but none was provided")
})?;
self.register(topo).await?;
if let Err(e) = h.add_device(DeviceType::VfioModern(self.arc())).await {
error!(sl!(), "failed to attach vfio device: {:?}", e);
self.decrease_attach_count().await?;
self.unregister(topo).await?;
return Err(e);
}
}
info!(
sl!(),
"vfio device {:?} attached successfully",
@@ -247,9 +280,14 @@ impl Device for VfioDeviceModernHandle {
let virt = self.with(|d| d.config.virt_path.clone()).await;
let device_index = virt.map(|(idx, _)| idx);
// Unregister from PCIe topology
if let Some(topo) = pcie_topo {
self.unregister(topo).await?;
// AP devices have no PCIe topology to unregister from.
let is_ap = self
.with(|d| d.device.device_type == VfioDeviceType::MediatedAp)
.await;
if !is_ap {
if let Some(topo) = pcie_topo {
self.unregister(topo).await?;
}
}
Ok(device_index)
@@ -279,6 +317,14 @@ impl Device for VfioDeviceModernHandle {
impl PCIeDevice for VfioDeviceModernHandle {
/// Reserves a bus and port in the PCIe topology for this device.
async fn register(&mut self, topo: &mut PCIeTopology) -> Result<()> {
// AP devices have no PCIe topology.
if self
.with(|d| d.device.device_type == VfioDeviceType::MediatedAp)
.await
{
return Ok(());
}
let device_id = self.device_id().await;
let port_type = self.with(|d| d.config.port).await;
@@ -299,6 +345,14 @@ impl PCIeDevice for VfioDeviceModernHandle {
/// Releases the reserved PCIe resources and resets attachment state.
async fn unregister(&mut self, topo: &mut PCIeTopology) -> Result<()> {
// AP devices have no PCIe topology.
if self
.with(|d| d.device.device_type == VfioDeviceType::MediatedAp)
.await
{
return Ok(());
}
let device_id = self.device_id().await;
topo.release_bus_for_device(&device_id)?;

View File

@@ -6,7 +6,10 @@
mod core;
mod device;
pub use core::{discover_vfio_group_device, VfioDevice};
pub use core::{
discover_vfio_ap_device, discover_vfio_group_device, is_vfio_ap_device, VfioDevice,
VfioDeviceType,
};
pub use device::VfioDeviceBase;
pub use device::VfioDeviceModern;
pub use device::VfioDeviceModernHandle;

View File

@@ -2331,6 +2331,21 @@ impl PCIeVfioDevice {
}
}
/// Command-line representation of an s390x VFIO-AP device, rendered as
/// `-device vfio-ap,sysfsdev=<path>`.
struct VfioApDevice {
    /// Value emitted verbatim after `sysfsdev=`.
    sysfs_path: String,
}

#[async_trait]
impl ToQemuParams for VfioApDevice {
    /// Returns the two arguments `-device` and `vfio-ap,sysfsdev=<sysfs_path>`.
    async fn qemu_params(&self) -> Result<Vec<String>> {
        let device_arg = format!("vfio-ap,sysfsdev={}", self.sysfs_path);
        Ok(vec![String::from("-device"), device_arg])
    }
}
#[async_trait]
impl ToQemuParams for PCIeVfioDevice {
async fn qemu_params(&self) -> Result<Vec<String>> {
@@ -3169,6 +3184,16 @@ impl<'a> QemuCmdLine<'a> {
Ok(())
}
/// Queues an s390x VFIO-AP device for the QEMU command line.
///
/// A `VfioApDevice` carrying a copy of `sysfs_path` is appended to `self.devices`;
/// it generates `-device vfio-ap,sysfsdev=<sysfs_path>`. Always returns `Ok(())`.
pub fn add_vfio_ap_device(&mut self, sysfs_path: &str) -> Result<()> {
    let ap_device = VfioApDevice {
        sysfs_path: sysfs_path.to_owned(),
    };
    self.devices.push(Box::new(ap_device));
    Ok(())
}
/// Batch adds multiple VFIO devices to the QEMU command line.
pub fn add_vfio_devices(&mut self, configs: Vec<VfioDeviceConfig>) -> Result<()> {
if configs.is_empty() {

View File

@@ -203,50 +203,59 @@ impl QemuInner {
}
}
DeviceType::VfioModern(vfio_dev) => {
// To avoid holding the lock for too long, we first snapshot the necessary VFIO parameters,
// then release the lock before doing the coldplug via cmdline,
// and finally re-acquire the lock to update the guest PCI path after coldplug.
let (devices, bus_port_id) = {
// Snapshot parameters under the lock; release before doing cmdline work.
let (device_type, ap_sysfs_path, devices, bus_port_id) = {
let vfio_device = vfio_dev.lock().await;
let device_type = vfio_device.device.device_type.clone();
let ap_sysfs_path =
vfio_device.device.primary.sysfs_path.display().to_string();
let devices = vfio_device
.device
.iommu_group
.as_ref()
.map(|g| g.clone().devices)
.unwrap_or_default();
(devices, vfio_device.config.bus_port_id.clone())
(
device_type,
ap_sysfs_path,
devices,
vfio_device.config.bus_port_id.clone(),
)
};
// Cold plug devices
for dev in devices.iter() {
let host_bdf = dev.addr.to_string();
if device_type == VfioDeviceType::MediatedAp {
// s390x VFIO-AP: -device vfio-ap,sysfsdev=<path>
// No PCIe root port, no guest_pci_path.
cmdline.add_vfio_ap_device(&ap_sysfs_path)?;
info!(sl!(), "Completed VFIOModern AP coldplug for sysfsdev: {}", ap_sysfs_path);
} else {
// PCI cold plug devices
for dev in devices.iter() {
let host_bdf = dev.addr.to_string();
let vfio_cfg = VfioDeviceConfig::new(
host_bdf,
bus_port_id.1 as u16,
bus_port_id.1 + 1,
)
.with_vfio_bus(bus_port_id.0.clone());
let vfio_cfg = VfioDeviceConfig::new(
host_bdf,
bus_port_id.1 as u16,
bus_port_id.1 + 1,
)
.with_vfio_bus(bus_port_id.0.clone());
cmdline.add_pcie_vfio_device(vfio_cfg)?;
cmdline.add_pcie_vfio_device(vfio_cfg)?;
}
// Write back guest PCI path
let pci_path =
PciPath::try_from(format!("{:02x}/00", bus_port_id.1).as_str())?;
{
let mut vfio_device = vfio_dev.lock().await;
vfio_device.config.guest_pci_path = Some(pci_path.clone());
}
info!(
sl!(),
"Completed VFIOModern coldplug with returned guest pci path: {:?}",
pci_path
);
}
// Write back with lock
let pci_path = PciPath::try_from(format!("{:02x}/00", bus_port_id.1).as_str())?;
{
let mut vfio_device = vfio_dev.lock().await;
// Update the guest PCI path for the VFIO device after coldplug,
// which will be used for device mapping into from Guest to Container Environment.
vfio_device.config.guest_pci_path = Some(pci_path.clone());
}
info!(
sl!(),
"Completed VFIOModern coldplug with returned guest pci path: {:?}",
pci_path
);
}
DeviceType::Vfio(vfio_dev) => {
// Cold-plug physical-endpoint VFs (non-IOMMUFD VFIO) onto
@@ -908,6 +917,7 @@ async fn log_qemu_stderr(stderr: ChildStderr, exit_notify: mpsc::Sender<()>) ->
}
use crate::device::DeviceType;
use crate::vfio_device::VfioDeviceType;
// device manager part of Hypervisor
impl QemuInner {
@@ -1141,38 +1151,41 @@ impl QemuInner {
}
DeviceType::VfioModern(ref vfiodev) => {
// Snapshot VFIO parameters inside the lock.
let (hostdev_id, sysfs_path, address, driver_type, bus) = {
let (hostdev_id, device_type, sysfs_path, address, driver_type, bus) = {
let vfio_device = vfiodev.lock().await;
let hostdev_id = vfio_device.device_id.clone();
let device = &vfio_device.device;
let device_type = device.device_type.clone();
// FIXME: The first device in the group might not be the actual device intended for passthrough.
// Multi-function support is tracked via issue #11292.
let primary_device = device
.clone()
.iommu_group
.ok_or_else(|| anyhow!("IOMMU group missing for VFIO device"))?
.primary;
let sysfs_path = device.primary.sysfs_path.display().to_string();
info!(
sl!(),
"QMP hotplug VFIO primary_device address: {:?}", &primary_device.addr
);
// For AP devices there is no IOMMU group or BDF; use empty strings.
let (address, driver_type, bus) = if device_type == VfioDeviceType::MediatedAp {
(String::new(), "vfio-ap".to_string(), String::new())
} else {
// FIXME: The first device in the group might not be the actual device intended for passthrough.
// Multi-function support is tracked via issue #11292.
let primary_device = device
.clone()
.iommu_group
.ok_or_else(|| anyhow!("IOMMU group missing for VFIO device"))?
.primary;
let sysfs_path = primary_device.sysfs_path.display().to_string();
let driver_type = primary_device
.driver
.clone()
.ok_or_else(|| anyhow!("Driver type missing for primary device"))?;
let address = format!("{}", primary_device.addr);
info!(
sl!(),
"QMP hotplug VFIO primary_device address: {:?}", &primary_device.addr
);
(
hostdev_id,
sysfs_path,
address,
driver_type,
vfio_device.config.bus_port_id.0.clone(),
)
let driver_type = primary_device
.driver
.clone()
.ok_or_else(|| anyhow!("Driver type missing for primary device"))?;
let address = format!("{}", primary_device.addr);
let bus = vfio_device.config.bus_port_id.0.clone();
(address, driver_type, bus)
};
(hostdev_id, device_type, sysfs_path, address, driver_type, bus)
};
// Execute hotplug outside the lock.
@@ -1184,12 +1197,13 @@ impl QemuInner {
&bus,
)?;
// Write the resulting Guest PCI Path back within the lock.
// Write the resulting Guest PCI Path back within the lock (PCI only).
{
let mut vfio_device = vfiodev.lock().await;
if let Some(p) = guest_pci_path {
// Very important to write back the guest pci path for VFIO devices.
vfio_device.config.guest_pci_path = Some(p);
if device_type != VfioDeviceType::MediatedAp {
if let Some(p) = guest_pci_path {
vfio_device.config.guest_pci_path = Some(p);
}
}
info!(
sl!(),

View File

@@ -11,23 +11,27 @@ use anyhow::{anyhow, Context, Result};
use async_trait::async_trait;
use hypervisor::{
device::{
device_manager::{do_handle_device, get_block_device_info, DeviceManager},
device_manager::{
do_handle_device, find_cold_plugged_vfio_ap, get_block_device_info, DeviceManager,
},
util::{get_host_path, DEVICE_TYPE_BLOCK, DEVICE_TYPE_CHAR},
DeviceConfig, DeviceType,
},
utils::uses_native_ccw_bus,
vfio_device::is_vfio_ap_device,
BlockConfig, BlockDeviceAio, Hypervisor, VfioConfig,
};
use kata_types::mount::{kata_guest_sandbox_dir, Mount, KATA_EPHEMERAL_VOLUME_TYPE, SHM_DIR};
use kata_types::{
config::{hypervisor::TopologyConfigInfo, TomlConfig},
device::DRIVER_VFIO_AP_COLD_TYPE,
mount::{adjust_rootfs_mounts, KATA_IMAGE_FORCE_GUEST_PULL},
};
use libc::NUD_PERMANENT;
use oci::{Linux, LinuxCpu, LinuxResources};
use oci_spec::runtime::{self as oci, LinuxDeviceType};
use persist::sandbox_persist::Persist;
use std::path::PathBuf;
use std::path::{Path, PathBuf};
use tokio::{runtime, sync::RwLock};
use crate::{
@@ -661,6 +665,43 @@ impl ResourceManagerInner {
continue;
}
// VFIO-AP devices have no PCIe BDF, so cold_plug_bdfs above
// cannot catch them. If this device was registered in the
// device manager during cold-plug (prepare_coldplug_raw_vfio_devices
// or CDI), retrieve its APQN list and build the agent device
// directly — calling do_handle_device on an already-present
// device would attempt a QMP device_add and fail.
if is_vfio_ap_device(Path::new(&host_path)) {
if let Some(ap_devs) =
find_cold_plugged_vfio_ap(&self.device_manager, &host_path).await
{
let container_path = d.path().display().to_string();
let group_num = d
.path()
.file_name()
.and_then(|n| n.to_str())
.unwrap_or_default()
.to_string();
let agent_device = Device {
id: group_num,
container_path,
field_type: DRIVER_VFIO_AP_COLD_TYPE.to_string(),
options: ap_devs,
..Default::default()
};
info!(
sl!(),
"vfio-ap cold-plugged agent device: {:?}", agent_device
);
devices.push(ContainerDevice {
device_info: None,
device: agent_device,
});
continue;
}
// Not registered as cold-plugged — fall through to do_handle_device.
}
let bus_type = if uses_native_ccw_bus() {
"ccw".to_string()
} else {
@@ -681,6 +722,14 @@ impl ResourceManagerInner {
if let DeviceType::VfioModern(vfio_dev) = device_info.clone() {
info!(sl!(), "device info: {:?}", vfio_dev.lock().await);
let vfio_device = vfio_dev.lock().await;
let group_num = d
.path()
.file_name()
.and_then(|n| n.to_str())
.unwrap_or_default()
.to_string();
let guest_pci_path = vfio_device
.config
.guest_pci_path
@@ -697,15 +746,8 @@ impl ResourceManagerInner {
// vfio mode: vfio-pci and vfio-pci-gk for x86_64
// - vfio-pci, devices appear as VFIO character devices under /dev/vfio in container.
// - vfio-pci-gk, devices are managed by whatever driver in Guest kernel.
// - vfio-ap, devices appear as VFIO character devices under /dev/vfio in container for ccw devices.
let vfio_mode = match self.toml_config.runtime.vfio_mode.as_str() {
"vfio" => {
if bus_type == "ccw" {
"vfio-ap".to_string()
} else {
"vfio-pci".to_string()
}
}
"vfio" => "vfio-pci".to_string(),
_ => "vfio-pci-gk".to_string(),
};
let device_options = vec![format!("{}={}", host_bdf, guest_pci_path)];
@@ -713,12 +755,6 @@ impl ResourceManagerInner {
// filepath.Base(dev.ContainerPath), e.g. "vfio0".
// The agent policy validates this with:
// i_vfio_device.id == concat("", ["vfio", suffix])
let group_num = d
.path()
.file_name()
.and_then(|n| n.to_str())
.unwrap_or_default()
.to_string();
let agent_device = Device {
id: group_num,
container_path: d.path().display().to_string().clone(),

View File

@@ -47,6 +47,10 @@ fn kata_guest_root_shared_fs() -> String {
/// If the container is a PodSandbox, it returns "pause".
/// Otherwise, it attempts to find the image name using the appropriate Kubernetes
/// annotation key.
///
/// If no container type annotation is found (SingleContainer case), it falls back to
/// checking for image name annotations directly. This supports standalone container
/// runtimes like nerdctl that may only provide the image name annotation.
pub fn get_image_reference(spec_annotations: &HashMap<String, String>) -> Result<&str> {
info!(
sl!(),
@@ -74,6 +78,26 @@ pub fn get_image_reference(spec_annotations: &HashMap<String, String>) -> Result
}
}
// Fallback for SingleContainer case: if no container type annotation is found,
// try to get image name directly. This supports standalone container runtimes
// (e.g., nerdctl) that may only provide the image name annotation without the
// container type annotation.
if let Some(image_name) = spec_annotations.get(KUBERNETES_CRI_IMAGE_NAME) {
info!(
sl!(),
"Found image name without container type annotation (SingleContainer): {}", image_name
);
return Ok(image_name.as_str());
}
if let Some(image_name) = spec_annotations.get(KUBERNETES_CRIO_IMAGE_NAME) {
info!(
sl!(),
"Found CRI-O image name without container type annotation (SingleContainer): {}", image_name
);
return Ok(image_name.as_str());
}
Err(anyhow!("no target image reference found"))
}
@@ -256,6 +280,25 @@ mod tests {
let image_ref_result_pod_sandbox = get_image_reference(&annotations_pod_sandbox);
assert!(image_ref_result_pod_sandbox.is_ok());
assert_eq!(image_ref_result_pod_sandbox.unwrap(), "pause");
// Test SingleContainer fallback (no container type annotation)
let mut annotations_single = HashMap::new();
annotations_single.insert(
"io.kubernetes.cri.image-name".to_string(),
"example-image-single".to_string(),
);
let image_ref_result_single = get_image_reference(&annotations_single);
assert!(image_ref_result_single.is_ok());
assert_eq!(image_ref_result_single.unwrap(), "example-image-single");
// Test SingleContainer fallback with CRI-O annotation
let mut annotations_single_crio = HashMap::new();
annotations_single_crio.insert(
"io.kubernetes.cri-o.ImageName".to_string(),
"example-image-single-crio".to_string(),
);
let image_ref_result_single_crio = get_image_reference(&annotations_single_crio);
assert!(image_ref_result_single_crio.is_ok());
assert_eq!(image_ref_result_single_crio.unwrap(), "example-image-single-crio");
}
#[tokio::test]

View File

@@ -31,8 +31,9 @@ use containerd_shim_protos::events::task::{TaskExit, TaskOOM};
))]
use hypervisor::ch::CloudHypervisor;
use hypervisor::device::topology::PCIePort;
use hypervisor::device::util::{get_host_path, DEVICE_TYPE_CHAR};
use hypervisor::remote::Remote;
use hypervisor::VfioDeviceBase;
use hypervisor::{is_vfio_ap_device, VfioDeviceBase};
use hypervisor::VsockConfig;
use hypervisor::HYPERVISOR_REMOTE;
#[cfg(all(
@@ -262,7 +263,23 @@ impl VirtSandbox {
None
};
let vfio_devices = self.prepare_coldplug_cdi_devices(sandbox_config).await?;
// Cold-plug VFIO devices using two mutually exclusive paths:
// 1. CDI path: Query Kubernetes Pod Resources API for devices managed by device plugins
// (typical in K8s environments with device plugins)
// 2. Raw VFIO path: Parse OCI spec's linux.devices for directly specified VFIO devices
// (typical in standalone containers like `ctr --device /dev/vfio/0`)
//
// These paths are mutually exclusive from a user perspective:
// - In K8s, devices come through device plugins, not raw OCI device specs
// - In standalone containers, there's no Pod Resources API available
//
// Therefore, we only attempt the raw VFIO path if CDI finds no devices,
// avoiding unnecessary file I/O and OCI spec parsing in the common K8s case.
let mut vfio_devices = self.prepare_coldplug_cdi_devices(sandbox_config).await?;
if vfio_devices.is_empty() {
let raw_vfio = self.prepare_coldplug_raw_vfio_devices(sandbox_config).await?;
vfio_devices.extend(raw_vfio);
}
if !vfio_devices.is_empty() {
info!(
sl!(),
@@ -387,6 +404,97 @@ impl VirtSandbox {
.collect())
}
/// Collects VFIO char devices listed in the bundle's OCI spec (`linux.devices`) so they
/// can be cold-plugged before VM boot.
///
/// This is the fallback cold-plug path for standalone containers
/// (e.g. `ctr --device /dev/vfio/0`), mirroring Go's coldOrHotPlugVFIO().
///
/// An empty list is returned when `cold_plug_vfio` is empty or "no-port", when the
/// sandbox has no bundle path, or when the OCI spec cannot be loaded from the bundle.
/// Devices whose host path cannot be resolved are skipped with a warning. Only nodes
/// under `/dev/vfio/` other than `/dev/vfio/vfio` are kept. VFIO-AP groups are assigned
/// `PCIePort::NoPort`; every other device gets the configured port.
///
/// # Errors
/// Fails when `cold_plug_vfio` holds any value other than "", "no-port" or "root-port".
async fn prepare_coldplug_raw_vfio_devices(
    &self,
    sandbox_config: &SandboxConfig,
) -> Result<Vec<ResourceConfig>> {
    let hypervisor_config = self.hypervisor.hypervisor_config().await;
    let port = match hypervisor_config.device_info.cold_plug_vfio.as_str() {
        "" | "no-port" => return Ok(Vec::new()),
        "root-port" => PCIePort::RootPort,
        other => {
            return Err(anyhow!(
                "unsupported cold_plug_vfio value {:?}; only \"root-port\" is supported",
                other
            ))
        }
    };

    let bundle = &sandbox_config.state.bundle;
    if bundle.is_empty() {
        return Ok(Vec::new());
    }

    let spec_path = format!("{}/{}", bundle, spec::OCI_SPEC_CONFIG_FILE_NAME);
    let oci_spec = match oci::Spec::load(&spec_path) {
        Ok(loaded) => loaded,
        Err(e) => {
            // A missing or unreadable spec is not an error here: there is simply
            // nothing to cold-plug from it.
            info!(
                sl!(),
                "no OCI spec at {:?}: {:?}, skipping raw VFIO cold-plug", spec_path, e
            );
            return Ok(Vec::new());
        }
    };

    let linux_devices = oci_spec
        .linux()
        .as_ref()
        .and_then(|linux| linux.devices().as_ref());

    let mut vfio_configs = Vec::new();
    for dev in linux_devices.into_iter().flatten() {
        if dev.typ() != oci::LinuxDeviceType::C {
            continue;
        }

        let host_path = match get_host_path(DEVICE_TYPE_CHAR, dev.major(), dev.minor()) {
            Ok(resolved) => resolved,
            Err(e) => {
                warn!(
                    sl!(),
                    "failed to resolve host path for {:?}: {:?}", dev.path(), e
                );
                continue;
            }
        };

        // Keep only VFIO passthrough nodes under /dev/vfio/*; the legacy VFIO
        // control node (/dev/vfio/vfio) is not a passthrough device.
        let is_passthrough_node =
            host_path.starts_with("/dev/vfio/") && host_path != "/dev/vfio/vfio";
        if !is_passthrough_node {
            continue;
        }

        let device_port = if is_vfio_ap_device(Path::new(&host_path)) {
            PCIePort::NoPort
        } else {
            port
        };

        vfio_configs.push(VfioDeviceBase {
            iommu_group_devnode: PathBuf::from(&host_path),
            host_path,
            dev_type: "c".to_string(),
            port: device_port,
            hostdev_prefix: "vfio_device".to_owned(),
            ..Default::default()
        });
    }

    info!(sl!(), "raw VFIO cold-plug candidates: {:?}", vfio_configs);

    Ok(vfio_configs
        .into_iter()
        .map(ResourceConfig::VfioDeviceModern)
        .collect())
}
async fn prepare_network_resource(
&self,
network_env: &SandboxNetworkEnv,

View File

@@ -63,6 +63,7 @@ setup_hotplug() {
show_config_file
elif [[ "${runtime}" == "runtime-rs" ]]; then
setup_config_file "vfio_mode" "replace" "vfio" "runtime-rs"
setup_config_file "cold_plug_vfio" "replace" "no-port" "runtime-rs"
show_config_file "runtime-rs"
else
echo "Invalid runtime: ${runtime}" >&2
@@ -79,8 +80,9 @@ setup_coldplug() {
setup_config_file "cold_plug_vfio" "replace" "bridge-port"
show_config_file
elif [[ "${runtime}" == "runtime-rs" ]]; then
echo "Coldplug is not supported for runtime-rs" >&2
exit 1
setup_config_file "vfio_mode" "replace" "vfio" "runtime-rs"
setup_config_file "cold_plug_vfio" "replace" "root-port" "runtime-rs"
show_config_file "runtime-rs"
else
echo "Invalid runtime: ${runtime}" >&2
exit 1
@@ -296,6 +298,9 @@ run_tests() {
setup_hotplug "runtime-rs"
run_test "3" "runtime-rs" "Test can assign a CEX device inside the guest via VFIO-AP Hotplug" "&& zcrypttest -a -v"
setup_coldplug "runtime-rs"
run_test "4" "runtime-rs" "Test can assign a CEX device inside the guest via VFIO-AP Coldplug" "&& zcrypttest -a -v"
}
main() {