mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-07-04 00:04:37 +00:00
Merge pull request #13153 from BbolroC/vfio-ap-passthrough-coldplug-runtime-rs
runtime-rs: VFIO-AP cold-plug support on s390x
This commit is contained in:
@@ -381,6 +381,12 @@ disable_image_nvdimm = false
|
||||
# Default false
|
||||
hotplug_vfio_on_root_bus = false
|
||||
|
||||
# Enable cold-plugging of VFIO devices to a PCIe port type.
|
||||
# Accepted values: "no-port" (default, disabled), "root-port".
|
||||
# When set to "root-port", devices discovered via CDI / Pod Resources
|
||||
# are cold-plugged before VM boot.
|
||||
cold_plug_vfio = "no-port"
|
||||
|
||||
# Before hot plugging a PCIe device, you need to add a pcie_root_port device.
|
||||
# Use this parameter when using some large PCI bar devices, such as Nvidia GPU
|
||||
# The value means the number of pcie_root_port
|
||||
|
||||
@@ -365,6 +365,12 @@ disable_image_nvdimm = true
|
||||
# Default false
|
||||
hotplug_vfio_on_root_bus = false
|
||||
|
||||
# Enable cold-plugging of VFIO devices to a PCIe port type.
|
||||
# Accepted values: "no-port" (disabled), "root-port".
|
||||
# When set to "root-port", devices discovered via CDI / Pod Resources
|
||||
# are cold-plugged before VM boot.
|
||||
cold_plug_vfio = "root-port"
|
||||
|
||||
# Before hot plugging a PCIe device, you need to add a pcie_root_port device.
|
||||
# Use this parameter when using some large PCI bar devices, such as Nvidia GPU
|
||||
# The value means the number of pcie_root_port
|
||||
|
||||
@@ -18,12 +18,13 @@ use kata_types::config::hypervisor::{
|
||||
use tokio::sync::{Mutex, RwLock};
|
||||
|
||||
use crate::{
|
||||
vfio_device::VfioDeviceModernHandle, vhost_user_blk::VhostUserBlkDevice, BlockConfig,
|
||||
BlockConfigModern, BlockDevice, BlockDeviceModernHandle, HybridVsockDevice, Hypervisor,
|
||||
NetworkDevice, PCIePortDevice, ProtectionDevice, ShareFsDevice, VfioDevice, VhostUserConfig,
|
||||
VhostUserNetDevice, VsockDevice, KATA_BLK_DEV_TYPE, KATA_CCW_DEV_TYPE, KATA_MMIO_BLK_DEV_TYPE,
|
||||
KATA_NVDIMM_DEV_TYPE, KATA_SCSI_DEV_TYPE, VIRTIO_BLOCK_CCW, VIRTIO_BLOCK_MMIO,
|
||||
VIRTIO_BLOCK_PCI, VIRTIO_PMEM,
|
||||
vfio_device::{VfioDeviceModernHandle, VfioDeviceType},
|
||||
vhost_user_blk::VhostUserBlkDevice,
|
||||
BlockConfig, BlockConfigModern, BlockDevice, BlockDeviceModernHandle, HybridVsockDevice,
|
||||
Hypervisor, NetworkDevice, PCIePortDevice, ProtectionDevice, ShareFsDevice, VfioDevice,
|
||||
VhostUserConfig, VhostUserNetDevice, VsockDevice, KATA_BLK_DEV_TYPE, KATA_CCW_DEV_TYPE,
|
||||
KATA_MMIO_BLK_DEV_TYPE, KATA_NVDIMM_DEV_TYPE, KATA_SCSI_DEV_TYPE, VIRTIO_BLOCK_CCW,
|
||||
VIRTIO_BLOCK_MMIO, VIRTIO_BLOCK_PCI, VIRTIO_PMEM,
|
||||
};
|
||||
|
||||
use super::{
|
||||
@@ -724,6 +725,36 @@ pub async fn get_shared_fs_info(d: &RwLock<DeviceManager>) -> SharedFsInfo {
|
||||
d.read().await.get_shared_fs_info().await
|
||||
}
|
||||
|
||||
/// Returns the APQN list for a cold-plugged VFIO-AP device whose
|
||||
/// `iommu_group_devnode` matches `host_path`, or `None` if no such device is
|
||||
/// registered in the device manager.
|
||||
///
|
||||
/// Used by `handler_devices` to bypass `do_handle_device` for VFIO-AP devices
|
||||
/// that were cold-plugged before VM boot. VFIO-AP devices have no PCIe BDF so
|
||||
/// the BDF-keyed `cold_plug_bdfs` map cannot catch them; this lookup fills that
|
||||
/// gap without touching reference counts or the QMP hot-plug path.
|
||||
pub async fn find_cold_plugged_vfio_ap(
|
||||
d: &RwLock<DeviceManager>,
|
||||
host_path: &str,
|
||||
) -> Option<Vec<String>> {
|
||||
// Avoid holding the DeviceManager read-lock across .await points.
|
||||
let devices: Vec<ArcMutexDevice> = {
|
||||
let dm = d.read().await;
|
||||
dm.devices.values().cloned().collect()
|
||||
};
|
||||
for dev in devices {
|
||||
if let DeviceType::VfioModern(inner) = dev.lock().await.get_device_info().await {
|
||||
let guard = inner.lock().await;
|
||||
if guard.device.device_type == VfioDeviceType::MediatedAp
|
||||
&& guard.config.iommu_group_devnode == Path::new(host_path)
|
||||
{
|
||||
return Some(guard.config.ap_devices.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::DeviceManager;
|
||||
|
||||
@@ -20,10 +20,10 @@ mod virtio_vsock;
|
||||
pub use port_device::{PCIePortDevice, PortDeviceConfig};
|
||||
pub use protection_device::{ProtectionDevice, ProtectionDeviceConfig, SevSnpConfig, TdxConfig};
|
||||
pub use vfio::{
|
||||
bind_device_to_host, bind_device_to_vfio, get_vfio_device, HostDevice, VfioBusMode, VfioConfig,
|
||||
VfioDevice,
|
||||
bind_device_to_host, bind_device_to_vfio, get_vfio_device, HostDevice, VfioBusMode,
|
||||
VfioConfig, VfioDevice, VfioDeviceType,
|
||||
};
|
||||
pub use vfio_device::{VfioDeviceBase, VfioDeviceModern, VfioDeviceModernHandle};
|
||||
pub use vfio_device::{is_vfio_ap_device, VfioDeviceBase, VfioDeviceModern, VfioDeviceModernHandle};
|
||||
pub use vhost_user::{VhostUserConfig, VhostUserDevice, VhostUserType};
|
||||
pub use vhost_user_net::VhostUserNetDevice;
|
||||
pub use virtio_blk::{
|
||||
|
||||
@@ -19,6 +19,7 @@ const SYS_PCI_DEVS: &str = "/sys/bus/pci/devices";
|
||||
const DEV_IOMMU: &str = "/dev/iommu";
|
||||
const DEV_VFIO_DEVICES: &str = "/dev/vfio/devices";
|
||||
const SYS_CLASS_VFIO_DEV: &str = "/sys/class/vfio-dev";
|
||||
const SYS_VFIO_AP: &str = "/sys/devices/vfio_ap";
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct VfioIommufdBackend {
|
||||
@@ -47,6 +48,10 @@ pub struct VfioDevice {
|
||||
pub primary: DeviceInfo,
|
||||
pub labels: BTreeMap<String, String>,
|
||||
pub health: Health,
|
||||
|
||||
/// APQNs (Adjunct Processor Queue Numbers) for MediatedAp devices, e.g. ["0a.0001", "0b.0002"].
|
||||
/// Populated by discover_vfio_ap_device(); empty for all non-AP device types.
|
||||
pub ap_devices: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -656,6 +661,7 @@ fn discover_vfio_device_for_iommu_group(gid: u32, group_devnode: PathBuf) -> Res
|
||||
primary: primary_device,
|
||||
labels,
|
||||
health,
|
||||
ap_devices: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -770,3 +776,96 @@ pub fn is_dev_vfio_group_path(host_path: &str) -> bool {
|
||||
// Valid if remainder is non-empty and contains only digits
|
||||
!rest.is_empty() && rest.bytes().all(|b| b.is_ascii_digit())
|
||||
}
|
||||
|
||||
/// Returns true if the VFIO group at `group_devnode` (e.g. `/dev/vfio/N`) contains
|
||||
/// an s390x AP mediated device. Detection works by resolving the sysfs symlinks for
|
||||
/// every entry in `/sys/kernel/iommu_groups/<N>/devices/` and checking whether any of
|
||||
/// them resolves to a path under `/sys/devices/vfio_ap`.
|
||||
pub fn is_vfio_ap_device(group_devnode: &Path) -> bool {
|
||||
let gid = match parse_dev_vfio_group_id(&group_devnode.to_string_lossy()) {
|
||||
Some(id) => id,
|
||||
None => return false,
|
||||
};
|
||||
let group_devices_dir = Path::new(SYS_IOMMU_GROUPS)
|
||||
.join(gid.to_string())
|
||||
.join("devices");
|
||||
let rd = match fs::read_dir(&group_devices_dir) {
|
||||
Ok(r) => r,
|
||||
Err(_) => return false,
|
||||
};
|
||||
for ent in rd.flatten() {
|
||||
let link_path = group_devices_dir.join(ent.file_name());
|
||||
if let Ok(resolved) = fs::canonicalize(&link_path) {
|
||||
if resolved.starts_with(SYS_VFIO_AP) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
/// Discovers an s390x VFIO-AP mediated device from its VFIO group path (`/dev/vfio/N`).
|
||||
///
|
||||
/// Reads the `matrix` file from the mdev's sysfs path to obtain the list of APQNs
|
||||
/// (Adjunct Processor Queue Numbers) assigned to this matrix device.
|
||||
pub fn discover_vfio_ap_device(group_devnode: &Path) -> Result<VfioDevice> {
|
||||
let gid = parse_dev_vfio_group_id(&group_devnode.to_string_lossy())
|
||||
.ok_or_else(|| anyhow!("Invalid VFIO group path: {}", group_devnode.display()))?;
|
||||
|
||||
let group_devices_dir = Path::new(SYS_IOMMU_GROUPS)
|
||||
.join(gid.to_string())
|
||||
.join("devices");
|
||||
|
||||
// Enumerate IOMMU group entries and find the AP mdev symlink.
|
||||
let mut ap_sysfs_path: Option<PathBuf> = None;
|
||||
for ent in fs::read_dir(&group_devices_dir)
|
||||
.with_context(|| format!("Failed to read {}", group_devices_dir.display()))?
|
||||
.flatten()
|
||||
{
|
||||
let link_path = group_devices_dir.join(ent.file_name());
|
||||
if let Ok(resolved) = fs::canonicalize(&link_path) {
|
||||
if resolved.starts_with(SYS_VFIO_AP) {
|
||||
ap_sysfs_path = Some(resolved);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let sysfs_dev = ap_sysfs_path
|
||||
.ok_or_else(|| anyhow!("No VFIO-AP device found in IOMMU group {}", gid))?;
|
||||
|
||||
// Read APQNs from the `matrix` sysfs attribute (one APQN per line, e.g. "0a.0001").
|
||||
let matrix_path = sysfs_dev.join("matrix");
|
||||
let matrix_raw = fs::read_to_string(&matrix_path)
|
||||
.with_context(|| format!("Failed to read {}", matrix_path.display()))?;
|
||||
let ap_devices: Vec<String> = matrix_raw
|
||||
.lines()
|
||||
.map(|l| l.trim().to_string())
|
||||
.filter(|l| !l.is_empty())
|
||||
.collect();
|
||||
|
||||
let primary = DeviceInfo {
|
||||
addr: DeviceAddress::MdevUuid(
|
||||
sysfs_dev
|
||||
.file_name()
|
||||
.map(|n| n.to_string_lossy().to_string())
|
||||
.unwrap_or_default(),
|
||||
),
|
||||
sysfs_path: sysfs_dev.clone(),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
Ok(VfioDevice {
|
||||
id: format!("vfio-ap-{}", gid),
|
||||
device_type: VfioDeviceType::MediatedAp,
|
||||
bus_mode: VfioBusMode::Ccw,
|
||||
iommu_group: None,
|
||||
iommu_group_id: Some(gid),
|
||||
iommufd: None,
|
||||
devices: vec![primary.clone()],
|
||||
primary,
|
||||
labels: BTreeMap::new(),
|
||||
health: Health::Healthy,
|
||||
ap_devices,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -13,7 +13,10 @@ use crate::device::pci_path::PciPath;
|
||||
use crate::device::topology::{PCIePort, PCIeTopology};
|
||||
use crate::device::util::{do_decrease_count, do_increase_count};
|
||||
use crate::device::{Device, DeviceType, PCIeDevice};
|
||||
use crate::vfio_device::core::{discover_vfio_device, discover_vfio_group_device, VfioDevice};
|
||||
use crate::vfio_device::core::{
|
||||
discover_vfio_ap_device, discover_vfio_device, discover_vfio_group_device, is_vfio_ap_device,
|
||||
VfioDevice, VfioDeviceType,
|
||||
};
|
||||
use crate::Hypervisor;
|
||||
|
||||
/// Identifies a specific port on a PCI bus: (bus_name, bus_slot, port_id)
|
||||
@@ -64,6 +67,11 @@ pub struct VfioDeviceBase {
|
||||
/// - VFIO Volume: "vfio_vol_"
|
||||
/// - VFIO NVMe: "vfio_nvme_"
|
||||
pub hostdev_prefix: String,
|
||||
|
||||
/// APQNs assigned to this device (s390x VFIO-AP only).
|
||||
/// Each entry is a string like "0a.0001" read from the mdev matrix sysfs file.
|
||||
/// Empty for all non-AP device types.
|
||||
pub ap_devices: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone)]
|
||||
@@ -95,18 +103,33 @@ fn vfio_modern_group_discovery_path(base: &VfioDeviceBase) -> PathBuf {
|
||||
|
||||
impl VfioDeviceModern {
|
||||
pub fn new(device_id: String, base: &VfioDeviceBase) -> Result<Self> {
|
||||
// For modern VFIO devices, we require the specific device cdev path to be provided in the configuration.
|
||||
// This allows us to directly discover the device context without needing to resolve group devices.
|
||||
// If the device node is not provided, we can optionally fallback to group device discovery,
|
||||
// but this is less efficient and may not be supported in all environments.
|
||||
let group_path = vfio_modern_group_discovery_path(base);
|
||||
|
||||
// s390x VFIO-AP: mediated AP devices have no PCI BDF; discover them separately.
|
||||
if is_vfio_ap_device(&group_path) {
|
||||
let device = discover_vfio_ap_device(&group_path)?;
|
||||
let mut config = base.clone();
|
||||
config.ap_devices = device.ap_devices.clone();
|
||||
return Ok(Self {
|
||||
device_id,
|
||||
device,
|
||||
config,
|
||||
device_options: Vec::new(),
|
||||
is_allocated: false,
|
||||
attach_count: 0,
|
||||
});
|
||||
}
|
||||
|
||||
// PCI / iommufd path: use the iommu_device_node cdev when available, otherwise
|
||||
// fall back to group-device discovery.
|
||||
let device = if let Some(ref node) = base.iommu_device_node {
|
||||
if !node.as_os_str().is_empty() {
|
||||
discover_vfio_device(node)?
|
||||
} else {
|
||||
discover_vfio_group_device(vfio_modern_group_discovery_path(base))?
|
||||
discover_vfio_group_device(group_path)?
|
||||
}
|
||||
} else {
|
||||
discover_vfio_group_device(vfio_modern_group_discovery_path(base))?
|
||||
discover_vfio_group_device(group_path)?
|
||||
};
|
||||
Ok(Self {
|
||||
device_id,
|
||||
@@ -196,21 +219,31 @@ impl Device for VfioDeviceModernHandle {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Register the device in the virtual PCIe topology
|
||||
let topo = pcie_topo.as_deref_mut().ok_or_else(|| {
|
||||
anyhow::anyhow!("VFIO device requires a PCIe topology but none was provided")
|
||||
})?;
|
||||
self.register(topo).await?;
|
||||
let is_ap = self
|
||||
.with(|d| d.device.device_type == VfioDeviceType::MediatedAp)
|
||||
.await;
|
||||
|
||||
// Request Hypervisor to perform the actual hardware passthrough
|
||||
if let Err(e) = h.add_device(DeviceType::VfioModern(self.arc())).await {
|
||||
error!(sl!(), "failed to attach vfio device: {:?}", e);
|
||||
|
||||
// Rollback state on failure
|
||||
self.decrease_attach_count().await?;
|
||||
self.unregister(topo).await?;
|
||||
return Err(e);
|
||||
if is_ap {
|
||||
// AP devices have no PCIe topology; call the hypervisor directly.
|
||||
if let Err(e) = h.add_device(DeviceType::VfioModern(self.arc())).await {
|
||||
error!(sl!(), "failed to attach vfio-ap device: {:?}", e);
|
||||
self.decrease_attach_count().await?;
|
||||
return Err(e);
|
||||
}
|
||||
} else {
|
||||
// PCI devices must be registered in the topology first.
|
||||
let topo = pcie_topo.as_deref_mut().ok_or_else(|| {
|
||||
anyhow::anyhow!("VFIO device requires a PCIe topology but none was provided")
|
||||
})?;
|
||||
self.register(topo).await?;
|
||||
if let Err(e) = h.add_device(DeviceType::VfioModern(self.arc())).await {
|
||||
error!(sl!(), "failed to attach vfio device: {:?}", e);
|
||||
self.decrease_attach_count().await?;
|
||||
self.unregister(topo).await?;
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"vfio device {:?} attached successfully",
|
||||
@@ -247,9 +280,14 @@ impl Device for VfioDeviceModernHandle {
|
||||
let virt = self.with(|d| d.config.virt_path.clone()).await;
|
||||
let device_index = virt.map(|(idx, _)| idx);
|
||||
|
||||
// Unregister from PCIe topology
|
||||
if let Some(topo) = pcie_topo {
|
||||
self.unregister(topo).await?;
|
||||
// AP devices have no PCIe topology to unregister from.
|
||||
let is_ap = self
|
||||
.with(|d| d.device.device_type == VfioDeviceType::MediatedAp)
|
||||
.await;
|
||||
if !is_ap {
|
||||
if let Some(topo) = pcie_topo {
|
||||
self.unregister(topo).await?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(device_index)
|
||||
@@ -279,6 +317,14 @@ impl Device for VfioDeviceModernHandle {
|
||||
impl PCIeDevice for VfioDeviceModernHandle {
|
||||
/// Reserves a bus and port in the PCIe topology for this device.
|
||||
async fn register(&mut self, topo: &mut PCIeTopology) -> Result<()> {
|
||||
// AP devices have no PCIe topology.
|
||||
if self
|
||||
.with(|d| d.device.device_type == VfioDeviceType::MediatedAp)
|
||||
.await
|
||||
{
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let device_id = self.device_id().await;
|
||||
let port_type = self.with(|d| d.config.port).await;
|
||||
|
||||
@@ -299,6 +345,14 @@ impl PCIeDevice for VfioDeviceModernHandle {
|
||||
|
||||
/// Releases the reserved PCIe resources and resets attachment state.
|
||||
async fn unregister(&mut self, topo: &mut PCIeTopology) -> Result<()> {
|
||||
// AP devices have no PCIe topology.
|
||||
if self
|
||||
.with(|d| d.device.device_type == VfioDeviceType::MediatedAp)
|
||||
.await
|
||||
{
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let device_id = self.device_id().await;
|
||||
topo.release_bus_for_device(&device_id)?;
|
||||
|
||||
|
||||
@@ -6,7 +6,10 @@
|
||||
mod core;
|
||||
mod device;
|
||||
|
||||
pub use core::{discover_vfio_group_device, VfioDevice};
|
||||
pub use core::{
|
||||
discover_vfio_ap_device, discover_vfio_group_device, is_vfio_ap_device, VfioDevice,
|
||||
VfioDeviceType,
|
||||
};
|
||||
pub use device::VfioDeviceBase;
|
||||
pub use device::VfioDeviceModern;
|
||||
pub use device::VfioDeviceModernHandle;
|
||||
|
||||
@@ -2331,6 +2331,21 @@ impl PCIeVfioDevice {
|
||||
}
|
||||
}
|
||||
|
||||
/// s390x VFIO-AP device: `-device vfio-ap,sysfsdev=<path>`
|
||||
struct VfioApDevice {
|
||||
sysfs_path: String,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ToQemuParams for VfioApDevice {
|
||||
async fn qemu_params(&self) -> Result<Vec<String>> {
|
||||
Ok(vec![
|
||||
"-device".to_string(),
|
||||
format!("vfio-ap,sysfsdev={}", self.sysfs_path),
|
||||
])
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ToQemuParams for PCIeVfioDevice {
|
||||
async fn qemu_params(&self) -> Result<Vec<String>> {
|
||||
@@ -3169,6 +3184,16 @@ impl<'a> QemuCmdLine<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Adds an s390x VFIO-AP device to the QEMU command line.
|
||||
///
|
||||
/// Generates: `-device vfio-ap,sysfsdev=<sysfs_path>`
|
||||
pub fn add_vfio_ap_device(&mut self, sysfs_path: &str) -> Result<()> {
|
||||
self.devices.push(Box::new(VfioApDevice {
|
||||
sysfs_path: sysfs_path.to_string(),
|
||||
}));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Batch adds multiple VFIO devices to the QEMU command line.
|
||||
pub fn add_vfio_devices(&mut self, configs: Vec<VfioDeviceConfig>) -> Result<()> {
|
||||
if configs.is_empty() {
|
||||
|
||||
@@ -203,50 +203,59 @@ impl QemuInner {
|
||||
}
|
||||
}
|
||||
DeviceType::VfioModern(vfio_dev) => {
|
||||
// To avoid holding the lock for too long, we first snapshot the necessary VFIO parameters,
|
||||
// then release the lock before doing the coldplug via cmdline,
|
||||
// and finally re-acquire the lock to update the guest PCI path after coldplug.
|
||||
let (devices, bus_port_id) = {
|
||||
// Snapshot parameters under the lock; release before doing cmdline work.
|
||||
let (device_type, ap_sysfs_path, devices, bus_port_id) = {
|
||||
let vfio_device = vfio_dev.lock().await;
|
||||
let device_type = vfio_device.device.device_type.clone();
|
||||
let ap_sysfs_path =
|
||||
vfio_device.device.primary.sysfs_path.display().to_string();
|
||||
let devices = vfio_device
|
||||
.device
|
||||
.iommu_group
|
||||
.as_ref()
|
||||
.map(|g| g.clone().devices)
|
||||
.unwrap_or_default();
|
||||
|
||||
(devices, vfio_device.config.bus_port_id.clone())
|
||||
(
|
||||
device_type,
|
||||
ap_sysfs_path,
|
||||
devices,
|
||||
vfio_device.config.bus_port_id.clone(),
|
||||
)
|
||||
};
|
||||
|
||||
// Cold plug devices
|
||||
for dev in devices.iter() {
|
||||
let host_bdf = dev.addr.to_string();
|
||||
if device_type == VfioDeviceType::MediatedAp {
|
||||
// s390x VFIO-AP: -device vfio-ap,sysfsdev=<path>
|
||||
// No PCIe root port, no guest_pci_path.
|
||||
cmdline.add_vfio_ap_device(&ap_sysfs_path)?;
|
||||
info!(sl!(), "Completed VFIOModern AP coldplug for sysfsdev: {}", ap_sysfs_path);
|
||||
} else {
|
||||
// PCI cold plug devices
|
||||
for dev in devices.iter() {
|
||||
let host_bdf = dev.addr.to_string();
|
||||
|
||||
let vfio_cfg = VfioDeviceConfig::new(
|
||||
host_bdf,
|
||||
bus_port_id.1 as u16,
|
||||
bus_port_id.1 + 1,
|
||||
)
|
||||
.with_vfio_bus(bus_port_id.0.clone());
|
||||
let vfio_cfg = VfioDeviceConfig::new(
|
||||
host_bdf,
|
||||
bus_port_id.1 as u16,
|
||||
bus_port_id.1 + 1,
|
||||
)
|
||||
.with_vfio_bus(bus_port_id.0.clone());
|
||||
|
||||
cmdline.add_pcie_vfio_device(vfio_cfg)?;
|
||||
cmdline.add_pcie_vfio_device(vfio_cfg)?;
|
||||
}
|
||||
|
||||
// Write back guest PCI path
|
||||
let pci_path =
|
||||
PciPath::try_from(format!("{:02x}/00", bus_port_id.1).as_str())?;
|
||||
{
|
||||
let mut vfio_device = vfio_dev.lock().await;
|
||||
vfio_device.config.guest_pci_path = Some(pci_path.clone());
|
||||
}
|
||||
info!(
|
||||
sl!(),
|
||||
"Completed VFIOModern coldplug with returned guest pci path: {:?}",
|
||||
pci_path
|
||||
);
|
||||
}
|
||||
|
||||
// Write back with lock
|
||||
let pci_path = PciPath::try_from(format!("{:02x}/00", bus_port_id.1).as_str())?;
|
||||
|
||||
{
|
||||
let mut vfio_device = vfio_dev.lock().await;
|
||||
// Update the guest PCI path for the VFIO device after coldplug,
|
||||
// which will be used for device mapping into from Guest to Container Environment.
|
||||
vfio_device.config.guest_pci_path = Some(pci_path.clone());
|
||||
}
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"Completed VFIOModern coldplug with returned guest pci path: {:?}",
|
||||
pci_path
|
||||
);
|
||||
}
|
||||
DeviceType::Vfio(vfio_dev) => {
|
||||
// Cold-plug physical-endpoint VFs (non-IOMMUFD VFIO) onto
|
||||
@@ -908,6 +917,7 @@ async fn log_qemu_stderr(stderr: ChildStderr, exit_notify: mpsc::Sender<()>) ->
|
||||
}
|
||||
|
||||
use crate::device::DeviceType;
|
||||
use crate::vfio_device::VfioDeviceType;
|
||||
|
||||
// device manager part of Hypervisor
|
||||
impl QemuInner {
|
||||
@@ -1141,38 +1151,41 @@ impl QemuInner {
|
||||
}
|
||||
DeviceType::VfioModern(ref vfiodev) => {
|
||||
// Snapshot VFIO parameters inside the lock.
|
||||
let (hostdev_id, sysfs_path, address, driver_type, bus) = {
|
||||
let (hostdev_id, device_type, sysfs_path, address, driver_type, bus) = {
|
||||
let vfio_device = vfiodev.lock().await;
|
||||
let hostdev_id = vfio_device.device_id.clone();
|
||||
let device = &vfio_device.device;
|
||||
let device_type = device.device_type.clone();
|
||||
|
||||
// FIXME: The first device in the group might not be the actual device intended for passthrough.
|
||||
// Multi-function support is tracked via issue #11292.
|
||||
let primary_device = device
|
||||
.clone()
|
||||
.iommu_group
|
||||
.ok_or_else(|| anyhow!("IOMMU group missing for VFIO device"))?
|
||||
.primary;
|
||||
let sysfs_path = device.primary.sysfs_path.display().to_string();
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"QMP hotplug VFIO primary_device address: {:?}", &primary_device.addr
|
||||
);
|
||||
// For AP devices there is no IOMMU group or BDF; use empty strings.
|
||||
let (address, driver_type, bus) = if device_type == VfioDeviceType::MediatedAp {
|
||||
(String::new(), "vfio-ap".to_string(), String::new())
|
||||
} else {
|
||||
// FIXME: The first device in the group might not be the actual device intended for passthrough.
|
||||
// Multi-function support is tracked via issue #11292.
|
||||
let primary_device = device
|
||||
.clone()
|
||||
.iommu_group
|
||||
.ok_or_else(|| anyhow!("IOMMU group missing for VFIO device"))?
|
||||
.primary;
|
||||
|
||||
let sysfs_path = primary_device.sysfs_path.display().to_string();
|
||||
let driver_type = primary_device
|
||||
.driver
|
||||
.clone()
|
||||
.ok_or_else(|| anyhow!("Driver type missing for primary device"))?;
|
||||
let address = format!("{}", primary_device.addr);
|
||||
info!(
|
||||
sl!(),
|
||||
"QMP hotplug VFIO primary_device address: {:?}", &primary_device.addr
|
||||
);
|
||||
|
||||
(
|
||||
hostdev_id,
|
||||
sysfs_path,
|
||||
address,
|
||||
driver_type,
|
||||
vfio_device.config.bus_port_id.0.clone(),
|
||||
)
|
||||
let driver_type = primary_device
|
||||
.driver
|
||||
.clone()
|
||||
.ok_or_else(|| anyhow!("Driver type missing for primary device"))?;
|
||||
let address = format!("{}", primary_device.addr);
|
||||
let bus = vfio_device.config.bus_port_id.0.clone();
|
||||
(address, driver_type, bus)
|
||||
};
|
||||
|
||||
(hostdev_id, device_type, sysfs_path, address, driver_type, bus)
|
||||
};
|
||||
|
||||
// Execute hotplug outside the lock.
|
||||
@@ -1184,12 +1197,13 @@ impl QemuInner {
|
||||
&bus,
|
||||
)?;
|
||||
|
||||
// Write the resulting Guest PCI Path back within the lock.
|
||||
// Write the resulting Guest PCI Path back within the lock (PCI only).
|
||||
{
|
||||
let mut vfio_device = vfiodev.lock().await;
|
||||
if let Some(p) = guest_pci_path {
|
||||
// Very important to write back the guest pci path for VFIO devices.
|
||||
vfio_device.config.guest_pci_path = Some(p);
|
||||
if device_type != VfioDeviceType::MediatedAp {
|
||||
if let Some(p) = guest_pci_path {
|
||||
vfio_device.config.guest_pci_path = Some(p);
|
||||
}
|
||||
}
|
||||
info!(
|
||||
sl!(),
|
||||
|
||||
@@ -11,23 +11,27 @@ use anyhow::{anyhow, Context, Result};
|
||||
use async_trait::async_trait;
|
||||
use hypervisor::{
|
||||
device::{
|
||||
device_manager::{do_handle_device, get_block_device_info, DeviceManager},
|
||||
device_manager::{
|
||||
do_handle_device, find_cold_plugged_vfio_ap, get_block_device_info, DeviceManager,
|
||||
},
|
||||
util::{get_host_path, DEVICE_TYPE_BLOCK, DEVICE_TYPE_CHAR},
|
||||
DeviceConfig, DeviceType,
|
||||
},
|
||||
utils::uses_native_ccw_bus,
|
||||
vfio_device::is_vfio_ap_device,
|
||||
BlockConfig, BlockDeviceAio, Hypervisor, VfioConfig,
|
||||
};
|
||||
use kata_types::mount::{kata_guest_sandbox_dir, Mount, KATA_EPHEMERAL_VOLUME_TYPE, SHM_DIR};
|
||||
use kata_types::{
|
||||
config::{hypervisor::TopologyConfigInfo, TomlConfig},
|
||||
device::DRIVER_VFIO_AP_COLD_TYPE,
|
||||
mount::{adjust_rootfs_mounts, KATA_IMAGE_FORCE_GUEST_PULL},
|
||||
};
|
||||
use libc::NUD_PERMANENT;
|
||||
use oci::{Linux, LinuxCpu, LinuxResources};
|
||||
use oci_spec::runtime::{self as oci, LinuxDeviceType};
|
||||
use persist::sandbox_persist::Persist;
|
||||
use std::path::PathBuf;
|
||||
use std::path::{Path, PathBuf};
|
||||
use tokio::{runtime, sync::RwLock};
|
||||
|
||||
use crate::{
|
||||
@@ -661,6 +665,43 @@ impl ResourceManagerInner {
|
||||
continue;
|
||||
}
|
||||
|
||||
// VFIO-AP devices have no PCIe BDF, so cold_plug_bdfs above
|
||||
// cannot catch them. If this device was registered in the
|
||||
// device manager during cold-plug (prepare_coldplug_raw_vfio_devices
|
||||
// or CDI), retrieve its APQN list and build the agent device
|
||||
// directly — calling do_handle_device on an already-present
|
||||
// device would attempt a QMP device_add and fail.
|
||||
if is_vfio_ap_device(Path::new(&host_path)) {
|
||||
if let Some(ap_devs) =
|
||||
find_cold_plugged_vfio_ap(&self.device_manager, &host_path).await
|
||||
{
|
||||
let container_path = d.path().display().to_string();
|
||||
let group_num = d
|
||||
.path()
|
||||
.file_name()
|
||||
.and_then(|n| n.to_str())
|
||||
.unwrap_or_default()
|
||||
.to_string();
|
||||
let agent_device = Device {
|
||||
id: group_num,
|
||||
container_path,
|
||||
field_type: DRIVER_VFIO_AP_COLD_TYPE.to_string(),
|
||||
options: ap_devs,
|
||||
..Default::default()
|
||||
};
|
||||
info!(
|
||||
sl!(),
|
||||
"vfio-ap cold-plugged agent device: {:?}", agent_device
|
||||
);
|
||||
devices.push(ContainerDevice {
|
||||
device_info: None,
|
||||
device: agent_device,
|
||||
});
|
||||
continue;
|
||||
}
|
||||
// Not registered as cold-plugged — fall through to do_handle_device.
|
||||
}
|
||||
|
||||
let bus_type = if uses_native_ccw_bus() {
|
||||
"ccw".to_string()
|
||||
} else {
|
||||
@@ -681,6 +722,14 @@ impl ResourceManagerInner {
|
||||
if let DeviceType::VfioModern(vfio_dev) = device_info.clone() {
|
||||
info!(sl!(), "device info: {:?}", vfio_dev.lock().await);
|
||||
let vfio_device = vfio_dev.lock().await;
|
||||
|
||||
let group_num = d
|
||||
.path()
|
||||
.file_name()
|
||||
.and_then(|n| n.to_str())
|
||||
.unwrap_or_default()
|
||||
.to_string();
|
||||
|
||||
let guest_pci_path = vfio_device
|
||||
.config
|
||||
.guest_pci_path
|
||||
@@ -697,15 +746,8 @@ impl ResourceManagerInner {
|
||||
// vfio mode: vfio-pci and vfio-pci-gk for x86_64
|
||||
// - vfio-pci, devices appear as VFIO character devices under /dev/vfio in container.
|
||||
// - vfio-pci-gk, devices are managed by whatever driver in Guest kernel.
|
||||
// - vfio-ap, devices appear as VFIO character devices under /dev/vfio in container for ccw devices.
|
||||
let vfio_mode = match self.toml_config.runtime.vfio_mode.as_str() {
|
||||
"vfio" => {
|
||||
if bus_type == "ccw" {
|
||||
"vfio-ap".to_string()
|
||||
} else {
|
||||
"vfio-pci".to_string()
|
||||
}
|
||||
}
|
||||
"vfio" => "vfio-pci".to_string(),
|
||||
_ => "vfio-pci-gk".to_string(),
|
||||
};
|
||||
let device_options = vec![format!("{}={}", host_bdf, guest_pci_path)];
|
||||
@@ -713,12 +755,6 @@ impl ResourceManagerInner {
|
||||
// filepath.Base(dev.ContainerPath), e.g. "vfio0".
|
||||
// The agent policy validates this with:
|
||||
// i_vfio_device.id == concat("", ["vfio", suffix])
|
||||
let group_num = d
|
||||
.path()
|
||||
.file_name()
|
||||
.and_then(|n| n.to_str())
|
||||
.unwrap_or_default()
|
||||
.to_string();
|
||||
let agent_device = Device {
|
||||
id: group_num,
|
||||
container_path: d.path().display().to_string().clone(),
|
||||
|
||||
@@ -47,6 +47,10 @@ fn kata_guest_root_shared_fs() -> String {
|
||||
/// If the container is a PodSandbox, it returns "pause".
|
||||
/// Otherwise, it attempts to find the image name using the appropriate Kubernetes
|
||||
/// annotation key.
|
||||
///
|
||||
/// If no container type annotation is found (SingleContainer case), it falls back to
|
||||
/// checking for image name annotations directly. This supports standalone container
|
||||
/// runtimes like nerdctl that may only provide the image name annotation.
|
||||
pub fn get_image_reference(spec_annotations: &HashMap<String, String>) -> Result<&str> {
|
||||
info!(
|
||||
sl!(),
|
||||
@@ -74,6 +78,26 @@ pub fn get_image_reference(spec_annotations: &HashMap<String, String>) -> Result
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback for SingleContainer case: if no container type annotation is found,
|
||||
// try to get image name directly. This supports standalone container runtimes
|
||||
// (e.g., nerdctl) that may only provide the image name annotation without the
|
||||
// container type annotation.
|
||||
if let Some(image_name) = spec_annotations.get(KUBERNETES_CRI_IMAGE_NAME) {
|
||||
info!(
|
||||
sl!(),
|
||||
"Found image name without container type annotation (SingleContainer): {}", image_name
|
||||
);
|
||||
return Ok(image_name.as_str());
|
||||
}
|
||||
|
||||
if let Some(image_name) = spec_annotations.get(KUBERNETES_CRIO_IMAGE_NAME) {
|
||||
info!(
|
||||
sl!(),
|
||||
"Found CRI-O image name without container type annotation (SingleContainer): {}", image_name
|
||||
);
|
||||
return Ok(image_name.as_str());
|
||||
}
|
||||
|
||||
Err(anyhow!("no target image reference found"))
|
||||
}
|
||||
|
||||
@@ -256,6 +280,25 @@ mod tests {
|
||||
let image_ref_result_pod_sandbox = get_image_reference(&annotations_pod_sandbox);
|
||||
assert!(image_ref_result_pod_sandbox.is_ok());
|
||||
assert_eq!(image_ref_result_pod_sandbox.unwrap(), "pause");
|
||||
// Test SingleContainer fallback (no container type annotation)
|
||||
let mut annotations_single = HashMap::new();
|
||||
annotations_single.insert(
|
||||
"io.kubernetes.cri.image-name".to_string(),
|
||||
"example-image-single".to_string(),
|
||||
);
|
||||
let image_ref_result_single = get_image_reference(&annotations_single);
|
||||
assert!(image_ref_result_single.is_ok());
|
||||
assert_eq!(image_ref_result_single.unwrap(), "example-image-single");
|
||||
|
||||
// Test SingleContainer fallback with CRI-O annotation
|
||||
let mut annotations_single_crio = HashMap::new();
|
||||
annotations_single_crio.insert(
|
||||
"io.kubernetes.cri-o.ImageName".to_string(),
|
||||
"example-image-single-crio".to_string(),
|
||||
);
|
||||
let image_ref_result_single_crio = get_image_reference(&annotations_single_crio);
|
||||
assert!(image_ref_result_single_crio.is_ok());
|
||||
assert_eq!(image_ref_result_single_crio.unwrap(), "example-image-single-crio");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -31,8 +31,9 @@ use containerd_shim_protos::events::task::{TaskExit, TaskOOM};
|
||||
))]
|
||||
use hypervisor::ch::CloudHypervisor;
|
||||
use hypervisor::device::topology::PCIePort;
|
||||
use hypervisor::device::util::{get_host_path, DEVICE_TYPE_CHAR};
|
||||
use hypervisor::remote::Remote;
|
||||
use hypervisor::VfioDeviceBase;
|
||||
use hypervisor::{is_vfio_ap_device, VfioDeviceBase};
|
||||
use hypervisor::VsockConfig;
|
||||
use hypervisor::HYPERVISOR_REMOTE;
|
||||
#[cfg(all(
|
||||
@@ -262,7 +263,23 @@ impl VirtSandbox {
|
||||
None
|
||||
};
|
||||
|
||||
let vfio_devices = self.prepare_coldplug_cdi_devices(sandbox_config).await?;
|
||||
// Cold-plug VFIO devices using two mutually exclusive paths:
|
||||
// 1. CDI path: Query Kubernetes Pod Resources API for devices managed by device plugins
|
||||
// (typical in K8s environments with device plugins)
|
||||
// 2. Raw VFIO path: Parse OCI spec's linux.devices for directly specified VFIO devices
|
||||
// (typical in standalone containers like `ctr --device /dev/vfio/0`)
|
||||
//
|
||||
// These paths are mutually exclusive from a user perspective:
|
||||
// - In K8s, devices come through device plugins, not raw OCI device specs
|
||||
// - In standalone containers, there's no Pod Resources API available
|
||||
//
|
||||
// Therefore, we only attempt the raw VFIO path if CDI finds no devices,
|
||||
// avoiding unnecessary file I/O and OCI spec parsing in the common K8s case.
|
||||
let mut vfio_devices = self.prepare_coldplug_cdi_devices(sandbox_config).await?;
|
||||
if vfio_devices.is_empty() {
|
||||
let raw_vfio = self.prepare_coldplug_raw_vfio_devices(sandbox_config).await?;
|
||||
vfio_devices.extend(raw_vfio);
|
||||
}
|
||||
if !vfio_devices.is_empty() {
|
||||
info!(
|
||||
sl!(),
|
||||
@@ -387,6 +404,97 @@ impl VirtSandbox {
|
||||
.collect())
|
||||
}
|
||||
|
||||
// Fallback cold-plug path for standalone containers (e.g. `ctr --device /dev/vfio/0`).
|
||||
// Reads the OCI spec from the bundle and cold-plugs any VFIO char devices found in
|
||||
// linux.devices before VM boot, mirroring Go's coldOrHotPlugVFIO().
|
||||
// Returns empty when the pod resources API path already handles devices (K8s) or
|
||||
// when cold_plug_vfio is not configured.
|
||||
async fn prepare_coldplug_raw_vfio_devices(
|
||||
&self,
|
||||
sandbox_config: &SandboxConfig,
|
||||
) -> Result<Vec<ResourceConfig>> {
|
||||
let hypervisor_config = self.hypervisor.hypervisor_config().await;
|
||||
let cold_plug_vfio = &hypervisor_config.device_info.cold_plug_vfio;
|
||||
if cold_plug_vfio.is_empty() || cold_plug_vfio == "no-port" {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let port = match cold_plug_vfio.as_str() {
|
||||
"root-port" => PCIePort::RootPort,
|
||||
other => {
|
||||
return Err(anyhow!(
|
||||
"unsupported cold_plug_vfio value {:?}; only \"root-port\" is supported",
|
||||
other
|
||||
))
|
||||
}
|
||||
};
|
||||
|
||||
let bundle = &sandbox_config.state.bundle;
|
||||
if bundle.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let spec_path = format!("{}/{}", bundle, spec::OCI_SPEC_CONFIG_FILE_NAME);
|
||||
let oci_spec = match oci::Spec::load(&spec_path) {
|
||||
Ok(s) => s,
|
||||
Err(e) => {
|
||||
info!(
|
||||
sl!(),
|
||||
"no OCI spec at {:?}: {:?}, skipping raw VFIO cold-plug", spec_path, e
|
||||
);
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
};
|
||||
|
||||
let linux_devices = oci_spec
|
||||
.linux()
|
||||
.as_ref()
|
||||
.and_then(|l| l.devices().as_ref())
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
|
||||
let mut vfio_configs = Vec::new();
|
||||
for d in linux_devices.iter() {
|
||||
if d.typ() != oci::LinuxDeviceType::C {
|
||||
continue;
|
||||
}
|
||||
let host_path = match get_host_path(DEVICE_TYPE_CHAR, d.major(), d.minor()) {
|
||||
Ok(p) => p,
|
||||
Err(e) => {
|
||||
warn!(
|
||||
sl!(),
|
||||
"failed to resolve host path for {:?}: {:?}", d.path(), e
|
||||
);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
// Only process VFIO passthrough devices under /dev/vfio/*.
|
||||
// Skip non-VFIO devices and the legacy VFIO control node (/dev/vfio/vfio).
|
||||
if !host_path.starts_with("/dev/vfio/") || host_path == "/dev/vfio/vfio" {
|
||||
continue;
|
||||
}
|
||||
let device_port = if is_vfio_ap_device(Path::new(&host_path)) {
|
||||
PCIePort::NoPort
|
||||
} else {
|
||||
port
|
||||
};
|
||||
vfio_configs.push(VfioDeviceBase {
|
||||
host_path: host_path.clone(),
|
||||
iommu_group_devnode: PathBuf::from(&host_path),
|
||||
dev_type: "c".to_string(),
|
||||
port: device_port,
|
||||
hostdev_prefix: "vfio_device".to_owned(),
|
||||
..Default::default()
|
||||
});
|
||||
}
|
||||
info!(sl!(), "raw VFIO cold-plug candidates: {:?}", vfio_configs);
|
||||
|
||||
Ok(vfio_configs
|
||||
.into_iter()
|
||||
.map(ResourceConfig::VfioDeviceModern)
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn prepare_network_resource(
|
||||
&self,
|
||||
network_env: &SandboxNetworkEnv,
|
||||
|
||||
@@ -63,6 +63,7 @@ setup_hotplug() {
|
||||
show_config_file
|
||||
elif [[ "${runtime}" == "runtime-rs" ]]; then
|
||||
setup_config_file "vfio_mode" "replace" "vfio" "runtime-rs"
|
||||
setup_config_file "cold_plug_vfio" "replace" "no-port" "runtime-rs"
|
||||
show_config_file "runtime-rs"
|
||||
else
|
||||
echo "Invalid runtime: ${runtime}" >&2
|
||||
@@ -79,8 +80,9 @@ setup_coldplug() {
|
||||
setup_config_file "cold_plug_vfio" "replace" "bridge-port"
|
||||
show_config_file
|
||||
elif [[ "${runtime}" == "runtime-rs" ]]; then
|
||||
echo "Coldplug is not supported for runtime-rs" >&2
|
||||
exit 1
|
||||
setup_config_file "vfio_mode" "replace" "vfio" "runtime-rs"
|
||||
setup_config_file "cold_plug_vfio" "replace" "root-port" "runtime-rs"
|
||||
show_config_file "runtime-rs"
|
||||
else
|
||||
echo "Invalid runtime: ${runtime}" >&2
|
||||
exit 1
|
||||
@@ -296,6 +298,9 @@ run_tests() {
|
||||
|
||||
setup_hotplug "runtime-rs"
|
||||
run_test "3" "runtime-rs" "Test can assign a CEX device inside the guest via VFIO-AP Hotplug" "&& zcrypttest -a -v"
|
||||
|
||||
setup_coldplug "runtime-rs"
|
||||
run_test "4" "runtime-rs" "Test can assign a CEX device inside the guest via VFIO-AP Coldplug" "&& zcrypttest -a -v"
|
||||
}
|
||||
|
||||
main() {
|
||||
|
||||
Reference in New Issue
Block a user