diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs index ebafd353a5..27e706be9a 100644 --- a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs +++ b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -use crate::device::topology::{PCIePortBusPrefix, TopologyPortDevice, DEFAULT_PCIE_ROOT_BUS}; +use crate::device::topology::{TopologyPortDevice, DEFAULT_PCIE_ROOT_BUS}; use crate::qemu::qmp::get_qmp_socket_path; use crate::utils::{ chown_to_parent, clear_cloexec, create_vhost_net_fds, open_named_tuntap, uses_native_ccw_bus, @@ -11,6 +11,7 @@ use crate::utils::{ }; use crate::{kernel_param::KernelParams, Address, HypervisorConfig}; +use std::borrow::Cow; use anyhow::{anyhow, Context, Result}; use async_trait::async_trait; @@ -19,11 +20,12 @@ use kata_types::rootless::is_rootless; use serde::{Deserialize, Serialize}; use serde_json; use std::collections::HashMap; -use std::fmt::Display; +use std::fmt::{Display, Write}; use std::fs::{read_to_string, File}; use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd}; use std::os::unix::net::UnixListener; use std::path::PathBuf; +use std::str; use tokio; // These should have been called MiB and GiB for better readability but the @@ -510,7 +512,7 @@ impl Machine { accel: "kvm".to_owned(), options: config.machine_info.machine_accelerators.clone(), nvdimm: false, - kernel_irqchip: None, + kernel_irqchip: Some("on".to_owned()), // default to off, will be turned on if needed by VFIO devices confidential_guest_support: "".to_owned(), is_nvdimm_supported, memory_backend: None, @@ -2012,108 +2014,146 @@ impl ToQemuParams for ObjectTdxGuest { } } -/// PCIeRootPortDevice directly attached onto the root bus -/// -device pcie-root-port,id=rp0,bus=pcie.0,chassis=0,slot=0,multifunction=off,pref64-reserve=B,mem-reserve=B -#[derive(Debug, Default)] +const DEFAULT_START_ADDR: &str = "0x5"; +//const DEFAULT_ADDR: &str = "0x0"; + +/// Configuration for the IOMMUFD object backend. +#[derive(Debug, Clone)] +pub struct ObjectIommufd { + id: String, +} + +impl ObjectIommufd { + pub fn new(id: impl Into) -> Self { + Self { id: id.into() } + } +} + +#[async_trait] +impl ToQemuParams for ObjectIommufd { + async fn qemu_params(&self) -> Result> { + Ok(vec![ + "-object".to_string(), + format!("iommufd,id={}", self.id), + ]) + } +} + +/// Representation of a PCIe Root Port device in QEMU. +#[derive(Debug, Clone)] pub struct PCIeRootPortDevice { id: String, - bus: String, - chassis: String, - slot: String, - multifunction: String, + bus: Cow<'static, str>, + port: Option, + /// Numerical identifier for the chassis. + chassis: u32, + /// Optional slot identifier. + slot: Option, + /// Whether the device supports multiple functions. + multifunction: bool, + /// PCI address; supports simple ("0x5") and complex multifunction ("0x5.0x1") formats. addr: String, - - mem_reserve: String, - pref64_reserve: String, } impl PCIeRootPortDevice { - fn new(id: &str, bus: &str, chassis: &str, slot: &str, multifunc: bool, addr: &str) -> Self { - PCIeRootPortDevice { - id: id.to_string(), - bus: if bus.is_empty() { - DEFAULT_PCIE_ROOT_BUS.to_owned() - } else { - bus.to_string() + /// Creates a new PCIe Root Port device instance. + pub fn new(id: impl Into, bus: impl Into) -> Self { + Self { + id: id.into(), + bus: { + let bus_str = bus.into(); + if bus_str.is_empty() { + Cow::Borrowed(DEFAULT_PCIE_ROOT_BUS) + } else { + Cow::Owned(bus_str) + } }, - chassis: if chassis.is_empty() { - "0x00".to_owned() - } else { - chassis.to_owned() - }, - slot: if slot.is_empty() { - "0x00".to_owned() - } else { - slot.to_owned() - }, - multifunction: if multifunc { - "on".to_owned() - } else { - "off".to_owned() - }, - addr: if addr.is_empty() { - "0x00".to_owned() - } else { - addr.to_owned() - }, - ..Default::default() + port: None, + chassis: 1, + slot: None, + multifunction: false, + addr: DEFAULT_START_ADDR.to_string(), } } - fn set_mem_reserve(&mut self, mem_reserve: u64) -> &mut Self { - if mem_reserve > 0 { - self.mem_reserve = format!("{mem_reserve}B"); - } - + pub fn with_port(mut self, port: u16) -> Self { + self.port = Some(port); self } - fn set_pref64_reserve(&mut self, pref64_reserve: u64) -> &mut Self { - if pref64_reserve > 0 { - self.pref64_reserve = format!("{pref64_reserve}B"); - } - + pub fn with_chassis(mut self, chassis: u32) -> Self { + self.chassis = chassis; self } + + pub fn with_slot(mut self, slot: u32) -> Self { + self.slot = Some(slot); + self + } + + pub fn with_multifunction(mut self, multifunction: bool) -> Self { + self.multifunction = multifunction; + self + } + + /// Sets the PCI address. Supports standard ("0x5") and multifunction ("0x5.0x1") strings. + pub fn with_addr(mut self, addr: impl Into) -> Self { + self.addr = addr.into(); + self + } + } #[async_trait] impl ToQemuParams for PCIeRootPortDevice { async fn qemu_params(&self) -> Result> { - let mut device_params = Vec::new(); + let mut params = String::with_capacity(256); - // -device pcie-root-port,id=rp0 - device_params.push(format!("{},id={}", "pcie-root-port", self.id)); - device_params.push(format!("bus={}", self.bus)); - device_params.push(format!("chassis={}", self.chassis)); - device_params.push(format!("slot={}", self.slot)); - device_params.push(format!("multifunction={}", self.multifunction)); - if self.multifunction.as_str() == "on" { - device_params.push(format!("addr={}", self.addr)); - } - if !self.mem_reserve.is_empty() { - device_params.push(format!("mem-reserve={}", self.mem_reserve)); - } - if !self.pref64_reserve.is_empty() { - device_params.push(format!("pref64-reserve={}", self.pref64_reserve)); + // Example: -device pcie-root-port,id=rp0 + write!(params, "pcie-root-port,id={}", self.id).unwrap(); + + if let Some(port) = self.port { + write!(params, ",port={}", port).unwrap(); } - Ok(vec!["-device".to_string(), device_params.join(",")]) + // Match govmm: only pass `addr=` for multifunction ports, or when a concrete address + // is required (VFIO cold-plug uses e.g. 0x09). Placeholder pool ports use addr "0" from + // `add_pcie_root_ports`; emitting `addr=0` for every `pcie-root-port` collides on + // `pcie.0` ("slot 0 ... in use by mch"). + if self.multifunction || self.addr != "0" { + write!(params, ",addr={}", self.addr).unwrap(); + } + write!(params, ",chassis={}", self.chassis).unwrap(); + + if let Some(slot) = self.slot { + write!(params, ",slot={}", slot).unwrap(); + } + + write!(params, ",bus={}", self.bus).unwrap(); + write!( + params, + ",multifunction={}", + if self.multifunction { "on" } else { "off" } + ) + .unwrap(); + + Ok(vec!["-device".to_string(), params]) } } -/// PCIeSwitchUpstreamPortDevice is the port attached to the root port. -#[derive(Debug, Default)] +/// PCIe Switch Upstream Port device, which must be connected to a PCIe Root Port, +/// and can have PCIe devices or downstream ports connected to it. +#[derive(Debug, Clone)] pub struct PCIeSwitchUpstreamPortDevice { id: String, bus: String, } impl PCIeSwitchUpstreamPortDevice { - fn new(id: &str, bus: &str) -> Self { - PCIeSwitchUpstreamPortDevice { - id: id.to_string(), - bus: bus.to_string(), + pub fn new(id: impl Into, bus: impl Into) -> Self { + Self { + id: id.into(), + bus: bus.into(), } } } @@ -2121,17 +2161,16 @@ impl PCIeSwitchUpstreamPortDevice { #[async_trait] impl ToQemuParams for PCIeSwitchUpstreamPortDevice { async fn qemu_params(&self) -> Result> { - let mut device_params = Vec::new(); - - device_params.push(format!("{},id={}", "x3130-upstream", self.id)); - device_params.push(format!("bus={}", self.bus)); - - Ok(vec!["-device".to_string(), device_params.join(",")]) + Ok(vec![ + "-device".to_string(), + format!("x3130-upstream,id={},bus={}", self.id, self.bus), + ]) } } -/// PCIeSwitchDownstreamPortDevice is the port attached to the root port. -#[derive(Debug, Default)] +/// PCIe Switch Downstream Port device, which must be connected to a PCIe Root Port or another downstream port, +/// and can have PCIe devices or another switch's downstream ports connected to it. +#[derive(Debug, Clone)] pub struct PCIeSwitchDownstreamPortDevice { // format: sup{n}, n>=0 pub id: String, @@ -2140,36 +2179,312 @@ pub struct PCIeSwitchDownstreamPortDevice { pub bus: String, // (slot, chassis) pair is mandatory and must be unique for each downstream port, >=0, default is 0x00 - pub chassis: String, + pub chassis: u32, // >=0, default is 0x00 - pub slot: String, + pub slot: u32, } impl PCIeSwitchDownstreamPortDevice { - fn new(bus: &str, chassis: u32, index: u32) -> Self { - PCIeSwitchDownstreamPortDevice { - // "swdp{i}" - id: format!("{}{}", PCIePortBusPrefix::SwitchDownstreamPort, index), - // "swup0" - bus: bus.to_string(), - chassis: chassis.to_string(), - slot: index.to_string(), + pub fn new(bus: impl Into, chassis: u32, slot: u32) -> Self { + Self { + id: format!("swdp{}", slot), + bus: bus.into(), + chassis, + slot, } } + + #[allow(dead_code)] + pub fn with_id(mut self, id: impl Into) -> Self { + self.id = id.into(); + self + } } #[async_trait] impl ToQemuParams for PCIeSwitchDownstreamPortDevice { async fn qemu_params(&self) -> Result> { - let mut device_params = Vec::new(); + Ok(vec![ + "-device".to_string(), + format!( + "xio3130-downstream,id={},bus={},chassis={},slot={}", + self.id, self.bus, self.chassis, self.slot + ), + ]) + } +} - device_params.push(format!("{},id={}", "xio3130-downstream", self.id)); - device_params.push(format!("bus={}", self.bus)); - device_params.push(format!("chassis={}", self.chassis)); - device_params.push(format!("slot={}", self.slot)); +/// VFIO PCI device +#[derive(Debug, Clone)] +pub struct PCIeVfioDevice { + host_bdf: String, + bus: String, + addr: String, + iommufd: Option, + x_pci_vendor_id: Option, + x_pci_device_id: Option, +} - Ok(vec!["-device".to_string(), device_params.join(",")]) +impl PCIeVfioDevice { + pub fn new( + host_bdf: impl Into, + bus: impl Into, + iommufd: impl Into, + ) -> Self { + Self { + host_bdf: host_bdf.into(), + bus: bus.into(), + addr: "0x0".to_string(), + iommufd: Some(iommufd.into()), + x_pci_vendor_id: None, + x_pci_device_id: None, + } + } + + #[allow(dead_code)] + pub fn with_addr(mut self, addr: impl Into) -> Self { + self.addr = addr.into(); + self + } + + pub fn with_vendor_id(mut self, vendor_id: impl Into) -> Self { + self.x_pci_vendor_id = Some(vendor_id.into()); + self + } + + pub fn with_device_id(mut self, device_id: impl Into) -> Self { + self.x_pci_device_id = Some(device_id.into()); + self + } +} + +#[async_trait] +impl ToQemuParams for PCIeVfioDevice { + async fn qemu_params(&self) -> Result> { + let mut params = String::with_capacity(256); + + write!(params, "vfio-pci,host={}", self.host_bdf).unwrap(); + write!(params, ",bus={}", self.bus).unwrap(); + write!(params, ",addr={}", self.addr).unwrap(); + + if let Some(iommufd) = &self.iommufd { + write!(params, ",iommufd={}", iommufd).unwrap(); + } + + if let Some(vendor) = &self.x_pci_vendor_id { + write!(params, ",x-pci-vendor-id={}", vendor).unwrap(); + } + + if let Some(device) = &self.x_pci_device_id { + write!(params, ",x-pci-device-id={}", device).unwrap(); + } + + Ok(vec!["-device".to_string(), params]) + } +} + +#[allow(dead_code)] +pub struct VfioDeviceBase { + /// Host BDF address (e.g., "0000:21:00.0" or the short form "21:00.0"). + pub host_bdf: String, + + /// The bus to which the device is attached (e.g., "pci.1"). + pub bus: String, + + /// IOMMU file descriptor ID (e.g., "iommufd0"). + pub iommufd: Option, +} + +/// Comprehensive configuration for a VFIO device. +#[derive(Debug, Clone, Default)] +#[allow(dead_code)] +pub struct VfioDeviceConfig { + /// Host BDF address (e.g., "0000:21:00.0" or "21:00.0"). + pub host_bdf: String, + + /// The bus to which the device is attached (e.g., "pci.1"). + pub bus: String, + + /// Port number of the associated PCIe Root Port. + pub port: u16, + + /// Chassis number of the associated PCIe Root Port. + pub chassis: u32, + + /// Whether to enable multifunction support on the PCIe Root Port. + pub multifunction: bool, + + /// Indicates if this is the primary device (function 0) in a multifunction group. + /// If true, the Root Port will be configured with `multifunction=on`. + pub is_multifunction_primary: bool, + + /// Address of the PCIe Root Port on the system bus (e.g., "0x5" or "0x5.0x1"). + pub root_port_addr: String, + + /// Device address for the VFIO device itself (typically "0x0"). + #[allow(dead_code)] + pub vfio_addr: String, + + /// Optional PCI Vendor ID override. + pub x_pci_vendor_id: Option, + + /// Optional PCI Device ID override. + pub x_pci_device_id: Option, +} + +impl VfioDeviceConfig { + /// Creates a new VFIO device configuration. + pub fn new(host_bdf: impl Into, port: u16, chassis: u32) -> Self { + let chassis_val = chassis; + Self { + host_bdf: host_bdf.into(), + bus: format!("pci.{}", chassis_val), + port, + chassis: chassis_val, + multifunction: true, + is_multifunction_primary: true, + // Defaults to 0x5 based on port offset; subsequent devices increment from here. + root_port_addr: format!("0x{}", port), + vfio_addr: format!("0x{}", port), + x_pci_vendor_id: None, + x_pci_device_id: None, + } + } + + pub fn with_multifunction(mut self, multifunction: bool) -> Self { + self.multifunction = multifunction; + self + } + + pub fn with_vfio_bus(mut self, bus: impl Into) -> Self { + self.bus = bus.into(); + self + } + + /// Sets a specific root port address (used for non-multifunction modes). + pub fn with_root_port_addr(mut self, addr: impl Into) -> Self { + self.root_port_addr = addr.into(); + self.is_multifunction_primary = false; + self + } + + /// Configures the device as the primary device (function 0) in a multifunction group. + #[allow(dead_code)] + pub fn as_multifunction_primary(base_addr: impl Into) -> Self { + Self { + is_multifunction_primary: true, + root_port_addr: base_addr.into(), + ..Default::default() + } + } + + /// Configures the device as a secondary device (functions 1-7) in a multifunction group. + #[allow(dead_code)] + pub fn as_multifunction_secondary(base_addr: impl Into, function: u8) -> Self { + if function == 0 || function > 7 { + panic!("Function number must be between 1 and 7 for secondary devices"); + } + Self { + is_multifunction_primary: false, + root_port_addr: format!("{}.0x{:x}", base_addr.into(), function), + ..Default::default() + } + } + #[allow(dead_code)] + pub fn with_vfio_addr(mut self, addr: impl Into) -> Self { + self.vfio_addr = addr.into(); + self + } + #[allow(dead_code)] + pub fn with_vendor_id(mut self, vendor_id: impl Into) -> Self { + self.x_pci_vendor_id = Some(vendor_id.into()); + self + } + #[allow(dead_code)] + pub fn with_device_id(mut self, device_id: impl Into) -> Self { + self.x_pci_device_id = Some(device_id.into()); + self + } + +} + +/// Configuration for a group of VFIO devices, typically used to manage multiple +/// devices sharing the same PCI slot via multifunction support. +#[derive(Debug, Clone)] +pub struct VfioDeviceGroup { + /// Base PCI slot address (e.g., "0x5"). + pub base_addr: String, + + /// Identifier for the IOMMU file descriptor (IOMMUFD) backend. + #[allow(dead_code)] + pub iommufd: String, + + /// Starting port number for the assigned PCIe root ports. + pub start_port: u16, + + /// Starting chassis number for the assigned PCIe root ports. + pub start_chassis: u32, + + /// List of host BDF (Bus-Device-Function) addresses. + pub devices: Vec, + + /// Indicates whether to enable PCI multifunction support for this group. + pub multifunction: bool, +} + +impl VfioDeviceGroup { + pub fn new( + base_addr: impl Into, + iommufd: impl Into, + start_port: u16, + start_chassis: u32, + ) -> Self { + Self { + base_addr: base_addr.into(), + iommufd: iommufd.into(), + start_port, + start_chassis, + devices: Vec::new(), + multifunction: false, + } + } + + pub fn with_devices(mut self, devices: Vec) -> Self { + self.devices = devices; + self + } + + pub fn with_multifunction(mut self, multifunction: bool) -> Self { + self.multifunction = multifunction; + self + } + + /// Generates a list of configuration objects for all devices in the group. + pub fn generate_configs(&self) -> Vec { + self.devices + .iter() + .enumerate() + .map(|(idx, bdf)| { + let port = self.start_port + idx as u16; + let chassis = self.start_chassis + idx as u32; + + let addr = if idx == 0 && self.multifunction { + // Use the base address for the primary device (function 0) + self.base_addr.clone() + } else if self.multifunction && idx > 0 { + // Map subsequent devices to specific PCI functions (e.g., 0x5.0x1) + format!("{}.0x{:x}", self.base_addr, idx) + } else { + // In non-multifunction mode, use the base address independently + self.base_addr.clone() + }; + + VfioDeviceConfig::new(bdf, port, chassis) + .with_multifunction(idx == 0 && self.multifunction) + .with_root_port_addr(addr) + }) + .collect() } } @@ -2615,6 +2930,165 @@ impl<'a> QemuCmdLine<'a> { .set_nvdimm(false); } + pub fn add_iommufd(&mut self, id: impl Into) -> Result<()> { + let id_str = id.into(); + if !id_str.is_empty() { + let iommufd = ObjectIommufd::new(id_str); + self.devices.push(Box::new(iommufd)); + } + + Ok(()) + } + + /// add_vfio_device + /// "-object", "iommufd,id=iommufd0", + /// + /// -device pcie-root-port,port=24,chassis=9,id=pci.9,bus=pcie.0,multifunction=on,addr=0x4 + /// -device vfio-pci,host=0000:21:00.0,x-pci-vendor-id=0x10de,x-pci-device-id=0x2321,bus=pci.1,addr=0x0,iommufd=iommufd0 + pub fn add_vfio_device(&mut self, config: VfioDeviceConfig) -> Result<()> { + self.add_iommufd("iommufd0")?; + + let root_port_id = format!("pci.{}", config.chassis); + let root_port = PCIeRootPortDevice::new(&root_port_id, DEFAULT_PCIE_ROOT_BUS) + .with_port(config.port) + .with_chassis(config.chassis) + .with_multifunction(config.multifunction) + .with_addr(&config.root_port_addr); + + let mut vfio_device = PCIeVfioDevice::new(&config.host_bdf, root_port_id, "iommufd0"); + + if let Some(vendor_id) = &config.x_pci_vendor_id { + vfio_device = vfio_device.with_vendor_id(vendor_id); + } + + if let Some(device_id) = &config.x_pci_device_id { + vfio_device = vfio_device.with_device_id(device_id); + } + + self.devices.reserve(2); + self.devices.push(Box::new(root_port)); + self.devices.push(Box::new(vfio_device)); + + Ok(()) + } + + /// Configures PCIe VFIO devices using multifunction Root Ports for optimized address space + /// -device pcie-root-port,id=root_port1,multifunction=on,chassis=x,addr=z.0 \ + /// -device pcie-root-port,id=root_port2,chassis=x1,addr=z.1 \ + pub fn add_pcie_vfio_device(&mut self, config: VfioDeviceConfig) -> Result<()> { + let machine_type = &self.config.machine_info.machine_type; + let (_start_addr, multi_function) = match machine_type.as_str() { + "q35" | "virt" => (DEFAULT_START_ADDR, false), + _ => { + info!( + sl!(), + "PCIe root ports not supported for machine type: {}", machine_type + ); + return Ok(()); + } + }; + + let iommufd_name = format!("iommufd{}", config.bus); + self.add_iommufd(&iommufd_name)?; + + let root_port_id = config.bus.clone(); + let root_port = PCIeRootPortDevice::new(&root_port_id, DEFAULT_PCIE_ROOT_BUS) + .with_chassis(config.chassis) + .with_slot(config.port as u32) + .with_multifunction(multi_function) + .with_addr(format!("0x{:02x}", config.port)); + info!(sl!(), "PCIe Root Port: {:?}", root_port.clone()); + + let mut vfio_device = PCIeVfioDevice::new(&config.host_bdf, root_port_id, &iommufd_name); + + if let Some(vendor_id) = &config.x_pci_vendor_id { + vfio_device = vfio_device.with_vendor_id(vendor_id); + } + + if let Some(device_id) = &config.x_pci_device_id { + vfio_device = vfio_device.with_device_id(device_id); + } + + self.devices.reserve(2); + self.devices.push(Box::new(root_port)); + self.devices.push(Box::new(vfio_device)); + + Ok(()) + } + + /// Batch adds multiple VFIO devices to the QEMU command line. + pub fn add_vfio_devices( + &mut self, + configs: Vec, + ) -> Result<()> { + if configs.is_empty() { + return Ok(()); + } + + self.devices.reserve(configs.len() * 2); + + for config in configs { + self.add_vfio_device(config)?; + } + + Ok(()) + } + + /// Adds a group of VFIO devices that share the same PCI slot (Multifunction configuration). + pub fn add_vfio_device_group(&mut self, group: VfioDeviceGroup) -> Result<()> { + let configs = group.generate_configs(); + self.add_vfio_devices(configs) + } + + /// Convenience method to configure a standard high-performance GPU and NVSwitch topology. + #[allow(dead_code)] + pub fn add_gpu_nvswitch_setup( + &mut self, + gpus: Vec<&str>, + nvswitches: Vec<&str>, + iommufd: &str, + ) -> Result<()> { + self.add_iommufd(iommufd)?; + + if !gpus.is_empty() { + let gpu_group = VfioDeviceGroup::new("0x5", iommufd, 16, 1) + .with_devices(gpus.iter().map(|s| s.to_string()).collect()) + .with_multifunction(true); + + self.add_vfio_device_group(gpu_group)?; + } + + if !nvswitches.is_empty() { + let nvswitch_configs: Vec = nvswitches + .iter() + .enumerate() + .map(|(idx, bdf)| { + let port = 24 + idx as u16; + let chassis = 9 + idx as u32; + let addr = if idx == 0 { + "0x4".to_string() + } else { + format!("0x4.0x{:x}", idx) + }; + + let full_bdf = if bdf.starts_with("0000:") { + bdf.to_string() + } else { + format!("0000:{}", bdf) + }; + + VfioDeviceConfig::new(full_bdf, port, chassis) + .with_multifunction(idx == 0) + .with_root_port_addr(addr) + }) + .collect(); + + self.add_vfio_devices(nvswitch_configs)?; + } + + Ok(()) + } + /// Note: add_pcie_root_port and add_pcie_switch_port follow kata-runtime's related implementations of vfio devices. /// The design origins from https://github.com/qemu/qemu/blob/master/docs/pcie.txt /// @@ -2645,15 +3119,13 @@ impl<'a> QemuCmdLine<'a> { pub fn add_pcie_root_ports( &mut self, root_ports: HashMap, - mem_reserve: u64, - pref64_reserve: u64, ) -> Result<()> { if root_ports.is_empty() { return Ok(()); } - let machine_type: &str = &self.config.machine_info.machine_type; - let (addr, multi_function) = match machine_type { + let machine_type = &self.config.machine_info.machine_type; + let (addr, multi_function) = match machine_type.as_str() { "q35" | "virt" => ("0", false), _ => { info!( @@ -2664,20 +3136,27 @@ impl<'a> QemuCmdLine<'a> { } }; - // -device pcie-root-port,id=root_port1,multifunction=on,chassis=x,addr=z.0[,slot=y][,bus=pcie.0] - for (index, rp) in root_ports.iter() { - let (chassis, slot) = (format!("{}", index + 1), format!("{index}")); - let mut root_port_dev = PCIeRootPortDevice::new( - &rp.port_id(), // rpX - &rp.bus, // pcie.0 - &chassis, - &slot, - multi_function, - addr, - ); - root_port_dev - .set_mem_reserve(mem_reserve) - .set_pref64_reserve(pref64_reserve); + self.devices.reserve(root_ports.len()); + + for (index, rp) in root_ports { + // VFIO cold-plug (see `add_pcie_vfio_device`) runs before this when resource order + // is CDI VFIO then port pool; it already emits `pcie-root-port,id=rpN` for reserved + // slots (`TopologyPortDevice::allocated`). Skip placeholders for those IDs or QEMU + // errors with duplicate device id (e.g. two `id=rp0`). + if rp.allocated { + debug!( + sl!(), + "skip add_pcie_root_ports for {} (already allocated / emitted)", + rp.port_id() + ); + continue; + } + + let root_port_dev = PCIeRootPortDevice::new(rp.port_id(), &rp.bus) + .with_chassis(index + 1) + .with_slot(index) + .with_multifunction(multi_function) + .with_addr(addr); self.devices.push(Box::new(root_port_dev)); } @@ -2698,15 +3177,13 @@ impl<'a> QemuCmdLine<'a> { pub fn add_pcie_switch_ports( &mut self, switch_ports: HashMap, - mem_reserve: u64, - pref64_reserve: u64, ) -> Result<()> { if switch_ports.is_empty() { return Ok(()); } let machine_type = &self.config.machine_info.machine_type; - if machine_type != "q35" && machine_type != "virt" { + if !matches!(machine_type.as_str(), "q35" | "virt") { info!( sl!(), "PCIe switch ports not supported for machine type: {}", machine_type @@ -2714,43 +3191,41 @@ impl<'a> QemuCmdLine<'a> { return Ok(()); } - for (index, rp) in switch_ports.iter() { - // 1. Create Root Port - // -device pcie-root-port,id=root_port1,chassis=x,slot=y[,bus=pcie.0][,addr=z] + let estimated_devices: usize = switch_ports + .values() + .map(|rp| { + 2 + rp + .connected_switch + .as_ref() + .map_or(0, |s| s.switch_ports.len()) + }) + .sum(); + self.devices.reserve(estimated_devices); + + for (index, rp) in switch_ports { + let chassis = index + 1; + + // Root Port + let pcie_root_port = PCIeRootPortDevice::new(rp.port_id(), &rp.bus) + .with_chassis(chassis) + .with_slot(index) + .with_multifunction(false) + .with_addr("0"); - // (slot, chassis) pair is mandatory and must be unique for each PCI Express Root Port - let (slot, chassis) = (format!("{index}"), format!("{}", index + 1)); - let mut pcie_root_port = PCIeRootPortDevice::new( - &rp.port_id(), - &rp.bus, // pcie.0 - &chassis, - &slot, - false, - "0", - ); - pcie_root_port - .set_mem_reserve(mem_reserve) - .set_pref64_reserve(pref64_reserve); self.devices.push(Box::new(pcie_root_port)); if let Some(switch) = &rp.connected_switch { - // 2. Create Upstream Port - // -device x3130-upstream,id=upstream_port1,bus=root_port1[,addr=x] - let upstream_port_id = switch.port_id(); - let pcie_switch_upstream_port = - PCIeSwitchUpstreamPortDevice::new(&upstream_port_id, &switch.bus); - self.devices.push(Box::new(pcie_switch_upstream_port)); + // Upstream Port + let upstream_port = + PCIeSwitchUpstreamPortDevice::new(switch.port_id(), &switch.bus); + self.devices.push(Box::new(upstream_port)); - // 3. Create Downstream Ports - // -device xio3130-downstream,id=downstream_port1,bus=upstream_port1,chassis=x1,slot=y1[,addr=z1]] - let next_chassis = chassis.parse::()? + 1; - for (index, swdp) in switch.switch_ports.iter() { - let pcie_switch_downstream_port = PCIeSwitchDownstreamPortDevice::new( - &swdp.bus, - next_chassis + index, - *index, - ); - self.devices.push(Box::new(pcie_switch_downstream_port)); + // Downstream Ports + let next_chassis = chassis + 1; + for (idx, swdp) in &switch.switch_ports { + let downstream_port = + PCIeSwitchDownstreamPortDevice::new(&swdp.bus, next_chassis + idx, *idx); + self.devices.push(Box::new(downstream_port)); } } }