diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs
index 7fa49b96c0..b8d0ea6e84 100644
--- a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs
+++ b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs
@@ -49,7 +49,7 @@ trait ToQemuParams: Send + Sync {
     async fn qemu_params(&self) -> Result<Vec<String>>;
 }
 
-#[derive(Debug, PartialEq)]
+#[derive(Debug, PartialEq, Clone, Copy)]
 enum VirtioBusType {
     Pci,
     Ccw,
@@ -70,6 +70,14 @@ impl Display for VirtioBusType {
     }
 }
 
+fn bus_type(config: &HypervisorConfig) -> VirtioBusType {
+    if config.machine_info.machine_type.contains("-ccw-") {
+        VirtioBusType::Ccw
+    } else {
+        VirtioBusType::Pci
+    }
+}
+
 // Conventions used in qemu command line generation
 // ================================================
 //
@@ -975,7 +983,7 @@ fn format_fds(files: &[File]) -> String {
 }
 
 #[derive(Debug)]
-struct Netdev {
+pub struct Netdev {
     id: String,
 
     // File descriptors for vhost multi-queue support.
@@ -1013,6 +1021,18 @@ impl Netdev {
         self.disable_vhost_net = disable_vhost_net;
         self
     }
+
+    pub fn get_id(&self) -> &String {
+        &self.id
+    }
+
+    pub fn get_fds(&self) -> &Vec<File> {
+        &self.fds["fds"]
+    }
+
+    pub fn get_vhostfds(&self) -> &Vec<File> {
+        &self.fds["vhostfds"]
+    }
 }
 
 #[async_trait]
@@ -1081,6 +1101,26 @@ impl DeviceVirtioNet {
         self.iommu_platform = iommu_platform;
         self
     }
+
+    pub fn get_netdev_id(&self) -> &String {
+        &self.netdev_id
+    }
+
+    pub fn get_device_driver(&self) -> &String {
+        &self.device_driver
+    }
+
+    pub fn get_mac_addr(&self) -> String {
+        format!("{:?}", self.mac_address)
+    }
+
+    pub fn get_num_queues(&self) -> u32 {
+        self.num_queues
+    }
+
+    pub fn get_disable_modern(&self) -> bool {
+        self.disable_modern
+    }
 }
 
 #[async_trait]
@@ -1290,6 +1330,74 @@ impl ToQemuParams for DeviceIntelIommu {
     }
 }
 
+#[derive(Debug)]
+struct DevicePciBridge {
+    driver: String,
+    bus: String,
+    id: String,
+    chassis_nr: u32,
+    shpc: bool,
+    addr: u32,
+    io_reserve: String,
+    mem_reserve: String,
+    pref64_reserve: String,
+}
+
+impl DevicePciBridge {
+    fn new(config: &HypervisorConfig, bridge_idx: u32) -> DevicePciBridge {
+        DevicePciBridge {
+            // The go runtime doesn't support bridges other than PCI although
+            // PCIe should also be available. Stick with the legacy behaviour
+            // of ignoring PCIe since it's not clear to me how to decide
+            // between the two.
+            driver: "pci-bridge".to_owned(),
+            bus: match config.machine_info.machine_type.as_str() {
+                "q35" | "virt" => "pcie.0",
+                _ => "pci.0",
+            }
+            .to_owned(),
+            id: format!("pci-bridge-{}", bridge_idx),
+            // Each bridge is required to be assigned a unique chassis id > 0.
+            chassis_nr: bridge_idx + 1,
+            shpc: false,
+            // 2 is documented by the go runtime as the first slot available
+            // for a bridge (on x86_64)
+            // (https://github.com/kata-containers/kata-containers/blob/99730256a2899c82d111400024621519d17ea15d/src/runtime/virtcontainers/qemu_arch_base.go#L212)
+            addr: 2 + bridge_idx,
+            // Values taken from the go runtime implementation which comments
+            // the choices as follows:
+            // Certain guest BIOS versions think !SHPC means no hotplug, and
+            // won't reserve the IO and memory windows that will be needed for
+            // devices added underneath this bridge. This will only break for
+            // certain combinations of exact qemu, BIOS and guest kernel
+            // versions, but for consistency, just hint the usual default
+            // windows for a bridge (as the BIOS would use with SHPC) so that
+            // we can do ACPI hotplug.
+            // (https://github.com/kata-containers/kata-containers/blob/99730256a2899c82d111400024621519d17ea15d/src/runtime/virtcontainers/qemu.go#L2474)
+            io_reserve: "4k".to_owned(),
+            mem_reserve: "1m".to_owned(),
+            pref64_reserve: "1m".to_owned(),
+        }
+    }
+}
+
+#[async_trait]
+impl ToQemuParams for DevicePciBridge {
+    async fn qemu_params(&self) -> Result<Vec<String>> {
+        let mut params = Vec::new();
+        params.push(self.driver.clone());
+        params.push(format!("bus={}", self.bus));
+        params.push(format!("id={}", self.id));
+        params.push(format!("chassis_nr={}", self.chassis_nr));
+        params.push(format!("shpc={}", if self.shpc { "on" } else { "off" }));
+        params.push(format!("addr={}", self.addr));
+        params.push(format!("io-reserve={}", self.io_reserve));
+        params.push(format!("mem-reserve={}", self.mem_reserve));
+        params.push(format!("pref64-reserve={}", self.pref64_reserve));
+        Ok(vec!["-device".to_owned(), params.join(",")])
+    }
+}
+
 // Qemu provides methods and types for managing QEMU instances.
 // To manage a qemu instance after it has been launched you need
 // to pass the -qmp option during launch requesting the qemu instance
@@ -1480,10 +1588,14 @@ impl<'a> QemuCmdLine<'a> {
 
         qemu_cmd_line.add_rtc();
 
-        if qemu_cmd_line.bus_type() != VirtioBusType::Ccw {
+        if bus_type(config) != VirtioBusType::Ccw {
             qemu_cmd_line.add_rng();
         }
 
+        if bus_type(config) != VirtioBusType::Ccw && config.device_info.default_bridges > 0 {
+            qemu_cmd_line.add_bridges(config.device_info.default_bridges);
+        }
+
         Ok(qemu_cmd_line)
     }
 
@@ -1507,14 +1619,6 @@ impl<'a> QemuCmdLine<'a> {
         self.devices.push(Box::new(rng_device));
     }
 
-    fn bus_type(&self) -> VirtioBusType {
-        if self.config.machine_info.machine_type.contains("-ccw-") {
-            VirtioBusType::Ccw
-        } else {
-            VirtioBusType::Pci
-        }
-    }
-
     fn add_iommu(&mut self) {
         let dev_iommu = DeviceIntelIommu::new();
         self.devices.push(Box::new(dev_iommu));
@@ -1526,6 +1630,13 @@ impl<'a> QemuCmdLine<'a> {
         self.machine.set_kernel_irqchip("split");
     }
 
+    fn add_bridges(&mut self, count: u32) {
+        for idx in 0..count {
+            let bridge = DevicePciBridge::new(self.config, idx);
+            self.devices.push(Box::new(bridge));
+        }
+    }
+
     pub fn add_virtiofs_share(
         &mut self,
         virtiofsd_socket_path: &str,
@@ -1542,9 +1653,11 @@ impl<'a> QemuCmdLine<'a> {
 
         self.devices.push(Box::new(virtiofsd_socket_chardev));
 
-        let mut virtiofs_device = DeviceVhostUserFs::new(chardev_name, mount_tag, self.bus_type());
+        let bus_type = bus_type(self.config);
+
+        let mut virtiofs_device = DeviceVhostUserFs::new(chardev_name, mount_tag, bus_type);
         virtiofs_device.set_queue_size(queue_size);
-        if self.config.device_info.enable_iommu_platform && self.bus_type() == VirtioBusType::Ccw {
+        if self.config.device_info.enable_iommu_platform && bus_type == VirtioBusType::Ccw {
             virtiofs_device.set_iommu_platform(true);
         }
         self.devices.push(Box::new(virtiofs_device));
@@ -1558,7 +1671,7 @@ impl<'a> QemuCmdLine<'a> {
         //self.devices.push(Box::new(mem_file));
         self.memory.set_memory_backend_file(&mem_file);
 
-        match self.bus_type() {
+        match bus_type {
             VirtioBusType::Pci => {
                 self.machine.set_nvdimm(true);
                 self.devices.push(Box::new(NumaNode::new(&mem_file.id)));
@@ -1572,7 +1685,7 @@ impl<'a> QemuCmdLine<'a> {
     pub fn add_vsock(&mut self, vhostfd: tokio::fs::File, guest_cid: u32) -> Result<()> {
         clear_cloexec(vhostfd.as_raw_fd()).context("clearing O_CLOEXEC failed on vsock fd")?;
 
-        let mut vhost_vsock_pci = VhostVsock::new(vhostfd, guest_cid, self.bus_type());
+        let mut vhost_vsock_pci = VhostVsock::new(vhostfd, guest_cid, bus_type(self.config));
 
         if !self.config.disable_nesting_checks && should_disable_modern() {
             vhost_vsock_pci.set_disable_modern(true);
@@ -1619,8 +1732,10 @@ impl<'a> QemuCmdLine<'a> {
     pub fn add_block_device(&mut self, device_id: &str, path: &str) -> Result<()> {
         self.devices
             .push(Box::new(BlockBackend::new(device_id, path)));
-        self.devices
-            .push(Box::new(DeviceVirtioBlk::new(device_id, self.bus_type())));
+        self.devices.push(Box::new(DeviceVirtioBlk::new(
+            device_id,
+            bus_type(self.config),
+        )));
         Ok(())
     }
 
@@ -1634,32 +1749,9 @@ impl<'a> QemuCmdLine<'a> {
         ));
     }
 
-    pub fn add_network_device(
-        &mut self,
-        dev_index: u64,
-        host_dev_name: &str,
-        guest_mac: Address,
-    ) -> Result<()> {
-        let mut netdev = Netdev::new(
-            &format!("network-{}", dev_index),
-            host_dev_name,
-            self.config.network_info.network_queues,
-        )?;
-        if self.config.network_info.disable_vhost_net {
-            netdev.set_disable_vhost_net(true);
-        }
-
-        let mut virtio_net_device = DeviceVirtioNet::new(&netdev.id, guest_mac);
-
-        if should_disable_modern() {
-            virtio_net_device.set_disable_modern(true);
-        }
-        if self.config.device_info.enable_iommu_platform && self.bus_type() == VirtioBusType::Ccw {
-            virtio_net_device.set_iommu_platform(true);
-        }
-        if self.config.network_info.network_queues > 1 {
-            virtio_net_device.set_num_queues(self.config.network_info.network_queues);
-        }
+    pub fn add_network_device(&mut self, host_dev_name: &str, guest_mac: Address) -> Result<()> {
+        let (netdev, virtio_net_device) =
+            get_network_device(self.config, host_dev_name, guest_mac)?;
 
         self.devices.push(Box::new(netdev));
         self.devices.push(Box::new(virtio_net_device));
@@ -1667,8 +1759,10 @@ impl<'a> QemuCmdLine<'a> {
     }
 
     pub fn add_console(&mut self, console_socket_path: &str) {
-        let mut serial_dev = DeviceVirtioSerial::new("serial0", self.bus_type());
-        if self.config.device_info.enable_iommu_platform && self.bus_type() == VirtioBusType::Ccw {
+        let mut serial_dev = DeviceVirtioSerial::new("serial0", bus_type(self.config));
+        if self.config.device_info.enable_iommu_platform
+            && bus_type(self.config) == VirtioBusType::Ccw
+        {
             serial_dev.set_iommu_platform(true);
         }
         self.devices.push(Box::new(serial_dev));
@@ -1709,3 +1803,32 @@ impl<'a> QemuCmdLine<'a> {
         Ok(result)
     }
 }
+
+pub fn get_network_device(
+    config: &HypervisorConfig,
+    host_dev_name: &str,
+    guest_mac: Address,
+) -> Result<(Netdev, DeviceVirtioNet)> {
+    let mut netdev = Netdev::new(
+        &format!("network-{}", host_dev_name),
+        host_dev_name,
+        config.network_info.network_queues,
+    )?;
+    if config.network_info.disable_vhost_net {
+        netdev.set_disable_vhost_net(true);
+    }
+
+    let mut virtio_net_device = DeviceVirtioNet::new(&netdev.id, guest_mac);
+
+    if should_disable_modern() {
+        virtio_net_device.set_disable_modern(true);
+    }
+    if config.device_info.enable_iommu_platform && bus_type(config) == VirtioBusType::Ccw {
+        virtio_net_device.set_iommu_platform(true);
+    }
+    if config.network_info.network_queues > 1 {
+        virtio_net_device.set_num_queues(config.network_info.network_queues);
+    }
+
+    Ok((netdev, virtio_net_device))
+}
diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs b/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs
index e292662a4b..1d3380ca40 100644
--- a/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs
+++ b/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs
@@ -3,7 +3,7 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
-use super::cmdline_generator::{QemuCmdLine, QMP_SOCKET_FILE};
+use super::cmdline_generator::{get_network_device, QemuCmdLine, QMP_SOCKET_FILE};
 use super::qmp::Qmp;
 use crate::{
     hypervisor_persist::HypervisorState, utils::enter_netns, HypervisorConfig, MemoryConfig,
@@ -120,7 +120,6 @@ impl QemuInner {
                     let _netns_guard = NetnsGuard::new(&netns).context("new netns guard")?;
 
                     cmdline.add_network_device(
-                        network.config.index,
                         &network.config.host_dev_name,
                         network.config.guest_mac.clone().unwrap(),
                     )?;
@@ -540,9 +539,16 @@ use crate::device::DeviceType;
 
 // device manager part of Hypervisor
 impl QemuInner {
-    pub(crate) async fn add_device(&mut self, device: DeviceType) -> Result<DeviceType> {
+    pub(crate) async fn add_device(&mut self, mut device: DeviceType) -> Result<DeviceType> {
         info!(sl!(), "QemuInner::add_device() {}", device);
-        self.devices.push(device.clone());
+        let is_qemu_ready_to_hotplug = self.qmp.is_some();
+        if is_qemu_ready_to_hotplug {
+            // hypervisor is running already
+            device = self.hotplug_device(device)?;
+        } else {
+            // store the device to coldplug it later, on hypervisor launch
+            self.devices.push(device.clone());
+        }
         Ok(device)
     }
 
@@ -553,6 +559,35 @@ impl QemuInner {
             device
         ))
     }
+
+    /// Hotplugs `device` into the running VM over QMP. Only network devices
+    /// are handled; any other device type is logged as unsupported.
+    fn hotplug_device(&mut self, device: DeviceType) -> Result<DeviceType> {
+        let qmp = match self.qmp {
+            Some(ref mut qmp) => qmp,
+            None => return Err(anyhow!("QMP not initialized")),
+        };
+
+        match device {
+            DeviceType::Network(ref network_device) => {
+                // Report a missing MAC address as an error rather than
+                // panicking while the VM is running.
+                let guest_mac = network_device
+                    .config
+                    .guest_mac
+                    .clone()
+                    .ok_or_else(|| anyhow!("network device has no guest MAC address"))?;
+                let (netdev, virtio_net_device) = get_network_device(
+                    &self.config,
+                    &network_device.config.host_dev_name,
+                    guest_mac,
+                )?;
+                qmp.hotplug_network_device(&netdev, &virtio_net_device)?
+            }
+            _ => info!(sl!(), "hotplugging of {:#?} is unsupported", device),
+        }
+        Ok(device)
+    }
 }
 
 // private helpers
diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/qmp.rs b/src/runtime-rs/crates/hypervisor/src/qemu/qmp.rs
index 7339efdf8b..32efa5c43e 100644
--- a/src/runtime-rs/crates/hypervisor/src/qemu/qmp.rs
+++ b/src/runtime-rs/crates/hypervisor/src/qemu/qmp.rs
@@ -3,9 +3,13 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
+use crate::qemu::cmdline_generator::{DeviceVirtioNet, Netdev};
+
 use anyhow::{anyhow, Result};
+use nix::sys::socket::{sendmsg, ControlMessage, MsgFlags};
 use std::fmt::{Debug, Error, Formatter};
 use std::io::BufReader;
+use std::os::fd::{AsRawFd, RawFd};
 use std::os::unix::net::UnixStream;
 use std::time::Duration;
 
@@ -291,6 +295,188 @@ impl Qmp {
         }
         Ok(())
     }
+
+    /// Returns the qdev id of a PCI bridge together with the number of a slot
+    /// on that bridge which 'query-pci' does not report as occupied.
+    fn find_free_slot(&mut self) -> Result<(String, i64)> {
+        let pci = self.qmp.execute(&qapi_qmp::query_pci {})?;
+        for pci_info in &pci {
+            for pci_dev in &pci_info.devices {
+                let pci_bridge = match &pci_dev.pci_bridge {
+                    Some(bridge) => bridge,
+                    None => continue,
+                };
+
+                info!(sl!(), "found PCI bridge: {}", pci_dev.qdev_id);
+
+                if let Some(bridge_devices) = &pci_bridge.devices {
+                    let occupied_slots = bridge_devices
+                        .iter()
+                        .map(|pci_dev| pci_dev.slot)
+                        .collect::<Vec<_>>();
+
+                    info!(
+                        sl!(),
+                        "already occupied slots on bridge {}: {:#?}",
+                        pci_dev.qdev_id,
+                        occupied_slots
+                    );
+
+                    // from virtcontainers' bridges.go
+                    let pci_bridge_max_capacity = 30;
+                    for slot in 0..pci_bridge_max_capacity {
+                        if !occupied_slots.iter().any(|elem| *elem == slot) {
+                            info!(
+                                sl!(),
+                                "found free slot on bridge {}: {}", pci_dev.qdev_id, slot
+                            );
+                            return Ok((pci_dev.qdev_id.clone(), slot));
+                        }
+                    }
+                }
+            }
+        }
+        Err(anyhow!("no free slots on PCI bridges"))
+    }
+
+    /// Passes `fd` to QEMU under the name `fdname` by sending the QMP 'getfd'
+    /// command together with the descriptor as SCM_RIGHTS ancillary data.
+    fn pass_fd(&mut self, fd: RawFd, fdname: &str) -> Result<()> {
+        info!(sl!(), "passing fd {:?} as {}", fd, fdname);
+
+        // Put the QMP 'getfd' command itself into the message payload.
+        let getfd_cmd = format!(
+            "{{ \"execute\": \"getfd\", \"arguments\": {{ \"fdname\": \"{}\" }} }}",
+            fdname
+        );
+        let buf = getfd_cmd.as_bytes();
+        let bufs = &mut [std::io::IoSlice::new(buf)][..];
+
+        debug!(sl!(), "bufs: {:?}", bufs);
+
+        let fds = [fd];
+        let cmsg = [ControlMessage::ScmRights(&fds)];
+
+        let result = sendmsg::<()>(
+            self.qmp.inner_mut().get_mut_write().as_raw_fd(),
+            bufs,
+            &cmsg,
+            MsgFlags::empty(),
+            None,
+        );
+        info!(sl!(), "sendmsg() result: {:#?}", result);
+        // Bail out here rather than wait for a reply to a command that was never sent.
+        result.map_err(|err| anyhow!("failed to send {} ({}): {}", fdname, fd, err))?;
+
+        let result = self.qmp.read_response::<&qmp::getfd>();
+
+        match result {
+            Ok(_) => {
+                info!(sl!(), "successfully passed {} ({})", fdname, fd);
+                Ok(())
+            }
+            Err(err) => Err(anyhow!("failed to pass {} ({}): {}", fdname, fd, err)),
+        }
+    }
+
+    /// Hotplugs a network device: passes the tap (and vhost, if any) file
+    /// descriptors to QEMU, adds the netdev backend and then adds the
+    /// virtio-net frontend on a free PCI bridge slot.
+    pub fn hotplug_network_device(
+        &mut self,
+        netdev: &Netdev,
+        virtio_net_device: &DeviceVirtioNet,
+    ) -> Result<()> {
+        debug!(
+            sl!(),
+            "hotplug_network_device(): PCI before {}: {:#?}",
+            virtio_net_device.get_netdev_id(),
+            self.qmp.execute(&qapi_qmp::query_pci {})?
+        );
+
+        let (bus, slot) = self.find_free_slot()?;
+
+        let mut fd_names = vec![];
+        for (idx, fd) in netdev.get_fds().iter().enumerate() {
+            let fdname = format!("fd{}", idx);
+            self.pass_fd(fd.as_raw_fd(), fdname.as_ref())?;
+            fd_names.push(fdname);
+        }
+
+        let mut vhostfd_names = vec![];
+        for (idx, fd) in netdev.get_vhostfds().iter().enumerate() {
+            let vhostfdname = format!("vhostfd{}", idx);
+            self.pass_fd(fd.as_raw_fd(), vhostfdname.as_ref())?;
+            vhostfd_names.push(vhostfdname);
+        }
+
+        self.qmp
+            .execute(&qapi_qmp::netdev_add(qapi_qmp::Netdev::tap {
+                id: netdev.get_id().clone(),
+                tap: qapi_qmp::NetdevTapOptions {
+                    br: None,
+                    downscript: None,
+                    fd: None,
+                    // Logic in cmdline_generator::Netdev::new() seems to
+                    // guarantee that there will always be at least one fd.
+                    fds: Some(fd_names.join(",")),
+                    helper: None,
+                    ifname: None,
+                    poll_us: None,
+                    queues: None,
+                    script: None,
+                    sndbuf: None,
+                    vhost: if vhostfd_names.is_empty() {
+                        None
+                    } else {
+                        Some(true)
+                    },
+                    vhostfd: None,
+                    vhostfds: if vhostfd_names.is_empty() {
+                        None
+                    } else {
+                        Some(vhostfd_names.join(","))
+                    },
+                    vhostforce: None,
+                    vnet_hdr: None,
+                },
+            }))?;
+
+        let mut netdev_frontend_args = Dictionary::new();
+        netdev_frontend_args.insert(
+            "netdev".to_owned(),
+            virtio_net_device.get_netdev_id().clone().into(),
+        );
+        // QEMU parses the PCI 'addr' property as hexadecimal.
+        netdev_frontend_args.insert("addr".to_owned(), format!("{:02x}", slot).into());
+        netdev_frontend_args.insert("mac".to_owned(), virtio_net_device.get_mac_addr().into());
+        netdev_frontend_args.insert("mq".to_owned(), "on".into());
+        // As the golang runtime documents the vectors computation, it's
+        // 2N+2 vectors, N for tx queues, N for rx queues, 1 for config, and one for possible control vq
+        netdev_frontend_args.insert(
+            "vectors".to_owned(),
+            (2 * virtio_net_device.get_num_queues() + 2).into(),
+        );
+        if virtio_net_device.get_disable_modern() {
+            netdev_frontend_args.insert("disable-modern".to_owned(), true.into());
+        }
+
+        self.qmp.execute(&qmp::device_add {
+            bus: Some(bus),
+            id: Some(format!("frontend-{}", virtio_net_device.get_netdev_id())),
+            driver: virtio_net_device.get_device_driver().clone(),
+            arguments: netdev_frontend_args,
+        })?;
+
+        debug!(
+            sl!(),
+            "hotplug_network_device(): PCI after {}: {:#?}",
+            virtio_net_device.get_netdev_id(),
+            self.qmp.execute(&qapi_qmp::query_pci {})?
+        );
+
+        Ok(())
+    }
 }
 
 fn vcpu_id_from_core_id(core_id: i64) -> String {