Merge pull request #10165 from pmores/add-network-device-hotplugging

runtime-rs: add network device hotplugging to qemu-rs
This commit is contained in:
Greg Kurz 2024-10-03 17:44:50 +02:00 committed by GitHub
commit 96336d141b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 374 additions and 49 deletions

View File

@ -49,7 +49,7 @@ trait ToQemuParams: Send + Sync {
async fn qemu_params(&self) -> Result<Vec<String>>;
}
#[derive(Debug, PartialEq)]
#[derive(Debug, PartialEq, Clone, Copy)]
enum VirtioBusType {
Pci,
Ccw,
@ -70,6 +70,14 @@ impl Display for VirtioBusType {
}
}
/// Determines which virtio bus flavour the configured machine type uses.
///
/// Returns `VirtioBusType::Ccw` when the machine type string contains
/// "-ccw-" and `VirtioBusType::Pci` in every other case.
fn bus_type(config: &HypervisorConfig) -> VirtioBusType {
    let machine_type = &config.machine_info.machine_type;
    if !machine_type.contains("-ccw-") {
        return VirtioBusType::Pci;
    }
    VirtioBusType::Ccw
}
// Conventions used in qemu command line generation
// ================================================
//
@ -975,7 +983,7 @@ fn format_fds(files: &[File]) -> String {
}
#[derive(Debug)]
struct Netdev {
pub struct Netdev {
id: String,
// File descriptors for vhost multi-queue support.
@ -1013,6 +1021,18 @@ impl Netdev {
self.disable_vhost_net = disable_vhost_net;
self
}
/// Returns a reference to this netdev's `id` field.
pub fn get_id(&self) -> &String {
&self.id
}
/// Returns the files stored in `self.fds` under the "fds" key.
// NOTE(review): this uses index syntax on `self.fds`; if that is a std map,
// a missing "fds" entry would panic -- confirm the key is always inserted.
pub fn get_fds(&self) -> &Vec<File> {
&self.fds["fds"]
}
/// Returns the files stored in `self.fds` under the "vhostfds" key.
// NOTE(review): this uses index syntax on `self.fds`; if that is a std map,
// a missing "vhostfds" entry would panic -- confirm the key is always
// inserted, including when vhost-net is disabled.
pub fn get_vhostfds(&self) -> &Vec<File> {
&self.fds["vhostfds"]
}
}
#[async_trait]
@ -1081,6 +1101,26 @@ impl DeviceVirtioNet {
self.iommu_platform = iommu_platform;
self
}
/// Returns a reference to the `netdev_id` field, i.e. the id of the netdev
/// this virtio-net device was created with.
pub fn get_netdev_id(&self) -> &String {
&self.netdev_id
}
/// Returns a reference to the `device_driver` field.
pub fn get_device_driver(&self) -> &String {
&self.device_driver
}
/// Returns `mac_address` rendered through its `Debug` formatter as an owned
/// `String`.
// NOTE(review): presumably the `Debug` impl of the address type produces the
// textual MAC form QEMU expects -- confirm against that type's definition.
pub fn get_mac_addr(&self) -> String {
format!("{:?}", self.mac_address)
}
/// Returns the value of the `num_queues` field.
pub fn get_num_queues(&self) -> u32 {
self.num_queues
}
/// Returns the value of the `disable_modern` flag.
pub fn get_disable_modern(&self) -> bool {
self.disable_modern
}
}
#[async_trait]
@ -1290,6 +1330,74 @@ impl ToQemuParams for DeviceIntelIommu {
}
}
/// Parameters of a single PCI bridge device. Its `ToQemuParams`
/// implementation renders the fields as one `-device` command line argument.
#[derive(Debug)]
struct DevicePciBridge {
// Device driver name; emitted as the first element of the `-device` value.
driver: String,
// Emitted as the `bus=` property.
bus: String,
// Emitted as the `id=` property.
id: String,
// Emitted as the `chassis_nr=` property.
chassis_nr: u32,
// Emitted as `shpc=on` when true and `shpc=off` when false.
shpc: bool,
// Emitted as the `addr=` property.
addr: u32,
// Emitted as the `io-reserve=` property.
io_reserve: String,
// Emitted as the `mem-reserve=` property.
mem_reserve: String,
// Emitted as the `pref64-reserve=` property.
pref64_reserve: String,
}
impl DevicePciBridge {
fn new(config: &HypervisorConfig, bridge_idx: u32) -> DevicePciBridge {
DevicePciBridge {
// The go runtime doesn't support bridges other than PCI although
// PCIe should also be available. Stick with the legacy behaviour
// of ignoring PCIe since it's not clear to me how to decide
// between the two.
driver: "pci-bridge".to_owned(),
bus: match config.machine_info.machine_type.as_str() {
"q35" | "virt" => "pcie.0",
_ => "pci.0",
}
.to_owned(),
id: format!("pci-bridge-{}", bridge_idx),
// Each bridge is required to be assigned a unique chassis id > 0.
chassis_nr: bridge_idx + 1,
shpc: false,
// 2 is documented by the go runtime as the first slot available
// for a bridge (on x86_64)
// (https://github.com/kata-containers/kata-containers/blob/99730256a2899c82d111400024621519d17ea15d/src/runtime/virtcontainers/qemu_arch_base.go#L212)
addr: 2 + bridge_idx,
// Values taken from the go runtime implementation which comments
// the choices as follows:
// Certain guest BIOS versions think !SHPC means no hotplug, and
// won't reserve the IO and memory windows that will be needed for
// devices added underneath this bridge. This will only break for
// certain combinations of exact qemu, BIOS and guest kernel
// versions, but for consistency, just hint the usual default
// windows for a bridge (as the BIOS would use with SHPC) so that
// we can do ACPI hotplug.
// (https://github.com/kata-containers/kata-containers/blob/99730256a2899c82d111400024621519d17ea15d/src/runtime/virtcontainers/qemu.go#L2474)
io_reserve: "4k".to_owned(),
mem_reserve: "1m".to_owned(),
pref64_reserve: "1m".to_owned(),
}
}
}
#[async_trait]
impl ToQemuParams for DevicePciBridge {
    /// Renders the bridge as two command line elements: the `-device` switch
    /// and a comma-separated list made of the driver name followed by the
    /// bridge properties.
    async fn qemu_params(&self) -> Result<Vec<String>> {
        let shpc_switch = if self.shpc { "on" } else { "off" };

        let properties = [
            self.driver.clone(),
            format!("bus={}", self.bus),
            format!("id={}", self.id),
            format!("chassis_nr={}", self.chassis_nr),
            format!("shpc={}", shpc_switch),
            format!("addr={}", self.addr),
            format!("io-reserve={}", self.io_reserve),
            format!("mem-reserve={}", self.mem_reserve),
            format!("pref64-reserve={}", self.pref64_reserve),
        ];

        Ok(vec!["-device".to_owned(), properties.join(",")])
    }
}
// Qemu provides methods and types for managing QEMU instances.
// To manage a qemu instance after it has been launched you need
// to pass the -qmp option during launch requesting the qemu instance
@ -1480,10 +1588,14 @@ impl<'a> QemuCmdLine<'a> {
qemu_cmd_line.add_rtc();
if qemu_cmd_line.bus_type() != VirtioBusType::Ccw {
if bus_type(config) != VirtioBusType::Ccw {
qemu_cmd_line.add_rng();
}
if bus_type(config) != VirtioBusType::Ccw && config.device_info.default_bridges > 0 {
qemu_cmd_line.add_bridges(config.device_info.default_bridges);
}
Ok(qemu_cmd_line)
}
@ -1507,14 +1619,6 @@ impl<'a> QemuCmdLine<'a> {
self.devices.push(Box::new(rng_device));
}
fn bus_type(&self) -> VirtioBusType {
if self.config.machine_info.machine_type.contains("-ccw-") {
VirtioBusType::Ccw
} else {
VirtioBusType::Pci
}
}
fn add_iommu(&mut self) {
let dev_iommu = DeviceIntelIommu::new();
self.devices.push(Box::new(dev_iommu));
@ -1526,6 +1630,13 @@ impl<'a> QemuCmdLine<'a> {
self.machine.set_kernel_irqchip("split");
}
/// Appends `count` PCI bridges, indexed `0..count`, to the device list.
fn add_bridges(&mut self, count: u32) {
    let config = self.config;
    let bridges = (0..count).map(|bridge_idx| DevicePciBridge::new(config, bridge_idx));
    for bridge in bridges {
        self.devices.push(Box::new(bridge));
    }
}
pub fn add_virtiofs_share(
&mut self,
virtiofsd_socket_path: &str,
@ -1542,9 +1653,11 @@ impl<'a> QemuCmdLine<'a> {
self.devices.push(Box::new(virtiofsd_socket_chardev));
let mut virtiofs_device = DeviceVhostUserFs::new(chardev_name, mount_tag, self.bus_type());
let bus_type = bus_type(self.config);
let mut virtiofs_device = DeviceVhostUserFs::new(chardev_name, mount_tag, bus_type);
virtiofs_device.set_queue_size(queue_size);
if self.config.device_info.enable_iommu_platform && self.bus_type() == VirtioBusType::Ccw {
if self.config.device_info.enable_iommu_platform && bus_type == VirtioBusType::Ccw {
virtiofs_device.set_iommu_platform(true);
}
self.devices.push(Box::new(virtiofs_device));
@ -1558,7 +1671,7 @@ impl<'a> QemuCmdLine<'a> {
//self.devices.push(Box::new(mem_file));
self.memory.set_memory_backend_file(&mem_file);
match self.bus_type() {
match bus_type {
VirtioBusType::Pci => {
self.machine.set_nvdimm(true);
self.devices.push(Box::new(NumaNode::new(&mem_file.id)));
@ -1572,7 +1685,7 @@ impl<'a> QemuCmdLine<'a> {
pub fn add_vsock(&mut self, vhostfd: tokio::fs::File, guest_cid: u32) -> Result<()> {
clear_cloexec(vhostfd.as_raw_fd()).context("clearing O_CLOEXEC failed on vsock fd")?;
let mut vhost_vsock_pci = VhostVsock::new(vhostfd, guest_cid, self.bus_type());
let mut vhost_vsock_pci = VhostVsock::new(vhostfd, guest_cid, bus_type(self.config));
if !self.config.disable_nesting_checks && should_disable_modern() {
vhost_vsock_pci.set_disable_modern(true);
@ -1619,8 +1732,10 @@ impl<'a> QemuCmdLine<'a> {
pub fn add_block_device(&mut self, device_id: &str, path: &str) -> Result<()> {
self.devices
.push(Box::new(BlockBackend::new(device_id, path)));
self.devices
.push(Box::new(DeviceVirtioBlk::new(device_id, self.bus_type())));
self.devices.push(Box::new(DeviceVirtioBlk::new(
device_id,
bus_type(self.config),
)));
Ok(())
}
@ -1634,32 +1749,9 @@ impl<'a> QemuCmdLine<'a> {
));
}
pub fn add_network_device(
&mut self,
dev_index: u64,
host_dev_name: &str,
guest_mac: Address,
) -> Result<()> {
let mut netdev = Netdev::new(
&format!("network-{}", dev_index),
host_dev_name,
self.config.network_info.network_queues,
)?;
if self.config.network_info.disable_vhost_net {
netdev.set_disable_vhost_net(true);
}
let mut virtio_net_device = DeviceVirtioNet::new(&netdev.id, guest_mac);
if should_disable_modern() {
virtio_net_device.set_disable_modern(true);
}
if self.config.device_info.enable_iommu_platform && self.bus_type() == VirtioBusType::Ccw {
virtio_net_device.set_iommu_platform(true);
}
if self.config.network_info.network_queues > 1 {
virtio_net_device.set_num_queues(self.config.network_info.network_queues);
}
pub fn add_network_device(&mut self, host_dev_name: &str, guest_mac: Address) -> Result<()> {
let (netdev, virtio_net_device) =
get_network_device(self.config, host_dev_name, guest_mac)?;
self.devices.push(Box::new(netdev));
self.devices.push(Box::new(virtio_net_device));
@ -1667,8 +1759,10 @@ impl<'a> QemuCmdLine<'a> {
}
pub fn add_console(&mut self, console_socket_path: &str) {
let mut serial_dev = DeviceVirtioSerial::new("serial0", self.bus_type());
if self.config.device_info.enable_iommu_platform && self.bus_type() == VirtioBusType::Ccw {
let mut serial_dev = DeviceVirtioSerial::new("serial0", bus_type(self.config));
if self.config.device_info.enable_iommu_platform
&& bus_type(self.config) == VirtioBusType::Ccw
{
serial_dev.set_iommu_platform(true);
}
self.devices.push(Box::new(serial_dev));
@ -1709,3 +1803,32 @@ impl<'a> QemuCmdLine<'a> {
Ok(result)
}
}
/// Creates the backend/frontend pair describing one network device.
///
/// The backend `Netdev` is named `network-<host_dev_name>` and is created
/// with the configured number of network queues. The frontend
/// `DeviceVirtioNet` refers to the backend by id and carries `guest_mac`.
/// Both are then adjusted according to `config`:
/// - vhost-net is disabled on the backend if the configuration asks for it;
/// - the frontend gets `disable_modern` set if `should_disable_modern()`
///   says so;
/// - the frontend gets `iommu_platform` set if it is enabled in the
///   configuration and the bus type is CCW;
/// - the frontend's queue count is set if more than one queue is configured.
///
/// # Errors
///
/// Returns the error from `Netdev::new()` if the backend cannot be created.
pub fn get_network_device(
    config: &HypervisorConfig,
    host_dev_name: &str,
    guest_mac: Address,
) -> Result<(Netdev, DeviceVirtioNet)> {
    let queue_count = config.network_info.network_queues;
    let backend_id = format!("network-{}", host_dev_name);

    let mut backend = Netdev::new(&backend_id, host_dev_name, queue_count)?;
    if config.network_info.disable_vhost_net {
        backend.set_disable_vhost_net(true);
    }

    let mut frontend = DeviceVirtioNet::new(&backend.id, guest_mac);
    if should_disable_modern() {
        frontend.set_disable_modern(true);
    }

    let needs_iommu_platform =
        config.device_info.enable_iommu_platform && bus_type(config) == VirtioBusType::Ccw;
    if needs_iommu_platform {
        frontend.set_iommu_platform(true);
    }

    if queue_count > 1 {
        frontend.set_num_queues(queue_count);
    }

    Ok((backend, frontend))
}

View File

@ -3,7 +3,7 @@
// SPDX-License-Identifier: Apache-2.0
//
use super::cmdline_generator::{QemuCmdLine, QMP_SOCKET_FILE};
use super::cmdline_generator::{get_network_device, QemuCmdLine, QMP_SOCKET_FILE};
use super::qmp::Qmp;
use crate::{
hypervisor_persist::HypervisorState, utils::enter_netns, HypervisorConfig, MemoryConfig,
@ -120,7 +120,6 @@ impl QemuInner {
let _netns_guard = NetnsGuard::new(&netns).context("new netns guard")?;
cmdline.add_network_device(
network.config.index,
&network.config.host_dev_name,
network.config.guest_mac.clone().unwrap(),
)?;
@ -540,9 +539,16 @@ use crate::device::DeviceType;
// device manager part of Hypervisor
impl QemuInner {
pub(crate) async fn add_device(&mut self, device: DeviceType) -> Result<DeviceType> {
pub(crate) async fn add_device(&mut self, mut device: DeviceType) -> Result<DeviceType> {
info!(sl!(), "QemuInner::add_device() {}", device);
self.devices.push(device.clone());
let is_qemu_ready_to_hotplug = self.qmp.is_some();
if is_qemu_ready_to_hotplug {
// hypervisor is running already
device = self.hotplug_device(device)?;
} else {
// store the device to coldplug it later, on hypervisor launch
self.devices.push(device.clone());
}
Ok(device)
}
@ -553,6 +559,26 @@ impl QemuInner {
device
))
}
/// Hotplugs `device` into the running VM over QMP and hands the device back.
///
/// Only network devices are hotplugged. For any other device type a message
/// is logged and the device is returned unchanged without touching the VM.
///
/// # Errors
///
/// Fails if the QMP connection has not been set up, if the network device
/// has no guest MAC address, or if building or hotplugging the network
/// device fails.
fn hotplug_device(&mut self, device: DeviceType) -> Result<DeviceType> {
    let qmp = self
        .qmp
        .as_mut()
        .ok_or_else(|| anyhow!("QMP not initialized"))?;

    match device {
        DeviceType::Network(ref network_device) => {
            let host_dev_name = &network_device.config.host_dev_name;
            // A missing MAC address is reported as an error rather than
            // panicking the whole runtime.
            let guest_mac = network_device.config.guest_mac.clone().ok_or_else(|| {
                anyhow!(
                    "cannot hotplug network device {}: guest MAC address is not set",
                    host_dev_name
                )
            })?;

            let (netdev, virtio_net_device) =
                get_network_device(&self.config, host_dev_name, guest_mac)?;
            qmp.hotplug_network_device(&netdev, &virtio_net_device)?;
        }
        _ => info!(sl!(), "hotplugging of {:#?} is unsupported", device),
    }

    Ok(device)
}
}
// private helpers

View File

@ -3,9 +3,13 @@
// SPDX-License-Identifier: Apache-2.0
//
use crate::qemu::cmdline_generator::{DeviceVirtioNet, Netdev};
use anyhow::{anyhow, Result};
use nix::sys::socket::{sendmsg, ControlMessage, MsgFlags};
use std::fmt::{Debug, Error, Formatter};
use std::io::BufReader;
use std::os::fd::{AsRawFd, RawFd};
use std::os::unix::net::UnixStream;
use std::time::Duration;
@ -291,6 +295,178 @@ impl Qmp {
}
Ok(())
}
/// Looks for a free slot on one of the PCI bridges reported by `query-pci`.
///
/// Devices are visited in the order QEMU reports them. For every device
/// that is a bridge with a device list, the slots 0..30 are checked in
/// ascending order and the first one not taken by a device on that bridge
/// wins. Bridges without a device list are skipped.
///
/// Returns the qdev id of the bridge together with the free slot number.
///
/// # Errors
///
/// Fails if the `query-pci` command fails or if no bridge has a free slot.
fn find_free_slot(&mut self) -> Result<(String, i64)> {
    // from virtcontainers' bridges.go
    let pci_bridge_max_capacity = 30;

    let pci_buses = self.qmp.execute(&qapi_qmp::query_pci {})?;
    let all_devices = pci_buses.iter().flat_map(|bus_info| bus_info.devices.iter());

    for candidate in all_devices {
        let bridge_info = match candidate.pci_bridge.as_ref() {
            Some(bridge_info) => bridge_info,
            None => continue,
        };
        info!(sl!(), "found PCI bridge: {}", candidate.qdev_id);

        let attached = match bridge_info.devices.as_ref() {
            Some(attached) => attached,
            None => continue,
        };

        let occupied_slots: Vec<_> = attached.iter().map(|child| child.slot).collect();
        info!(
            sl!(),
            "already occupied slots on bridge {}: {:#?}",
            candidate.qdev_id,
            occupied_slots
        );

        let first_free = (0..pci_bridge_max_capacity).find(|slot| !occupied_slots.contains(slot));
        if let Some(slot) = first_free {
            info!(
                sl!(),
                "found free slot on bridge {}: {}", candidate.qdev_id, slot
            );
            return Ok((candidate.qdev_id.clone(), slot));
        }
    }

    Err(anyhow!("no free slots on PCI bridges"))
}
/// Passes the file descriptor `fd` to QEMU and registers it there under the
/// name `fdname`.
///
/// The QMP `getfd` command is written directly to the QMP socket with
/// `sendmsg()` so that `fd` can travel alongside it as `SCM_RIGHTS`
/// ancillary data. The reply to `getfd` is then read back from the QMP
/// connection.
///
/// # Errors
///
/// Fails if `sendmsg()` fails or sends only part of the command, or if
/// reading the `getfd` response fails.
fn pass_fd(&mut self, fd: RawFd, fdname: &str) -> Result<()> {
    info!(sl!(), "passing fd {:?} as {}", fd, fdname);

    // Put the QMP 'getfd' command itself into the message payload.
    let getfd_cmd = format!(
        "{{ \"execute\": \"getfd\", \"arguments\": {{ \"fdname\": \"{}\" }} }}",
        fdname
    );

    let buf = getfd_cmd.as_bytes();
    let bufs = &mut [std::io::IoSlice::new(buf)][..];

    debug!(sl!(), "bufs: {:?}", bufs);

    let fds = [fd];
    let cmsg = [ControlMessage::ScmRights(&fds)];

    let result = sendmsg::<()>(
        self.qmp.inner_mut().get_mut_write().as_raw_fd(),
        bufs,
        &cmsg,
        MsgFlags::empty(),
        None,
    );
    info!(sl!(), "sendmsg() result: {:#?}", result);

    // If the command never reached QEMU there is no reply to wait for, so
    // bail out here instead of reading from the socket.
    let sent = result
        .map_err(|err| anyhow!("failed to send getfd for {} ({}): {}", fdname, fd, err))?;
    if sent != buf.len() {
        return Err(anyhow!(
            "short write while sending getfd for {} ({}): {} of {} bytes",
            fdname,
            fd,
            sent,
            buf.len()
        ));
    }

    match self.qmp.read_response::<&qmp::getfd>() {
        Ok(_) => {
            info!(sl!(), "successfully passed {} ({})", fdname, fd);
            Ok(())
        }
        Err(err) => Err(anyhow!("failed to pass {} ({}): {}", fdname, fd, err)),
    }
}
/// Hotplugs a network device into the running VM.
///
/// The steps are:
/// 1. find a free slot on a PCI bridge;
/// 2. pass every tap and vhost file descriptor of `netdev` to QEMU, naming
///    them `fd<N>` and `vhostfd<N>` respectively;
/// 3. create the tap backend with `netdev_add`, referring to the passed
///    descriptors by name;
/// 4. create the frontend described by `virtio_net_device` with
///    `device_add` on the bridge and slot found in step 1.
///
/// The PCI topology is dumped at debug level before and after the operation.
///
/// # Errors
///
/// Fails if no free bridge slot is available, if passing a file descriptor
/// fails, or if any of the QMP commands fails.
pub fn hotplug_network_device(
    &mut self,
    netdev: &Netdev,
    virtio_net_device: &DeviceVirtioNet,
) -> Result<()> {
    // QEMU expects the tap `fds` and `vhostfds` lists to be separated by
    // colons ("x:y:...:z"); a comma-separated list is not split into
    // individual descriptors and breaks multi-queue setups.
    let fd_list_separator = ":";

    debug!(
        sl!(),
        "hotplug_network_device(): PCI before {}: {:#?}",
        virtio_net_device.get_netdev_id(),
        self.qmp.execute(&qapi_qmp::query_pci {})?
    );

    let (bus, slot) = self.find_free_slot()?;

    let mut fd_names = vec![];
    for (idx, fd) in netdev.get_fds().iter().enumerate() {
        let fdname = format!("fd{}", idx);
        self.pass_fd(fd.as_raw_fd(), fdname.as_ref())?;
        fd_names.push(fdname);
    }

    let mut vhostfd_names = vec![];
    for (idx, fd) in netdev.get_vhostfds().iter().enumerate() {
        let vhostfdname = format!("vhostfd{}", idx);
        self.pass_fd(fd.as_raw_fd(), vhostfdname.as_ref())?;
        vhostfd_names.push(vhostfdname);
    }

    // vhost is only requested when there are vhost descriptors to use.
    let (vhost, vhostfds) = if vhostfd_names.is_empty() {
        (None, None)
    } else {
        (Some(true), Some(vhostfd_names.join(fd_list_separator)))
    };

    self.qmp
        .execute(&qapi_qmp::netdev_add(qapi_qmp::Netdev::tap {
            id: netdev.get_id().clone(),
            tap: qapi_qmp::NetdevTapOptions {
                br: None,
                downscript: None,
                fd: None,
                // Logic in cmdline_generator::Netdev::new() seems to
                // guarantee that there will always be at least one fd.
                fds: Some(fd_names.join(fd_list_separator)),
                helper: None,
                ifname: None,
                poll_us: None,
                queues: None,
                script: None,
                sndbuf: None,
                vhost,
                vhostfd: None,
                vhostfds,
                vhostforce: None,
                vnet_hdr: None,
            },
        }))?;

    let mut netdev_frontend_args = Dictionary::new();
    netdev_frontend_args.insert(
        "netdev".to_owned(),
        virtio_net_device.get_netdev_id().clone().into(),
    );
    // QEMU parses a textual PCI `addr` as hexadecimal, so the slot has to be
    // formatted as hex; a decimal rendering would select the wrong (or an
    // invalid) slot for every slot number >= 10.
    netdev_frontend_args.insert("addr".to_owned(), format!("{:02x}", slot).into());
    netdev_frontend_args.insert("mac".to_owned(), virtio_net_device.get_mac_addr().into());
    netdev_frontend_args.insert("mq".to_owned(), "on".into());
    // As the golang runtime documents the vectors computation, it's
    // 2N+2 vectors, N for tx queues, N for rx queues, 1 for config, and one for possible control vq
    netdev_frontend_args.insert(
        "vectors".to_owned(),
        (2 * virtio_net_device.get_num_queues() + 2).into(),
    );
    if virtio_net_device.get_disable_modern() {
        netdev_frontend_args.insert("disable-modern".to_owned(), true.into());
    }

    self.qmp.execute(&qmp::device_add {
        bus: Some(bus),
        id: Some(format!("frontend-{}", virtio_net_device.get_netdev_id())),
        driver: virtio_net_device.get_device_driver().clone(),
        arguments: netdev_frontend_args,
    })?;

    debug!(
        sl!(),
        "hotplug_network_device(): PCI after {}: {:#?}",
        virtio_net_device.get_netdev_id(),
        self.qmp.execute(&qapi_qmp::query_pci {})?
    );

    Ok(())
}
}
fn vcpu_id_from_core_id(core_id: i64) -> String {