From d4ac45d840cc51fc4b2663b4476e656bcb527ecd Mon Sep 17 00:00:00 2001 From: Pavel Mores Date: Tue, 19 Mar 2024 12:39:52 +0100 Subject: [PATCH 1/8] runtime-rs: refactor clear_fd_flags() The idea of this function is to make sure O_CLOEXEC is not set on file descriptors that should be inherited by a child (=hypervisor) process. The approach so far is however rather heavy-handed - clearing *all* flags is unjustifiably aggresive for a low-level function with no knowledge of context whatsoever. This commit refactors the function so that it only does what's expected and renames it accordingly. It also clarifies some of its call sites. Signed-off-by: Pavel Mores --- .../hypervisor/src/qemu/cmdline_generator.rs | 4 ++-- .../crates/hypervisor/src/qemu/network.rs | 6 ++--- src/runtime-rs/crates/hypervisor/src/utils.rs | 22 +++++++++++-------- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs index 3e3a919ff9..abd974cbdc 100644 --- a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs +++ b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs @@ -4,7 +4,7 @@ // use super::network::{generate_netdev_fds, NetDevice}; -use crate::utils::clear_fd_flags; +use crate::utils::clear_cloexec; use crate::{kernel_param::KernelParams, HypervisorConfig, NetworkConfig}; use anyhow::{anyhow, Context, Result}; @@ -1056,7 +1056,7 @@ impl<'a> QemuCmdLine<'a> { } pub fn add_vsock(&mut self, vhostfd: RawFd, guest_cid: u32) -> Result<()> { - clear_fd_flags(vhostfd).context("clear flags failed")?; + clear_cloexec(vhostfd).context("clearing O_CLOEXEC failed on vsock fd")?; let mut vhost_vsock_pci = VhostVsock::new(vhostfd, guest_cid, self.bus_type()); diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/network.rs b/src/runtime-rs/crates/hypervisor/src/qemu/network.rs index ce0a0b7e89..3d314e9f54 100644 --- a/src/runtime-rs/crates/hypervisor/src/qemu/network.rs +++ b/src/runtime-rs/crates/hypervisor/src/qemu/network.rs @@ -8,7 +8,7 @@ use std::convert::TryFrom; use std::fs::{File, OpenOptions}; use std::os::fd::RawFd; -use crate::utils::{clear_fd_flags, open_named_tuntap}; +use crate::utils::{clear_cloexec, open_named_tuntap}; use crate::{Address, NetworkConfig}; use anyhow::{anyhow, Context, Result}; @@ -232,7 +232,7 @@ impl NetDevice { netdev_params.push("vhost=on".to_owned()); if let Some(vhost_fds) = self.fds.get("vhostfds") { for fd in vhost_fds.iter() { - clear_fd_flags(*fd)?; + clear_cloexec(*fd).context("clearing O_CLOEXEC failed on vhost fd")?; } let s = vhost_fds .iter() @@ -246,7 +246,7 @@ impl NetDevice { if let Some(tuntap_fds) = self.fds.get("fds") { for fd in tuntap_fds.iter() { - clear_fd_flags(*fd).context("clear flag of fd failed")?; + clear_cloexec(*fd).context("clearing O_CLOEXEC failed on tuntap fd")?; } let s = tuntap_fds .iter() diff --git a/src/runtime-rs/crates/hypervisor/src/utils.rs b/src/runtime-rs/crates/hypervisor/src/utils.rs index 7e3606dabb..95e6554b3d 100644 --- a/src/runtime-rs/crates/hypervisor/src/utils.rs +++ b/src/runtime-rs/crates/hypervisor/src/utils.rs @@ -58,15 +58,19 @@ pub fn get_jailer_root(sid: &str) -> String { [&sandbox_path, JAILER_ROOT].join("/") } -// Clear the O_CLOEXEC which is set by default by Rust standard library -// as it would obviously prevent passing the descriptor to the hypervisor process. -pub fn clear_fd_flags(rawfd: RawFd) -> Result<()> { - if let Err(err) = fcntl::fcntl(rawfd, fcntl::FcntlArg::F_SETFD(fcntl::FdFlag::empty())) { - info!( - sl!(), - "couldn't clear O_CLOEXEC on device's fd, communication with agent will not work: {:?}", - err - ); +// Clear the O_CLOEXEC which is set by default by Rust standard library on +// file descriptors that it opens. This function is mostly meant to be +// called on descriptors to be passed to a child (hypervisor) process as +// O_CLOEXEC would obviously prevent that. +pub fn clear_cloexec(rawfd: RawFd) -> Result<()> { + let cur_flags = fcntl::fcntl(rawfd, fcntl::FcntlArg::F_GETFD)?; + let mut new_flags = fcntl::FdFlag::from_bits(cur_flags).ok_or(anyhow!( + "couldn't construct FdFlag from flags value {:?}", + cur_flags + ))?; + new_flags.remove(fcntl::FdFlag::FD_CLOEXEC); + if let Err(err) = fcntl::fcntl(rawfd, fcntl::FcntlArg::F_SETFD(new_flags)) { + info!(sl!(), "couldn't clear O_CLOEXEC on fd: {:?}", err); return Err(err.into()); } From 7f237341721f1c8e0e6081d2d943db30f079be67 Mon Sep 17 00:00:00 2001 From: Pavel Mores Date: Thu, 21 Mar 2024 12:48:29 +0100 Subject: [PATCH 2/8] runtime-rs: reduce generate_netdev_fds() dependencies generate_netdev_fds() takes NetworkConfig from which it however only needs a host-side network device name. This commit makes it take the device name directly, making the function useful to callers who don't have the whole NetworkConfig but do have the requisite device name. Signed-off-by: Pavel Mores --- .../crates/hypervisor/src/qemu/cmdline_generator.rs | 3 ++- src/runtime-rs/crates/hypervisor/src/qemu/network.rs | 6 +----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs index abd974cbdc..62584c36c8 100644 --- a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs +++ b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs @@ -1135,8 +1135,9 @@ impl<'a> QemuCmdLine<'a> { ) -> Result> { let disable_vhost_net = network_info.disable_vhost_net; let queues = network_info.network_queues; + let if_name = config.host_dev_name.as_str(); - let (tun_files, vhost_files) = generate_netdev_fds(config, queues)?; + let (tun_files, vhost_files) = generate_netdev_fds(if_name, queues)?; let tun_fds: Vec = tun_files.iter().map(|dev| dev.as_raw_fd()).collect(); let vhost_fds: Vec = vhost_files.iter().map(|dev| dev.as_raw_fd()).collect(); diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/network.rs b/src/runtime-rs/crates/hypervisor/src/qemu/network.rs index 3d314e9f54..1e64f8e5d5 100644 --- a/src/runtime-rs/crates/hypervisor/src/qemu/network.rs +++ b/src/runtime-rs/crates/hypervisor/src/qemu/network.rs @@ -313,11 +313,7 @@ fn create_fds(device: &str, num_fds: usize) -> Result> { Ok(fds) } -pub fn generate_netdev_fds( - network_config: &NetworkConfig, - queues: u32, -) -> Result<(Vec, Vec)> { - let if_name = network_config.host_dev_name.as_str(); +pub fn generate_netdev_fds(if_name: &str, queues: u32) -> Result<(Vec, Vec)> { let tun_taps = open_named_tuntap(if_name, queues)?; let vhost_fds = create_vhost_net_fds(queues)?; From 0a57e2bb3250e8f1e778984f38bf35fc9866f5d1 Mon Sep 17 00:00:00 2001 From: Pavel Mores Date: Fri, 22 Mar 2024 18:18:22 +0100 Subject: [PATCH 3/8] runtime-rs: refactor NetDevice in qemu driver In keeping with architecture of QemuCmdLine implementation we split the functionality into two objects: Netdev to represent and generate the -netdev part and DeviceVirtioNet for the -device virtio-net- part. This change is a pure refactor, existing functionality does not change. However, we do remove some stub generalizations and govmm-isms, notably: - we remove the NetDev enum since the only network interface types that kata seems to use with qemu are tuntap and macvtap, both of which are implemented by the same -netdev tap - enum DeviceDriver is also left out since it doesn't seem reasonable to try to represent VFIO NICs (which are completely different from virtio-net ones) with the same struct as virtio-net - we also remove VirtioTransport because there's no use for it so far, but with the expectation that it will be added soon. We also make struct Netdev the owner of any vhost-net and queue file descriptors so that their lifetime is tied ultimately to the lifetime of QemuCmdLine automatically, instead of returning the fds to the caller and forcing it to achieve the equivalent functionality but manually. Signed-off-by: Pavel Mores --- .../hypervisor/src/qemu/cmdline_generator.rs | 133 +++++++++++++++++- 1 file changed, 132 insertions(+), 1 deletion(-) diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs index 62584c36c8..28d95064f7 100644 --- a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs +++ b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs @@ -5,11 +5,12 @@ use super::network::{generate_netdev_fds, NetDevice}; use crate::utils::clear_cloexec; -use crate::{kernel_param::KernelParams, HypervisorConfig, NetworkConfig}; +use crate::{kernel_param::KernelParams, Address, HypervisorConfig, NetworkConfig}; use anyhow::{anyhow, Context, Result}; use async_trait::async_trait; use kata_types::config::hypervisor::NetworkInfo; +use std::collections::HashMap; use std::fmt::Display; use std::fs::{read_to_string, File}; use std::os::fd::AsRawFd; @@ -892,6 +893,136 @@ impl ToQemuParams for Serial { } } +fn format_fds(files: &[File]) -> String { + files + .iter() + .map(|file| file.as_raw_fd().to_string()) + .collect::>() + .join(":") +} + +#[derive(Debug)] +struct Netdev { + id: String, + + // File descriptors for vhost multi-queue support. + // { + // queue_fds: Vec, + // vhost_fds: Vec, + // } + fds: HashMap>, + + // disable_vhost_net disables virtio device emulation from the host kernel instead of from qemu. + disable_vhost_net: bool, +} + +impl Netdev { + fn new(id: &str, host_if_name: &str, num_queues: u32) -> Result { + let (tun_files, vhost_files) = generate_netdev_fds(host_if_name, num_queues)?; + let fds = HashMap::from([ + ("fds".to_owned(), tun_files), + ("vhostfds".to_owned(), vhost_files), + ]); + for file in fds.values().flatten() { + clear_cloexec(file.as_raw_fd()).context("clearing O_CLOEXEC failed")?; + } + + Ok(Netdev { + id: id.to_owned(), + fds, + disable_vhost_net: false, + }) + } + + fn set_disable_vhost_net(&mut self, disable_vhost_net: bool) -> &mut Self { + self.disable_vhost_net = disable_vhost_net; + self + } +} + +#[async_trait] +impl ToQemuParams for Netdev { + async fn qemu_params(&self) -> Result> { + let mut params: Vec = Vec::new(); + params.push("tap".to_owned()); + params.push(format!("id={}", self.id)); + + if !self.disable_vhost_net { + params.push("vhost=on".to_owned()); + if let Some(vhost_fds) = self.fds.get("vhostfds") { + params.push(format!("vhostfds={}", format_fds(vhost_fds))); + } + } + + if let Some(tuntap_fds) = self.fds.get("fds") { + params.push(format!("fds={}", format_fds(tuntap_fds))); + } + + Ok(vec!["-netdev".to_owned(), params.join(",")]) + } +} + +#[derive(Debug)] +pub struct DeviceVirtioNet { + // driver is the qemu device driver + device_driver: String, + + // id is the corresponding backend net device identifier. + netdev_id: String, + + // mac_address is the guest-side networking device interface MAC address. + mac_address: Address, + + // disable_modern prevents qemu from relying on fast MMIO. + disable_modern: bool, + + num_queues: u32, +} + +impl DeviceVirtioNet { + fn new(netdev_id: &str, mac_address: Address) -> DeviceVirtioNet { + DeviceVirtioNet { + device_driver: "virtio-net-pci".to_owned(), + netdev_id: netdev_id.to_owned(), + mac_address, + disable_modern: false, + num_queues: 1, + } + } + + fn set_disable_modern(&mut self, disable_modern: bool) -> &mut Self { + self.disable_modern = disable_modern; + self + } + + fn set_num_queues(&mut self, num_queues: u32) -> &mut Self { + self.num_queues = num_queues; + self + } +} + +#[async_trait] +impl ToQemuParams for DeviceVirtioNet { + async fn qemu_params(&self) -> Result> { + let mut params: Vec = Vec::new(); + + //params.push(format!("driver={}", &self.device_driver.to_string())); + params.push(self.device_driver.clone()); + params.push(format!("netdev={}", &self.netdev_id)); + + params.push(format!("mac={}", self.mac_address.to_string())); + + if self.disable_modern { + params.push("disable-modern=true".to_owned()); + } + + params.push("mq=on".to_owned()); + params.push(format!("vectors={}", 2 * self.num_queues + 2)); + + Ok(vec!["-device".to_owned(), params.join(",")]) + } +} + #[async_trait] impl ToQemuParams for NetDevice { // qemu_params returns the qemu parameters built out of this network device. From 12e40ede97d334f3747419206ebf201fbb705a85 Mon Sep 17 00:00:00 2001 From: Pavel Mores Date: Fri, 22 Mar 2024 18:23:24 +0100 Subject: [PATCH 4/8] runtime-rs: reimplement add_network_device() using Netdev & DeviceVirtioNet This commit replaces the existing NetDevice-based implementation with one using Netdev and DeviceVirtioNet. Signed-off-by: Pavel Mores --- .../hypervisor/src/qemu/cmdline_generator.rs | 43 +++++++++++++------ 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs index 28d95064f7..15c4491239 100644 --- a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs +++ b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs @@ -1262,23 +1262,40 @@ impl<'a> QemuCmdLine<'a> { pub fn add_network_device( &mut self, config: &NetworkConfig, - network_info: &NetworkInfo, + _network_info: &NetworkInfo, ) -> Result> { - let disable_vhost_net = network_info.disable_vhost_net; - let queues = network_info.network_queues; - let if_name = config.host_dev_name.as_str(); + let mut netdev = Netdev::new( + &format!("network-{}", &config.index), + &config.host_dev_name, + self.config.network_info.network_queues, + )?; + if self.config.network_info.disable_vhost_net { + netdev.set_disable_vhost_net(true); + } - let (tun_files, vhost_files) = generate_netdev_fds(if_name, queues)?; - let tun_fds: Vec = tun_files.iter().map(|dev| dev.as_raw_fd()).collect(); - let vhost_fds: Vec = vhost_files.iter().map(|dev| dev.as_raw_fd()).collect(); + let mut virtio_net_device = + DeviceVirtioNet::new(&netdev.id, config.guest_mac.clone().unwrap()); - let net_device = NetDevice::new(config, disable_vhost_net, tun_fds, vhost_fds); - self.devices.push(Box::new(net_device)); + let nested = match is_running_in_vm() { + Ok(retval) => retval, + Err(err) => { + info!( + sl!(), + "unable to check if running in VM, assuming not: {}", err + ); + false + } + }; + if nested { + virtio_net_device.set_disable_modern(true); + } + if self.config.network_info.network_queues > 1 { + virtio_net_device.set_num_queues(self.config.network_info.network_queues); + } - let dev_files = vec![tun_files, vhost_files]; - let fds: Vec = dev_files.into_iter().flatten().collect(); - - Ok(fds) + self.devices.push(Box::new(netdev)); + self.devices.push(Box::new(virtio_net_device)); + Ok(vec![]) } pub fn add_console(&mut self, console_socket_path: &str) { From a4f033f864d8133327fa8172d87e0963578dbed0 Mon Sep 17 00:00:00 2001 From: Pavel Mores Date: Fri, 22 Mar 2024 18:33:21 +0100 Subject: [PATCH 5/8] runtime-rs: add should_disable_modern() utility function is_running_in_vm() is enough to figure out whether to disable_modern but it's clumsy and verbose to use. should_disable_modern() streamlines the usage by encapsulating the verbosity. Signed-off-by: Pavel Mores --- .../hypervisor/src/qemu/cmdline_generator.rs | 41 ++++++++----------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs index 15c4491239..d40e9ff0e4 100644 --- a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs +++ b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs @@ -1105,6 +1105,19 @@ fn is_running_in_vm() -> Result { Ok(res) } +fn should_disable_modern() -> bool { + match is_running_in_vm() { + Ok(retval) => retval, + Err(err) => { + info!( + sl!(), + "unable to check if running in VM, assuming not: {}", err + ); + false + } + } +} + pub struct QemuCmdLine<'a> { id: String, config: &'a HypervisorConfig, @@ -1191,20 +1204,8 @@ impl<'a> QemuCmdLine<'a> { let mut vhost_vsock_pci = VhostVsock::new(vhostfd, guest_cid, self.bus_type()); - if !self.config.disable_nesting_checks { - let nested = match is_running_in_vm() { - Ok(retval) => retval, - Err(err) => { - info!( - sl!(), - "unable to check if running in VM, assuming not: {}", err - ); - false - } - }; - if nested { - vhost_vsock_pci.set_disable_modern(true); - } + if !self.config.disable_nesting_checks && should_disable_modern() { + vhost_vsock_pci.set_disable_modern(true); } self.devices.push(Box::new(vhost_vsock_pci)); @@ -1276,17 +1277,7 @@ impl<'a> QemuCmdLine<'a> { let mut virtio_net_device = DeviceVirtioNet::new(&netdev.id, config.guest_mac.clone().unwrap()); - let nested = match is_running_in_vm() { - Ok(retval) => retval, - Err(err) => { - info!( - sl!(), - "unable to check if running in VM, assuming not: {}", err - ); - false - } - }; - if nested { + if should_disable_modern() { virtio_net_device.set_disable_modern(true); } if self.config.network_info.network_queues > 1 { From 0cf0e923fcf9c41363b887a6f1b59ef6d3202000 Mon Sep 17 00:00:00 2001 From: Pavel Mores Date: Fri, 22 Mar 2024 19:33:33 +0100 Subject: [PATCH 6/8] runtime-rs: refactor QemuCmdLine::add_network_device() signature add_network_device() doesn't need to be passed NetworkInfo since it already has access to the full HypervisorConfig. Also, one of the goals of QemuCmdLine interface's design is to avoid coupling between QemuCmdLine and the hypervisor crate's device module, if at all possible. That's why add_network_device() shouldn't take device module's NetworkConfig but just parts that are useful in add_network_device()'s implementation. Signed-off-by: Pavel Mores --- .../hypervisor/src/qemu/cmdline_generator.rs | 19 +++++++++---------- .../crates/hypervisor/src/qemu/inner.rs | 9 +++++---- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs index d40e9ff0e4..2a1eddb064 100644 --- a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs +++ b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs @@ -5,11 +5,10 @@ use super::network::{generate_netdev_fds, NetDevice}; use crate::utils::clear_cloexec; -use crate::{kernel_param::KernelParams, Address, HypervisorConfig, NetworkConfig}; +use crate::{kernel_param::KernelParams, Address, HypervisorConfig}; use anyhow::{anyhow, Context, Result}; use async_trait::async_trait; -use kata_types::config::hypervisor::NetworkInfo; use std::collections::HashMap; use std::fmt::Display; use std::fs::{read_to_string, File}; @@ -1262,20 +1261,20 @@ impl<'a> QemuCmdLine<'a> { pub fn add_network_device( &mut self, - config: &NetworkConfig, - _network_info: &NetworkInfo, - ) -> Result> { + dev_index: u64, + host_dev_name: &str, + guest_mac: Address, + ) -> Result<()> { let mut netdev = Netdev::new( - &format!("network-{}", &config.index), - &config.host_dev_name, + &format!("network-{}", dev_index), + host_dev_name, self.config.network_info.network_queues, )?; if self.config.network_info.disable_vhost_net { netdev.set_disable_vhost_net(true); } - let mut virtio_net_device = - DeviceVirtioNet::new(&netdev.id, config.guest_mac.clone().unwrap()); + let mut virtio_net_device = DeviceVirtioNet::new(&netdev.id, guest_mac); if should_disable_modern() { virtio_net_device.set_disable_modern(true); @@ -1286,7 +1285,7 @@ impl<'a> QemuCmdLine<'a> { self.devices.push(Box::new(netdev)); self.devices.push(Box::new(virtio_net_device)); - Ok(vec![]) + Ok(()) } pub fn add_console(&mut self, console_socket_path: &str) { diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs b/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs index 64de76b8d7..31fc6ff650 100644 --- a/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs +++ b/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs @@ -112,13 +112,14 @@ impl QemuInner { } } DeviceType::Network(network) => { - let network_info = &self.config.network_info; - // we need ensure add_network_device happens in netns. let _netns_guard = NetnsGuard::new(&netns).context("new netns guard")?; - _fds_for_qemu - .append(&mut cmdline.add_network_device(&network.config, network_info)?); + cmdline.add_network_device( + network.config.index, + &network.config.host_dev_name, + network.config.guest_mac.clone().unwrap(), + )?; } _ => info!(sl!(), "qemu cmdline: unsupported device: {:?}", device), } From c94e55d45a7ca15e294095643202107d1d6c69a7 Mon Sep 17 00:00:00 2001 From: Pavel Mores Date: Mon, 25 Mar 2024 11:10:37 +0100 Subject: [PATCH 7/8] runtime-rs: make QemuCmdLine own vsock file descriptor Make file descriptors to be passed to qemu owned by QemuCmdLine. See commit 52958f17cd for more explanation. Signed-off-by: Pavel Mores --- .../hypervisor/src/qemu/cmdline_generator.rs | 12 ++++++------ src/runtime-rs/crates/hypervisor/src/qemu/inner.rs | 14 ++++---------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs index 2a1eddb064..e6e63cff41 100644 --- a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs +++ b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs @@ -13,7 +13,7 @@ use std::collections::HashMap; use std::fmt::Display; use std::fs::{read_to_string, File}; use std::os::fd::AsRawFd; -use std::os::unix::io::RawFd; +use tokio; // These should have been called MiB and GiB for better readability but the // more fitting names unfortunately generate linter warnings. @@ -811,13 +811,13 @@ impl ToQemuParams for DeviceVirtioBlk { struct VhostVsock { bus_type: VirtioBusType, - vhostfd: RawFd, + vhostfd: tokio::fs::File, guest_cid: u32, disable_modern: bool, } impl VhostVsock { - fn new(vhostfd: RawFd, guest_cid: u32, bus_type: VirtioBusType) -> VhostVsock { + fn new(vhostfd: tokio::fs::File, guest_cid: u32, bus_type: VirtioBusType) -> VhostVsock { VhostVsock { bus_type, vhostfd, @@ -840,7 +840,7 @@ impl ToQemuParams for VhostVsock { if self.disable_modern { params.push("disable-modern=true".to_owned()); } - params.push(format!("vhostfd={}", self.vhostfd)); + params.push(format!("vhostfd={}", self.vhostfd.as_raw_fd())); params.push(format!("guest-cid={}", self.guest_cid)); Ok(vec!["-device".to_owned(), params.join(",")]) @@ -1198,8 +1198,8 @@ impl<'a> QemuCmdLine<'a> { } } - pub fn add_vsock(&mut self, vhostfd: RawFd, guest_cid: u32) -> Result<()> { - clear_cloexec(vhostfd).context("clearing O_CLOEXEC failed on vsock fd")?; + pub fn add_vsock(&mut self, vhostfd: tokio::fs::File, guest_cid: u32) -> Result<()> { + clear_cloexec(vhostfd.as_raw_fd()).context("clearing O_CLOEXEC failed on vsock fd")?; let mut vhost_vsock_pci = VhostVsock::new(vhostfd, guest_cid, self.bus_type()); diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs b/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs index 31fc6ff650..16da0bdafb 100644 --- a/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs +++ b/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs @@ -17,7 +17,6 @@ use kata_types::{ }; use persist::sandbox_persist::Persist; use std::collections::HashMap; -use std::os::unix::io::AsRawFd; use std::path::Path; use std::process::Stdio; use tokio::{ @@ -65,15 +64,11 @@ impl QemuInner { info!(sl!(), "Starting QEMU VM"); let netns = self.netns.clone().unwrap_or_default(); + // CAUTION: since 'cmdline' contains file descriptors that have to stay + // open until spawn() is called to launch qemu later in this function, + // 'cmdline' has to live at least until spawn() is called let mut cmdline = QemuCmdLine::new(&self.id, &self.config)?; - // CAUTION: File descriptors that are passed to QEMU must stay open until the QEMU process - // is started and closed afterwards. This is achieved by collecting them in _fds_for_qemu. - // It is mandatory for _fds_for_qemu to last until the QEMU process is forked. Leave it - // in the outer scope of this function for this to happen. The files in _fds_for_qemu - // should not be used in any way. - let mut _fds_for_qemu: Vec = Vec::new(); - for device in &mut self.devices { match device { DeviceType::ShareFs(share_fs_dev) => { @@ -87,8 +82,7 @@ impl QemuInner { } DeviceType::Vsock(vsock_dev) => { let fd = vsock_dev.init_config().await?; - cmdline.add_vsock(fd.as_raw_fd(), vsock_dev.config.guest_cid)?; - _fds_for_qemu.push(fd.into_std().await); + cmdline.add_vsock(fd, vsock_dev.config.guest_cid)?; } DeviceType::Block(block_dev) => { if block_dev.config.path_on_host == self.config.boot_info.initrd { From 4c72b02e5383368646a697a47089922bf5862a1e Mon Sep 17 00:00:00 2001 From: Pavel Mores Date: Mon, 25 Mar 2024 17:42:42 +0100 Subject: [PATCH 8/8] runtime-rs: remove the now-unused code of NetDevice The remaining code in network.rs was mostly moved to utils.rs which seems better home for these utility functions anyway (and a closely related function open_named_tuntap() has already lived there). ToString implementation for Address was removed after some consideration. Address should probably ideally implement Display (as per RFC 565) which would also supply a ToString implementation, however it implements Debug instead, probably to enable automatic implementation of Debug for anything that Address is a member of, if for no other reason. Rather than having two identical functions this commit simply switches to using the Debug implementation for printing Address on qemu command line. Fixes #9352 Signed-off-by: Pavel Mores --- .../hypervisor/src/qemu/cmdline_generator.rs | 32 +- .../crates/hypervisor/src/qemu/mod.rs | 1 - .../crates/hypervisor/src/qemu/network.rs | 335 ------------------ src/runtime-rs/crates/hypervisor/src/utils.rs | 54 ++- 4 files changed, 60 insertions(+), 362 deletions(-) delete mode 100644 src/runtime-rs/crates/hypervisor/src/qemu/network.rs diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs index e6e63cff41..c88a7702c1 100644 --- a/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs +++ b/src/runtime-rs/crates/hypervisor/src/qemu/cmdline_generator.rs @@ -3,8 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -use super::network::{generate_netdev_fds, NetDevice}; -use crate::utils::clear_cloexec; +use crate::utils::{clear_cloexec, create_vhost_net_fds, open_named_tuntap}; use crate::{kernel_param::KernelParams, Address, HypervisorConfig}; use anyhow::{anyhow, Context, Result}; @@ -917,10 +916,12 @@ struct Netdev { impl Netdev { fn new(id: &str, host_if_name: &str, num_queues: u32) -> Result { - let (tun_files, vhost_files) = generate_netdev_fds(host_if_name, num_queues)?; let fds = HashMap::from([ - ("fds".to_owned(), tun_files), - ("vhostfds".to_owned(), vhost_files), + ( + "fds".to_owned(), + open_named_tuntap(host_if_name, num_queues)?, + ), + ("vhostfds".to_owned(), create_vhost_net_fds(num_queues)?), ]); for file in fds.values().flatten() { clear_cloexec(file.as_raw_fd()).context("clearing O_CLOEXEC failed")?; @@ -1009,7 +1010,7 @@ impl ToQemuParams for DeviceVirtioNet { params.push(self.device_driver.clone()); params.push(format!("netdev={}", &self.netdev_id)); - params.push(format!("mac={}", self.mac_address.to_string())); + params.push(format!("mac={:?}", self.mac_address)); if self.disable_modern { params.push("disable-modern=true".to_owned()); @@ -1022,25 +1023,6 @@ impl ToQemuParams for DeviceVirtioNet { } } -#[async_trait] -impl ToQemuParams for NetDevice { - // qemu_params returns the qemu parameters built out of this network device. - async fn qemu_params(&self) -> Result> { - let mut qemu_params: Vec = Vec::new(); - - let netdev_params = self.qemu_netdev_params()?; - let device_params = self.qemu_device_params()?; - - qemu_params.push("-netdev".to_owned()); - qemu_params.push(netdev_params.join(",")); - - qemu_params.push("-device".to_owned()); - qemu_params.push(device_params.join(",")); - - Ok(qemu_params) - } -} - #[derive(Debug)] struct DeviceVirtioSerial { id: String, diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/mod.rs b/src/runtime-rs/crates/hypervisor/src/qemu/mod.rs index d1399def25..57afe0b216 100644 --- a/src/runtime-rs/crates/hypervisor/src/qemu/mod.rs +++ b/src/runtime-rs/crates/hypervisor/src/qemu/mod.rs @@ -5,7 +5,6 @@ mod cmdline_generator; mod inner; -mod network; use crate::device::DeviceType; use crate::hypervisor_persist::HypervisorState; diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/network.rs b/src/runtime-rs/crates/hypervisor/src/qemu/network.rs deleted file mode 100644 index 1e64f8e5d5..0000000000 --- a/src/runtime-rs/crates/hypervisor/src/qemu/network.rs +++ /dev/null @@ -1,335 +0,0 @@ -// Copyright (c) 2024 Ant Group -// -// SPDX-License-Identifier: Apache-2.0 -// - -use std::collections::HashMap; -use std::convert::TryFrom; -use std::fs::{File, OpenOptions}; -use std::os::fd::RawFd; - -use crate::utils::{clear_cloexec, open_named_tuntap}; -use crate::{Address, NetworkConfig}; -use anyhow::{anyhow, Context, Result}; - -// VirtioTransport is the transport in use for a virtio device. -#[derive(Debug, Default, PartialEq)] -enum VirtioTransport { - #[default] - Pci, -} - -impl ToString for VirtioTransport { - fn to_string(&self) -> String { - match self { - VirtioTransport::Pci => "pci".to_owned(), - } - } -} - -impl TryFrom<&str> for VirtioTransport { - type Error = anyhow::Error; - - fn try_from(_transport: &str) -> Result { - Ok(VirtioTransport::Pci) - } -} - -// DeviceDriver is set in "-device driver=" -#[derive(Debug, Default, PartialEq)] -enum DeviceDriver { - // VirtioNetPci("virtio-net-pci") is a virtio-net device using PCI transport. - #[default] - VirtioNetPci, - - // VfioPci("vfio-pci") is an attached host device using PCI transport. - VfioPci, -} - -impl ToString for DeviceDriver { - fn to_string(&self) -> String { - match self { - DeviceDriver::VirtioNetPci => "virtio-net-pci".to_owned(), - DeviceDriver::VfioPci => "vfio-pci".to_owned(), - } - } -} - -impl TryFrom<&str> for DeviceDriver { - type Error = anyhow::Error; - - fn try_from(device_driver: &str) -> Result { - Ok(match device_driver { - "virtio-net-pci" => DeviceDriver::VirtioNetPci, - "vfio-pci" => DeviceDriver::VfioPci, - _ => return Err(anyhow!("unsupported transport")), - }) - } -} - -#[derive(Debug, Default, PartialEq)] -enum NetDev { - /// Tap("tap") is a tap networking device type. - #[default] - Tap, - - /// MacTap("macvtap") is a macvtap networking device type. - #[allow(dead_code)] - MacvTap, -} - -impl ToString for NetDev { - fn to_string(&self) -> String { - match self { - NetDev::Tap | NetDev::MacvTap => "tap".to_owned(), - // VhostUser is to be added in future. - // NetDev::VhostUser => "vhost-user".to_owned(), - } - } -} - -// NetDevice represents a guest networking device -// -netdev tap,id=hostnet0,vhost=on,vhostfds=x:y:z,fds=a:b:c -// -device virtio-net-pci,netdev=hostnet0,id=net0,mac=24:42:54:20:50:46,bus=pci.0,addr=0x7 -#[derive(Debug, Default)] -pub struct NetDevice { - // device_type is the netdev type (e.g. tap). - device_type: NetDev, - - // driver is the qemu device driver - device_driver: DeviceDriver, - - // id is the net device identifier. - id: String, - - // if_name is the interface name, - if_name: String, - - // bus is the bus path name of a PCI device. - bus: String, - - // pci_addr is the address offset of a PCI device. - pci_addr: String, - - // fds represents the list of already existing file descriptors to be used. - // This is mostly useful for mq support. - // { - // fds: Vec, - // vhost_fds: Vec, - // } - fds: HashMap>, - - // disable_vhost_net disables virtio device emulation from the host kernel instead of from qemu. - disable_vhost_net: bool, - - // mac_address is the networking device interface MAC address. - mac_address: Address, - - // disable_modern prevents qemu from relying on fast MMIO. - disable_modern: bool, - - // transport is the virtio transport for this device. - transport: VirtioTransport, -} - -impl NetDevice { - #[allow(dead_code)] - pub fn new( - config: &NetworkConfig, - disable_vhost_net: bool, - tun_fds: Vec, - vhost_fds: Vec, - ) -> Self { - // we have only two s: - // { - // "fds": vec![fd1, fd2,...], - // "vhostfds": vec![fd3, fd4,...], - // } - let mut fds: HashMap> = HashMap::with_capacity(2); - fds.insert("fds".to_owned(), tun_fds); - fds.insert("vhostfds".to_owned(), vhost_fds); - - // FIXME(Hard Code): It's safe to unwrap here because of the valid input. - // Ideally device_driver should be derived from transport to minimize code duplication. - // While currently we focus on PCI for the initial implementation. - // And we'll support other transports, e.g. s390x's CCW. - let device_driver = DeviceDriver::try_from("virtio-net-pci").unwrap(); - let transport = VirtioTransport::try_from("pci").unwrap(); - - NetDevice { - device_type: NetDev::Tap, - device_driver, - id: format!("network-{}", &config.index), - if_name: config.virt_iface_name.clone(), - mac_address: config.guest_mac.clone().unwrap(), - disable_vhost_net, - disable_modern: false, - fds, - transport, - ..Default::default() - } - } - - fn mq_param(&self) -> String { - let mut params = vec!["mq=on".to_owned()]; - if self.transport == VirtioTransport::Pci { - // https://www.linux-kvm.org/page/Multiqueue - // -netdev tap,vhost=on,queues=N - // enable mq and specify msix vectors in qemu cmdline - // (2N+2 vectors, N for tx queues, N for rx queues, 1 for config, and one for possible control vq) - // -device virtio-net-pci,mq=on,vectors=2N+2... - // enable mq in guest by 'ethtool -L eth0 combined $queue_num' - // Clearlinux automatically sets up the queues properly - // The agent implementation should do this to ensure that it is - // always set - - // vectors = len(netdev.FDs) * 2 + 2 - if let Some(fds) = self.fds.get("fds") { - params.push(format!("vectors={}", 2 * fds.len() + 2)); - } - } - - params.join(",") - } - - pub fn qemu_device_params(&self) -> Result> { - let mut device_params: Vec = Vec::new(); - - device_params.push(format!("driver={}", &self.device_driver.to_string())); - device_params.push(format!("netdev={}", &self.id)); - - let mac = self.mac_address.to_string(); - device_params.push(format!("mac={}", &mac)); - - if !self.bus.is_empty() { - device_params.push(format!("bus={}", &self.bus)); - } - - if !self.pci_addr.is_empty() { - // FIXME: pci_addr: PciPath - device_params.push(format!("addr={}", &self.pci_addr)); - } - - device_params.push(format!( - "disable-modern={}", - if self.disable_modern { "true" } else { "false" } - )); - - if !self.fds.is_empty() { - device_params.push(self.mq_param()); - } - - Ok(device_params) - } - - pub fn qemu_netdev_params(&self) -> Result> { - let mut netdev_params: Vec = Vec::new(); - let netdev_type = self.device_type.to_string(); - netdev_params.push(netdev_type); - netdev_params.push(format!("id={}", self.id)); - - if !self.disable_vhost_net { - netdev_params.push("vhost=on".to_owned()); - if let Some(vhost_fds) = self.fds.get("vhostfds") { - for fd in vhost_fds.iter() { - clear_cloexec(*fd).context("clearing O_CLOEXEC failed on vhost fd")?; - } - let s = vhost_fds - .iter() - .map(|&n| n.to_string()) - .collect::>() - .join(":"); - - netdev_params.push(format!("vhostfds={}", s)); - } - } - - if let Some(tuntap_fds) = self.fds.get("fds") { - for fd in tuntap_fds.iter() { - clear_cloexec(*fd).context("clearing O_CLOEXEC failed on tuntap fd")?; - } - let s = tuntap_fds - .iter() - .map(|&n| n.to_string()) - .collect::>() - .join(":"); - netdev_params.push(format!("fds={}", s)); - } else { - netdev_params.push(format!("ifname={}", self.if_name)); - netdev_params.push("script=no".to_owned()); - netdev_params.push("downscript=no".to_owned()); - } - - Ok(netdev_params) - } -} - -impl ToString for Address { - fn to_string(&self) -> String { - let b: [u8; 6] = self.0; - - format!( - "{:02x}:{:02x}:{:02x}:{:02x}:{:02x}:{:02x}", - b[0], b[1], b[2], b[3], b[4], b[5] - ) - } -} - -// /dev/tap$(cat /sys/class/net/macvtap1/ifindex) -// for example: /dev/tap2381 -#[allow(dead_code)] -pub fn create_macvtap_fds(ifindex: u32, queues: u32) -> Result> { - let macvtap = format!("/dev/tap{}", ifindex); - create_fds(macvtap.as_str(), queues as usize) -} - -pub fn create_vhost_net_fds(queues: u32) -> Result> { - let vhost_dev = "/dev/vhost-net"; - let num_fds = if queues > 1 { queues as usize } else { 1_usize }; - - create_fds(vhost_dev, num_fds) -} - -// For example: if num_fds = 3; fds = {0xc000012028, 0xc000012030, 0xc000012038} -fn create_fds(device: &str, num_fds: usize) -> Result> { - let mut fds: Vec = Vec::with_capacity(num_fds); - - for i in 0..num_fds { - match OpenOptions::new().read(true).write(true).open(device) { - Ok(f) => { - fds.push(f); - } - Err(e) => { - fds.clear(); - return Err(anyhow!( - "It failed with error {:?} when opened the {:?} device.", - e, - i - )); - } - }; - } - - Ok(fds) -} - -pub fn generate_netdev_fds(if_name: &str, queues: u32) -> Result<(Vec, Vec)> { - let tun_taps = open_named_tuntap(if_name, queues)?; - let vhost_fds = create_vhost_net_fds(queues)?; - - Ok((tun_taps, vhost_fds)) -} - -#[cfg(test)] -mod tests { - use super::create_fds; - - #[test] - fn test_ctreate_fds() { - let device = "/dev/null"; - let num_fds = 3_usize; - let fds = create_fds(device, num_fds); - assert!(fds.is_ok()); - assert_eq!(fds.unwrap().len(), num_fds); - } -} diff --git a/src/runtime-rs/crates/hypervisor/src/utils.rs b/src/runtime-rs/crates/hypervisor/src/utils.rs index 95e6554b3d..6c2059bd4c 100644 --- a/src/runtime-rs/crates/hypervisor/src/utils.rs +++ b/src/runtime-rs/crates/hypervisor/src/utils.rs @@ -6,7 +6,7 @@ use std::{ collections::HashSet, - fs::File, + fs::{File, OpenOptions}, os::fd::{AsRawFd, RawFd}, }; @@ -104,3 +104,55 @@ pub fn open_named_tuntap(if_name: &str, queues: u32) -> Result> { Ok(tap_files) } + +// /dev/tap$(cat /sys/class/net/macvtap1/ifindex) +// for example: /dev/tap2381 +#[allow(dead_code)] +pub fn create_macvtap_fds(ifindex: u32, queues: u32) -> Result> { + let macvtap = format!("/dev/tap{}", ifindex); + create_fds(macvtap.as_str(), queues as usize) +} + +pub fn create_vhost_net_fds(queues: u32) -> Result> { + let vhost_dev = "/dev/vhost-net"; + let num_fds = if queues > 1 { queues as usize } else { 1_usize }; + + create_fds(vhost_dev, num_fds) +} + +// For example: if num_fds = 3; fds = {0xc000012028, 0xc000012030, 0xc000012038} +fn create_fds(device: &str, num_fds: usize) -> Result> { + let mut fds: Vec = Vec::with_capacity(num_fds); + + for i in 0..num_fds { + match OpenOptions::new().read(true).write(true).open(device) { + Ok(f) => { + fds.push(f); + } + Err(e) => { + fds.clear(); + return Err(anyhow!( + "It failed with error {:?} when opened the {:?} device.", + e, + i + )); + } + }; + } + + Ok(fds) +} + +#[cfg(test)] +mod tests { + use super::create_fds; + + #[test] + fn test_ctreate_fds() { + let device = "/dev/null"; + let num_fds = 3_usize; + let fds = create_fds(device, num_fds); + assert!(fds.is_ok()); + assert_eq!(fds.unwrap().len(), num_fds); + } +}