Merge pull request #8866 from Apokleos/netdev-qemu-rs

runtime-rs: add netdev params to cmdline for qemu-rs.
This commit is contained in:
Alex Lyn
2024-03-13 13:07:43 +08:00
committed by GitHub
18 changed files with 488 additions and 27 deletions

2
src/libs/Cargo.lock generated
View File

@@ -692,6 +692,7 @@ dependencies = [
"chrono",
"common-path",
"fail",
"hex",
"kata-types",
"lazy_static",
"libc",
@@ -708,6 +709,7 @@ dependencies = [
"slog-scope",
"subprocess",
"tempfile",
"test-utils",
"thiserror",
]

View File

@@ -28,6 +28,7 @@ slog-scope = "4.4.0"
subprocess = "0.2.8"
rand = "0.8.5"
thiserror = "1.0.30"
hex = "0.4.3"
kata-types = { path = "../kata-types" }
oci = { path = "../oci" }
@@ -37,3 +38,4 @@ safe-path = { path = "../safe-path" }
num_cpus = "1.13.1"
serial_test = "0.5.1"
tempfile = "3.2.0"
test-utils = { path = "../test-utils" }

View File

@@ -12,6 +12,7 @@ pub mod fs;
pub mod hooks;
pub mod k8s;
pub mod mount;
pub mod netns;
pub mod numa;
pub mod protection;
pub mod rand;

View File

@@ -11,6 +11,8 @@ use nix::sched::{setns, CloneFlags};
use nix::unistd::{getpid, gettid};
use rand::Rng;
use kata_types::sl;
pub struct NetnsGuard {
old_netns: Option<File>,
}

View File

@@ -1796,6 +1796,7 @@ dependencies = [
"chrono",
"common-path",
"fail",
"hex",
"kata-types",
"lazy_static",
"libc",
@@ -3211,7 +3212,6 @@ dependencies = [
"byte-unit 4.0.19",
"cgroups-rs",
"futures 0.3.28",
"hex",
"hypervisor",
"kata-sys-util",
"kata-types",

View File

@@ -29,6 +29,7 @@ path-clean = "1.0.1"
lazy_static = "1.4"
tracing = "0.1.36"
dbs-utils = { path = "../../../dragonball/src/dbs_utils" }
kata-sys-util = { path = "../../../libs/kata-sys-util" }
kata-types = { path = "../../../libs/kata-types" }
logging = { path = "../../../libs/logging" }
@@ -43,7 +44,6 @@ crossbeam-channel = "0.5.6"
[target.'cfg(not(target_arch = "s390x"))'.dependencies]
dragonball = { path = "../../../dragonball", features = ["atomic-guest-memory", "virtio-vsock", "hotplug", "virtio-blk", "virtio-net", "virtio-fs", "vhost-net", "dbs-upcall", "virtio-mem", "virtio-balloon", "vhost-user-net", "host-device"] }
dbs-utils = { path = "../../../dragonball/src/dbs_utils" }
[features]
default = []

View File

@@ -13,7 +13,7 @@ use crate::device::topology::PCIeTopology;
use crate::device::{Device, DeviceType};
use crate::Hypervisor as hypervisor;
#[derive(Clone)]
#[derive(Clone, Default)]
pub struct Address(pub [u8; 6]);
impl fmt::Debug for Address {

View File

@@ -3,12 +3,15 @@
// SPDX-License-Identifier: Apache-2.0
//
use crate::{kernel_param::KernelParams, HypervisorConfig};
use super::network::{generate_netdev_fds, NetDevice};
use crate::utils::clear_fd_flags;
use crate::{kernel_param::KernelParams, HypervisorConfig, NetworkConfig};
use anyhow::{anyhow, Context, Result};
use async_trait::async_trait;
use nix::fcntl;
use std::fs::read_to_string;
use kata_types::config::hypervisor::NetworkInfo;
use std::fs::{read_to_string, File};
use std::os::fd::AsRawFd;
use std::os::unix::io::RawFd;
// These should have been called MiB and GiB for better readability but the
@@ -703,6 +706,25 @@ impl ToQemuParams for Serial {
}
}
#[async_trait]
impl ToQemuParams for NetDevice {
// qemu_params returns the qemu parameters built out of this network device.
async fn qemu_params(&self) -> Result<Vec<String>> {
let mut qemu_params: Vec<String> = Vec::new();
let netdev_params = self.qemu_netdev_params()?;
let device_params = self.qemu_device_params()?;
qemu_params.push("-netdev".to_owned());
qemu_params.push(netdev_params.join(","));
qemu_params.push("-device".to_owned());
qemu_params.push(device_params.join(","));
Ok(qemu_params)
}
}
fn is_running_in_vm() -> Result<bool> {
let res = read_to_string("/proc/cpuinfo")?
.lines()
@@ -782,16 +804,7 @@ impl<'a> QemuCmdLine<'a> {
}
pub fn add_vsock(&mut self, vhostfd: RawFd, guest_cid: u32) -> Result<()> {
// Clear the O_CLOEXEC which is set by default by Rust standard library
// as it would obviously prevent passing the descriptor to the qemu process.
if let Err(err) = fcntl::fcntl(vhostfd, fcntl::FcntlArg::F_SETFD(fcntl::FdFlag::empty())) {
info!(
sl!(),
"couldn't clear O_CLOEXEC on vsock, communication with agent will not work: {:?}",
err
);
return Err(err.into());
}
clear_fd_flags(vhostfd).context("clear flags failed")?;
let mut vhost_vsock_pci = VhostVsockPci::new(vhostfd, guest_cid);
@@ -855,6 +868,27 @@ impl<'a> QemuCmdLine<'a> {
));
}
pub fn add_network_device(
&mut self,
config: &NetworkConfig,
network_info: &NetworkInfo,
) -> Result<Vec<File>> {
let disable_vhost_net = network_info.disable_vhost_net;
let queues = network_info.network_queues;
let (tun_files, vhost_files) = generate_netdev_fds(config, queues)?;
let tun_fds: Vec<i32> = tun_files.iter().map(|dev| dev.as_raw_fd()).collect();
let vhost_fds: Vec<i32> = vhost_files.iter().map(|dev| dev.as_raw_fd()).collect();
let net_device = NetDevice::new(config, disable_vhost_net, tun_fds, vhost_fds);
self.devices.push(Box::new(net_device));
let dev_files = vec![tun_files, vhost_files];
let fds: Vec<File> = dev_files.into_iter().flatten().collect();
Ok(fds)
}
pub async fn build(&self) -> Result<Vec<String>> {
let mut result = Vec::new();

View File

@@ -5,11 +5,12 @@
use super::cmdline_generator::QemuCmdLine;
use crate::{
hypervisor_persist::HypervisorState, HypervisorConfig, MemoryConfig, VcpuThreadIds,
VsockDevice, HYPERVISOR_QEMU,
hypervisor_persist::HypervisorState, utils::enter_netns, HypervisorConfig, MemoryConfig,
VcpuThreadIds, VsockDevice, HYPERVISOR_QEMU,
};
use anyhow::{anyhow, Context, Result};
use async_trait::async_trait;
use kata_sys_util::netns::NetnsGuard;
use kata_types::{
capabilities::{Capabilities, CapabilityBits},
config::KATA_PATH,
@@ -34,6 +35,7 @@ pub struct QemuInner {
config: HypervisorConfig,
devices: Vec<DeviceType>,
netns: Option<String>,
}
impl QemuInner {
@@ -43,12 +45,14 @@ impl QemuInner {
qemu_process: None,
config: Default::default(),
devices: Vec::new(),
netns: None,
}
}
pub(crate) async fn prepare_vm(&mut self, id: &str, _netns: Option<String>) -> Result<()> {
pub(crate) async fn prepare_vm(&mut self, id: &str, netns: Option<String>) -> Result<()> {
info!(sl!(), "Preparing QEMU VM");
self.id = id.to_string();
self.netns = netns;
let vm_path = [KATA_PATH, self.id.as_str()].join("/");
std::fs::create_dir_all(vm_path)?;
@@ -58,6 +62,7 @@ impl QemuInner {
pub(crate) async fn start_vm(&mut self, _timeout: i32) -> Result<()> {
info!(sl!(), "Starting QEMU VM");
let netns = self.netns.clone().unwrap_or_default();
let mut cmdline = QemuCmdLine::new(&self.id, &self.config)?;
@@ -65,6 +70,10 @@ impl QemuInner {
// descriptor needs to stay open until the qemu process launches.
// This is why we need to store it in a variable at this scope.
let mut _vhost_fd = None;
// We need to keep the vhost-net/tuntap file descriptor open until the QEMU process launches.
// However, we're likely not interested in the specific type of file descriptor itself. We just
// want to ensure any fds associated with network devices remain open within the current scope.
let mut _fds_for_qemu: Vec<std::fs::File> = Vec::new();
for device in &mut self.devices {
match device {
@@ -99,6 +108,14 @@ impl QemuInner {
}
}
}
DeviceType::Network(network) => {
let network_info = &self.config.network_info;
// we need ensure add_network_device happens in netns.
let _netns_guard = NetnsGuard::new(&netns).context("new netns guard")?;
_fds_for_qemu = cmdline.add_network_device(&network.config, network_info)?;
}
_ => info!(sl!(), "qemu cmdline: unsupported device: {:?}", device),
}
}
@@ -112,6 +129,16 @@ impl QemuInner {
command.args(cmdline.build().await?);
info!(sl!(), "qemu cmd: {:?}", command);
// we need move the qemu process into Network Namespace.
unsafe {
let _pre_exec = command.pre_exec(move || {
let _ = enter_netns(&netns);
Ok(())
});
}
self.qemu_process = Some(command.stderr(Stdio::piped()).spawn()?);
info!(sl!(), "qemu process started");
@@ -357,6 +384,7 @@ impl Persist for QemuInner {
qemu_process: None,
config: hypervisor_state.config,
devices: Vec::new(),
netns: None,
})
}
}

View File

@@ -5,6 +5,7 @@
mod cmdline_generator;
mod inner;
mod network;
use crate::device::DeviceType;
use crate::hypervisor_persist::HypervisorState;

View File

@@ -0,0 +1,339 @@
// Copyright (c) 2024 Ant Group
//
// SPDX-License-Identifier: Apache-2.0
//
use std::collections::HashMap;
use std::convert::TryFrom;
use std::fs::{File, OpenOptions};
use std::os::fd::RawFd;
use crate::utils::{clear_fd_flags, open_named_tuntap};
use crate::{Address, NetworkConfig};
use anyhow::{anyhow, Context, Result};
// VirtioTransport is the transport in use for a virtio device.
#[derive(Debug, Default, PartialEq)]
enum VirtioTransport {
#[default]
Pci,
}
impl ToString for VirtioTransport {
fn to_string(&self) -> String {
match self {
VirtioTransport::Pci => "pci".to_owned(),
}
}
}
impl TryFrom<&str> for VirtioTransport {
type Error = anyhow::Error;
fn try_from(_transport: &str) -> Result<Self> {
Ok(VirtioTransport::Pci)
}
}
// DeviceDriver is set in "-device driver=<DeviceDriver>"
#[derive(Debug, Default, PartialEq)]
enum DeviceDriver {
// VirtioNetPci("virtio-net-pci") is a virtio-net device using PCI transport.
#[default]
VirtioNetPci,
// VfioPci("vfio-pci") is an attached host device using PCI transport.
VfioPci,
}
impl ToString for DeviceDriver {
fn to_string(&self) -> String {
match self {
DeviceDriver::VirtioNetPci => "virtio-net-pci".to_owned(),
DeviceDriver::VfioPci => "vfio-pci".to_owned(),
}
}
}
impl TryFrom<&str> for DeviceDriver {
type Error = anyhow::Error;
fn try_from(device_driver: &str) -> Result<Self> {
Ok(match device_driver {
"virtio-net-pci" => DeviceDriver::VirtioNetPci,
"vfio-pci" => DeviceDriver::VfioPci,
_ => return Err(anyhow!("unsupported transport")),
})
}
}
#[derive(Debug, Default, PartialEq)]
enum NetDev {
/// Tap("tap") is a tap networking device type.
#[default]
Tap,
/// MacTap("macvtap") is a macvtap networking device type.
#[allow(dead_code)]
MacvTap,
}
impl ToString for NetDev {
fn to_string(&self) -> String {
match self {
NetDev::Tap | NetDev::MacvTap => "tap".to_owned(),
// VhostUser is to be added in future.
// NetDev::VhostUser => "vhost-user".to_owned(),
}
}
}
// NetDevice represents a guest networking device
// -netdev tap,id=hostnet0,vhost=on,vhostfds=x:y:z,fds=a:b:c
// -device virtio-net-pci,netdev=hostnet0,id=net0,mac=24:42:54:20:50:46,bus=pci.0,addr=0x7
#[derive(Debug, Default)]
pub struct NetDevice {
// device_type is the netdev type (e.g. tap).
device_type: NetDev,
// driver is the qemu device driver
device_driver: DeviceDriver,
// id is the net device identifier.
id: String,
// if_name is the interface name,
if_name: String,
// bus is the bus path name of a PCI device.
bus: String,
// pci_addr is the address offset of a PCI device.
pci_addr: String,
// fds represents the list of already existing file descriptors to be used.
// This is mostly useful for mq support.
// {
// fds: Vec<File>,
// vhost_fds: Vec<File>,
// }
fds: HashMap<String, Vec<RawFd>>,
// disable_vhost_net disables virtio device emulation from the host kernel instead of from qemu.
disable_vhost_net: bool,
// mac_address is the networking device interface MAC address.
mac_address: Address,
// disable_modern prevents qemu from relying on fast MMIO.
disable_modern: bool,
// transport is the virtio transport for this device.
transport: VirtioTransport,
}
impl NetDevice {
#[allow(dead_code)]
pub fn new(
config: &NetworkConfig,
disable_vhost_net: bool,
tun_fds: Vec<i32>,
vhost_fds: Vec<i32>,
) -> Self {
// we have only two <Key, Value>s:
// {
// "fds": vec![fd1, fd2,...],
// "vhostfds": vec![fd3, fd4,...],
// }
let mut fds: HashMap<String, Vec<RawFd>> = HashMap::with_capacity(2);
fds.insert("fds".to_owned(), tun_fds);
fds.insert("vhostfds".to_owned(), vhost_fds);
// FIXME(Hard Code): It's safe to unwrap here because of the valid input.
// Ideally device_driver should be derived from transport to minimize code duplication.
// While currently we focus on PCI for the initial implementation.
// And we'll support other transports, e.g. s390x's CCW.
let device_driver = DeviceDriver::try_from("virtio-net-pci").unwrap();
let transport = VirtioTransport::try_from("pci").unwrap();
NetDevice {
device_type: NetDev::Tap,
device_driver,
id: format!("network-{}", &config.index),
if_name: config.virt_iface_name.clone(),
mac_address: config.guest_mac.clone().unwrap(),
disable_vhost_net,
disable_modern: false,
fds,
transport,
..Default::default()
}
}
fn mq_param(&self) -> String {
let mut params = vec!["mq=on".to_owned()];
if self.transport == VirtioTransport::Pci {
// https://www.linux-kvm.org/page/Multiqueue
// -netdev tap,vhost=on,queues=N
// enable mq and specify msix vectors in qemu cmdline
// (2N+2 vectors, N for tx queues, N for rx queues, 1 for config, and one for possible control vq)
// -device virtio-net-pci,mq=on,vectors=2N+2...
// enable mq in guest by 'ethtool -L eth0 combined $queue_num'
// Clearlinux automatically sets up the queues properly
// The agent implementation should do this to ensure that it is
// always set
// vectors = len(netdev.FDs) * 2 + 2
if let Some(fds) = self.fds.get("fds") {
params.push(format!("vectors={}", 2 * fds.len() + 2));
}
}
params.join(",")
}
pub fn qemu_device_params(&self) -> Result<Vec<String>> {
let mut device_params: Vec<String> = Vec::new();
device_params.push(format!("driver={}", &self.device_driver.to_string()));
device_params.push(format!("netdev={}", &self.id));
let mac = self.mac_address.to_string();
device_params.push(format!("mac={}", &mac));
if !self.bus.is_empty() {
device_params.push(format!("bus={}", &self.bus));
}
if !self.pci_addr.is_empty() {
// FIXME: pci_addr: PciPath
device_params.push(format!("addr={}", &self.pci_addr));
}
device_params.push(format!(
"disable-modern={}",
if self.disable_modern { "true" } else { "false" }
));
if !self.fds.is_empty() {
device_params.push(self.mq_param());
}
Ok(device_params)
}
pub fn qemu_netdev_params(&self) -> Result<Vec<String>> {
let mut netdev_params: Vec<String> = Vec::new();
let netdev_type = self.device_type.to_string();
netdev_params.push(netdev_type);
netdev_params.push(format!("id={}", self.id));
if !self.disable_vhost_net {
netdev_params.push("vhost=on".to_owned());
if let Some(vhost_fds) = self.fds.get("vhostfds") {
for fd in vhost_fds.iter() {
clear_fd_flags(*fd)?;
}
let s = vhost_fds
.iter()
.map(|&n| n.to_string())
.collect::<Vec<String>>()
.join(":");
netdev_params.push(format!("vhostfds={}", s));
}
}
if let Some(tuntap_fds) = self.fds.get("fds") {
for fd in tuntap_fds.iter() {
clear_fd_flags(*fd).context("clear flag of fd failed")?;
}
let s = tuntap_fds
.iter()
.map(|&n| n.to_string())
.collect::<Vec<String>>()
.join(":");
netdev_params.push(format!("fds={}", s));
} else {
netdev_params.push(format!("ifname={}", self.if_name));
netdev_params.push("script=no".to_owned());
netdev_params.push("downscript=no".to_owned());
}
Ok(netdev_params)
}
}
impl ToString for Address {
fn to_string(&self) -> String {
let b: [u8; 6] = self.0;
format!(
"{:02x}:{:02x}:{:02x}:{:02x}:{:02x}:{:02x}",
b[0], b[1], b[2], b[3], b[4], b[5]
)
}
}
// /dev/tap$(cat /sys/class/net/macvtap1/ifindex)
// for example: /dev/tap2381
#[allow(dead_code)]
pub fn create_macvtap_fds(ifindex: u32, queues: u32) -> Result<Vec<File>> {
let macvtap = format!("/dev/tap{}", ifindex);
create_fds(macvtap.as_str(), queues as usize)
}
pub fn create_vhost_net_fds(queues: u32) -> Result<Vec<File>> {
let vhost_dev = "/dev/vhost-net";
let num_fds = if queues > 1 { queues as usize } else { 1_usize };
create_fds(vhost_dev, num_fds)
}
// For example: if num_fds = 3; fds = {0xc000012028, 0xc000012030, 0xc000012038}
fn create_fds(device: &str, num_fds: usize) -> Result<Vec<File>> {
let mut fds: Vec<File> = Vec::with_capacity(num_fds);
for i in 0..num_fds {
match OpenOptions::new().read(true).write(true).open(device) {
Ok(f) => {
fds.push(f);
}
Err(e) => {
fds.clear();
return Err(anyhow!(
"It failed with error {:?} when opened the {:?} device.",
e,
i
));
}
};
}
Ok(fds)
}
pub fn generate_netdev_fds(
network_config: &NetworkConfig,
queues: u32,
) -> Result<(Vec<File>, Vec<File>)> {
let if_name = network_config.host_dev_name.as_str();
let tun_taps = open_named_tuntap(if_name, queues)?;
let vhost_fds = create_vhost_net_fds(queues)?;
Ok((tun_taps, vhost_fds))
}
#[cfg(test)]
mod tests {
use super::create_fds;
#[test]
fn test_ctreate_fds() {
let device = "/dev/null";
let num_fds = 3_usize;
let fds = create_fds(device, num_fds);
assert!(fds.is_ok());
assert_eq!(fds.unwrap().len(), num_fds);
}
}

View File

@@ -4,9 +4,19 @@
// SPDX-License-Identifier: Apache-2.0
//
use std::collections::HashSet;
use std::{
collections::HashSet,
fs::File,
os::fd::{AsRawFd, RawFd},
};
use anyhow::{anyhow, Context, Result};
use dbs_utils::net::Tap;
use kata_types::config::KATA_PATH;
use nix::{
fcntl,
sched::{setns, CloneFlags},
};
use crate::{DEFAULT_HYBRID_VSOCK_NAME, JAILER_ROOT};
@@ -47,3 +57,46 @@ pub fn get_jailer_root(sid: &str) -> String {
[&sandbox_path, JAILER_ROOT].join("/")
}
// Clear the O_CLOEXEC which is set by default by Rust standard library
// as it would obviously prevent passing the descriptor to the hypervisor process.
pub fn clear_fd_flags(rawfd: RawFd) -> Result<()> {
if let Err(err) = fcntl::fcntl(rawfd, fcntl::FcntlArg::F_SETFD(fcntl::FdFlag::empty())) {
info!(
sl!(),
"couldn't clear O_CLOEXEC on device's fd, communication with agent will not work: {:?}",
err
);
return Err(err.into());
}
Ok(())
}
pub fn enter_netns(netns_path: &str) -> Result<()> {
if !netns_path.is_empty() {
let netns =
File::open(netns_path).context(anyhow!("open netns path {:?} failed.", netns_path))?;
setns(netns.as_raw_fd(), CloneFlags::CLONE_NEWNET).context("set netns failed")?;
}
Ok(())
}
pub fn open_named_tuntap(if_name: &str, queues: u32) -> Result<Vec<File>> {
let (multi_vq, vq_pairs) = if queues > 1 {
(true, queues as usize)
} else {
(false, 1_usize)
};
let tap: Tap = Tap::open_named(if_name, multi_vq).context("open named tuntap device failed")?;
let taps: Vec<Tap> = tap.into_mq_taps(vq_pairs).context("into mq taps failed.")?;
let mut tap_files: Vec<std::fs::File> = Vec::new();
for tap in taps {
tap_files.push(tap.tap_file);
}
Ok(tap_files)
}

View File

@@ -17,7 +17,6 @@ bitflags = "1.2.1"
byte-unit = "4.0.14"
cgroups-rs = "0.3.2"
futures = "0.3.11"
hex = "0.4.3"
lazy_static = "1.4.0"
libc = ">=0.2.39"
netns-rs = "0.1.0"

View File

@@ -24,6 +24,7 @@ use anyhow::{anyhow, Context, Result};
use async_trait::async_trait;
use hypervisor::device::device_manager::DeviceManager;
use hypervisor::Hypervisor;
use kata_sys_util::netns::NetnsGuard;
use kata_types::config::TomlConfig;
use scopeguard::defer;
use serde::{Deserialize, Serialize};
@@ -32,7 +33,7 @@ use tokio::sync::RwLock;
use super::network_entity::NetworkEntity;
use super::utils::address::{ip_family_from_ip_addr, parse_ip_cidr};
use super::{EndpointState, NetnsGuard, Network};
use super::{EndpointState, Network};
use crate::network::endpoint::{TapEndpoint, VhostUserEndpoint};
use crate::network::network_info::network_info_from_dan::NetworkInfoFromDan;
use crate::network::utils::generate_private_mac_addr;

View File

@@ -22,8 +22,8 @@ use network_with_netns::NetworkWithNetns;
mod network_pair;
use network_pair::NetworkPair;
mod utils;
pub use kata_sys_util::netns::{generate_netns_name, NetnsGuard};
use tokio::sync::RwLock;
pub use utils::netns::{generate_netns_name, NetnsGuard};
use anyhow::{Context, Result};
use async_trait::async_trait;

View File

@@ -17,6 +17,7 @@ use anyhow::{anyhow, Context, Result};
use async_trait::async_trait;
use futures::stream::TryStreamExt;
use hypervisor::{device::device_manager::DeviceManager, Hypervisor};
use kata_sys_util::netns;
use netns_rs::get_from_path;
use scopeguard::defer;
use tokio::sync::RwLock;
@@ -27,7 +28,7 @@ use super::{
},
network_entity::NetworkEntity,
network_info::network_info_from_link::{handle_addresses, NetworkInfoFromLink},
utils::{link, netns},
utils::link,
Network,
};
use crate::network::NetworkInfo;

View File

@@ -6,7 +6,6 @@
pub(crate) mod address;
pub(crate) mod link;
pub(crate) mod netns;
use anyhow::{anyhow, Result};
use rand::rngs::OsRng;

View File

@@ -21,12 +21,11 @@ use common::{
};
use hypervisor::Hypervisor;
use oci::Process as OCIProcess;
use resource::network::NetnsGuard;
use resource::ResourceManager;
use tokio::sync::RwLock;
use tracing::instrument;
use kata_sys_util::hooks::HookStates;
use kata_sys_util::{hooks::HookStates, netns::NetnsGuard};
use super::{logger_with_process, Container};