diff --git a/src/libs/kata-types/src/config/runtime.rs b/src/libs/kata-types/src/config/runtime.rs index dddd3adc5b..853e4aef3f 100644 --- a/src/libs/kata-types/src/config/runtime.rs +++ b/src/libs/kata-types/src/config/runtime.rs @@ -137,6 +137,17 @@ pub struct Runtime { /// This option is typically used to retain abnormal information for debugging. #[serde(default)] pub keep_abnormal: bool, + + /// Base directory of directly attachable network config, the default value + /// is "/run/kata-containers/dans". + /// + /// Network devices for VM-based containers are allowed to be placed in the + /// host netns to eliminate as many hops as possible, which is what we + /// called a "directly attachable network". The config, set by special CNI + /// plugins, is used to tell the Kata Containers what devices are attached + /// to the hypervisor. + #[serde(default)] + pub dan_conf: String, } impl ConfigOps for Runtime { diff --git a/src/runtime-rs/Makefile b/src/runtime-rs/Makefile index 1981a37d9c..496a9b1c20 100644 --- a/src/runtime-rs/Makefile +++ b/src/runtime-rs/Makefile @@ -162,6 +162,7 @@ DEFVFIOMODE := guest-kernel DEFSANDBOXCGROUPONLY ?= false DEFSTATICRESOURCEMGMT_DB ?= false DEFBINDMOUNTS := [] +DEFDANCONF := /run/kata-containers/dans SED = sed CLI_DIR = cmd SHIMV2 = containerd-shim-kata-v2 @@ -308,6 +309,7 @@ USER_VARS += DBSHAREDFS USER_VARS += KATA_INSTALL_GROUP USER_VARS += KATA_INSTALL_OWNER USER_VARS += KATA_INSTALL_CFG_PERMS +USER_VARS += DEFDANCONF SOURCES := \ $(shell find . 2>&1 | grep -E '.*\.rs$$') \ diff --git a/src/runtime-rs/config/configuration-dragonball.toml.in b/src/runtime-rs/config/configuration-dragonball.toml.in index 58e29b9dda..f4b6bcfdbd 100644 --- a/src/runtime-rs/config/configuration-dragonball.toml.in +++ b/src/runtime-rs/config/configuration-dragonball.toml.in @@ -323,3 +323,12 @@ static_sandbox_resource_mgmt=@DEFSTATICRESOURCEMGMT_DB@ # - "/path/to:ro", readonly mode. # - "/path/to:rw", readwrite mode. 
sandbox_bind_mounts=@DEFBINDMOUNTS@ + +# Base directory of directly attachable network config. +# Network devices for VM-based containers are allowed to be placed in the +# host netns to eliminate as many hops as possible, which is what we +# called a "Directly Attachable Network". The config, set by special CNI +# plugins, is used to tell the Kata containers what devices are attached +# to the hypervisor. +# (default: /run/kata-containers/dans) +dan_conf = "@DEFDANCONF@" diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_net.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_net.rs index 2fa825b517..3d19625b9f 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_net.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_net.rs @@ -41,6 +41,12 @@ pub struct NetworkConfig { /// Guest MAC address. pub guest_mac: Option
, + + /// Virtio queue size + pub queue_size: usize, + + /// Virtio queue num + pub queue_num: usize, } #[derive(Clone, Debug, Default)] diff --git a/src/runtime-rs/crates/hypervisor/src/dragonball/inner_device.rs b/src/runtime-rs/crates/hypervisor/src/dragonball/inner_device.rs index fe7186c760..85ac7bd196 100644 --- a/src/runtime-rs/crates/hypervisor/src/dragonball/inner_device.rs +++ b/src/runtime-rs/crates/hypervisor/src/dragonball/inner_device.rs @@ -214,6 +214,8 @@ impl DragonballInner { Some(mac) => MacAddr::from_bytes(&mac.0).ok(), None => None, }, + num_queues: config.queue_num, + queue_size: config.queue_size as u16, ..Default::default() }; diff --git a/src/runtime-rs/crates/resource/src/network/dan.rs b/src/runtime-rs/crates/resource/src/network/dan.rs new file mode 100644 index 0000000000..d59875bca1 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/dan.rs @@ -0,0 +1,406 @@ +// Copyright (c) 2019-2023 Alibaba Cloud +// Copyright (c) 2019-2023 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! Directly Attachable Network (DAN) is a type of network that runs in the host +//! netns. It supports host-tap, vhost-user (DPDK), etc. +//! The device information is retrieved from a JSON file, the type of which is +//! `DanConfig`. +//! In this module, `IPAddress`, `Interface`, etc., are duplicated mostly from +//! `agent::IPAddress`, `agent::Interface`, and so on. They can't be referenced +//! directly because the former represents the structure of the JSON file written +//! by CNI plugins. They might have some slight differences, and may be revised in +//! the future. 
+ +use std::net::IpAddr; +use std::path::PathBuf; +use std::str::FromStr; +use std::sync::Arc; + +use agent::IPFamily; +use anyhow::{anyhow, Context, Result}; +use async_trait::async_trait; +use hypervisor::device::device_manager::DeviceManager; +use hypervisor::Hypervisor; +use kata_types::config::TomlConfig; +use scopeguard::defer; +use serde::{Deserialize, Serialize}; +use tokio::fs; +use tokio::sync::RwLock; + +use super::network_entity::NetworkEntity; +use super::utils::address::{ip_family_from_ip_addr, parse_ip_cidr}; +use super::{EndpointState, NetnsGuard, Network}; +use crate::network::endpoint::TapEndpoint; +use crate::network::network_info::network_info_from_dan::NetworkInfoFromDan; +use crate::network::utils::generate_private_mac_addr; + +/// Directly attachable network +pub struct Dan { + inner: Arc>, +} + +pub struct DanInner { + netns: Option, + entity_list: Vec, +} + +impl Dan { + pub async fn new( + config: &DanNetworkConfig, + dev_mgr: Arc>, + ) -> Result { + Ok(Self { + inner: Arc::new(RwLock::new(DanInner::new(config, &dev_mgr).await?)), + }) + } +} + +impl DanInner { + /// DanInner initialization deserializes DAN devices from a file written + /// by CNI plugins. Respective endpoint and network_info are retrieved + /// from the devices, and compose NetworkEntity. 
+ async fn new(config: &DanNetworkConfig, dev_mgr: &Arc>) -> Result { + let json_str = fs::read_to_string(&config.dan_conf_path) + .await + .context("Read DAN config from file")?; + let config: DanConfig = serde_json::from_str(&json_str).context("Invalid DAN config")?; + info!(sl!(), "Dan config is loaded = {:?}", config); + + let (connection, handle, _) = rtnetlink::new_connection().context("New connection")?; + let thread_handler = tokio::spawn(connection); + defer!({ + thread_handler.abort(); + }); + + let mut entity_list = Vec::with_capacity(config.devices.len()); + for (idx, device) in config.devices.iter().enumerate() { + let name = format!("eth{}", idx); + let endpoint = match &device.device { + // TODO: Support VhostUserNet protocol + Device::VhostUser { + path, + queue_num: _, + queue_size: _, + } => { + warn!(sl!(), "A DAN device whose type is \"vhost-user\" and socket path is {} is ignored.", path); + continue; + } + Device::HostTap { + tap_name, + queue_num, + queue_size, + } => Arc::new( + TapEndpoint::new( + &handle, + idx as u32, + &name, + tap_name, + &device.guest_mac, + *queue_num, + *queue_size, + dev_mgr, + ) + .await + .with_context(|| format!("New a {} tap endpoint", tap_name))?, + ), + }; + + let network_info = Arc::new( + NetworkInfoFromDan::new(device) + .await + .context("Network info from DAN")?, + ); + + entity_list.push(NetworkEntity { + endpoint, + network_info, + }) + } + + Ok(Self { + netns: config.netns, + entity_list, + }) + } +} + +#[async_trait] +impl Network for Dan { + async fn setup(&self) -> Result<()> { + let inner = self.inner.read().await; + let _netns_guard; + if let Some(netns) = inner.netns.as_ref() { + _netns_guard = NetnsGuard::new(netns).context("New netns guard")?; + } + for e in inner.entity_list.iter() { + e.endpoint.attach().await.context("Attach")?; + } + Ok(()) + } + + async fn interfaces(&self) -> Result> { + let inner = self.inner.read().await; + let mut interfaces = vec![]; + for e in 
inner.entity_list.iter() { + interfaces.push(e.network_info.interface().await.context("Interface")?); + } + Ok(interfaces) + } + + async fn routes(&self) -> Result> { + let inner = self.inner.read().await; + let mut routes = vec![]; + for e in inner.entity_list.iter() { + let mut list = e.network_info.routes().await.context("Routes")?; + routes.append(&mut list); + } + Ok(routes) + } + + async fn neighs(&self) -> Result> { + let inner = self.inner.read().await; + let mut neighs = vec![]; + for e in &inner.entity_list { + let mut list = e.network_info.neighs().await.context("Neighs")?; + neighs.append(&mut list); + } + Ok(neighs) + } + + async fn save(&self) -> Option> { + let inner = self.inner.read().await; + let mut ep_states = vec![]; + for e in &inner.entity_list { + if let Some(state) = e.endpoint.save().await { + ep_states.push(state); + } + } + Some(ep_states) + } + + async fn remove(&self, h: &dyn Hypervisor) -> Result<()> { + let inner = self.inner.read().await; + let _netns_guard; + if let Some(netns) = inner.netns.as_ref() { + _netns_guard = NetnsGuard::new(netns).context("New netns guard")?; + } + for e in inner.entity_list.iter() { + e.endpoint.detach(h).await.context("Detach")?; + } + Ok(()) + } +} + +/// Directly attachable network config +#[derive(Debug)] +pub struct DanNetworkConfig { + pub dan_conf_path: PathBuf, +} + +/// Directly attachable network config written by CNI plugins +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct DanConfig { + netns: Option, + devices: Vec, +} + +/// Directly attachable network device +/// This struct is deserialized from a file containing device information, +/// sent from CNI plugins. +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub(crate) struct DanDevice { + // Name of device (interface name on the guest) + pub(crate) name: String, + // Mac address of interface on the guest, if it is not specified, a + // private address is generated as default. 
+ #[serde(default = "generate_private_mac_addr")] + pub(crate) guest_mac: String, + // Device + pub(crate) device: Device, + // Network info + pub(crate) network_info: NetworkInfo, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +#[serde(tag = "type")] +pub(crate) enum Device { + #[serde(rename = "vhost-user")] + VhostUser { + // Vhost-user socket path + path: String, + #[serde(default)] + queue_num: usize, + #[serde(default)] + queue_size: usize, + }, + #[serde(rename = "host-tap")] + HostTap { + tap_name: String, + #[serde(default)] + queue_num: usize, + #[serde(default)] + queue_size: usize, + }, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub(crate) struct NetworkInfo { + pub(crate) interface: Interface, + #[serde(default)] + pub(crate) routes: Vec, + #[serde(default)] + pub(crate) neighbors: Vec, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub(crate) struct Interface { + // IP addresses in the format of CIDR + pub ip_addresses: Vec, + #[serde(default = "default_mtu")] + pub mtu: u64, + #[serde(default)] + // Link type + pub ntype: String, + #[serde(default)] + pub flags: u32, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub(crate) struct Route { + #[serde(default)] + // Destination(CIDR), an empty string denotes no destination + pub dest: String, + #[serde(default)] + // Gateway(IP Address), an empty string denotes no gateway + pub gateway: String, + // Source(IP Address), an empty string denotes no source + #[serde(default)] + pub source: String, + // Scope + #[serde(default)] + pub scope: u32, +} + +impl Route { + pub(crate) fn ip_family(&self) -> Result { + if !self.dest.is_empty() { + return Ok(ip_family_from_ip_addr( + &parse_ip_cidr(&self.dest) + .context("Parse ip addr from dest")? 
+ .0, + )); + } + + if !self.gateway.is_empty() { + return Ok(ip_family_from_ip_addr( + &IpAddr::from_str(&self.gateway).context("Parse ip addr from gateway")?, + )); + } + + if !self.source.is_empty() { + return Ok(ip_family_from_ip_addr( + &IpAddr::from_str(&self.source).context("Parse ip addr from source")?, + )); + } + + Err(anyhow!("Failed to retrieve IP family from {:?}", self)) + } +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub(crate) struct ARPNeighbor { + // IP address in the format of CIDR + pub ip_address: Option, + #[serde(default)] + pub hardware_addr: String, + #[serde(default)] + pub state: u32, + #[serde(default)] + pub flags: u32, +} + +fn default_mtu() -> u64 { + 1500 +} + +/// Path of DAN config, the file contains a JSON-serialized `DanConfig`. +#[inline] +pub fn dan_config_path(config: &TomlConfig, sandbox_id: &str) -> PathBuf { + PathBuf::from(config.runtime.dan_conf.as_str()).join(format!("{}.json", sandbox_id)) +} + +#[cfg(test)] +mod tests { + use crate::network::dan::{ARPNeighbor, DanDevice, Device, Interface, NetworkInfo, Route}; + + #[test] + fn test_dan_json() { + let json_str = r#"{ + "name": "eth0", + "guest_mac": "xx:xx:xx:xx:xx", + "device": { + "type": "vhost-user", + "path": "/tmp/test", + "queue_num": 1, + "queue_size": 1 + }, + "network_info": { + "interface": { + "ip_addresses": ["192.168.0.1/24"], + "mtu": 1500, + "ntype": "tuntap", + "flags": 0 + }, + "routes": [{ + "dest": "172.18.0.0/16", + "source": "172.18.0.1", + "gateway": "172.18.31.1", + "scope": 0, + "flags": 0 + }], + "neighbors": [{ + "ip_address": "192.168.0.3/16", + "device": "", + "state": 0, + "flags": 0, + "hardware_addr": "xx:xx:xx:xx:xx" + }] + } + }"#; + let dev_from_json: DanDevice = serde_json::from_str(json_str).unwrap(); + let dev = DanDevice { + name: "eth0".to_owned(), + guest_mac: "xx:xx:xx:xx:xx".to_owned(), + device: Device::VhostUser { + path: "/tmp/test".to_owned(), + queue_num: 1, + queue_size: 1, + }, + network_info: NetworkInfo 
{ + interface: Interface { + ip_addresses: vec!["192.168.0.1/24".to_owned()], + mtu: 1500, + ntype: "tuntap".to_owned(), + flags: 0, + }, + routes: vec![Route { + dest: "172.18.0.0/16".to_owned(), + source: "172.18.0.1".to_owned(), + gateway: "172.18.31.1".to_owned(), + scope: 0, + }], + neighbors: vec![ARPNeighbor { + ip_address: Some("192.168.0.3/16".to_owned()), + hardware_addr: "xx:xx:xx:xx:xx".to_owned(), + state: 0, + flags: 0, + }], + }, + }; + + assert_eq!(dev_from_json, dev); + } +} diff --git a/src/runtime-rs/crates/resource/src/network/endpoint/endpoint_persist.rs b/src/runtime-rs/crates/resource/src/network/endpoint/endpoint_persist.rs index 1f6fe3c58f..b637b2afe6 100644 --- a/src/runtime-rs/crates/resource/src/network/endpoint/endpoint_persist.rs +++ b/src/runtime-rs/crates/resource/src/network/endpoint/endpoint_persist.rs @@ -39,6 +39,11 @@ pub struct IpVlanEndpointState { pub network_qos: bool, } +#[derive(Serialize, Deserialize, Clone, Default)] +pub struct TapEndpointState { + pub if_name: String, +} + #[derive(Serialize, Deserialize, Clone, Default)] pub struct EndpointState { pub physical_endpoint: Option, @@ -46,5 +51,6 @@ pub struct EndpointState { pub ipvlan_endpoint: Option, pub macvlan_endpoint: Option, pub vlan_endpoint: Option, + pub tap_endpoint: Option, // TODO : other endpoint } diff --git a/src/runtime-rs/crates/resource/src/network/endpoint/mod.rs b/src/runtime-rs/crates/resource/src/network/endpoint/mod.rs index 2efd0bc349..1c15f67e03 100644 --- a/src/runtime-rs/crates/resource/src/network/endpoint/mod.rs +++ b/src/runtime-rs/crates/resource/src/network/endpoint/mod.rs @@ -16,6 +16,8 @@ mod macvlan_endpoint; pub use macvlan_endpoint::MacVlanEndpoint; pub mod endpoint_persist; mod endpoints_test; +mod tap_endpoint; +pub use tap_endpoint::TapEndpoint; use anyhow::Result; use async_trait::async_trait; diff --git a/src/runtime-rs/crates/resource/src/network/endpoint/tap_endpoint.rs 
b/src/runtime-rs/crates/resource/src/network/endpoint/tap_endpoint.rs new file mode 100644 index 0000000000..e22a91d922 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/endpoint/tap_endpoint.rs @@ -0,0 +1,124 @@ +// Copyright (c) 2019-2023 Alibaba Cloud +// Copyright (c) 2019-2023 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::sync::Arc; + +use anyhow::{Context, Result}; +use async_trait::async_trait; +use hypervisor::device::device_manager::{do_handle_device, DeviceManager}; +use hypervisor::device::{DeviceConfig, DeviceType}; +use hypervisor::{Hypervisor, NetworkConfig, NetworkDevice}; +use tokio::sync::RwLock; + +use super::endpoint_persist::TapEndpointState; +use super::Endpoint; +use crate::network::network_pair::{get_link_by_name, NetworkInterface}; +use crate::network::{utils, EndpointState}; + +/// TapEndpoint is used to attach to the hypervisor directly +#[derive(Debug)] +pub struct TapEndpoint { + // Index + #[allow(dead_code)] + index: u32, + // Name of virt interface + name: String, + // Hardware address of virt interface + guest_mac: String, + // Tap interface on the host + tap_iface: NetworkInterface, + // Device manager + dev_mgr: Arc>, + // Virtio queue num + queue_num: usize, + // Virtio queue size + queue_size: usize, +} + +impl TapEndpoint { + #[allow(clippy::too_many_arguments)] + pub async fn new( + handle: &rtnetlink::Handle, + index: u32, + name: &str, + tap_name: &str, + guest_mac: &str, + queue_num: usize, + queue_size: usize, + dev_mgr: &Arc>, + ) -> Result { + let tap_link = get_link_by_name(handle, tap_name) + .await + .context("get link by name")?; + let tap_hard_addr = + utils::get_mac_addr(&tap_link.attrs().hardware_addr).context("Get mac addr of tap")?; + + Ok(TapEndpoint { + index, + name: name.to_owned(), + guest_mac: guest_mac.to_owned(), + tap_iface: NetworkInterface { + name: tap_name.to_owned(), + hard_addr: tap_hard_addr, + ..Default::default() + }, + dev_mgr: dev_mgr.clone(), + queue_num, 
+ queue_size, + }) + } + + fn get_network_config(&self) -> Result { + let guest_mac = utils::parse_mac(&self.guest_mac).context("Parse mac address")?; + Ok(NetworkConfig { + host_dev_name: self.tap_iface.name.clone(), + virt_iface_name: self.name.clone(), + guest_mac: Some(guest_mac), + queue_num: self.queue_num, + queue_size: self.queue_size, + ..Default::default() + }) + } +} + +#[async_trait] +impl Endpoint for TapEndpoint { + async fn name(&self) -> String { + self.name.clone() + } + + async fn hardware_addr(&self) -> String { + self.guest_mac.clone() + } + + async fn attach(&self) -> Result<()> { + let config = self.get_network_config().context("Get network config")?; + do_handle_device(&self.dev_mgr, &DeviceConfig::NetworkCfg(config)) + .await + .context("Handle device")?; + Ok(()) + } + + async fn detach(&self, h: &dyn Hypervisor) -> Result<()> { + let config = self.get_network_config().context("Get network config")?; + h.remove_device(DeviceType::Network(NetworkDevice { + config, + ..Default::default() + })) + .await + .context("Remove device")?; + Ok(()) + } + + async fn save(&self) -> Option { + Some(EndpointState { + tap_endpoint: Some(TapEndpointState { + if_name: self.name.clone(), + }), + ..Default::default() + }) + } +} diff --git a/src/runtime-rs/crates/resource/src/network/mod.rs b/src/runtime-rs/crates/resource/src/network/mod.rs index ed9a9e4f98..5a85ee0895 100644 --- a/src/runtime-rs/crates/resource/src/network/mod.rs +++ b/src/runtime-rs/crates/resource/src/network/mod.rs @@ -4,9 +4,11 @@ // SPDX-License-Identifier: Apache-2.0 // -mod endpoint; use std::sync::Arc; +mod dan; +mod endpoint; +pub use dan::{dan_config_path, Dan, DanNetworkConfig}; pub use endpoint::endpoint_persist::EndpointState; pub use endpoint::Endpoint; mod network_entity; @@ -20,9 +22,8 @@ use network_with_netns::NetworkWithNetns; mod network_pair; use network_pair::NetworkPair; mod utils; -pub use utils::netns::{generate_netns_name, NetnsGuard}; - use tokio::sync::RwLock; 
+pub use utils::netns::{generate_netns_name, NetnsGuard}; use anyhow::{Context, Result}; use async_trait::async_trait; @@ -30,7 +31,8 @@ use hypervisor::{device::device_manager::DeviceManager, Hypervisor}; #[derive(Debug)] pub enum NetworkConfig { - NetworkResourceWithNetNs(NetworkWithNetNsConfig), + NetNs(NetworkWithNetNsConfig), + Dan(DanNetworkConfig), } #[async_trait] @@ -48,10 +50,15 @@ pub async fn new( d: Arc>, ) -> Result> { match config { - NetworkConfig::NetworkResourceWithNetNs(c) => Ok(Arc::new( + NetworkConfig::NetNs(c) => Ok(Arc::new( NetworkWithNetns::new(c, d) .await .context("new network with netns")?, )), + NetworkConfig::Dan(c) => Ok(Arc::new( + Dan::new(c, d) + .await + .context("New directly attachable network")?, + )), } } diff --git a/src/runtime-rs/crates/resource/src/network/network_info/mod.rs b/src/runtime-rs/crates/resource/src/network/network_info/mod.rs index 1500d5179e..a0e896bb3a 100644 --- a/src/runtime-rs/crates/resource/src/network/network_info/mod.rs +++ b/src/runtime-rs/crates/resource/src/network/network_info/mod.rs @@ -4,6 +4,7 @@ // SPDX-License-Identifier: Apache-2.0 // +pub(crate) mod network_info_from_dan; pub(crate) mod network_info_from_link; use agent::{ARPNeighbor, Interface, Route}; diff --git a/src/runtime-rs/crates/resource/src/network/network_info/network_info_from_dan.rs b/src/runtime-rs/crates/resource/src/network/network_info/network_info_from_dan.rs new file mode 100644 index 0000000000..5ca06d340c --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/network_info/network_info_from_dan.rs @@ -0,0 +1,213 @@ +// Copyright (c) 2019-2023 Alibaba Cloud +// Copyright (c) 2019-2023 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use agent::{ARPNeighbor, IPAddress, Interface, Route}; +use anyhow::Result; +use async_trait::async_trait; +use netlink_packet_route::IFF_NOARP; + +use super::NetworkInfo; +use crate::network::dan::DanDevice; +use crate::network::utils::address::{ip_family_from_ip_addr, 
parse_ip_cidr}; + +/// NetworkInfoFromDan is responsible for converting network info in JSON +/// to agent's network info. +#[derive(Debug)] +pub(crate) struct NetworkInfoFromDan { + interface: Interface, + routes: Vec, + neighs: Vec, +} + +impl NetworkInfoFromDan { + pub async fn new(dan_device: &DanDevice) -> Result { + let ip_addresses = dan_device + .network_info + .interface + .ip_addresses + .iter() + .filter_map(|addr| { + let (ipaddr, mask) = match parse_ip_cidr(addr) { + Ok(ip_cidr) => (ip_cidr.0, ip_cidr.1), + Err(_) => return None, + }; + // Skip if it is a loopback address + if ipaddr.is_loopback() { + return None; + } + + Some(IPAddress { + family: ip_family_from_ip_addr(&ipaddr), + address: ipaddr.to_string(), + mask: format!("{}", mask), + }) + }) + .collect(); + + let interface = Interface { + device: dan_device.name.clone(), + name: dan_device.name.clone(), + ip_addresses, + mtu: dan_device.network_info.interface.mtu, + hw_addr: dan_device.guest_mac.clone(), + pci_addr: String::default(), + field_type: dan_device.network_info.interface.ntype.clone(), + raw_flags: dan_device.network_info.interface.flags & IFF_NOARP, + }; + + let routes = dan_device + .network_info + .routes + .iter() + .filter_map(|route| { + let family = match route.ip_family() { + Ok(family) => family, + Err(_) => return None, + }; + Some(Route { + dest: route.dest.clone(), + gateway: route.gateway.clone(), + device: dan_device.name.clone(), + source: route.source.clone(), + scope: route.scope, + family, + }) + }) + .collect(); + + let neighs = dan_device + .network_info + .neighbors + .iter() + .map(|neigh| { + let to_ip_address = neigh.ip_address.as_ref().and_then(|ip_address| { + parse_ip_cidr(ip_address) + .ok() + .map(|(ipaddr, mask)| IPAddress { + family: ip_family_from_ip_addr(&ipaddr), + address: ipaddr.to_string(), + mask: format!("{}", mask), + }) + }); + + ARPNeighbor { + to_ip_address, + device: dan_device.name.clone(), + ll_addr: neigh.hardware_addr.clone(), + state: 
neigh.state as i32, + flags: neigh.flags as i32, + } + }) + .collect(); + + Ok(Self { + interface, + routes, + neighs, + }) + } +} + +#[async_trait] +impl NetworkInfo for NetworkInfoFromDan { + async fn interface(&self) -> Result { + Ok(self.interface.clone()) + } + + async fn routes(&self) -> Result> { + Ok(self.routes.clone()) + } + + async fn neighs(&self) -> Result> { + Ok(self.neighs.clone()) + } +} + +#[cfg(test)] +mod tests { + use agent::{ARPNeighbor, IPAddress, IPFamily, Interface, Route}; + + use super::NetworkInfoFromDan; + use crate::network::dan::{ + ARPNeighbor as DanARPNeighbor, DanDevice, Device, Interface as DanInterface, + NetworkInfo as DanNetworkInfo, Route as DanRoute, + }; + use crate::network::NetworkInfo; + + #[tokio::test] + async fn test_network_info_from_dan() { + let dan_device = DanDevice { + name: "eth0".to_owned(), + guest_mac: "xx:xx:xx:xx:xx".to_owned(), + device: Device::HostTap { + tap_name: "tap0".to_owned(), + queue_num: 0, + queue_size: 0, + }, + network_info: DanNetworkInfo { + interface: DanInterface { + ip_addresses: vec!["192.168.0.1/24".to_owned()], + mtu: 1500, + ntype: "tuntap".to_owned(), + flags: 0, + }, + routes: vec![DanRoute { + dest: "172.18.0.0/16".to_owned(), + source: "172.18.0.1".to_owned(), + gateway: "172.18.31.1".to_owned(), + scope: 0, + }], + neighbors: vec![DanARPNeighbor { + ip_address: Some("192.168.0.3/16".to_owned()), + hardware_addr: "yy:yy:yy:yy:yy".to_owned(), + state: 0, + flags: 0, + }], + }, + }; + + let network_info = NetworkInfoFromDan::new(&dan_device).await.unwrap(); + + let interface = Interface { + device: "eth0".to_owned(), + name: "eth0".to_owned(), + ip_addresses: vec![IPAddress { + family: IPFamily::V4, + address: "192.168.0.1".to_owned(), + mask: "24".to_owned(), + }], + mtu: 1500, + hw_addr: "xx:xx:xx:xx:xx".to_owned(), + pci_addr: String::default(), + field_type: "tuntap".to_owned(), + raw_flags: 0, + }; + assert_eq!(interface, network_info.interface().await.unwrap()); + + let 
routes = vec![Route { + dest: "172.18.0.0/16".to_owned(), + gateway: "172.18.31.1".to_owned(), + device: "eth0".to_owned(), + source: "172.18.0.1".to_owned(), + scope: 0, + family: IPFamily::V4, + }]; + assert_eq!(routes, network_info.routes().await.unwrap()); + + let neighbors = vec![ARPNeighbor { + to_ip_address: Some(IPAddress { + family: IPFamily::V4, + address: "192.168.0.3".to_owned(), + mask: "16".to_owned(), + }), + device: "eth0".to_owned(), + ll_addr: "yy:yy:yy:yy:yy".to_owned(), + state: 0, + flags: 0, + }]; + assert_eq!(neighbors, network_info.neighs().await.unwrap()); + } +} diff --git a/src/runtime-rs/crates/resource/src/network/utils/address.rs b/src/runtime-rs/crates/resource/src/network/utils/address.rs index 3046d3685b..792c4fbb5f 100644 --- a/src/runtime-rs/crates/resource/src/network/utils/address.rs +++ b/src/runtime-rs/crates/resource/src/network/utils/address.rs @@ -4,13 +4,14 @@ // SPDX-License-Identifier: Apache-2.0 // -use std::{ - convert::TryFrom, - net::{IpAddr, Ipv4Addr, Ipv6Addr}, -}; +use std::convert::TryFrom; +use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; +use std::str::FromStr; -use anyhow::{anyhow, Result}; -use netlink_packet_route::{nlas::address::Nla, AddressMessage, AF_INET, AF_INET6}; +use agent::IPFamily; +use anyhow::{anyhow, Context, Result}; +use netlink_packet_route::nlas::address::Nla; +use netlink_packet_route::{AddressMessage, AF_INET, AF_INET6}; #[derive(Debug, PartialEq, Eq, Clone)] pub struct Address { @@ -84,6 +85,41 @@ pub(crate) fn parse_ip(ip: &[u8], family: u8) -> Result { } } +pub(crate) fn parse_ip_cidr(ip: &str) -> Result<(IpAddr, u8)> { + let items: Vec<&str> = ip.split('/').collect(); + if items.len() != 2 { + return Err(anyhow!(format!( + "{} is a bad IP address in format of CIDR", + ip + ))); + } + let ipaddr = IpAddr::from_str(items[0]).context("Parse IP address from string")?; + let mask = u8::from_str(items[1]).context("Parse mask")?; + if ipaddr.is_ipv4() && mask > 32 { + return Err(anyhow!(format!( 
+ "The mask of IPv4 address should be less than or equal to 32, but we got {}.", + mask + ))); + } + if mask > 128 { + return Err(anyhow!(format!( + "The mask should be less than or equal to 128, but we got {}.", + mask + ))); + } + Ok((ipaddr, mask)) +} + +/// Retrieve IP Family defined at agent crate from IpAddr. +#[inline] +pub(crate) fn ip_family_from_ip_addr(ip_addr: &IpAddr) -> IPFamily { + if ip_addr.is_ipv4() { + IPFamily::V4 + } else { + IPFamily::V6 + } +} + #[cfg(test)] mod tests { use super::*; @@ -109,4 +145,28 @@ mod tests { let fail_ipv6 = [1, 2, 3, 4, 5, 6, 7, 8, 2, 3]; assert!(parse_ip(fail_ipv6.as_slice(), AF_INET6 as u8).is_err()); } + + #[test] + fn test_parse_ip_cidr() { + let test_cases = vec![ + ("127.0.0.1/32", ("127.0.0.1", 32u8)), + ("2001:4860:4860::8888/32", ("2001:4860:4860::8888", 32u8)), + ("2001:4860:4860::8888/128", ("2001:4860:4860::8888", 128u8)), + ]; + for tc in test_cases.iter() { + let (ipaddr, mask) = parse_ip_cidr(tc.0).unwrap(); + assert_eq!(ipaddr.to_string(), tc.1 .0); + assert_eq!(mask, tc.1 .1); + } + let test_cases = vec![ + "127.0.0.1/33", + "2001:4860:4860::8888/129", + "2001:4860:4860::8888/300", + "127.0.0.1/33/1", + "127.0.0.1", + ]; + for tc in test_cases.iter() { + assert!(parse_ip_cidr(tc).is_err()); + } + } } diff --git a/src/runtime-rs/crates/resource/src/network/utils/mod.rs b/src/runtime-rs/crates/resource/src/network/utils/mod.rs index 341038cb9f..39a34d6876 100644 --- a/src/runtime-rs/crates/resource/src/network/utils/mod.rs +++ b/src/runtime-rs/crates/resource/src/network/utils/mod.rs @@ -9,6 +9,8 @@ pub(crate) mod link; pub(crate) mod netns; use anyhow::{anyhow, Result}; +use rand::rngs::OsRng; +use rand::RngCore; pub(crate) fn parse_mac(s: &str) -> Option { let v: Vec<_> = s.split(':').collect(); @@ -34,6 +36,17 @@ pub(crate) fn get_mac_addr(b: &[u8]) -> Result { } } +/// Generate a private mac address. 
+/// The range of private mac addresses is +/// x2-xx-xx-xx-xx-xx, x6-xx-xx-xx-xx-xx, xA-xx-xx-xx-xx-xx, xE-xx-xx-xx-xx-xx. +pub(crate) fn generate_private_mac_addr() -> String { + let mut addr: [u8; 6] = [0, 0, 0, 0, 0, 0]; + OsRng.fill_bytes(&mut addr); + addr[0] = (addr[0] | 2) & 0xfe; + // This is a safe unwrap since the len of addr is 6 + get_mac_addr(&addr).unwrap() +} + #[cfg(test)] mod tests { use super::*; @@ -63,4 +76,14 @@ mod tests { assert!(addr.is_some()); assert_eq!(expected_addr.0, addr.unwrap().0); } + + #[test] + fn test_generate_private_mac_addr() { + let addr1 = generate_private_mac_addr(); + let addr2 = generate_private_mac_addr(); + assert_ne!(addr1, addr2); + let ch1 = addr1.chars().nth(1).unwrap(); + let is_private = ch1 == '2' || ch1 == '6' || ch1 == 'a' || ch1 == 'e'; + assert!(is_private) + } } diff --git a/src/runtime-rs/crates/runtimes/src/manager.rs b/src/runtime-rs/crates/runtimes/src/manager.rs index 1244b6835a..a5af2a3fdf 100644 --- a/src/runtime-rs/crates/runtimes/src/manager.rs +++ b/src/runtime-rs/crates/runtimes/src/manager.rs @@ -21,7 +21,10 @@ use kata_types::{ use linux_container::LinuxContainer; use netns_rs::NetNs; use persist::sandbox_persist::Persist; -use resource::{cpu_mem::initial_size::InitialSizeManager, network::generate_netns_name}; +use resource::{ + cpu_mem::initial_size::InitialSizeManager, + network::{dan_config_path, generate_netns_name}, +}; use shim_interface::shim_mgmt::ERR_NO_SHIM_SERVER; use tokio::fs; use tokio::sync::{mpsc::Sender, Mutex, RwLock}; @@ -146,10 +149,14 @@ impl RuntimeHandlerManagerInner { let config = load_config(spec, options).context("load config")?; + let dan_path = dan_config_path(&config, &self.id); let mut network_created = false; // set netns to None if we want no network for the VM let netns = if config.runtime.disable_new_netns { None + } else if dan_path.exists() { + info!(sl!(), "Do not create a netns due to DAN"); + None } else { let mut netns_path = None; if let Some(linux) 
= &spec.linux { diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs index 9ded5033cf..6ede6fb2cf 100644 --- a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs +++ b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs @@ -6,30 +6,25 @@ use std::sync::Arc; -use agent::{ - self, kata::KataAgent, types::KernelModule, Agent, GetIPTablesRequest, SetIPTablesRequest, - VolumeStatsRequest, -}; +use agent::kata::KataAgent; +use agent::types::KernelModule; +use agent::{self, Agent, GetIPTablesRequest, SetIPTablesRequest, VolumeStatsRequest}; use anyhow::{anyhow, Context, Result}; use async_trait::async_trait; -use common::{ - message::{Action, Message}, - Sandbox, SandboxNetworkEnv, -}; +use common::message::{Action, Message}; +use common::{Sandbox, SandboxNetworkEnv}; use containerd_shim_protos::events::task::TaskOOM; use hypervisor::{dragonball::Dragonball, BlockConfig, Hypervisor, HYPERVISOR_DRAGONBALL}; use kata_sys_util::hooks::HookStates; use kata_types::config::TomlConfig; -use resource::{ - manager::ManagerArgs, - network::{NetworkConfig, NetworkWithNetNsConfig}, - ResourceConfig, ResourceManager, -}; +use persist::{self, sandbox_persist::Persist}; +use resource::manager::ManagerArgs; +use resource::network::{dan_config_path, DanNetworkConfig, NetworkConfig, NetworkWithNetNsConfig}; +use resource::{ResourceConfig, ResourceManager}; use tokio::sync::{mpsc::Sender, Mutex, RwLock}; use tracing::instrument; use crate::health_check::HealthCheck; -use persist::{self, sandbox_persist::Persist}; pub(crate) const VIRTCONTAINER: &str = "virt_container"; pub struct SandboxRestoreArgs { @@ -101,19 +96,15 @@ impl VirtSandbox { #[instrument] async fn prepare_for_start_sandbox( &self, - _id: &str, + id: &str, network_env: SandboxNetworkEnv, ) -> Result> { let mut resource_configs = vec![]; // prepare network config if !network_env.network_created { - if let Some(netns_path) = 
network_env.netns { - let network_config = ResourceConfig::Network( - self.prepare_network_config(netns_path, network_env.network_created) - .await, - ); - resource_configs.push(network_config); + if let Some(network_resource) = self.prepare_network_resource(&network_env).await { + resource_configs.push(network_resource); } } @@ -133,6 +124,39 @@ impl VirtSandbox { Ok(resource_configs) } + async fn prepare_network_resource( + &self, + network_env: &SandboxNetworkEnv, + ) -> Option { + let config = self.resource_manager.config().await; + let dan_path = dan_config_path(&config, &self.sid); + + // Network priority: DAN > NetNS + if dan_path.exists() { + Some(ResourceConfig::Network(NetworkConfig::Dan( + DanNetworkConfig { + dan_conf_path: dan_path, + }, + ))) + } else if let Some(netns_path) = network_env.netns.as_ref() { + Some(ResourceConfig::Network(NetworkConfig::NetNs( + NetworkWithNetNsConfig { + network_model: config.runtime.internetworking_model.clone(), + netns_path: netns_path.to_owned(), + queues: self + .hypervisor + .hypervisor_config() + .await + .network_info + .network_queues as usize, + network_created: network_env.network_created, + }, + ))) + } else { + None + } + } + async fn execute_oci_hook_functions( &self, prestart_hooks: &[oci::Hook], @@ -166,25 +190,6 @@ impl VirtSandbox { Ok(()) } - async fn prepare_network_config( - &self, - netns_path: String, - network_created: bool, - ) -> NetworkConfig { - let config = self.resource_manager.config().await; - NetworkConfig::NetworkResourceWithNetNs(NetworkWithNetNsConfig { - network_model: config.runtime.internetworking_model.clone(), - netns_path, - queues: self - .hypervisor - .hypervisor_config() - .await - .network_info - .network_queues as usize, - network_created, - }) - } - async fn prepare_rootfs_config(&self) -> Result { let boot_info = self.hypervisor.hypervisor_config().await.boot_info; @@ -270,18 +275,23 @@ impl Sandbox for VirtSandbox { // We need to rescan the netns to handle the change. 
// 2. Do not scan the netns if we want no network for the VM. // TODO In case of vm factory, scan the netns to hotplug interfaces after the VM is started. + let config = self.resource_manager.config().await; if self.has_prestart_hooks(prestart_hooks, create_runtime_hooks) - && !self - .resource_manager - .config() - .await - .runtime - .disable_new_netns + && !config.runtime.disable_new_netns + && !dan_config_path(&config, &self.sid).exists() { if let Some(netns_path) = network_env.netns { - let network_resource = self - .prepare_network_config(netns_path, network_env.network_created) - .await; + let network_resource = NetworkConfig::NetNs(NetworkWithNetNsConfig { + network_model: config.runtime.internetworking_model.clone(), + netns_path: netns_path.to_owned(), + queues: self + .hypervisor + .hypervisor_config() + .await + .network_info + .network_queues as usize, + network_created: network_env.network_created, + }); self.resource_manager .handle_network(network_resource) .await