diff --git a/src/runtime-rs/crates/hypervisor/src/firecracker/fc_api.rs b/src/runtime-rs/crates/hypervisor/src/firecracker/fc_api.rs index 0170bd10ee..7459814b28 100644 --- a/src/runtime-rs/crates/hypervisor/src/firecracker/fc_api.rs +++ b/src/runtime-rs/crates/hypervisor/src/firecracker/fc_api.rs @@ -13,7 +13,7 @@ use crate::{ }; use anyhow::{anyhow, Context, Result}; use dbs_utils::net::MacAddr; -use hyper::{Body, Method, Request, Response}; +use hyper::{Body, Method, Request}; use hyperlocal::Uri; use kata_sys_util::mount; use kata_types::config::hypervisor::RateLimiterConfig; @@ -23,6 +23,16 @@ use tokio::{fs, fs::File}; const REQUEST_RETRY: u32 = 500; const FC_KERNEL: &str = "vmlinux"; + +/// Distinguishes a transient transport error (FC not ready yet, retry allowed) +/// from a permanent HTTP-level API error returned by FC (no retry). +#[derive(Debug)] +enum FcRequestError { + /// Could not reach the FC API socket (connection refused, etc.) + Transport(String), + /// FC returned a non-2xx HTTP status. (status_code, response_body) + Api(u16, String), +} const FC_ROOT_FS: &str = "rootfs"; const DRIVE_PREFIX: &str = "drive"; const DISK_POOL_SIZE: u32 = 6; @@ -215,13 +225,29 @@ impl FcInner { Some(mac) => MacAddr::from_bytes(&mac.0).ok(), None => None, }; + + let rx_rate_limiter = RateLimiterConfig::new( + self.config.network_info.rx_rate_limiter_max_rate, + 0, + None, + None, + ); + let tx_rate_limiter = RateLimiterConfig::new( + self.config.network_info.tx_rate_limiter_max_rate, + 0, + None, + None, + ); + let body: String = json!({ "iface_id": &device_id, "guest_mac": g_mac, - "host_dev_name": &config.host_dev_name - + "host_dev_name": &config.host_dev_name, + "rx_rate_limiter": rx_rate_limiter, + "tx_rate_limiter": tx_rate_limiter, }) .to_string(); + info!(sl(), "FC: add network device: iface_id={} guest_mac={:?} host_dev_name={}", device_id, g_mac, config.host_dev_name); self.request_with_retry( Method::PUT, &["/network-interfaces/", &device_id].concat(), @@ -259,50 +285,54 @@ impl FcInner { .body(Body::from(data.clone()))?; match self.send_request(req).await { - Ok(resp) => { - debug!(sl(), "Request sent, resp: {:?}", resp); - return Ok(()); - } - Err(resp) => { - debug!(sl(), "Request sent with error, resp: {:?}", resp); + Ok(_) => return Ok(()), + // A transport error (FC not ready yet) — retry. + Err(FcRequestError::Transport(e)) => { + debug!(sl(), "FC not reachable yet, retrying: {:?}", e); std::thread::sleep(std::time::Duration::from_millis(10)); continue; } + // An HTTP-level error from FC — fail immediately with the + // actual error body so the problem is visible in logs. + Err(FcRequestError::Api(status, body)) => { + return Err(anyhow::anyhow!( + "FC API error: status={} body={}", + status, + body + )); + } } } Err(anyhow::anyhow!( - "After {} attempts, it still doesn't work.", - REQUEST_RETRY + "FC not reachable after {} attempts (method={:?} uri={:?})", + REQUEST_RETRY, + method, + uri, )) } - pub(crate) async fn send_request(&self, req: Request) -> Result> { - let resp = self.client.request(req).await?; + async fn send_request(&self, req: Request) -> Result<(), FcRequestError> { + let resp = self + .client + .request(req) + .await + .map_err(|e| FcRequestError::Transport(e.to_string()))?; let status = resp.status(); - debug!(sl(), "Request RESPONSE {:?} {:?}", &status, resp); if status.is_success() { - return Ok(resp); - } else { - let body = hyper::body::to_bytes(resp.into_body()).await?; - if body.is_empty() { - debug!(sl(), "Request FAILED WITH STATUS: {:?}", status); - None - } else { - let body = String::from_utf8_lossy(&body).into_owned(); - debug!( - sl(), - "Request FAILED WITH STATUS: {:?} and BODY: {:?}", status, body - ); - Some(body) - }; + debug!(sl(), "FC request succeeded: {:?}", status); + return Ok(()); } - Err(anyhow::anyhow!( - "After {} attempts, it - still doesn't work.", - REQUEST_RETRY - )) + let body = hyper::body::to_bytes(resp.into_body()) + .await + .map(|b| String::from_utf8_lossy(&b).into_owned()) + .unwrap_or_default(); + error!( + sl(), + "FC API rejected request: status={:?} body={:?}", status, body + ); + Err(FcRequestError::Api(status.as_u16(), body)) } pub(crate) fn cleanup_resource(&self) { if self.jailed { diff --git a/src/runtime-rs/crates/hypervisor/src/firecracker/inner.rs b/src/runtime-rs/crates/hypervisor/src/firecracker/inner.rs index cc6d44fbd6..9cf888cc80 100644 --- a/src/runtime-rs/crates/hypervisor/src/firecracker/inner.rs +++ b/src/runtime-rs/crates/hypervisor/src/firecracker/inner.rs @@ -4,6 +4,7 @@ //SPDX-License-Identifier: Apache-2.0 use crate::firecracker::{inner_hypervisor::FC_API_SOCKET_NAME, sl}; +use crate::device::driver::NetworkConfig; use crate::MemoryConfig; use crate::HYPERVISOR_FIRECRACKER; use crate::{device::DeviceType, VmmState}; @@ -43,6 +44,9 @@ pub struct FcInner { pub(crate) jailed: bool, pub(crate) run_dir: String, pub(crate) pending_devices: Vec, + /// Network devices buffered until start_vm() so they are always sent to FC + /// before InstanceStart, mirroring the Go runtime's batch-configuration approach. + pub(crate) pending_net_devices: Vec<(NetworkConfig, String)>, pub(crate) capabilities: Capabilities, pub(crate) fc_process: Mutex>, pub(crate) exit_notify: Option>, @@ -51,7 +55,9 @@ pub struct FcInner { impl FcInner { pub fn new(exit_notify: mpsc::Sender<()>) -> FcInner { let mut capabilities = Capabilities::new(); - capabilities.set(CapabilityBits::BlockDeviceSupport); + capabilities.set( + CapabilityBits::BlockDeviceSupport | CapabilityBits::HybridVsockSupport, + ); FcInner { id: String::default(), @@ -66,6 +72,7 @@ impl FcInner { jailed: false, run_dir: String::default(), pending_devices: vec![], + pending_net_devices: vec![], capabilities, fc_process: Mutex::new(None), exit_notify: Some(exit_notify), @@ -80,7 +87,7 @@ impl FcInner { debug!(sl(), "Running Jailed"); cmd = Command::new(&self.config.jailer_path); let api_socket = ["/run/", FC_API_SOCKET_NAME].join("/"); - let args = [ + let mut args = vec![ "--id", &self.id, "--gid", @@ -91,11 +98,16 @@ impl FcInner { &self.config.path, "--chroot-base-dir", &self.jailer_root, - "--", - "--api-sock", - &api_socket, ]; - cmd.args(args); + // Pass the network namespace to the jailer so that the FC process + // is placed in the correct netns. This is the recommended approach + // over relying on pre_exec setns inheritance. + let netns_path = netns.clone().unwrap_or_default(); + if !netns_path.is_empty() { + args.extend_from_slice(&["--netns", &netns_path]); + } + args.extend_from_slice(&["--", "--api-sock", &api_socket]); + cmd.args(&args); } false => { debug!(sl(), "Running non-Jailed"); @@ -108,15 +120,20 @@ impl FcInner { } debug!(sl(), "Exec: {:?}", cmd); - // Make sure we're in the correct Network Namespace + // For the non-jailed case, enter the network namespace via pre_exec so that + // the FC process inherits it. For the jailed case, --netns is passed to the + // jailer above and pre_exec setns is skipped. + let jailed = self.jailed; unsafe { let selinux_label = self.config.security_info.selinux_label.clone(); let _pre = cmd.pre_exec(move || { - if let Some(netns_path) = &netns { - debug!(sl(), "set netns for vmm master {:?}", &netns_path); - let netns_fd = std::fs::File::open(netns_path); - let _ = setns(netns_fd?.as_raw_fd(), CloneFlags::CLONE_NEWNET) - .context("set netns failed"); + if !jailed { + if let Some(netns_path) = &netns { + debug!(sl(), "set netns for vmm master {:?}", &netns_path); + let netns_fd = std::fs::File::open(netns_path); + let _ = setns(netns_fd?.as_raw_fd(), CloneFlags::CLONE_NEWNET) + .context("set netns failed"); + } } if let Some(label) = selinux_label.as_ref() { if let Err(e) = selinux::set_exec_label(label) { @@ -256,6 +273,7 @@ impl Persist for FcInner { jailer_root: hypervisor_state.jailer_root, client: Client::unix(), pending_devices: vec![], + pending_net_devices: vec![], run_dir: hypervisor_state.run_dir, capabilities: Capabilities::new(), fc_process: Mutex::new(None), diff --git a/src/runtime-rs/crates/hypervisor/src/firecracker/inner_device.rs b/src/runtime-rs/crates/hypervisor/src/firecracker/inner_device.rs index 6e46ccee48..b7b9ed7bf5 100644 --- a/src/runtime-rs/crates/hypervisor/src/firecracker/inner_device.rs +++ b/src/runtime-rs/crates/hypervisor/src/firecracker/inner_device.rs @@ -31,10 +31,29 @@ impl FcInner { .hotplug_block_device(block.config.path_on_host.as_str(), block.config.index) .await .context("add block device"), - DeviceType::Network(network) => self - .add_net_device(&network.config, network.device_id) - .await - .context("add net device"), + DeviceType::Network(network) => { + // Buffer network devices and send them to FC just before InstanceStart + // in start_vm(). Firecracker rejects PUT /network-interfaces after the + // VM has started, so we must ensure they arrive before InstanceStart. + // This mirrors the Go runtime's batch-configuration approach. + // + // If the VM is already running (e.g. a post-start prestart-hooks rescan + // called add_device again), we cannot do anything useful — FC has already + // started and does not support network-interface hotplug. Log a warning + // and return Ok so the rest of the setup path can continue. + if self.state == VmmState::VmRunning { + warn!( + sl(), + "FC: ignoring late network device add for iface {} — VM already running, hotplug not supported", + network.device_id + ); + return Ok(()); + } + debug!(sl(), "buffering network device for pre-start flush"); + self.pending_net_devices + .push((network.config, network.device_id)); + Ok(()) + } DeviceType::HybridVsock(hvsock) => { self.add_hvsock(&hvsock.config).await.context("add vsock") } diff --git a/src/runtime-rs/crates/hypervisor/src/firecracker/inner_hypervisor.rs b/src/runtime-rs/crates/hypervisor/src/firecracker/inner_hypervisor.rs index de05ab16e1..aff7152005 100644 --- a/src/runtime-rs/crates/hypervisor/src/firecracker/inner_hypervisor.rs +++ b/src/runtime-rs/crates/hypervisor/src/firecracker/inner_hypervisor.rs @@ -77,6 +77,17 @@ impl FcInner { pub(crate) async fn start_vm(&mut self, _timeout: i32) -> Result<()> { debug!(sl(), "Starting sandbox"); + + // Flush all buffered network devices before sending InstanceStart. + // FC rejects PUT /network-interfaces once the VM is running, so network + // interfaces must be configured here, immediately before the start action. + let net_devices = std::mem::take(&mut self.pending_net_devices); + for (config, device_id) in net_devices { + self.add_net_device(&config, device_id) + .await + .context("configure network interface before start")?; + } + let body: String = serde_json::json!({ "action_type": "InstanceStart" }) diff --git a/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs b/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs index c5b8ac2629..e43acdeae2 100644 --- a/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs +++ b/src/runtime-rs/crates/hypervisor/src/qemu/inner.rs @@ -691,14 +691,12 @@ impl QemuInner { let is_unaligned = !new_hotplugged_mem.is_multiple_of(guest_mem_block_size); if is_unaligned { - new_hotplugged_mem = ch_config::convert::checked_next_multiple_of( - new_hotplugged_mem, - guest_mem_block_size, - ) - .ok_or(anyhow!(format!( - "alignment of {} B to the block size of {} B failed", - new_hotplugged_mem, guest_mem_block_size - )))? + new_hotplugged_mem = new_hotplugged_mem + .checked_next_multiple_of(guest_mem_block_size) + .ok_or(anyhow!(format!( + "alignment of {} B to the block size of {} B failed", + new_hotplugged_mem, guest_mem_block_size + )))? } let new_hotplugged_mem = new_hotplugged_mem; diff --git a/src/runtime-rs/crates/resource/src/manager_inner.rs b/src/runtime-rs/crates/resource/src/manager_inner.rs index ca99987916..0078c5766f 100644 --- a/src/runtime-rs/crates/resource/src/manager_inner.rs +++ b/src/runtime-rs/crates/resource/src/manager_inner.rs @@ -249,13 +249,48 @@ impl ResourceManagerInner { } async fn handle_interfaces(&self, network: &dyn Network) -> Result<()> { + // The guest virtio-net device may not be visible to the kernel immediately + // after InstanceStart completes. Retry on "Link not found" to allow time + // for virtio-net driver initialisation in the guest. + // Use a generous window (100 × 100 ms = 10 s) since on some systems + // virtio-net initialisation is slower than the Go runtime's 20 × 20 ms. + const MAX_ATTEMPTS: u32 = 100; + const RETRY_DELAY_MS: u64 = 100; + for i in network.interfaces().await.context("get interfaces")? { - // update interface - info!(sl!(), "update interface {:?}", i); - self.agent - .update_interface(agent::UpdateInterfaceRequest { interface: Some(i) }) - .await - .context("update interface")?; + info!(sl!(), "update interface: hw_addr={} name={}", i.hw_addr, i.name); + let mut last_err = None; + for attempt in 0..MAX_ATTEMPTS { + let result = self + .agent + .update_interface(agent::UpdateInterfaceRequest { + interface: Some(i.clone()), + }) + .await; + if let Err(e) = result { + let msg = e.to_string(); + if msg.contains("Link not found") { + info!( + sl!(), + "update interface: link not found (attempt {}/{}), retrying in {}ms", + attempt + 1, + MAX_ATTEMPTS, + RETRY_DELAY_MS, + ); + last_err = Some(e); + tokio::time::sleep(std::time::Duration::from_millis(RETRY_DELAY_MS)) + .await; + } else { + return Err(e).context("update interface"); + } + } else { + last_err = None; + break; + } + } + if let Some(e) = last_err { + return Err(e).context("update interface"); + } } Ok(()) diff --git a/src/runtime-rs/crates/resource/src/network/network_pair.rs b/src/runtime-rs/crates/resource/src/network/network_pair.rs index 8be2d392b7..fd45043692 100644 --- a/src/runtime-rs/crates/resource/src/network/network_pair.rs +++ b/src/runtime-rs/crates/resource/src/network/network_pair.rs @@ -49,7 +49,13 @@ impl NetworkPair { let unique_id = kata_sys_util::rand::UUID::new(); let model = network_model::new(model).context("new network model")?; let tap_iface_name = format!("tap{idx}{TAP_SUFFIX}"); - let virt_iface_name = format!("eth{idx}"); + // Use the actual interface name from the netns scan. Fall back to eth{idx} + // only if the caller passed an empty name. + let virt_iface_name = if name.is_empty() { + format!("eth{idx}") + } else { + name.to_string() + }; let tap_link = create_link(handle, &tap_iface_name, queues) .await .context("create link")?; @@ -106,7 +112,7 @@ impl NetworkPair { .await .context("set link up")?; - let mut net_pair = NetworkPair { + let net_pair = NetworkPair { tap: TapInterface { id: String::from(&unique_id), name: format!("br{idx}{TAP_SUFFIX}"), @@ -125,10 +131,6 @@ impl NetworkPair { network_qos: false, }; - if !name.is_empty() { - net_pair.virt_iface.name = String::from(name); - } - Ok(net_pair) } diff --git a/src/runtime-rs/crates/resource/src/network/network_with_netns.rs b/src/runtime-rs/crates/resource/src/network/network_with_netns.rs index a872cf97f2..849636bc58 100644 --- a/src/runtime-rs/crates/resource/src/network/network_with_netns.rs +++ b/src/runtime-rs/crates/resource/src/network/network_with_netns.rs @@ -140,21 +140,22 @@ impl Network for NetworkWithNetns { async fn remove(&self, h: &dyn Hypervisor) -> Result<()> { let inner = self.inner.read().await; - // The network namespace would have been deleted at this point - // if it has not been created by virtcontainers. - if !inner.network_created { - return Ok(()); - } - { + // Always clean up endpoint resources (TC filter rules, TAP devices) regardless + // of who created the network namespace. + if !inner.netns_path.is_empty() { let _netns_guard = netns::NetnsGuard::new(&inner.netns_path).context("net netns guard")?; for e in &inner.entity_list { e.endpoint.detach(h).await.context("detach")?; } } - let netns = get_from_path(inner.netns_path.clone())?; - netns.remove()?; - fs::remove_dir_all(inner.netns_path.clone()).context("failed to remove netns path")?; + // Only remove the network namespace itself if virtcontainers created it. + if inner.network_created { + let netns = get_from_path(inner.netns_path.clone())?; + netns.remove()?; + fs::remove_dir_all(inner.netns_path.clone()) + .context("failed to remove netns path")?; + } Ok(()) } } diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs index 232c054780..592a033c13 100644 --- a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs +++ b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs @@ -611,11 +611,14 @@ impl Sandbox for VirtSandbox { .await .context("set up device before start vm")?; - // start vm - self.hypervisor.start_vm(10_000).await.context("start vm")?; - info!(sl!(), "start vm"); - // execute pre-start hook functions, including Prestart Hooks and CreateRuntime Hooks + // + // These must run BEFORE start_vm so that: + // (a) createRuntime hooks (e.g. nerdctl's CNI hook) can create the veth pair + // in the container netns while the VMM process (already started by + // prepare_vm and placed in the netns) is still pre-InstanceStart, and + // (b) hypervisors that do not support network-interface hotplug (e.g. + // Firecracker) can configure the interface before InstanceStart. let (prestart_hooks, create_runtime_hooks) = if let Some(hooks) = sandbox_config.hooks.as_ref() { ( @@ -633,12 +636,15 @@ impl Sandbox for VirtSandbox { ) .await?; - // 1. if there are pre-start hook functions, network config might have been changed. - // We need to rescan the netns to handle the change. - // 2. Do not scan the netns if we want no network for the VM. - // TODO In case of vm factory, scan the netns to hotplug interfaces after the VM is started. + // Rescan the netns and update the network configuration before start_vm: + // 1. When network_created==true the veth is set up by the createRuntime hook + // above; we must scan now so the network device lands in pending_net_devices + // before InstanceStart (required for FC which has no hotplug). + // 2. When there are pre-start hooks the network config may have changed. + // 3. Do not scan if disable_new_netns is set. let config = self.resource_manager.config().await; - if self.has_prestart_hooks(&prestart_hooks, &create_runtime_hooks) + if (sandbox_config.network_env.network_created + || self.has_prestart_hooks(&prestart_hooks, &create_runtime_hooks)) && !config.runtime.disable_new_netns && !dan_config_path(&config, &self.sid).exists() { @@ -657,10 +663,14 @@ impl Sandbox for VirtSandbox { self.resource_manager .handle_network(network_resource) .await - .context("set up device after start vm")?; + .context("set up network before start vm")?; } } + // start vm + self.hypervisor.start_vm(10_000).await.context("start vm")?; + info!(sl!(), "start vm"); + // connect agent // set agent socket let address = self