runtime-rs: Fix FC spawn with network

Refactor network link detection so that FC can boot properly.

Signed-off-by: Anastassios Nanos <ananos@nubificus.co.uk>
This commit is contained in:
Anastassios Nanos
2026-04-01 19:42:11 +00:00
parent ccfdf5e11b
commit cb5ba485a2
9 changed files with 212 additions and 88 deletions

View File

@@ -13,7 +13,7 @@ use crate::{
};
use anyhow::{anyhow, Context, Result};
use dbs_utils::net::MacAddr;
use hyper::{Body, Method, Request, Response};
use hyper::{Body, Method, Request};
use hyperlocal::Uri;
use kata_sys_util::mount;
use kata_types::config::hypervisor::RateLimiterConfig;
@@ -23,6 +23,16 @@ use tokio::{fs, fs::File};
const REQUEST_RETRY: u32 = 500;
const FC_KERNEL: &str = "vmlinux";
/// Distinguishes a transient transport error (FC not ready yet, retry allowed)
/// from a permanent HTTP-level API error returned by FC (no retry).
#[derive(Debug)]
enum FcRequestError {
/// Could not reach the FC API socket (connection refused, etc.)
Transport(String),
/// FC returned a non-2xx HTTP status. (status_code, response_body)
Api(u16, String),
}
const FC_ROOT_FS: &str = "rootfs";
const DRIVE_PREFIX: &str = "drive";
const DISK_POOL_SIZE: u32 = 6;
@@ -215,13 +225,29 @@ impl FcInner {
Some(mac) => MacAddr::from_bytes(&mac.0).ok(),
None => None,
};
let rx_rate_limiter = RateLimiterConfig::new(
self.config.network_info.rx_rate_limiter_max_rate,
0,
None,
None,
);
let tx_rate_limiter = RateLimiterConfig::new(
self.config.network_info.tx_rate_limiter_max_rate,
0,
None,
None,
);
let body: String = json!({
"iface_id": &device_id,
"guest_mac": g_mac,
"host_dev_name": &config.host_dev_name
"host_dev_name": &config.host_dev_name,
"rx_rate_limiter": rx_rate_limiter,
"tx_rate_limiter": tx_rate_limiter,
})
.to_string();
info!(sl(), "FC: add network device: iface_id={} guest_mac={:?} host_dev_name={}", device_id, g_mac, config.host_dev_name);
self.request_with_retry(
Method::PUT,
&["/network-interfaces/", &device_id].concat(),
@@ -259,50 +285,54 @@ impl FcInner {
.body(Body::from(data.clone()))?;
match self.send_request(req).await {
Ok(resp) => {
debug!(sl(), "Request sent, resp: {:?}", resp);
return Ok(());
}
Err(resp) => {
debug!(sl(), "Request sent with error, resp: {:?}", resp);
Ok(_) => return Ok(()),
// A transport error (FC not ready yet) — retry.
Err(FcRequestError::Transport(e)) => {
debug!(sl(), "FC not reachable yet, retrying: {:?}", e);
std::thread::sleep(std::time::Duration::from_millis(10));
continue;
}
// An HTTP-level error from FC — fail immediately with the
// actual error body so the problem is visible in logs.
Err(FcRequestError::Api(status, body)) => {
return Err(anyhow::anyhow!(
"FC API error: status={} body={}",
status,
body
));
}
}
}
Err(anyhow::anyhow!(
"After {} attempts, it still doesn't work.",
REQUEST_RETRY
"FC not reachable after {} attempts (method={:?} uri={:?})",
REQUEST_RETRY,
method,
uri,
))
}
pub(crate) async fn send_request(&self, req: Request<Body>) -> Result<Response<Body>> {
let resp = self.client.request(req).await?;
async fn send_request(&self, req: Request<Body>) -> Result<(), FcRequestError> {
let resp = self
.client
.request(req)
.await
.map_err(|e| FcRequestError::Transport(e.to_string()))?;
let status = resp.status();
debug!(sl(), "Request RESPONSE {:?} {:?}", &status, resp);
if status.is_success() {
return Ok(resp);
} else {
let body = hyper::body::to_bytes(resp.into_body()).await?;
if body.is_empty() {
debug!(sl(), "Request FAILED WITH STATUS: {:?}", status);
None
} else {
let body = String::from_utf8_lossy(&body).into_owned();
debug!(
sl(),
"Request FAILED WITH STATUS: {:?} and BODY: {:?}", status, body
);
Some(body)
};
debug!(sl(), "FC request succeeded: {:?}", status);
return Ok(());
}
Err(anyhow::anyhow!(
"After {} attempts, it
still doesn't work.",
REQUEST_RETRY
))
let body = hyper::body::to_bytes(resp.into_body())
.await
.map(|b| String::from_utf8_lossy(&b).into_owned())
.unwrap_or_default();
error!(
sl(),
"FC API rejected request: status={:?} body={:?}", status, body
);
Err(FcRequestError::Api(status.as_u16(), body))
}
pub(crate) fn cleanup_resource(&self) {
if self.jailed {

View File

@@ -4,6 +4,7 @@
//SPDX-License-Identifier: Apache-2.0
use crate::firecracker::{inner_hypervisor::FC_API_SOCKET_NAME, sl};
use crate::device::driver::NetworkConfig;
use crate::MemoryConfig;
use crate::HYPERVISOR_FIRECRACKER;
use crate::{device::DeviceType, VmmState};
@@ -43,6 +44,9 @@ pub struct FcInner {
pub(crate) jailed: bool,
pub(crate) run_dir: String,
pub(crate) pending_devices: Vec<DeviceType>,
/// Network devices buffered until start_vm() so they are always sent to FC
/// before InstanceStart, mirroring the Go runtime's batch-configuration approach.
pub(crate) pending_net_devices: Vec<(NetworkConfig, String)>,
pub(crate) capabilities: Capabilities,
pub(crate) fc_process: Mutex<Option<Child>>,
pub(crate) exit_notify: Option<mpsc::Sender<()>>,
@@ -51,7 +55,9 @@ pub struct FcInner {
impl FcInner {
pub fn new(exit_notify: mpsc::Sender<()>) -> FcInner {
let mut capabilities = Capabilities::new();
capabilities.set(CapabilityBits::BlockDeviceSupport);
capabilities.set(
CapabilityBits::BlockDeviceSupport | CapabilityBits::HybridVsockSupport,
);
FcInner {
id: String::default(),
@@ -66,6 +72,7 @@ impl FcInner {
jailed: false,
run_dir: String::default(),
pending_devices: vec![],
pending_net_devices: vec![],
capabilities,
fc_process: Mutex::new(None),
exit_notify: Some(exit_notify),
@@ -80,7 +87,7 @@ impl FcInner {
debug!(sl(), "Running Jailed");
cmd = Command::new(&self.config.jailer_path);
let api_socket = ["/run/", FC_API_SOCKET_NAME].join("/");
let args = [
let mut args = vec![
"--id",
&self.id,
"--gid",
@@ -91,11 +98,16 @@ impl FcInner {
&self.config.path,
"--chroot-base-dir",
&self.jailer_root,
"--",
"--api-sock",
&api_socket,
];
cmd.args(args);
// Pass the network namespace to the jailer so that the FC process
// is placed in the correct netns. This is the recommended approach
// over relying on pre_exec setns inheritance.
let netns_path = netns.clone().unwrap_or_default();
if !netns_path.is_empty() {
args.extend_from_slice(&["--netns", &netns_path]);
}
args.extend_from_slice(&["--", "--api-sock", &api_socket]);
cmd.args(&args);
}
false => {
debug!(sl(), "Running non-Jailed");
@@ -108,15 +120,20 @@ impl FcInner {
}
debug!(sl(), "Exec: {:?}", cmd);
// Make sure we're in the correct Network Namespace
// For the non-jailed case, enter the network namespace via pre_exec so that
// the FC process inherits it. For the jailed case, --netns is passed to the
// jailer above and pre_exec setns is skipped.
let jailed = self.jailed;
unsafe {
let selinux_label = self.config.security_info.selinux_label.clone();
let _pre = cmd.pre_exec(move || {
if let Some(netns_path) = &netns {
debug!(sl(), "set netns for vmm master {:?}", &netns_path);
let netns_fd = std::fs::File::open(netns_path);
let _ = setns(netns_fd?.as_raw_fd(), CloneFlags::CLONE_NEWNET)
.context("set netns failed");
if !jailed {
if let Some(netns_path) = &netns {
debug!(sl(), "set netns for vmm master {:?}", &netns_path);
let netns_fd = std::fs::File::open(netns_path);
let _ = setns(netns_fd?.as_raw_fd(), CloneFlags::CLONE_NEWNET)
.context("set netns failed");
}
}
if let Some(label) = selinux_label.as_ref() {
if let Err(e) = selinux::set_exec_label(label) {
@@ -256,6 +273,7 @@ impl Persist for FcInner {
jailer_root: hypervisor_state.jailer_root,
client: Client::unix(),
pending_devices: vec![],
pending_net_devices: vec![],
run_dir: hypervisor_state.run_dir,
capabilities: Capabilities::new(),
fc_process: Mutex::new(None),

View File

@@ -31,10 +31,29 @@ impl FcInner {
.hotplug_block_device(block.config.path_on_host.as_str(), block.config.index)
.await
.context("add block device"),
DeviceType::Network(network) => self
.add_net_device(&network.config, network.device_id)
.await
.context("add net device"),
DeviceType::Network(network) => {
// Buffer network devices and send them to FC just before InstanceStart
// in start_vm(). Firecracker rejects PUT /network-interfaces after the
// VM has started, so we must ensure they arrive before InstanceStart.
// This mirrors the Go runtime's batch-configuration approach.
//
// If the VM is already running (e.g. a post-start prestart-hooks rescan
// called add_device again), we cannot do anything useful — FC has already
// started and does not support network-interface hotplug. Log a warning
// and return Ok so the rest of the setup path can continue.
if self.state == VmmState::VmRunning {
warn!(
sl(),
"FC: ignoring late network device add for iface {} — VM already running, hotplug not supported",
network.device_id
);
return Ok(());
}
debug!(sl(), "buffering network device for pre-start flush");
self.pending_net_devices
.push((network.config, network.device_id));
Ok(())
}
DeviceType::HybridVsock(hvsock) => {
self.add_hvsock(&hvsock.config).await.context("add vsock")
}

View File

@@ -77,6 +77,17 @@ impl FcInner {
pub(crate) async fn start_vm(&mut self, _timeout: i32) -> Result<()> {
debug!(sl(), "Starting sandbox");
// Flush all buffered network devices before sending InstanceStart.
// FC rejects PUT /network-interfaces once the VM is running, so network
// interfaces must be configured here, immediately before the start action.
let net_devices = std::mem::take(&mut self.pending_net_devices);
for (config, device_id) in net_devices {
self.add_net_device(&config, device_id)
.await
.context("configure network interface before start")?;
}
let body: String = serde_json::json!({
"action_type": "InstanceStart"
})

View File

@@ -691,14 +691,12 @@ impl QemuInner {
let is_unaligned = !new_hotplugged_mem.is_multiple_of(guest_mem_block_size);
if is_unaligned {
new_hotplugged_mem = ch_config::convert::checked_next_multiple_of(
new_hotplugged_mem,
guest_mem_block_size,
)
.ok_or(anyhow!(format!(
"alignment of {} B to the block size of {} B failed",
new_hotplugged_mem, guest_mem_block_size
)))?
new_hotplugged_mem = new_hotplugged_mem
.checked_next_multiple_of(guest_mem_block_size)
.ok_or(anyhow!(format!(
"alignment of {} B to the block size of {} B failed",
new_hotplugged_mem, guest_mem_block_size
)))?
}
let new_hotplugged_mem = new_hotplugged_mem;

View File

@@ -249,13 +249,48 @@ impl ResourceManagerInner {
}
async fn handle_interfaces(&self, network: &dyn Network) -> Result<()> {
// The guest virtio-net device may not be visible to the kernel immediately
// after InstanceStart completes. Retry on "Link not found" to allow time
// for virtio-net driver initialisation in the guest.
// Use a generous window (100 × 100 ms = 10 s) since on some systems
// virtio-net initialisation is slower than the Go runtime's 20 × 20 ms.
const MAX_ATTEMPTS: u32 = 100;
const RETRY_DELAY_MS: u64 = 100;
for i in network.interfaces().await.context("get interfaces")? {
// update interface
info!(sl!(), "update interface {:?}", i);
self.agent
.update_interface(agent::UpdateInterfaceRequest { interface: Some(i) })
.await
.context("update interface")?;
info!(sl!(), "update interface: hw_addr={} name={}", i.hw_addr, i.name);
let mut last_err = None;
for attempt in 0..MAX_ATTEMPTS {
let result = self
.agent
.update_interface(agent::UpdateInterfaceRequest {
interface: Some(i.clone()),
})
.await;
if let Err(e) = result {
let msg = e.to_string();
if msg.contains("Link not found") {
info!(
sl!(),
"update interface: link not found (attempt {}/{}), retrying in {}ms",
attempt + 1,
MAX_ATTEMPTS,
RETRY_DELAY_MS,
);
last_err = Some(e);
tokio::time::sleep(std::time::Duration::from_millis(RETRY_DELAY_MS))
.await;
} else {
return Err(e).context("update interface");
}
} else {
last_err = None;
break;
}
}
if let Some(e) = last_err {
return Err(e).context("update interface");
}
}
Ok(())

View File

@@ -49,7 +49,13 @@ impl NetworkPair {
let unique_id = kata_sys_util::rand::UUID::new();
let model = network_model::new(model).context("new network model")?;
let tap_iface_name = format!("tap{idx}{TAP_SUFFIX}");
let virt_iface_name = format!("eth{idx}");
// Use the actual interface name from the netns scan. Fall back to eth{idx}
// only if the caller passed an empty name.
let virt_iface_name = if name.is_empty() {
format!("eth{idx}")
} else {
name.to_string()
};
let tap_link = create_link(handle, &tap_iface_name, queues)
.await
.context("create link")?;
@@ -106,7 +112,7 @@ impl NetworkPair {
.await
.context("set link up")?;
let mut net_pair = NetworkPair {
let net_pair = NetworkPair {
tap: TapInterface {
id: String::from(&unique_id),
name: format!("br{idx}{TAP_SUFFIX}"),
@@ -125,10 +131,6 @@ impl NetworkPair {
network_qos: false,
};
if !name.is_empty() {
net_pair.virt_iface.name = String::from(name);
}
Ok(net_pair)
}

View File

@@ -140,21 +140,22 @@ impl Network for NetworkWithNetns {
async fn remove(&self, h: &dyn Hypervisor) -> Result<()> {
let inner = self.inner.read().await;
// The network namespace would have been deleted at this point
// if it has not been created by virtcontainers.
if !inner.network_created {
return Ok(());
}
{
// Always clean up endpoint resources (TC filter rules, TAP devices) regardless
// of who created the network namespace.
if !inner.netns_path.is_empty() {
let _netns_guard =
netns::NetnsGuard::new(&inner.netns_path).context("net netns guard")?;
for e in &inner.entity_list {
e.endpoint.detach(h).await.context("detach")?;
}
}
let netns = get_from_path(inner.netns_path.clone())?;
netns.remove()?;
fs::remove_dir_all(inner.netns_path.clone()).context("failed to remove netns path")?;
// Only remove the network namespace itself if virtcontainers created it.
if inner.network_created {
let netns = get_from_path(inner.netns_path.clone())?;
netns.remove()?;
fs::remove_dir_all(inner.netns_path.clone())
.context("failed to remove netns path")?;
}
Ok(())
}
}

View File

@@ -611,11 +611,14 @@ impl Sandbox for VirtSandbox {
.await
.context("set up device before start vm")?;
// start vm
self.hypervisor.start_vm(10_000).await.context("start vm")?;
info!(sl!(), "start vm");
// execute pre-start hook functions, including Prestart Hooks and CreateRuntime Hooks
//
// These must run BEFORE start_vm so that:
// (a) createRuntime hooks (e.g. nerdctl's CNI hook) can create the veth pair
// in the container netns while the VMM process (already started by
// prepare_vm and placed in the netns) is still pre-InstanceStart, and
// (b) hypervisors that do not support network-interface hotplug (e.g.
// Firecracker) can configure the interface before InstanceStart.
let (prestart_hooks, create_runtime_hooks) =
if let Some(hooks) = sandbox_config.hooks.as_ref() {
(
@@ -633,12 +636,15 @@ impl Sandbox for VirtSandbox {
)
.await?;
// 1. if there are pre-start hook functions, network config might have been changed.
// We need to rescan the netns to handle the change.
// 2. Do not scan the netns if we want no network for the VM.
// TODO In case of vm factory, scan the netns to hotplug interfaces after the VM is started.
// Rescan the netns and update the network configuration before start_vm:
// 1. When network_created==true the veth is set up by the createRuntime hook
// above; we must scan now so the network device lands in pending_net_devices
// before InstanceStart (required for FC which has no hotplug).
// 2. When there are pre-start hooks the network config may have changed.
// 3. Do not scan if disable_new_netns is set.
let config = self.resource_manager.config().await;
if self.has_prestart_hooks(&prestart_hooks, &create_runtime_hooks)
if (sandbox_config.network_env.network_created
|| self.has_prestart_hooks(&prestart_hooks, &create_runtime_hooks))
&& !config.runtime.disable_new_netns
&& !dan_config_path(&config, &self.sid).exists()
{
@@ -657,10 +663,14 @@ impl Sandbox for VirtSandbox {
self.resource_manager
.handle_network(network_resource)
.await
.context("set up device after start vm")?;
.context("set up network before start vm")?;
}
}
// start vm
self.hypervisor.start_vm(10_000).await.context("start vm")?;
info!(sl!(), "start vm");
// connect agent
// set agent socket
let address = self