mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-04-03 18:43:50 +00:00
runtime-rs: Fix FC spawn with network
Refactor network link detection so that FC can boot properly. Signed-off-by: Anastassios Nanos <ananos@nubificus.co.uk>
This commit is contained in:
@@ -13,7 +13,7 @@ use crate::{
|
||||
};
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use dbs_utils::net::MacAddr;
|
||||
use hyper::{Body, Method, Request, Response};
|
||||
use hyper::{Body, Method, Request};
|
||||
use hyperlocal::Uri;
|
||||
use kata_sys_util::mount;
|
||||
use kata_types::config::hypervisor::RateLimiterConfig;
|
||||
@@ -23,6 +23,16 @@ use tokio::{fs, fs::File};
|
||||
|
||||
const REQUEST_RETRY: u32 = 500;
|
||||
const FC_KERNEL: &str = "vmlinux";
|
||||
|
||||
/// Distinguishes a transient transport error (FC not ready yet, retry allowed)
|
||||
/// from a permanent HTTP-level API error returned by FC (no retry).
|
||||
#[derive(Debug)]
|
||||
enum FcRequestError {
|
||||
/// Could not reach the FC API socket (connection refused, etc.)
|
||||
Transport(String),
|
||||
/// FC returned a non-2xx HTTP status. (status_code, response_body)
|
||||
Api(u16, String),
|
||||
}
|
||||
const FC_ROOT_FS: &str = "rootfs";
|
||||
const DRIVE_PREFIX: &str = "drive";
|
||||
const DISK_POOL_SIZE: u32 = 6;
|
||||
@@ -215,13 +225,29 @@ impl FcInner {
|
||||
Some(mac) => MacAddr::from_bytes(&mac.0).ok(),
|
||||
None => None,
|
||||
};
|
||||
|
||||
let rx_rate_limiter = RateLimiterConfig::new(
|
||||
self.config.network_info.rx_rate_limiter_max_rate,
|
||||
0,
|
||||
None,
|
||||
None,
|
||||
);
|
||||
let tx_rate_limiter = RateLimiterConfig::new(
|
||||
self.config.network_info.tx_rate_limiter_max_rate,
|
||||
0,
|
||||
None,
|
||||
None,
|
||||
);
|
||||
|
||||
let body: String = json!({
|
||||
"iface_id": &device_id,
|
||||
"guest_mac": g_mac,
|
||||
"host_dev_name": &config.host_dev_name
|
||||
|
||||
"host_dev_name": &config.host_dev_name,
|
||||
"rx_rate_limiter": rx_rate_limiter,
|
||||
"tx_rate_limiter": tx_rate_limiter,
|
||||
})
|
||||
.to_string();
|
||||
info!(sl(), "FC: add network device: iface_id={} guest_mac={:?} host_dev_name={}", device_id, g_mac, config.host_dev_name);
|
||||
self.request_with_retry(
|
||||
Method::PUT,
|
||||
&["/network-interfaces/", &device_id].concat(),
|
||||
@@ -259,50 +285,54 @@ impl FcInner {
|
||||
.body(Body::from(data.clone()))?;
|
||||
|
||||
match self.send_request(req).await {
|
||||
Ok(resp) => {
|
||||
debug!(sl(), "Request sent, resp: {:?}", resp);
|
||||
return Ok(());
|
||||
}
|
||||
Err(resp) => {
|
||||
debug!(sl(), "Request sent with error, resp: {:?}", resp);
|
||||
Ok(_) => return Ok(()),
|
||||
// A transport error (FC not ready yet) — retry.
|
||||
Err(FcRequestError::Transport(e)) => {
|
||||
debug!(sl(), "FC not reachable yet, retrying: {:?}", e);
|
||||
std::thread::sleep(std::time::Duration::from_millis(10));
|
||||
continue;
|
||||
}
|
||||
// An HTTP-level error from FC — fail immediately with the
|
||||
// actual error body so the problem is visible in logs.
|
||||
Err(FcRequestError::Api(status, body)) => {
|
||||
return Err(anyhow::anyhow!(
|
||||
"FC API error: status={} body={}",
|
||||
status,
|
||||
body
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(anyhow::anyhow!(
|
||||
"After {} attempts, it still doesn't work.",
|
||||
REQUEST_RETRY
|
||||
"FC not reachable after {} attempts (method={:?} uri={:?})",
|
||||
REQUEST_RETRY,
|
||||
method,
|
||||
uri,
|
||||
))
|
||||
}
|
||||
|
||||
pub(crate) async fn send_request(&self, req: Request<Body>) -> Result<Response<Body>> {
|
||||
let resp = self.client.request(req).await?;
|
||||
async fn send_request(&self, req: Request<Body>) -> Result<(), FcRequestError> {
|
||||
let resp = self
|
||||
.client
|
||||
.request(req)
|
||||
.await
|
||||
.map_err(|e| FcRequestError::Transport(e.to_string()))?;
|
||||
|
||||
let status = resp.status();
|
||||
debug!(sl(), "Request RESPONSE {:?} {:?}", &status, resp);
|
||||
if status.is_success() {
|
||||
return Ok(resp);
|
||||
} else {
|
||||
let body = hyper::body::to_bytes(resp.into_body()).await?;
|
||||
if body.is_empty() {
|
||||
debug!(sl(), "Request FAILED WITH STATUS: {:?}", status);
|
||||
None
|
||||
} else {
|
||||
let body = String::from_utf8_lossy(&body).into_owned();
|
||||
debug!(
|
||||
sl(),
|
||||
"Request FAILED WITH STATUS: {:?} and BODY: {:?}", status, body
|
||||
);
|
||||
Some(body)
|
||||
};
|
||||
debug!(sl(), "FC request succeeded: {:?}", status);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
Err(anyhow::anyhow!(
|
||||
"After {} attempts, it
|
||||
still doesn't work.",
|
||||
REQUEST_RETRY
|
||||
))
|
||||
let body = hyper::body::to_bytes(resp.into_body())
|
||||
.await
|
||||
.map(|b| String::from_utf8_lossy(&b).into_owned())
|
||||
.unwrap_or_default();
|
||||
error!(
|
||||
sl(),
|
||||
"FC API rejected request: status={:?} body={:?}", status, body
|
||||
);
|
||||
Err(FcRequestError::Api(status.as_u16(), body))
|
||||
}
|
||||
pub(crate) fn cleanup_resource(&self) {
|
||||
if self.jailed {
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
//SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
use crate::firecracker::{inner_hypervisor::FC_API_SOCKET_NAME, sl};
|
||||
use crate::device::driver::NetworkConfig;
|
||||
use crate::MemoryConfig;
|
||||
use crate::HYPERVISOR_FIRECRACKER;
|
||||
use crate::{device::DeviceType, VmmState};
|
||||
@@ -43,6 +44,9 @@ pub struct FcInner {
|
||||
pub(crate) jailed: bool,
|
||||
pub(crate) run_dir: String,
|
||||
pub(crate) pending_devices: Vec<DeviceType>,
|
||||
/// Network devices buffered until start_vm() so they are always sent to FC
|
||||
/// before InstanceStart, mirroring the Go runtime's batch-configuration approach.
|
||||
pub(crate) pending_net_devices: Vec<(NetworkConfig, String)>,
|
||||
pub(crate) capabilities: Capabilities,
|
||||
pub(crate) fc_process: Mutex<Option<Child>>,
|
||||
pub(crate) exit_notify: Option<mpsc::Sender<()>>,
|
||||
@@ -51,7 +55,9 @@ pub struct FcInner {
|
||||
impl FcInner {
|
||||
pub fn new(exit_notify: mpsc::Sender<()>) -> FcInner {
|
||||
let mut capabilities = Capabilities::new();
|
||||
capabilities.set(CapabilityBits::BlockDeviceSupport);
|
||||
capabilities.set(
|
||||
CapabilityBits::BlockDeviceSupport | CapabilityBits::HybridVsockSupport,
|
||||
);
|
||||
|
||||
FcInner {
|
||||
id: String::default(),
|
||||
@@ -66,6 +72,7 @@ impl FcInner {
|
||||
jailed: false,
|
||||
run_dir: String::default(),
|
||||
pending_devices: vec![],
|
||||
pending_net_devices: vec![],
|
||||
capabilities,
|
||||
fc_process: Mutex::new(None),
|
||||
exit_notify: Some(exit_notify),
|
||||
@@ -80,7 +87,7 @@ impl FcInner {
|
||||
debug!(sl(), "Running Jailed");
|
||||
cmd = Command::new(&self.config.jailer_path);
|
||||
let api_socket = ["/run/", FC_API_SOCKET_NAME].join("/");
|
||||
let args = [
|
||||
let mut args = vec![
|
||||
"--id",
|
||||
&self.id,
|
||||
"--gid",
|
||||
@@ -91,11 +98,16 @@ impl FcInner {
|
||||
&self.config.path,
|
||||
"--chroot-base-dir",
|
||||
&self.jailer_root,
|
||||
"--",
|
||||
"--api-sock",
|
||||
&api_socket,
|
||||
];
|
||||
cmd.args(args);
|
||||
// Pass the network namespace to the jailer so that the FC process
|
||||
// is placed in the correct netns. This is the recommended approach
|
||||
// over relying on pre_exec setns inheritance.
|
||||
let netns_path = netns.clone().unwrap_or_default();
|
||||
if !netns_path.is_empty() {
|
||||
args.extend_from_slice(&["--netns", &netns_path]);
|
||||
}
|
||||
args.extend_from_slice(&["--", "--api-sock", &api_socket]);
|
||||
cmd.args(&args);
|
||||
}
|
||||
false => {
|
||||
debug!(sl(), "Running non-Jailed");
|
||||
@@ -108,15 +120,20 @@ impl FcInner {
|
||||
}
|
||||
debug!(sl(), "Exec: {:?}", cmd);
|
||||
|
||||
// Make sure we're in the correct Network Namespace
|
||||
// For the non-jailed case, enter the network namespace via pre_exec so that
|
||||
// the FC process inherits it. For the jailed case, --netns is passed to the
|
||||
// jailer above and pre_exec setns is skipped.
|
||||
let jailed = self.jailed;
|
||||
unsafe {
|
||||
let selinux_label = self.config.security_info.selinux_label.clone();
|
||||
let _pre = cmd.pre_exec(move || {
|
||||
if let Some(netns_path) = &netns {
|
||||
debug!(sl(), "set netns for vmm master {:?}", &netns_path);
|
||||
let netns_fd = std::fs::File::open(netns_path);
|
||||
let _ = setns(netns_fd?.as_raw_fd(), CloneFlags::CLONE_NEWNET)
|
||||
.context("set netns failed");
|
||||
if !jailed {
|
||||
if let Some(netns_path) = &netns {
|
||||
debug!(sl(), "set netns for vmm master {:?}", &netns_path);
|
||||
let netns_fd = std::fs::File::open(netns_path);
|
||||
let _ = setns(netns_fd?.as_raw_fd(), CloneFlags::CLONE_NEWNET)
|
||||
.context("set netns failed");
|
||||
}
|
||||
}
|
||||
if let Some(label) = selinux_label.as_ref() {
|
||||
if let Err(e) = selinux::set_exec_label(label) {
|
||||
@@ -256,6 +273,7 @@ impl Persist for FcInner {
|
||||
jailer_root: hypervisor_state.jailer_root,
|
||||
client: Client::unix(),
|
||||
pending_devices: vec![],
|
||||
pending_net_devices: vec![],
|
||||
run_dir: hypervisor_state.run_dir,
|
||||
capabilities: Capabilities::new(),
|
||||
fc_process: Mutex::new(None),
|
||||
|
||||
@@ -31,10 +31,29 @@ impl FcInner {
|
||||
.hotplug_block_device(block.config.path_on_host.as_str(), block.config.index)
|
||||
.await
|
||||
.context("add block device"),
|
||||
DeviceType::Network(network) => self
|
||||
.add_net_device(&network.config, network.device_id)
|
||||
.await
|
||||
.context("add net device"),
|
||||
DeviceType::Network(network) => {
|
||||
// Buffer network devices and send them to FC just before InstanceStart
|
||||
// in start_vm(). Firecracker rejects PUT /network-interfaces after the
|
||||
// VM has started, so we must ensure they arrive before InstanceStart.
|
||||
// This mirrors the Go runtime's batch-configuration approach.
|
||||
//
|
||||
// If the VM is already running (e.g. a post-start prestart-hooks rescan
|
||||
// called add_device again), we cannot do anything useful — FC has already
|
||||
// started and does not support network-interface hotplug. Log a warning
|
||||
// and return Ok so the rest of the setup path can continue.
|
||||
if self.state == VmmState::VmRunning {
|
||||
warn!(
|
||||
sl(),
|
||||
"FC: ignoring late network device add for iface {} — VM already running, hotplug not supported",
|
||||
network.device_id
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
debug!(sl(), "buffering network device for pre-start flush");
|
||||
self.pending_net_devices
|
||||
.push((network.config, network.device_id));
|
||||
Ok(())
|
||||
}
|
||||
DeviceType::HybridVsock(hvsock) => {
|
||||
self.add_hvsock(&hvsock.config).await.context("add vsock")
|
||||
}
|
||||
|
||||
@@ -77,6 +77,17 @@ impl FcInner {
|
||||
|
||||
pub(crate) async fn start_vm(&mut self, _timeout: i32) -> Result<()> {
|
||||
debug!(sl(), "Starting sandbox");
|
||||
|
||||
// Flush all buffered network devices before sending InstanceStart.
|
||||
// FC rejects PUT /network-interfaces once the VM is running, so network
|
||||
// interfaces must be configured here, immediately before the start action.
|
||||
let net_devices = std::mem::take(&mut self.pending_net_devices);
|
||||
for (config, device_id) in net_devices {
|
||||
self.add_net_device(&config, device_id)
|
||||
.await
|
||||
.context("configure network interface before start")?;
|
||||
}
|
||||
|
||||
let body: String = serde_json::json!({
|
||||
"action_type": "InstanceStart"
|
||||
})
|
||||
|
||||
@@ -691,14 +691,12 @@ impl QemuInner {
|
||||
|
||||
let is_unaligned = !new_hotplugged_mem.is_multiple_of(guest_mem_block_size);
|
||||
if is_unaligned {
|
||||
new_hotplugged_mem = ch_config::convert::checked_next_multiple_of(
|
||||
new_hotplugged_mem,
|
||||
guest_mem_block_size,
|
||||
)
|
||||
.ok_or(anyhow!(format!(
|
||||
"alignment of {} B to the block size of {} B failed",
|
||||
new_hotplugged_mem, guest_mem_block_size
|
||||
)))?
|
||||
new_hotplugged_mem = new_hotplugged_mem
|
||||
.checked_next_multiple_of(guest_mem_block_size)
|
||||
.ok_or(anyhow!(format!(
|
||||
"alignment of {} B to the block size of {} B failed",
|
||||
new_hotplugged_mem, guest_mem_block_size
|
||||
)))?
|
||||
}
|
||||
let new_hotplugged_mem = new_hotplugged_mem;
|
||||
|
||||
|
||||
@@ -249,13 +249,48 @@ impl ResourceManagerInner {
|
||||
}
|
||||
|
||||
async fn handle_interfaces(&self, network: &dyn Network) -> Result<()> {
|
||||
// The guest virtio-net device may not be visible to the kernel immediately
|
||||
// after InstanceStart completes. Retry on "Link not found" to allow time
|
||||
// for virtio-net driver initialisation in the guest.
|
||||
// Use a generous window (100 × 100 ms = 10 s) since on some systems
|
||||
// virtio-net initialisation is slower than the Go runtime's 20 × 20 ms.
|
||||
const MAX_ATTEMPTS: u32 = 100;
|
||||
const RETRY_DELAY_MS: u64 = 100;
|
||||
|
||||
for i in network.interfaces().await.context("get interfaces")? {
|
||||
// update interface
|
||||
info!(sl!(), "update interface {:?}", i);
|
||||
self.agent
|
||||
.update_interface(agent::UpdateInterfaceRequest { interface: Some(i) })
|
||||
.await
|
||||
.context("update interface")?;
|
||||
info!(sl!(), "update interface: hw_addr={} name={}", i.hw_addr, i.name);
|
||||
let mut last_err = None;
|
||||
for attempt in 0..MAX_ATTEMPTS {
|
||||
let result = self
|
||||
.agent
|
||||
.update_interface(agent::UpdateInterfaceRequest {
|
||||
interface: Some(i.clone()),
|
||||
})
|
||||
.await;
|
||||
if let Err(e) = result {
|
||||
let msg = e.to_string();
|
||||
if msg.contains("Link not found") {
|
||||
info!(
|
||||
sl!(),
|
||||
"update interface: link not found (attempt {}/{}), retrying in {}ms",
|
||||
attempt + 1,
|
||||
MAX_ATTEMPTS,
|
||||
RETRY_DELAY_MS,
|
||||
);
|
||||
last_err = Some(e);
|
||||
tokio::time::sleep(std::time::Duration::from_millis(RETRY_DELAY_MS))
|
||||
.await;
|
||||
} else {
|
||||
return Err(e).context("update interface");
|
||||
}
|
||||
} else {
|
||||
last_err = None;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if let Some(e) = last_err {
|
||||
return Err(e).context("update interface");
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -49,7 +49,13 @@ impl NetworkPair {
|
||||
let unique_id = kata_sys_util::rand::UUID::new();
|
||||
let model = network_model::new(model).context("new network model")?;
|
||||
let tap_iface_name = format!("tap{idx}{TAP_SUFFIX}");
|
||||
let virt_iface_name = format!("eth{idx}");
|
||||
// Use the actual interface name from the netns scan. Fall back to eth{idx}
|
||||
// only if the caller passed an empty name.
|
||||
let virt_iface_name = if name.is_empty() {
|
||||
format!("eth{idx}")
|
||||
} else {
|
||||
name.to_string()
|
||||
};
|
||||
let tap_link = create_link(handle, &tap_iface_name, queues)
|
||||
.await
|
||||
.context("create link")?;
|
||||
@@ -106,7 +112,7 @@ impl NetworkPair {
|
||||
.await
|
||||
.context("set link up")?;
|
||||
|
||||
let mut net_pair = NetworkPair {
|
||||
let net_pair = NetworkPair {
|
||||
tap: TapInterface {
|
||||
id: String::from(&unique_id),
|
||||
name: format!("br{idx}{TAP_SUFFIX}"),
|
||||
@@ -125,10 +131,6 @@ impl NetworkPair {
|
||||
network_qos: false,
|
||||
};
|
||||
|
||||
if !name.is_empty() {
|
||||
net_pair.virt_iface.name = String::from(name);
|
||||
}
|
||||
|
||||
Ok(net_pair)
|
||||
}
|
||||
|
||||
|
||||
@@ -140,21 +140,22 @@ impl Network for NetworkWithNetns {
|
||||
|
||||
async fn remove(&self, h: &dyn Hypervisor) -> Result<()> {
|
||||
let inner = self.inner.read().await;
|
||||
// The network namespace would have been deleted at this point
|
||||
// if it has not been created by virtcontainers.
|
||||
if !inner.network_created {
|
||||
return Ok(());
|
||||
}
|
||||
{
|
||||
// Always clean up endpoint resources (TC filter rules, TAP devices) regardless
|
||||
// of who created the network namespace.
|
||||
if !inner.netns_path.is_empty() {
|
||||
let _netns_guard =
|
||||
netns::NetnsGuard::new(&inner.netns_path).context("net netns guard")?;
|
||||
for e in &inner.entity_list {
|
||||
e.endpoint.detach(h).await.context("detach")?;
|
||||
}
|
||||
}
|
||||
let netns = get_from_path(inner.netns_path.clone())?;
|
||||
netns.remove()?;
|
||||
fs::remove_dir_all(inner.netns_path.clone()).context("failed to remove netns path")?;
|
||||
// Only remove the network namespace itself if virtcontainers created it.
|
||||
if inner.network_created {
|
||||
let netns = get_from_path(inner.netns_path.clone())?;
|
||||
netns.remove()?;
|
||||
fs::remove_dir_all(inner.netns_path.clone())
|
||||
.context("failed to remove netns path")?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -611,11 +611,14 @@ impl Sandbox for VirtSandbox {
|
||||
.await
|
||||
.context("set up device before start vm")?;
|
||||
|
||||
// start vm
|
||||
self.hypervisor.start_vm(10_000).await.context("start vm")?;
|
||||
info!(sl!(), "start vm");
|
||||
|
||||
// execute pre-start hook functions, including Prestart Hooks and CreateRuntime Hooks
|
||||
//
|
||||
// These must run BEFORE start_vm so that:
|
||||
// (a) createRuntime hooks (e.g. nerdctl's CNI hook) can create the veth pair
|
||||
// in the container netns while the VMM process (already started by
|
||||
// prepare_vm and placed in the netns) is still pre-InstanceStart, and
|
||||
// (b) hypervisors that do not support network-interface hotplug (e.g.
|
||||
// Firecracker) can configure the interface before InstanceStart.
|
||||
let (prestart_hooks, create_runtime_hooks) =
|
||||
if let Some(hooks) = sandbox_config.hooks.as_ref() {
|
||||
(
|
||||
@@ -633,12 +636,15 @@ impl Sandbox for VirtSandbox {
|
||||
)
|
||||
.await?;
|
||||
|
||||
// 1. if there are pre-start hook functions, network config might have been changed.
|
||||
// We need to rescan the netns to handle the change.
|
||||
// 2. Do not scan the netns if we want no network for the VM.
|
||||
// TODO In case of vm factory, scan the netns to hotplug interfaces after the VM is started.
|
||||
// Rescan the netns and update the network configuration before start_vm:
|
||||
// 1. When network_created==true the veth is set up by the createRuntime hook
|
||||
// above; we must scan now so the network device lands in pending_net_devices
|
||||
// before InstanceStart (required for FC which has no hotplug).
|
||||
// 2. When there are pre-start hooks the network config may have changed.
|
||||
// 3. Do not scan if disable_new_netns is set.
|
||||
let config = self.resource_manager.config().await;
|
||||
if self.has_prestart_hooks(&prestart_hooks, &create_runtime_hooks)
|
||||
if (sandbox_config.network_env.network_created
|
||||
|| self.has_prestart_hooks(&prestart_hooks, &create_runtime_hooks))
|
||||
&& !config.runtime.disable_new_netns
|
||||
&& !dan_config_path(&config, &self.sid).exists()
|
||||
{
|
||||
@@ -657,10 +663,14 @@ impl Sandbox for VirtSandbox {
|
||||
self.resource_manager
|
||||
.handle_network(network_resource)
|
||||
.await
|
||||
.context("set up device after start vm")?;
|
||||
.context("set up network before start vm")?;
|
||||
}
|
||||
}
|
||||
|
||||
// start vm
|
||||
self.hypervisor.start_vm(10_000).await.context("start vm")?;
|
||||
info!(sl!(), "start vm");
|
||||
|
||||
// connect agent
|
||||
// set agent socket
|
||||
let address = self
|
||||
|
||||
Reference in New Issue
Block a user