mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-04-03 02:22:55 +00:00
Compare commits
3 Commits
fix/docker
...
fix/runtim
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f48cd23836 | ||
|
|
c4a0b8e102 | ||
|
|
26b6ad72c1 |
3
.github/workflows/basic-ci-amd64.yaml
vendored
3
.github/workflows/basic-ci-amd64.yaml
vendored
@@ -88,7 +88,7 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
containerd_version: ['lts', 'active']
|
||||
vmm: ['clh', 'cloud-hypervisor', 'dragonball', 'qemu', 'qemu-runtime-rs']
|
||||
vmm: ['clh', 'cloud-hypervisor', 'dragonball', 'qemu', 'qemu-runtime-rs', 'fc-rs']
|
||||
runs-on: ubuntu-22.04
|
||||
env:
|
||||
CONTAINERD_VERSION: ${{ matrix.containerd_version }}
|
||||
@@ -283,6 +283,7 @@ jobs:
|
||||
- qemu
|
||||
- cloud-hypervisor
|
||||
- qemu-runtime-rs
|
||||
- fc-rs
|
||||
runs-on: ubuntu-22.04
|
||||
env:
|
||||
KATA_HYPERVISOR: ${{ matrix.vmm }}
|
||||
|
||||
@@ -275,7 +275,7 @@ Each hypervisor has a dedicated configuration file:
|
||||
|------------|----------------|-----------------|
|
||||
| QEMU |`configuration-qemu-runtime-rs.toml` |`configuration-qemu.toml` |
|
||||
| Cloud Hypervisor | `configuration-cloud-hypervisor.toml` | `configuration-clh.toml` |
|
||||
| Firecracker | `configuration-rs-fc.toml` | `configuration-fc.toml` |
|
||||
| Firecracker | `configuration-fc-rs.toml` | `configuration-fc.toml` |
|
||||
| Dragonball | `configuration-dragonball.toml` (default) | `No` |
|
||||
|
||||
> **Note:** Configuration files are typically installed in `/opt/kata/share/defaults/kata-containers/` or `/opt/kata/share/defaults/kata-containers/runtime-rs/` or `/usr/share/defaults/kata-containers/`.
|
||||
|
||||
@@ -423,7 +423,7 @@ endif
|
||||
|
||||
ifneq (,$(FCCMD))
|
||||
KNOWN_HYPERVISORS += $(HYPERVISOR_FC)
|
||||
CONFIG_FILE_FC = configuration-rs-fc.toml
|
||||
CONFIG_FILE_FC = configuration-fc-rs.toml
|
||||
CONFIG_FC = config/$(CONFIG_FILE_FC)
|
||||
CONFIG_FC_IN = $(CONFIG_FC).in
|
||||
CONFIG_PATH_FC = $(abspath $(CONFDIR)/$(CONFIG_FILE_FC))
|
||||
|
||||
@@ -157,7 +157,7 @@ Configuration files in `config/`:
|
||||
| `configuration-dragonball.toml.in` | Dragonball | Built-in VMM |
|
||||
| `configuration-qemu-runtime-rs.toml.in` | QEMU | Default external |
|
||||
| `configuration-cloud-hypervisor.toml.in` | Cloud Hypervisor | Modern VMM |
|
||||
| `configuration-rs-fc.toml.in` | Firecracker | Lightweight microVM |
|
||||
| `configuration-fc-rs.toml.in` | Firecracker | Lightweight microVM |
|
||||
| `configuration-remote.toml.in` | Remote | Remote hypervisor |
|
||||
| `configuration-qemu-tdx-runtime-rs.toml.in` | QEMU + TDX | Intel TDX confidential computing |
|
||||
| `configuration-qemu-snp-runtime-rs.toml.in` | QEMU + SEV-SNP | AMD SEV-SNP confidential computing |
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
//
|
||||
//SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
use std::convert::TryFrom;
|
||||
|
||||
use crate::{
|
||||
firecracker::{
|
||||
inner_hypervisor::{FC_AGENT_SOCKET_NAME, ROOT},
|
||||
@@ -13,7 +15,7 @@ use crate::{
|
||||
};
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use dbs_utils::net::MacAddr;
|
||||
use hyper::{Body, Method, Request, Response};
|
||||
use hyper::{Body, Method, Request};
|
||||
use hyperlocal::Uri;
|
||||
use kata_sys_util::mount;
|
||||
use kata_types::config::hypervisor::RateLimiterConfig;
|
||||
@@ -23,6 +25,16 @@ use tokio::{fs, fs::File};
|
||||
|
||||
const REQUEST_RETRY: u32 = 500;
|
||||
const FC_KERNEL: &str = "vmlinux";
|
||||
|
||||
/// Distinguishes a transient transport error (FC not ready yet, retry allowed)
|
||||
/// from a permanent HTTP-level API error returned by FC (no retry).
|
||||
#[derive(Debug)]
|
||||
enum FcRequestError {
|
||||
/// Could not reach the FC API socket (connection refused, etc.)
|
||||
Transport(String),
|
||||
/// FC returned a non-2xx HTTP status. (status_code, response_body)
|
||||
Api(u16, String),
|
||||
}
|
||||
const FC_ROOT_FS: &str = "rootfs";
|
||||
const DRIVE_PREFIX: &str = "drive";
|
||||
const DISK_POOL_SIZE: u32 = 6;
|
||||
@@ -111,7 +123,8 @@ impl FcInner {
|
||||
|
||||
let body_config: String = json!({
|
||||
"mem_size_mib": self.config.memory_info.default_memory,
|
||||
"vcpu_count": self.config.cpu_info.default_vcpus.ceil() as u8,
|
||||
"vcpu_count": u8::try_from(self.config.cpu_info.default_vcpus.ceil() as u64)
|
||||
.context("vcpu_count overflows u8")?,
|
||||
})
|
||||
.to_string();
|
||||
let body_kernel: String = json!({
|
||||
@@ -215,13 +228,29 @@ impl FcInner {
|
||||
Some(mac) => MacAddr::from_bytes(&mac.0).ok(),
|
||||
None => None,
|
||||
};
|
||||
|
||||
let rx_rate_limiter = RateLimiterConfig::new(
|
||||
self.config.network_info.rx_rate_limiter_max_rate,
|
||||
0,
|
||||
None,
|
||||
None,
|
||||
);
|
||||
let tx_rate_limiter = RateLimiterConfig::new(
|
||||
self.config.network_info.tx_rate_limiter_max_rate,
|
||||
0,
|
||||
None,
|
||||
None,
|
||||
);
|
||||
|
||||
let body: String = json!({
|
||||
"iface_id": &device_id,
|
||||
"guest_mac": g_mac,
|
||||
"host_dev_name": &config.host_dev_name
|
||||
|
||||
"host_dev_name": &config.host_dev_name,
|
||||
"rx_rate_limiter": rx_rate_limiter,
|
||||
"tx_rate_limiter": tx_rate_limiter,
|
||||
})
|
||||
.to_string();
|
||||
info!(sl(), "FC: add network device: iface_id={} guest_mac={:?} host_dev_name={}", device_id, g_mac, config.host_dev_name);
|
||||
self.request_with_retry(
|
||||
Method::PUT,
|
||||
&["/network-interfaces/", &device_id].concat(),
|
||||
@@ -259,50 +288,54 @@ impl FcInner {
|
||||
.body(Body::from(data.clone()))?;
|
||||
|
||||
match self.send_request(req).await {
|
||||
Ok(resp) => {
|
||||
debug!(sl(), "Request sent, resp: {:?}", resp);
|
||||
return Ok(());
|
||||
}
|
||||
Err(resp) => {
|
||||
debug!(sl(), "Request sent with error, resp: {:?}", resp);
|
||||
std::thread::sleep(std::time::Duration::from_millis(10));
|
||||
Ok(_) => return Ok(()),
|
||||
// A transport error (FC not ready yet) — retry.
|
||||
Err(FcRequestError::Transport(e)) => {
|
||||
debug!(sl(), "FC not reachable yet, retrying: {:?}", e);
|
||||
tokio::time::sleep(std::time::Duration::from_millis(10)).await;
|
||||
continue;
|
||||
}
|
||||
// An HTTP-level error from FC — fail immediately with the
|
||||
// actual error body so the problem is visible in logs.
|
||||
Err(FcRequestError::Api(status, body)) => {
|
||||
return Err(anyhow::anyhow!(
|
||||
"FC API error: status={} body={}",
|
||||
status,
|
||||
body
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(anyhow::anyhow!(
|
||||
"After {} attempts, it still doesn't work.",
|
||||
REQUEST_RETRY
|
||||
"FC not reachable after {} attempts (method={:?} uri={:?})",
|
||||
REQUEST_RETRY,
|
||||
method,
|
||||
uri,
|
||||
))
|
||||
}
|
||||
|
||||
pub(crate) async fn send_request(&self, req: Request<Body>) -> Result<Response<Body>> {
|
||||
let resp = self.client.request(req).await?;
|
||||
async fn send_request(&self, req: Request<Body>) -> Result<(), FcRequestError> {
|
||||
let resp = self
|
||||
.client
|
||||
.request(req)
|
||||
.await
|
||||
.map_err(|e| FcRequestError::Transport(e.to_string()))?;
|
||||
|
||||
let status = resp.status();
|
||||
debug!(sl(), "Request RESPONSE {:?} {:?}", &status, resp);
|
||||
if status.is_success() {
|
||||
return Ok(resp);
|
||||
} else {
|
||||
let body = hyper::body::to_bytes(resp.into_body()).await?;
|
||||
if body.is_empty() {
|
||||
debug!(sl(), "Request FAILED WITH STATUS: {:?}", status);
|
||||
None
|
||||
} else {
|
||||
let body = String::from_utf8_lossy(&body).into_owned();
|
||||
debug!(
|
||||
sl(),
|
||||
"Request FAILED WITH STATUS: {:?} and BODY: {:?}", status, body
|
||||
);
|
||||
Some(body)
|
||||
};
|
||||
debug!(sl(), "FC request succeeded: {:?}", status);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
Err(anyhow::anyhow!(
|
||||
"After {} attempts, it
|
||||
still doesn't work.",
|
||||
REQUEST_RETRY
|
||||
))
|
||||
let body = hyper::body::to_bytes(resp.into_body())
|
||||
.await
|
||||
.map(|b| String::from_utf8_lossy(&b).into_owned())
|
||||
.unwrap_or_default();
|
||||
error!(
|
||||
sl(),
|
||||
"FC API rejected request: status={:?} body={:?}", status, body
|
||||
);
|
||||
Err(FcRequestError::Api(status.as_u16(), body))
|
||||
}
|
||||
pub(crate) fn cleanup_resource(&self) {
|
||||
if self.jailed {
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
//SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
use crate::firecracker::{inner_hypervisor::FC_API_SOCKET_NAME, sl};
|
||||
use crate::device::driver::NetworkConfig;
|
||||
use crate::MemoryConfig;
|
||||
use crate::HYPERVISOR_FIRECRACKER;
|
||||
use crate::{device::DeviceType, VmmState};
|
||||
@@ -43,6 +44,9 @@ pub struct FcInner {
|
||||
pub(crate) jailed: bool,
|
||||
pub(crate) run_dir: String,
|
||||
pub(crate) pending_devices: Vec<DeviceType>,
|
||||
/// Network devices buffered until boot_vm() so they are always sent to FC
|
||||
/// before InstanceStart, mirroring the Go runtime's batch-configuration approach.
|
||||
pub(crate) pending_net_devices: Vec<(NetworkConfig, String)>,
|
||||
pub(crate) capabilities: Capabilities,
|
||||
pub(crate) fc_process: Mutex<Option<Child>>,
|
||||
pub(crate) exit_notify: Option<mpsc::Sender<()>>,
|
||||
@@ -51,7 +55,9 @@ pub struct FcInner {
|
||||
impl FcInner {
|
||||
pub fn new(exit_notify: mpsc::Sender<()>) -> FcInner {
|
||||
let mut capabilities = Capabilities::new();
|
||||
capabilities.set(CapabilityBits::BlockDeviceSupport);
|
||||
capabilities.set(
|
||||
CapabilityBits::BlockDeviceSupport | CapabilityBits::HybridVsockSupport,
|
||||
);
|
||||
|
||||
FcInner {
|
||||
id: String::default(),
|
||||
@@ -66,6 +72,7 @@ impl FcInner {
|
||||
jailed: false,
|
||||
run_dir: String::default(),
|
||||
pending_devices: vec![],
|
||||
pending_net_devices: vec![],
|
||||
capabilities,
|
||||
fc_process: Mutex::new(None),
|
||||
exit_notify: Some(exit_notify),
|
||||
@@ -80,7 +87,7 @@ impl FcInner {
|
||||
debug!(sl(), "Running Jailed");
|
||||
cmd = Command::new(&self.config.jailer_path);
|
||||
let api_socket = ["/run/", FC_API_SOCKET_NAME].join("/");
|
||||
let args = [
|
||||
let mut args = vec![
|
||||
"--id",
|
||||
&self.id,
|
||||
"--gid",
|
||||
@@ -91,11 +98,16 @@ impl FcInner {
|
||||
&self.config.path,
|
||||
"--chroot-base-dir",
|
||||
&self.jailer_root,
|
||||
"--",
|
||||
"--api-sock",
|
||||
&api_socket,
|
||||
];
|
||||
cmd.args(args);
|
||||
// Pass the network namespace to the jailer so that the FC process
|
||||
// is placed in the correct netns. This is the recommended approach
|
||||
// over relying on pre_exec setns inheritance.
|
||||
let netns_path = netns.clone().unwrap_or_default();
|
||||
if !netns_path.is_empty() {
|
||||
args.extend_from_slice(&["--netns", &netns_path]);
|
||||
}
|
||||
args.extend_from_slice(&["--", "--api-sock", &api_socket]);
|
||||
cmd.args(&args);
|
||||
}
|
||||
false => {
|
||||
debug!(sl(), "Running non-Jailed");
|
||||
@@ -108,15 +120,22 @@ impl FcInner {
|
||||
}
|
||||
debug!(sl(), "Exec: {:?}", cmd);
|
||||
|
||||
// Make sure we're in the correct Network Namespace
|
||||
// For the non-jailed case, enter the network namespace via pre_exec so that
|
||||
// the FC process inherits it. For the jailed case, --netns is passed to the
|
||||
// jailer above and pre_exec setns is skipped.
|
||||
let jailed = self.jailed;
|
||||
unsafe {
|
||||
let selinux_label = self.config.security_info.selinux_label.clone();
|
||||
let _pre = cmd.pre_exec(move || {
|
||||
if let Some(netns_path) = &netns {
|
||||
debug!(sl(), "set netns for vmm master {:?}", &netns_path);
|
||||
let netns_fd = std::fs::File::open(netns_path);
|
||||
let _ = setns(netns_fd?.as_raw_fd(), CloneFlags::CLONE_NEWNET)
|
||||
.context("set netns failed");
|
||||
if !jailed {
|
||||
if let Some(netns_path) = &netns {
|
||||
debug!(sl(), "set netns for vmm master {:?}", &netns_path);
|
||||
let netns_fd = std::fs::File::open(netns_path)?;
|
||||
setns(netns_fd.as_raw_fd(), CloneFlags::CLONE_NEWNET)
|
||||
.map_err(|e| std::io::Error::other(
|
||||
format!("setns into {:?} failed: {}", netns_path, e),
|
||||
))?;
|
||||
}
|
||||
}
|
||||
if let Some(label) = selinux_label.as_ref() {
|
||||
if let Err(e) = selinux::set_exec_label(label) {
|
||||
@@ -256,6 +275,7 @@ impl Persist for FcInner {
|
||||
jailer_root: hypervisor_state.jailer_root,
|
||||
client: Client::unix(),
|
||||
pending_devices: vec![],
|
||||
pending_net_devices: vec![],
|
||||
run_dir: hypervisor_state.run_dir,
|
||||
capabilities: Capabilities::new(),
|
||||
fc_process: Mutex::new(None),
|
||||
|
||||
@@ -31,10 +31,29 @@ impl FcInner {
|
||||
.hotplug_block_device(block.config.path_on_host.as_str(), block.config.index)
|
||||
.await
|
||||
.context("add block device"),
|
||||
DeviceType::Network(network) => self
|
||||
.add_net_device(&network.config, network.device_id)
|
||||
.await
|
||||
.context("add net device"),
|
||||
DeviceType::Network(network) => {
|
||||
// Buffer network devices and send them to FC just before InstanceStart
|
||||
// in boot_vm(). Firecracker rejects PUT /network-interfaces after the
|
||||
// VM has started, so we must ensure they arrive before InstanceStart.
|
||||
// This mirrors the Go runtime's batch-configuration approach.
|
||||
//
|
||||
// If the VM is already running (e.g. a post-start prestart-hooks rescan
|
||||
// called add_device again), we cannot do anything useful — FC has already
|
||||
// started and does not support network-interface hotplug. Log a warning
|
||||
// and return Ok so the rest of the setup path can continue.
|
||||
if self.state == VmmState::VmRunning {
|
||||
warn!(
|
||||
sl(),
|
||||
"FC: ignoring late network device add for iface {} — VM already running, hotplug not supported",
|
||||
network.device_id
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
debug!(sl(), "buffering network device for pre-start flush");
|
||||
self.pending_net_devices
|
||||
.push((network.config, network.device_id));
|
||||
Ok(())
|
||||
}
|
||||
DeviceType::HybridVsock(hvsock) => {
|
||||
self.add_hvsock(&hvsock.config).await.context("add vsock")
|
||||
}
|
||||
|
||||
@@ -76,7 +76,29 @@ impl FcInner {
|
||||
}
|
||||
|
||||
pub(crate) async fn start_vm(&mut self, _timeout: i32) -> Result<()> {
|
||||
debug!(sl(), "Starting sandbox");
|
||||
// For Firecracker, the VMM process was already started in prepare_vm.
|
||||
// Network interfaces must be configured before InstanceStart, but
|
||||
// OCI hooks (which create the container veth via CNI) have not run
|
||||
// yet. Defer the network flush and InstanceStart to boot_vm(), which
|
||||
// sandbox.rs calls after the hooks + network rescan.
|
||||
debug!(sl(), "FC start_vm: VMM already running; deferring InstanceStart to boot_vm");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn boot_vm(&mut self) -> Result<()> {
|
||||
debug!(sl(), "FC boot_vm: flushing network devices and sending InstanceStart");
|
||||
|
||||
// Flush all buffered network devices. These were populated by
|
||||
// add_device(Network) after the OCI hooks ran and the netns was
|
||||
// rescanned by sandbox.rs. FC rejects PUT /network-interfaces once
|
||||
// the VM is running, so this must happen before InstanceStart.
|
||||
let net_devices = std::mem::take(&mut self.pending_net_devices);
|
||||
for (config, device_id) in net_devices {
|
||||
self.add_net_device(&config, device_id)
|
||||
.await
|
||||
.context("configure network interface before InstanceStart")?;
|
||||
}
|
||||
|
||||
let body: String = serde_json::json!({
|
||||
"action_type": "InstanceStart"
|
||||
})
|
||||
|
||||
@@ -75,6 +75,11 @@ impl Hypervisor for Firecracker {
|
||||
inner.start_vm(timeout).await
|
||||
}
|
||||
|
||||
async fn boot_vm(&self) -> Result<()> {
|
||||
let mut inner = self.inner.write().await;
|
||||
inner.boot_vm().await
|
||||
}
|
||||
|
||||
async fn stop_vm(&self) -> Result<()> {
|
||||
let mut inner = self.inner.write().await;
|
||||
inner.stop_vm().await
|
||||
|
||||
@@ -106,6 +106,23 @@ pub trait Hypervisor: std::fmt::Debug + Send + Sync {
|
||||
selinux_label: Option<String>,
|
||||
) -> Result<()>;
|
||||
async fn start_vm(&self, timeout: i32) -> Result<()>;
|
||||
|
||||
/// Finalize VM boot after OCI hooks and network setup have run.
|
||||
///
|
||||
/// For hypervisors that require all devices (including network) to be
|
||||
/// registered before the guest boots (e.g. Firecracker, which has no
|
||||
/// hotplug), `start_vm` only starts the VMM process, while `boot_vm`
|
||||
/// flushes the device queue and issues the actual boot command
|
||||
/// (InstanceStart for FC). For hypervisors that start the guest
|
||||
/// immediately in `start_vm` (QEMU, dragonball, cloud-hypervisor), the
|
||||
/// default no-op implementation is sufficient.
|
||||
///
|
||||
/// sandbox.rs calls this after OCI hooks and the post-hooks network
|
||||
/// rescan, but before connecting to the kata-agent.
|
||||
async fn boot_vm(&self) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn stop_vm(&self) -> Result<()>;
|
||||
async fn wait_vm(&self) -> Result<i32>;
|
||||
async fn pause_vm(&self) -> Result<()>;
|
||||
|
||||
@@ -689,14 +689,12 @@ impl QemuInner {
|
||||
|
||||
let is_unaligned = !new_hotplugged_mem.is_multiple_of(guest_mem_block_size);
|
||||
if is_unaligned {
|
||||
new_hotplugged_mem = ch_config::convert::checked_next_multiple_of(
|
||||
new_hotplugged_mem,
|
||||
guest_mem_block_size,
|
||||
)
|
||||
.ok_or(anyhow!(format!(
|
||||
"alignment of {} B to the block size of {} B failed",
|
||||
new_hotplugged_mem, guest_mem_block_size
|
||||
)))?
|
||||
new_hotplugged_mem = new_hotplugged_mem
|
||||
.checked_next_multiple_of(guest_mem_block_size)
|
||||
.ok_or(anyhow!(format!(
|
||||
"alignment of {} B to the block size of {} B failed",
|
||||
new_hotplugged_mem, guest_mem_block_size
|
||||
)))?
|
||||
}
|
||||
let new_hotplugged_mem = new_hotplugged_mem;
|
||||
|
||||
|
||||
@@ -249,13 +249,48 @@ impl ResourceManagerInner {
|
||||
}
|
||||
|
||||
async fn handle_interfaces(&self, network: &dyn Network) -> Result<()> {
|
||||
// The guest virtio-net device may not be visible to the kernel immediately
|
||||
// after InstanceStart completes. Retry on "Link not found" to allow time
|
||||
// for virtio-net driver initialisation in the guest.
|
||||
// Use a generous window (100 × 100 ms = 10 s) since on some systems
|
||||
// virtio-net initialisation is slower than the Go runtime's 20 × 20 ms.
|
||||
const MAX_ATTEMPTS: u32 = 100;
|
||||
const RETRY_DELAY_MS: u64 = 100;
|
||||
|
||||
for i in network.interfaces().await.context("get interfaces")? {
|
||||
// update interface
|
||||
info!(sl!(), "update interface {:?}", i);
|
||||
self.agent
|
||||
.update_interface(agent::UpdateInterfaceRequest { interface: Some(i) })
|
||||
.await
|
||||
.context("update interface")?;
|
||||
info!(sl!(), "update interface: hw_addr={} name={}", i.hw_addr, i.name);
|
||||
let mut last_err = None;
|
||||
for attempt in 0..MAX_ATTEMPTS {
|
||||
let result = self
|
||||
.agent
|
||||
.update_interface(agent::UpdateInterfaceRequest {
|
||||
interface: Some(i.clone()),
|
||||
})
|
||||
.await;
|
||||
if let Err(e) = result {
|
||||
let msg = e.to_string();
|
||||
if msg.contains("Link not found") {
|
||||
info!(
|
||||
sl!(),
|
||||
"update interface: link not found (attempt {}/{}), retrying in {}ms",
|
||||
attempt + 1,
|
||||
MAX_ATTEMPTS,
|
||||
RETRY_DELAY_MS,
|
||||
);
|
||||
last_err = Some(e);
|
||||
tokio::time::sleep(std::time::Duration::from_millis(RETRY_DELAY_MS))
|
||||
.await;
|
||||
} else {
|
||||
return Err(e).context("update interface");
|
||||
}
|
||||
} else {
|
||||
last_err = None;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if let Some(e) = last_err {
|
||||
return Err(e).context("update interface");
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -49,7 +49,13 @@ impl NetworkPair {
|
||||
let unique_id = kata_sys_util::rand::UUID::new();
|
||||
let model = network_model::new(model).context("new network model")?;
|
||||
let tap_iface_name = format!("tap{idx}{TAP_SUFFIX}");
|
||||
let virt_iface_name = format!("eth{idx}");
|
||||
// Use the actual interface name from the netns scan. Fall back to eth{idx}
|
||||
// only if the caller passed an empty name.
|
||||
let virt_iface_name = if name.is_empty() {
|
||||
format!("eth{idx}")
|
||||
} else {
|
||||
name.to_string()
|
||||
};
|
||||
let tap_link = create_link(handle, &tap_iface_name, queues)
|
||||
.await
|
||||
.context("create link")?;
|
||||
@@ -106,7 +112,7 @@ impl NetworkPair {
|
||||
.await
|
||||
.context("set link up")?;
|
||||
|
||||
let mut net_pair = NetworkPair {
|
||||
let net_pair = NetworkPair {
|
||||
tap: TapInterface {
|
||||
id: String::from(&unique_id),
|
||||
name: format!("br{idx}{TAP_SUFFIX}"),
|
||||
@@ -125,10 +131,6 @@ impl NetworkPair {
|
||||
network_qos: false,
|
||||
};
|
||||
|
||||
if !name.is_empty() {
|
||||
net_pair.virt_iface.name = String::from(name);
|
||||
}
|
||||
|
||||
Ok(net_pair)
|
||||
}
|
||||
|
||||
|
||||
@@ -140,21 +140,22 @@ impl Network for NetworkWithNetns {
|
||||
|
||||
async fn remove(&self, h: &dyn Hypervisor) -> Result<()> {
|
||||
let inner = self.inner.read().await;
|
||||
// The network namespace would have been deleted at this point
|
||||
// if it has not been created by virtcontainers.
|
||||
if !inner.network_created {
|
||||
return Ok(());
|
||||
}
|
||||
{
|
||||
// Always clean up endpoint resources (TC filter rules, TAP devices) regardless
|
||||
// of who created the network namespace.
|
||||
if !inner.netns_path.is_empty() {
|
||||
let _netns_guard =
|
||||
netns::NetnsGuard::new(&inner.netns_path).context("net netns guard")?;
|
||||
for e in &inner.entity_list {
|
||||
e.endpoint.detach(h).await.context("detach")?;
|
||||
}
|
||||
}
|
||||
let netns = get_from_path(inner.netns_path.clone())?;
|
||||
netns.remove()?;
|
||||
fs::remove_dir_all(inner.netns_path.clone()).context("failed to remove netns path")?;
|
||||
// Only remove the network namespace itself if virtcontainers created it.
|
||||
if inner.network_created {
|
||||
let netns = get_from_path(inner.netns_path.clone())?;
|
||||
netns.remove()?;
|
||||
fs::remove_dir_all(inner.netns_path.clone())
|
||||
.context("failed to remove netns path")?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -613,7 +613,6 @@ impl Sandbox for VirtSandbox {
|
||||
|
||||
// start vm
|
||||
self.hypervisor.start_vm(10_000).await.context("start vm")?;
|
||||
info!(sl!(), "start vm");
|
||||
|
||||
// execute pre-start hook functions, including Prestart Hooks and CreateRuntime Hooks
|
||||
let (prestart_hooks, create_runtime_hooks) =
|
||||
@@ -661,6 +660,18 @@ impl Sandbox for VirtSandbox {
|
||||
}
|
||||
}
|
||||
|
||||
// Give the hypervisor a chance to finalize boot now that OCI hooks and
|
||||
// the post-hooks network rescan have completed. For hypervisors that
|
||||
// require all devices (including network) to be registered before the
|
||||
// guest boots (e.g. Firecracker), start_vm defers the actual boot
|
||||
// command to this call. For hypervisors that boot the guest in
|
||||
// start_vm (QEMU, dragonball, cloud-hypervisor), this is a no-op.
|
||||
self.hypervisor
|
||||
.boot_vm()
|
||||
.await
|
||||
.context("boot vm")?;
|
||||
info!(sl!(), "start vm");
|
||||
|
||||
// connect agent
|
||||
// set agent socket
|
||||
let address = self
|
||||
|
||||
@@ -517,7 +517,7 @@ function enabling_hypervisor() {
|
||||
declare -r CONTAINERD_SHIM_KATA="/usr/local/bin/containerd-shim-kata-${KATA_HYPERVISOR}-v2"
|
||||
|
||||
case "${KATA_HYPERVISOR}" in
|
||||
dragonball|cloud-hypervisor|qemu-runtime-rs|qemu-se-runtime-rs)
|
||||
dragonball|cloud-hypervisor|qemu-runtime-rs|qemu-se-runtime-rs|fc-rs)
|
||||
sudo ln -sf "${KATA_DIR}/runtime-rs/bin/containerd-shim-kata-v2" "${CONTAINERD_SHIM_KATA}"
|
||||
declare -r CONFIG_DIR="${KATA_DIR}/share/defaults/kata-containers/runtime-rs"
|
||||
;;
|
||||
@@ -536,6 +536,61 @@ function enabling_hypervisor() {
|
||||
}
|
||||
|
||||
|
||||
# Sets up a devmapper thin-pool and reconfigures standalone containerd to use
|
||||
# it as the default snapshotter. Required for block-device based hypervisors
|
||||
# (e.g. Firecracker / fc-rs) that cannot use the overlayfs snapshotter.
|
||||
# Expects containerd to already be installed and /etc/containerd/config.toml
|
||||
# to exist (e.g. after `containerd config default | sudo tee ...`).
|
||||
function configure_devmapper_for_containerd() {
|
||||
info "Configuring devmapper snapshotter for standalone containerd"
|
||||
|
||||
sudo mkdir -p /var/lib/containerd/devmapper
|
||||
sudo truncate --size 10G /var/lib/containerd/devmapper/data-disk.img
|
||||
sudo truncate --size 1G /var/lib/containerd/devmapper/meta-disk.img
|
||||
|
||||
# Allocate loop devices dynamically to avoid conflicts with pre-existing ones.
|
||||
local loop_data loop_meta
|
||||
loop_data=$(sudo losetup --find --show /var/lib/containerd/devmapper/data-disk.img)
|
||||
loop_meta=$(sudo losetup --find --show /var/lib/containerd/devmapper/meta-disk.img)
|
||||
info "devmapper: data=${loop_data} meta=${loop_meta}"
|
||||
|
||||
# data device size in 512-byte sectors: 10 GiB = 10*1024*1024*1024/512 = 20971520
|
||||
local data_sectors
|
||||
data_sectors=$(sudo blockdev --getsz "${loop_data}")
|
||||
sudo dmsetup create contd-thin-pool \
|
||||
--table "0 ${data_sectors} thin-pool ${loop_meta} ${loop_data} 512 32768 1 skip_block_zeroing"
|
||||
|
||||
# Add the devmapper snapshotter plugin config only if not already present
|
||||
# (makes the function idempotent on re-runs).
|
||||
if ! sudo grep -q 'io.containerd.snapshotter.v1.devmapper' /etc/containerd/config.toml; then
|
||||
cat <<'EOF' | sudo tee -a /etc/containerd/config.toml
|
||||
|
||||
[plugins."io.containerd.snapshotter.v1.devmapper"]
|
||||
pool_name = "contd-thin-pool"
|
||||
root_path = "/var/lib/containerd/devmapper"
|
||||
base_image_size = "4096MB"
|
||||
discard_blocks = true
|
||||
EOF
|
||||
fi
|
||||
|
||||
# Patch the default snapshotter to devmapper if not already set.
|
||||
if ! sudo grep -q 'snapshotter = "devmapper"' /etc/containerd/config.toml; then
|
||||
sudo sed -i \
|
||||
's|snapshotter = "overlayfs"|snapshotter = "devmapper"|g' \
|
||||
/etc/containerd/config.toml
|
||||
fi
|
||||
|
||||
sudo systemctl restart containerd
|
||||
|
||||
# Verify the plugin came up healthy
|
||||
local dm_status
|
||||
dm_status=$(sudo ctr plugins ls | awk '$2 ~ /^devmapper$/ { print $4 }' || true)
|
||||
[ "${dm_status}" = "ok" ] || \
|
||||
die "containerd devmapper snapshotter not healthy (status: '${dm_status}')"
|
||||
|
||||
info "devmapper snapshotter configured and healthy"
|
||||
}
|
||||
|
||||
function check_containerd_config_for_kata() {
|
||||
# check containerd config
|
||||
declare -r line1="default_runtime_name = \"kata\""
|
||||
|
||||
@@ -51,8 +51,14 @@ function install_dependencies() {
|
||||
|
||||
# Create the default containerd configuration
|
||||
sudo mkdir -p /etc/containerd
|
||||
containerd config default > sudo tee /etc/containerd/config.toml
|
||||
containerd config default | sudo tee /etc/containerd/config.toml
|
||||
sudo systemctl restart containerd
|
||||
|
||||
# Firecracker (fc-rs) uses block devices and requires the devmapper
|
||||
# snapshotter; other hypervisors work fine with the default overlayfs.
|
||||
if [ "${KATA_HYPERVISOR:-}" = "fc-rs" ]; then
|
||||
configure_devmapper_for_containerd
|
||||
fi
|
||||
}
|
||||
|
||||
function collect_artifacts() {
|
||||
|
||||
@@ -48,11 +48,21 @@ function run() {
|
||||
# bash "${stability_dir}/agent_stability_test.sh"
|
||||
}
|
||||
|
||||
function install_kata_for_stability() {
|
||||
install_kata
|
||||
|
||||
# Firecracker (fc-rs) uses block devices and requires the devmapper
|
||||
# snapshotter; other hypervisors work fine with the default overlayfs.
|
||||
if [ "${KATA_HYPERVISOR:-}" = "fc-rs" ]; then
|
||||
configure_devmapper_for_containerd
|
||||
fi
|
||||
}
|
||||
|
||||
function main() {
|
||||
action="${1:-}"
|
||||
case "${action}" in
|
||||
install-dependencies) install_dependencies ;;
|
||||
install-kata) install_kata ;;
|
||||
install-kata) install_kata_for_stability ;;
|
||||
enabling-hypervisor) enabling_hypervisor ;;
|
||||
run) run ;;
|
||||
*) >&2 die "Invalid argument" ;;
|
||||
|
||||
Reference in New Issue
Block a user