Merge pull request #12957 from Apokleos/fix-sb-api

runtime-rs: Fix sandbox-api lifecycle and CRI status handling
This commit is contained in:
Fabiano Fidêncio
2026-05-23 09:26:14 +02:00
committed by GitHub
8 changed files with 141 additions and 33 deletions

View File

@@ -28,14 +28,12 @@ jobs:
# all the tests due to a single flaky instance.
fail-fast: false
matrix:
containerd_version: ['active']
containerd_version: ['sandbox_api']
vmm: ['dragonball', 'clh-runtime-rs', 'qemu-runtime-rs']
concurrency:
group: ${{ github.workflow }}-${{ github.job }}-${{ github.event.pull_request.number || github.ref }}-sandboxapi-amd64-${{ toJSON(matrix) }}
cancel-in-progress: true
# TODO: enable me when https://github.com/containerd/containerd/issues/11640 is fixed
if: false
runs-on: ubuntu-22.04
runs-on: ubuntu-24.04
env:
CONTAINERD_VERSION: ${{ matrix.containerd_version }}
GOPATH: ${{ github.workspace }}
@@ -60,16 +58,11 @@ jobs:
env:
INSTALL_IN_GOPATH: false
- name: Read properties from versions.yaml
run: |
go_version="$(yq '.languages.golang.version' versions.yaml)"
[ -n "$go_version" ]
echo "GO_VERSION=${go_version}" >> "$GITHUB_ENV"
- name: Setup Golang version ${{ env.GO_VERSION }}
# TODO: revert to versions.yaml Go once Kata bumps to a Go version compatible with containerd 2.3
- name: Setup Golang 1.26.3 (required by containerd 2.3)
uses: actions/setup-go@7a3fe6cf4cb3a834922a1244abfce67bcef6a0c5 # v6.2.0
with:
go-version: ${{ env.GO_VERSION }}
go-version: "1.26.3"
- name: Install dependencies
run: bash tests/integration/cri-containerd/gha-run.sh install-dependencies

View File

@@ -233,6 +233,7 @@ pub struct SandboxStatus {
pub pid: u32,
pub state: String,
pub info: std::collections::HashMap<String, String>,
pub created_at: Option<std::time::SystemTime>,
}
#[derive(Debug, Clone)]

View File

@@ -581,7 +581,7 @@ impl RuntimeHandlerManager {
sandbox_id: status.sandbox_id,
pid: status.pid,
state: status.state,
created_at: None,
created_at: status.created_at,
exited_at: None,
}))
}

View File

@@ -460,7 +460,7 @@ impl ContainerManager for VirtContainerManager {
#[instrument]
async fn need_shutdown_sandbox(&self, req: &ShutdownRequest) -> bool {
req.is_now || self.containers.read().await.is_empty() || self.sid == req.container_id
req.is_now || self.sid == req.container_id
}
#[instrument]

View File

@@ -75,8 +75,9 @@ use resource::{ResourceConfig, ResourceManager};
use runtime_spec as spec;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::SystemTime;
use strum::Display;
use tokio::sync::{mpsc::Sender, Mutex, RwLock};
use tokio::sync::{mpsc::Sender, watch, Mutex, RwLock};
use tracing::instrument;
pub(crate) const VIRTCONTAINER: &str = "virt_container";
@@ -94,14 +95,27 @@ pub enum SandboxState {
Stopped,
}
impl SandboxState {
fn to_cri_state(self) -> &'static str {
match self {
SandboxState::Running => "SANDBOX_READY",
SandboxState::Init | SandboxState::Stopped => "SANDBOX_NOTREADY",
}
}
}
struct SandboxInner {
state: SandboxState,
exit_info: Option<SandboxExitInfo>,
created_at: Option<SystemTime>,
}
impl SandboxInner {
pub fn new() -> Self {
Self {
state: SandboxState::Init,
exit_info: None,
created_at: None,
}
}
}
@@ -115,6 +129,7 @@ pub struct VirtSandbox {
agent: Arc<dyn Agent>,
hypervisor: Arc<dyn Hypervisor>,
monitor: Arc<HealthCheck>,
exit_notify_tx: watch::Sender<bool>,
sandbox_config: Option<SandboxConfig>,
shm_size: u64,
factory: Option<Factory>,
@@ -130,6 +145,7 @@ impl std::fmt::Debug for VirtSandbox {
.field("agent", &"<Agent>")
.field("hypervisor", &self.hypervisor)
.field("monitor", &"<HealthCheck>")
.field("exit_notify_tx", &"<watch::Sender<bool>>")
.field("sandbox_config", &self.sandbox_config)
.field("factory", &self.factory)
.finish()
@@ -148,6 +164,7 @@ impl VirtSandbox {
) -> Result<Self> {
let config = resource_manager.config().await;
let keep_abnormal = config.runtime.keep_abnormal;
let (exit_notify_tx, _) = watch::channel(false);
Ok(Self {
sid: sid.to_string(),
msg_sender: Arc::new(Mutex::new(msg_sender)),
@@ -156,6 +173,7 @@ impl VirtSandbox {
hypervisor,
resource_manager,
monitor: Arc::new(HealthCheck::new(true, keep_abnormal)),
exit_notify_tx,
shm_size: sandbox_config.shm_size,
sandbox_config: Some(sandbox_config),
factory: Some(factory),
@@ -174,6 +192,20 @@ impl VirtSandbox {
self.hypervisor.clone()
}
async fn record_stop(&self, exit_status: u32, exited_at: std::time::SystemTime) {
let mut inner = self.inner.write().await;
if inner.state == SandboxState::Stopped {
return;
}
inner.state = SandboxState::Stopped;
inner.exit_info = Some(SandboxExitInfo {
exit_status,
exited_at: Some(exited_at),
});
let _ = self.exit_notify_tx.send(true);
}
#[instrument]
async fn prepare_for_start_sandbox(
&self,
@@ -751,6 +783,22 @@ impl Sandbox for VirtSandbox {
self.hypervisor.start_vm(10_000).await.context("start vm")?;
info!(sl!(), "start vm");
let sandbox = self.clone();
// wait for vm exit in background, and record the exit status and time when vm exited.
tokio::spawn(async move {
match sandbox.hypervisor.wait_vm().await {
Ok(exit_code) => {
sandbox
.record_stop(exit_code as u32, SystemTime::now())
.await;
}
Err(err) => {
warn!(sl!(), "failed waiting for sandbox VM exit: {:?}", err);
sandbox.record_stop(255, SystemTime::now()).await;
}
}
});
// execute pre-start hook functions, including Prestart Hooks and CreateRuntime Hooks
let (prestart_hooks, create_runtime_hooks) =
if let Some(hooks) = sandbox_config.hooks.as_ref() {
@@ -843,6 +891,7 @@ impl Sandbox for VirtSandbox {
.context("create sandbox")?;
inner.state = SandboxState::Running;
inner.created_at = Some(std::time::SystemTime::now());
// get and store guest details
self.store_guest_details()
@@ -937,41 +986,81 @@ impl Sandbox for VirtSandbox {
.await
.context("start template vm")?;
info!(sl!(), "vm started from template");
let sandbox = self.clone();
tokio::spawn(async move {
match sandbox.hypervisor.wait_vm().await {
Ok(exit_code) => {
sandbox
.record_stop(exit_code as u32, SystemTime::now())
.await;
}
Err(err) => {
warn!(sl!(), "failed waiting for sandbox VM exit: {:?}", err);
sandbox.record_stop(255, SystemTime::now()).await;
}
}
});
Ok(())
}
async fn status(&self) -> Result<SandboxStatus> {
info!(sl!(), "get sandbox status");
let inner = self.inner.read().await;
let state = inner.state.to_string();
let state = inner.state.to_cri_state().to_string();
Ok(SandboxStatus {
sandbox_id: self.sid.clone(),
pid: std::process::id(),
state,
..Default::default()
info: std::collections::HashMap::new(),
created_at: inner.created_at,
})
}
async fn wait(&self) -> Result<SandboxExitInfo> {
info!(sl!(), "wait sandbox");
let exit_code = self.hypervisor.wait_vm().await.context("wait vm")?;
Ok(SandboxExitInfo {
exit_status: exit_code as u32,
exited_at: Some(std::time::SystemTime::now()),
})
{
let inner = self.inner.read().await;
if inner.state == SandboxState::Stopped {
return Ok(inner.exit_info.clone().unwrap_or_default());
}
}
let mut exit_notify_rx = self.exit_notify_tx.subscribe();
while !*exit_notify_rx.borrow() {
exit_notify_rx
.changed()
.await
.context("wait for sandbox stop notification")?;
}
let inner = self.inner.read().await;
Ok(inner.exit_info.clone().unwrap_or_default())
}
async fn stop(&self) -> Result<()> {
let mut sandbox_inner = self.inner.write().await;
let state = {
let sandbox_inner = self.inner.read().await;
sandbox_inner.state
};
if sandbox_inner.state != SandboxState::Stopped {
info!(sl!(), "begin stop sandbox");
self.hypervisor.stop_vm().await.context("stop vm")?;
sandbox_inner.state = SandboxState::Stopped;
info!(sl!(), "sandbox stopped");
if state == SandboxState::Stopped {
return Ok(());
}
info!(sl!(), "begin stop sandbox");
if state == SandboxState::Init {
let _ = self.hypervisor.stop_vm().await;
self.record_stop(0, SystemTime::now()).await;
info!(sl!(), "sandbox stopped during Init");
return Ok(());
}
self.hypervisor.stop_vm().await.context("stop vm")?;
self.wait().await.context("wait for vm exit after stop")?;
info!(sl!(), "sandbox stopped");
Ok(())
}
@@ -1251,6 +1340,7 @@ impl Persist for VirtSandbox {
hypervisor,
resource_manager,
monitor: Arc::new(HealthCheck::new(true, keep_abnormal)),
exit_notify_tx: watch::channel(false).0,
sandbox_config: None,
shm_size: DEFAULT_SHM_SIZE,
factory: None,

View File

@@ -767,6 +767,14 @@ function get_latest_patch_release_from_a_github_project() {
| grep "${regex}" -m1
}
# GitHub Actions' setup-go often sets GOTOOLCHAIN=local, which forbids fetching a newer
# toolchain required by cloned containerd (e.g. v2.3 go.mod vs Kata's pinned Go). Use
# automatic toolchain selection only while building upstream containerd.
function export_go_toolchain_for_containerd_source_builds() {
export GOTOOLCHAIN=auto
info "GOTOOLCHAIN=auto so containerd is built with the toolchain its go.mod requires"
}
# base_version: The version to be intalled in the ${major}.${minor} format
function clone_cri_containerd() {
base_version="${1}"

View File

@@ -113,7 +113,7 @@ function create_containerd_config() {
fi
# check containerd config version
if containerd config default | grep -q "version = 3\>"; then
if containerd config default | grep -qE "^version = [34]"; then
pluginid=\"io.containerd.cri.v1.runtime\"
else
pluginid="cri"
@@ -170,6 +170,13 @@ function err_report() {
function check_daemon_setup() {
info "containerd(cri): Check daemon works with runc"
# Use podsandbox for the runc sanity check: the shim sandboxer has a known
# containerd-side bug where the OCI spec is not populated before NewBundle is
# called, so config.json is never written and containerd-shim-runc-v2 fails.
# See https://github.com/containerd/containerd/issues/11640
# This check only verifies that containerd + runc are functional before the
# real kata tests run, so the sandboxer choice doesn't matter here.
local SANDBOXER="podsandbox"
create_containerd_config "runc"
# containerd cri-integration will modify the passed in config file. Let's
@@ -268,7 +275,8 @@ function TestContainerMemoryUpdate() {
# Currently, dragonball fails at decrease memory, just test increasing memory.
# We'll re-enable it as soon as we get it to work.
# Reference: https://github.com/kata-containers/kata-containers/issues/8804
DoContainerMemoryUpdate 0
# DoContainerMemoryUpdate 0
info "TestContainerMemoryUpdate skipped for dragonball"
fi
if [[ "${KATA_HYPERVISOR}" == "qemu-runtime-rs" ]]; then
@@ -662,6 +670,8 @@ function main() {
pushd "containerd"
export_go_toolchain_for_containerd_source_builds
# Make sure the right artifacts are going to be built
sudo make clean
@@ -680,8 +690,13 @@ function main() {
info "containerd(cri): Running cri-integration"
passing_test="TestContainerStats|TestContainerRestart|TestContainerListStatsWithIdFilter|TestContainerListStatsWithIdSandboxIdFilter|TestDuplicateName|TestImageLoad|TestImageFSInfo|TestSandboxCleanRemove"
# TestContainerRestart is excluded: creating a new container in the same
# sandbox VM after the previous container has exited and been removed has
# never been supported by kata-containers (neither with the go-based nor
# the rust-based runtime). The kata VM shuts down when its last container
# is removed, so any attempt to start a new container in the same sandbox
# fails. This test exercises a use-case kata does not currently support.
passing_test="TestContainerStats|TestContainerListStatsWithIdFilter|TestContainerListStatsWithIdSandboxIdFilter|TestDuplicateName|TestImageLoad|TestImageFSInfo|TestSandboxCleanRemove"
if [[ "${KATA_HYPERVISOR}" == "clh-runtime-rs" || \
"${KATA_HYPERVISOR}" == "qemu" ]]; then

View File

@@ -315,6 +315,7 @@ externals:
version: "v1.7.25"
lts: "v1.7"
active: "v2.2"
sandbox_api: "v2.3"
critools:
description: "CLI tool for Container Runtime Interface (CRI)"