Merge pull request #12766 from fidencio/topic/kata-deploy-avoid-kata-pods-to-crash-after-containerd-restart

kata-deploy: Fix kata-deploy pods crashing if containerd restarts
This commit is contained in:
Fabiano Fidêncio
2026-04-01 18:28:16 +02:00
committed by GitHub
9 changed files with 317 additions and 201 deletions

View File

@@ -1,124 +0,0 @@
# Copyright (c) K3s contributors
#
# SPDX-License-Identifier: Apache-2.0
#
{{- /* */ -}}
# File generated by {{ .Program }}. DO NOT EDIT. Use config-v3.toml.tmpl instead.
# containerd configuration, schema version 3.
version = 3
# NOTE(review): __CONTAINERD_IMPORTS_PATH__ is a placeholder; it is expected to
# be substituted with the drop-in config glob (config-v3.toml.d/*.toml) before
# this template is installed — confirm against the harness that writes it.
imports = ["__CONTAINERD_IMPORTS_PATH__"]
root = {{ printf "%q" .NodeConfig.Containerd.Root }}
state = {{ printf "%q" .NodeConfig.Containerd.State }}
[grpc]
address = {{ deschemify .NodeConfig.Containerd.Address | printf "%q" }}
[plugins.'io.containerd.internal.v1.opt']
path = {{ printf "%q" .NodeConfig.Containerd.Opt }}
[plugins.'io.containerd.grpc.v1.cri']
stream_server_address = "127.0.0.1"
stream_server_port = "10010"
# CRI runtime behaviour, driven by the agent's node configuration.
[plugins.'io.containerd.cri.v1.runtime']
enable_selinux = {{ .NodeConfig.SELinux }}
enable_unprivileged_ports = {{ .EnableUnprivileged }}
enable_unprivileged_icmp = {{ .EnableUnprivileged }}
device_ownership_from_security_context = {{ .NonrootDevices }}
{{ if .DisableCgroup}}
disable_cgroup = true
{{ end }}
{{ if .IsRunningInUserNS }}
disable_apparmor = true
restrict_oom_score_adj = true
{{ end }}
# Snapshotter selection; snapshot annotations stay enabled only for stargz.
{{ with .NodeConfig.AgentConfig.Snapshotter }}
[plugins.'io.containerd.cri.v1.images']
snapshotter = "{{ . }}"
disable_snapshot_annotations = {{ if eq . "stargz" }}false{{else}}true{{end}}
use_local_image_pull = true
{{ end }}
{{ with .NodeConfig.AgentConfig.PauseImage }}
[plugins.'io.containerd.cri.v1.images'.pinned_images]
sandbox = "{{ . }}"
{{ end }}
{{- if or .NodeConfig.AgentConfig.CNIBinDir .NodeConfig.AgentConfig.CNIConfDir }}
[plugins.'io.containerd.cri.v1.runtime'.cni]
{{ with .NodeConfig.AgentConfig.CNIBinDir }}bin_dirs = [{{ printf "%q" . }}]{{ end }}
{{ with .NodeConfig.AgentConfig.CNIConfDir }}conf_dir = {{ printf "%q" . }}{{ end }}
{{ end }}
{{ if or .NodeConfig.Containerd.BlockIOConfig .NodeConfig.Containerd.RDTConfig }}
[plugins.'io.containerd.service.v1.tasks-service']
{{ with .NodeConfig.Containerd.BlockIOConfig }}blockio_config_file = {{ printf "%q" . }}{{ end }}
{{ with .NodeConfig.Containerd.RDTConfig }}rdt_config_file = {{ printf "%q" . }}{{ end }}
{{ end }}
{{ with .NodeConfig.DefaultRuntime }}
[plugins.'io.containerd.cri.v1.runtime'.containerd]
default_runtime_name = "{{ . }}"
{{ end }}
# Statically-declared runtime handlers.
[plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.runc]
runtime_type = "io.containerd.runc.v2"
[plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.runc.options]
SystemdCgroup = {{ .SystemdCgroup }}
[plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.runhcs-wcow-process]
runtime_type = "io.containerd.runhcs.v1"
# One additional runtime section per entry in the agent's ExtraRuntimes map.
{{ range $k, $v := .ExtraRuntimes }}
[plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.'{{ $k }}']
runtime_type = "{{$v.RuntimeType}}"
{{ with $v.BinaryName}}
[plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.'{{ $k }}'.options]
BinaryName = {{ printf "%q" . }}
SystemdCgroup = {{ $.SystemdCgroup }}
{{ end }}
{{ end }}
[plugins.'io.containerd.cri.v1.images'.registry]
config_path = {{ printf "%q" .NodeConfig.Containerd.Registry }}
{{ if .PrivateRegistryConfig }}
{{ range $k, $v := .PrivateRegistryConfig.Configs }}
{{ with $v.Auth }}
[plugins.'io.containerd.cri.v1.images'.registry.configs.'{{ $k }}'.auth]
{{ with .Username }}username = {{ printf "%q" . }}{{ end }}
{{ with .Password }}password = {{ printf "%q" . }}{{ end }}
{{ with .Auth }}auth = {{ printf "%q" . }}{{ end }}
{{ with .IdentityToken }}identitytoken = {{ printf "%q" . }}{{ end }}
{{ end }}
{{ end }}
{{ end }}
# Stargz-specific wiring, rendered only when stargz is the chosen snapshotter.
{{ if eq .NodeConfig.AgentConfig.Snapshotter "stargz" }}
{{ with .NodeConfig.AgentConfig.ImageServiceSocket }}
[plugins.'io.containerd.snapshotter.v1.stargz']
cri_keychain_image_service_path = {{ printf "%q" . }}
[plugins.'io.containerd.snapshotter.v1.stargz'.cri_keychain]
enable_keychain = true
{{ end }}
[plugins.'io.containerd.snapshotter.v1.stargz'.registry]
config_path = {{ printf "%q" .NodeConfig.Containerd.Registry }}
{{ if .PrivateRegistryConfig }}
{{ range $k, $v := .PrivateRegistryConfig.Configs }}
{{ with $v.Auth }}
[plugins.'io.containerd.snapshotter.v1.stargz'.registry.configs.'{{ $k }}'.auth]
{{ with .Username }}username = {{ printf "%q" . }}{{ end }}
{{ with .Password }}password = {{ printf "%q" . }}{{ end }}
{{ with .Auth }}auth = {{ printf "%q" . }}{{ end }}
{{ with .IdentityToken }}identitytoken = {{ printf "%q" . }}{{ end }}
{{ end }}
{{ end }}
{{ end }}
{{ end }}

View File

@@ -0,0 +1,213 @@
#!/usr/bin/env bats
#
# Copyright (c) 2026 NVIDIA Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# Kata Deploy Lifecycle Tests
#
# Validates kata-deploy behavior during DaemonSet restarts and uninstalls:
#
# 1. Artifacts present: After install, kata artifacts exist on the host,
# RuntimeClasses are created, and the node is labeled.
#
# 2. Restart resilience: Running kata pods must survive a kata-deploy
# DaemonSet restart without crashing. (Regression test for #12761)
#
# 3. Artifact cleanup: After helm uninstall, kata artifacts must be
# fully removed from the host and containerd must remain healthy.
#
# Required environment variables:
# DOCKER_REGISTRY - Container registry for kata-deploy image
# DOCKER_REPO - Repository name for kata-deploy image
# DOCKER_TAG - Image tag to test
# KATA_HYPERVISOR - Hypervisor to test (qemu, clh, etc.)
# KUBERNETES - K8s distribution (microk8s, k3s, rke2, etc.)
load "${BATS_TEST_DIRNAME}/../../common.bash"
repo_root_dir="${BATS_TEST_DIRNAME}/../../../"
load "${repo_root_dir}/tests/gha-run-k8s-common.sh"
source "${BATS_TEST_DIRNAME}/lib/helm-deploy.bash"
LIFECYCLE_POD_NAME="kata-lifecycle-test"
# Run a command on the host node's filesystem using a short-lived privileged pod.
# The host root is mounted read-only at /host inside the pod; output of the
# command is printed on stdout (captured by bats `run`).
# Usage: run_on_host "test -d /host/opt/kata && echo YES || echo NO"
#
# NOTE(review): ${cmd} is interpolated directly into a JSON string literal, so
# it must not contain double quotes or backslashes — all current callers
# respect this; confirm before adding new ones.
run_on_host() {
    local cmd="$1"
    local node_name
    # Target the first node the API server lists — assumes a single (or single
    # relevant) node cluster; TODO confirm for multi-node runs.
    node_name=$(kubectl get nodes --no-headers -o custom-columns=NAME:.metadata.name | head -1)
    # Randomized name so repeated invocations within a test don't collide.
    local pod_name="host-exec-${RANDOM}"
    # --rm -i makes kubectl block until the pod exits, relay its output, and
    # delete it afterwards. The overrides pin the pod to the chosen node,
    # tolerate any taint, cap its lifetime at 300s, run it privileged, and
    # mount the host root filesystem read-only at /host.
    kubectl run "${pod_name}" \
        --image=quay.io/kata-containers/alpine-bash-curl:latest \
        --restart=Never --rm -i \
        --overrides="{
        \"spec\": {
            \"nodeName\": \"${node_name}\",
            \"activeDeadlineSeconds\": 300,
            \"tolerations\": [{\"operator\": \"Exists\"}],
            \"containers\": [{
                \"name\": \"exec\",
                \"image\": \"quay.io/kata-containers/alpine-bash-curl:latest\",
                \"imagePullPolicy\": \"IfNotPresent\",
                \"command\": [\"sh\", \"-c\", \"${cmd}\"],
                \"securityContext\": {\"privileged\": true},
                \"volumeMounts\": [{\"name\": \"host\", \"mountPath\": \"/host\", \"readOnly\": true}]
            }],
            \"volumes\": [{\"name\": \"host\", \"hostPath\": {\"path\": \"/\"}}]
        }
    }"
}
# One-time suite setup: make sure helm is available, announce the test
# parameters on the bats diagnostic stream (fd 3), then install kata-deploy.
setup_file() {
    ensure_helm

    {
        echo "# Image: ${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}"
        echo "# Hypervisor: ${KATA_HYPERVISOR}"
        echo "# K8s distribution: ${KUBERNETES}"
        echo "# Deploying kata-deploy..."
    } >&3

    deploy_kata
    echo "# kata-deploy deployed successfully" >&3
}
@test "Kata artifacts are present on host after install" {
    echo "# Checking kata artifacts on host..." >&3

    # The install directory must exist on the node's filesystem.
    run run_on_host "test -d /host/opt/kata && echo PRESENT || echo MISSING"
    echo "# /opt/kata directory: ${output}" >&3
    [[ "${output}" =~ PRESENT ]]

    # The shim binary may live in either of the two known install layouts
    # (bin/ or runtime-rs/bin/); accept either.
    run run_on_host "test -f /host/opt/kata/bin/containerd-shim-kata-v2 && echo FOUND || (test -f /host/opt/kata/runtime-rs/bin/containerd-shim-kata-v2 && echo FOUND || echo MISSING)"
    echo "# containerd-shim-kata-v2: ${output}" >&3
    [[ "${output}" =~ FOUND ]]

    # RuntimeClasses must exist (filter out AKS-managed ones)
    local kata_rc_total
    kata_rc_total=$(kubectl get runtimeclasses --no-headers 2>/dev/null | grep -v "kata-mshv-vm-isolation" | grep -c "kata" || true)
    echo "# Kata RuntimeClasses: ${kata_rc_total}" >&3
    (( kata_rc_total > 0 ))

    # Node must have the kata-runtime label
    local runtime_label
    runtime_label=$(kubectl get nodes -o jsonpath='{.items[0].metadata.labels.katacontainers\.io/kata-runtime}')
    echo "# Node label katacontainers.io/kata-runtime: ${runtime_label}" >&3
    [[ "${runtime_label}" == "true" ]]
}
# Regression test for #12761: a kata-deploy DaemonSet restart must not be
# mistaken for an uninstall, which previously tore down the runtime under
# running kata pods.
@test "DaemonSet restart does not crash running kata pods" {
    # Create a long-running kata pod
    # (heredoc body is unindented so the plain EOF terminator matches)
    cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: ${LIFECYCLE_POD_NAME}
spec:
  runtimeClassName: kata-${KATA_HYPERVISOR}
  restartPolicy: Always
  nodeSelector:
    katacontainers.io/kata-runtime: "true"
  containers:
  - name: test
    image: quay.io/kata-containers/alpine-bash-curl:latest
    imagePullPolicy: IfNotPresent
    command: ["sleep", "infinity"]
EOF
    echo "# Waiting for kata pod to be running..." >&3
    kubectl wait --for=condition=Ready "pod/${LIFECYCLE_POD_NAME}" --timeout=120s

    # Record pod identity before the DaemonSet restart.
    # The UID changes only if the pod object is deleted and recreated; the
    # restartCount changes only if the container inside it crashed. Together
    # they detect both failure modes.
    local pod_uid_before
    pod_uid_before=$(kubectl get pod "${LIFECYCLE_POD_NAME}" -o jsonpath='{.metadata.uid}')
    local restart_count_before
    restart_count_before=$(kubectl get pod "${LIFECYCLE_POD_NAME}" -o jsonpath='{.status.containerStatuses[0].restartCount}')
    echo "# Pod UID before: ${pod_uid_before}, restarts: ${restart_count_before}" >&3

    # Trigger a DaemonSet restart — this simulates what happens when a user
    # changes a label, updates a config value, or does a rolling update.
    # NOTE(review): assumes the DaemonSet is named exactly "kata-deploy"
    # (i.e. no multiInstallSuffix) — confirm against the helm values used.
    echo "# Triggering kata-deploy DaemonSet restart..." >&3
    kubectl -n "${HELM_NAMESPACE}" rollout restart daemonset/kata-deploy
    echo "# Waiting for DaemonSet rollout to complete..." >&3
    kubectl -n "${HELM_NAMESPACE}" rollout status daemonset/kata-deploy --timeout=300s

    # On k3s/rke2 the new kata-deploy pod restarts the k3s service as
    # part of install, which causes a brief API server outage. Wait for
    # the node to become ready before querying pod status.
    kubectl wait nodes --timeout=120s --all --for condition=Ready=True
    echo "# Node is ready after DaemonSet rollout" >&3

    # The kata pod must still be Running with the same UID and no extra restarts.
    # Retry kubectl through any residual API unavailability.
    # Up to 30 attempts, 2s apart (~60s worst case).
    local pod_phase=""
    local retries=0
    while [[ ${retries} -lt 30 ]]; do
        pod_phase=$(kubectl get pod "${LIFECYCLE_POD_NAME}" -o jsonpath='{.status.phase}' 2>/dev/null) && break
        retries=$((retries + 1))
        sleep 2
    done
    echo "# Pod phase after restart: ${pod_phase}" >&3
    [[ "${pod_phase}" == "Running" ]]

    local pod_uid_after
    pod_uid_after=$(kubectl get pod "${LIFECYCLE_POD_NAME}" -o jsonpath='{.metadata.uid}')
    echo "# Pod UID after: ${pod_uid_after}" >&3
    [[ "${pod_uid_before}" == "${pod_uid_after}" ]]

    local restart_count_after
    restart_count_after=$(kubectl get pod "${LIFECYCLE_POD_NAME}" -o jsonpath='{.status.containerStatuses[0].restartCount}')
    echo "# Restart count after: ${restart_count_after}" >&3
    [[ "${restart_count_before}" == "${restart_count_after}" ]]

    echo "# SUCCESS: Kata pod survived DaemonSet restart without crashing" >&3
}
@test "Artifacts are fully cleaned up after uninstall" {
    echo "# Uninstalling kata-deploy..." >&3
    uninstall_kata
    echo "# Uninstall complete, verifying cleanup..." >&3

    # Wait for node to recover — containerd restart during cleanup may
    # cause brief unavailability (especially on k3s/rke2).
    kubectl wait nodes --timeout=120s --all --for condition=Ready=True

    # RuntimeClasses must be gone (filter out AKS-managed ones)
    local remaining_rcs
    remaining_rcs=$(kubectl get runtimeclasses --no-headers 2>/dev/null | grep -v "kata-mshv-vm-isolation" | grep -c "kata" || true)
    echo "# Kata RuntimeClasses remaining: ${remaining_rcs}" >&3
    (( remaining_rcs == 0 ))

    # Node label must be removed
    local node_label
    node_label=$(kubectl get nodes -o jsonpath='{.items[0].metadata.labels.katacontainers\.io/kata-runtime}' 2>/dev/null || echo "")
    echo "# Node label after uninstall: '${node_label}'" >&3
    [[ -z "${node_label}" ]]

    # Kata artifacts must be removed from the host filesystem
    echo "# Checking host filesystem for leftover artifacts..." >&3
    run run_on_host "test -d /host/opt/kata && echo EXISTS || echo REMOVED"
    echo "# /opt/kata: ${output}" >&3
    [[ "${output}" =~ REMOVED ]]

    # Containerd must still be healthy and reporting a valid version
    local runtime_version
    runtime_version=$(kubectl get nodes --no-headers -o custom-columns=CONTAINER_RUNTIME:.status.nodeInfo.containerRuntimeVersion)
    echo "# Container runtime version: ${runtime_version}" >&3
    [[ "${runtime_version}" != *"Unknown"* ]]

    echo "# SUCCESS: All kata artifacts cleaned up, containerd healthy" >&3
}
# Per-test teardown: after the restart test, drop the lifecycle pod so the
# following tests start from a clean slate (best effort, no waiting).
teardown() {
    case "${BATS_TEST_NAME}" in
        *restart*)
            kubectl delete pod "${LIFECYCLE_POD_NAME}" --ignore-not-found=true --wait=false 2>/dev/null || true
            ;;
    esac
}
# Suite teardown: best-effort removal of the test pod and the kata-deploy
# installation. Both steps tolerate already-gone resources (a test may have
# cleaned them up itself) and never fail the suite.
teardown_file() {
    kubectl delete pod "${LIFECYCLE_POD_NAME}" --ignore-not-found=true --wait=false 2>/dev/null || true
    uninstall_kata 2>/dev/null || true
}

View File

@@ -20,6 +20,7 @@ else
KATA_DEPLOY_TEST_UNION=( \
"kata-deploy.bats" \
"kata-deploy-custom-runtimes.bats" \
"kata-deploy-lifecycle.bats" \
)
fi

View File

@@ -296,36 +296,6 @@ function deploy_k0s() {
sudo chown "${USER}":"${USER}" ~/.kube/config
}
# Ensure the distro's rendered containerd config (v3) imports the drop-in
# directory. When it already does, this is a no-op; otherwise the full V3
# template (tests/containerd-config-v3.tmpl) is written with the drop-in glob
# substituted in, and the service is restarted so the config is re-rendered.
# Args: containerd_dir (e.g. /var/lib/rancher/k3s/agent/etc/containerd),
#       service_name   (e.g. k3s or rke2-server).
function _setup_containerd_v3_template_if_needed() {
    local containerd_dir="$1"
    local service_name="$2"

    local template_file="${tests_dir}/containerd-config-v3.tmpl"
    local rendered_v3="${containerd_dir}/config-v3.toml"
    local dropin_dir="${containerd_dir}/config-v3.toml.d"
    local imports_path="${dropin_dir}/*.toml"

    # Already importing the drop-in dir? Nothing to do.
    if sudo test -f "${rendered_v3}" && sudo grep -q 'config-v3\.toml\.d' "${rendered_v3}" 2>/dev/null; then
        return 0
    fi

    [[ -f "${template_file}" ]] || {
        echo "Template not found: ${template_file}" >&2
        return 1
    }

    sudo mkdir -p "${dropin_dir}"
    # Substitute the placeholder with the drop-in glob and install the template.
    sed "s|__CONTAINERD_IMPORTS_PATH__|${imports_path}|g" "${template_file}" \
        | sudo tee "${containerd_dir}/config-v3.toml.tmpl" > /dev/null
    sudo systemctl restart "${service_name}"
}
# k3s flavor: containerd agent dir under /var/lib/rancher/k3s, service "k3s".
function setup_k3s_containerd_v3_template_if_needed() {
    _setup_containerd_v3_template_if_needed "/var/lib/rancher/k3s/agent/etc/containerd" "k3s"
}
# rke2 flavor: containerd agent dir under /var/lib/rancher/rke2, service
# "rke2-server".
function setup_rke2_containerd_v3_template_if_needed() {
    _setup_containerd_v3_template_if_needed "/var/lib/rancher/rke2/agent/etc/containerd" "rke2-server"
}
function deploy_k3s() {
# Set CRI runtime-request-timeout to 600s (same as kubeadm) for CoCo and long-running create requests.
curl -sfL https://get.k3s.io | sh -s - --write-kubeconfig-mode 644 --kubelet-arg runtime-request-timeout=600s
@@ -333,9 +303,6 @@ function deploy_k3s() {
# This is an arbitrary value that came up from local tests
sleep 120s
# If rendered config does not import the drop-in dir, write full V3 template so kata-deploy can use it.
setup_k3s_containerd_v3_template_if_needed
# Download the kubectl binary into /usr/bin and remove /usr/local/bin/kubectl
#
# We need to do this to avoid hitting issues like:
@@ -405,9 +372,6 @@ function deploy_rke2() {
# This is an arbitrary value that came up from local tests
sleep 120s
# If rendered config does not import the drop-in dir, write full V3 template so kata-deploy can use it.
setup_rke2_containerd_v3_template_if_needed
# Link the kubectl binary into /usr/bin
sudo ln -sf /var/lib/rancher/rke2/bin/kubectl /usr/local/bin/kubectl

View File

@@ -155,6 +155,7 @@ pub struct Config {
pub containerd_conf_file: String,
pub containerd_conf_file_backup: String,
pub containerd_drop_in_conf_file: String,
pub daemonset_name: String,
pub custom_runtimes_enabled: bool,
pub custom_runtimes: Vec<CustomRuntime>,
}
@@ -169,6 +170,12 @@ impl Config {
return Err(anyhow::anyhow!("NODE_NAME must not be empty"));
}
let daemonset_name = env::var("DAEMONSET_NAME")
.ok()
.map(|v| v.trim().to_string())
.filter(|v| !v.is_empty())
.unwrap_or_else(|| "kata-deploy".to_string());
let debug = env::var("DEBUG").unwrap_or_else(|_| "false".to_string()) == "true";
// Parse shims - only use arch-specific variable
@@ -293,6 +300,7 @@ impl Config {
containerd_conf_file,
containerd_conf_file_backup,
containerd_drop_in_conf_file,
daemonset_name,
custom_runtimes_enabled,
custom_runtimes,
};

View File

@@ -94,30 +94,41 @@ impl K8sClient {
Ok(())
}
pub async fn count_kata_deploy_daemonsets(&self) -> Result<usize> {
/// Returns whether a non-terminating DaemonSet with this exact name
/// exists in the current namespace. Used to decide whether this pod is
/// being restarted (true) or uninstalled (false).
pub async fn own_daemonset_exists(&self, daemonset_name: &str) -> Result<bool> {
use k8s_openapi::api::apps::v1::DaemonSet;
use kube::api::Api;
let ds_api: Api<DaemonSet> = Api::default_namespaced(self.client.clone());
match ds_api.get_opt(daemonset_name).await? {
Some(ds) => Ok(ds.metadata.deletion_timestamp.is_none()),
None => Ok(false),
}
}
/// Returns how many non-terminating DaemonSets across all namespaces
/// have a name containing "kata-deploy". Used to decide whether shared
/// node-level resources (node label, CRI restart) should be cleaned up:
/// they are only safe to remove when no kata-deploy instance remains
/// on the cluster.
pub async fn count_any_kata_deploy_daemonsets(&self) -> Result<usize> {
use k8s_openapi::api::apps::v1::DaemonSet;
use kube::api::{Api, ListParams};
let ds_api: Api<DaemonSet> = Api::default_namespaced(self.client.clone());
let lp = ListParams::default();
let daemonsets = ds_api.list(&lp).await?;
let ds_api: Api<DaemonSet> = Api::all(self.client.clone());
let daemonsets = ds_api.list(&ListParams::default()).await?;
// Note: We use client-side filtering here because Kubernetes field selectors
// don't support "contains" operations - they only support exact matches and comparisons.
// Filtering by name containing "kata-deploy" requires client-side processing.
// Exclude DaemonSets that are terminating (have deletion_timestamp) so that when our
// DaemonSet pod runs cleanup on SIGTERM during uninstall, we count 0 and remove the label.
let count = daemonsets
.iter()
.filter(|ds| {
if ds.metadata.deletion_timestamp.is_some() {
return false;
}
ds.metadata
.name
.as_ref()
.map(|n| n.contains("kata-deploy"))
.unwrap_or(false)
ds.metadata.deletion_timestamp.is_none()
&& ds
.metadata
.name
.as_ref()
.is_some_and(|n| n.contains("kata-deploy"))
})
.count();
@@ -584,9 +595,14 @@ pub async fn label_node(
client.label_node(label_key, label_value, overwrite).await
}
pub async fn count_kata_deploy_daemonsets(config: &Config) -> Result<usize> {
pub async fn own_daemonset_exists(config: &Config) -> Result<bool> {
let client = K8sClient::new(&config.node_name).await?;
client.count_kata_deploy_daemonsets().await
client.own_daemonset_exists(&config.daemonset_name).await
}
pub async fn count_any_kata_deploy_daemonsets(config: &Config) -> Result<usize> {
let client = K8sClient::new(&config.node_name).await?;
client.count_any_kata_deploy_daemonsets().await
}
pub async fn crd_exists(config: &Config, crd_name: &str) -> Result<bool> {

View File

@@ -236,19 +236,29 @@ async fn install(config: &config::Config, runtime: &str) -> Result<()> {
async fn cleanup(config: &config::Config, runtime: &str) -> Result<()> {
info!("Cleaning up Kata Containers");
info!("Counting kata-deploy daemonsets");
let kata_deploy_installations = k8s::count_kata_deploy_daemonsets(config).await?;
// Step 1: Check if THIS pod's owning DaemonSet still exists.
// If it does, this is a pod restart (rolling update, label change, etc.),
// not an uninstall — skip everything so running kata pods are not disrupted.
info!(
"Found {} kata-deploy daemonset(s)",
kata_deploy_installations
"Checking if DaemonSet '{}' still exists",
config.daemonset_name
);
if kata_deploy_installations == 0 {
info!("Removing kata-runtime label from node");
k8s::label_node(config, "katacontainers.io/kata-runtime", None, false).await?;
info!("Successfully removed kata-runtime label");
if k8s::own_daemonset_exists(config).await? {
info!(
"DaemonSet '{}' still exists, \
skipping all cleanup to avoid disrupting running kata pods",
config.daemonset_name
);
return Ok(());
}
// Step 2: Our DaemonSet is gone (uninstall). Perform instance-specific
// cleanup: snapshotters, CRI config, and artifacts for this instance.
info!(
"DaemonSet '{}' not found, proceeding with instance cleanup",
config.daemonset_name
);
match config.experimental_setup_snapshotter.as_ref() {
Some(snapshotters) => {
for snapshotter in snapshotters {
@@ -270,6 +280,25 @@ async fn cleanup(config: &config::Config, runtime: &str) -> Result<()> {
artifacts::remove_artifacts(config).await?;
info!("Successfully removed kata artifacts");
// Step 3: Check if ANY other kata-deploy DaemonSets still exist.
// Shared resources (node label, CRI restart) are only safe to touch
// when no other kata-deploy instance remains.
let other_ds_count = k8s::count_any_kata_deploy_daemonsets(config).await?;
if other_ds_count > 0 {
info!(
"{} other kata-deploy DaemonSet(s) still exist, \
skipping node label removal and CRI restart",
other_ds_count
);
return Ok(());
}
info!("No other kata-deploy DaemonSets found, performing full shared cleanup");
info!("Removing kata-runtime label from node");
k8s::label_node(config, "katacontainers.io/kata-runtime", None, false).await?;
info!("Successfully removed kata-runtime label");
// Restart the CRI runtime last. On k3s/rke2 this restarts the entire
// server process, which kills this (terminating) pod. By doing it after
// all other cleanup, we ensure config and artifacts are already gone.

View File

@@ -51,18 +51,19 @@ pub async fn get_container_runtime(config: &Config) -> Result<String> {
return Ok("crio".to_string());
}
if runtime_version.contains("containerd") && runtime_version.contains("-k3s") {
// Check systemd services (ignore errors - service might not exist)
let _ = utils::host_systemctl(&["is-active", "--quiet", "rke2-agent"]);
if utils::host_systemctl(&["is-active", "--quiet", "rke2-agent"]).is_ok() {
return Ok("rke2-agent".to_string());
}
if utils::host_systemctl(&["is-active", "--quiet", "rke2-server"]).is_ok() {
return Ok("rke2-server".to_string());
}
if utils::host_systemctl(&["is-active", "--quiet", "k3s-agent"]).is_ok() {
return Ok("k3s-agent".to_string());
}
// Detect k3s/rke2 via systemd services rather than the containerd version
// string, which no longer reliably contains "k3s" in newer releases
// (e.g. "containerd://2.2.2-bd1.34").
if utils::host_systemctl(&["is-active", "--quiet", "rke2-agent"]).is_ok() {
return Ok("rke2-agent".to_string());
}
if utils::host_systemctl(&["is-active", "--quiet", "rke2-server"]).is_ok() {
return Ok("rke2-server".to_string());
}
if utils::host_systemctl(&["is-active", "--quiet", "k3s-agent"]).is_ok() {
return Ok("k3s-agent".to_string());
}
if utils::host_systemctl(&["is-active", "--quiet", "k3s"]).is_ok() {
return Ok("k3s".to_string());
}
@@ -83,7 +84,7 @@ pub async fn get_container_runtime(config: &Config) -> Result<String> {
Ok(runtime)
}
/// Returns true if containerRuntimeVersion (e.g. "containerd://2.1.5-k3s1") indicates
/// Returns true if containerRuntimeVersion (e.g. "containerd://2.1.5-k3s1", "containerd://2.2.2-bd1.34") indicates
/// containerd 2.x or newer, false for 1.x or unparseable. Used for drop-in support
/// and for K3s/RKE2 template selection (config-v3.toml.tmpl vs config.toml.tmpl).
pub fn containerd_version_is_2_or_newer(runtime_version: &str) -> bool {
@@ -191,6 +192,7 @@ mod tests {
#[case("containerd://2.0.0", true)]
#[case("containerd://2.1.5", true)]
#[case("containerd://2.1.5-k3s1", true)]
#[case("containerd://2.2.2-bd1.34", true)]
#[case("containerd://2.2.0", true)]
#[case("containerd://2.3.1", true)]
#[case("containerd://2.0.0-rc.1", true)]

View File

@@ -143,6 +143,13 @@ spec:
valueFrom:
fieldRef:
fieldPath: spec.nodeName
{{- if .Values.env.multiInstallSuffix }}
- name: DAEMONSET_NAME
value: {{ printf "%s-%s" .Chart.Name .Values.env.multiInstallSuffix | quote }}
{{- else }}
- name: DAEMONSET_NAME
value: {{ .Chart.Name | quote }}
{{- end }}
- name: DEBUG
value: {{ include "kata-deploy.getDebug" . | quote }}
{{- $shimsAmd64 := include "kata-deploy.getEnabledShimsForArch" (dict "root" . "arch" "amd64") | trim -}}