diff --git a/tests/containerd-config-v3.tmpl b/tests/containerd-config-v3.tmpl deleted file mode 100644 index d7e94c9d31..0000000000 --- a/tests/containerd-config-v3.tmpl +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) K3s contributors -# -# SPDX-License-Identifier: Apache-2.0 -# - -{{- /* */ -}} -# File generated by {{ .Program }}. DO NOT EDIT. Use config-v3.toml.tmpl instead. -version = 3 -imports = ["__CONTAINERD_IMPORTS_PATH__"] -root = {{ printf "%q" .NodeConfig.Containerd.Root }} -state = {{ printf "%q" .NodeConfig.Containerd.State }} - -[grpc] - address = {{ deschemify .NodeConfig.Containerd.Address | printf "%q" }} - -[plugins.'io.containerd.internal.v1.opt'] - path = {{ printf "%q" .NodeConfig.Containerd.Opt }} - -[plugins.'io.containerd.grpc.v1.cri'] - stream_server_address = "127.0.0.1" - stream_server_port = "10010" - -[plugins.'io.containerd.cri.v1.runtime'] - enable_selinux = {{ .NodeConfig.SELinux }} - enable_unprivileged_ports = {{ .EnableUnprivileged }} - enable_unprivileged_icmp = {{ .EnableUnprivileged }} - device_ownership_from_security_context = {{ .NonrootDevices }} - -{{ if .DisableCgroup}} - disable_cgroup = true -{{ end }} - -{{ if .IsRunningInUserNS }} - disable_apparmor = true - restrict_oom_score_adj = true -{{ end }} - -{{ with .NodeConfig.AgentConfig.Snapshotter }} -[plugins.'io.containerd.cri.v1.images'] - snapshotter = "{{ . }}" - disable_snapshot_annotations = {{ if eq . "stargz" }}false{{else}}true{{end}} - use_local_image_pull = true -{{ end }} - -{{ with .NodeConfig.AgentConfig.PauseImage }} -[plugins.'io.containerd.cri.v1.images'.pinned_images] - sandbox = "{{ . }}" -{{ end }} - -{{- if or .NodeConfig.AgentConfig.CNIBinDir .NodeConfig.AgentConfig.CNIConfDir }} -[plugins.'io.containerd.cri.v1.runtime'.cni] - {{ with .NodeConfig.AgentConfig.CNIBinDir }}bin_dirs = [{{ printf "%q" . }}]{{ end }} - {{ with .NodeConfig.AgentConfig.CNIConfDir }}conf_dir = {{ printf "%q" . }}{{ end }} -{{ end }} - -{{ if or .NodeConfig.Containerd.BlockIOConfig .NodeConfig.Containerd.RDTConfig }} -[plugins.'io.containerd.service.v1.tasks-service'] - {{ with .NodeConfig.Containerd.BlockIOConfig }}blockio_config_file = {{ printf "%q" . }}{{ end }} - {{ with .NodeConfig.Containerd.RDTConfig }}rdt_config_file = {{ printf "%q" . }}{{ end }} -{{ end }} - -{{ with .NodeConfig.DefaultRuntime }} -[plugins.'io.containerd.cri.v1.runtime'.containerd] - default_runtime_name = "{{ . }}" -{{ end }} - -[plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.runc] - runtime_type = "io.containerd.runc.v2" - -[plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.runc.options] - SystemdCgroup = {{ .SystemdCgroup }} - -[plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.runhcs-wcow-process] - runtime_type = "io.containerd.runhcs.v1" - -{{ range $k, $v := .ExtraRuntimes }} -[plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.'{{ $k }}'] - runtime_type = "{{$v.RuntimeType}}" -{{ with $v.BinaryName}} -[plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.'{{ $k }}'.options] - BinaryName = {{ printf "%q" . }} - SystemdCgroup = {{ $.SystemdCgroup }} -{{ end }} -{{ end }} - -[plugins.'io.containerd.cri.v1.images'.registry] - config_path = {{ printf "%q" .NodeConfig.Containerd.Registry }} - -{{ if .PrivateRegistryConfig }} -{{ range $k, $v := .PrivateRegistryConfig.Configs }} -{{ with $v.Auth }} -[plugins.'io.containerd.cri.v1.images'.registry.configs.'{{ $k }}'.auth] - {{ with .Username }}username = {{ printf "%q" . }}{{ end }} - {{ with .Password }}password = {{ printf "%q" . }}{{ end }} - {{ with .Auth }}auth = {{ printf "%q" . }}{{ end }} - {{ with .IdentityToken }}identitytoken = {{ printf "%q" . }}{{ end }} -{{ end }} -{{ end }} -{{ end }} - -{{ if eq .NodeConfig.AgentConfig.Snapshotter "stargz" }} -{{ with .NodeConfig.AgentConfig.ImageServiceSocket }} -[plugins.'io.containerd.snapshotter.v1.stargz'] - cri_keychain_image_service_path = {{ printf "%q" . }} - -[plugins.'io.containerd.snapshotter.v1.stargz'.cri_keychain] - enable_keychain = true -{{ end }} - -[plugins.'io.containerd.snapshotter.v1.stargz'.registry] - config_path = {{ printf "%q" .NodeConfig.Containerd.Registry }} - -{{ if .PrivateRegistryConfig }} -{{ range $k, $v := .PrivateRegistryConfig.Configs }} -{{ with $v.Auth }} -[plugins.'io.containerd.snapshotter.v1.stargz'.registry.configs.'{{ $k }}'.auth] - {{ with .Username }}username = {{ printf "%q" . }}{{ end }} - {{ with .Password }}password = {{ printf "%q" . }}{{ end }} - {{ with .Auth }}auth = {{ printf "%q" . }}{{ end }} - {{ with .IdentityToken }}identitytoken = {{ printf "%q" . }}{{ end }} -{{ end }} -{{ end }} -{{ end }} -{{ end }} diff --git a/tests/functional/kata-deploy/kata-deploy-lifecycle.bats b/tests/functional/kata-deploy/kata-deploy-lifecycle.bats new file mode 100644 index 0000000000..1c883b2c4a --- /dev/null +++ b/tests/functional/kata-deploy/kata-deploy-lifecycle.bats @@ -0,0 +1,213 @@ +#!/usr/bin/env bats +# +# Copyright (c) 2026 NVIDIA Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# Kata Deploy Lifecycle Tests +# +# Validates kata-deploy behavior during DaemonSet restarts and uninstalls: +# +# 1. Artifacts present: After install, kata artifacts exist on the host, +# RuntimeClasses are created, and the node is labeled. +# +# 2. Restart resilience: Running kata pods must survive a kata-deploy +# DaemonSet restart without crashing. (Regression test for #12761) +# +# 3. Artifact cleanup: After helm uninstall, kata artifacts must be +# fully removed from the host and containerd must remain healthy. +# +# Required environment variables: +# DOCKER_REGISTRY - Container registry for kata-deploy image +# DOCKER_REPO - Repository name for kata-deploy image +# DOCKER_TAG - Image tag to test +# KATA_HYPERVISOR - Hypervisor to test (qemu, clh, etc.) +# KUBERNETES - K8s distribution (microk8s, k3s, rke2, etc.) + +load "${BATS_TEST_DIRNAME}/../../common.bash" +repo_root_dir="${BATS_TEST_DIRNAME}/../../../" +load "${repo_root_dir}/tests/gha-run-k8s-common.sh" + +source "${BATS_TEST_DIRNAME}/lib/helm-deploy.bash" + +LIFECYCLE_POD_NAME="kata-lifecycle-test" + +# Run a command on the host node's filesystem using a short-lived privileged pod. +# The host root is mounted at /host inside the pod. +# Usage: run_on_host "test -d /host/opt/kata && echo YES || echo NO" +run_on_host() { + local cmd="$1" + local node_name + node_name=$(kubectl get nodes --no-headers -o custom-columns=NAME:.metadata.name | head -1) + local pod_name="host-exec-${RANDOM}" + + kubectl run "${pod_name}" \ + --image=quay.io/kata-containers/alpine-bash-curl:latest \ + --restart=Never --rm -i \ + --overrides="{ + \"spec\": { + \"nodeName\": \"${node_name}\", + \"activeDeadlineSeconds\": 300, + \"tolerations\": [{\"operator\": \"Exists\"}], + \"containers\": [{ + \"name\": \"exec\", + \"image\": \"quay.io/kata-containers/alpine-bash-curl:latest\", + \"imagePullPolicy\": \"IfNotPresent\", + \"command\": [\"sh\", \"-c\", \"${cmd}\"], + \"securityContext\": {\"privileged\": true}, + \"volumeMounts\": [{\"name\": \"host\", \"mountPath\": \"/host\", \"readOnly\": true}] + }], + \"volumes\": [{\"name\": \"host\", \"hostPath\": {\"path\": \"/\"}}] + } + }" +} + +setup_file() { + ensure_helm + + echo "# Image: ${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}" >&3 + echo "# Hypervisor: ${KATA_HYPERVISOR}" >&3 + echo "# K8s distribution: ${KUBERNETES}" >&3 + echo "# Deploying kata-deploy..." >&3 + deploy_kata + echo "# kata-deploy deployed successfully" >&3 +} + +@test "Kata artifacts are present on host after install" { + echo "# Checking kata artifacts on host..." >&3 + + run run_on_host "test -d /host/opt/kata && echo PRESENT || echo MISSING" + echo "# /opt/kata directory: ${output}" >&3 + [[ "${output}" == *"PRESENT"* ]] + + run run_on_host "test -f /host/opt/kata/bin/containerd-shim-kata-v2 && echo FOUND || (test -f /host/opt/kata/runtime-rs/bin/containerd-shim-kata-v2 && echo FOUND || echo MISSING)" + echo "# containerd-shim-kata-v2: ${output}" >&3 + [[ "${output}" == *"FOUND"* ]] + + # RuntimeClasses must exist (filter out AKS-managed ones) + local rc_count + rc_count=$(kubectl get runtimeclasses --no-headers 2>/dev/null | grep -v "kata-mshv-vm-isolation" | grep -c "kata" || true) + echo "# Kata RuntimeClasses: ${rc_count}" >&3 + [[ ${rc_count} -gt 0 ]] + + # Node must have the kata-runtime label + local label + label=$(kubectl get nodes -o jsonpath='{.items[0].metadata.labels.katacontainers\.io/kata-runtime}') + echo "# Node label katacontainers.io/kata-runtime: ${label}" >&3 + [[ "${label}" == "true" ]] +} + +@test "DaemonSet restart does not crash running kata pods" { + # Create a long-running kata pod + cat <&3 + kubectl wait --for=condition=Ready "pod/${LIFECYCLE_POD_NAME}" --timeout=120s + + # Record pod identity before the DaemonSet restart + local pod_uid_before + pod_uid_before=$(kubectl get pod "${LIFECYCLE_POD_NAME}" -o jsonpath='{.metadata.uid}') + local restart_count_before + restart_count_before=$(kubectl get pod "${LIFECYCLE_POD_NAME}" -o jsonpath='{.status.containerStatuses[0].restartCount}') + echo "# Pod UID before: ${pod_uid_before}, restarts: ${restart_count_before}" >&3 + + # Trigger a DaemonSet restart — this simulates what happens when a user + # changes a label, updates a config value, or does a rolling update. + echo "# Triggering kata-deploy DaemonSet restart..." >&3 + kubectl -n "${HELM_NAMESPACE}" rollout restart daemonset/kata-deploy + + echo "# Waiting for DaemonSet rollout to complete..." >&3 + kubectl -n "${HELM_NAMESPACE}" rollout status daemonset/kata-deploy --timeout=300s + + # On k3s/rke2 the new kata-deploy pod restarts the k3s service as + # part of install, which causes a brief API server outage. Wait for + # the node to become ready before querying pod status. + kubectl wait nodes --timeout=120s --all --for condition=Ready=True + echo "# Node is ready after DaemonSet rollout" >&3 + + # The kata pod must still be Running with the same UID and no extra restarts. + # Retry kubectl through any residual API unavailability. + local pod_phase="" + local retries=0 + while [[ ${retries} -lt 30 ]]; do + pod_phase=$(kubectl get pod "${LIFECYCLE_POD_NAME}" -o jsonpath='{.status.phase}' 2>/dev/null) && break + retries=$((retries + 1)) + sleep 2 + done + echo "# Pod phase after restart: ${pod_phase}" >&3 + [[ "${pod_phase}" == "Running" ]] + + local pod_uid_after + pod_uid_after=$(kubectl get pod "${LIFECYCLE_POD_NAME}" -o jsonpath='{.metadata.uid}') + echo "# Pod UID after: ${pod_uid_after}" >&3 + [[ "${pod_uid_before}" == "${pod_uid_after}" ]] + + local restart_count_after + restart_count_after=$(kubectl get pod "${LIFECYCLE_POD_NAME}" -o jsonpath='{.status.containerStatuses[0].restartCount}') + echo "# Restart count after: ${restart_count_after}" >&3 + [[ "${restart_count_before}" == "${restart_count_after}" ]] + + echo "# SUCCESS: Kata pod survived DaemonSet restart without crashing" >&3 +} + +@test "Artifacts are fully cleaned up after uninstall" { + echo "# Uninstalling kata-deploy..." >&3 + uninstall_kata + echo "# Uninstall complete, verifying cleanup..." >&3 + + # Wait for node to recover — containerd restart during cleanup may + # cause brief unavailability (especially on k3s/rke2). + kubectl wait nodes --timeout=120s --all --for condition=Ready=True + + # RuntimeClasses must be gone (filter out AKS-managed ones) + local rc_count + rc_count=$(kubectl get runtimeclasses --no-headers 2>/dev/null | grep -v "kata-mshv-vm-isolation" | grep -c "kata" || true) + echo "# Kata RuntimeClasses remaining: ${rc_count}" >&3 + [[ ${rc_count} -eq 0 ]] + + # Node label must be removed + local label + label=$(kubectl get nodes -o jsonpath='{.items[0].metadata.labels.katacontainers\.io/kata-runtime}' 2>/dev/null || echo "") + echo "# Node label after uninstall: '${label}'" >&3 + [[ -z "${label}" ]] + + # Kata artifacts must be removed from the host filesystem + echo "# Checking host filesystem for leftover artifacts..." >&3 + run run_on_host "test -d /host/opt/kata && echo EXISTS || echo REMOVED" + echo "# /opt/kata: ${output}" >&3 + [[ "${output}" == *"REMOVED"* ]] + + # Containerd must still be healthy and reporting a valid version + local container_runtime_version + container_runtime_version=$(kubectl get nodes --no-headers -o custom-columns=CONTAINER_RUNTIME:.status.nodeInfo.containerRuntimeVersion) + echo "# Container runtime version: ${container_runtime_version}" >&3 + [[ "${container_runtime_version}" != *"Unknown"* ]] + + echo "# SUCCESS: All kata artifacts cleaned up, containerd healthy" >&3 +} + +teardown() { + if [[ "${BATS_TEST_NAME}" == *"restart"* ]]; then + kubectl delete pod "${LIFECYCLE_POD_NAME}" --ignore-not-found=true --wait=false 2>/dev/null || true + fi +} + +teardown_file() { + kubectl delete pod "${LIFECYCLE_POD_NAME}" --ignore-not-found=true --wait=false 2>/dev/null || true + uninstall_kata 2>/dev/null || true +} diff --git a/tests/functional/kata-deploy/run-kata-deploy-tests.sh b/tests/functional/kata-deploy/run-kata-deploy-tests.sh index 33beb14c4f..eff06efaa2 100644 --- a/tests/functional/kata-deploy/run-kata-deploy-tests.sh +++ b/tests/functional/kata-deploy/run-kata-deploy-tests.sh @@ -20,6 +20,7 @@ else KATA_DEPLOY_TEST_UNION=( \ "kata-deploy.bats" \ "kata-deploy-custom-runtimes.bats" \ + "kata-deploy-lifecycle.bats" \ ) fi diff --git a/tests/gha-run-k8s-common.sh b/tests/gha-run-k8s-common.sh index d1cc1343d9..4f132601f6 100644 --- a/tests/gha-run-k8s-common.sh +++ b/tests/gha-run-k8s-common.sh @@ -296,36 +296,6 @@ function deploy_k0s() { sudo chown "${USER}":"${USER}" ~/.kube/config } -# If the rendered containerd config (v3) does not import the drop-in dir, write -# the full V3 template (from tests/containerd-config-v3.tmpl) with the given -# import path and restart the service. -# Args: containerd_dir (e.g. /var/lib/rancher/k3s/agent/etc/containerd), service_name (e.g. k3s or rke2-server). -function _setup_containerd_v3_template_if_needed() { - local containerd_dir="$1" - local service_name="$2" - local template_file="${tests_dir}/containerd-config-v3.tmpl" - local rendered_v3="${containerd_dir}/config-v3.toml" - local imports_path="${containerd_dir}/config-v3.toml.d/*.toml" - if sudo test -f "${rendered_v3}" && sudo grep -q 'config-v3\.toml\.d' "${rendered_v3}" 2>/dev/null; then - return 0 - fi - if [[ ! -f "${template_file}" ]]; then - echo "Template not found: ${template_file}" >&2 - return 1 - fi - sudo mkdir -p "${containerd_dir}/config-v3.toml.d" - sed "s|__CONTAINERD_IMPORTS_PATH__|${imports_path}|g" "${template_file}" | sudo tee "${containerd_dir}/config-v3.toml.tmpl" > /dev/null - sudo systemctl restart "${service_name}" -} - -function setup_k3s_containerd_v3_template_if_needed() { - _setup_containerd_v3_template_if_needed "/var/lib/rancher/k3s/agent/etc/containerd" "k3s" -} - -function setup_rke2_containerd_v3_template_if_needed() { - _setup_containerd_v3_template_if_needed "/var/lib/rancher/rke2/agent/etc/containerd" "rke2-server" -} - function deploy_k3s() { # Set CRI runtime-request-timeout to 600s (same as kubeadm) for CoCo and long-running create requests. curl -sfL https://get.k3s.io | sh -s - --write-kubeconfig-mode 644 --kubelet-arg runtime-request-timeout=600s @@ -333,9 +303,6 @@ function deploy_k3s() { # This is an arbitrary value that came up from local tests sleep 120s - # If rendered config does not import the drop-in dir, write full V3 template so kata-deploy can use it. - setup_k3s_containerd_v3_template_if_needed - # Download the kubectl binary into /usr/bin and remove /usr/local/bin/kubectl # # We need to do this to avoid hitting issues like: @@ -405,9 +372,6 @@ function deploy_rke2() { # This is an arbitrary value that came up from local tests sleep 120s - # If rendered config does not import the drop-in dir, write full V3 template so kata-deploy can use it. - setup_rke2_containerd_v3_template_if_needed - # Link the kubectl binary into /usr/bin sudo ln -sf /var/lib/rancher/rke2/bin/kubectl /usr/local/bin/kubectl diff --git a/tools/packaging/kata-deploy/binary/src/config.rs b/tools/packaging/kata-deploy/binary/src/config.rs index a7d1c5f81e..1909561ef8 100644 --- a/tools/packaging/kata-deploy/binary/src/config.rs +++ b/tools/packaging/kata-deploy/binary/src/config.rs @@ -155,6 +155,7 @@ pub struct Config { pub containerd_conf_file: String, pub containerd_conf_file_backup: String, pub containerd_drop_in_conf_file: String, + pub daemonset_name: String, pub custom_runtimes_enabled: bool, pub custom_runtimes: Vec, } @@ -169,6 +170,12 @@ impl Config { return Err(anyhow::anyhow!("NODE_NAME must not be empty")); } + let daemonset_name = env::var("DAEMONSET_NAME") + .ok() + .map(|v| v.trim().to_string()) + .filter(|v| !v.is_empty()) + .unwrap_or_else(|| "kata-deploy".to_string()); + let debug = env::var("DEBUG").unwrap_or_else(|_| "false".to_string()) == "true"; // Parse shims - only use arch-specific variable @@ -293,6 +300,7 @@ impl Config { containerd_conf_file, containerd_conf_file_backup, containerd_drop_in_conf_file, + daemonset_name, custom_runtimes_enabled, custom_runtimes, }; diff --git a/tools/packaging/kata-deploy/binary/src/k8s/client.rs b/tools/packaging/kata-deploy/binary/src/k8s/client.rs index e0d2b65811..5415ceffa2 100644 --- a/tools/packaging/kata-deploy/binary/src/k8s/client.rs +++ b/tools/packaging/kata-deploy/binary/src/k8s/client.rs @@ -94,30 +94,41 @@ impl K8sClient { Ok(()) } - pub async fn count_kata_deploy_daemonsets(&self) -> Result { + /// Returns whether a non-terminating DaemonSet with this exact name + /// exists in the current namespace. Used to decide whether this pod is + /// being restarted (true) or uninstalled (false). + pub async fn own_daemonset_exists(&self, daemonset_name: &str) -> Result { + use k8s_openapi::api::apps::v1::DaemonSet; + use kube::api::Api; + + let ds_api: Api = Api::default_namespaced(self.client.clone()); + match ds_api.get_opt(daemonset_name).await? { + Some(ds) => Ok(ds.metadata.deletion_timestamp.is_none()), + None => Ok(false), + } + } + + /// Returns how many non-terminating DaemonSets across all namespaces + /// have a name containing "kata-deploy". Used to decide whether shared + /// node-level resources (node label, CRI restart) should be cleaned up: + /// they are only safe to remove when no kata-deploy instance remains + /// on the cluster. + pub async fn count_any_kata_deploy_daemonsets(&self) -> Result { use k8s_openapi::api::apps::v1::DaemonSet; use kube::api::{Api, ListParams}; - let ds_api: Api = Api::default_namespaced(self.client.clone()); - let lp = ListParams::default(); - let daemonsets = ds_api.list(&lp).await?; + let ds_api: Api = Api::all(self.client.clone()); + let daemonsets = ds_api.list(&ListParams::default()).await?; - // Note: We use client-side filtering here because Kubernetes field selectors - // don't support "contains" operations - they only support exact matches and comparisons. - // Filtering by name containing "kata-deploy" requires client-side processing. - // Exclude DaemonSets that are terminating (have deletion_timestamp) so that when our - // DaemonSet pod runs cleanup on SIGTERM during uninstall, we count 0 and remove the label. let count = daemonsets .iter() .filter(|ds| { - if ds.metadata.deletion_timestamp.is_some() { - return false; - } - ds.metadata - .name - .as_ref() - .map(|n| n.contains("kata-deploy")) - .unwrap_or(false) + ds.metadata.deletion_timestamp.is_none() + && ds + .metadata + .name + .as_ref() + .is_some_and(|n| n.contains("kata-deploy")) }) .count(); @@ -584,9 +595,14 @@ pub async fn label_node( client.label_node(label_key, label_value, overwrite).await } -pub async fn count_kata_deploy_daemonsets(config: &Config) -> Result { +pub async fn own_daemonset_exists(config: &Config) -> Result { let client = K8sClient::new(&config.node_name).await?; - client.count_kata_deploy_daemonsets().await + client.own_daemonset_exists(&config.daemonset_name).await +} + +pub async fn count_any_kata_deploy_daemonsets(config: &Config) -> Result { + let client = K8sClient::new(&config.node_name).await?; + client.count_any_kata_deploy_daemonsets().await } pub async fn crd_exists(config: &Config, crd_name: &str) -> Result { diff --git a/tools/packaging/kata-deploy/binary/src/main.rs b/tools/packaging/kata-deploy/binary/src/main.rs index 14c31c6bd0..e439074585 100644 --- a/tools/packaging/kata-deploy/binary/src/main.rs +++ b/tools/packaging/kata-deploy/binary/src/main.rs @@ -236,19 +236,29 @@ async fn install(config: &config::Config, runtime: &str) -> Result<()> { async fn cleanup(config: &config::Config, runtime: &str) -> Result<()> { info!("Cleaning up Kata Containers"); - info!("Counting kata-deploy daemonsets"); - let kata_deploy_installations = k8s::count_kata_deploy_daemonsets(config).await?; + // Step 1: Check if THIS pod's owning DaemonSet still exists. + // If it does, this is a pod restart (rolling update, label change, etc.), + // not an uninstall — skip everything so running kata pods are not disrupted. info!( - "Found {} kata-deploy daemonset(s)", - kata_deploy_installations + "Checking if DaemonSet '{}' still exists", + config.daemonset_name ); - - if kata_deploy_installations == 0 { - info!("Removing kata-runtime label from node"); - k8s::label_node(config, "katacontainers.io/kata-runtime", None, false).await?; - info!("Successfully removed kata-runtime label"); + if k8s::own_daemonset_exists(config).await? { + info!( + "DaemonSet '{}' still exists, \ + skipping all cleanup to avoid disrupting running kata pods", + config.daemonset_name + ); + return Ok(()); } + // Step 2: Our DaemonSet is gone (uninstall). Perform instance-specific + // cleanup: snapshotters, CRI config, and artifacts for this instance. + info!( + "DaemonSet '{}' not found, proceeding with instance cleanup", + config.daemonset_name + ); + match config.experimental_setup_snapshotter.as_ref() { Some(snapshotters) => { for snapshotter in snapshotters { @@ -270,6 +280,25 @@ async fn cleanup(config: &config::Config, runtime: &str) -> Result<()> { artifacts::remove_artifacts(config).await?; info!("Successfully removed kata artifacts"); + // Step 3: Check if ANY other kata-deploy DaemonSets still exist. + // Shared resources (node label, CRI restart) are only safe to touch + // when no other kata-deploy instance remains. + let other_ds_count = k8s::count_any_kata_deploy_daemonsets(config).await?; + if other_ds_count > 0 { + info!( + "{} other kata-deploy DaemonSet(s) still exist, \ + skipping node label removal and CRI restart", + other_ds_count + ); + return Ok(()); + } + + info!("No other kata-deploy DaemonSets found, performing full shared cleanup"); + + info!("Removing kata-runtime label from node"); + k8s::label_node(config, "katacontainers.io/kata-runtime", None, false).await?; + info!("Successfully removed kata-runtime label"); + // Restart the CRI runtime last. On k3s/rke2 this restarts the entire // server process, which kills this (terminating) pod. By doing it after // all other cleanup, we ensure config and artifacts are already gone. diff --git a/tools/packaging/kata-deploy/binary/src/runtime/manager.rs b/tools/packaging/kata-deploy/binary/src/runtime/manager.rs index fd674427af..ec82059e2d 100644 --- a/tools/packaging/kata-deploy/binary/src/runtime/manager.rs +++ b/tools/packaging/kata-deploy/binary/src/runtime/manager.rs @@ -51,18 +51,19 @@ pub async fn get_container_runtime(config: &Config) -> Result { return Ok("crio".to_string()); } - if runtime_version.contains("containerd") && runtime_version.contains("-k3s") { - // Check systemd services (ignore errors - service might not exist) - let _ = utils::host_systemctl(&["is-active", "--quiet", "rke2-agent"]); - if utils::host_systemctl(&["is-active", "--quiet", "rke2-agent"]).is_ok() { - return Ok("rke2-agent".to_string()); - } - if utils::host_systemctl(&["is-active", "--quiet", "rke2-server"]).is_ok() { - return Ok("rke2-server".to_string()); - } - if utils::host_systemctl(&["is-active", "--quiet", "k3s-agent"]).is_ok() { - return Ok("k3s-agent".to_string()); - } + // Detect k3s/rke2 via systemd services rather than the containerd version + // string, which no longer reliably contains "k3s" in newer releases + // (e.g. "containerd://2.2.2-bd1.34"). + if utils::host_systemctl(&["is-active", "--quiet", "rke2-agent"]).is_ok() { + return Ok("rke2-agent".to_string()); + } + if utils::host_systemctl(&["is-active", "--quiet", "rke2-server"]).is_ok() { + return Ok("rke2-server".to_string()); + } + if utils::host_systemctl(&["is-active", "--quiet", "k3s-agent"]).is_ok() { + return Ok("k3s-agent".to_string()); + } + if utils::host_systemctl(&["is-active", "--quiet", "k3s"]).is_ok() { return Ok("k3s".to_string()); } @@ -83,7 +84,7 @@ pub async fn get_container_runtime(config: &Config) -> Result { Ok(runtime) } -/// Returns true if containerRuntimeVersion (e.g. "containerd://2.1.5-k3s1") indicates +/// Returns true if containerRuntimeVersion (e.g. "containerd://2.1.5-k3s1", "containerd://2.2.2-bd1.34") indicates /// containerd 2.x or newer, false for 1.x or unparseable. Used for drop-in support /// and for K3s/RKE2 template selection (config-v3.toml.tmpl vs config.toml.tmpl). pub fn containerd_version_is_2_or_newer(runtime_version: &str) -> bool { @@ -191,6 +192,7 @@ mod tests { #[case("containerd://2.0.0", true)] #[case("containerd://2.1.5", true)] #[case("containerd://2.1.5-k3s1", true)] + #[case("containerd://2.2.2-bd1.34", true)] #[case("containerd://2.2.0", true)] #[case("containerd://2.3.1", true)] #[case("containerd://2.0.0-rc.1", true)] diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml index 7e2cc55f36..21d2622f47 100644 --- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml +++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml @@ -143,6 +143,13 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName +{{- if .Values.env.multiInstallSuffix }} + - name: DAEMONSET_NAME + value: {{ printf "%s-%s" .Chart.Name .Values.env.multiInstallSuffix | quote }} +{{- else }} + - name: DAEMONSET_NAME + value: {{ .Chart.Name | quote }} +{{- end }} - name: DEBUG value: {{ include "kata-deploy.getDebug" . | quote }} {{- $shimsAmd64 := include "kata-deploy.getEnabledShimsForArch" (dict "root" . "arch" "amd64") | trim -}}