diff --git a/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs b/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs index 41454df863..afe4cc1bdf 100644 --- a/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs +++ b/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs @@ -8,7 +8,7 @@ use crate::runtime::containerd; use crate::utils; use crate::utils::toml as toml_utils; use anyhow::Result; -use log::info; +use log::{info, warn}; use std::fs; use std::path::Path; @@ -135,6 +135,225 @@ pub async fn configure_snapshotter( Ok(()) } +/// Clean up all nydus-related entries from containerd's metadata store across all namespaces. +/// +/// ## Why this must run before stopping the nydus service +/// +/// `ctr snapshots rm` goes through containerd's metadata layer which calls the nydus gRPC +/// backend to physically remove the snapshot. If the service is stopped first, the backend +/// call fails and the BoltDB record is left behind as a stale entry. +/// +/// Stale snapshot records in BoltDB cause subsequent image pulls to fail with: +/// "unable to prepare extraction snapshot: target snapshot sha256:...: already exists" +/// +/// The failure path: containerd's metadata `Prepare` finds the target chainID in BoltDB and +/// returns AlreadyExists without calling the backend. The unpacker then calls `Stat`, which +/// finds the BoltDB record, but delegates to the backend which returns NotFound (nydus data +/// wiped). The unpacker treats this as a transient race and retries 3 times; all 3 fail the +/// same way, and the pull is aborted. +/// +/// ## What we clean +/// +/// The containerd BoltDB schema has these nydus-relevant buckets per namespace: +/// - `snapshots/nydus/*` — 100% nydus-specific; MUST be cleaned (triggers the pull bug) +/// - `containers/*` — records carry `snapshotter=nydus` + `snapshotKey`; after +/// removing the snapshots these become dangling references. 
+/// In a normal CI run they are already gone, but an aborted run +/// can leave orphaned container records that confuse reconciliation. +/// - `images/*` — snapshotter-agnostic (just manifest digest + labels); leave +/// them so the next pull can skip re-downloading content. +/// - `content/blob/*` — shared across all snapshotters; must NOT be removed. +/// - `leases/*`, `ingests/*` — temporary; expire and are GC'd automatically. +/// +/// Note: containerd's garbage collector will NOT remove stale snapshots for us, because the +/// image record (a GC root) still references the content blobs which reference the snapshots +/// via gc.ref labels, keeping the entire chain alive in the GC graph. +/// +/// ## Snapshot removal ordering +/// +/// Snapshots have parent→child relationships; a parent cannot be removed while children +/// exist. The retry loop removes whatever it can each round (leaves first), then retries +/// until nothing remains or no progress is made. +/// +/// ## Return value +/// +/// Returns `true` only if ALL snapshots were removed from ALL namespaces. A `false` return +/// means at least one snapshot could not be removed — almost certainly because a workload is +/// still actively using it. Callers MUST NOT wipe the nydus data directory in that case: +/// doing so would corrupt running containers whose rootfs mounts still depend on that data. +fn cleanup_containerd_nydus_snapshots(containerd_snapshotter: &str) -> bool { + info!( + "Cleaning up nydus entries from containerd metadata (snapshotter: '{containerd_snapshotter}')" + ); + + // Discover all containerd namespaces so every namespace is cleaned, not just k8s.io. 
+    let namespaces = match utils::host_exec(&["ctr", "namespaces", "ls", "-q"]) {
+        Ok(out) => out
+            .lines()
+            .map(str::trim)
+            .filter(|s| !s.is_empty())
+            .map(String::from)
+            .collect::<Vec<String>>(),
+        Err(e) => {
+            info!("Could not list containerd namespaces ({e}), defaulting to k8s.io");
+            vec!["k8s.io".to_string()]
+        }
+    };
+
+    let mut all_clean = true;
+    for namespace in &namespaces {
+        cleanup_nydus_containers(namespace, containerd_snapshotter);
+        if !cleanup_nydus_snapshots(namespace, containerd_snapshotter) {
+            all_clean = false;
+        }
+    }
+    all_clean
+}
+
+/// Remove all container records in this namespace whose snapshotter is the nydus instance.
+///
+/// Container records carry `snapshotter` and `snapshotKey` fields. After the nydus snapshots
+/// are removed these records become dangling references. They do not cause pull failures but
+/// can confuse container reconciliation if a previous CI run was aborted mid-test.
+fn cleanup_nydus_containers(namespace: &str, containerd_snapshotter: &str) {
+    // `ctr containers ls` output: ID IMAGE RUNTIME
+    // We need to cross-reference with `ctr containers info <id>` to filter by snapshotter,
+    // but that's expensive. Instead we rely on the fact that in the k8s.io namespace every
+    // container using nydus will have been created by a pod that references it — we can
+    // safely remove all containers whose snapshot key resolves to a nydus snapshot (i.e. any
+    // container whose snapshotter field equals our snapshotter name). Since `ctr` does not
+    // provide a direct --filter for snapshotter on the containers subcommand, we list all
+    // container IDs, then inspect each one and remove those using the nydus snapshotter.
+    let output = match utils::host_exec(&["ctr", "-n", namespace, "containers", "ls", "-q"]) {
+        Ok(out) => out,
+        Err(e) => {
+            info!("Namespace {namespace}: cannot list containers ({e}), skipping container cleanup");
+            return;
+        }
+    };
+
+    let ids: Vec<String> = output
+        .lines()
+        .map(str::trim)
+        .filter(|s| !s.is_empty())
+        .map(String::from)
+        .collect();
+
+    if ids.is_empty() {
+        return;
+    }
+
+    for id in &ids {
+        // Inspect to check the snapshotter field; output is JSON.
+        let info = match utils::host_exec(&["ctr", "-n", namespace, "containers", "info", id]) {
+            Ok(out) => out,
+            Err(_) => continue,
+        };
+
+        // Simple string search — avoids pulling in a JSON parser.
+        // The field appears as `"Snapshotter": "nydus"` in the info output.
+        let snapshotter_pattern = format!("\"Snapshotter\": \"{containerd_snapshotter}\"");
+        if !info.contains(&snapshotter_pattern) {
+            continue;
+        }
+
+        match utils::host_exec(&["ctr", "-n", namespace, "containers", "rm", id]) {
+            Ok(_) => info!("Namespace {namespace}: removed nydus container '{id}'"),
+            Err(e) => info!("Namespace {namespace}: could not remove container '{id}': {e}"),
+        }
+    }
+}
+
+/// Remove all snapshot records for the nydus snapshotter from this namespace, with retries.
+///
+/// Snapshot chains are linear (each layer is one snapshot parented on the previous), so an
+/// image with N layers requires exactly N rounds — one leaf removal per round. There is no
+/// fixed round limit: the loop terminates naturally once the list is empty (all removed) or
+/// makes zero progress (all remaining snapshots are actively mounted by running containers).
+///
+/// Returns `true` if all snapshots were removed, `false` if any remain (active workloads).
+fn cleanup_nydus_snapshots(namespace: &str, containerd_snapshotter: &str) -> bool {
+    let mut round: u32 = 0;
+    loop {
+        round += 1;
+
+        // List all snapshots managed by this snapshotter in this namespace.
+        let output = match utils::host_exec(&[
+            "ctr",
+            "-n",
+            namespace,
+            "snapshots",
+            "--snapshotter",
+            containerd_snapshotter,
+            "list",
+        ]) {
+            Ok(out) => out,
+            Err(e) => {
+                info!("Namespace {namespace}: cannot list snapshots ({e}), skipping namespace");
+                return true; // treat as clean: ctr unavailable, nothing we can do
+            }
+        };
+
+        // Skip the header line; first whitespace-delimited token is the snapshot key.
+        let keys: Vec<String> = output
+            .lines()
+            .skip(1)
+            .filter_map(|l| {
+                let k = l.split_whitespace().next()?;
+                if k.is_empty() {
+                    None
+                } else {
+                    Some(k.to_string())
+                }
+            })
+            .collect();
+
+        if keys.is_empty() {
+            info!("Namespace {namespace}: no nydus snapshots remaining in containerd metadata");
+            return true;
+        }
+
+        info!(
+            "Namespace {namespace}: round {round}: removing {} snapshot(s)",
+            keys.len()
+        );
+
+        let mut any_removed = false;
+        for key in &keys {
+            match utils::host_exec(&[
+                "ctr",
+                "-n",
+                namespace,
+                "snapshots",
+                "--snapshotter",
+                containerd_snapshotter,
+                "rm",
+                key,
+            ]) {
+                Ok(_) => {
+                    any_removed = true;
+                }
+                Err(e) => {
+                    // A snapshot cannot be removed when its children still exist.
+                    // This is expected: we remove leaves first, parents in later rounds.
+                    info!("Namespace {namespace}: could not remove snapshot '{key}': {e}");
+                }
+            }
+        }
+
+        if !any_removed {
+            // No progress this round: all remaining snapshots are actively mounted.
+            // Proceeding to wipe the data directory would corrupt running containers.
+ warn!( + "Namespace {namespace}: {} snapshot(s) remain after round {round} and none \ + could be removed — active workloads are still using them", + keys.len() + ); + return false; + } + } +} + pub async fn install_nydus_snapshotter(config: &Config) -> Result<()> { info!("Deploying nydus-snapshotter"); @@ -143,20 +362,39 @@ pub async fn install_nydus_snapshotter(config: &Config) -> Result<()> { _ => "nydus-snapshotter".to_string(), }; - // Clean up existing nydus-snapshotter state to ensure fresh start with new version. - // This is safe across all K8s distributions (k3s, rke2, k0s, microk8s, etc.) because - // we only touch the nydus data directory, not containerd's internals. - // When containerd tries to use non-existent snapshots, it will re-pull/re-unpack. - let nydus_data_dir = format!("/host/var/lib/{nydus_snapshotter}"); - info!("Cleaning up existing nydus-snapshotter state at {}", nydus_data_dir); + // The containerd proxy_plugins key for this nydus instance. + let containerd_snapshotter = match config.multi_install_suffix.as_ref() { + Some(suffix) if !suffix.is_empty() => format!("nydus-{suffix}"), + _ => "nydus".to_string(), + }; - // Stop the service first if it exists (ignore errors if not running) + // Clean up existing nydus-snapshotter state to ensure fresh start with new version. + // + // IMPORTANT: containerd metadata cleanup MUST happen before stopping the nydus service. + // `ctr snapshots rm` goes through containerd's metadata layer which calls the nydus + // gRPC backend to physically remove the snapshot. If the service is stopped first, the + // backend call fails, leaving stale BoltDB records that cause subsequent image pulls to + // fail with "target snapshot sha256:...: already exists" (see cleanup_containerd_nydus_snapshots). + // + // If cleanup returns false, active workloads are still using nydus snapshots. 
Wiping + // the data directory in that state would corrupt running containers, so we skip it and + // let the new nydus instance start on top of the existing backend state. + let all_clean = cleanup_containerd_nydus_snapshots(&containerd_snapshotter); + + // Stop the service now that the metadata has been cleaned. let _ = utils::host_systemctl(&["stop", &format!("{nydus_snapshotter}.service")]); - // Remove the data directory to clean up old snapshots with potentially incorrect labels - if Path::new(&nydus_data_dir).exists() { + // Only wipe the data directory when the metadata cleanup was complete. If snapshots + // remain (active workloads), preserve the backend so those containers are not broken. + let nydus_data_dir = format!("/host/var/lib/{nydus_snapshotter}"); + if all_clean && Path::new(&nydus_data_dir).exists() { info!("Removing nydus data directory: {}", nydus_data_dir); fs::remove_dir_all(&nydus_data_dir).ok(); + } else if !all_clean { + info!( + "Skipping removal of nydus data directory (active workloads present): {}", + nydus_data_dir + ); } let config_guest_pulling = "/opt/kata-artifacts/nydus-snapshotter/config-guest-pulling.toml"; @@ -267,6 +505,17 @@ pub async fn uninstall_nydus_snapshotter(config: &Config) -> Result<()> { _ => "nydus-snapshotter".to_string(), }; + let containerd_snapshotter = match config.multi_install_suffix.as_ref() { + Some(suffix) if !suffix.is_empty() => format!("nydus-{suffix}"), + _ => "nydus".to_string(), + }; + + // Clean up containerd metadata BEFORE disabling (and thus stopping) the service. + // See install_nydus_snapshotter for the full explanation of why ordering matters. + // If active workloads prevent a full cleanup, skip the data directory removal so + // running containers are not broken. 
+ let all_clean = cleanup_containerd_nydus_snapshots(&containerd_snapshotter); + utils::host_systemctl(&["disable", "--now", &format!("{nydus_snapshotter}.service")])?; fs::remove_file(format!( @@ -275,6 +524,16 @@ pub async fn uninstall_nydus_snapshotter(config: &Config) -> Result<()> { .ok(); fs::remove_dir_all(format!("{}/nydus-snapshotter", config.host_install_dir)).ok(); + let nydus_data_dir = format!("/host/var/lib/{nydus_snapshotter}"); + if all_clean && Path::new(&nydus_data_dir).exists() { + fs::remove_dir_all(&nydus_data_dir).ok(); + } else if !all_clean { + info!( + "Skipping removal of nydus data directory (active workloads present): {}", + nydus_data_dir + ); + } + utils::host_systemctl(&["daemon-reload"])?; Ok(())