From 225ff2209ed49353311d35da4b608936b1b96974 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 3 Jun 2026 22:06:16 +0200 Subject: [PATCH] kata-deploy: split install/cleanup into staged actions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1 of migrating kata-deploy from a DaemonSet to a staged JobSet workflow: refactor the binary's install/cleanup flows into discrete, independently invocable stages while keeping the existing DaemonSet path fully working. Add new staged subcommands that each run one step and exit, so a JobSet can drive them as ordered initContainers/Jobs per node: install: host-check -> artifacts -> cri -> label cleanup (reverse): unlabel -> revert-cri -> remove-artifacts `install` becomes a compatibility wrapper composing the install stages in the canonical order, so the DaemonSet deployment model is unchanged. The DaemonSet `cleanup` (with its DaemonSet-presence gating) is left intact; the staged cleanup actions are added alongside it and skip that gating since the JobSet workflow only schedules them on a real uninstall. Each stage has an idempotent skip check so reruns are safe: - install label / cleanup unlabel: short-circuit via the node label - cleanup remove-artifacts: skip when the install dir is already gone - cleanup revert-cri: skip the disruptive runtime restart when the CRI drop-ins are already absent (new cri_drop_in_present helper) Introduce a shared KATA_RUNTIME_LABEL constant and add rstest-based tests covering the subcommand-name -> Action mapping, rejection of unknown actions, and the visible/hidden help semantics. 
Signed-off-by: Fabiano Fidêncio Assisted-by: Cursor --- .../packaging/kata-deploy/binary/src/main.rs | 365 +++++++++++++++++- 1 file changed, 348 insertions(+), 17 deletions(-) diff --git a/tools/packaging/kata-deploy/binary/src/main.rs b/tools/packaging/kata-deploy/binary/src/main.rs index 6691fed0e9..d015fa05e1 100644 --- a/tools/packaging/kata-deploy/binary/src/main.rs +++ b/tools/packaging/kata-deploy/binary/src/main.rs @@ -56,6 +56,39 @@ enum Action { Install, Cleanup, Reset, + /// Stage 0 of a staged (JobSet) install: validate host/node prerequisites + /// without mutating the host. Fails fast with actionable diagnostics when + /// the node cannot support installation. + #[clap(name = "install-stage-host-check")] + InstallStageHostCheck, + /// Stage 1 of a staged (JobSet) install: install kata artifacts/config on + /// the host and set up configured snapshotters. Does not touch CRI + /// configuration, but is still privileged (host writes + snapshotter setup + /// shell into the host via nsenter). + #[clap(name = "install-stage-artifacts")] + InstallStageArtifacts, + /// Stage 2 of a staged (JobSet) install: write CRI drop-ins, restart the + /// runtime, and wait for node readiness. Privileged + short-lived. + #[clap(name = "install-stage-cri")] + InstallStageCri, + /// Stage 3 of a staged (JobSet) install: apply the kata-runtime node label. + /// Unprivileged, Kubernetes API only. + #[clap(name = "install-stage-label")] + InstallStageLabel, + /// Cleanup stage 1 of a staged (JobSet) uninstall: remove the kata-runtime + /// node label first so the scheduler stops placing kata workloads here. + /// Unprivileged, Kubernetes API only. + #[clap(name = "cleanup-stage-unlabel")] + CleanupStageUnlabel, + /// Cleanup stage 2 of a staged (JobSet) uninstall: remove CRI drop-ins, + /// restart the runtime, and wait for readiness. Privileged + short-lived.
+ #[clap(name = "cleanup-stage-revert-cri")] + CleanupStageRevertCri, + /// Cleanup stage 3 of a staged (JobSet) uninstall: remove kata + /// artifacts/config/symlinks from the host. Privileged (mutates the host + /// filesystem under the install dir). + #[clap(name = "cleanup-stage-remove-artifacts")] + CleanupStageRemoveArtifacts, /// Internal: entered via re-exec after install completes. Holds the /// DaemonSet pod alive waiting for SIGTERM, then runs cleanup. Hidden /// from `--help`; users should never invoke this directly. @@ -63,6 +96,10 @@ enum Action { InternalPostInstallWait, } +/// Node label applied to mark a node as kata-capable. Shared across the +/// install/cleanup label stages so the key stays consistent. +const KATA_RUNTIME_LABEL: &str = "katacontainers.io/kata-runtime"; + // Cap the tokio runtime to a small fixed number of worker threads. The default // multi-thread runtime allocates `num_cpus()` workers (each with a ~2 MiB // stack), which on a 200+ vCPU GPU node is the dominant contributor to the @@ -107,6 +144,13 @@ async fn main() -> Result<()> { Action::Install => "install", Action::Cleanup => "cleanup", Action::Reset => "reset", + Action::InstallStageHostCheck => "install-stage-host-check", + Action::InstallStageArtifacts => "install-stage-artifacts", + Action::InstallStageCri => "install-stage-cri", + Action::InstallStageLabel => "install-stage-label", + Action::CleanupStageUnlabel => "cleanup-stage-unlabel", + Action::CleanupStageRevertCri => "cleanup-stage-revert-cri", + Action::CleanupStageRemoveArtifacts => "cleanup-stage-remove-artifacts", Action::InternalPostInstallWait => "internal-post-install-wait", }; config.print_info(action_str); @@ -245,6 +289,42 @@ async fn main() -> Result<()> { // Exit after completion so the job can complete info!("Reset completed, exiting"); } + // Staged (JobSet) install actions. Each runs one step of the install + // pipeline as a short-lived Job/initContainer and exits. 
The DaemonSet + // path does not use these directly; it goes through `install` above, + // which composes the same stage functions. + Action::InstallStageHostCheck => { + install_stage_host_check(&config, &runtime).await?; + info!("Install host-check stage completed, exiting"); + } + Action::InstallStageArtifacts => { + install_stage_artifacts(&config, &runtime).await?; + info!("Install artifacts stage completed, exiting"); + } + Action::InstallStageCri => { + install_stage_cri(&config, &runtime).await?; + info!("Install CRI stage completed, exiting"); + } + Action::InstallStageLabel => { + install_stage_label(&config).await?; + info!("Install label stage completed, exiting"); + } + // Staged (JobSet) cleanup actions. These run in reverse order + // (unlabel -> revert-cri -> remove-artifacts) and, unlike the DaemonSet + // `cleanup` above, do not perform DaemonSet-presence gating: the JobSet + // workflow only schedules these when an uninstall is actually intended. + Action::CleanupStageUnlabel => { + cleanup_stage_unlabel(&config).await?; + info!("Cleanup unlabel stage completed, exiting"); + } + Action::CleanupStageRevertCri => { + cleanup_stage_revert_cri(&config, &runtime).await?; + info!("Cleanup revert-cri stage completed, exiting"); + } + Action::CleanupStageRemoveArtifacts => { + cleanup_stage_remove_artifacts(&config).await?; + info!("Cleanup remove-artifacts stage completed, exiting"); + } } Ok(()) @@ -273,20 +353,39 @@ fn reexec_into_post_install_wait( )) } +/// Full install pipeline. Used by the DaemonSet deployment model. Composes the +/// same per-stage functions the staged JobSet workflow invokes individually, in +/// the canonical order: host-check -> artifacts -> cri -> label. 
async fn install(config: &config::Config, runtime: &str) -> Result<()> { info!("Installing Kata Containers"); - const SUPPORTED_RUNTIMES: &[&str] = &[ - "crio", - "containerd", - "k3s", - "k3s-agent", - "rke2-agent", - "rke2-server", - "k0s-worker", - "k0s-controller", - "microk8s", - ]; + install_stage_host_check(config, runtime).await?; + install_stage_artifacts(config, runtime).await?; + install_stage_cri(config, runtime).await?; + install_stage_label(config).await?; + + info!("Kata Containers installation completed successfully"); + Ok(()) +} + +const SUPPORTED_RUNTIMES: &[&str] = &[ + "crio", + "containerd", + "k3s", + "k3s-agent", + "rke2-agent", + "rke2-server", + "k0s-worker", + "k0s-controller", + "microk8s", +]; + +/// Install stage 0 (host-check): validate that this node can support a Kata +/// installation before any host mutation happens. This is read-only and safe +/// to run repeatedly; it fails fast with actionable diagnostics so a staged +/// JobSet can abort the per-node pipeline before the privileged stages run. +async fn install_stage_host_check(config: &config::Config, runtime: &str) -> Result<()> { + info!("install (host-check): validating node prerequisites for runtime {runtime}"); if !SUPPORTED_RUNTIMES.contains(&runtime) { return Err(anyhow::anyhow!( @@ -345,16 +444,44 @@ async fn install(config: &config::Config, runtime: &str) -> Result<()> { } } - runtime::containerd::setup_containerd_config_files(runtime, config).await?; + info!("install (host-check): node prerequisites satisfied"); + Ok(()) +} + +/// Install stage 1 (artifacts): place kata artifacts/config on the host and set +/// up any configured snapshotters. This does not touch CRI configuration, but it +/// still needs privileged host access: writing under the host install dir and +/// the snapshotter setup (e.g. nydus) shell into the host via nsenter. 
+async fn install_stage_artifacts(config: &config::Config, runtime: &str) -> Result<()> { + info!("install (artifacts): installing kata artifacts on host"); artifacts::install_artifacts(config, runtime).await?; + if runtime != "crio" { + if let Some(snapshotters) = config.experimental_setup_snapshotter.as_ref() { + for snapshotter in snapshotters { + artifacts::snapshotters::install_snapshotter(snapshotter, config).await?; + } + } + } + + info!("install (artifacts): artifacts installed"); + Ok(()) +} + +/// Install stage 2 (cri): write CRI drop-ins, configure snapshotters, restart +/// the runtime, and wait for the node to become ready. This is the privileged, +/// node-disrupting stage and is kept short-lived. +async fn install_stage_cri(config: &config::Config, runtime: &str) -> Result<()> { + info!("install (cri): configuring CRI runtime"); + + runtime::containerd::setup_containerd_config_files(runtime, config).await?; + runtime::configure_cri_runtime(config, runtime).await?; if runtime != "crio" { if let Some(snapshotters) = config.experimental_setup_snapshotter.as_ref() { for snapshotter in snapshotters { - artifacts::snapshotters::install_snapshotter(snapshotter, config).await?; artifacts::snapshotters::configure_snapshotter(snapshotter, runtime, config) .await?; } @@ -365,9 +492,29 @@ async fn install(config: &config::Config, runtime: &str) -> Result<()> { runtime::lifecycle::restart_runtime(config, runtime).await?; info!("Runtime restart completed successfully"); - label_node_with_retry(config, "katacontainers.io/kata-runtime", "true").await?; + Ok(()) +} + +/// Install stage 3 (label): apply the kata-runtime node label. Unprivileged, +/// Kubernetes API only. Skips re-applying when the label is already correct. 
+async fn install_stage_label(config: &config::Config) -> Result<()> { + info!("install (label): applying node label"); + + match k8s::get_node_label(config, KATA_RUNTIME_LABEL).await { + Ok(Some(ref val)) if val == "true" => { + info!( + "install (label): node already labeled {}=true, skipping", + KATA_RUNTIME_LABEL + ); + return Ok(()); + } + // Any other state (absent, different value, or a transient read error) + // falls through to label_node_with_retry, which applies and verifies. + _ => {} + } + + label_node_with_retry(config, KATA_RUNTIME_LABEL, "true").await?; - info!("Kata Containers installation completed successfully"); Ok(()) } @@ -539,7 +686,7 @@ async fn cleanup(config: &config::Config, runtime: &str) -> Result<()> { info!("No other kata-deploy DaemonSets found, performing full shared cleanup"); info!("Removing kata-runtime label from node"); - k8s::label_node(config, "katacontainers.io/kata-runtime", None, false).await?; + k8s::label_node(config, KATA_RUNTIME_LABEL, None, false).await?; info!("Successfully removed kata-runtime label"); // Restart the CRI runtime last. On k3s/rke2 this restarts the entire @@ -553,10 +700,111 @@ async fn cleanup(config: &config::Config, runtime: &str) -> Result<()> { Ok(()) } +/// Cleanup stage 1 (unlabel): remove the kata-runtime node label first so the +/// scheduler stops placing kata workloads on this node before any host +/// mutation. Unprivileged, Kubernetes API only. Skips when already absent. +async fn cleanup_stage_unlabel(config: &config::Config) -> Result<()> { + info!("cleanup (unlabel): removing node label"); + + // If the label is already absent, there is nothing to do. Any other state + // (present, or unknown due to a transient read error) falls through to the + // removal below. 
+ if let Ok(None) = k8s::get_node_label(config, KATA_RUNTIME_LABEL).await { + info!( + "cleanup (unlabel): label {} already absent, skipping", + KATA_RUNTIME_LABEL + ); + return Ok(()); + } + + k8s::label_node(config, KATA_RUNTIME_LABEL, None, false).await?; + info!("cleanup (unlabel): label removed"); + Ok(()) +} + +/// Cleanup stage 2 (revert-cri): remove CRI drop-ins (and any snapshotter +/// config), then restart the runtime and wait for readiness. This is the +/// privileged, node-disrupting cleanup stage and is kept short-lived. Skips +/// entirely when the CRI drop-ins are already absent, avoiding an unnecessary +/// runtime restart. +async fn cleanup_stage_revert_cri(config: &config::Config, runtime: &str) -> Result<()> { + info!("cleanup (revert-cri): reverting CRI configuration"); + + if !cri_drop_in_present(config, runtime).await { + info!("cleanup (revert-cri): CRI drop-ins already absent, skipping"); + return Ok(()); + } + + if runtime != "crio" { + if let Some(snapshotters) = config.experimental_setup_snapshotter.as_ref() { + for snapshotter in snapshotters { + info!("cleanup (revert-cri): uninstalling snapshotter {snapshotter}"); + artifacts::snapshotters::uninstall_snapshotter(snapshotter, config).await?; + } + } + } + + runtime::cleanup_cri_runtime_config(config, runtime).await?; + + info!("cleanup (revert-cri): restarting runtime"); + runtime::restart_and_wait_for_ready(config, runtime).await?; + info!("cleanup (revert-cri): runtime restarted"); + + Ok(()) +} + +/// Cleanup stage 3 (remove-artifacts): delete kata artifacts/config/symlinks +/// from the host. Skips when the install directory is already gone. 
+async fn cleanup_stage_remove_artifacts(config: &config::Config) -> Result<()> { + info!("cleanup (remove-artifacts): removing kata artifacts from host"); + + if !std::path::Path::new(&config.host_install_dir).exists() { + info!( + "cleanup (remove-artifacts): install dir {} already absent, skipping", + config.host_install_dir + ); + return Ok(()); + } + + artifacts::remove_artifacts(config).await?; + info!("cleanup (remove-artifacts): artifacts removed"); + Ok(()) +} + +/// Best-effort check for whether kata's CRI drop-in configuration is present on +/// the host for this runtime. Used by the staged cleanup to skip a disruptive +/// runtime restart when there is nothing to revert. On any uncertainty (e.g. +/// the containerd paths cannot be resolved) this returns `true` so the caller +/// errs on the side of running the revert rather than incorrectly skipping it. +async fn cri_drop_in_present(config: &config::Config, runtime: &str) -> bool { + if runtime == "crio" { + return std::path::Path::new(&config.crio_drop_in_conf_file).exists(); + } + + match config.get_containerd_paths(runtime).await { + Ok(paths) => { + // /etc/containerd is mounted directly; other paths live under /host. 
+ let resolved = if paths.drop_in_file.starts_with("/etc/containerd/") { + std::path::PathBuf::from(&paths.drop_in_file) + } else { + std::path::Path::new("/host").join(paths.drop_in_file.trim_start_matches('/')) + }; + resolved.exists() + } + Err(e) => { + log::warn!( + "cleanup (revert-cri): could not resolve containerd paths to check drop-in \ + presence ({e}); proceeding with revert" + ); + true + } + } +} + async fn reset(config: &config::Config, runtime: &str) -> Result<()> { info!("Resetting Kata Containers"); - k8s::label_node(config, "katacontainers.io/kata-runtime", None, false).await?; + k8s::label_node(config, KATA_RUNTIME_LABEL, None, false).await?; runtime::lifecycle::restart_cri_runtime(config, runtime).await?; if matches!(runtime, "crio" | "containerd") { utils::host_systemctl(&["restart", "kubelet"])?; @@ -566,3 +814,86 @@ async fn reset(config: &config::Config, runtime: &str) -> Result<()> { info!("Kata Containers reset completed successfully"); Ok(()) } + +#[cfg(test)] +mod tests { + //! Tests for CLI action wiring. The staged install/cleanup actions are the + //! entrypoints the JobSet workflow invokes per node, so we lock in their + //! exact subcommand names (a rename would silently break the chart) and the + //! mapping into the `Action` enum. + + use super::*; + use clap::ValueEnum; + use rstest::rstest; + + /// Every staged subcommand name parses into the expected `Action` variant. + /// Keep this in sync with the `#[clap(name = ...)]` attributes above. 
+ #[rstest] + #[case("install", Action::Install)] + #[case("cleanup", Action::Cleanup)] + #[case("reset", Action::Reset)] + #[case("install-stage-host-check", Action::InstallStageHostCheck)] + #[case("install-stage-artifacts", Action::InstallStageArtifacts)] + #[case("install-stage-cri", Action::InstallStageCri)] + #[case("install-stage-label", Action::InstallStageLabel)] + #[case("cleanup-stage-unlabel", Action::CleanupStageUnlabel)] + #[case("cleanup-stage-revert-cri", Action::CleanupStageRevertCri)] + #[case("cleanup-stage-remove-artifacts", Action::CleanupStageRemoveArtifacts)] + #[case("internal-post-install-wait", Action::InternalPostInstallWait)] + fn test_action_parses_from_arg(#[case] arg: &str, #[case] expected: Action) { + let args = Args::try_parse_from(["kata-deploy", arg]) + .unwrap_or_else(|e| panic!("failed to parse action {arg:?}: {e}")); + assert_eq!( + std::mem::discriminant(&args.action), + std::mem::discriminant(&expected), + "arg {arg:?} parsed into the wrong Action variant", + ); + } + + /// Unknown actions must be rejected rather than silently accepted. + #[rstest] + #[case("install-stage")] + #[case("cleanup-stage")] + #[case("install-stage-foo")] + #[case("bogus")] + fn test_unknown_action_is_rejected(#[case] arg: &str) { + assert!( + Args::try_parse_from(["kata-deploy", arg]).is_err(), + "expected action {arg:?} to be rejected", + ); + } + + /// The hidden internal waiter must stay hidden from `--help` so users never + /// invoke it directly, while still being parseable (asserted above). + #[test] + fn test_internal_action_is_hidden() { + let internal = Action::InternalPostInstallWait + .to_possible_value() + .expect("internal action should have a possible value"); + assert!( + internal.is_hide_set(), + "internal-post-install-wait should be hidden from --help", + ); + } + + /// All non-internal staged actions remain visible in `--help` so operators + /// can discover and run individual stages. 
+ #[rstest] + #[case(Action::InstallStageHostCheck)] + #[case(Action::InstallStageArtifacts)] + #[case(Action::InstallStageCri)] + #[case(Action::InstallStageLabel)] + #[case(Action::CleanupStageUnlabel)] + #[case(Action::CleanupStageRevertCri)] + #[case(Action::CleanupStageRemoveArtifacts)] + fn test_staged_actions_are_visible(#[case] action: Action) { + let value = action + .to_possible_value() + .expect("staged action should have a possible value"); + assert!( + !value.is_hide_set(), + "staged action {:?} should be visible in --help", + value.get_name(), + ); + } +}