From 225ff2209ed49353311d35da4b608936b1b96974 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= <ffidencio@nvidia.com>
Date: Wed, 3 Jun 2026 22:06:16 +0200
Subject: [PATCH] kata-deploy: split install/cleanup into staged actions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 1 of migrating kata-deploy from a DaemonSet to a staged JobSet
workflow: refactor the binary's install/cleanup flows into discrete,
independently invocable stages while keeping the existing DaemonSet
path fully working.

Add new staged subcommands that each run one step and exit, so a JobSet
can drive them as ordered initContainers/Jobs per node:

  install: host-check -> artifacts -> cri -> label
  cleanup (reverse): unlabel -> revert-cri -> remove-artifacts

`install` becomes a compatibility wrapper composing the install stages
in the canonical order, so the DaemonSet deployment model is unchanged.
The DaemonSet `cleanup` (with its DaemonSet-presence gating) is left
intact; the staged cleanup actions are added alongside it and skip that
gating since the JobSet workflow only schedules them on a real uninstall.

The following stages have an idempotent skip check so reruns are safe:
  - install label / cleanup unlabel: short-circuit via the node label
  - cleanup remove-artifacts: skip when the install dir is already gone
  - cleanup revert-cri: skip the disruptive runtime restart when the CRI
    drop-ins are already absent (new cri_drop_in_present helper)

Introduce a shared KATA_RUNTIME_LABEL constant and add rstest-based
tests covering the subcommand-name -> Action mapping, rejection of
unknown actions, and the visible/hidden help semantics.

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
Assisted-by: Cursor <cursoragent@cursor.com>
---
 .../packaging/kata-deploy/binary/src/main.rs  | 365 +++++++++++++++++-
 1 file changed, 348 insertions(+), 17 deletions(-)

diff --git a/tools/packaging/kata-deploy/binary/src/main.rs b/tools/packaging/kata-deploy/binary/src/main.rs
index 6691fed0e9..d015fa05e1 100644
--- a/tools/packaging/kata-deploy/binary/src/main.rs
+++ b/tools/packaging/kata-deploy/binary/src/main.rs
@@ -56,6 +56,39 @@ enum Action {
     Install,
     Cleanup,
     Reset,
+    /// Stage 0 of a staged (JobSet) install: validate host/node prerequisites
+    /// without mutating the host. Fails fast with actionable diagnostics when
+    /// the node cannot support installation.
+    #[clap(name = "install-stage-host-check")]
+    InstallStageHostCheck,
+    /// Stage 1 of a staged (JobSet) install: install kata artifacts/config on
+    /// the host and set up configured snapshotters. Does not touch CRI
+    /// configuration, but is still privileged (host writes + snapshotter setup
+    /// shell into the host via nsenter).
+    #[clap(name = "install-stage-artifacts")]
+    InstallStageArtifacts,
+    /// Stage 2 of a staged (JobSet) install: write CRI drop-ins, restart the
+    /// runtime, and wait for node readiness. Privileged + short-lived.
+    #[clap(name = "install-stage-cri")]
+    InstallStageCri,
+    /// Stage 3 of a staged (JobSet) install: apply the kata-runtime node label.
+    /// Unprivileged, Kubernetes API only.
+    #[clap(name = "install-stage-label")]
+    InstallStageLabel,
+    /// Cleanup stage 1 of a staged (JobSet) uninstall: remove the kata-runtime
+    /// node label first so the scheduler stops placing kata workloads here.
+    /// Unprivileged, Kubernetes API only.
+    #[clap(name = "cleanup-stage-unlabel")]
+    CleanupStageUnlabel,
+    /// Cleanup stage 2 of a staged (JobSet) uninstall: remove CRI drop-ins,
+    /// restart the runtime, and wait for readiness. Privileged + short-lived.
+    #[clap(name = "cleanup-stage-revert-cri")]
+    CleanupStageRevertCri,
+    /// Cleanup stage 3 of a staged (JobSet) uninstall: remove kata
+    /// artifacts/config/symlinks from the host. Privileged (mutates the host
+    /// filesystem under the install dir).
+    #[clap(name = "cleanup-stage-remove-artifacts")]
+    CleanupStageRemoveArtifacts,
     /// Internal: entered via re-exec after install completes. Holds the
     /// DaemonSet pod alive waiting for SIGTERM, then runs cleanup. Hidden
     /// from `--help`; users should never invoke this directly.
@@ -63,6 +96,10 @@ enum Action {
     InternalPostInstallWait,
 }
 
+/// Node label applied to mark a node as kata-capable. Shared across the
+/// install/cleanup label stages so the key stays consistent.
+const KATA_RUNTIME_LABEL: &str = "katacontainers.io/kata-runtime";
+
 // Cap the tokio runtime to a small fixed number of worker threads. The default
 // multi-thread runtime allocates `num_cpus()` workers (each with a ~2 MiB
 // stack), which on a 200+ vCPU GPU node is the dominant contributor to the
@@ -107,6 +144,13 @@ async fn main() -> Result<()> {
         Action::Install => "install",
         Action::Cleanup => "cleanup",
         Action::Reset => "reset",
+        Action::InstallStageHostCheck => "install-stage-host-check",
+        Action::InstallStageArtifacts => "install-stage-artifacts",
+        Action::InstallStageCri => "install-stage-cri",
+        Action::InstallStageLabel => "install-stage-label",
+        Action::CleanupStageUnlabel => "cleanup-stage-unlabel",
+        Action::CleanupStageRevertCri => "cleanup-stage-revert-cri",
+        Action::CleanupStageRemoveArtifacts => "cleanup-stage-remove-artifacts",
         Action::InternalPostInstallWait => "internal-post-install-wait",
     };
     config.print_info(action_str);
@@ -245,6 +289,42 @@ async fn main() -> Result<()> {
             // Exit after completion so the job can complete
             info!("Reset completed, exiting");
         }
+        // Staged (JobSet) install actions. Each runs one step of the install
+        // pipeline as a short-lived Job/initContainer and exits. The DaemonSet
+        // path does not use these directly; it goes through `install` above,
+        // which composes the same stage functions.
+        Action::InstallStageHostCheck => {
+            install_stage_host_check(&config, &runtime).await?;
+            info!("Install host-check stage completed, exiting");
+        }
+        Action::InstallStageArtifacts => {
+            install_stage_artifacts(&config, &runtime).await?;
+            info!("Install artifacts stage completed, exiting");
+        }
+        Action::InstallStageCri => {
+            install_stage_cri(&config, &runtime).await?;
+            info!("Install CRI stage completed, exiting");
+        }
+        Action::InstallStageLabel => {
+            install_stage_label(&config).await?;
+            info!("Install label stage completed, exiting");
+        }
+        // Staged (JobSet) cleanup actions. These run in reverse order
+        // (unlabel -> revert-cri -> remove-artifacts) and, unlike the DaemonSet
+        // `cleanup` above, do not perform DaemonSet-presence gating: the JobSet
+        // workflow only schedules these when an uninstall is actually intended.
+        Action::CleanupStageUnlabel => {
+            cleanup_stage_unlabel(&config).await?;
+            info!("Cleanup unlabel stage completed, exiting");
+        }
+        Action::CleanupStageRevertCri => {
+            cleanup_stage_revert_cri(&config, &runtime).await?;
+            info!("Cleanup revert-cri stage completed, exiting");
+        }
+        Action::CleanupStageRemoveArtifacts => {
+            cleanup_stage_remove_artifacts(&config).await?;
+            info!("Cleanup remove-artifacts stage completed, exiting");
+        }
     }
 
     Ok(())
@@ -273,20 +353,39 @@ fn reexec_into_post_install_wait(
     ))
 }
 
+/// Full install pipeline. Used by the DaemonSet deployment model. Composes the
+/// same per-stage functions the staged JobSet workflow invokes individually, in
+/// the canonical order: host-check -> artifacts -> cri -> label.
 async fn install(config: &config::Config, runtime: &str) -> Result<()> {
     info!("Installing Kata Containers");
 
-    const SUPPORTED_RUNTIMES: &[&str] = &[
-        "crio",
-        "containerd",
-        "k3s",
-        "k3s-agent",
-        "rke2-agent",
-        "rke2-server",
-        "k0s-worker",
-        "k0s-controller",
-        "microk8s",
-    ];
+    install_stage_host_check(config, runtime).await?;
+    install_stage_artifacts(config, runtime).await?;
+    install_stage_cri(config, runtime).await?;
+    install_stage_label(config).await?;
+
+    info!("Kata Containers installation completed successfully");
+    Ok(())
+}
+
+const SUPPORTED_RUNTIMES: &[&str] = &[
+    "crio",
+    "containerd",
+    "k3s",
+    "k3s-agent",
+    "rke2-agent",
+    "rke2-server",
+    "k0s-worker",
+    "k0s-controller",
+    "microk8s",
+];
+
+/// Install stage 0 (host-check): validate that this node can support a Kata
+/// installation before any host mutation happens. This is read-only and safe
+/// to run repeatedly; it fails fast with actionable diagnostics so a staged
+/// JobSet can abort the per-node pipeline before the privileged stages run.
+async fn install_stage_host_check(config: &config::Config, runtime: &str) -> Result<()> {
+    info!("install (host-check): validating node prerequisites for runtime {runtime}");
 
     if !SUPPORTED_RUNTIMES.contains(&runtime) {
         return Err(anyhow::anyhow!(
@@ -345,16 +444,44 @@ async fn install(config: &config::Config, runtime: &str) -> Result<()> {
         }
     }
 
-    runtime::containerd::setup_containerd_config_files(runtime, config).await?;
+    info!("install (host-check): node prerequisites satisfied");
+    Ok(())
+}
+
+/// Install stage 1 (artifacts): place kata artifacts/config on the host and,
+/// for non-crio runtimes, install each configured snapshotter. CRI config is
+/// not touched, but privileged host access is still needed: the stage writes
+/// under the host install dir and snapshotter setup (e.g. nydus) uses nsenter.
+async fn install_stage_artifacts(config: &config::Config, runtime: &str) -> Result<()> {
+    info!("install (artifacts): installing kata artifacts on host");
 
     artifacts::install_artifacts(config, runtime).await?;
 
+    if runtime != "crio" {
+        // An unset snapshotter list flattens to an empty iteration.
+        let configured = config.experimental_setup_snapshotter.as_ref();
+        for snapshotter in configured.into_iter().flatten() {
+            artifacts::snapshotters::install_snapshotter(snapshotter, config).await?;
+        }
+    }
+
+    info!("install (artifacts): artifacts installed");
+    Ok(())
+}
+
+/// Install stage 2 (cri): write CRI drop-ins, configure snapshotters, restart
+/// the runtime, and wait for the node to become ready. This is the privileged,
+/// node-disrupting stage and is kept short-lived.
+async fn install_stage_cri(config: &config::Config, runtime: &str) -> Result<()> {
+    info!("install (cri): configuring CRI runtime");
+
+    runtime::containerd::setup_containerd_config_files(runtime, config).await?;
+
     runtime::configure_cri_runtime(config, runtime).await?;
 
     if runtime != "crio" {
         if let Some(snapshotters) = config.experimental_setup_snapshotter.as_ref() {
             for snapshotter in snapshotters {
-                artifacts::snapshotters::install_snapshotter(snapshotter, config).await?;
                 artifacts::snapshotters::configure_snapshotter(snapshotter, runtime, config)
                     .await?;
             }
@@ -365,9 +492,29 @@ async fn install(config: &config::Config, runtime: &str) -> Result<()> {
     runtime::lifecycle::restart_runtime(config, runtime).await?;
     info!("Runtime restart completed successfully");
 
-    label_node_with_retry(config, "katacontainers.io/kata-runtime", "true").await?;
+    Ok(())
+}
+
+/// Install stage 3 (label): set the kata-runtime node label to "true".
+/// Unprivileged, Kubernetes API only. A no-op when the label already matches.
+async fn install_stage_label(config: &config::Config) -> Result<()> {
+    info!("install (label): applying node label");
+
+    // Fast path: the label is already present with the expected value.
+    if let Ok(Some(ref val)) = k8s::get_node_label(config, KATA_RUNTIME_LABEL).await {
+        if val == "true" {
+            info!(
+                "install (label): node already labeled {}=true, skipping",
+                KATA_RUNTIME_LABEL
+            );
+            return Ok(());
+        }
+    }
+    // Absent label, different value, or a failed read: all reach the labeling
+    // call below, which applies the label with retries.
+
+    label_node_with_retry(config, KATA_RUNTIME_LABEL, "true").await?;
+
-    info!("Kata Containers installation completed successfully");
     Ok(())
 }
 
@@ -539,7 +686,7 @@ async fn cleanup(config: &config::Config, runtime: &str) -> Result<()> {
     info!("No other kata-deploy DaemonSets found, performing full shared cleanup");
 
     info!("Removing kata-runtime label from node");
-    k8s::label_node(config, "katacontainers.io/kata-runtime", None, false).await?;
+    k8s::label_node(config, KATA_RUNTIME_LABEL, None, false).await?;
     info!("Successfully removed kata-runtime label");
 
     // Restart the CRI runtime last. On k3s/rke2 this restarts the entire
@@ -553,10 +700,111 @@ async fn cleanup(config: &config::Config, runtime: &str) -> Result<()> {
     Ok(())
 }
 
+/// Cleanup stage 1 (unlabel): drop the kata-runtime node label before any host
+/// mutation so the scheduler stops placing kata workloads on this node.
+/// Unprivileged, Kubernetes API only. A no-op when the label is already absent.
+async fn cleanup_stage_unlabel(config: &config::Config) -> Result<()> {
+    info!("cleanup (unlabel): removing node label");
+
+    match k8s::get_node_label(config, KATA_RUNTIME_LABEL).await {
+        Ok(None) => {
+            info!(
+                "cleanup (unlabel): label {} already absent, skipping",
+                KATA_RUNTIME_LABEL
+            );
+        }
+        // Label present, or state unknown after a failed read: attempt removal.
+        _ => {
+            k8s::label_node(config, KATA_RUNTIME_LABEL, None, false).await?;
+            info!("cleanup (unlabel): label removed");
+        }
+    }
+    Ok(())
+}
+
+/// Cleanup stage 2 (revert-cri): for non-crio runtimes uninstall each configured
+/// snapshotter, clean up the CRI runtime config, then restart the runtime and
+/// wait for readiness. Returns early, without a restart, when no CRI drop-in
+/// is found. NOTE(review): a rerun after a failure between config cleanup and
+/// restart presumably takes that early return and never restarts — confirm.
+async fn cleanup_stage_revert_cri(config: &config::Config, runtime: &str) -> Result<()> {
+    info!("cleanup (revert-cri): reverting CRI configuration");
+
+    let drop_in_found = cri_drop_in_present(config, runtime).await;
+    if !drop_in_found {
+        info!("cleanup (revert-cri): CRI drop-ins already absent, skipping");
+        return Ok(());
+    }
+
+    if runtime != "crio" {
+        let configured = config.experimental_setup_snapshotter.as_ref();
+        for snapshotter in configured.into_iter().flatten() {
+            info!("cleanup (revert-cri): uninstalling snapshotter {snapshotter}");
+            artifacts::snapshotters::uninstall_snapshotter(snapshotter, config).await?;
+        }
+    }
+
+    runtime::cleanup_cri_runtime_config(config, runtime).await?;
+
+    info!("cleanup (revert-cri): restarting runtime");
+    runtime::restart_and_wait_for_ready(config, runtime).await?;
+    info!("cleanup (revert-cri): runtime restarted");
+
+    Ok(())
+}
+
+/// Cleanup stage 3 (remove-artifacts): delete kata artifacts/config/symlinks from the
+/// host. Skips only when the install dir is confirmed absent; a failed check proceeds.
+async fn cleanup_stage_remove_artifacts(config: &config::Config) -> Result<()> {
+    info!("cleanup (remove-artifacts): removing kata artifacts from host");
+
+    if matches!(std::path::Path::new(&config.host_install_dir).try_exists(), Ok(false)) {
+        info!(
+            "cleanup (remove-artifacts): install dir {} already absent, skipping",
+            config.host_install_dir
+        );
+        return Ok(());
+    }
+
+    artifacts::remove_artifacts(config).await?;
+    info!("cleanup (remove-artifacts): artifacts removed");
+    Ok(())
+}
+
+/// Best-effort check for whether kata's CRI drop-in configuration is present on
+/// the host for this runtime, so staged cleanup can skip a disruptive runtime
+/// restart when there is nothing to revert. On any uncertainty (unresolvable
+/// containerd paths, failed existence check) returns `true` so the revert runs.
+async fn cri_drop_in_present(config: &config::Config, runtime: &str) -> bool {
+    if runtime == "crio" {
+        let drop_in = std::path::Path::new(&config.crio_drop_in_conf_file);
+        return drop_in.try_exists().unwrap_or(true);
+    }
+
+    match config.get_containerd_paths(runtime).await {
+        Ok(paths) => {
+            // /etc/containerd is mounted directly; other paths live under /host.
+            let resolved = if paths.drop_in_file.starts_with("/etc/containerd/") {
+                std::path::PathBuf::from(&paths.drop_in_file)
+            } else {
+                std::path::Path::new("/host").join(paths.drop_in_file.trim_start_matches('/'))
+            };
+            resolved.try_exists().unwrap_or(true)
+        }
+        Err(e) => {
+            log::warn!(
+                "cleanup (revert-cri): could not resolve containerd paths to check drop-in \
+                 presence ({e}); proceeding with revert"
+            );
+            true
+        }
+    }
+}
+
 async fn reset(config: &config::Config, runtime: &str) -> Result<()> {
     info!("Resetting Kata Containers");
 
-    k8s::label_node(config, "katacontainers.io/kata-runtime", None, false).await?;
+    k8s::label_node(config, KATA_RUNTIME_LABEL, None, false).await?;
     runtime::lifecycle::restart_cri_runtime(config, runtime).await?;
     if matches!(runtime, "crio" | "containerd") {
         utils::host_systemctl(&["restart", "kubelet"])?;
@@ -566,3 +814,86 @@ async fn reset(config: &config::Config, runtime: &str) -> Result<()> {
     info!("Kata Containers reset completed successfully");
     Ok(())
 }
+
+#[cfg(test)]
+mod tests {
+    //! Tests for CLI action wiring. The staged install/cleanup actions are the
+    //! entrypoints the JobSet workflow invokes per node, so we lock in their
+    //! exact subcommand names (a rename would silently break the chart) and the
+    //! mapping into the `Action` enum.
+
+    use super::*;
+    use clap::ValueEnum;
+    use rstest::rstest;
+
+    /// Every subcommand name (legacy, staged, internal) parses into the expected
+    /// `Action` variant. Keep in sync with the `#[clap(name = ...)]` attributes.
+    #[rstest]
+    #[case("install", Action::Install)]
+    #[case("cleanup", Action::Cleanup)]
+    #[case("reset", Action::Reset)]
+    #[case("install-stage-host-check", Action::InstallStageHostCheck)]
+    #[case("install-stage-artifacts", Action::InstallStageArtifacts)]
+    #[case("install-stage-cri", Action::InstallStageCri)]
+    #[case("install-stage-label", Action::InstallStageLabel)]
+    #[case("cleanup-stage-unlabel", Action::CleanupStageUnlabel)]
+    #[case("cleanup-stage-revert-cri", Action::CleanupStageRevertCri)]
+    #[case("cleanup-stage-remove-artifacts", Action::CleanupStageRemoveArtifacts)]
+    #[case("internal-post-install-wait", Action::InternalPostInstallWait)]
+    fn test_action_parses_from_arg(#[case] arg: &str, #[case] expected: Action) {
+        let args = Args::try_parse_from(["kata-deploy", arg])
+            .unwrap_or_else(|e| panic!("failed to parse action {arg:?}: {e}"));
+        assert_eq!(
+            std::mem::discriminant(&args.action),
+            std::mem::discriminant(&expected),
+            "arg {arg:?} parsed into the wrong Action variant",
+        );
+    }
+
+    /// Names that are not exact subcommands (including bare stage prefixes) must
+    /// fail to parse rather than being silently accepted.
+    #[rstest]
+    #[case("install-stage")]
+    #[case("cleanup-stage")]
+    #[case("install-stage-foo")]
+    #[case("bogus")]
+    fn test_unknown_action_is_rejected(#[case] arg: &str) {
+        assert!(
+            Args::try_parse_from(["kata-deploy", arg]).is_err(),
+            "expected action {arg:?} to be rejected",
+        );
+    }
+
+    /// The hidden internal waiter must stay hidden from `--help` so users never
+    /// invoke it directly, while still being parseable (asserted above).
+    #[test]
+    fn test_internal_action_is_hidden() {
+        let internal = Action::InternalPostInstallWait
+            .to_possible_value()
+            .expect("internal action should have a possible value");
+        assert!(
+            internal.is_hide_set(),
+            "internal-post-install-wait should be hidden from --help",
+        );
+    }
+
+    /// All non-internal staged actions remain visible in `--help` so operators
+    /// can discover and run individual stages.
+    #[rstest]
+    #[case(Action::InstallStageHostCheck)]
+    #[case(Action::InstallStageArtifacts)]
+    #[case(Action::InstallStageCri)]
+    #[case(Action::InstallStageLabel)]
+    #[case(Action::CleanupStageUnlabel)]
+    #[case(Action::CleanupStageRevertCri)]
+    #[case(Action::CleanupStageRemoveArtifacts)]
+    fn test_staged_actions_are_visible(#[case] action: Action) {
+        let value = action
+            .to_possible_value()
+            .expect("staged action should have a possible value");
+        assert!(
+            !value.is_hide_set(),
+            "staged action {:?} should be visible in --help",
+            value.get_name(),
+        );
+    }
+}