From 87d27e0cc81e9a61adb9d7738a29ba35d6945981 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 3 Jun 2026 21:42:02 +0200 Subject: [PATCH 1/9] kata-deploy-job-dispatcher: add generic per-node Job dispatcher MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a small, deployment-agnostic dispatcher binary that runs exactly one Kubernetes Job per selected node and paces the rollout, so callers get guaranteed per-node coverage without encoding the fan-out in Helm. Motivation: templating one Job per node into a Helm release does not scale (the release Secret hits etcd's 1 MiB limit and hooks run sequentially), and a single Indexed Job cannot guarantee per-node coverage when paced - the scheduler ignores completed pods when evaluating topology spread, so nodes get uneven numbers of pods. A tiny dispatcher that enumerates nodes live and creates node-pinned Jobs itself sidesteps both problems and keeps the Helm release O(1) in fleet size. The dispatcher: - enumerates target nodes live (explicit --nodes list or --node-selector label selector), paginating the API; - stamps out one Job per node from a YAML template, pinning it with nodeName and an owner label for server-side filtering; - keeps at most --parallelism Jobs in flight, refilling as they finish, and sets an OwnerReference to the owner Job so the per-node Jobs are garbage-collected with it; - is a plain API client (kube): it never touches the host, so it can run fully unprivileged. Node membership is resolved live on each run, not frozen at Helm template-render time: re-running the dispatcher (e.g. via `helm upgrade`) picks up nodes added since the last run and skips ones already done, as the per-node stages are idempotent. The dispatcher is one-shot, however - it does not watch the API, so nodes added while it is not running are only covered by the next run. 
job.rs holds the pure helpers (node-name sanitization, deterministic Job naming, template instantiation, status interpretation) with rstest unit tests; main.rs wires up the CLI and the fan-out loop. Signed-off-by: Fabiano Fidêncio Assisted-by: Cursor --- Cargo.lock | 15 + Cargo.toml | 3 + .../kata-deploy/job-dispatcher/Cargo.toml | 39 ++ .../kata-deploy/job-dispatcher/src/job.rs | 347 ++++++++++++++++ .../kata-deploy/job-dispatcher/src/main.rs | 373 ++++++++++++++++++ 5 files changed, 777 insertions(+) create mode 100644 tools/packaging/kata-deploy/job-dispatcher/Cargo.toml create mode 100644 tools/packaging/kata-deploy/job-dispatcher/src/job.rs create mode 100644 tools/packaging/kata-deploy/job-dispatcher/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index 7d8a405b5b..2a6f8c29d1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3650,6 +3650,21 @@ dependencies = [ "zstd 0.13.3", ] +[[package]] +name = "kata-deploy-job-dispatcher" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "env_logger", + "k8s-openapi", + "kube", + "log", + "rstest 0.18.2", + "serde_yaml 0.9.34+deprecated", + "tokio", +] + [[package]] name = "kata-sys-util" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 668c51eed2..7803baa1c5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,9 @@ members = [ # kata-deploy (Kubernetes installer binary) "tools/packaging/kata-deploy/binary", + # kata-deploy-job-dispatcher (generic per-node Job dispatcher) + "tools/packaging/kata-deploy/job-dispatcher", + # runtime-rs "src/runtime-rs", "src/runtime-rs/crates/agent", diff --git a/tools/packaging/kata-deploy/job-dispatcher/Cargo.toml b/tools/packaging/kata-deploy/job-dispatcher/Cargo.toml new file mode 100644 index 0000000000..dca8c223b5 --- /dev/null +++ b/tools/packaging/kata-deploy/job-dispatcher/Cargo.toml @@ -0,0 +1,39 @@ +[package] +name = "kata-deploy-job-dispatcher" +version = "0.1.0" +authors.workspace = true +edition = "2021" +license.workspace = true +rust-version.workspace = 
true + +[[bin]] +name = "kata-deploy-job-dispatcher" +path = "src/main.rs" + +[dependencies] +anyhow.workspace = true +clap.workspace = true +env_logger = "0.10" +k8s-openapi = { version = "0.26", default-features = false, features = [ + "v1_33", +] } +# Only the bare client is needed: this tool drives Jobs by polling with +# Api::list, so kube::runtime (watchers/reflectors) and kube::derive are not +# pulled in. `ring` matches kube's default rustls CryptoProvider and must stay +# enabled, otherwise rustls panics at startup. +kube = { version = "2.0", default-features = false, features = [ + "client", + "rustls-tls", + "ring", +] } +log.workspace = true +serde_yaml = "0.9" +tokio = { workspace = true, features = [ + "rt-multi-thread", + "macros", + "signal", + "time", +] } + +[dev-dependencies] +rstest.workspace = true diff --git a/tools/packaging/kata-deploy/job-dispatcher/src/job.rs b/tools/packaging/kata-deploy/job-dispatcher/src/job.rs new file mode 100644 index 0000000000..2fd6ddacbd --- /dev/null +++ b/tools/packaging/kata-deploy/job-dispatcher/src/job.rs @@ -0,0 +1,347 @@ +// Copyright (c) 2026 NVIDIA Corporation +// +// SPDX-License-Identifier: Apache-2.0 + +use k8s_openapi::api::batch::v1::Job; +use k8s_openapi::apimachinery::pkg::apis::meta::v1::OwnerReference; +use std::collections::hash_map::DefaultHasher; +use std::collections::BTreeMap; +use std::hash::{Hash, Hasher}; + +/// Label applied to every per-node Job, set to the dispatcher's name prefix. +/// Used as a server-side selector so the dispatcher only ever sees the Jobs it +/// created (and not unrelated Jobs in the namespace). +pub const OWNER_LABEL: &str = "kata-deploy-job-dispatcher/owner"; + +/// Label carrying the (sanitized) target node name, for human inspection. +pub const NODE_LABEL: &str = "kata-deploy-job-dispatcher/node"; + +/// Annotation carrying the full, unmodified target node name. 
Node names can +/// exceed the 63-char label-value limit or contain characters invalid in a +/// label value, so the authoritative value lives in an annotation. +pub const NODE_ANNOTATION: &str = "kata-deploy-job-dispatcher/node-name"; + +/// Maximum length of a DNS-1123 label and of a Kubernetes label value. +pub const MAX_LABEL_LEN: usize = 63; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum JobOutcome { + Running, + Succeeded, + Failed, +} + +/// Lowercase a node name and replace any character that is not a valid +/// DNS-1123 label character (`[a-z0-9-]`) with `-`, then trim leading/trailing +/// dashes. The result is safe to embed in a Job name and label value. +pub fn sanitize_node(node: &str) -> String { + let lowered = node.to_ascii_lowercase(); + let mapped: String = lowered + .chars() + .map(|c| { + if c.is_ascii_alphanumeric() || c == '-' { + c + } else { + '-' + } + }) + .collect(); + mapped.trim_matches('-').to_string() +} + +/// Short, stable hex digest of an arbitrary string. Used to keep generated +/// Job names unique when the sanitized/truncated form would otherwise collide. +fn short_hash(s: &str) -> String { + let mut hasher = DefaultHasher::new(); + s.hash(&mut hasher); + format!("{:08x}", (hasher.finish() & 0xffff_ffff) as u32) +} + +/// Build a deterministic, RFC1123-label-safe Job name (`<= 63` chars) for a +/// node. When `-` fits it is used verbatim; otherwise +/// it is truncated and a short hash of the *full* node name is appended so two +/// different long node names cannot collide. +pub fn job_name(prefix: &str, node: &str) -> String { + let sanitized = sanitize_node(node); + let base = format!("{prefix}-{sanitized}"); + if base.len() <= MAX_LABEL_LEN { + return base; + } + let hash = short_hash(node); + // Reserve room for "-" + hash. 
+ let keep = MAX_LABEL_LEN.saturating_sub(hash.len() + 1); + let truncated = base.chars().take(keep).collect::(); + format!("{}-{}", truncated.trim_end_matches('-'), hash) +} + +/// Sanitize an arbitrary string into a value safe to use BOTH as the prefix of +/// a DNS-1123 Job name and as a Kubernetes label value: lowercased, every +/// non-`[a-z0-9-]` character replaced with `-`, leading/trailing `-` trimmed, +/// and truncated to [`MAX_LABEL_LEN`] (re-trimming any trailing `-` left by the +/// truncation). The dispatcher records its `--name-prefix` in [`OWNER_LABEL`] +/// and reuses it as the Job-name prefix, so callers can pass a raw value (e.g. +/// a Helm release/suffix) without risking an invalid or over-long label. +pub fn sanitize_label_value(value: &str) -> String { + let sanitized = sanitize_node(value); + if sanitized.len() <= MAX_LABEL_LEN { + return sanitized; + } + sanitized + .chars() + .take(MAX_LABEL_LEN) + .collect::() + .trim_end_matches('-') + .to_string() +} + +/// True if `job` carries [`OWNER_LABEL`] set to exactly `owner_value`. Used to +/// decide whether a pre-existing (409) Job is safe to adopt: the dispatcher +/// only ever LISTs Jobs by that label, so adopting one that lacks it would +/// leave it stuck in-flight forever. +pub fn job_owned_by(job: &Job, owner_value: &str) -> bool { + job.metadata + .labels + .as_ref() + .and_then(|labels| labels.get(OWNER_LABEL)) + .map(|value| value == owner_value) + .unwrap_or(false) +} + +/// Clone the template Job and specialize it for a single node: +/// - set a unique `metadata.name`, +/// - pin the pod to `node` via `spec.template.spec.nodeName`, +/// - add owner/node tracking labels (+ a full-name annotation), +/// - optionally attach an `ownerReference` for garbage collection. +/// +/// `owner_value` is the dispatcher's name prefix, recorded in [`OWNER_LABEL`] so +/// the dispatcher can list back only its own Jobs. 
+pub fn build_node_job( + template: &Job, + name: &str, + node: &str, + owner_value: &str, + owner: Option<&OwnerReference>, +) -> Job { + let mut job = template.clone(); + + job.metadata.name = Some(name.to_string()); + // A template may carry generateName; an explicit name wins, drop it to + // avoid the apiserver rejecting both being set. + job.metadata.generate_name = None; + + let labels = job.metadata.labels.get_or_insert_with(BTreeMap::new); + labels.insert(OWNER_LABEL.to_string(), owner_value.to_string()); + labels.insert(NODE_LABEL.to_string(), sanitize_node(node)); + + let annotations = job.metadata.annotations.get_or_insert_with(BTreeMap::new); + annotations.insert(NODE_ANNOTATION.to_string(), node.to_string()); + + if let Some(owner_ref) = owner { + job.metadata.owner_references = Some(vec![owner_ref.clone()]); + } + + let spec = job.spec.get_or_insert_with(Default::default); + + // Mirror the owner label onto the pod template so the pods are easy to + // find too. + let tmpl_meta = spec.template.metadata.get_or_insert_with(Default::default); + let tmpl_labels = tmpl_meta.labels.get_or_insert_with(BTreeMap::new); + tmpl_labels.insert(OWNER_LABEL.to_string(), owner_value.to_string()); + + let pod_spec = spec.template.spec.get_or_insert_with(Default::default); + pod_spec.node_name = Some(node.to_string()); + + job +} + +/// Interpret a Job's `.status` into a coarse outcome. Prefers the explicit +/// `Complete`/`Failed` conditions; falls back to the succeeded counter. 
+pub fn interpret_status(job: &Job) -> JobOutcome { + let Some(status) = job.status.as_ref() else { + return JobOutcome::Running; + }; + + if let Some(conditions) = status.conditions.as_ref() { + for c in conditions { + if c.status != "True" { + continue; + } + match c.type_.as_str() { + "Failed" => return JobOutcome::Failed, + "Complete" => return JobOutcome::Succeeded, + _ => {} + } + } + } + + if status.succeeded.unwrap_or(0) >= 1 { + return JobOutcome::Succeeded; + } + + JobOutcome::Running +} + +#[cfg(test)] +mod tests { + use super::*; + use rstest::rstest; + + #[rstest] + #[case("worker-0", "worker-0")] + #[case("Worker.Example.COM", "worker-example-com")] + #[case("--node--", "node")] + #[case("a_b/c", "a-b-c")] + fn test_sanitize_node(#[case] input: &str, #[case] expected: &str) { + assert_eq!(sanitize_node(input), expected); + } + + #[rstest] + #[case("kata-deploy-install", "kata-deploy-install")] + #[case("Kata_Deploy.Install", "kata-deploy-install")] + #[case("--weird--", "weird")] + fn test_sanitize_label_value_short(#[case] input: &str, #[case] expected: &str) { + assert_eq!(sanitize_label_value(input), expected); + } + + #[test] + fn test_sanitize_label_value_truncates() { + let out = sanitize_label_value(&"a".repeat(100)); + assert_eq!(out.len(), MAX_LABEL_LEN); + assert!( + !out.ends_with('-'), + "truncation must not leave a trailing dash" + ); + } + + #[test] + fn test_job_owned_by() { + let mut job = Job::default(); + assert!(!job_owned_by(&job, "kata-deploy-install")); + job.metadata + .labels + .get_or_insert_with(BTreeMap::new) + .insert(OWNER_LABEL.to_string(), "kata-deploy-install".to_string()); + assert!(job_owned_by(&job, "kata-deploy-install")); + assert!(!job_owned_by(&job, "other-owner")); + } + + #[rstest] + #[case("kata-deploy-install", "worker-0", "kata-deploy-install-worker-0")] + #[case("kata-deploy-cleanup", "Worker.0", "kata-deploy-cleanup-worker-0")] + fn test_job_name_short(#[case] prefix: &str, #[case] node: &str, #[case] 
expected: &str) { + assert_eq!(job_name(prefix, node), expected); + } + + #[test] + fn test_job_name_truncated_and_unique() { + let prefix = "kata-deploy-install"; + let long_a = "node-with-a-really-really-really-really-really-long-name-aaaaaaa"; + let long_b = "node-with-a-really-really-really-really-really-long-name-bbbbbbb"; + + let name_a = job_name(prefix, long_a); + let name_b = job_name(prefix, long_b); + + assert!( + name_a.len() <= 63, + "name too long: {} ({})", + name_a, + name_a.len() + ); + assert!( + name_b.len() <= 63, + "name too long: {} ({})", + name_b, + name_b.len() + ); + assert_ne!( + name_a, name_b, + "different node names must yield different job names" + ); + } + + #[test] + fn test_build_node_job_pins_node_and_labels() { + let template: Job = serde_yaml::from_str( + r#" +apiVersion: batch/v1 +kind: Job +metadata: + name: ignored +spec: + template: + spec: + restartPolicy: Never + containers: + - name: c + image: busybox +"#, + ) + .unwrap(); + + let owner = OwnerReference { + api_version: "batch/v1".to_string(), + kind: "Job".to_string(), + name: "dispatcher".to_string(), + uid: "abc-123".to_string(), + controller: Some(false), + block_owner_deletion: Some(false), + }; + + let job = build_node_job( + &template, + "kata-deploy-install-node1", + "node1", + "kata-deploy-install", + Some(&owner), + ); + + assert_eq!( + job.metadata.name.as_deref(), + Some("kata-deploy-install-node1") + ); + let labels = job.metadata.labels.unwrap(); + assert_eq!( + labels.get(OWNER_LABEL).map(String::as_str), + Some("kata-deploy-install") + ); + assert_eq!(labels.get(NODE_LABEL).map(String::as_str), Some("node1")); + let annotations = job.metadata.annotations.unwrap(); + assert_eq!( + annotations.get(NODE_ANNOTATION).map(String::as_str), + Some("node1") + ); + assert_eq!(job.metadata.owner_references.unwrap().len(), 1); + let pod_spec = job.spec.unwrap().template.spec.unwrap(); + assert_eq!(pod_spec.node_name.as_deref(), Some("node1")); + } + + fn 
job_with_status(status_yaml: &str) -> Job { + let yaml = format!( + "apiVersion: batch/v1\nkind: Job\nmetadata:\n name: j\nstatus:\n{status_yaml}" + ); + serde_yaml::from_str(&yaml).unwrap() + } + + #[rstest] + #[case( + " conditions:\n - type: Complete\n status: \"True\"\n", + JobOutcome::Succeeded + )] + #[case( + " conditions:\n - type: Failed\n status: \"True\"\n", + JobOutcome::Failed + )] + #[case( + " conditions:\n - type: Complete\n status: \"False\"\n", + JobOutcome::Running + )] + #[case(" succeeded: 1\n", JobOutcome::Succeeded)] + fn test_interpret_status(#[case] status_yaml: &str, #[case] expected: JobOutcome) { + assert_eq!(interpret_status(&job_with_status(status_yaml)), expected); + } + + #[test] + fn test_interpret_status_running_when_unset() { + assert_eq!(interpret_status(&Job::default()), JobOutcome::Running); + } +} diff --git a/tools/packaging/kata-deploy/job-dispatcher/src/main.rs b/tools/packaging/kata-deploy/job-dispatcher/src/main.rs new file mode 100644 index 0000000000..c5db2e5cf5 --- /dev/null +++ b/tools/packaging/kata-deploy/job-dispatcher/src/main.rs @@ -0,0 +1,373 @@ +// Copyright (c) 2026 NVIDIA Corporation +// +// SPDX-License-Identifier: Apache-2.0 + +//! kata-deploy-job-dispatcher: a small, deployment-agnostic dispatcher that runs exactly +//! one node-pinned Job per selected node. +//! +//! Given a Job template (any `batch/v1` Job manifest) and a node selector, it +//! creates one Job per node — pinned to that node via `spec.nodeName` — keeps +//! at most `--parallelism` Jobs in flight at a time (refilling as they finish), +//! and exits non-zero if any node's Job failed. This gives paced rollouts with +//! *guaranteed per-node coverage*, which an Indexed Job / topology-spread +//! cannot guarantee once `parallelism < completions` (the scheduler ignores +//! completed pods when balancing the spread). +//! +//! It has no host dependencies and only needs RBAC to list nodes and to +//! manage Jobs in its namespace. 
+ +mod job; + +use anyhow::{bail, Context, Result}; +use clap::Parser; +use job::{ + build_node_job, interpret_status, job_name, job_owned_by, sanitize_label_value, JobOutcome, + OWNER_LABEL, +}; +use k8s_openapi::api::batch::v1::Job; +use k8s_openapi::api::core::v1::Node; +use k8s_openapi::apimachinery::pkg::apis::meta::v1::OwnerReference; +use kube::api::{Api, ListParams, PostParams}; +use kube::Client; +use log::{error, info}; +use std::collections::{HashMap, VecDeque}; +use std::time::Duration; + +#[derive(Parser, Debug)] +#[command( + author, + version, + about = "Run one node-pinned Job per selected node, paced and with guaranteed coverage." +)] +struct Args { + /// Path to a YAML file containing the batch/v1 Job to run on each node. + /// The dispatcher clones it per node and sets metadata.name + nodeName. + #[arg(long)] + job_template: String, + + /// Prefix for generated per-node Job names. Also recorded as the + /// "kata-deploy-job-dispatcher/owner" label so the dispatcher tracks only its own Jobs. + #[arg(long)] + name_prefix: String, + + /// Namespace to create the per-node Jobs in. Defaults to $POD_NAMESPACE, + /// then the in-cluster service-account namespace, then "default". + #[arg(long)] + namespace: Option, + + /// Maximum number of per-node Jobs in flight at once. + #[arg(long, default_value_t = 100)] + parallelism: usize, + + /// Server-side label selector used to pick target nodes, e.g. + /// "kubernetes.io/os=linux" or "node-role.kubernetes.io/control-plane". + /// Supports the full label-selector grammar (In/NotIn/Exists/DoesNotExist). + #[arg(long)] + node_selector: Option, + + /// Server-side field selector used to pick target nodes (ANDed with the + /// label selector). + #[arg(long)] + node_field_selector: Option, + + /// Explicit comma-separated node names. When set, the node selectors are + /// ignored and exactly these nodes are targeted. + #[arg(long)] + nodes: Option, + + /// Optional owner Job name (in the dispatcher's namespace). 
When set, every + /// per-node Job gets an ownerReference to it so they are garbage-collected + /// together with the owner. + #[arg(long)] + owner_job_name: Option, + + /// Seconds between status polls. + #[arg(long, default_value_t = 5)] + poll_interval_secs: u64, + + /// Page size used when listing nodes (server-side pagination). + #[arg(long, default_value_t = 500)] + node_page_size: u32, +} + +// The dispatcher is overwhelmingly I/O-bound (apiserver round-trips); two worker +// threads are plenty and keep the footprint small. +#[tokio::main(flavor = "multi_thread", worker_threads = 2)] +async fn main() -> Result<()> { + env_logger::Builder::from_default_env() + .filter_level(log::LevelFilter::Info) + .init(); + + let args = Args::parse(); + + let client = Client::try_default() + .await + .context("failed to create Kubernetes client")?; + + let namespace = resolve_namespace(args.namespace.clone()); + info!("kata-deploy-job-dispatcher starting (namespace: {namespace})"); + + let nodes = resolve_nodes(&client, &args).await?; + if nodes.is_empty() { + info!("no target nodes matched the selection; nothing to do"); + return Ok(()); + } + + let template_raw = std::fs::read_to_string(&args.job_template) + .with_context(|| format!("failed to read job template {}", args.job_template))?; + let template: Job = serde_yaml::from_str(&template_raw) + .with_context(|| format!("failed to parse job template {}", args.job_template))?; + + let owner = match args.owner_job_name.as_deref() { + Some(name) => Some(owner_ref_for_job(&client, &namespace, name).await?), + None => None, + }; + + let jobs: Api = Api::namespaced(client.clone(), &namespace); + + let parallelism = args.parallelism.clamp(1, nodes.len()); + info!( + "fanning out {} per-node Job(s) with parallelism {}", + nodes.len(), + parallelism + ); + + run_fanout( + &jobs, + &template, + &nodes, + &args, + &namespace, + parallelism, + owner.as_ref(), + ) + .await +} + +/// Resolve the namespace to create Jobs in: explicit 
flag, then $POD_NAMESPACE, +/// then the in-cluster service-account namespace file, then "default". +fn resolve_namespace(flag: Option) -> String { + if let Some(ns) = flag.filter(|s| !s.trim().is_empty()) { + return ns; + } + if let Ok(ns) = std::env::var("POD_NAMESPACE") { + if !ns.trim().is_empty() { + return ns; + } + } + if let Ok(ns) = + std::fs::read_to_string("/var/run/secrets/kubernetes.io/serviceaccount/namespace") + { + let ns = ns.trim().to_string(); + if !ns.is_empty() { + return ns; + } + } + "default".to_string() +} + +/// Resolve the set of target node names: an explicit `--nodes` list when given, +/// otherwise a paginated, server-side-filtered LIST of nodes. +async fn resolve_nodes(client: &Client, args: &Args) -> Result> { + if let Some(list) = args.nodes.as_deref() { + let mut names: Vec = list + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect(); + names.sort(); + names.dedup(); + return Ok(names); + } + + let api: Api = Api::all(client.clone()); + let mut names = Vec::new(); + let mut continue_token: Option = None; + + loop { + let lp = ListParams { + limit: Some(args.node_page_size.max(1)), + label_selector: args.node_selector.clone(), + field_selector: args.node_field_selector.clone(), + continue_token: continue_token.clone(), + ..Default::default() + }; + + let page = api.list(&lp).await.context("failed to list nodes")?; + for node in &page.items { + if let Some(name) = node.metadata.name.clone() { + names.push(name); + } + } + + match page.metadata.continue_ { + Some(token) if !token.is_empty() => continue_token = Some(token), + _ => break, + } + } + + names.sort(); + names.dedup(); + Ok(names) +} + +/// Fetch the owner Job and build an `ownerReference` to it (non-controller, so +/// it does not interfere with the Job controller's own ownership of pods). 
+async fn owner_ref_for_job(client: &Client, namespace: &str, name: &str) -> Result { + let jobs: Api = Api::namespaced(client.clone(), namespace); + let job = jobs + .get(name) + .await + .with_context(|| format!("failed to get owner job {name}"))?; + let uid = job + .metadata + .uid + .ok_or_else(|| anyhow::anyhow!("owner job {name} has no uid"))?; + Ok(OwnerReference { + api_version: "batch/v1".to_string(), + kind: "Job".to_string(), + name: name.to_string(), + uid, + controller: Some(false), + block_owner_deletion: Some(false), + }) +} + +/// Create and watch per-node Jobs, keeping at most `parallelism` in flight. +/// Returns an error listing the nodes whose Jobs failed, if any. +async fn run_fanout( + jobs: &Api, + template: &Job, + nodes: &[String], + args: &Args, + namespace: &str, + parallelism: usize, + owner: Option<&OwnerReference>, +) -> Result<()> { + let mut queue: VecDeque<&String> = nodes.iter().collect(); + // job name -> node name + let mut in_flight: HashMap = HashMap::new(); + let mut succeeded = 0usize; + let mut failed: Vec = Vec::new(); + + let post = PostParams::default(); + let poll = Duration::from_secs(args.poll_interval_secs.max(1)); + // The name prefix is recorded in OWNER_LABEL and reused as the Job-name + // prefix; sanitize it once so it is a valid label value / DNS-1123 prefix + // regardless of what the caller passed (e.g. a Helm release suffix). + let owner_value = sanitize_label_value(&args.name_prefix); + let owner_selector = format!("{OWNER_LABEL}={owner_value}"); + + while !queue.is_empty() || !in_flight.is_empty() { + // Refill the in-flight set up to the parallelism cap. 
+ while in_flight.len() < parallelism { + let Some(node) = queue.pop_front() else { + break; + }; + let name = job_name(&owner_value, node); + let node_job = build_node_job(template, &name, node, &owner_value, owner); + match jobs.create(&post, &node_job).await { + Ok(_) => info!("created job {name} (node {node})"), + // A Job with this name already exists (e.g. left over from a + // previous, interrupted run). Only adopt it if it actually + // carries our owner label: status polling LISTs Jobs by that + // label, so adopting one that lacks it (or belongs to someone + // else) would leave it stuck in-flight forever. If it is not + // ours, fail the node instead of hanging. + Err(kube::Error::Api(e)) if e.code == 409 => match jobs.get(&name).await { + Ok(existing) if job_owned_by(&existing, &owner_value) => { + info!("job {name} (node {node}) already exists and is ours, adopting it"); + } + Ok(_) => { + error!( + "job {name} (node {node}) already exists but is not labeled \ + {OWNER_LABEL}={owner_value}; refusing to adopt it" + ); + failed.push(node.clone()); + continue; + } + Err(e) => { + error!("failed to fetch pre-existing job {name} (node {node}): {e}"); + failed.push(node.clone()); + continue; + } + }, + Err(e) => { + error!("failed to create job {name} (node {node}): {e}"); + failed.push(node.clone()); + continue; + } + } + in_flight.insert(name, node.clone()); + } + + if in_flight.is_empty() { + break; + } + + tokio::time::sleep(poll).await; + + // One LIST per poll returns the status of all our Jobs at once. 
+ let lp = ListParams { + label_selector: Some(owner_selector.clone()), + ..Default::default() + }; + let listed = jobs + .list(&lp) + .await + .context("failed to list per-node jobs")?; + let mut status_by_name: HashMap<&str, &Job> = HashMap::new(); + for j in &listed.items { + if let Some(name) = j.metadata.name.as_deref() { + status_by_name.insert(name, j); + } + } + + let mut finished: Vec = Vec::new(); + for (name, node) in &in_flight { + let Some(j) = status_by_name.get(name.as_str()) else { + continue; + }; + match interpret_status(j) { + JobOutcome::Succeeded => { + succeeded += 1; + finished.push(name.clone()); + info!("node {node}: job {name} succeeded"); + } + JobOutcome::Failed => { + failed.push(node.clone()); + finished.push(name.clone()); + error!("node {node}: job {name} failed"); + } + JobOutcome::Running => {} + } + } + for name in finished { + in_flight.remove(&name); + } + + info!( + "progress: {succeeded} succeeded, {} failed, {} in-flight, {} queued", + failed.len(), + in_flight.len(), + queue.len() + ); + } + + if !failed.is_empty() { + failed.sort(); + failed.dedup(); + bail!( + "{} node(s) failed: {}. Inspect the per-node Job logs with: \ + kubectl logs -n {} -l {}={} --all-containers --prefix", + failed.len(), + failed.join(", "), + namespace, + OWNER_LABEL, + owner_value + ); + } + + info!("all {succeeded} node(s) completed successfully"); + Ok(()) +} From d4205c7fccd9ec4cda6b916819bfe3d99a1b7d5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 3 Jun 2026 21:42:15 +0200 Subject: [PATCH 2/9] kata-deploy: build and publish the kata-deploy-job-dispatcher image MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Package and ship the dispatcher built in the previous commit so the job-mode Helm chart has an image to run. 
- Dockerfile.components: build kata-deploy and kata-deploy-job-dispatcher from the same rust-builder stage (one compile), and run fmt/clippy/ test for both crates. - job-dispatcher/Dockerfile: a minimal distroless/static image containing only the dispatcher binary and CA certs - it is an API client, so it needs nothing from the host. - local-build: kata-deploy-job-dispatcher becomes its own build component with its own static tarball (kata-deploy-static-kata-deploy-job-dispatcher.tar.zst); the shared rust-builder output is reused so the two components do not recompile the workspace locally. The payload script builds and pushes a separate "-job-dispatcher" image with the same tag scheme, and release.sh publishes its multi-arch manifest symmetrically. - CI: add kata-deploy-job-dispatcher to the build-kata-deploy-components matrices (its tarball is picked up by the existing kata-artifacts-* glob), and gate it in the kata-deploy rust static checks. Signed-off-by: Fabiano Fidêncio Assisted-by: Cursor --- .../build-kata-static-tarball-amd64.yaml | 1 + .../build-kata-static-tarball-arm64.yaml | 1 + .../build-kata-static-tarball-ppc64le.yaml | 1 + .../build-kata-static-tarball-s390x.yaml | 1 + .github/workflows/static-checks.yaml | 3 ++ .../kata-deploy/Dockerfile.components | 9 +++- .../kata-deploy/job-dispatcher/Dockerfile | 41 +++++++++++++++++++ .../kata-deploy/local-build/Makefile | 5 +++ .../kata-deploy-build-and-upload-payload.sh | 30 +++++++++++++- .../kata-deploy-build-components-tarballs.sh | 38 +++++++++++++++-- tools/packaging/release/release.sh | 24 ++++++++++- 11 files changed, 145 insertions(+), 9 deletions(-) create mode 100644 tools/packaging/kata-deploy/job-dispatcher/Dockerfile diff --git a/.github/workflows/build-kata-static-tarball-amd64.yaml b/.github/workflows/build-kata-static-tarball-amd64.yaml index 40bc636de9..7d520f3cf8 100644 --- a/.github/workflows/build-kata-static-tarball-amd64.yaml +++ b/.github/workflows/build-kata-static-tarball-amd64.yaml 
@@ -162,6 +162,7 @@ jobs: matrix: component: - kata-deploy-binary + - kata-deploy-job-dispatcher - nydus-snapshotter-for-coco-guest-pull concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.event.pull_request.number || github.ref }}-amd64-${{ toJSON(matrix) }} diff --git a/.github/workflows/build-kata-static-tarball-arm64.yaml b/.github/workflows/build-kata-static-tarball-arm64.yaml index 1431680ece..50087e0631 100644 --- a/.github/workflows/build-kata-static-tarball-arm64.yaml +++ b/.github/workflows/build-kata-static-tarball-arm64.yaml @@ -156,6 +156,7 @@ jobs: matrix: component: - kata-deploy-binary + - kata-deploy-job-dispatcher - nydus-snapshotter-for-coco-guest-pull concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.event.pull_request.number || github.ref }}-arm64-${{ toJSON(matrix) }} diff --git a/.github/workflows/build-kata-static-tarball-ppc64le.yaml b/.github/workflows/build-kata-static-tarball-ppc64le.yaml index 8596f757f9..ba98a6a691 100644 --- a/.github/workflows/build-kata-static-tarball-ppc64le.yaml +++ b/.github/workflows/build-kata-static-tarball-ppc64le.yaml @@ -101,6 +101,7 @@ jobs: matrix: component: - kata-deploy-binary + - kata-deploy-job-dispatcher - nydus-snapshotter-for-coco-guest-pull concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.event.pull_request.number || github.ref }}-ppc64le-${{ toJSON(matrix) }} diff --git a/.github/workflows/build-kata-static-tarball-s390x.yaml b/.github/workflows/build-kata-static-tarball-s390x.yaml index 03bf46fc01..c0b5fe0b62 100644 --- a/.github/workflows/build-kata-static-tarball-s390x.yaml +++ b/.github/workflows/build-kata-static-tarball-s390x.yaml @@ -139,6 +139,7 @@ jobs: matrix: component: - kata-deploy-binary + - kata-deploy-job-dispatcher - nydus-snapshotter-for-coco-guest-pull concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.event.pull_request.number || github.ref }}-s390x-${{ toJSON(matrix) }} diff --git 
a/.github/workflows/static-checks.yaml b/.github/workflows/static-checks.yaml index 2c90dacc0c..f3ef50dbfb 100644 --- a/.github/workflows/static-checks.yaml +++ b/.github/workflows/static-checks.yaml @@ -121,6 +121,9 @@ jobs: cargo fmt -p kata-deploy --check cargo clippy -p kata-deploy --all-targets --all-features -- -D warnings RUSTFLAGS="-D warnings" cargo test -p kata-deploy -- --test-threads=1 + cargo fmt -p kata-deploy-job-dispatcher --check + cargo clippy -p kata-deploy-job-dispatcher --all-targets --all-features -- -D warnings + RUSTFLAGS="-D warnings" cargo test -p kata-deploy-job-dispatcher -- --test-threads=1 static-checks: name: static-checks diff --git a/tools/packaging/kata-deploy/Dockerfile.components b/tools/packaging/kata-deploy/Dockerfile.components index 122141a839..cc5c1e3a4f 100644 --- a/tools/packaging/kata-deploy/Dockerfile.components +++ b/tools/packaging/kata-deploy/Dockerfile.components @@ -57,6 +57,7 @@ WORKDIR /kata COPY Cargo.toml Cargo.lock ./ COPY src ./src COPY tools/packaging/kata-deploy/binary ./tools/packaging/kata-deploy/binary +COPY tools/packaging/kata-deploy/job-dispatcher ./tools/packaging/kata-deploy/job-dispatcher # Install target and run tests based on architecture # - AMD64/arm64: use musl for fully static binaries @@ -98,9 +99,11 @@ RUN \ rust_target="$(cat /tmp/rust_target)" && \ echo "Checking code formatting..." && \ cargo fmt -p kata-deploy --check && \ + cargo fmt -p kata-deploy-job-dispatcher --check && \ echo "Code formatting check passed!" && \ echo "Running cargo clippy with target ${rust_target}..." && \ cargo clippy -p kata-deploy --all-targets --all-features --release --locked --target "${rust_target}" -- -D warnings && \ + cargo clippy -p kata-deploy-job-dispatcher --all-targets --all-features --release --locked --target "${rust_target}" -- -D warnings && \ echo "Cargo clippy passed!" 
# Run tests using --test-threads=1 to prevent environment variable pollution between tests, @@ -109,14 +112,16 @@ RUN \ rust_target="$(cat /tmp/rust_target)"; \ echo "Running binary tests with target ${rust_target}..." && \ RUSTFLAGS="-D warnings" cargo test -p kata-deploy --target "${rust_target}" -- --test-threads=1 && \ + RUSTFLAGS="-D warnings" cargo test -p kata-deploy-job-dispatcher --target "${rust_target}" -- --test-threads=1 && \ echo "All tests passed!" RUN \ rust_target="$(cat /tmp/rust_target)"; \ - echo "Building kata-deploy binary for ${rust_target}..." && \ - RUSTFLAGS="-D warnings" cargo build --release -p kata-deploy --target "${rust_target}" && \ + echo "Building kata-deploy + kata-deploy-job-dispatcher binaries for ${rust_target}..." && \ + RUSTFLAGS="-D warnings" cargo build --release -p kata-deploy -p kata-deploy-job-dispatcher --target "${rust_target}" && \ mkdir -p /kata-deploy/bin && \ cp "/kata/target/${rust_target}/release/kata-deploy" /kata-deploy/bin/kata-deploy && \ + cp "/kata/target/${rust_target}/release/kata-deploy-job-dispatcher" /kata-deploy/bin/kata-deploy-job-dispatcher && \ echo "Cleaning up build artifacts to save disk space..." && \ rm -rf /kata/target && \ cargo clean diff --git a/tools/packaging/kata-deploy/job-dispatcher/Dockerfile b/tools/packaging/kata-deploy/job-dispatcher/Dockerfile new file mode 100644 index 0000000000..36fa227e7d --- /dev/null +++ b/tools/packaging/kata-deploy/job-dispatcher/Dockerfile @@ -0,0 +1,41 @@ +# Copyright (c) 2026 Kata Contributors +# +# SPDX-License-Identifier: Apache-2.0 +# +# Minimal image for the job-mode dispatcher (kata-deploy-job-dispatcher). +# +# Unlike the kata-deploy image, this dispatcher never touches the host: it only +# talks to the Kubernetes API (lists nodes, creates/watches per-node Jobs). It +# therefore needs nothing but the statically-linked binary and CA certificates, +# and ships on distroless/static. 
+# +# The binary is produced by the shared rust-builder stage and packaged into +# kata-deploy-static-kata-deploy-job-dispatcher.tar.zst (see Dockerfile.components and +# local-build/kata-deploy-build-components-tarballs.sh). Build from the repo +# root so the tarball path resolves: +# docker build -f tools/packaging/kata-deploy/job-dispatcher/Dockerfile . + +#### Extract the dispatcher binary from its tarball +FROM alpine:3.22 AS extract-stage + +ARG KATA_ARTIFACTS_DIR=tools/packaging/kata-deploy/kata-artifacts + +SHELL ["/bin/ash", "-eo", "pipefail", "-c"] + +RUN apk add --no-cache zstd + +COPY ${KATA_ARTIFACTS_DIR}/kata-deploy-static-kata-deploy-job-dispatcher.tar.zst /tmp/dispatcher.tar.zst + +RUN \ + mkdir -p /opt/dispatcher && \ + zstd -dc /tmp/dispatcher.tar.zst | tar -xf - -C /opt/dispatcher ./usr/bin/kata-deploy-job-dispatcher + +#### Dispatcher image +# distroless does not publish pinned/versioned tags - only rolling ones +# (latest, nonroot, debug) - so :latest is the intended way to consume it. +# hadolint ignore=DL3007 +FROM gcr.io/distroless/static-debian13:latest + +COPY --from=extract-stage /opt/dispatcher/usr/bin/kata-deploy-job-dispatcher /usr/bin/kata-deploy-job-dispatcher + +ENTRYPOINT ["/usr/bin/kata-deploy-job-dispatcher"] diff --git a/tools/packaging/kata-deploy/local-build/Makefile b/tools/packaging/kata-deploy/local-build/Makefile index e6507f8522..5af4a6e989 100644 --- a/tools/packaging/kata-deploy/local-build/Makefile +++ b/tools/packaging/kata-deploy/local-build/Makefile @@ -70,6 +70,7 @@ endif PUBLISH_COMPONENT_TARBALLS = \ kata-deploy-binary-tarball \ + kata-deploy-job-dispatcher-tarball \ nydus-snapshotter-for-coco-guest-pull-tarball ifeq ($(ARCH), x86_64) @@ -106,6 +107,7 @@ endif # can consume a single nvgpu bundle without rebuilding extra components. 
NVGPU_FINAL_TARBALL_INPUTS = \ kata-deploy-static-kata-deploy-binary.tar.zst \ + kata-deploy-static-kata-deploy-job-dispatcher.tar.zst \ kata-deploy-static-nydus-snapshotter-for-coco-guest-pull.tar.zst \ kata-static-kernel-nvidia-gpu.tar.zst \ kata-static-ovmf-sev.tar.zst \ @@ -313,6 +315,9 @@ virtiofsd-tarball: kata-deploy-binary-tarball: $(call BUILD_KATA_DEPLOY_COMPONENT,kata-deploy-binary) +kata-deploy-job-dispatcher-tarball: + $(call BUILD_KATA_DEPLOY_COMPONENT,kata-deploy-job-dispatcher) + nydus-snapshotter-for-coco-guest-pull-tarball: $(call BUILD_KATA_DEPLOY_COMPONENT,nydus-snapshotter-for-coco-guest-pull) diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-build-and-upload-payload.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-build-and-upload-payload.sh index 580402c83e..040ce2b5fb 100755 --- a/tools/packaging/kata-deploy/local-build/kata-deploy-build-and-upload-payload.sh +++ b/tools/packaging/kata-deploy/local-build/kata-deploy-build-and-upload-payload.sh @@ -17,6 +17,18 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)" REGISTRY="${1:-"quay.io/kata-containers/kata-deploy"}" TAG="${2:-}" ARTIFACTS_BUILD_DIR="${3:-${REPO_ROOT}/tools/packaging/kata-deploy/local-build/build}" +# Separate, minimal image for the job-mode dispatcher (kata-deploy-job-dispatcher). +# Built from its own staged tarball, with the same tag scheme as the kata-deploy +# image. 
The repo name mirrors the kata-deploy repo with "-job-dispatcher" inserted +# before any "-ci" suffix, so the "-ci" stays last: +# .../kata-deploy -> .../kata-deploy-job-dispatcher +# .../kata-deploy-ci -> .../kata-deploy-job-dispatcher-ci +if [[ "${REGISTRY}" == *-ci ]]; then + default_job_dispatcher_image_reference="${REGISTRY%-ci}-job-dispatcher-ci" +else + default_job_dispatcher_image_reference="${REGISTRY}-job-dispatcher" +fi +JOB_DISPATCHER_IMAGE_REFERENCE="${4:-${default_job_dispatcher_image_reference}}" KATA_DEPLOY_DIR="${REPO_ROOT}/tools/packaging/kata-deploy" ARTIFACTS_STAGE_DIR="${KATA_DEPLOY_DIR}/kata-artifacts" @@ -40,22 +52,36 @@ arch=$(uname -m) # Disable provenance and SBOM so each tag is a single image manifest. quay.io rejects # pushing multi-arch manifest lists that include attestation manifests ("manifest invalid"). PLATFORM="linux/${arch}" -IMAGE_TAG="${REGISTRY}:kata-containers-$(git -C "${REPO_ROOT}" rev-parse HEAD)-${arch}" +COMMIT_TAG="kata-containers-$(git -C "${REPO_ROOT}" rev-parse HEAD)-${arch}" +IMAGE_TAG="${REGISTRY}:${COMMIT_TAG}" +JOB_DISPATCHER_IMAGE_TAG="${JOB_DISPATCHER_IMAGE_REFERENCE}:${COMMIT_TAG}" DOCKERFILE="${REPO_ROOT}/tools/packaging/kata-deploy/Dockerfile" +JOB_DISPATCHER_DOCKERFILE="${REPO_ROOT}/tools/packaging/kata-deploy/job-dispatcher/Dockerfile" -echo "Building the image" +echo "Building the kata-deploy image" docker buildx build --platform "${PLATFORM}" --provenance false --sbom false \ -f "${DOCKERFILE}" \ --tag "${IMAGE_TAG}" --push . +echo "Building the kata-deploy-job-dispatcher image" +docker buildx build --platform "${PLATFORM}" --provenance false --sbom false \ + -f "${JOB_DISPATCHER_DOCKERFILE}" \ + --tag "${JOB_DISPATCHER_IMAGE_TAG}" --push . 
+ if [[ -n "${TAG}" ]]; then ADDITIONAL_TAG="${REGISTRY}:${TAG}" + JOB_DISPATCHER_ADDITIONAL_TAG="${JOB_DISPATCHER_IMAGE_REFERENCE}:${TAG}" echo "Building the ${ADDITIONAL_TAG} image" docker buildx build --platform "${PLATFORM}" --provenance false --sbom false \ -f "${DOCKERFILE}" \ --tag "${ADDITIONAL_TAG}" --push . + + echo "Building the ${JOB_DISPATCHER_ADDITIONAL_TAG} image" + docker buildx build --platform "${PLATFORM}" --provenance false --sbom false \ + -f "${JOB_DISPATCHER_DOCKERFILE}" \ + --tag "${JOB_DISPATCHER_ADDITIONAL_TAG}" --push . fi popd diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-build-components-tarballs.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-build-components-tarballs.sh index f90abbdbf8..c156639a1d 100755 --- a/tools/packaging/kata-deploy/local-build/kata-deploy-build-components-tarballs.sh +++ b/tools/packaging/kata-deploy/local-build/kata-deploy-build-components-tarballs.sh @@ -37,21 +37,49 @@ if [[ -z "${rust_toolchain}" ]]; then exit 1 fi -build_kata_deploy_binary() { +rust_builder_out="${build_dir}/kata-deploy-binary-out" + +# kata-deploy and kata-deploy-job-dispatcher are produced by the same rust-builder +# stage. Build it once *per process* and let each component package its own +# binary, so running both components in a single invocation does not compile the +# workspace twice. The guard is process-local (not a directory check) on purpose: +# a fresh invocation must always rebuild, otherwise a stale output dir from an +# earlier run/commit would be silently reused. 
+rust_binaries_built="false" +build_rust_binaries() { + if [[ "${rust_binaries_built}" == "true" ]]; then + return + fi + rm -rf "${rust_builder_out}" docker buildx build \ --target rust-builder \ --build-arg "RUST_TOOLCHAIN=${rust_toolchain}" \ - --output "type=local,dest=${build_dir}/kata-deploy-binary-out" \ + --output "type=local,dest=${rust_builder_out}" \ -f "${repo_root_dir}/tools/packaging/kata-deploy/Dockerfile.components" \ "${repo_root_dir}" + rust_binaries_built="true" +} + +build_kata_deploy_binary() { + build_rust_binaries mkdir -p "${build_dir}/kata-deploy-binary/usr/bin" - cp "${build_dir}/kata-deploy-binary-out/kata-deploy/bin/kata-deploy" \ + cp "${rust_builder_out}/kata-deploy/bin/kata-deploy" \ "${build_dir}/kata-deploy-binary/usr/bin/kata-deploy" tar --zstd -cf "${build_dir}/kata-deploy-static-kata-deploy-binary.tar.zst" \ -C "${build_dir}/kata-deploy-binary" . } +build_kata_deploy_job_dispatcher() { + build_rust_binaries + + mkdir -p "${build_dir}/kata-deploy-job-dispatcher/usr/bin" + cp "${rust_builder_out}/kata-deploy/bin/kata-deploy-job-dispatcher" \ + "${build_dir}/kata-deploy-job-dispatcher/usr/bin/kata-deploy-job-dispatcher" + tar --zstd -cf "${build_dir}/kata-deploy-static-kata-deploy-job-dispatcher.tar.zst" \ + -C "${build_dir}/kata-deploy-job-dispatcher" . +} + build_nydus_snapshotter_for_coco_guest_pull() { docker buildx build \ --target nydus-binary-downloader \ @@ -70,13 +98,15 @@ build_nydus_snapshotter_for_coco_guest_pull() { case "${component}" in kata-deploy-binary) build_kata_deploy_binary ;; + kata-deploy-job-dispatcher) build_kata_deploy_job_dispatcher ;; nydus-snapshotter-for-coco-guest-pull) build_nydus_snapshotter_for_coco_guest_pull ;; all) build_kata_deploy_binary + build_kata_deploy_job_dispatcher build_nydus_snapshotter_for_coco_guest_pull ;; *) - echo "Unknown component '${component}'. Expected: kata-deploy-binary, nydus-snapshotter-for-coco-guest-pull, all" >&2 + echo "Unknown component '${component}'. 
Expected: kata-deploy-binary, kata-deploy-job-dispatcher, nydus-snapshotter-for-coco-guest-pull, all" >&2 exit 1 ;; esac diff --git a/tools/packaging/release/release.sh b/tools/packaging/release/release.sh index a66efed75f..d717977747 100755 --- a/tools/packaging/release/release.sh +++ b/tools/packaging/release/release.sh @@ -19,6 +19,11 @@ KATA_DEPLOY_IMAGE_TAGS="${KATA_DEPLOY_IMAGE_TAGS:-}" IFS=' ' read -r -a IMAGE_TAGS <<< "${KATA_DEPLOY_IMAGE_TAGS}" KATA_DEPLOY_REGISTRIES="${KATA_DEPLOY_REGISTRIES:-}" IFS=' ' read -r -a REGISTRIES <<< "${KATA_DEPLOY_REGISTRIES}" +# Registries for the separate job-mode dispatcher image. When unset, derived +# from KATA_DEPLOY_REGISTRIES by inserting "-job-dispatcher" before any "-ci" +# suffix on each entry (so the "-ci" stays last). +KATA_DEPLOY_JOB_DISPATCHER_REGISTRIES="${KATA_DEPLOY_JOB_DISPATCHER_REGISTRIES:-}" +IFS=' ' read -r -a JOB_DISPATCHER_REGISTRIES <<< "${KATA_DEPLOY_JOB_DISPATCHER_REGISTRIES}" GH_TOKEN="${GH_TOKEN:-}" ARCHITECTURE="${ARCHITECTURE:-}" KATA_STATIC_TARBALL="${KATA_STATIC_TARBALL:-}" @@ -146,11 +151,28 @@ function _publish_multiarch_manifest() _check_required_env_var "KATA_DEPLOY_IMAGE_TAGS" _check_required_env_var "KATA_DEPLOY_REGISTRIES" + # The dispatcher is shipped as a separate, minimal image alongside kata-deploy + # with the same tags. 
When no dedicated registries are given, derive them from + # each kata-deploy registry by inserting "-job-dispatcher" before any "-ci" + # suffix, so the "-ci" stays last: + # .../kata-deploy -> .../kata-deploy-job-dispatcher + # .../kata-deploy-ci -> .../kata-deploy-job-dispatcher-ci + if [[ ${#JOB_DISPATCHER_REGISTRIES[@]} -eq 0 ]]; then + JOB_DISPATCHER_REGISTRIES=() + for registry in "${REGISTRIES[@]}"; do + if [[ "${registry}" == *-ci ]]; then + JOB_DISPATCHER_REGISTRIES+=("${registry%-ci}-job-dispatcher-ci") + else + JOB_DISPATCHER_REGISTRIES+=("${registry}-job-dispatcher") + fi + done + fi + # Per-arch images are built without provenance/SBOM so each tag is a single image manifest; # quay.io rejects pushing multi-arch manifest lists that include attestation manifests # ("manifest invalid"), so we do not enable them for this workflow. # imagetools create pushes to --tag by default. - for registry in "${REGISTRIES[@]}"; do + for registry in "${REGISTRIES[@]}" "${JOB_DISPATCHER_REGISTRIES[@]}"; do for tag in "${IMAGE_TAGS[@]}"; do docker buildx imagetools create --tag "${registry}:${tag}" \ "${registry}:${tag}-amd64" \ From 225ff2209ed49353311d35da4b608936b1b96974 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 3 Jun 2026 22:06:16 +0200 Subject: [PATCH 3/9] kata-deploy: split install/cleanup into staged actions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1 of migrating kata-deploy from a DaemonSet to a staged JobSet workflow: refactor the binary's install/cleanup flows into discrete, independently invocable stages while keeping the existing DaemonSet path fully working. 
Add new staged subcommands that each run one step and exit, so a JobSet can drive them as ordered initContainers/Jobs per node: install: host-check -> artifacts -> cri -> label; cleanup (reverse): unlabel -> revert-cri -> remove-artifacts. `install` becomes a compatibility wrapper composing the install stages in the canonical order, so the DaemonSet deployment model is unchanged. The DaemonSet `cleanup` (with its DaemonSet-presence gating) is left intact; the staged cleanup actions are added alongside it and skip that gating since the JobSet workflow only schedules them on a real uninstall. The following stages have an idempotent skip check so reruns are safe: - install label / cleanup unlabel: short-circuit via the node label - cleanup remove-artifacts: skip when the install dir is already gone - cleanup revert-cri: skip the disruptive runtime restart when the CRI drop-ins are already absent (new cri_drop_in_present helper) Introduce a shared KATA_RUNTIME_LABEL constant and add rstest-based tests covering the subcommand-name -> Action mapping, rejection of unknown actions, and the visible/hidden help semantics. Signed-off-by: Fabiano Fidêncio Assisted-by: Cursor --- .../packaging/kata-deploy/binary/src/main.rs | 365 +++++++++++++++++- 1 file changed, 348 insertions(+), 17 deletions(-) diff --git a/tools/packaging/kata-deploy/binary/src/main.rs b/tools/packaging/kata-deploy/binary/src/main.rs index 6691fed0e9..d015fa05e1 100644 --- a/tools/packaging/kata-deploy/binary/src/main.rs +++ b/tools/packaging/kata-deploy/binary/src/main.rs @@ -56,6 +56,39 @@ enum Action { Install, Cleanup, Reset, + /// Stage 0 of a staged (JobSet) install: validate host/node prerequisites + /// without mutating the host. Fails fast with actionable diagnostics when + /// the node cannot support installation. + #[clap(name = "install-stage-host-check")] + InstallStageHostCheck, + /// Stage 1 of a staged (JobSet) install: install kata artifacts/config on + /// the host and set up configured snapshotters.
Does not touch CRI + /// configuration, but is still privileged (host writes + snapshotter setup + /// shell into the host via nsenter). + #[clap(name = "install-stage-artifacts")] + InstallStageArtifacts, + /// Stage 2 of a staged (JobSet) install: write CRI drop-ins, restart the + /// runtime, and wait for node readiness. Privileged + short-lived. + #[clap(name = "install-stage-cri")] + InstallStageCri, + /// Stage 3 of a staged (JobSet) install: apply the kata-runtime node label. + /// Unprivileged, Kubernetes API only. + #[clap(name = "install-stage-label")] + InstallStageLabel, + /// Cleanup stage 1 of a staged (JobSet) uninstall: remove the kata-runtime + /// node label first so the scheduler stops placing kata workloads here. + /// Unprivileged, Kubernetes API only. + #[clap(name = "cleanup-stage-unlabel")] + CleanupStageUnlabel, + /// Cleanup stage 2 of a staged (JobSet) uninstall: remove CRI drop-ins, + /// restart the runtime, and wait for readiness. Privileged + short-lived. + #[clap(name = "cleanup-stage-revert-cri")] + CleanupStageRevertCri, + /// Cleanup stage 3 of a staged (JobSet) uninstall: remove kata + /// artifacts/config/symlinks from the host. Privileged (mutates the host + /// filesystem under the install dir). + #[clap(name = "cleanup-stage-remove-artifacts")] + CleanupStageRemoveArtifacts, /// Internal: entered via re-exec after install completes. Holds the /// DaemonSet pod alive waiting for SIGTERM, then runs cleanup. Hidden /// from `--help`; users should never invoke this directly. @@ -63,6 +96,10 @@ enum Action { InternalPostInstallWait, } +/// Node label applied to mark a node as kata-capable. Shared across the +/// install/cleanup label stages so the key stays consistent. +const KATA_RUNTIME_LABEL: &str = "katacontainers.io/kata-runtime"; + // Cap the tokio runtime to a small fixed number of worker threads. 
The default // multi-thread runtime allocates `num_cpus()` workers (each with a ~2 MiB // stack), which on a 200+ vCPU GPU node is the dominant contributor to the @@ -107,6 +144,13 @@ async fn main() -> Result<()> { Action::Install => "install", Action::Cleanup => "cleanup", Action::Reset => "reset", + Action::InstallStageHostCheck => "install-stage-host-check", + Action::InstallStageArtifacts => "install-stage-artifacts", + Action::InstallStageCri => "install-stage-cri", + Action::InstallStageLabel => "install-stage-label", + Action::CleanupStageUnlabel => "cleanup-stage-unlabel", + Action::CleanupStageRevertCri => "cleanup-stage-revert-cri", + Action::CleanupStageRemoveArtifacts => "cleanup-stage-remove-artifacts", Action::InternalPostInstallWait => "internal-post-install-wait", }; config.print_info(action_str); @@ -245,6 +289,42 @@ async fn main() -> Result<()> { // Exit after completion so the job can complete info!("Reset completed, exiting"); } + // Staged (JobSet) install actions. Each runs one step of the install + // pipeline as a short-lived Job/initContainer and exits. The DaemonSet + // path does not use these directly; it goes through `install` above, + // which composes the same stage functions. + Action::InstallStageHostCheck => { + install_stage_host_check(&config, &runtime).await?; + info!("Install host-check stage completed, exiting"); + } + Action::InstallStageArtifacts => { + install_stage_artifacts(&config, &runtime).await?; + info!("Install artifacts stage completed, exiting"); + } + Action::InstallStageCri => { + install_stage_cri(&config, &runtime).await?; + info!("Install CRI stage completed, exiting"); + } + Action::InstallStageLabel => { + install_stage_label(&config).await?; + info!("Install label stage completed, exiting"); + } + // Staged (JobSet) cleanup actions. 
These run in reverse order + // (unlabel -> revert-cri -> remove-artifacts) and, unlike the DaemonSet + // `cleanup` above, do not perform DaemonSet-presence gating: the JobSet + // workflow only schedules these when an uninstall is actually intended. + Action::CleanupStageUnlabel => { + cleanup_stage_unlabel(&config).await?; + info!("Cleanup unlabel stage completed, exiting"); + } + Action::CleanupStageRevertCri => { + cleanup_stage_revert_cri(&config, &runtime).await?; + info!("Cleanup revert-cri stage completed, exiting"); + } + Action::CleanupStageRemoveArtifacts => { + cleanup_stage_remove_artifacts(&config).await?; + info!("Cleanup remove-artifacts stage completed, exiting"); + } } Ok(()) @@ -273,20 +353,39 @@ fn reexec_into_post_install_wait( )) } +/// Full install pipeline. Used by the DaemonSet deployment model. Composes the +/// same per-stage functions the staged JobSet workflow invokes individually, in +/// the canonical order: host-check -> artifacts -> cri -> label. async fn install(config: &config::Config, runtime: &str) -> Result<()> { info!("Installing Kata Containers"); - const SUPPORTED_RUNTIMES: &[&str] = &[ - "crio", - "containerd", - "k3s", - "k3s-agent", - "rke2-agent", - "rke2-server", - "k0s-worker", - "k0s-controller", - "microk8s", - ]; + install_stage_host_check(config, runtime).await?; + install_stage_artifacts(config, runtime).await?; + install_stage_cri(config, runtime).await?; + install_stage_label(config).await?; + + info!("Kata Containers installation completed successfully"); + Ok(()) +} + +const SUPPORTED_RUNTIMES: &[&str] = &[ + "crio", + "containerd", + "k3s", + "k3s-agent", + "rke2-agent", + "rke2-server", + "k0s-worker", + "k0s-controller", + "microk8s", +]; + +/// Install stage 0 (host-check): validate that this node can support a Kata +/// installation before any host mutation happens. 
This is read-only and safe +/// to run repeatedly; it fails fast with actionable diagnostics so a staged +/// JobSet can abort the per-node pipeline before the privileged stages run. +async fn install_stage_host_check(config: &config::Config, runtime: &str) -> Result<()> { + info!("install (host-check): validating node prerequisites for runtime {runtime}"); if !SUPPORTED_RUNTIMES.contains(&runtime) { return Err(anyhow::anyhow!( @@ -345,16 +444,44 @@ async fn install(config: &config::Config, runtime: &str) -> Result<()> { } } - runtime::containerd::setup_containerd_config_files(runtime, config).await?; + info!("install (host-check): node prerequisites satisfied"); + Ok(()) +} + +/// Install stage 1 (artifacts): place kata artifacts/config on the host and set +/// up any configured snapshotters. This does not touch CRI configuration, but it +/// still needs privileged host access: writing under the host install dir and +/// the snapshotter setup (e.g. nydus) shell into the host via nsenter. +async fn install_stage_artifacts(config: &config::Config, runtime: &str) -> Result<()> { + info!("install (artifacts): installing kata artifacts on host"); artifacts::install_artifacts(config, runtime).await?; + if runtime != "crio" { + if let Some(snapshotters) = config.experimental_setup_snapshotter.as_ref() { + for snapshotter in snapshotters { + artifacts::snapshotters::install_snapshotter(snapshotter, config).await?; + } + } + } + + info!("install (artifacts): artifacts installed"); + Ok(()) +} + +/// Install stage 2 (cri): write CRI drop-ins, configure snapshotters, restart +/// the runtime, and wait for the node to become ready. This is the privileged, +/// node-disrupting stage and is kept short-lived. 
+async fn install_stage_cri(config: &config::Config, runtime: &str) -> Result<()> { + info!("install (cri): configuring CRI runtime"); + + runtime::containerd::setup_containerd_config_files(runtime, config).await?; + runtime::configure_cri_runtime(config, runtime).await?; if runtime != "crio" { if let Some(snapshotters) = config.experimental_setup_snapshotter.as_ref() { for snapshotter in snapshotters { - artifacts::snapshotters::install_snapshotter(snapshotter, config).await?; artifacts::snapshotters::configure_snapshotter(snapshotter, runtime, config) .await?; } @@ -365,9 +492,29 @@ async fn install(config: &config::Config, runtime: &str) -> Result<()> { runtime::lifecycle::restart_runtime(config, runtime).await?; info!("Runtime restart completed successfully"); - label_node_with_retry(config, "katacontainers.io/kata-runtime", "true").await?; + Ok(()) +} + +/// Install stage 3 (label): apply the kata-runtime node label. Unprivileged, +/// Kubernetes API only. Skips re-applying when the label is already correct. +async fn install_stage_label(config: &config::Config) -> Result<()> { + info!("install (label): applying node label"); + + match k8s::get_node_label(config, KATA_RUNTIME_LABEL).await { + Ok(Some(ref val)) if val == "true" => { + info!( + "install (label): node already labeled {}=true, skipping", + KATA_RUNTIME_LABEL + ); + return Ok(()); + } + // Any other state (absent, different value, or a transient read error) + // falls through to label_node_with_retry, which applies and verifies. 
+ _ => {} + } + + label_node_with_retry(config, KATA_RUNTIME_LABEL, "true").await?; - info!("Kata Containers installation completed successfully"); Ok(()) } @@ -539,7 +686,7 @@ async fn cleanup(config: &config::Config, runtime: &str) -> Result<()> { info!("No other kata-deploy DaemonSets found, performing full shared cleanup"); info!("Removing kata-runtime label from node"); - k8s::label_node(config, "katacontainers.io/kata-runtime", None, false).await?; + k8s::label_node(config, KATA_RUNTIME_LABEL, None, false).await?; info!("Successfully removed kata-runtime label"); // Restart the CRI runtime last. On k3s/rke2 this restarts the entire @@ -553,10 +700,111 @@ async fn cleanup(config: &config::Config, runtime: &str) -> Result<()> { Ok(()) } +/// Cleanup stage 1 (unlabel): remove the kata-runtime node label first so the +/// scheduler stops placing kata workloads on this node before any host +/// mutation. Unprivileged, Kubernetes API only. Skips when already absent. +async fn cleanup_stage_unlabel(config: &config::Config) -> Result<()> { + info!("cleanup (unlabel): removing node label"); + + // If the label is already absent, there is nothing to do. Any other state + // (present, or unknown due to a transient read error) falls through to the + // removal below. + if let Ok(None) = k8s::get_node_label(config, KATA_RUNTIME_LABEL).await { + info!( + "cleanup (unlabel): label {} already absent, skipping", + KATA_RUNTIME_LABEL + ); + return Ok(()); + } + + k8s::label_node(config, KATA_RUNTIME_LABEL, None, false).await?; + info!("cleanup (unlabel): label removed"); + Ok(()) +} + +/// Cleanup stage 2 (revert-cri): remove CRI drop-ins (and any snapshotter +/// config), then restart the runtime and wait for readiness. This is the +/// privileged, node-disrupting cleanup stage and is kept short-lived. Skips +/// entirely when the CRI drop-ins are already absent, avoiding an unnecessary +/// runtime restart. 
+async fn cleanup_stage_revert_cri(config: &config::Config, runtime: &str) -> Result<()> { + info!("cleanup (revert-cri): reverting CRI configuration"); + + if !cri_drop_in_present(config, runtime).await { + info!("cleanup (revert-cri): CRI drop-ins already absent, skipping"); + return Ok(()); + } + + if runtime != "crio" { + if let Some(snapshotters) = config.experimental_setup_snapshotter.as_ref() { + for snapshotter in snapshotters { + info!("cleanup (revert-cri): uninstalling snapshotter {snapshotter}"); + artifacts::snapshotters::uninstall_snapshotter(snapshotter, config).await?; + } + } + } + + runtime::cleanup_cri_runtime_config(config, runtime).await?; + + info!("cleanup (revert-cri): restarting runtime"); + runtime::restart_and_wait_for_ready(config, runtime).await?; + info!("cleanup (revert-cri): runtime restarted"); + + Ok(()) +} + +/// Cleanup stage 3 (remove-artifacts): delete kata artifacts/config/symlinks +/// from the host. Skips when the install directory is already gone. +async fn cleanup_stage_remove_artifacts(config: &config::Config) -> Result<()> { + info!("cleanup (remove-artifacts): removing kata artifacts from host"); + + if !std::path::Path::new(&config.host_install_dir).exists() { + info!( + "cleanup (remove-artifacts): install dir {} already absent, skipping", + config.host_install_dir + ); + return Ok(()); + } + + artifacts::remove_artifacts(config).await?; + info!("cleanup (remove-artifacts): artifacts removed"); + Ok(()) +} + +/// Best-effort check for whether kata's CRI drop-in configuration is present on +/// the host for this runtime. Used by the staged cleanup to skip a disruptive +/// runtime restart when there is nothing to revert. On any uncertainty (e.g. +/// the containerd paths cannot be resolved) this returns `true` so the caller +/// errs on the side of running the revert rather than incorrectly skipping it. 
+async fn cri_drop_in_present(config: &config::Config, runtime: &str) -> bool { + if runtime == "crio" { + return std::path::Path::new(&config.crio_drop_in_conf_file).exists(); + } + + match config.get_containerd_paths(runtime).await { + Ok(paths) => { + // /etc/containerd is mounted directly; other paths live under /host. + let resolved = if paths.drop_in_file.starts_with("/etc/containerd/") { + std::path::PathBuf::from(&paths.drop_in_file) + } else { + std::path::Path::new("/host").join(paths.drop_in_file.trim_start_matches('/')) + }; + resolved.exists() + } + Err(e) => { + log::warn!( + "cleanup (revert-cri): could not resolve containerd paths to check drop-in \ + presence ({e}); proceeding with revert" + ); + true + } + } +} + async fn reset(config: &config::Config, runtime: &str) -> Result<()> { info!("Resetting Kata Containers"); - k8s::label_node(config, "katacontainers.io/kata-runtime", None, false).await?; + k8s::label_node(config, KATA_RUNTIME_LABEL, None, false).await?; runtime::lifecycle::restart_cri_runtime(config, runtime).await?; if matches!(runtime, "crio" | "containerd") { utils::host_systemctl(&["restart", "kubelet"])?; @@ -566,3 +814,86 @@ async fn reset(config: &config::Config, runtime: &str) -> Result<()> { info!("Kata Containers reset completed successfully"); Ok(()) } + +#[cfg(test)] +mod tests { + //! Tests for CLI action wiring. The staged install/cleanup actions are the + //! entrypoints the JobSet workflow invokes per node, so we lock in their + //! exact subcommand names (a rename would silently break the chart) and the + //! mapping into the `Action` enum. + + use super::*; + use clap::ValueEnum; + use rstest::rstest; + + /// Every staged subcommand name parses into the expected `Action` variant. + /// Keep this in sync with the `#[clap(name = ...)]` attributes above. 
+ #[rstest] + #[case("install", Action::Install)] + #[case("cleanup", Action::Cleanup)] + #[case("reset", Action::Reset)] + #[case("install-stage-host-check", Action::InstallStageHostCheck)] + #[case("install-stage-artifacts", Action::InstallStageArtifacts)] + #[case("install-stage-cri", Action::InstallStageCri)] + #[case("install-stage-label", Action::InstallStageLabel)] + #[case("cleanup-stage-unlabel", Action::CleanupStageUnlabel)] + #[case("cleanup-stage-revert-cri", Action::CleanupStageRevertCri)] + #[case("cleanup-stage-remove-artifacts", Action::CleanupStageRemoveArtifacts)] + #[case("internal-post-install-wait", Action::InternalPostInstallWait)] + fn test_action_parses_from_arg(#[case] arg: &str, #[case] expected: Action) { + let args = Args::try_parse_from(["kata-deploy", arg]) + .unwrap_or_else(|e| panic!("failed to parse action {arg:?}: {e}")); + assert_eq!( + std::mem::discriminant(&args.action), + std::mem::discriminant(&expected), + "arg {arg:?} parsed into the wrong Action variant", + ); + } + + /// Unknown actions must be rejected rather than silently accepted. + #[rstest] + #[case("install-stage")] + #[case("cleanup-stage")] + #[case("install-stage-foo")] + #[case("bogus")] + fn test_unknown_action_is_rejected(#[case] arg: &str) { + assert!( + Args::try_parse_from(["kata-deploy", arg]).is_err(), + "expected action {arg:?} to be rejected", + ); + } + + /// The hidden internal waiter must stay hidden from `--help` so users never + /// invoke it directly, while still being parseable (asserted above). + #[test] + fn test_internal_action_is_hidden() { + let internal = Action::InternalPostInstallWait + .to_possible_value() + .expect("internal action should have a possible value"); + assert!( + internal.is_hide_set(), + "internal-post-install-wait should be hidden from --help", + ); + } + + /// All non-internal staged actions remain visible in `--help` so operators + /// can discover and run individual stages. 
+ #[rstest] + #[case(Action::InstallStageHostCheck)] + #[case(Action::InstallStageArtifacts)] + #[case(Action::InstallStageCri)] + #[case(Action::InstallStageLabel)] + #[case(Action::CleanupStageUnlabel)] + #[case(Action::CleanupStageRevertCri)] + #[case(Action::CleanupStageRemoveArtifacts)] + fn test_staged_actions_are_visible(#[case] action: Action) { + let value = action + .to_possible_value() + .expect("staged action should have a possible value"); + assert!( + !value.is_hide_set(), + "staged action {:?} should be visible in --help", + value.get_name(), + ); + } +} From 28fce44b702d04a334937126eb58fcd89085ff19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 3 Jun 2026 22:06:26 +0200 Subject: [PATCH 4/9] kata-deploy: extract shared pod env/volumes into helm helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pull the kata-deploy container's environment block and host volume/volumeMount definitions out of the DaemonSet template into reusable named templates in _helpers.tpl: - kata-deploy.commonEnv - kata-deploy.commonVolumeMounts - kata-deploy.commonVolumes These are derived purely from chart values and are independent of the deployment model, so they can be shared verbatim by upcoming per-node install/cleanup Jobs without duplicating the (large) env wiring. Pure refactor: the rendered DaemonSet is byte-for-byte identical to before (verified via normalized `helm template` diff across default and multiInstallSuffix/userDropIn/customRuntimes permutations). 
Signed-off-by: Fabiano Fidêncio Assisted-by: Cursor --- .../kata-deploy/templates/_helpers.tpl | 240 ++++++++++++++++++ .../kata-deploy/templates/kata-deploy.yaml | 215 +--------------- 2 files changed, 243 insertions(+), 212 deletions(-) diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl index cd885522e0..dae7c0ca32 100644 --- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl +++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl @@ -409,6 +409,246 @@ Get debug value from structured config {{- end -}} {{- end -}} +{{/* +Common environment variables for any pod that runs the kata-deploy binary +(DaemonSet, staged JobSet install/cleanup Jobs, reconcile-created Jobs). + +These are all derived from chart values and are independent of the deployment +model, so they are shared verbatim. HEALTH_PORT and the health probes are NOT +included here: they only matter for the long-running install pod (DaemonSet), +not the short-lived staged Jobs. + +Emitted at column 0; callers must indent with `nindent` to the right depth, +e.g. `{{- include "kata-deploy.commonEnv" . | nindent 8 }}`. +*/}} +{{- define "kata-deploy.commonEnv" -}} +- name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName +{{- if .Values.env.multiInstallSuffix }} +- name: DAEMONSET_NAME + value: {{ printf "%s-%s" .Chart.Name .Values.env.multiInstallSuffix | quote }} +{{- else }} +- name: DAEMONSET_NAME + value: {{ .Chart.Name | quote }} +{{- end }} +- name: DEBUG + value: {{ include "kata-deploy.getDebug" . | quote }} +{{- $shimsAmd64 := include "kata-deploy.getEnabledShimsForArch" (dict "root" . "arch" "amd64") | trim -}} +{{- if $shimsAmd64 }} +- name: SHIMS_X86_64 + value: {{ $shimsAmd64 | quote }} +{{- end }} +{{- $shimsArm64 := include "kata-deploy.getEnabledShimsForArch" (dict "root" . 
"arch" "arm64") | trim -}} +{{- if $shimsArm64 }} +- name: SHIMS_AARCH64 + value: {{ $shimsArm64 | quote }} +{{- end }} +{{- $shimsS390x := include "kata-deploy.getEnabledShimsForArch" (dict "root" . "arch" "s390x") | trim -}} +{{- if $shimsS390x }} +- name: SHIMS_S390X + value: {{ $shimsS390x | quote }} +{{- end }} +{{- $shimsPpc64le := include "kata-deploy.getEnabledShimsForArch" (dict "root" . "arch" "ppc64le") | trim -}} +{{- if $shimsPpc64le }} +- name: SHIMS_PPC64LE + value: {{ $shimsPpc64le | quote }} +{{- end }} +{{- $defaultShimAmd64 := include "kata-deploy.getDefaultShimForArch" (dict "root" . "arch" "amd64") | trim -}} +{{- if $defaultShimAmd64 }} +- name: DEFAULT_SHIM_X86_64 + value: {{ $defaultShimAmd64 | quote }} +{{- end }} +{{- $defaultShimArm64 := include "kata-deploy.getDefaultShimForArch" (dict "root" . "arch" "arm64") | trim -}} +{{- if $defaultShimArm64 }} +- name: DEFAULT_SHIM_AARCH64 + value: {{ $defaultShimArm64 | quote }} +{{- end }} +{{- $defaultShimS390x := include "kata-deploy.getDefaultShimForArch" (dict "root" . "arch" "s390x") | trim -}} +{{- if $defaultShimS390x }} +- name: DEFAULT_SHIM_S390X + value: {{ $defaultShimS390x | quote }} +{{- end }} +{{- $defaultShimPpc64le := include "kata-deploy.getDefaultShimForArch" (dict "root" . "arch" "ppc64le") | trim -}} +{{- if $defaultShimPpc64le }} +- name: DEFAULT_SHIM_PPC64LE + value: {{ $defaultShimPpc64le | quote }} +{{- end }} +{{- $allowedHypervisorAnnotationsAmd64 := include "kata-deploy.getAllowedHypervisorAnnotationsForArch" (dict "root" . "arch" "amd64") | trim -}} +{{- if $allowedHypervisorAnnotationsAmd64 }} +- name: ALLOWED_HYPERVISOR_ANNOTATIONS_X86_64 + value: {{ $allowedHypervisorAnnotationsAmd64 | quote }} +{{- end }} +{{- $allowedHypervisorAnnotationsArm64 := include "kata-deploy.getAllowedHypervisorAnnotationsForArch" (dict "root" . 
"arch" "arm64") | trim -}} +{{- if $allowedHypervisorAnnotationsArm64 }} +- name: ALLOWED_HYPERVISOR_ANNOTATIONS_AARCH64 + value: {{ $allowedHypervisorAnnotationsArm64 | quote }} +{{- end }} +{{- $allowedHypervisorAnnotationsS390x := include "kata-deploy.getAllowedHypervisorAnnotationsForArch" (dict "root" . "arch" "s390x") | trim -}} +{{- if $allowedHypervisorAnnotationsS390x }} +- name: ALLOWED_HYPERVISOR_ANNOTATIONS_S390X + value: {{ $allowedHypervisorAnnotationsS390x | quote }} +{{- end }} +{{- $allowedHypervisorAnnotationsPpc64le := include "kata-deploy.getAllowedHypervisorAnnotationsForArch" (dict "root" . "arch" "ppc64le") | trim -}} +{{- if $allowedHypervisorAnnotationsPpc64le }} +- name: ALLOWED_HYPERVISOR_ANNOTATIONS_PPC64LE + value: {{ $allowedHypervisorAnnotationsPpc64le | quote }} +{{- end }} +{{- $snapshotterHandlerMappingAmd64 := include "kata-deploy.getSnapshotterHandlerMappingForArch" (dict "root" . "arch" "amd64") | trim -}} +{{- if $snapshotterHandlerMappingAmd64 }} +- name: SNAPSHOTTER_HANDLER_MAPPING_X86_64 + value: {{ $snapshotterHandlerMappingAmd64 | quote }} +{{- end }} +{{- $snapshotterHandlerMappingArm64 := include "kata-deploy.getSnapshotterHandlerMappingForArch" (dict "root" . "arch" "arm64") | trim -}} +{{- if $snapshotterHandlerMappingArm64 }} +- name: SNAPSHOTTER_HANDLER_MAPPING_AARCH64 + value: {{ $snapshotterHandlerMappingArm64 | quote }} +{{- end }} +{{- $snapshotterHandlerMappingS390x := include "kata-deploy.getSnapshotterHandlerMappingForArch" (dict "root" . "arch" "s390x") | trim -}} +{{- if $snapshotterHandlerMappingS390x }} +- name: SNAPSHOTTER_HANDLER_MAPPING_S390X + value: {{ $snapshotterHandlerMappingS390x | quote }} +{{- end }} +{{- $snapshotterHandlerMappingPpc64le := include "kata-deploy.getSnapshotterHandlerMappingForArch" (dict "root" . 
"arch" "ppc64le") | trim -}} +{{- if $snapshotterHandlerMappingPpc64le }} +- name: SNAPSHOTTER_HANDLER_MAPPING_PPC64LE + value: {{ $snapshotterHandlerMappingPpc64le | quote }} +{{- end }} +{{- $agentHttpsProxy := include "kata-deploy.getAgentHttpsProxy" . | trim -}} +{{- if $agentHttpsProxy }} +- name: AGENT_HTTPS_PROXY + value: {{ $agentHttpsProxy | quote }} +{{- end }} +{{- $agentNoProxy := include "kata-deploy.getAgentNoProxy" . | trim -}} +{{- if $agentNoProxy }} +- name: AGENT_NO_PROXY + value: {{ $agentNoProxy | quote }} +{{- end }} +{{- $pullTypeMappingAmd64 := include "kata-deploy.getPullTypeMappingForArch" (dict "root" . "arch" "amd64") | trim -}} +{{- if $pullTypeMappingAmd64 }} +- name: PULL_TYPE_MAPPING_X86_64 + value: {{ $pullTypeMappingAmd64 | quote }} +{{- end }} +{{- $pullTypeMappingArm64 := include "kata-deploy.getPullTypeMappingForArch" (dict "root" . "arch" "arm64") | trim -}} +{{- if $pullTypeMappingArm64 }} +- name: PULL_TYPE_MAPPING_AARCH64 + value: {{ $pullTypeMappingArm64 | quote }} +{{- end }} +{{- $pullTypeMappingS390x := include "kata-deploy.getPullTypeMappingForArch" (dict "root" . "arch" "s390x") | trim -}} +{{- if $pullTypeMappingS390x }} +- name: PULL_TYPE_MAPPING_S390X + value: {{ $pullTypeMappingS390x | quote }} +{{- end }} +{{- $pullTypeMappingPpc64le := include "kata-deploy.getPullTypeMappingForArch" (dict "root" . "arch" "ppc64le") | trim -}} +{{- if $pullTypeMappingPpc64le }} +- name: PULL_TYPE_MAPPING_PPC64LE + value: {{ $pullTypeMappingPpc64le | quote }} +{{- end }} +- name: INSTALLATION_PREFIX + value: {{ .Values.env.installationPrefix | quote }} +- name: MULTI_INSTALL_SUFFIX + value: {{ .Values.env.multiInstallSuffix | quote }} +{{- $snapshotterSetup := include "kata-deploy.getSnapshotterSetup" . | trim -}} +{{- if $snapshotterSetup }} +- name: EXPERIMENTAL_SETUP_SNAPSHOTTER + value: {{ $snapshotterSetup | quote }} +{{- end }} +{{- $forceGuestPullAmd64 := include "kata-deploy.getForceGuestPullForArch" (dict "root" . 
"arch" "amd64") | trim -}} +{{- if $forceGuestPullAmd64 }} +- name: EXPERIMENTAL_FORCE_GUEST_PULL_X86_64 + value: {{ $forceGuestPullAmd64 | quote }} +{{- end }} +{{- $forceGuestPullArm64 := include "kata-deploy.getForceGuestPullForArch" (dict "root" . "arch" "arm64") | trim -}} +{{- if $forceGuestPullArm64 }} +- name: EXPERIMENTAL_FORCE_GUEST_PULL_AARCH64 + value: {{ $forceGuestPullArm64 | quote }} +{{- end }} +{{- $forceGuestPullS390x := include "kata-deploy.getForceGuestPullForArch" (dict "root" . "arch" "s390x") | trim -}} +{{- if $forceGuestPullS390x }} +- name: EXPERIMENTAL_FORCE_GUEST_PULL_S390X + value: {{ $forceGuestPullS390x | quote }} +{{- end }} +{{- $forceGuestPullPpc64le := include "kata-deploy.getForceGuestPullForArch" (dict "root" . "arch" "ppc64le") | trim -}} +{{- if $forceGuestPullPpc64le }} +- name: EXPERIMENTAL_FORCE_GUEST_PULL_PPC64LE + value: {{ $forceGuestPullPpc64le | quote }} +{{- end }} +{{- if .Values.containerd.configFileName | trim }} +- name: CONTAINERD_CONFIG_FILE_NAME + value: {{ .Values.containerd.configFileName | trim | quote }} +{{- end }} +{{- if .Values.containerd.userDropIn | trim }} +- name: CONTAINERD_USER_DROP_IN_SOURCE_FILE + value: "/custom-containerd-config/containerd-user-dropin.toml" +{{- end }} +{{- with .Values.env.hostOS }} +- name: HOST_OS + value: {{ . | quote }} +{{- end }} +{{- if and .Values.customRuntimes.enabled .Values.customRuntimes.runtimes }} +- name: CUSTOM_RUNTIMES_ENABLED + value: "true" +{{- end }} +{{- end -}} + +{{/* +Common volumeMounts for any pod that runs the kata-deploy binary against the +host. Emitted at column 0; indent with `nindent` at the call site. 
+*/}} +{{- define "kata-deploy.commonVolumeMounts" -}} +- name: crio-conf + mountPath: /etc/crio/ +- name: containerd-conf + mountPath: /etc/containerd/ +- name: host + mountPath: /host/ +{{- if .Values.containerd.userDropIn | trim }} +- name: custom-containerd-config + mountPath: /custom-containerd-config/ + readOnly: true +{{- end }} +{{- if or (and .Values.customRuntimes.enabled .Values.customRuntimes.runtimes) (eq (include "kata-deploy.hasDefaultRuntimeDropIns" . | trim) "true") }} +- name: custom-configs + mountPath: /custom-configs/ + readOnly: true +{{- end }} +{{- end -}} + +{{/* +Common host/configMap volumes backing the mounts above. Emitted at column 0; +indent with `nindent` at the call site. +*/}} +{{- define "kata-deploy.commonVolumes" -}} +- name: crio-conf + hostPath: + path: /etc/crio/ +- name: containerd-conf + hostPath: + path: '{{- template "containerdConfPath" .Values }}' +- name: host + hostPath: + path: / +{{- if .Values.containerd.userDropIn | trim }} +- name: custom-containerd-config + configMap: +{{- if .Values.env.multiInstallSuffix }} + name: {{ .Chart.Name }}-containerd-user-dropin-{{ .Values.env.multiInstallSuffix }} +{{- else }} + name: {{ .Chart.Name }}-containerd-user-dropin +{{- end }} +{{- end }} +{{- if or (and .Values.customRuntimes.enabled .Values.customRuntimes.runtimes) (eq (include "kata-deploy.hasDefaultRuntimeDropIns" . 
| trim) "true") }} +- name: custom-configs + configMap: +{{- if .Values.env.multiInstallSuffix }} + name: {{ .Chart.Name }}-custom-configs-{{ .Values.env.multiInstallSuffix }} +{{- else }} + name: {{ .Chart.Name }}-custom-configs +{{- end }} +{{- end }} +{{- end -}} + {{/* Get EXPERIMENTAL_FORCE_GUEST_PULL for a specific architecture from structured config Returns comma-separated list of shim names with forceGuestPull enabled diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml index e1a2614a64..ff02c34de6 100644 --- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml +++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml @@ -18,7 +18,6 @@ {{- end -}} {{- end -}} {{- end -}} -{{- $hasCustomConfigs := or (and .Values.customRuntimes.enabled .Values.customRuntimes.runtimes) (eq (include "kata-deploy.hasDefaultRuntimeDropIns" . | trim) "true") -}} apiVersion: apps/v1 kind: DaemonSet metadata: @@ -153,174 +152,7 @@ spec: {{- end }} - install env: - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName -{{- if .Values.env.multiInstallSuffix }} - - name: DAEMONSET_NAME - value: {{ printf "%s-%s" .Chart.Name .Values.env.multiInstallSuffix | quote }} -{{- else }} - - name: DAEMONSET_NAME - value: {{ .Chart.Name | quote }} -{{- end }} - - name: DEBUG - value: {{ include "kata-deploy.getDebug" . | quote }} - {{- $shimsAmd64 := include "kata-deploy.getEnabledShimsForArch" (dict "root" . "arch" "amd64") | trim -}} - {{- if $shimsAmd64 }} - - name: SHIMS_X86_64 - value: {{ $shimsAmd64 | quote }} - {{- end }} - {{- $shimsArm64 := include "kata-deploy.getEnabledShimsForArch" (dict "root" . "arch" "arm64") | trim -}} - {{- if $shimsArm64 }} - - name: SHIMS_AARCH64 - value: {{ $shimsArm64 | quote }} - {{- end }} - {{- $shimsS390x := include "kata-deploy.getEnabledShimsForArch" (dict "root" . 
"arch" "s390x") | trim -}} - {{- if $shimsS390x }} - - name: SHIMS_S390X - value: {{ $shimsS390x | quote }} - {{- end }} - {{- $shimsPpc64le := include "kata-deploy.getEnabledShimsForArch" (dict "root" . "arch" "ppc64le") | trim -}} - {{- if $shimsPpc64le }} - - name: SHIMS_PPC64LE - value: {{ $shimsPpc64le | quote }} - {{- end }} - {{- $defaultShimAmd64 := include "kata-deploy.getDefaultShimForArch" (dict "root" . "arch" "amd64") | trim -}} - {{- if $defaultShimAmd64 }} - - name: DEFAULT_SHIM_X86_64 - value: {{ $defaultShimAmd64 | quote }} - {{- end }} - {{- $defaultShimArm64 := include "kata-deploy.getDefaultShimForArch" (dict "root" . "arch" "arm64") | trim -}} - {{- if $defaultShimArm64 }} - - name: DEFAULT_SHIM_AARCH64 - value: {{ $defaultShimArm64 | quote }} - {{- end }} - {{- $defaultShimS390x := include "kata-deploy.getDefaultShimForArch" (dict "root" . "arch" "s390x") | trim -}} - {{- if $defaultShimS390x }} - - name: DEFAULT_SHIM_S390X - value: {{ $defaultShimS390x | quote }} - {{- end }} - {{- $defaultShimPpc64le := include "kata-deploy.getDefaultShimForArch" (dict "root" . "arch" "ppc64le") | trim -}} - {{- if $defaultShimPpc64le }} - - name: DEFAULT_SHIM_PPC64LE - value: {{ $defaultShimPpc64le | quote }} - {{- end }} - {{- $allowedHypervisorAnnotationsAmd64 := include "kata-deploy.getAllowedHypervisorAnnotationsForArch" (dict "root" . "arch" "amd64") | trim -}} - {{- if $allowedHypervisorAnnotationsAmd64 }} - - name: ALLOWED_HYPERVISOR_ANNOTATIONS_X86_64 - value: {{ $allowedHypervisorAnnotationsAmd64 | quote }} - {{- end }} - {{- $allowedHypervisorAnnotationsArm64 := include "kata-deploy.getAllowedHypervisorAnnotationsForArch" (dict "root" . "arch" "arm64") | trim -}} - {{- if $allowedHypervisorAnnotationsArm64 }} - - name: ALLOWED_HYPERVISOR_ANNOTATIONS_AARCH64 - value: {{ $allowedHypervisorAnnotationsArm64 | quote }} - {{- end }} - {{- $allowedHypervisorAnnotationsS390x := include "kata-deploy.getAllowedHypervisorAnnotationsForArch" (dict "root" . 
"arch" "s390x") | trim -}} - {{- if $allowedHypervisorAnnotationsS390x }} - - name: ALLOWED_HYPERVISOR_ANNOTATIONS_S390X - value: {{ $allowedHypervisorAnnotationsS390x | quote }} - {{- end }} - {{- $allowedHypervisorAnnotationsPpc64le := include "kata-deploy.getAllowedHypervisorAnnotationsForArch" (dict "root" . "arch" "ppc64le") | trim -}} - {{- if $allowedHypervisorAnnotationsPpc64le }} - - name: ALLOWED_HYPERVISOR_ANNOTATIONS_PPC64LE - value: {{ $allowedHypervisorAnnotationsPpc64le | quote }} - {{- end }} - {{- $snapshotterHandlerMappingAmd64 := include "kata-deploy.getSnapshotterHandlerMappingForArch" (dict "root" . "arch" "amd64") | trim -}} - {{- if $snapshotterHandlerMappingAmd64 }} - - name: SNAPSHOTTER_HANDLER_MAPPING_X86_64 - value: {{ $snapshotterHandlerMappingAmd64 | quote }} - {{- end }} - {{- $snapshotterHandlerMappingArm64 := include "kata-deploy.getSnapshotterHandlerMappingForArch" (dict "root" . "arch" "arm64") | trim -}} - {{- if $snapshotterHandlerMappingArm64 }} - - name: SNAPSHOTTER_HANDLER_MAPPING_AARCH64 - value: {{ $snapshotterHandlerMappingArm64 | quote }} - {{- end }} - {{- $snapshotterHandlerMappingS390x := include "kata-deploy.getSnapshotterHandlerMappingForArch" (dict "root" . "arch" "s390x") | trim -}} - {{- if $snapshotterHandlerMappingS390x }} - - name: SNAPSHOTTER_HANDLER_MAPPING_S390X - value: {{ $snapshotterHandlerMappingS390x | quote }} - {{- end }} - {{- $snapshotterHandlerMappingPpc64le := include "kata-deploy.getSnapshotterHandlerMappingForArch" (dict "root" . "arch" "ppc64le") | trim -}} - {{- if $snapshotterHandlerMappingPpc64le }} - - name: SNAPSHOTTER_HANDLER_MAPPING_PPC64LE - value: {{ $snapshotterHandlerMappingPpc64le | quote }} - {{- end }} - {{- $agentHttpsProxy := include "kata-deploy.getAgentHttpsProxy" . | trim -}} - {{- if $agentHttpsProxy }} - - name: AGENT_HTTPS_PROXY - value: {{ $agentHttpsProxy | quote }} - {{- end }} - {{- $agentNoProxy := include "kata-deploy.getAgentNoProxy" . 
| trim -}} - {{- if $agentNoProxy }} - - name: AGENT_NO_PROXY - value: {{ $agentNoProxy | quote }} - {{- end }} - {{- $pullTypeMappingAmd64 := include "kata-deploy.getPullTypeMappingForArch" (dict "root" . "arch" "amd64") | trim -}} - {{- if $pullTypeMappingAmd64 }} - - name: PULL_TYPE_MAPPING_X86_64 - value: {{ $pullTypeMappingAmd64 | quote }} - {{- end }} - {{- $pullTypeMappingArm64 := include "kata-deploy.getPullTypeMappingForArch" (dict "root" . "arch" "arm64") | trim -}} - {{- if $pullTypeMappingArm64 }} - - name: PULL_TYPE_MAPPING_AARCH64 - value: {{ $pullTypeMappingArm64 | quote }} - {{- end }} - {{- $pullTypeMappingS390x := include "kata-deploy.getPullTypeMappingForArch" (dict "root" . "arch" "s390x") | trim -}} - {{- if $pullTypeMappingS390x }} - - name: PULL_TYPE_MAPPING_S390X - value: {{ $pullTypeMappingS390x | quote }} - {{- end }} - {{- $pullTypeMappingPpc64le := include "kata-deploy.getPullTypeMappingForArch" (dict "root" . "arch" "ppc64le") | trim -}} - {{- if $pullTypeMappingPpc64le }} - - name: PULL_TYPE_MAPPING_PPC64LE - value: {{ $pullTypeMappingPpc64le | quote }} - {{- end }} - - name: INSTALLATION_PREFIX - value: {{ .Values.env.installationPrefix | quote }} - - name: MULTI_INSTALL_SUFFIX - value: {{ .Values.env.multiInstallSuffix | quote }} - {{- $snapshotterSetup := include "kata-deploy.getSnapshotterSetup" . | trim -}} - {{- if $snapshotterSetup }} - - name: EXPERIMENTAL_SETUP_SNAPSHOTTER - value: {{ $snapshotterSetup | quote }} - {{- end }} - {{- $forceGuestPullAmd64 := include "kata-deploy.getForceGuestPullForArch" (dict "root" . "arch" "amd64") | trim -}} - {{- if $forceGuestPullAmd64 }} - - name: EXPERIMENTAL_FORCE_GUEST_PULL_X86_64 - value: {{ $forceGuestPullAmd64 | quote }} - {{- end }} - {{- $forceGuestPullArm64 := include "kata-deploy.getForceGuestPullForArch" (dict "root" . 
"arch" "arm64") | trim -}} - {{- if $forceGuestPullArm64 }} - - name: EXPERIMENTAL_FORCE_GUEST_PULL_AARCH64 - value: {{ $forceGuestPullArm64 | quote }} - {{- end }} - {{- $forceGuestPullS390x := include "kata-deploy.getForceGuestPullForArch" (dict "root" . "arch" "s390x") | trim -}} - {{- if $forceGuestPullS390x }} - - name: EXPERIMENTAL_FORCE_GUEST_PULL_S390X - value: {{ $forceGuestPullS390x | quote }} - {{- end }} - {{- $forceGuestPullPpc64le := include "kata-deploy.getForceGuestPullForArch" (dict "root" . "arch" "ppc64le") | trim -}} - {{- if $forceGuestPullPpc64le }} - - name: EXPERIMENTAL_FORCE_GUEST_PULL_PPC64LE - value: {{ $forceGuestPullPpc64le | quote }} - {{- end }} -{{- if .Values.containerd.configFileName | trim }} - - name: CONTAINERD_CONFIG_FILE_NAME - value: {{ .Values.containerd.configFileName | trim | quote }} -{{- end }} -{{- if .Values.containerd.userDropIn | trim }} - - name: CONTAINERD_USER_DROP_IN_SOURCE_FILE - value: "/custom-containerd-config/containerd-user-dropin.toml" -{{- end }} -{{- with .Values.env.hostOS }} - - name: HOST_OS - value: {{ . | quote }} -{{- end }} -{{- if and .Values.customRuntimes.enabled .Values.customRuntimes.runtimes }} - - name: CUSTOM_RUNTIMES_ENABLED - value: "true" -{{- end }} +{{- include "kata-deploy.commonEnv" . | nindent 8 }} {{- $healthDefaults := dict "port" 8090 "startupProbe" (dict "enabled" true "initialDelaySeconds" 1 "periodSeconds" 10 "failureThreshold" 60 "timeoutSeconds" 3) @@ -365,50 +197,9 @@ spec: {{- toYaml . | nindent 10 }} {{- end }} volumeMounts: - - name: crio-conf - mountPath: /etc/crio/ - - name: containerd-conf - mountPath: /etc/containerd/ - - name: host - mountPath: /host/ -{{- if .Values.containerd.userDropIn | trim }} - - name: custom-containerd-config - mountPath: /custom-containerd-config/ - readOnly: true -{{- end }} -{{- if $hasCustomConfigs }} - - name: custom-configs - mountPath: /custom-configs/ - readOnly: true -{{- end }} +{{- include "kata-deploy.commonVolumeMounts" . 
| nindent 8 }} volumes: - - name: crio-conf - hostPath: - path: /etc/crio/ - - name: containerd-conf - hostPath: - path: '{{- template "containerdConfPath" .Values }}' - - name: host - hostPath: - path: / -{{- if .Values.containerd.userDropIn | trim }} - - name: custom-containerd-config - configMap: -{{- if .Values.env.multiInstallSuffix }} - name: {{ .Chart.Name }}-containerd-user-dropin-{{ .Values.env.multiInstallSuffix }} -{{- else }} - name: {{ .Chart.Name }}-containerd-user-dropin -{{- end }} -{{- end }} -{{- if $hasCustomConfigs }} - - name: custom-configs - configMap: -{{- if .Values.env.multiInstallSuffix }} - name: {{ .Chart.Name }}-custom-configs-{{ .Values.env.multiInstallSuffix }} -{{- else }} - name: {{ .Chart.Name }}-custom-configs -{{- end }} -{{- end }} +{{- include "kata-deploy.commonVolumes" . | nindent 6 }} {{- with .Values.updateStrategy }} updateStrategy: {{- toYaml . | nindent 4 }} From 54878fa373d845d616c17497086b93a8ea215411 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 3 Jun 2026 22:06:55 +0200 Subject: [PATCH 5/9] kata-deploy: add job deployment mode driven by the job-dispatcher MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 2 of the DaemonSet -> staged-Job migration: add an opt-in `deploymentMode: job` that installs Kata via short-lived, per-node install Jobs instead of the long-running DaemonSet. The DaemonSet remains the default and is now gated behind `deploymentMode == daemonset`. Rather than render one Job per node into the Helm release (which grows the release secret O(nodes) and offers no rollout pacing), job mode ships a single tiny post-install/post-upgrade hook Job that runs the kata-deploy-job-dispatcher. The dispatcher enumerates the selected nodes LIVE from the API server and stamps out one node-pinned install Job per node from a constant-size ConfigMap of Job templates, keeping at most `job.parallelism` in flight and refilling as they finish. 
This guarantees per-node coverage with a paced rollout while the Helm release stays O(1) regardless of fleet size. New nodes are picked up by re-running `helm upgrade`; there is no always-on component. Each per-node Job runs the staged install pipeline as ordered initContainers followed by a main container, and then exits: host-check -> artifacts -> cri (initContainers, run sequentially), then label (main container). The privilege split is explicit: the dispatcher pod is a pure control-plane client (lists nodes, manages Jobs in its own namespace) and runs fully unprivileged under a dedicated, least-privilege ServiceAccount (kata-rbac.yaml); only the per-node Jobs it creates carry the privileged kata-deploy host-mutation rights. Node selection (templates/_helpers.tpl: nodeLabelSelector / perNodeJob): - job.nodes: explicit node-name list passed to the dispatcher, and - job.nodeSelector (equality map) ANDed with - job.nodeSelectorExpressions (k8s label-selector requirements: In / NotIn / Exists / DoesNotExist), compiled into a single label-selector string the dispatcher resolves live. The default expressions target worker (non-control-plane) nodes, so no custom node labeling is required; set the expressions to [] to target all discovered nodes. Reuses the commonEnv/commonVolume* helpers and adds the stageContainer, serviceAccountName, dispatcherServiceAccountName, dispatcherImage and perNodeJob helpers shared by the dispatcher and the staged Jobs. The default (daemonset) render is unchanged. 
Signed-off-by: Fabiano Fidêncio Assisted-by: Cursor --- .../kata-deploy/lib/helm-deploy.bash | 13 ++ tests/gha-run-k8s-common.sh | 13 ++ .../kata-deploy/templates/_helpers.tpl | 175 ++++++++++++++++++ .../templates/kata-deploy-install-job.yaml | 113 +++++++++++ .../templates/kata-deploy-job-templates.yaml | 33 ++++ .../kata-deploy/templates/kata-deploy.yaml | 2 + .../kata-deploy/templates/kata-rbac.yaml | 62 +++++++ .../helm-chart/kata-deploy/values.yaml | 103 +++++++++++ ...kata-deploy-build-and-upload-helm-chart.sh | 13 ++ 9 files changed, 527 insertions(+) create mode 100644 tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-install-job.yaml create mode 100644 tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-job-templates.yaml diff --git a/tests/functional/kata-deploy/lib/helm-deploy.bash b/tests/functional/kata-deploy/lib/helm-deploy.bash index 463bd1648b..73d15dae94 100644 --- a/tests/functional/kata-deploy/lib/helm-deploy.bash +++ b/tests/functional/kata-deploy/lib/helm-deploy.bash @@ -31,11 +31,24 @@ generate_base_values() { local output_file="$1" local extra_values_file="${2:-}" + local kata_deploy_image="${DOCKER_REGISTRY}/${DOCKER_REPO}" + local dispatcher_image + if [[ "${kata_deploy_image}" == *-ci ]]; then + dispatcher_image="${kata_deploy_image%-ci}-job-dispatcher-ci" + else + dispatcher_image="${kata_deploy_image}-job-dispatcher" + fi + cat > "${output_file}" <-job-dispatcher", with the "-ci" suffix (if + # any) kept at the very end (e.g. kata-deploy-ci -> kata-deploy-job-dispatcher-ci). 
+ local dispatcher_reference + if [[ "${HELM_IMAGE_REFERENCE}" == *-ci ]]; then + dispatcher_reference="${HELM_IMAGE_REFERENCE%-ci}-job-dispatcher-ci" + else + dispatcher_reference="${HELM_IMAGE_REFERENCE}-job-dispatcher" + fi + yq -i ".job.dispatcherImage.reference = \"${dispatcher_reference}\"" "${values_yaml}" + yq -i ".job.dispatcherImage.tag = \"${HELM_IMAGE_TAG}\"" "${values_yaml}" + [[ -n "${HELM_K8S_DISTRIBUTION}" ]] && yq -i ".k8sDistribution = \"${HELM_K8S_DISTRIBUTION}\"" "${values_yaml}" if [[ "${HELM_DEFAULT_INSTALLATION}" = "false" ]]; then diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl index dae7c0ca32..457cb00ab6 100644 --- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl +++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/_helpers.tpl @@ -391,6 +391,21 @@ reference:tag (tag defaults to Chart.AppVersion). {{- end -}} {{- end -}} +{{/* +Dispatcher image reference for the job-mode dispatcher (kata-deploy-job-dispatcher). +Supports tag (reference:tag) and digest (reference@sha256:...) formats; tag +defaults to Chart.AppVersion. +*/}} +{{- define "kata-deploy.dispatcherImage" -}} +{{- $ref := .Values.job.dispatcherImage.reference -}} +{{- $tag := default .Chart.AppVersion .Values.job.dispatcherImage.tag | toString -}} +{{- if contains "@" $ref -}} +{{- $ref -}} +{{- else -}} +{{- printf "%s:%s" $ref $tag -}} +{{- end -}} +{{- end -}} + {{/* Get snapshotter setup list from structured config */}} @@ -592,6 +607,166 @@ e.g. `{{- include "kata-deploy.commonEnv" . | nindent 8 }}`. {{- end }} {{- end -}} +{{/* +Build a Kubernetes label-selector STRING (the form accepted by the apiserver +and `kubectl --selector`) from an equality map plus a list of match-expression +requirements. 
This is handed to `kata-deploy-job-dispatcher --node-selector`, which +resolves the actual target nodes LIVE at run time (so node membership is never +frozen into the Helm release). + +Arguments (dict): + eq - equality label map -> "k=v" + exprs - list of {key, operator, values}: + Exists -> "key" + DoesNotExist -> "!key" + In -> "key in (v1,v2)" + NotIn -> "key notin (v1,v2)" + +Returns the comma-joined selector string (possibly empty, meaning "all nodes"). +*/}} +{{- define "kata-deploy.nodeLabelSelector" -}} +{{- $parts := list -}} +{{- range $k, $v := (.eq | default dict) -}} +{{- $parts = append $parts (printf "%s=%s" $k $v) -}} +{{- end -}} +{{- range $expr := (.exprs | default list) -}} +{{- $op := $expr.operator -}} +{{- if eq $op "Exists" -}} +{{- $parts = append $parts $expr.key -}} +{{- else if eq $op "DoesNotExist" -}} +{{- $parts = append $parts (printf "!%s" $expr.key) -}} +{{- else if eq $op "In" -}} +{{- $parts = append $parts (printf "%s in (%s)" $expr.key (join "," ($expr.values | default list))) -}} +{{- else if eq $op "NotIn" -}} +{{- $parts = append $parts (printf "%s notin (%s)" $expr.key (join "," ($expr.values | default list))) -}} +{{- else -}} +{{- fail (printf "nodeSelectorExpressions: unsupported operator %q for key %q (use In, NotIn, Exists, DoesNotExist)" $op $expr.key) -}} +{{- end -}} +{{- end -}} +{{- join "," $parts -}} +{{- end -}} + +{{/* +Per-node staged Job manifest (deploymentMode: job), embedded verbatim into the +job-templates ConfigMap. The dispatcher (kata-deploy-job-dispatcher) clones this once per +target node, injecting metadata.name + spec.template.spec.nodeName, so the +template itself carries NO node identity and NO Helm hook annotations. + +Arguments (dict): + root - top-level context (.) 
+ stage - "install" | "cleanup" + +install pipeline: host-check -> artifacts -> cri (initContainers) ; label (main) +cleanup pipeline: unlabel -> revert-cri (initContainers) ; remove-artifacts (main) + +Emitted at column 0 (a standalone Job document); embed with `indent` at the call +site under a ConfigMap data key. +*/}} +{{- define "kata-deploy.perNodeJob" -}} +{{- $root := .root -}} +{{- $stage := .stage -}} +apiVersion: batch/v1 +kind: Job +metadata: + labels: + app.kubernetes.io/name: {{ include "kata-deploy.name" $root }} + app.kubernetes.io/instance: {{ $root.Release.Name }} + kata-deploy/stage: {{ $stage }} +spec: + backoffLimit: {{ $root.Values.job.backoffLimit }} + ttlSecondsAfterFinished: {{ $root.Values.job.ttlSecondsAfterFinished }} + template: + metadata: + labels: + app.kubernetes.io/name: {{ include "kata-deploy.name" $root }} + app.kubernetes.io/instance: {{ $root.Release.Name }} + kata-deploy/stage: {{ $stage }} + spec: +{{- with $root.Values.imagePullSecrets }} + imagePullSecrets: +{{- toYaml . | nindent 8 }} +{{- end }} + serviceAccountName: {{ include "kata-deploy.serviceAccountName" $root }} + restartPolicy: Never + hostPID: true +{{- with $root.Values.tolerations }} + tolerations: +{{- toYaml . | nindent 8 }} +{{- end }} +{{- with $root.Values.priorityClassName }} + priorityClassName: {{ . 
| quote }} +{{- end }} +{{- if eq $stage "install" }} + initContainers: +{{- include "kata-deploy.stageContainer" (dict "root" $root "name" "host-check" "action" "install-stage-host-check" "privileged" true "mountHost" true) | nindent 8 }} +{{- include "kata-deploy.stageContainer" (dict "root" $root "name" "artifacts" "action" "install-stage-artifacts" "privileged" true "mountHost" true) | nindent 8 }} +{{- include "kata-deploy.stageContainer" (dict "root" $root "name" "cri" "action" "install-stage-cri" "privileged" true "mountHost" true) | nindent 8 }} + containers: +{{- include "kata-deploy.stageContainer" (dict "root" $root "name" "label" "action" "install-stage-label" "privileged" false "mountHost" false) | nindent 8 }} +{{- else }} + initContainers: +{{- include "kata-deploy.stageContainer" (dict "root" $root "name" "unlabel" "action" "cleanup-stage-unlabel" "privileged" false "mountHost" false) | nindent 8 }} +{{- include "kata-deploy.stageContainer" (dict "root" $root "name" "revert-cri" "action" "cleanup-stage-revert-cri" "privileged" true "mountHost" true) | nindent 8 }} + containers: +{{- include "kata-deploy.stageContainer" (dict "root" $root "name" "remove-artifacts" "action" "cleanup-stage-remove-artifacts" "privileged" true "mountHost" true) | nindent 8 }} +{{- end }} + volumes: +{{- include "kata-deploy.commonVolumes" $root | nindent 8 }} +{{- end -}} + +{{/* +Service account name (honoring multiInstallSuffix), shared by all kata-deploy +workloads (DaemonSet and staged Jobs). +*/}} +{{- define "kata-deploy.serviceAccountName" -}} +{{- if .Values.env.multiInstallSuffix -}} +{{ .Chart.Name }}-sa-{{ .Values.env.multiInstallSuffix }} +{{- else -}} +{{ .Chart.Name }}-sa +{{- end -}} +{{- end -}} + +{{/* +ServiceAccount name for the job-mode dispatcher (kata-deploy-job-dispatcher). Separate from +kata-deploy.serviceAccountName: the dispatcher is a pure API client (list nodes, +manage Jobs) and must NOT carry the privileged kata-deploy host-mutation rights. 
+*/}} +{{- define "kata-deploy.dispatcherServiceAccountName" -}} +{{- if .Values.env.multiInstallSuffix -}} +{{ .Chart.Name }}-dispatcher-sa-{{ .Values.env.multiInstallSuffix }} +{{- else -}} +{{ .Chart.Name }}-dispatcher-sa +{{- end -}} +{{- end -}} + +{{/* +Render a single staged-pipeline container that runs one kata-deploy stage action. +Used by the per-node staged install/cleanup Jobs (deploymentMode: job). + +Arguments (dict): + root - the top-level context (.) + name - container name + action - kata-deploy subcommand (e.g. install-stage-cri) + privileged - bool, whether the container runs privileged (host nsenter/restart) + mountHost - bool, whether to mount the host paths (crio/containerd/host) + +Emitted at column 0; indent with `nindent` at the call site. +*/}} +{{- define "kata-deploy.stageContainer" -}} +- name: {{ .name }} + image: {{ include "kata-deploy.image" .root }} + imagePullPolicy: {{ .root.Values.imagePullPolicy }} + command: ["/usr/bin/kata-deploy", "{{ .action }}"] + env: +{{- include "kata-deploy.commonEnv" .root | nindent 4 }} + securityContext: + privileged: {{ .privileged }} +{{- if .mountHost }} + volumeMounts: +{{- include "kata-deploy.commonVolumeMounts" .root | nindent 4 }} +{{- end }} +{{- end -}} + {{/* Common volumeMounts for any pod that runs the kata-deploy binary against the host. Emitted at column 0; indent with `nindent` at the call site. diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-install-job.yaml b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-install-job.yaml new file mode 100644 index 0000000000..ff8e97f3fb --- /dev/null +++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-install-job.yaml @@ -0,0 +1,113 @@ +{{- /* +Install dispatcher (deploymentMode: job). + +A single, tiny post-install/post-upgrade hook Job that runs the dispatcher +(kata-deploy-job-dispatcher). 
The dispatcher enumerates the selected nodes LIVE, then +creates one node-pinned install Job per node from the job-templates ConfigMap, +keeping at most job.parallelism in flight and refilling as they finish. This +guarantees one install per node (coverage) with a paced rollout, while the Helm +release stays O(1) regardless of fleet size. + +Each per-node Job runs the staged pipeline as ordered initContainers and exits: + + host-check -> artifacts -> cri (initContainers, run sequentially) + label (main container) + +Helm waits only on THIS dispatcher Job (the verification hook runs at a higher +weight, after it). before-hook-creation lets `helm upgrade` re-run the dispatcher, +which re-enumerates nodes (idempotent stages skip already-installed nodes and +pick up newly added ones). +*/ -}} +{{- if eq (.Values.deploymentMode | default "daemonset") "job" }} +{{- $root := . }} +{{- $base := .Chart.Name }} +{{- if .Values.env.multiInstallSuffix }} +{{- $base = printf "%s-%s" .Chart.Name .Values.env.multiInstallSuffix }} +{{- end }} +{{- $sa := include "kata-deploy.dispatcherServiceAccountName" . 
}} +{{- $dispatcherName := printf "%s-install-dispatcher" $base | trunc 63 | trimSuffix "-" }} +{{- $nodes := .Values.job.nodes | default list }} +{{- $selector := include "kata-deploy.nodeLabelSelector" (dict "eq" (.Values.job.nodeSelector | default dict) "exprs" (.Values.job.nodeSelectorExpressions | default list)) }} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ $dispatcherName }} + namespace: {{ $root.Release.Namespace }} + labels: + app.kubernetes.io/name: {{ include "kata-deploy.name" $root }} + app.kubernetes.io/instance: {{ $root.Release.Name }} + kata-deploy/dispatcher: install + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-weight": "5" + "helm.sh/hook-delete-policy": before-hook-creation +spec: + # The dispatcher does per-node retries (job.backoffLimit) itself; a dispatcher + # failure means "some node failed" and should surface, not be retried blindly. + backoffLimit: 0 + ttlSecondsAfterFinished: {{ $root.Values.job.ttlSecondsAfterFinished }} + template: + metadata: + labels: + app.kubernetes.io/name: {{ include "kata-deploy.name" $root }} + app.kubernetes.io/instance: {{ $root.Release.Name }} + kata-deploy/dispatcher: install + spec: +{{- with $root.Values.imagePullSecrets }} + imagePullSecrets: +{{- toYaml . | nindent 8 }} +{{- end }} + serviceAccountName: {{ $sa }} + restartPolicy: Never + # The dispatcher never touches the host; it is a plain API client. Lock the + # pod down so a compromise cannot escalate beyond its (minimal) API rights. + securityContext: + runAsNonRoot: true + runAsUser: 65532 + runAsGroup: 65532 + seccompProfile: + type: RuntimeDefault +{{- with $root.Values.tolerations }} + tolerations: +{{- toYaml . | nindent 8 }} +{{- end }} +{{- with $root.Values.priorityClassName }} + priorityClassName: {{ . 
| quote }} +{{- end }} + containers: + - name: dispatcher + image: {{ include "kata-deploy.dispatcherImage" $root }} + imagePullPolicy: {{ $root.Values.imagePullPolicy }} + securityContext: + privileged: false + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + command: + - /usr/bin/kata-deploy-job-dispatcher + - "--job-template=/etc/kata-job/install-job.yaml" + - "--name-prefix={{ $base }}-install" + - "--owner-job-name={{ $dispatcherName }}" + - "--parallelism={{ $root.Values.job.parallelism }}" +{{- if $nodes }} + - "--nodes={{ join "," $nodes }}" +{{- else if $selector }} + - "--node-selector={{ $selector }}" +{{- end }} + env: + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + volumeMounts: + - name: job-templates + mountPath: /etc/kata-job + readOnly: true + volumes: + - name: job-templates + configMap: + name: {{ printf "%s-job-templates" $base | trunc 63 | trimSuffix "-" }} +{{- end }} diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-job-templates.yaml b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-job-templates.yaml new file mode 100644 index 0000000000..4d455f0763 --- /dev/null +++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-job-templates.yaml @@ -0,0 +1,33 @@ +{{- /* +Per-node Job templates for deploymentMode: job. + +This ConfigMap holds the install and cleanup per-node Job manifests, rendered +ONCE (constant size, independent of the number of nodes). The job-mode dispatcher +(kata-deploy-job-dispatcher) mounts it, and for every selected node clones the relevant +template, injects metadata.name + spec.template.spec.nodeName, and creates the +Job. Keeping the rich pod spec (env/volumes/shim config) here means the Helm +chart stays the single source of truth; the dispatcher only does fan-out. 
+ +It is a normal (non-hook) resource: Helm creates it before the post-install +dispatcher hook runs, and it still exists during the pre-delete cleanup hook +(release resources are torn down only after pre-delete hooks complete). +*/ -}} +{{- if eq (.Values.deploymentMode | default "daemonset") "job" }} +{{- $base := .Chart.Name }} +{{- if .Values.env.multiInstallSuffix }} +{{- $base = printf "%s-%s" .Chart.Name .Values.env.multiInstallSuffix }} +{{- end }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ printf "%s-job-templates" $base | trunc 63 | trimSuffix "-" }} + namespace: {{ .Release.Namespace }} + labels: + app.kubernetes.io/name: {{ include "kata-deploy.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} +data: + install-job.yaml: | +{{ include "kata-deploy.perNodeJob" (dict "root" . "stage" "install") | indent 4 }} + cleanup-job.yaml: | +{{ include "kata-deploy.perNodeJob" (dict "root" . "stage" "cleanup") | indent 4 }} +{{- end }} diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml index ff02c34de6..17ff5bd183 100644 --- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml +++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy.yaml @@ -1,3 +1,4 @@ +{{- if eq (.Values.deploymentMode | default "daemonset") "daemonset" -}} {{- if index .Values "node-feature-discovery" "enabled" -}} {{- $existingNFDNamespace := include "kata-deploy.detectExistingNFD" . | trim -}} {{- if $existingNFDNamespace -}} @@ -204,3 +205,4 @@ spec: updateStrategy: {{- toYaml . 
| nindent 4 }} {{- end}} +{{- end -}} diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-rbac.yaml b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-rbac.yaml index 863b037c51..0f66e45a4a 100644 --- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-rbac.yaml +++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-rbac.yaml @@ -65,6 +65,68 @@ subjects: name: {{ .Chart.Name }}-sa {{- end }} namespace: {{ .Release.Namespace }} +{{- if eq (.Values.deploymentMode | default "daemonset") "job" }} +--- +# Dedicated, least-privilege identity for the job-mode dispatcher +# (kata-deploy-job-dispatcher). It is a pure control-plane client: it lists nodes +# (cluster-scoped) and manages per-node Jobs in the release namespace +# (namespace-scoped). It deliberately does NOT get the privileged kata-deploy +# host-mutation rights (node patch, runtimeclasses, NFD, etc.); those stay on +# kata-deploy-sa, which only the per-node Jobs use. +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "kata-deploy.dispatcherServiceAccountName" . }} + namespace: {{ .Release.Namespace }} +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: {{ .Chart.Name }}-dispatcher-noderole{{ with .Values.env.multiInstallSuffix }}-{{ . }}{{ end }} +rules: +# Enumerating nodes is inherently cluster-scoped. +- apiGroups: [""] + resources: ["nodes"] + verbs: ["list"] +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: {{ .Chart.Name }}-dispatcher-noderb{{ with .Values.env.multiInstallSuffix }}-{{ . }}{{ end }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ .Chart.Name }}-dispatcher-noderole{{ with .Values.env.multiInstallSuffix }}-{{ . }}{{ end }} +subjects: +- kind: ServiceAccount + name: {{ include "kata-deploy.dispatcherServiceAccountName" . 
}} + namespace: {{ .Release.Namespace }} +--- +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: {{ .Chart.Name }}-dispatcher-role{{ with .Values.env.multiInstallSuffix }}-{{ . }}{{ end }} + namespace: {{ .Release.Namespace }} +rules: +# The dispatcher only ever creates/watches/GCs per-node Jobs in its own namespace. +- apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["create", "get", "list", "watch", "delete"] +--- +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: {{ .Chart.Name }}-dispatcher-rb{{ with .Values.env.multiInstallSuffix }}-{{ . }}{{ end }} + namespace: {{ .Release.Namespace }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ .Chart.Name }}-dispatcher-role{{ with .Values.env.multiInstallSuffix }}-{{ . }}{{ end }} +subjects: +- kind: ServiceAccount + name: {{ include "kata-deploy.dispatcherServiceAccountName" . }} + namespace: {{ .Release.Namespace }} +{{- end }} --- # ServiceAccount and RBAC for the post-delete Job that removes the kept RBAC above. # Created as post-delete hooks with lower weight than the Job so they exist when the Job runs. diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/values.yaml b/tools/packaging/kata-deploy/helm-chart/kata-deploy/values.yaml index 7b0e9fa74a..120bfd5027 100644 --- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/values.yaml +++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/values.yaml @@ -1,3 +1,106 @@ +# Deployment model for installing/cleaning up Kata on nodes. +# daemonset: (default) the long-running kata-deploy DaemonSet installs Kata on +# every matching node and reverts it on pod termination (uninstall). +# job: no always-on component. A tiny dispatcher Job (the dispatcher, +# kata-deploy-job-dispatcher) runs as a post-install/upgrade hook, enumerates +# the selected nodes LIVE, and creates one node-pinned install Job +# per node - paced to job.parallelism and guaranteeing one install +# per node. 
Each per-node Job runs the staged pipeline as ordered +# initContainers and exits. Uninstall works the same way via a +# pre-delete dispatcher (reverse pipeline). +# +# Why a dispatcher instead of rendering per-node Jobs in the chart: Helm stores +# the whole rendered release in one ~1 MiB Secret and runs hook resources +# sequentially, and neither an Indexed Job nor a JobSet can guarantee one pod +# per node once parallelism < node-count (the scheduler ignores completed pods +# when balancing spread). The dispatcher keeps the release O(1), enumerates nodes +# at run time, and paces a guaranteed-coverage rollout with built-in Jobs only. +# +# NOTE on "job" mode and new nodes: +# The dispatcher only runs on `helm install` / `helm upgrade` / `helm uninstall`. +# When you add nodes later, re-run `helm upgrade` so the dispatcher enumerates +# and installs the new nodes (the staged actions are idempotent, so already- +# installed nodes are skipped). This is intentional: it avoids an always-on +# privileged component on every node. +deploymentMode: daemonset # daemonset | job + +# Settings specific to deploymentMode: job +job: + # Dispatcher image: the dispatcher that fans out per-node Jobs. It only talks to + # the Kubernetes API (lists nodes, creates/watches Jobs); it never touches the + # host. Supports reference:tag or reference@sha256:digest; tag defaults to the + # chart appVersion. + dispatcherImage: + reference: quay.io/kata-containers/kata-deploy-job-dispatcher + tag: "" + # Maximum number of nodes processed concurrently (the dispatcher keeps at most + # this many per-node Jobs in flight, refilling as they finish). Lower it to + # pace the rollout (e.g. limit how many CRI runtimes restart at once on a big + # fleet); raise it to install faster. Effectively capped at the node count. + parallelism: 100 + # How to choose which nodes get a per-node INSTALL Job. Precedence: + # 1. 
job.nodes (explicit list of node names) - if non-empty, used verbatim + # (passed to the dispatcher as --nodes). + # 2. otherwise a label selector built from job.nodeSelector (equality) ANDed + # with job.nodeSelectorExpressions (In/NotIn/Exists/DoesNotExist) is + # passed to the dispatcher, which resolves matching nodes LIVE at run time. + # 3. if both are empty, ALL nodes are targeted. + # + # DEFAULT: target worker (non-control-plane) nodes, so no custom labeling is + # required. Override these freely: + # - Target nodes with a specific label: + # job: + # nodeSelector: { kata-containers: "enabled" } + # - Target every node (including control-plane), e.g. single-node clusters/CI: + # job: + # nodeSelectorExpressions: [] + # - Richer expressions: + # job: + # nodeSelectorExpressions: + # - { key: kubernetes.io/os, operator: In, values: ["linux"] } + # - { key: node-role.kubernetes.io/control-plane, operator: DoesNotExist } + # - Pin to explicit nodes: + # job: + # nodes: ["worker-1", "worker-2"] + nodes: [] + # Equality label selector (ANDed with nodeSelectorExpressions). Ignored when + # job.nodes is set. Empty by default. + nodeSelector: {} + # Kubernetes-style label selector requirements (ANDed with nodeSelector). + # Each entry: { key, operator, values }. operator is one of: + # In | NotIn (values required) | Exists | DoesNotExist (values must be empty). + # Default selects nodes that are NOT control-plane/master (i.e. worker nodes). + # Set to [] to disable role filtering and target all discovered nodes. + nodeSelectorExpressions: + - key: node-role.kubernetes.io/control-plane + operator: DoesNotExist + - key: node-role.kubernetes.io/master + operator: DoesNotExist + # Node selection for the UNINSTALL (pre-delete hook) dispatcher. Same precedence + # and semantics as install (cleanup.nodes, else cleanup.nodeSelector ANDed with + # cleanup.nodeSelectorExpressions, else all nodes). 
+ # + # The cleanup dispatcher resolves nodes LIVE when it runs at `helm uninstall` + # (the dispatcher does the lookup), so - unlike a frozen Helm-rendered hook - + # the DEFAULT below can safely be "nodes carrying katacontainers.io/kata-runtime", + # i.e. exactly the nodes install actually labeled. Override to clean a + # different set, e.g.: + # job: + # cleanup: + # nodes: ["worker-1"] + cleanup: + nodes: [] + nodeSelector: {} + nodeSelectorExpressions: + - key: katacontainers.io/kata-runtime + operator: Exists + # How long finished per-node Jobs are retained before automatic garbage + # collection (seconds). Applies to both install and cleanup per-node Jobs. + ttlSecondsAfterFinished: 600 + # Per-node retry budget: retries for a single node's Job before it is marked + # failed. One node failing never aborts the others. + backoffLimit: 3 + imagePullPolicy: Always imagePullSecrets: [] diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-build-and-upload-helm-chart.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-build-and-upload-helm-chart.sh index 337a29291b..2897bcdd46 100755 --- a/tools/packaging/kata-deploy/local-build/kata-deploy-build-and-upload-helm-chart.sh +++ b/tools/packaging/kata-deploy/local-build/kata-deploy-build-and-upload-helm-chart.sh @@ -23,8 +23,21 @@ tmp="$(mktemp -d)" trap '[[ -n "${KEEP_TMPDIR}" ]] && echo "kept: ${tmp}" || rm -rf "${tmp}"' EXIT cp -r "${CHART_SRC}" "${tmp}/" + +# Job-mode dispatcher image. Its repo mirrors the kata-deploy repo with +# "-job-dispatcher" inserted before any "-ci" suffix (so the "-ci" stays last): +# .../kata-deploy -> .../kata-deploy-job-dispatcher +# .../kata-deploy-ci -> .../kata-deploy-job-dispatcher-ci +# It is built and pushed with the same tag by kata-deploy-build-and-upload-payload.sh. 
+if [[ "${REGISTRY}" == *-ci ]]; then + JOB_DISPATCHER_IMAGE_REFERENCE="${JOB_DISPATCHER_IMAGE_REFERENCE:-"${REGISTRY%-ci}-job-dispatcher-ci"}" +else + JOB_DISPATCHER_IMAGE_REFERENCE="${JOB_DISPATCHER_IMAGE_REFERENCE:-"${REGISTRY}-job-dispatcher"}" +fi + yq eval ".version = \"${CHART_VERSION}\" | .appVersion = \"${CHART_VERSION}\"" -i "${tmp}/kata-deploy/Chart.yaml" yq eval ".image.reference = \"${REGISTRY}\" | .image.tag = \"${TAG}\"" -i "${tmp}/kata-deploy/values.yaml" +yq eval ".job.dispatcherImage.reference = \"${JOB_DISPATCHER_IMAGE_REFERENCE}\" | .job.dispatcherImage.tag = \"${TAG}\"" -i "${tmp}/kata-deploy/values.yaml" helm dependencies update "${tmp}/kata-deploy" helm package "${tmp}/kata-deploy" -d "${tmp}" helm push "${tmp}/kata-deploy-${CHART_VERSION}.tgz" "oci://${CHART_REGISTRY}" From 3d732986d2f8eb783950292767576a88762ba3cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 3 Jun 2026 22:07:15 +0200 Subject: [PATCH 6/9] kata-deploy: add per-node staged cleanup for job mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the uninstall counterpart to the install dispatcher for deploymentMode: job. On `helm uninstall`, a single pre-delete hook Job runs the kata-deploy-job-dispatcher, which enumerates the targeted nodes live and fans out one node-pinned cleanup Job per node that runs the install pipeline in reverse and exits: unlabel -> revert-cri (initContainers, run sequentially) remove-artifacts (main container) Running as a pre-delete hook means the dispatcher ServiceAccount/RBAC and the kata-deploy host-mutation RBAC still exist while the Jobs run, so the unlabel stage retains node get/patch access. revert-cri and remove-artifacts are host-only operations (privileged nsenter / host mount) and need no extra cluster RBAC. 
Ordering mirrors install in reverse: unlabel first so the scheduler stops placing kata workloads here, then revert the CRI config + restart the runtime, then remove the on-host artifacts. Each stage is idempotent and skips when already undone, so partially-installed nodes and re-runs are safe. Uninstall node selection is deliberately SEPARATE from install (a dedicated job.cleanup.* block) and defaults to every node carrying the katacontainers.io/kata-runtime label (set by the install label stage) rather than re-evaluating the install selector. Because the cleanup dispatcher resolves nodes live when it runs, this stays robust to install-time selector drift (relabeled nodes, etc.) while remaining fully overridable via job.cleanup.nodes / job.cleanup.nodeSelector / job.cleanup.nodeSelectorExpressions. The default (daemonset) mode is unaffected. Signed-off-by: Fabiano Fidêncio Assisted-by: Cursor --- .../templates/kata-deploy-cleanup-job.yaml | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-cleanup-job.yaml diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-cleanup-job.yaml b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-cleanup-job.yaml new file mode 100644 index 0000000000..31b3887cc0 --- /dev/null +++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/kata-deploy-cleanup-job.yaml @@ -0,0 +1,112 @@ +{{- /* +Cleanup dispatcher (deploymentMode: job, pre-delete hook). + +The mirror image of the install dispatcher: a single tiny pre-delete hook Job that +runs the dispatcher (kata-deploy-job-dispatcher) to fan out one node-pinned cleanup Job +per selected node, paced to job.parallelism. 
Each per-node Job runs the install +pipeline in reverse and exits: + + unlabel -> revert-cri (initContainers, run sequentially) + remove-artifacts (main container) + +Unlike the old per-node hook model, node selection here is resolved LIVE when the +hook runs at `helm uninstall` (the dispatcher does the lookup), not frozen at +install/upgrade time. That is why the default cleanup selector can be +"nodes carrying the katacontainers.io/kata-runtime label" (i.e. exactly the +nodes install actually labeled) - see values.yaml job.cleanup. + +This runs while the release's kept ServiceAccount/RBAC and the job-templates +ConfigMap still exist; they are torn down only after pre-delete hooks complete. +*/ -}} +{{- if eq (.Values.deploymentMode | default "daemonset") "job" }} +{{- $root := . }} +{{- $base := .Chart.Name }} +{{- if .Values.env.multiInstallSuffix }} +{{- $base = printf "%s-%s" .Chart.Name .Values.env.multiInstallSuffix }} +{{- end }} +{{- $sa := include "kata-deploy.dispatcherServiceAccountName" . 
}} +{{- $dispatcherName := printf "%s-cleanup-dispatcher" $base | trunc 63 | trimSuffix "-" }} +{{- $cleanup := .Values.job.cleanup | default dict }} +{{- $cNodes := $cleanup.nodes | default list }} +{{- $cSelector := include "kata-deploy.nodeLabelSelector" (dict "eq" ($cleanup.nodeSelector | default dict) "exprs" ($cleanup.nodeSelectorExpressions | default list)) }} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ $dispatcherName }} + namespace: {{ $root.Release.Namespace }} + labels: + app.kubernetes.io/name: {{ include "kata-deploy.name" $root }} + app.kubernetes.io/instance: {{ $root.Release.Name }} + kata-deploy/dispatcher: cleanup + annotations: + "helm.sh/hook": pre-delete + "helm.sh/hook-weight": "5" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +spec: + backoffLimit: 0 + ttlSecondsAfterFinished: {{ $root.Values.job.ttlSecondsAfterFinished }} + template: + metadata: + labels: + app.kubernetes.io/name: {{ include "kata-deploy.name" $root }} + app.kubernetes.io/instance: {{ $root.Release.Name }} + kata-deploy/dispatcher: cleanup + spec: +{{- with $root.Values.imagePullSecrets }} + imagePullSecrets: +{{- toYaml . | nindent 8 }} +{{- end }} + serviceAccountName: {{ $sa }} + restartPolicy: Never + # The dispatcher never touches the host; it is a plain API client. Lock the + # pod down so a compromise cannot escalate beyond its (minimal) API rights. + securityContext: + runAsNonRoot: true + runAsUser: 65532 + runAsGroup: 65532 + seccompProfile: + type: RuntimeDefault +{{- with $root.Values.tolerations }} + tolerations: +{{- toYaml . | nindent 8 }} +{{- end }} +{{- with $root.Values.priorityClassName }} + priorityClassName: {{ . 
| quote }} +{{- end }} + containers: + - name: dispatcher + image: {{ include "kata-deploy.dispatcherImage" $root }} + imagePullPolicy: {{ $root.Values.imagePullPolicy }} + securityContext: + privileged: false + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + command: + - /usr/bin/kata-deploy-job-dispatcher + - "--job-template=/etc/kata-job/cleanup-job.yaml" + - "--name-prefix={{ $base }}-cleanup" + - "--owner-job-name={{ $dispatcherName }}" + - "--parallelism={{ $root.Values.job.parallelism }}" +{{- if $cNodes }} + - "--nodes={{ join "," $cNodes }}" +{{- else if $cSelector }} + - "--node-selector={{ $cSelector }}" +{{- end }} + env: + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + volumeMounts: + - name: job-templates + mountPath: /etc/kata-job + readOnly: true + volumes: + - name: job-templates + configMap: + name: {{ printf "%s-job-templates" $base | trunc 63 | trimSuffix "-" }} +{{- end }} From c23fe1152995e0e0d8a1371bb21ec0acc776dad3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 3 Jun 2026 22:07:31 +0200 Subject: [PATCH 7/9] kata-deploy: make verification Job aware of job deployment mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The verification Job assumed the DaemonSet model: it waited for the DaemonSet to exist, for its pods, and for `rollout status daemonset/...`, then required every node in the cluster to be labeled. None of that holds for deploymentMode: job, where install happens via the dispatcher and the per-node Jobs it fans out, and only the targeted (worker) nodes get labeled. Make the hook mode-aware: - Hook weight: in job mode the install dispatcher runs as a post-install hook at weight 5, so verification now runs at weight 10 (after it); daemonset mode keeps weight 0 (the DaemonSet is a normal resource). 
- Readiness wait: in job mode, wait for the install dispatcher Job to complete (with the same CRI-restart retry logic) instead of a DaemonSet rollout; the dispatcher only reports success once every per-node install Job (kata-deploy/stage=install) has succeeded. - Label check: in job mode, wait until at least as many nodes are labeled as there are per-node install Jobs (one per targeted node), rather than comparing the labeled count against all nodes in the cluster. - Grant the verification ClusterRole read access to batch/jobs (used by the job-mode waits; harmless in daemonset mode). The daemonset code path is unchanged and the default render (no verification.pod) is byte-for-byte identical. Signed-off-by: Fabiano Fidêncio Assisted-by: Cursor --- .../templates/verification-job.yaml | 70 ++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/verification-job.yaml b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/verification-job.yaml index 21ed7b6155..4871169cfc 100644 --- a/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/verification-job.yaml +++ b/tools/packaging/kata-deploy/helm-chart/kata-deploy/templates/verification-job.yaml @@ -6,6 +6,10 @@ Verification Job - runs after kata-deploy installation to validate Kata is worki Only created when verification.pod is provided.
*/ -}} {{- if .Values.verification.pod }} +{{- $isJob := eq (.Values.deploymentMode | default "daemonset") "job" }} +{{- $base := .Chart.Name }} +{{- if .Values.env.multiInstallSuffix }}{{- $base = printf "%s-%s" .Chart.Name .Values.env.multiInstallSuffix }}{{- end }} +{{- $installDispatcher := printf "%s-install-dispatcher" $base | trunc 63 | trimSuffix "-" }} apiVersion: v1 kind: ConfigMap metadata: @@ -27,7 +31,10 @@ metadata: app.kubernetes.io/component: verification annotations: "helm.sh/hook": post-install,post-upgrade - "helm.sh/hook-weight": "0" + # In job mode the install dispatcher Job is a post-install hook at weight 5; + # verification must run after it, so use a higher weight. In daemonset + # mode the DaemonSet is a normal resource (created before any hook), so 0 is fine. + "helm.sh/hook-weight": {{ if $isJob }}"10"{{ else }}"0"{{ end }} "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded spec: backoffLimit: 3 @@ -57,6 +64,33 @@ spec: echo "Timeout: ${TIMEOUT}s" echo "" + {{- if $isJob }} + # job mode: there is no DaemonSet. Helm has already waited for the + # install dispatcher hook (this verification hook runs at a higher + # weight); re-check it defensively here. The dispatcher only reports + # success once every per-node install Job has succeeded. + DISPATCHER="{{ $installDispatcher }}" + INSTALL_TIMEOUT="{{ .Values.verification.daemonsetTimeout }}" + echo "Waiting for install dispatcher Job ${DISPATCHER} to complete (timeout ${INSTALL_TIMEOUT}s)..." + # kata-deploy restarts the CRI runtime during the cri stage, which can + # cause transient API server unavailability. Retry the wait to handle this.
+ wait_retries=5 + wait_retry_delay=15 + for wait_attempt in $(seq 1 ${wait_retries}); do + if kubectl wait --for=condition=complete "job/${DISPATCHER}" -n {{ .Release.Namespace }} --timeout="${INSTALL_TIMEOUT}s" 2>&1; then + break + fi + if [[ ${wait_attempt} -eq ${wait_retries} ]]; then + echo "ERROR: install dispatcher ${DISPATCHER} did not complete after ${wait_retries} attempts" + kubectl get job "${DISPATCHER}" -n {{ .Release.Namespace }} || true + kubectl logs -n {{ .Release.Namespace }} "job/${DISPATCHER}" --tail=50 || true + kubectl get jobs -n {{ .Release.Namespace }} -l kata-deploy/stage=install || true + exit 1 + fi + echo "API server may be restarting (attempt ${wait_attempt}/${wait_retries}), retrying in ${wait_retry_delay}s..." + sleep ${wait_retry_delay} + done + {{- else }} # First, wait for kata-deploy DaemonSet to exist (it's created by Helm, not a hook) echo "Waiting for kata-deploy DaemonSet to be created..." {{- if .Values.env.multiInstallSuffix }} @@ -128,6 +162,7 @@ spec: echo "API server may be restarting (attempt ${rollout_attempt}/${rollout_retries}), retrying in ${rollout_retry_delay}s..." sleep ${rollout_retry_delay} done + {{- end }} # Wait for nodes to be labeled with katacontainers.io/kata-runtime=true # This label is set by kata-deploy when installation is complete @@ -137,6 +172,35 @@ spec: max_wait=60 echo "Node label timeout: ${max_wait}s" elapsed=0 + {{- if $isJob }} + # job mode: only the targeted nodes get labeled. The dispatcher + # created one per-node install Job per targeted node (label + # kata-deploy/stage=install); use that count as the expected + # coverage rather than comparing against all nodes. 
+ expected_count=$(kubectl get jobs -n {{ .Release.Namespace }} -l kata-deploy/stage=install --no-headers 2>/dev/null | wc -l) + echo "Expected ${expected_count} node(s) to be labeled (one per per-node install Job)" + while true; do + labeled_nodes=$(kubectl get nodes -l katacontainers.io/kata-runtime=true --no-headers 2>/dev/null | wc -l) + + if [[ ${expected_count} -gt 0 ]] && [[ ${labeled_nodes} -ge ${expected_count} ]]; then + echo "All ${expected_count} targeted node(s) labeled with kata-runtime=true" + kubectl get nodes -L katacontainers.io/kata-runtime || true + break + fi + + if [[ ${elapsed} -ge ${max_wait} ]]; then + echo "ERROR: Timeout waiting for nodes to be labeled after ${max_wait}s" + echo "Labeled nodes: ${labeled_nodes}/${expected_count}" + echo "Node labels:" + kubectl get nodes -L katacontainers.io/kata-runtime || true + exit 1 + fi + + echo "Labeled nodes: ${labeled_nodes}/${expected_count} (${elapsed}s/${max_wait}s)" + sleep 5 + elapsed=$((elapsed + 5)) + done + {{- else }} while true; do labeled_nodes=$(kubectl get nodes -l katacontainers.io/kata-runtime=true --no-headers 2>/dev/null | wc -l) total_nodes=$(kubectl get nodes --no-headers 2>/dev/null | wc -l) @@ -159,6 +223,7 @@ spec: sleep 5 elapsed=$((elapsed + 5)) done + {{- end }} # Give kubelet time to pick up the new runtime configuration after containerd restart echo "" @@ -315,6 +380,9 @@ rules: - apiGroups: ["apps"] resources: ["daemonsets"] verbs: ["get", "list", "watch"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["get", "list", "watch"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding From aebadb1ab2336d142716d8bae069b78c3a960196 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 3 Jun 2026 22:07:57 +0200 Subject: [PATCH 8/9] docs: document kata-deploy job deployment mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document the new opt-in deploymentMode: job alongside the 
default DaemonSet model in the maintained docs (not just the chart README): - helm-configuration.md: add a "Deployment Modes (DaemonSet vs Job)" section covering the dispatcher-driven staged install/cleanup pipelines, why a dispatcher is used instead of Helm-rendered per-node Jobs (O(1) release, guaranteed coverage, paced rollout, explicit privilege split), the "re-run helm upgrade to cover newly added nodes" model (no always-on reconcile component), and the node-selection precedence (job.nodes > job.nodeSelector + job.nodeSelectorExpressions) that defaults to worker nodes. - installation.md: note that the DaemonSet is the default but no longer the only model, linking to the section above. Signed-off-by: Fabiano Fidêncio Assisted-by: Cursor --- docs/helm-configuration.md | 173 +++++++++++++++++++++++++++++++++++++ docs/installation.md | 6 ++ 2 files changed, 179 insertions(+) diff --git a/docs/helm-configuration.md b/docs/helm-configuration.md index febeb7f433..aa83ecda2f 100644 --- a/docs/helm-configuration.md +++ b/docs/helm-configuration.md @@ -93,6 +93,179 @@ customRuntimes: Again, view the default [`values.yaml`](#parameters) file for more details. +## Deployment Modes (DaemonSet vs Job) + +The chart can install Kata on nodes in one of two ways, selected with the +top-level `deploymentMode` value: + +- **`daemonset`** (default): the long-running `kata-deploy` DaemonSet installs + Kata on every matching node and reverts it when the pod is terminated (i.e. on + uninstall). This is the historical behavior and is unchanged. +- **`job`**: there is **no always-on component**. A tiny *dispatcher* Job + (`kata-deploy-job-dispatcher`) runs as a `post-install`/`post-upgrade` hook, + enumerates the selected nodes **live** via the Kubernetes API, and creates one + node-pinned install `Job` per node.
Each per-node Job runs the staged install + pipeline as ordered `initContainers` and then exits: + + ``` + host-check -> artifacts -> cri (initContainers) -> label (main) + ``` + + On `helm uninstall`, a `pre-delete` dispatcher fans out per-node Jobs that run + the pipeline in reverse (`unlabel -> revert-cri -> remove-artifacts`). Unlike + the DaemonSet, **nothing keeps running on the node after installation + completes**, and the dispatcher itself only ever talks to the API server — it + never touches the host (so it ships as a separate, minimal image, + `job.dispatcherImage`). + + The privilege split is explicit: the dispatcher pod runs **fully unprivileged** + (`runAsNonRoot`, all capabilities dropped, no privilege escalation, read-only + root filesystem, `RuntimeDefault` seccomp) under a **dedicated minimal + ServiceAccount** whose only rights are `nodes: list` (cluster-scoped) and + managing Jobs in the release namespace. All privileged, host-mutating work + stays in the per-node Jobs, which continue to use the `kata-deploy` + ServiceAccount. + +```yaml title="values.yaml" +deploymentMode: job +``` + +### Why a dispatcher instead of Helm-rendered per-node Jobs + +Rendering one Job per node directly in the chart does not scale: Helm stores the +whole rendered release in a single Secret (capped at ~1 MiB) and runs hook resources +sequentially, so large fleets blow the size limit and/or take far too long. A +single `Indexed Job` or a `JobSet` removes those limits but **cannot guarantee +one pod per node** once `parallelism < node-count`: Kubernetes' topology-spread +and affinity scheduling ignore *completed* pods, so as paced pods finish, later +pods pile onto a subset of nodes and leave others uncovered.
+ +The dispatcher sidesteps both problems: the Helm release stays O(1) (just the +dispatcher + a constant-size ConfigMap holding the per-node Job templates), node +membership is resolved at run time, and the dispatcher itself paces the rollout +(at most `job.parallelism` per-node Jobs in flight) while **guaranteeing one Job +per node**. Per-node Jobs are garbage-collected via an `ownerReference` to the +dispatcher and `job.ttlSecondsAfterFinished`. + +### Adding nodes in `job` mode + +The dispatcher only runs on `helm install` / `helm upgrade` / `helm uninstall`. +There is **no dispatcher watching for new nodes**, so when you add nodes later, +re-run `helm upgrade`; the dispatcher re-enumerates the cluster and installs the +new nodes: + +```sh +helm upgrade kata-deploy "${CHART}" --version "${VERSION}" --reuse-values +``` + +Each per-node stage is idempotent (it skips when already applied), so the +upgrade only does real work on the newly added nodes. + +### Recovering from a failed or deleted dispatcher + +The dispatcher runs as a **blocking** `post-install`/`post-upgrade` hook Job with +`restartPolicy: Never` and `backoffLimit: 0`, so if its pod is evicted, drained, +or deleted mid-rollout the Job is marked *failed* and is **not** restarted +automatically — `helm install`/`helm upgrade` surfaces the failure rather than +leaving you silently half-installed. + +What survives the dispatcher dying: + +- **Per-node Jobs already created keep running.** They are independent, + `nodeName`-pinned Jobs, not children of the dispatcher pod, so installs that + were already dispatched run to completion and those nodes get labeled. Only + nodes still queued (never dispatched) are skipped, so at worst you get + *partial coverage* — never a half-mutated host, because each stage is + idempotent. 
+- Those per-node Jobs carry a (non-controller) `ownerReference` to the dispatcher + Job, so they survive *pod* deletion but are garbage-collected once the + dispatcher **Job** itself is removed or its `job.ttlSecondsAfterFinished` + elapses. Keep that TTL comfortably larger than a single node's install so + in-flight Jobs are not reaped early. + +Recovery is the same one-liner as adding nodes — re-run `helm upgrade`: + +```sh +helm upgrade kata-deploy "${CHART}" --version "${VERSION}" --reuse-values +``` + +The `before-hook-creation` delete policy first removes the stale dispatcher Job +(cascading away any leftover per-node Jobs); the fresh dispatcher then +re-enumerates nodes live, recreates the per-node Jobs (adopting any that still +exist rather than duplicating them), and because every stage is idempotent the +already-installed nodes are fast no-ops. Coverage converges on the re-run. + +### Choosing which nodes get a Job + +In `job` mode, node selection is configured under the `job` key, with the +following precedence (highest first): + +1. `job.nodes`: an explicit list of node names, passed to the dispatcher verbatim. +2. `job.nodeSelector` (an equality map) **ANDed with** + `job.nodeSelectorExpressions` (Kubernetes label-selector requirements using + the operators `In`, `NotIn`, `Exists`, `DoesNotExist`). These are compiled + into a single label-selector string that the dispatcher resolves live. +3. If both are empty, **all** nodes are targeted. + +By **default the expressions target worker (non-control-plane) nodes**, so no +custom node labeling is required (this differs from the DaemonSet `nodeSelector` +examples above, which rely on you labeling nodes). Override as needed: + +```yaml title="values.yaml" +# Target nodes carrying a specific label: +job: + nodeSelector: + kata-containers: "enabled" + +# Target every node, including control-plane (e.g. 
single-node clusters / CI): +job: + nodeSelectorExpressions: [] + +# Richer expressions: +job: + nodeSelectorExpressions: + - { key: kubernetes.io/os, operator: In, values: ["linux"] } + - { key: node-role.kubernetes.io/control-plane, operator: DoesNotExist } + +# Pin to explicit nodes: +job: + nodes: ["worker-1", "worker-2"] +``` + +Use `job.parallelism` to pace the rollout — it caps how many per-node Jobs run +concurrently (e.g. to limit how many CRI runtimes restart at once on a big +fleet). It is effectively capped at the number of targeted nodes. + +### Choosing which nodes are cleaned up on uninstall + +Because the cleanup dispatcher resolves nodes **live when it runs** at +`helm uninstall` (the dispatcher does the lookup, not Helm at render time), the +node set is *not* frozen into the stored release. This means the **default +cleanup selector can simply be "nodes carrying the +`katacontainers.io/kata-runtime` label"** — i.e. exactly the nodes the install +actually labeled, regardless of how the install selector has drifted since. + +Override it under `job.cleanup`, with the same precedence/semantics as install +(`cleanup.nodes`, then `cleanup.nodeSelector` ANDed with +`cleanup.nodeSelectorExpressions`, else all nodes): + +```yaml title="values.yaml" +# Only uninstall from specific nodes: +job: + cleanup: + nodes: ["worker-1"] + +# Use an explicit selector instead of the kata-runtime label default: +job: + cleanup: + nodeSelectorExpressions: + - { key: node-role.kubernetes.io/control-plane, operator: DoesNotExist } +``` + +See the default [`values.yaml`](#parameters) for the remaining `job.*` options +(e.g. `dispatcherImage`, `parallelism`, `ttlSecondsAfterFinished`, +`backoffLimit`). + ## Examples We provide a few examples that you can pass to helm via the `-f`/`--values` flag. 
diff --git a/docs/installation.md b/docs/installation.md index f3d21a3118..f54f752cc7 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -40,6 +40,12 @@ $ helm show values "${CHART}" --version "${VERSION}" This installs the `kata-deploy` DaemonSet and the default Kata `RuntimeClass` resources on your cluster. +> **Note:** the DaemonSet is the default install model, but it is no longer the +> only one. You can instead install Kata via short-lived, staged per-node Jobs +> (no always-on component on the node) by setting `deploymentMode: job`. See +> [Deployment Modes (DaemonSet vs Job)](helm-configuration.md#deployment-modes-daemonset-vs-job) +> for details and node-selection options. + To see what versions of the chart are available: ```sh From aa274908019b20dffd880df62b526d1f2c2759fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 10 Jun 2026 15:00:18 +0200 Subject: [PATCH 9/9] kata-deploy: track distroless static base by tag, not digest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kata-deploy main image pinned its gcr.io/distroless/static-debian13 base by sha256 digest. distroless does not publish versioned tags, so a pinned digest just goes stale with no clear upgrade path. Track the rolling tag instead (guarded with a hadolint DL3007 ignore plus a comment explaining why), matching the kata-deploy-job-dispatcher image base. 
Signed-off-by: Fabiano Fidêncio Assisted-by: Cursor --- tools/packaging/kata-deploy/Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/packaging/kata-deploy/Dockerfile b/tools/packaging/kata-deploy/Dockerfile index b5b80ff640..152c0f583d 100644 --- a/tools/packaging/kata-deploy/Dockerfile +++ b/tools/packaging/kata-deploy/Dockerfile @@ -111,7 +111,10 @@ RUN \ esac #### kata-deploy main image -FROM gcr.io/distroless/static-debian13@sha256:972618ca78034aaddc55864342014a96b85108c607372f7cbd0dbd1361f1d841 +# distroless does not publish pinned/versioned tags - only rolling ones +# (latest, nonroot, debug) - so :latest is the intended way to consume it. +# hadolint ignore=DL3007 +FROM gcr.io/distroless/static-debian13:latest ARG DESTINATION=/opt/kata-artifacts