Mirror of https://github.com/kata-containers/kata-containers.git (synced 2026-02-27 01:02:12 +00:00)

Compare commits: `remove-ins` ... `topic/kata` (4 commits)

- 622b912369
- 62fef5a5e4
- 2e9ed9aa4c
- e8a896aaa2
`.github/workflows/build-helm-image.yaml` (new file, vendored, 75 lines)

@@ -0,0 +1,75 @@
```yaml
name: Build helm multi-arch image

on:
  schedule:
    # Run every Sunday at 12:00 UTC (12 hours after kubectl image build)
    - cron: '0 12 * * 0'
  workflow_dispatch:
    # Allow manual triggering
  push:
    branches:
      - main
    paths:
      - 'tools/packaging/helm/Dockerfile'
      - '.github/workflows/build-helm-image.yaml'

permissions: {}

env:
  REGISTRY: quay.io
  IMAGE_NAME: kata-containers/helm

jobs:
  build-and-push:
    name: Build and push multi-arch image
    runs-on: ubuntu-24.04
    permissions:
      contents: read
      packages: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          persist-credentials: false

      - name: Set up QEMU
        uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0

      - name: Login to Quay.io
        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ vars.QUAY_DEPLOYER_USERNAME }}
          password: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}

      - name: Get helm version
        id: helm-version
        run: |
          HELM_VERSION=$(curl -s https://api.github.com/repos/helm/helm/releases/latest | grep '"tag_name"' | sed -E 's/.*"([^"]+)".*/\1/')
          echo "version=${HELM_VERSION}" >> "$GITHUB_OUTPUT"

      - name: Generate image metadata
        id: meta
        uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804 # v5.7.0
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          tags: |
            type=raw,value=latest
            type=raw,value={{date 'YYYYMMDD'}}
            type=raw,value=${{ steps.helm-version.outputs.version }}
            type=sha,prefix=

      - name: Build and push multi-arch image
        uses: docker/build-push-action@ca052bb54ab0790a636c9b5f226502c73d547a25 # v5.4.0
        with:
          context: tools/packaging/helm/
          file: tools/packaging/helm/Dockerfile
          platforms: linux/amd64,linux/arm64,linux/s390x,linux/ppc64le
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
```
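Once the image is published, the multi-arch manifest can be spot-checked with a standard Buildx command (an illustrative check, not part of the workflow):

```bash
# List the platforms present in the published manifest list
docker buildx imagetools inspect quay.io/kata-containers/helm:latest
```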
`.github/workflows/release.yaml` (vendored, 6 changed lines)

```diff
@@ -284,11 +284,15 @@ jobs:
           echo "${QUAY_DEPLOYER_PASSWORD}" | helm registry login quay.io --username "${QUAY_DEPLOYER_USERNAME}" --password-stdin
           echo "${GITHUB_TOKEN}" | helm registry login ghcr.io --username "${GITHUB_ACTOR}" --password-stdin
 
-      - name: Push helm chart to the OCI registries
+      - name: Push helm charts to the OCI registries
         run: |
           release_version=$(./tools/packaging/release/release.sh release-version)
+          # Push kata-deploy chart
           helm push "kata-deploy-${release_version}.tgz" oci://quay.io/kata-containers/kata-deploy-charts
           helm push "kata-deploy-${release_version}.tgz" oci://ghcr.io/kata-containers/kata-deploy-charts
+          # Push kata-lifecycle-manager chart
+          helm push "kata-lifecycle-manager-${release_version}.tgz" oci://quay.io/kata-containers/kata-deploy-charts
+          helm push "kata-lifecycle-manager-${release_version}.tgz" oci://ghcr.io/kata-containers/kata-deploy-charts
 
   publish-release:
     name: publish-release
```
`docs/Kata-Containers-Lifecycle-Management.md` (new file, 118 lines)

@@ -0,0 +1,118 @@

# Kata Containers Lifecycle Management

## Overview

Kata Containers lifecycle management in Kubernetes consists of two operations:

1. **Installation** - Deploy Kata Containers to cluster nodes
2. **Upgrades** - Update Kata Containers to newer versions without disrupting workloads

The Kata Containers project provides two Helm charts to address these needs:

| Chart | Purpose |
|-------|---------|
| `kata-deploy` | Initial installation and configuration |
| `kata-lifecycle-manager` | Orchestrated rolling upgrades with verification |

---

## Installation with kata-deploy

The `kata-deploy` Helm chart installs Kata Containers across all (or selected) nodes using a Kubernetes DaemonSet. When deployed, it:

- Installs Kata runtime binaries on each node
- Configures the container runtime (containerd) to use Kata
- Registers RuntimeClasses (`kata-qemu-nvidia-gpu-snp`, `kata-qemu-nvidia-gpu-tdx`, `kata-qemu-nvidia-gpu`, etc.)
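A typical installation, using the same command shown in the chart README later in this changeset:

```bash
# Install kata-deploy from the published OCI registry
helm install kata-deploy oci://ghcr.io/kata-containers/kata-deploy-charts/kata-deploy \
  --namespace kube-system
```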
After installation, workloads can use Kata isolation by specifying `runtimeClassName: kata-qemu-nvidia-gpu-snp` (or another Kata RuntimeClass) in their pod spec.
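For example, a minimal sketch of such a pod (the pod name and command are illustrative; any registered Kata RuntimeClass works):

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: kata-demo                               # hypothetical name
spec:
  runtimeClassName: kata-qemu-nvidia-gpu-snp    # or another Kata RuntimeClass
  restartPolicy: Never
  containers:
  - name: app
    image: quay.io/kata-containers/alpine-bash-curl:latest
    command: ["uname", "-a"]   # prints the guest kernel, confirming VM isolation
```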
---

## Upgrades with kata-lifecycle-manager

### The Problem

Standard `helm upgrade kata-deploy` updates all nodes simultaneously via the DaemonSet. This approach:

- Provides no per-node verification
- Offers no controlled rollback mechanism
- Can leave the cluster in an inconsistent state if something fails

### The Solution

The `kata-lifecycle-manager` Helm chart uses Argo Workflows to orchestrate upgrades with the following guarantees:

| Guarantee | Description |
|-----------|-------------|
| **Sequential Processing** | Nodes are upgraded one at a time |
| **Per-Node Verification** | A user-provided pod validates Kata functionality after each node upgrade |
| **Fail-Fast** | If verification fails, the workflow stops immediately |
| **Automatic Rollback** | On failure, Helm rollback is executed and the node is restored |

### Upgrade Flow

For each node in the cluster:

1. **Cordon** - Mark the node as unschedulable
2. **Drain** (optional) - Evict existing workloads
3. **Upgrade** - Run `helm upgrade kata-deploy` targeting this node
4. **Wait** - Ensure the kata-deploy DaemonSet pod is ready
5. **Verify** - Run the verification pod to confirm Kata works
6. **Uncordon** - Mark the node as schedulable again

If verification fails on any node, the workflow:
- Rolls back the Helm release
- Uncordons the node
- Stops processing (remaining nodes are not upgraded)

### Verification Pod

Users must provide a verification pod that tests Kata functionality. This pod:

- Uses a Kata RuntimeClass
- Is scheduled on the specific node being verified
- Runs whatever validation logic the user requires (smoke tests, attestation checks, etc.)

**GPU Verification Example:**

For clusters with NVIDIA GPUs, the CUDA VectorAdd sample provides a more comprehensive verification than a basic smoke test:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: ${TEST_POD}
spec:
  runtimeClassName: kata-qemu-nvidia-gpu-snp # or kata-qemu-nvidia-gpu-tdx
  restartPolicy: Never
  nodeSelector:
    kubernetes.io/hostname: ${NODE}
  containers:
  - name: cuda-vectoradd
    image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0-ubuntu22.04
    resources:
      limits:
        nvidia.com/pgpu: "1"
        memory: 16Gi
```

This verifies that GPU passthrough works correctly with the upgraded Kata runtime.

The placeholders `${NODE}` and `${TEST_POD}` are substituted at runtime.
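A minimal sketch of what that substitution amounts to, using `envsubst` (the chart's actual mechanism may differ):

```bash
# Substitute per-node placeholders into the user-supplied pod spec
export NODE="worker-1"                  # hypothetical node being verified
export TEST_POD="kata-verify-worker-1"  # hypothetical generated pod name
envsubst '${NODE} ${TEST_POD}' < verification-pod.yaml | kubectl apply -f -
```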
---

## Demo Recordings

| Demo | Description | Link |
|------|-------------|------|
| Sunny Path | Successful upgrade from 3.24.0 to 3.25.0 | [TODO] |
| Rainy Path | Failed verification triggers rollback | [TODO] |

---

## References

- [kata-deploy Helm Chart](tools/packaging/kata-deploy/helm-chart/README.md)
- [kata-lifecycle-manager Helm Chart](tools/packaging/kata-deploy/helm-chart/kata-lifecycle-manager/README.md)
- [kata-lifecycle-manager Design Document](docs/design/kata-lifecycle-manager-design.md)
```diff
@@ -28,13 +28,15 @@ Bug fixes are released as part of `MINOR` or `MAJOR` releases only. `PATCH` is a
 
 ## Release Process
 
-### Bump the `VERSION` and `Chart.yaml` file
+### Bump the `VERSION` and `Chart.yaml` files
 
 When the `kata-containers/kata-containers` repository is ready for a new release,
 first create a PR to set the release in the [`VERSION`](./../VERSION) file and update the
-`version` and `appVersion` in the
-[`Chart.yaml`](./../tools/packaging/kata-deploy/helm-chart/kata-deploy/Chart.yaml) file and
-have it merged.
+`version` and `appVersion` in the following `Chart.yaml` files:
+- [`kata-deploy/Chart.yaml`](./../tools/packaging/kata-deploy/helm-chart/kata-deploy/Chart.yaml)
+- [`kata-lifecycle-manager/Chart.yaml`](./../tools/packaging/kata-deploy/helm-chart/kata-lifecycle-manager/Chart.yaml)
+
+Have the PR merged before proceeding.
 
 ### Lock the `main` branch
```
```diff
@@ -19,6 +19,7 @@ Kata Containers design documents:
 - [Design for direct-assigned volume](direct-blk-device-assignment.md)
 - [Design for core-scheduling](core-scheduling.md)
 - [Virtualization Reference Architecture](kata-vra.md)
+- [Design for kata-lifecycle-manager Helm chart](kata-lifecycle-manager-design.md)
 ---
 
 - [Design proposals](proposals)
```
`docs/design/kata-lifecycle-manager-design.md` (new file, 502 lines)

@@ -0,0 +1,502 @@

# Kata Containers Lifecycle Manager Design

## Summary

This document proposes a Helm chart-based orchestration solution for Kata Containers that
enables controlled, node-by-node upgrades with verification and rollback capabilities
using Argo Workflows.

## Motivation

### Problem Statement

Upgrading Kata Containers in a production Kubernetes cluster presents several challenges:

1. **Workload Scheduling Control**: New Kata workloads should not be scheduled on a node
   during upgrade until the new runtime is verified.

2. **Verification Gap**: There is no standardized way to verify that Kata is working correctly
   after an upgrade before allowing workloads to return to the node. This solution addresses
   the gap by running a user-provided verification pod on each upgraded node.

3. **Rollback Complexity**: If an upgrade fails, administrators must manually coordinate
   rollback across multiple nodes.

4. **Controlled Rollout**: Operators need the ability to upgrade nodes incrementally
   (canary approach) with fail-fast behavior if any node fails verification.

5. **Multi-Architecture Support**: The upgrade tooling must work across all architectures
   supported by Kata Containers (amd64, arm64, s390x, ppc64le).

### Current State

The `kata-deploy` Helm chart provides installation and configuration of Kata Containers,
including a post-install verification job. However, there is no built-in mechanism for
orchestrating upgrades across nodes in a controlled manner.

## Goals

1. Provide a standardized, automated way to upgrade Kata Containers node-by-node
2. Ensure each node is verified before returning to service
3. Support user-defined verification logic
4. Automatically roll back if verification fails
5. Work with the existing `kata-deploy` Helm chart
6. Support all Kata-supported architectures

## Non-Goals

1. Initial Kata Containers installation (use the kata-deploy Helm chart for that)
2. Managing Kubernetes cluster upgrades
3. Providing Kata-specific verification logic (this is the user's responsibility)
4. Managing Argo Workflows installation

## Argo Workflows Dependency

### What Works Without Argo

The following components work independently of Argo Workflows:

| Component | Description |
|-----------|-------------|
| **kata-deploy Helm chart** | Full installation, configuration, RuntimeClasses |
| **Post-install verification** | Helm hook runs a verification pod after install |
| **Label-gated deployment** | Progressive rollout via node labels |
| **Manual upgrades** | User can script: cordon, `helm upgrade`, verify, uncordon |

Users who do not want Argo can still:
- Install and configure Kata via kata-deploy
- Perform upgrades manually or with custom scripts
- Use the verification pod pattern in their own automation

### What Requires Argo

The kata-lifecycle-manager Helm chart provides orchestration via Argo Workflows:

| Feature | Description |
|---------|-------------|
| **Automated node-by-node upgrades** | Sequential processing with fail-fast |
| **Taint-based node selection** | Select nodes by taint key/value |
| **`WorkflowTemplate`** | Reusable upgrade workflow |
| **Rollback entrypoint** | `argo submit --entrypoint rollback-node` |
| **Status tracking** | Node annotations updated at each phase |

### For Users Already Using Argo

If your cluster already has Argo Workflows installed:

```bash
# Install kata-lifecycle-manager - integrates with your existing Argo installation
helm install kata-lifecycle-manager oci://ghcr.io/kata-containers/kata-deploy-charts/kata-lifecycle-manager \
  --set argoNamespace=argo \
  --set-file defaults.verificationPod=./verification-pod.yaml

# Trigger upgrades via argo CLI or integrate with existing workflows
argo submit -n argo --from workflowtemplate/kata-lifecycle-manager -p target-version=3.25.0
```

kata-lifecycle-manager can also be triggered by other Argo workflows, CI/CD pipelines, or GitOps
tools that support Argo.

### For Users Not Wanting Argo

If you prefer not to use Argo Workflows:

1. **Use kata-deploy directly** - handles installation and basic verification
2. **Script your own orchestration** - example approach:

```bash
#!/bin/bash
# Manual upgrade script (no Argo required)
set -euo pipefail

VERSION="3.25.0"

# Upgrade each node with the Kata runtime
kubectl get nodes -l katacontainers.io/kata-runtime=true -o name | while read -r node_path; do
  NODE="${node_path#node/}"
  echo "Upgrading $NODE..."
  kubectl cordon "$NODE"

  helm upgrade kata-deploy oci://ghcr.io/kata-containers/kata-deploy-charts/kata-deploy \
    --namespace kube-system \
    --version "$VERSION" \
    --reuse-values \
    --wait

  # Wait for the kata-deploy DaemonSet rollout to complete
  kubectl rollout status daemonset/kata-deploy -n kube-system

  # Run verification (apply your pod, wait, check exit code)
  kubectl apply -f verification-pod.yaml
  kubectl wait pod/kata-verify --for=jsonpath='{.status.phase}'=Succeeded --timeout=180s
  kubectl delete pod/kata-verify

  kubectl uncordon "$NODE"
  echo "$NODE upgraded successfully"
done
```

This approach requires more manual effort but avoids the Argo dependency.

## Proposed Design

### Architecture Overview

```text
┌─────────────────────────────────────────────────────────────────┐
│                    Argo Workflows Controller                    │
│                         (pre-installed)                         │
└────────────────────────────┬────────────────────────────────────┘
                             │
                             ▼
┌──────────────────────────────────────────────────────────────┐
│              kata-lifecycle-manager Helm Chart               │
│  ┌────────────────────────────────────────────────────────┐  │
│  │ WorkflowTemplate                                       │  │
│  │  - upgrade-all-nodes (entrypoint)                      │  │
│  │  - upgrade-single-node (per-node steps)                │  │
│  │  - rollback-node (manual recovery)                     │  │
│  └────────────────────────────────────────────────────────┘  │
│  ┌────────────────────────────────────────────────────────┐  │
│  │ RBAC Resources                                         │  │
│  │  - ServiceAccount                                      │  │
│  │  - ClusterRole (node, pod, helm operations)            │  │
│  │  - ClusterRoleBinding                                  │  │
│  └────────────────────────────────────────────────────────┘  │
└─────────────────────────────┬────────────────────────────────┘
                              │
                              ▼
┌─────────────────────────────────────────────────────────────────┐
│                      kata-deploy Helm Chart                     │
│                     (existing installation)                     │
└─────────────────────────────────────────────────────────────────┘
```

### Upgrade Flow

For each node selected by the upgrade label:

```text
┌────────────┐    ┌──────────────┐    ┌────────────┐    ┌────────────┐
│  Prepare   │───▶│    Cordon    │───▶│  Upgrade   │───▶│ Wait Ready │
│ (annotate) │    │    (mark     │    │   (helm    │    │(kata-deploy│
│            │    │unschedulable)│    │  upgrade)  │    │ DaemonSet) │
└────────────┘    └──────────────┘    └────────────┘    └──────┬─────┘
                                                               │
                                                               ▼
                  ┌────────────┐    ┌──────────────┐    ┌────────────┐
                  │  Complete  │◀───│   Uncordon   │◀───│   Verify   │
                  │ (annotate  │    │    (mark     │    │ (user pod) │
                  │  version)  │    │ schedulable) │    │            │
                  └────────────┘    └──────────────┘    └────────────┘
```

**Note:** Drain is not required for Kata upgrades. Running Kata VMs continue using
the in-memory binaries. Only new workloads use the upgraded binaries. Cordon ensures
the verification pod runs before any new workloads are scheduled with the new runtime.

**Optional Drain:** For users who prefer to evict workloads before any maintenance
operation, an optional drain step can be enabled via `drain-enabled=true`. When
enabled, an additional drain step runs after cordon and before upgrade.

### Node Selection Model

Nodes can be selected for upgrade using **labels**, **taints**, or **both**.

**Label-based selection:**

```bash
# Select nodes by label
argo submit -n argo --from workflowtemplate/kata-lifecycle-manager \
  -p target-version=3.25.0 \
  -p node-selector="katacontainers.io/kata-lifecycle-manager-window=true"
```

**Taint-based selection:**

Some organizations use taints to mark nodes for maintenance. The workflow supports
selecting nodes by taint key and, optionally, taint value:

```bash
# Select nodes with a specific taint
kubectl taint nodes worker-1 kata-lifecycle-manager=pending:NoSchedule

argo submit -n argo --from workflowtemplate/kata-lifecycle-manager \
  -p target-version=3.25.0 \
  -p node-taint-key=kata-lifecycle-manager \
  -p node-taint-value=pending
```

**Combined selection:**

Labels and taints can be used together for precise targeting:

```bash
argo submit -n argo --from workflowtemplate/kata-lifecycle-manager \
  -p target-version=3.25.0 \
  -p node-selector="node-pool=kata-pool" \
  -p node-taint-key=maintenance
```

This allows operators to:
1. Upgrade a single canary node first
2. Gradually add nodes to the upgrade window
3. Control upgrade timing via GitOps or automation
4. Integrate with existing taint-based maintenance workflows

### Node Pool Support

The node selector and taint selector parameters enable basic node pool targeting:

```bash
# Upgrade only nodes matching a specific node pool label
argo submit -n argo --from workflowtemplate/kata-lifecycle-manager \
  -p target-version=3.25.0 \
  -p node-selector="node-pool=kata-pool"
```

**Current Capabilities:**

| Feature | Status | Chart | Notes |
|---------|--------|-------|-------|
| Label-based selection | Supported | kata-lifecycle-manager | Works with any label combination |
| Taint-based selection | Supported | kata-lifecycle-manager | Select by taint key/value |
| Sequential upgrades | Supported | kata-lifecycle-manager | One node at a time with fail-fast |
| Pool-specific verification pods | Not supported | kata-lifecycle-manager | Same verification for all nodes |
| Pool-ordered upgrades | Not supported | kata-lifecycle-manager | Upgrade pool A before pool B |

See the [Potential Enhancements](#potential-enhancements) section for future work.

### Verification Model

**Verification runs on each node that is upgraded.** The node is only uncordoned after
its verification pod succeeds. If verification fails, an automatic rollback is triggered
to restore the previous version before uncordoning the node.

**Common failure modes detected by verification:**
- Pod stuck in Pending/`ContainerCreating` (runtime can't start the VM)
- Pod crashes immediately (containerd/CRI-O configuration issues)
- Pod times out (resource issues, image pull failures)
- Pod exits with a non-zero code (verification logic failed)

All of these trigger automatic rollback. The workflow logs include pod status, events,
and logs to help diagnose the issue.
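The same information can be collected by hand when diagnosing a failure; a sketch (the pod name and namespace are assumptions):

```bash
# Gather status, events, and logs for a failed verification pod
POD=kata-verify-worker-1   # hypothetical verification pod name
NS=default
kubectl -n "$NS" get pod "$POD" -o wide
kubectl -n "$NS" describe pod "$POD"
kubectl -n "$NS" get events --field-selector "involvedObject.name=$POD"
kubectl -n "$NS" logs "$POD" --all-containers
```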
The user provides a complete Pod YAML that:
- Uses the Kata runtime class they want to verify
- Contains their verification logic (e.g., attestation checks)
- Exits 0 on success, non-zero on failure
- Includes tolerations for cordoned nodes (verification runs while the node is cordoned)
- Includes a `nodeSelector` to ensure it runs on the specific node being upgraded

When upgrading multiple nodes (via label selector), nodes are processed sequentially.
For each node, the following placeholders are substituted with that node's specific values,
ensuring the verification pod runs on the exact node that was just upgraded:

- `${NODE}` - The hostname of the node being upgraded/verified
- `${TEST_POD}` - A generated unique pod name

Example verification pod:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: ${TEST_POD}
spec:
  runtimeClassName: kata-qemu
  restartPolicy: Never
  nodeSelector:
    kubernetes.io/hostname: ${NODE}
  tolerations:
    - operator: Exists # Required: node is cordoned during verification
  containers:
  - name: verify
    image: quay.io/kata-containers/alpine-bash-curl:latest
    command: ["uname", "-a"]
```

This design keeps verification logic entirely in the user's domain, supporting:
- Different runtime classes (`kata-qemu`, `kata-qemu-snp`, `kata-qemu-tdx`, etc.)
- TEE-specific attestation verification
- GPU/accelerator validation
- Custom application smoke tests

### Sequential Execution with Fail-Fast

Nodes are upgraded strictly sequentially using recursive Argo templates. This design
ensures that if any node fails verification, the workflow stops immediately before
touching remaining nodes, preventing a mixed-version fleet.

Alternative approaches considered:
- **`withParam` + semaphore**: Provides a cleaner UI, but a semaphore only controls concurrency,
  not failure propagation. Other nodes would still proceed after one fails.
- **`withParam` + `failFast`**: Would be ideal, but Argo only supports `failFast` for DAG
  tasks, not for steps with `withParam`.

The recursive template approach (`upgrade-node-chain`) naturally provides fail-fast
behavior because if any step in the chain fails, the recursion stops (see the sketch below).
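A minimal sketch of the recursion pattern (the parameter plumbing, the `node-count` parameter, and the per-node template name are assumptions; the chart's actual definition lives in `workflow-template.yaml`):

```yaml
# Each invocation upgrades one node, then calls itself with the next index.
# If the per-node step fails, Argo never reaches the recursive step, so the
# remaining nodes are left untouched (fail-fast).
- name: upgrade-node-chain
  inputs:
    parameters:
      - name: nodes-json      # JSON array of node names
      - name: node-count      # total number of nodes (assumed parameter)
      - name: current-index
  steps:
    - - name: upgrade-current-node
        template: upgrade-single-node       # assumed per-node template
        arguments:
          parameters:
            - name: nodes-json
              value: "{{inputs.parameters.nodes-json}}"
            - name: index
              value: "{{inputs.parameters.current-index}}"
    - - name: continue-chain
        template: upgrade-node-chain        # recurse for the next node
        # Terminate once the index reaches the end of the list
        when: "{{=asInt(inputs.parameters['current-index']) + 1 < asInt(inputs.parameters['node-count'])}}"
        arguments:
          parameters:
            - name: nodes-json
              value: "{{inputs.parameters.nodes-json}}"
            - name: node-count
              value: "{{inputs.parameters.node-count}}"
            - name: current-index
              value: "{{=asInt(inputs.parameters['current-index']) + 1}}"
```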
### Status Tracking

Node upgrade status is tracked via Kubernetes annotations:

| Annotation | Values |
|------------|--------|
| `katacontainers.io/kata-lifecycle-manager-status` | preparing, cordoned, draining, upgrading, verifying, completed, rolling-back, rolled-back |
| `katacontainers.io/kata-current-version` | Version string (e.g., "3.25.0") |

This enables:
- Monitoring upgrade progress via `kubectl get nodes` (see the query below)
- Integration with external monitoring systems
- Recovery from interrupted upgrades
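Since the status lives in annotations rather than labels (which is what `kubectl get nodes -L` displays), it can be read with a `custom-columns` query; a sketch:

```bash
# Show each node's lifecycle status and current Kata version annotations
kubectl get nodes -o custom-columns="NAME:.metadata.name,\
KATA_STATUS:.metadata.annotations.katacontainers\.io/kata-lifecycle-manager-status,\
KATA_VERSION:.metadata.annotations.katacontainers\.io/kata-current-version"
```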
### Rollback Support

**Automatic rollback on verification failure:** If the verification pod fails (non-zero exit),
kata-lifecycle-manager automatically:
1. Runs `helm rollback` to revert to the previous Helm release
2. Waits for the kata-deploy DaemonSet to be ready with the previous version
3. Uncordons the node
4. Annotates the node with `rolled-back` status

This ensures nodes are never left in a broken state.

**Manual rollback:** For cases where you need to roll back a successfully upgraded node:

```bash
argo submit -n argo --from workflowtemplate/kata-lifecycle-manager \
  --entrypoint rollback-node \
  -p node-name=worker-1
```

## Components

### Container Images

Two multi-architecture container images are built and published:

| Image | Purpose | Architectures |
|-------|---------|---------------|
| `quay.io/kata-containers/kubectl:latest` | Kubernetes operations | amd64, arm64, s390x, ppc64le |
| `quay.io/kata-containers/helm:latest` | Helm operations | amd64, arm64, s390x, ppc64le |

Images are rebuilt weekly to pick up security updates and tool version upgrades.

### Helm Chart Structure

```text
kata-lifecycle-manager/
├── Chart.yaml                  # Chart metadata
├── values.yaml                 # Configurable defaults
├── README.md                   # Usage documentation
└── templates/
    ├── _helpers.tpl            # Template helpers
    ├── rbac.yaml               # ServiceAccount, ClusterRole, ClusterRoleBinding
    └── workflow-template.yaml  # Argo WorkflowTemplate
```

### RBAC Requirements

The workflow requires the following permissions:

| Resource | Verbs | Purpose |
|----------|-------|---------|
| nodes | get, list, watch, patch | Cordon/uncordon, annotations |
| pods | get, list, watch, create, delete | Verification pods |
| pods/log | get | Verification output |
| daemonsets | get, list, watch | Wait for kata-deploy |
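After installation, these grants can be spot-checked with `kubectl auth can-i` (the ServiceAccount namespace and name below assume the defaults):

```bash
# Verify the workflow's ServiceAccount holds the expected permissions
SA=system:serviceaccount:argo:kata-lifecycle-manager  # assumed namespace/name
kubectl auth can-i patch nodes --as="$SA"
kubectl auth can-i create pods --as="$SA"
kubectl auth can-i list daemonsets.apps --as="$SA"
```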
## User Experience

### Installation

```bash
# Install kata-lifecycle-manager with verification config
helm install kata-lifecycle-manager oci://ghcr.io/kata-containers/kata-deploy-charts/kata-lifecycle-manager \
  --set-file defaults.verificationPod=/path/to/verification-pod.yaml
```

### Triggering an Upgrade

```bash
# Label nodes for upgrade
kubectl label node worker-1 katacontainers.io/kata-lifecycle-manager-window=true

# Submit upgrade workflow
argo submit -n argo --from workflowtemplate/kata-lifecycle-manager \
  -p target-version=3.25.0

# Watch progress
argo watch @latest
```

### Monitoring

```bash
kubectl get nodes \
  -L katacontainers.io/kata-runtime \
  -L katacontainers.io/kata-lifecycle-manager-status \
  -L katacontainers.io/kata-current-version
```

## Security Considerations

1. **Namespace-Scoped Templates**: The chart creates a `WorkflowTemplate` (namespace-scoped)
   rather than a `ClusterWorkflowTemplate` by default, reducing the blast radius.

2. **Required Verification**: The chart fails to install if `defaults.verificationPod` is
   not provided, ensuring upgrades are always verified.

3. **Minimal RBAC**: The `ServiceAccount` has only the permissions required for upgrade
   operations.

4. **User-Controlled Verification**: Verification logic is entirely user-defined, avoiding
   any hardcoded assumptions about what "working" means.

## Integration with Release Process

The `kata-lifecycle-manager` chart is:
- Packaged alongside `kata-deploy` during releases
- Published to the same OCI registries (`quay.io`, `ghcr.io`)
- Versioned to match `kata-deploy`

## Potential Enhancements

The following enhancements could be considered if needed:

### kata-lifecycle-manager

1. **Pool-Specific Verification**: Different verification pods for different node pools
   (e.g., GPU nodes vs. CPU-only nodes).

2. **Ordered Pool Upgrades**: Upgrade node pool A completely before starting pool B.

## Alternatives Considered

### 1. DaemonSet-Based Upgrades

Using a DaemonSet to coordinate upgrades on each node.

**Rejected because**: DaemonSets don't provide the node-by-node sequencing and
verification workflow needed for controlled upgrades.

### 2. Operator Pattern

Building a Kubernetes Operator to manage upgrades.

**Rejected because**: It adds significant complexity and maintenance burden. Argo Workflows
is already widely adopted and provides the orchestration primitives needed.

### 3. Shell Script Orchestration

Providing a shell script that loops through nodes.

**Rejected because**: Scripts are less reliable, harder to monitor, offer no built-in
retry/recovery, and don't integrate with Kubernetes-native tooling.

## References

- [kata-deploy Helm Chart](https://github.com/kata-containers/kata-containers/tree/main/tools/packaging/kata-deploy/helm-chart/kata-deploy)
- [Argo Workflows](https://argoproj.github.io/argo-workflows/)
- [Helm Documentation](https://helm.sh/docs/)
`tools/packaging/helm/Dockerfile` (new file, 34 lines)

@@ -0,0 +1,34 @@
```dockerfile
# Copyright (c) 2026 Kata Contributors
#
# SPDX-License-Identifier: Apache-2.0

# Helm image based on kata-containers/kubectl for multi-arch support
# Used for kata-lifecycle-manager workflows and other helm operations

# hadolint ignore=DL3007
FROM quay.io/kata-containers/kubectl:latest

# Use bash with pipefail for safer pipe handling
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Download and install helm
# hadolint ignore=DL3018
RUN ARCH=$(uname -m) && \
    case "${ARCH}" in \
        x86_64) HELM_ARCH=amd64 ;; \
        aarch64) HELM_ARCH=arm64 ;; \
        ppc64le) HELM_ARCH=ppc64le ;; \
        s390x) HELM_ARCH=s390x ;; \
        *) echo "Unsupported architecture: ${ARCH}" && exit 1 ;; \
    esac && \
    HELM_VERSION=$(curl -s https://api.github.com/repos/helm/helm/releases/latest | grep '"tag_name"' | sed -E 's/.*"([^"]+)".*/\1/') && \
    curl -fL --progress-bar -o /tmp/helm.tar.gz \
        "https://get.helm.sh/helm-${HELM_VERSION}-linux-${HELM_ARCH}.tar.gz" && \
    tar -xzf /tmp/helm.tar.gz -C /tmp && \
    mv "/tmp/linux-${HELM_ARCH}/helm" /usr/local/bin/helm && \
    rm -rf /tmp/helm.tar.gz "/tmp/linux-${HELM_ARCH}" && \
    chmod +x /usr/local/bin/helm && \
    helm version --short

# Default to bash shell
CMD ["/bin/bash"]
```
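A quick local smoke test of the resulting image (illustrative; the tag is arbitrary):

```bash
# Build for the host architecture and confirm both tools are present
docker build -t kata-helm-test tools/packaging/helm/
docker run --rm kata-helm-test helm version --short
docker run --rm kata-helm-test kubectl version --client
```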
````diff
@@ -448,3 +448,7 @@ kata-qemu-snp-cicd     kata-qemu-snp-cicd     77s
 kata-qemu-tdx-cicd     kata-qemu-tdx-cicd     77s
 kata-stratovirt-cicd   kata-stratovirt-cicd   77s
 ```
+
+## Related Charts
+
+- [kata-lifecycle-manager](kata-lifecycle-manager/README.md) - Argo Workflows-based lifecycle management for controlled, node-by-node upgrades with verification and automatic rollback
````
`tools/packaging/kata-deploy/helm-chart/kata-lifecycle-manager/Chart.yaml` (new file, 32 lines)

@@ -0,0 +1,32 @@

```yaml
# Copyright (c) 2026 The Kata Containers Authors
# SPDX-License-Identifier: Apache-2.0

apiVersion: v2
name: kata-lifecycle-manager
description: Argo Workflows-based lifecycle management for Kata Containers

type: application

# Chart version - follows kata-containers versioning
version: "3.26.0"

# App version - matches kata-containers
appVersion: "3.26.0"

keywords:
  - kata-containers
  - lifecycle-management
  - upgrade
  - argo-workflows
  - gitops

home: https://katacontainers.io

sources:
  - https://github.com/kata-containers/kata-containers

maintainers:
  - name: Kata Containers Community

annotations:
  kata-containers.io/companion-chart: kata-deploy
```
`tools/packaging/kata-deploy/helm-chart/kata-lifecycle-manager/README.md` (new file, 333 lines)

@@ -0,0 +1,333 @@

# Kata Lifecycle Manager Helm Chart

Argo Workflows-based lifecycle management for Kata Containers.

This chart installs a namespace-scoped `WorkflowTemplate` that performs controlled,
node-by-node upgrades of kata-deploy with verification and automatic rollback on failure.

## Prerequisites

- Kubernetes cluster with kata-deploy installed via Helm
- [Argo Workflows](https://argoproj.github.io/argo-workflows/) v3.4+ installed
- `helm` and `argo` CLI tools
- **Verification pod spec** (see [Verification Pod](#verification-pod-required))

## Installation

```bash
# From OCI registry (when published)
helm install kata-lifecycle-manager oci://ghcr.io/kata-containers/kata-deploy-charts/kata-lifecycle-manager

# From local source
helm install kata-lifecycle-manager ./kata-lifecycle-manager
```

## Verification Pod (Required)

A verification pod is **required** to validate each node after upgrade. The chart
will fail to install without one.

### Option A: Bake into kata-lifecycle-manager (recommended)

Provide the verification pod when installing the chart:

```bash
helm install kata-lifecycle-manager ./kata-lifecycle-manager \
  --set-file defaults.verificationPod=./my-verification-pod.yaml
```

This verification pod is baked into the `WorkflowTemplate` and used for all upgrades.

### Option B: Override at workflow submission

One-off override for a specific upgrade run. The pod spec must be base64-encoded
because Argo workflow parameters don't handle multi-line YAML reliably:

```bash
argo submit -n argo --from workflowtemplate/kata-lifecycle-manager \
  -p target-version=3.25.0 \
  -p verification-pod="$(base64 -w0 < ./my-verification-pod.yaml)"
```

**Note:** During the helm upgrade, `kata-deploy`'s own verification is disabled
(`--set verification.pod=""`). This is because `kata-deploy`'s verification is
cluster-wide (designed for the initial install), while kata-lifecycle-manager performs
per-node verification with proper placeholder substitution.

### Verification Pod Spec

Create a pod spec that validates your Kata deployment. The pod should exit 0 on success,
non-zero on failure.

**Example (`my-verification-pod.yaml`):**

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: ${TEST_POD}
spec:
  runtimeClassName: kata-qemu
  restartPolicy: Never
  nodeSelector:
    kubernetes.io/hostname: ${NODE}
  tolerations:
    - operator: Exists
  containers:
  - name: verify
    image: quay.io/kata-containers/alpine-bash-curl:latest
    command:
      - sh
      - -c
      - |
        echo "=== Kata Verification ==="
        echo "Node: ${NODE}"
        echo "Kernel: $(uname -r)"
        echo "SUCCESS: Pod running with Kata runtime"
```

### Placeholders

| Placeholder | Description |
|-------------|-------------|
| `${NODE}` | Node hostname being upgraded/verified |
| `${TEST_POD}` | Generated unique pod name |

**You are responsible for:**
- Setting the `runtimeClassName` in your pod spec
- Defining the verification logic in your container
- Using the exit code to indicate success (0) or failure (non-zero)

**Failure modes detected:**
- Pod stuck in Pending/`ContainerCreating` (runtime can't start the VM)
- Pod crashes immediately (containerd/CRI-O configuration issues)
- Pod times out (resource issues, image pull failures)
- Pod exits with a non-zero code (verification logic failed)

All of these trigger automatic rollback.

## Usage

### 1. Select Nodes for Upgrade

Nodes can be selected using **labels**, **taints**, or **both**.

**Option A: Label-based selection (default)**

```bash
# Label nodes for upgrade
kubectl label node worker-1 katacontainers.io/kata-lifecycle-manager-window=true

# Trigger upgrade
argo submit -n argo --from workflowtemplate/kata-lifecycle-manager \
  -p target-version=3.25.0 \
  -p node-selector="katacontainers.io/kata-lifecycle-manager-window=true"
```

**Option B: Taint-based selection**

```bash
# Taint nodes for upgrade
kubectl taint nodes worker-1 kata-lifecycle-manager=pending:NoSchedule

# Trigger upgrade using taint selector
argo submit -n argo --from workflowtemplate/kata-lifecycle-manager \
  -p target-version=3.25.0 \
  -p node-taint-key=kata-lifecycle-manager \
  -p node-taint-value=pending
```

**Option C: Combined selection**

```bash
# Use both labels and taints for precise targeting
argo submit -n argo --from workflowtemplate/kata-lifecycle-manager \
  -p target-version=3.25.0 \
  -p node-selector="node-pool=kata-pool" \
  -p node-taint-key=kata-lifecycle-manager
```

### 2. Trigger Upgrade

```bash
argo submit -n argo --from workflowtemplate/kata-lifecycle-manager \
  -p target-version=3.25.0

# Watch progress
argo watch @latest
```

### 3. Sequential Upgrade Behavior

Nodes are upgraded **sequentially** (one at a time) to ensure fleet consistency.
If any node fails verification, the workflow stops immediately and that node is
rolled back. This prevents ending up with a mixed fleet where some nodes have
the new version and others have the old version.

## Configuration

| Parameter | Description | Default |
|-----------|-------------|---------|
| `argoNamespace` | Namespace for Argo resources | `argo` |
| `defaults.helmRelease` | kata-deploy Helm release name | `kata-deploy` |
| `defaults.helmNamespace` | kata-deploy namespace | `kube-system` |
| `defaults.nodeSelector` | Node label selector (optional if using taints) | `""` |
| `defaults.nodeTaintKey` | Taint key for node selection | `""` |
| `defaults.nodeTaintValue` | Taint value filter (optional) | `""` |
| `defaults.verificationNamespace` | Namespace for verification pods | `default` |
| `defaults.verificationPod` | Pod YAML for verification **(required)** | `""` |
| `defaults.drainEnabled` | Enable node drain before upgrade | `false` |
| `defaults.drainTimeout` | Timeout for the drain operation | `300s` |
| `images.helm` | Helm container image (multi-arch) | `quay.io/kata-containers/helm:latest` |
| `images.kubectl` | `kubectl` container image (multi-arch) | `quay.io/kata-containers/kubectl:latest` |
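The same configuration can be expressed as a values file instead of `--set` flags; a sketch using the parameters above:

```yaml
# my-values.yaml - sketch; pass with: helm install ... -f my-values.yaml
argoNamespace: argo
defaults:
  helmRelease: kata-deploy
  helmNamespace: kube-system
  nodeSelector: "katacontainers.io/kata-lifecycle-manager-window=true"
  drainEnabled: false
  # verificationPod is required but multi-line, so it is usually supplied
  # with --set-file defaults.verificationPod=./my-verification-pod.yaml
```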
## Workflow Parameters

When submitting a workflow, you can override:

| Parameter | Description |
|-----------|-------------|
| `target-version` | **Required** - Target Kata version |
| `helm-release` | Helm release name |
| `helm-namespace` | Namespace of kata-deploy |
| `node-selector` | Label selector for nodes |
| `node-taint-key` | Taint key for node selection |
| `node-taint-value` | Taint value filter |
| `verification-namespace` | Namespace for verification pods |
| `verification-pod` | Pod YAML with placeholders |
| `drain-enabled` | Whether to drain nodes before upgrade |
| `drain-timeout` | Timeout for the drain operation |

## Upgrade Flow

For each node selected by the node-selector label:

1. **Prepare**: Annotate the node with upgrade status
2. **Cordon**: Mark the node as unschedulable
3. **Drain** (optional): Evict pods if `drain-enabled=true`
4. **Upgrade**: Run `helm upgrade` for kata-deploy
5. **Wait**: Wait for the kata-deploy DaemonSet pod to be ready
6. **Verify**: Run the verification pod and check its exit code
7. **On Success**: Uncordon the node, proceed to the next node
8. **On Failure**: Automatic rollback, uncordon, workflow stops

Nodes are upgraded **sequentially** (one at a time). If verification fails on any node,
the workflow stops immediately, preventing a mixed-version fleet.

### When to Use Drain

**Default (drain disabled):** Drain is not required for Kata upgrades. Running Kata
VMs continue using the in-memory binaries. Only new workloads use the upgraded
binaries.

**Optional drain:** Enable drain if you prefer to evict all workloads before any
maintenance operation, or if your organization's operational policies require it:

```bash
# Enable drain when installing the chart
helm install kata-lifecycle-manager ./kata-lifecycle-manager \
  --set defaults.drainEnabled=true \
  --set defaults.drainTimeout=600s \
  --set-file defaults.verificationPod=./my-verification-pod.yaml

# Or override at workflow submission time
argo submit -n argo --from workflowtemplate/kata-lifecycle-manager \
  -p target-version=3.25.0 \
  -p drain-enabled=true \
  -p drain-timeout=600s
```

## Rollback

**Automatic rollback on verification failure:** If the verification pod fails (non-zero exit),
kata-lifecycle-manager automatically:
1. Runs `helm rollback` to revert to the previous Helm release
2. Waits for the kata-deploy DaemonSet to be ready with the previous version
3. Uncordons the node
4. Annotates the node with `rolled-back` status

This ensures nodes are never left in a broken state.

**Manual rollback:** For cases where you need to roll back a successfully upgraded node:

```bash
argo submit -n argo --from workflowtemplate/kata-lifecycle-manager \
  --entrypoint rollback-node \
  -p node-name=worker-1
```

## Monitoring

Check node annotations to monitor upgrade progress:

```bash
kubectl get nodes \
  -L katacontainers.io/kata-lifecycle-manager-status \
  -L katacontainers.io/kata-current-version
```

| Annotation | Description |
|------------|-------------|
| `katacontainers.io/kata-lifecycle-manager-status` | Current upgrade phase |
| `katacontainers.io/kata-current-version` | Version after successful upgrade |

Status values:
- `preparing` - Upgrade starting
- `cordoned` - Node marked unschedulable
- `draining` - Draining pods (only if `drain-enabled=true`)
- `upgrading` - Helm upgrade in progress
- `verifying` - Verification pod running
- `completed` - Upgrade successful
- `rolling-back` - Rollback in progress (automatic on verification failure)
- `rolled-back` - Rollback completed

## Known Limitations

### Cluster-Wide DaemonSet Updates

The kata-deploy Helm chart uses a DaemonSet, which means `helm upgrade` updates
all nodes simultaneously at the Kubernetes level, even though this workflow
processes nodes sequentially for verification.

**Current behavior:**

1. Node A is cordoned and upgraded
2. Node A verification passes, Node A is uncordoned
3. New workloads can now start on Node A using the **new** Kata version
4. Node B verification fails
5. Automatic rollback reverts kata-deploy cluster-wide to the **old** version
6. Workloads that started on Node A between steps 2-5 continue running with
   the new version's in-memory binaries, while new workloads use the old version

This is generally acceptable because:
- Running Kata VMs continue functioning (they use in-memory binaries)
- New workloads use the rolled-back version
- The cluster reaches a consistent state for new workloads

**Future improvement:** A two-phase approach could cordon all target nodes upfront,
perform the upgrade, verify all nodes, and only uncordon after all verifications
pass. This would prevent any new workloads from using the new version until the
entire upgrade is validated, at the cost of longer node unavailability.

## For Projects Using kata-deploy

Any project that uses the kata-deploy Helm chart can install this companion chart
to get upgrade orchestration:

```bash
# Install kata-deploy
helm install kata-deploy oci://ghcr.io/kata-containers/kata-deploy-charts/kata-deploy \
  --namespace kube-system

# Install upgrade tooling with your verification config
helm install kata-lifecycle-manager oci://ghcr.io/kata-containers/kata-deploy-charts/kata-lifecycle-manager \
  --set-file defaults.verificationPod=./my-verification-pod.yaml

# Trigger upgrade
argo submit -n argo --from workflowtemplate/kata-lifecycle-manager \
  -p target-version=3.25.0
```

## License

Apache License 2.0
`tools/packaging/kata-deploy/helm-chart/kata-lifecycle-manager/templates/_helpers.tpl` (new file, 46 lines)

@@ -0,0 +1,46 @@

```
{{/*
Copyright (c) 2026 The Kata Containers Authors
SPDX-License-Identifier: Apache-2.0
*/}}

{{/*
Expand the name of the chart.
*/}}
{{- define "kata-lifecycle-manager.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Create a default fully qualified app name.
*/}}
{{- define "kata-lifecycle-manager.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}

{{/*
Common labels
*/}}
{{- define "kata-lifecycle-manager.labels" -}}
helm.sh/chart: {{ include "kata-lifecycle-manager.name" . }}-{{ .Chart.Version }}
app.kubernetes.io/name: {{ include "kata-lifecycle-manager.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
app.kubernetes.io/part-of: kata-containers
{{- end }}

{{/*
ServiceAccount name
*/}}
{{- define "kata-lifecycle-manager.serviceAccountName" -}}
{{- include "kata-lifecycle-manager.fullname" . }}
{{- end }}
```
`tools/packaging/kata-deploy/helm-chart/kata-lifecycle-manager/templates/rbac.yaml` (new file, 89 lines)

@@ -0,0 +1,89 @@

```yaml
{{/*
Copyright (c) 2026 The Kata Containers Authors
SPDX-License-Identifier: Apache-2.0
*/}}
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "kata-lifecycle-manager.serviceAccountName" . }}
  namespace: {{ .Values.argoNamespace }}
  labels:
    {{- include "kata-lifecycle-manager.labels" . | nindent 4 }}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ include "kata-lifecycle-manager.fullname" . }}
  labels:
    {{- include "kata-lifecycle-manager.labels" . | nindent 4 }}
rules:
  # Node operations (cordon, drain, label)
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["get", "list", "watch", "patch", "update"]
  # Pod operations (eviction for drain, verification pods, Argo output parameters)
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list", "watch", "create", "delete", "patch"]
  - apiGroups: [""]
    resources: ["pods/log"]
    verbs: ["get", "list", "watch", "create", "delete"]
  - apiGroups: [""]
    resources: ["pods/eviction"]
    verbs: ["create"]
  # DaemonSet operations (kata-deploy is a DaemonSet)
  - apiGroups: ["apps"]
    resources: ["daemonsets"]
    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
  # RuntimeClass operations (kata-deploy creates RuntimeClasses)
  - apiGroups: ["node.k8s.io"]
    resources: ["runtimeclasses"]
    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
  # Helm needs these for release management
  - apiGroups: [""]
    resources: ["secrets", "configmaps"]
    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
  # kata-deploy creates a ServiceAccount
  - apiGroups: [""]
    resources: ["serviceaccounts"]
    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
  # kata-deploy creates RBAC resources
  - apiGroups: ["rbac.authorization.k8s.io"]
    resources: ["clusterroles", "clusterrolebindings", "roles", "rolebindings"]
    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
  # Jobs (kata-deploy may have post-install/post-delete hooks)
  - apiGroups: ["batch"]
    resources: ["jobs"]
    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
  # Events for debugging
  - apiGroups: [""]
    resources: ["events"]
    verbs: ["get", "list", "watch", "create"]
  # Argo Workflows task results (for output parameters)
  - apiGroups: ["argoproj.io"]
    resources: ["workflowtaskresults"]
    verbs: ["create", "patch"]
  # CRDs (kata-deploy may reference NFD CRDs)
  - apiGroups: ["apiextensions.k8s.io"]
    resources: ["customresourcedefinitions"]
    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
  # Node Feature Discovery resources (kata-deploy NFD integration)
  - apiGroups: ["nfd.k8s-sigs.io"]
    resources: ["nodefeaturerules"]
    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ include "kata-lifecycle-manager.fullname" . }}
  labels:
    {{- include "kata-lifecycle-manager.labels" . | nindent 4 }}
subjects:
  - kind: ServiceAccount
    name: {{ include "kata-lifecycle-manager.serviceAccountName" . }}
    namespace: {{ .Values.argoNamespace }}
roleRef:
  kind: ClusterRole
  name: {{ include "kata-lifecycle-manager.fullname" . }}
  apiGroup: rbac.authorization.k8s.io
```
@@ -0,0 +1,775 @@
|
||||
{{/*
|
||||
Copyright (c) 2026 The Kata Containers Authors
|
||||
SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
Argo WorkflowTemplate for orchestrating kata-deploy upgrades.
|
||||
Uses native Argo resource templates where possible, and standard
|
||||
helm/kubectl images for operations requiring those tools.
|
||||
*/}}
|
||||
{{- if not .Values.defaults.verificationPod }}
|
||||
{{- fail "defaults.verificationPod is required. Provide a pod spec that validates your Kata deployment using --set-file defaults.verificationPod=./your-pod.yaml" }}
|
||||
{{- end }}
|
||||
---
|
||||
apiVersion: argoproj.io/v1alpha1
|
||||
kind: WorkflowTemplate
|
||||
metadata:
|
||||
name: kata-lifecycle-manager
|
||||
namespace: {{ .Values.argoNamespace }}
|
||||
labels:
|
||||
{{- include "kata-lifecycle-manager.labels" . | nindent 4 }}
|
||||
spec:
|
||||
entrypoint: upgrade-all-nodes
|
||||
serviceAccountName: {{ include "kata-lifecycle-manager.serviceAccountName" . }}
|
||||
|
||||
podGC:
|
||||
strategy: OnWorkflowSuccess
|
||||
|
||||
arguments:
|
||||
parameters:
|
||||
- name: target-version
|
||||
description: "Target Kata Containers version (e.g., 3.25.0)"
|
||||
- name: helm-release
|
||||
value: {{ .Values.defaults.helmRelease | quote }}
|
||||
description: "Helm release name"
|
||||
- name: helm-namespace
|
||||
value: {{ .Values.defaults.helmNamespace | quote }}
|
||||
description: "Namespace where kata-deploy is installed"
|
||||
- name: node-selector
|
||||
value: {{ .Values.defaults.nodeSelector | quote }}
|
||||
description: "Label selector for nodes to upgrade (optional if using taint selection)"
|
||||
- name: node-taint-key
|
||||
value: {{ .Values.defaults.nodeTaintKey | quote }}
|
||||
description: "Taint key for node selection (optional, alternative to label selector)"
|
||||
- name: node-taint-value
|
||||
value: {{ .Values.defaults.nodeTaintValue | quote }}
|
||||
description: "Taint value filter (optional, only used with node-taint-key)"
|
||||
- name: helm-image
|
||||
value: {{ .Values.images.helm | quote }}
|
||||
description: "Helm container image"
|
||||
- name: kubectl-image
|
||||
value: {{ .Values.images.kubectl | quote }}
|
||||
description: "Kubectl container image"
|
||||
- name: verification-namespace
|
||||
value: {{ .Values.defaults.verificationNamespace | quote }}
|
||||
description: "Namespace for verification pods"
|
||||
- name: verification-pod
|
||||
value: {{ .Values.defaults.verificationPod | b64enc | quote }}
|
||||
description: "Base64-encoded pod YAML for verification (uses placeholders NODE, TEST_POD)"
|
||||
- name: drain-enabled
|
||||
value: {{ .Values.defaults.drainEnabled | quote }}
|
||||
description: "Whether to drain nodes before upgrade"
|
||||
- name: drain-timeout
|
||||
value: {{ .Values.defaults.drainTimeout | quote }}
|
||||
description: "Timeout for node drain"
|
||||
|
||||
templates:
|
||||
    # =========================================================================
    # MAIN ENTRYPOINT
    # =========================================================================
    - name: upgrade-all-nodes
      steps:
        - - name: validate-prerequisites
            template: check-prerequisites
        - - name: get-nodes
            template: get-target-nodes
        - - name: show-upgrade-plan
            template: print-upgrade-plan
            arguments:
              parameters:
                - name: nodes-json
                  value: "{{`{{steps.get-nodes.outputs.parameters.nodes}}`}}"
                - name: node-count
                  value: "{{`{{steps.get-nodes.outputs.parameters.node-count}}`}}"
        - - name: upgrade-nodes-sequentially
            template: upgrade-node-chain
            arguments:
              parameters:
                - name: nodes-json
                  value: "{{`{{steps.get-nodes.outputs.parameters.nodes}}`}}"
                - name: current-index
                  value: "0"
        - - name: summary
            template: print-summary

    # =========================================================================
    # CHECK PREREQUISITES (fail fast if verification pod not configured)
    # =========================================================================
    - name: check-prerequisites
      tolerations:
        - key: node.kubernetes.io/unschedulable
          operator: Exists
        - key: node.kubernetes.io/disk-pressure
          operator: Exists
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
      script:
        image: "{{`{{workflow.parameters.helm-image}}`}}"
        command: [sh]
        source: |
          set -e

          RELEASE="{{`{{workflow.parameters.helm-release}}`}}"
          NS="{{`{{workflow.parameters.helm-namespace}}`}}"
          VERIFICATION_POD="{{`{{workflow.parameters.verification-pod}}`}}"

          echo "=============================================="
          echo " VALIDATING PREREQUISITES"
          echo "=============================================="
          echo ""

          # Check verification pod availability
          echo "Checking verification pod configuration..."

          if [ -n "$VERIFICATION_POD" ]; then
            echo "✓ Verification pod configured"
          else
            echo ""
            echo "ERROR: No verification pod configured!"
            echo ""
            echo "The upgrade cannot proceed without a verification pod."
            echo ""
            echo "This should not happen if kata-lifecycle-manager was installed correctly."
            echo "Reinstall with: helm upgrade kata-lifecycle-manager ... --set-file defaults.verificationPod=<path>"
            echo ""
            exit 1
          fi

          # Check that the kata-deploy release exists
          echo "Checking kata-deploy Helm release..."
          # Use POSIX redirection here: '&>' is a bashism and this script runs under sh
          if helm status "$RELEASE" -n "$NS" >/dev/null 2>&1; then
            CURRENT_VERSION=$(helm get metadata "$RELEASE" -n "$NS" -o json 2>/dev/null | grep -o '"version":"[^"]*"' | cut -d'"' -f4 || echo "unknown")
            echo "✓ Found Helm release: $RELEASE (chart version: $CURRENT_VERSION)"
          else
            echo ""
            echo "ERROR: Helm release '$RELEASE' not found in namespace '$NS'"
            echo ""
            echo "Make sure kata-deploy is installed via Helm before running the upgrade."
            exit 1
          fi

          echo ""
          echo "=============================================="
          echo " ALL PREREQUISITES PASSED"
          echo "=============================================="

    # =========================================================================
    # PRINT UPGRADE PLAN (shows all nodes before starting)
    # =========================================================================
    - name: print-upgrade-plan
      inputs:
        parameters:
          - name: nodes-json
          - name: node-count
      tolerations:
        - key: node.kubernetes.io/unschedulable
          operator: Exists
        - key: node.kubernetes.io/disk-pressure
          operator: Exists
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
      script:
        image: "{{`{{workflow.parameters.kubectl-image}}`}}"
        command: [sh]
        source: |
          apk add --no-cache -q jq 2>/dev/null || true

          NODES_JSON='{{`{{inputs.parameters.nodes-json}}`}}'
          NODE_COUNT={{`{{inputs.parameters.node-count}}`}}
          VERSION="{{`{{workflow.parameters.target-version}}`}}"

          echo "=============================================="
          echo " KATA CONTAINERS UPGRADE PLAN"
          echo "=============================================="
          echo ""
          echo "Target Version: $VERSION"
          echo "Total Nodes: $NODE_COUNT"
          echo "Mode: Sequential (one at a time)"
          echo ""
          echo "Nodes to upgrade (in order):"
          echo "----------------------------------------------"

          INDEX=1
          echo "$NODES_JSON" | jq -r '.[]' | while read -r NODE; do
            echo " $INDEX. $NODE"
            INDEX=$((INDEX + 1))
          done

          echo ""
          echo "----------------------------------------------"
          echo "Upgrade will stop immediately on first failure"
          echo "=============================================="

    # =========================================================================
    # SEQUENTIAL NODE CHAIN (recursive: upgrades one node, then the next)
    # Stops immediately on the first failure, so no mixed-version fleet is possible
    # =========================================================================
    - name: upgrade-node-chain
      inputs:
        parameters:
          - name: nodes-json
          - name: current-index
      steps:
        # Extract current node info
        - - name: node-info
            template: get-node-at-index
            arguments:
              parameters:
                - name: nodes-json
                  value: "{{`{{inputs.parameters.nodes-json}}`}}"
                - name: index
                  value: "{{`{{inputs.parameters.current-index}}`}}"
        # Upgrade the current node (runs all sub-steps: prepare, cordon, upgrade, verify...)
        - - name: upgrade
            template: upgrade-single-node
            arguments:
              parameters:
                - name: node-name
                  value: "{{`{{steps.node-info.outputs.parameters.node-name}}`}}"
        # Continue to the next node only if this one succeeded and more nodes exist
        - - name: next
            template: upgrade-node-chain
            when: "{{`{{steps.node-info.outputs.parameters.has-more}}`}} == true"
            arguments:
              parameters:
                - name: nodes-json
                  value: "{{`{{inputs.parameters.nodes-json}}`}}"
                - name: current-index
                  value: "{{`{{steps.node-info.outputs.parameters.next-index}}`}}"
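
    # Illustrative trace of the chain (assuming three nodes a, b, c):
    #   upgrade-node-chain(index=0) -> upgrade a -> has-more=true
    #     -> upgrade-node-chain(index=1) -> upgrade b -> has-more=true
    #       -> upgrade-node-chain(index=2) -> upgrade c -> has-more=false, chain ends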

    # =========================================================================
    # GET NODE AT INDEX (helper for sequential chain)
    # =========================================================================
    - name: get-node-at-index
      inputs:
        parameters:
          - name: nodes-json
          - name: index
      outputs:
        parameters:
          - name: node-name
            valueFrom:
              path: /tmp/node-name.txt
          - name: has-more
            valueFrom:
              path: /tmp/has-more.txt
          - name: next-index
            valueFrom:
              path: /tmp/next-index.txt
      tolerations:
        - key: node.kubernetes.io/unschedulable
          operator: Exists
        - key: node.kubernetes.io/disk-pressure
          operator: Exists
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
      script:
        image: "{{`{{workflow.parameters.kubectl-image}}`}}"
        command: [sh]
        source: |
          set -e
          apk add --no-cache -q jq 2>/dev/null || true
          NODES_JSON='{{`{{inputs.parameters.nodes-json}}`}}'
          INDEX={{`{{inputs.parameters.index}}`}}

          NODE=$(echo "$NODES_JSON" | jq -r ".[$INDEX]")
          TOTAL=$(echo "$NODES_JSON" | jq 'length')
          NEXT=$((INDEX + 1))

          echo "$NODE" > /tmp/node-name.txt
          echo "$NEXT" > /tmp/next-index.txt

          if [ "$NEXT" -lt "$TOTAL" ]; then
            echo "true" > /tmp/has-more.txt
          else
            echo "false" > /tmp/has-more.txt
          fi

          echo "=== Node $((INDEX + 1)) of $TOTAL: $NODE ==="
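
    # Worked example (hypothetical inputs): nodes-json=["node-a","node-b"] and
    # index=0 produce node-name=node-a, next-index=1, has-more=true.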

    # =========================================================================
    # GET TARGET NODES (supports label selector, taint selector, or both)
    # =========================================================================
    - name: get-target-nodes
      # Tolerate system taints so workflow pods can schedule during upgrades
      tolerations:
        - key: node.kubernetes.io/unschedulable
          operator: Exists
        - key: node.kubernetes.io/disk-pressure
          operator: Exists
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
      outputs:
        parameters:
          - name: nodes
            valueFrom:
              path: /tmp/nodes.json
          - name: node-count
            valueFrom:
              path: /tmp/count.txt
      script:
        image: "{{`{{workflow.parameters.kubectl-image}}`}}"
        command: [bash]
        source: |
          set -e
          LABEL_SELECTOR="{{`{{workflow.parameters.node-selector}}`}}"
          TAINT_KEY="{{`{{workflow.parameters.node-taint-key}}`}}"
          TAINT_VALUE="{{`{{workflow.parameters.node-taint-value}}`}}"

          # Get nodes based on label selector (or all nodes if no selector)
          if [[ -n "$LABEL_SELECTOR" ]]; then
            NODE_NAMES=$(kubectl get nodes -l "$LABEL_SELECTOR" -o jsonpath='{.items[*].metadata.name}')
          else
            NODE_NAMES=$(kubectl get nodes -o jsonpath='{.items[*].metadata.name}')
          fi

          if [[ -z "$NODE_NAMES" ]]; then
            echo "[]" > /tmp/nodes.json
            echo "0" > /tmp/count.txt
            echo "No nodes found matching label selector"
            exit 0
          fi

          # If a taint key is specified, filter nodes by taint
          if [[ -n "$TAINT_KEY" ]]; then
            TAINT_FILTERED=""
            for node in $NODE_NAMES; do
              # kubectl's JSONPath filters do not support '&&', so check the
              # key via JSONPath and compare the value in the shell instead
              HAS_KEY=$(kubectl get node "$node" -o jsonpath="{.spec.taints[?(@.key=='$TAINT_KEY')].key}" 2>/dev/null || echo "")
              TAINT_VAL=$(kubectl get node "$node" -o jsonpath="{.spec.taints[?(@.key=='$TAINT_KEY')].value}" 2>/dev/null || echo "")
              if [[ -n "$HAS_KEY" ]]; then
                if [[ -z "$TAINT_VALUE" || "$TAINT_VAL" == "$TAINT_VALUE" ]]; then
                  TAINT_FILTERED="$TAINT_FILTERED $node"
                fi
              fi
            done
            NODE_NAMES=$(echo "$TAINT_FILTERED" | xargs)
          fi

          if [[ -z "$NODE_NAMES" ]]; then
            echo "[]" > /tmp/nodes.json
            echo "0" > /tmp/count.txt
            echo "No nodes found matching selection criteria"
            exit 0
          fi

          # Convert space-separated names to sorted newline-separated
          NODE_LIST=$(echo "$NODE_NAMES" | tr ' ' '\n' | sort)
          NODE_COUNT=$(echo "$NODE_LIST" | wc -l)

          # Output as a JSON array (built manually, without jq)
          JSON_ARRAY="["
          FIRST=true
          for node in $NODE_LIST; do
            if [[ "$FIRST" == "true" ]]; then
              JSON_ARRAY="${JSON_ARRAY}\"${node}\""
              FIRST=false
            else
              JSON_ARRAY="${JSON_ARRAY},\"${node}\""
            fi
          done
          JSON_ARRAY="${JSON_ARRAY}]"

          echo "$JSON_ARRAY" > /tmp/nodes.json
          echo "$NODE_COUNT" > /tmp/count.txt

          echo "Found $NODE_COUNT nodes for upgrade:"
          echo "$NODE_LIST"
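
    # Example outputs for a hypothetical three-worker cluster:
    #   /tmp/nodes.json -> ["worker-1","worker-2","worker-3"]
    #   /tmp/count.txt  -> 3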

    # =========================================================================
    # UPGRADE SINGLE NODE
    # =========================================================================
    - name: upgrade-single-node
      inputs:
        parameters:
          - name: node-name
      steps:
        - - name: prepare
            template: prepare-node
            arguments:
              parameters:
                - name: node-name
                  value: "{{`{{inputs.parameters.node-name}}`}}"
        - - name: cordon
            template: cordon-node
            arguments:
              parameters:
                - name: node-name
                  value: "{{`{{inputs.parameters.node-name}}`}}"
        - - name: drain
            template: drain-node
            when: "{{`{{workflow.parameters.drain-enabled}}`}} == true"
            arguments:
              parameters:
                - name: node-name
                  value: "{{`{{inputs.parameters.node-name}}`}}"
        - - name: upgrade
            template: helm-upgrade
            arguments:
              parameters:
                - name: node-name
                  value: "{{`{{inputs.parameters.node-name}}`}}"
        - - name: wait-ready
            template: wait-kata-ready
            arguments:
              parameters:
                - name: node-name
                  value: "{{`{{inputs.parameters.node-name}}`}}"
        - - name: verify-and-complete
            template: verify-and-complete-node
            arguments:
              parameters:
                - name: node-name
                  value: "{{`{{inputs.parameters.node-name}}`}}"
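
    # Node annotation lifecycle driven by the steps above (happy path):
    #   preparing -> cordoned -> [draining] -> upgrading -> verifying -> completed
    # On failure a node is annotated "failed", or "rolled-back" once the
    # automatic rollback in verify-and-complete-node succeeds.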

    # =========================================================================
    # PREPARE NODE
    # =========================================================================
    - name: prepare-node
      inputs:
        parameters:
          - name: node-name
      resource:
        action: patch
        mergeStrategy: merge
        manifest: |
          apiVersion: v1
          kind: Node
          metadata:
            name: "{{`{{inputs.parameters.node-name}}`}}"
            annotations:
              katacontainers.io/kata-lifecycle-manager-status: "preparing"

    # =========================================================================
    # CORDON NODE
    # =========================================================================
    - name: cordon-node
      inputs:
        parameters:
          - name: node-name
      resource:
        action: patch
        mergeStrategy: merge
        manifest: |
          apiVersion: v1
          kind: Node
          metadata:
            name: "{{`{{inputs.parameters.node-name}}`}}"
            annotations:
              katacontainers.io/kata-lifecycle-manager-status: "cordoned"
          spec:
            unschedulable: true

    # =========================================================================
    # DRAIN NODE (optional, only runs if drain-enabled is true)
    # =========================================================================
    - name: drain-node
      inputs:
        parameters:
          - name: node-name
      tolerations:
        - key: node.kubernetes.io/unschedulable
          operator: Exists
        - key: node.kubernetes.io/disk-pressure
          operator: Exists
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
      script:
        image: "{{`{{workflow.parameters.kubectl-image}}`}}"
        command: [bash]
        source: |
          set -e
          NODE="{{`{{inputs.parameters.node-name}}`}}"
          TIMEOUT="{{`{{workflow.parameters.drain-timeout}}`}}"

          # On failure: mark node as failed and uncordon
          cleanup_on_failure() {
            echo "ERROR: Drain failed, cleaning up node $NODE"
            kubectl annotate node "$NODE" --overwrite katacontainers.io/kata-lifecycle-manager-status="failed" || true
            kubectl uncordon "$NODE" || true
          }
          trap cleanup_on_failure EXIT

          kubectl annotate node "$NODE" --overwrite katacontainers.io/kata-lifecycle-manager-status="draining"
          kubectl drain "$NODE" \
            --ignore-daemonsets \
            --delete-emptydir-data \
            --force \
            --timeout="$TIMEOUT"

          # Success - remove the trap
          trap - EXIT

    # =========================================================================
    # HELM UPGRADE
    # =========================================================================
    - name: helm-upgrade
      inputs:
        parameters:
          - name: node-name
      tolerations:
        - key: node.kubernetes.io/unschedulable
          operator: Exists
        - key: node.kubernetes.io/disk-pressure
          operator: Exists
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
      script:
        image: "{{`{{workflow.parameters.helm-image}}`}}"
        command: [bash]
        source: |
          set -e
          NODE="{{`{{inputs.parameters.node-name}}`}}"
          VERSION="{{`{{workflow.parameters.target-version}}`}}"
          RELEASE="{{`{{workflow.parameters.helm-release}}`}}"
          NS="{{`{{workflow.parameters.helm-namespace}}`}}"
          CHART="oci://ghcr.io/kata-containers/kata-deploy-charts/kata-deploy"

          apk add --no-cache -q kubectl
          kubectl annotate node "$NODE" --overwrite katacontainers.io/kata-lifecycle-manager-status="upgrading"

          # On failure: mark node as failed and uncordon
          cleanup_on_failure() {
            echo "ERROR: Helm upgrade failed, cleaning up node $NODE"
            kubectl annotate node "$NODE" --overwrite katacontainers.io/kata-lifecycle-manager-status="failed" || true
            kubectl uncordon "$NODE" || true
          }
          trap cleanup_on_failure EXIT

          # Disable kata-deploy's own verification (--set verification.pod="") because:
          # - kata-deploy verification is cluster-wide (runs once after helm upgrade)
          # - kata-lifecycle-manager does per-node verification in verify-and-complete-node
          # The per-node verification is more appropriate for rolling upgrades.
          helm upgrade "$RELEASE" "$CHART" \
            --namespace "$NS" \
            --version "$VERSION" \
            --reuse-values \
            --set verification.pod="" \
            --rollback-on-failure \
            --timeout 10m \
            --wait

          # Success - remove the trap so we don't run cleanup
          trap - EXIT

    # =========================================================================
    # WAIT FOR KATA-DEPLOY READY
    # =========================================================================
    - name: wait-kata-ready
      inputs:
        parameters:
          - name: node-name
      tolerations:
        - key: node.kubernetes.io/unschedulable
          operator: Exists
        - key: node.kubernetes.io/disk-pressure
          operator: Exists
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
      script:
        image: "{{`{{workflow.parameters.kubectl-image}}`}}"
        command: [bash]
        source: |
          set -e
          NODE="{{`{{inputs.parameters.node-name}}`}}"
          NS="{{`{{workflow.parameters.helm-namespace}}`}}"

          # On failure: mark node as failed and uncordon
          cleanup_on_failure() {
            echo "ERROR: Timed out waiting for kata-deploy, cleaning up node $NODE"
            kubectl annotate node "$NODE" --overwrite katacontainers.io/kata-lifecycle-manager-status="failed" || true
            kubectl uncordon "$NODE" || true
          }
          trap cleanup_on_failure EXIT

          for i in $(seq 1 60); do
            POD=$(kubectl get pods -n "$NS" -l name=kata-deploy \
              --field-selector spec.nodeName="$NODE" \
              -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
            if [[ -n "$POD" ]]; then
              if kubectl wait pod "$POD" -n "$NS" --for=condition=Ready --timeout=10s 2>/dev/null; then
                echo "kata-deploy pod $POD is ready"
                trap - EXIT
                exit 0
              fi
            fi
            echo "Waiting... ($i/60)"
            sleep 5
          done
          exit 1

    # =========================================================================
    # VERIFY AND COMPLETE NODE (with automatic rollback on failure)
    # =========================================================================
    - name: verify-and-complete-node
      inputs:
        parameters:
          - name: node-name
      tolerations:
        - key: node.kubernetes.io/unschedulable
          operator: Exists
        - key: node.kubernetes.io/disk-pressure
          operator: Exists
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
      script:
        image: "{{`{{workflow.parameters.helm-image}}`}}"
        command: [bash]
        source: |
          # Deliberately no 'set -e': we must reach the rollback logic even on errors
          NODE="{{`{{inputs.parameters.node-name}}`}}"
          VERIFY_NS="{{`{{workflow.parameters.verification-namespace}}`}}"
          RELEASE="{{`{{workflow.parameters.helm-release}}`}}"
          NS="{{`{{workflow.parameters.helm-namespace}}`}}"
          VERSION="{{`{{workflow.parameters.target-version}}`}}"
          VERIFICATION_POD="{{`{{workflow.parameters.verification-pod}}`}}"
          TEST_POD="kata-verify-${NODE}-$(date +%s)"

          # Install kubectl (the helm image should already ship it, but just in case)
          apk add --no-cache -q kubectl 2>/dev/null || true

          kubectl annotate node "$NODE" --overwrite katacontainers.io/kata-lifecycle-manager-status="verifying"

          # Decode the verification pod spec (base64-encoded)
          echo "Using verification pod from workflow parameters"
          echo "$VERIFICATION_POD" | base64 -d > /tmp/verify-pod.yaml

          # Substitute the ${NODE} and ${TEST_POD} placeholders before applying
          sed -i "s|\${NODE}|$NODE|g" /tmp/verify-pod.yaml
          sed -i "s|\${TEST_POD}|$TEST_POD|g" /tmp/verify-pod.yaml

          if ! kubectl apply -n "$VERIFY_NS" -f /tmp/verify-pod.yaml; then
            echo "ERROR: Failed to create verification pod"
            VERIFY_SUCCESS=false
          fi

          # Cleanup function for the verification pod
          cleanup_pod() {
            kubectl delete pod "$TEST_POD" -n "$VERIFY_NS" --ignore-not-found --wait=false
          }
          trap cleanup_pod EXIT

          # Wait for the verification pod to complete (only if the pod was created).
          # This catches all failure modes:
          # - Pod stuck in Pending/ContainerCreating (runtime can't start the VM)
          # - Pod crashes immediately (containerd/CRI-O config issues)
          # - Pod times out (resource issues, image pull failures)
          # - Pod exits with a non-zero code (verification logic failed)
          if [ "${VERIFY_SUCCESS:-}" != "false" ]; then
            VERIFY_SUCCESS=false
            if kubectl wait pod "$TEST_POD" -n "$VERIFY_NS" --for=jsonpath='{.status.phase}'=Succeeded --timeout=180s; then
              echo "=== Verification Succeeded ==="
              kubectl logs "$TEST_POD" -n "$VERIFY_NS" || true
              VERIFY_SUCCESS=true
            else
              echo "=== Verification Failed ==="
              echo ""
              echo "Pod status:"
              kubectl get pod "$TEST_POD" -n "$VERIFY_NS" -o wide || true
              echo ""
              echo "Pod events and details:"
              kubectl describe pod "$TEST_POD" -n "$VERIFY_NS" || true
              echo ""
              echo "Pod logs (if available):"
              kubectl logs "$TEST_POD" -n "$VERIFY_NS" || true
            fi
          fi

          # Clean up the verification pod
          cleanup_pod
          trap - EXIT

          if [ "$VERIFY_SUCCESS" = "true" ]; then
            # Success path: uncordon and mark complete
            echo "Uncordoning node $NODE..."
            kubectl uncordon "$NODE"
            kubectl annotate node "$NODE" --overwrite \
              katacontainers.io/kata-lifecycle-manager-status="completed" \
              katacontainers.io/kata-current-version="$VERSION"
            echo "Node $NODE upgrade completed successfully"
            exit 0
          else
            # Failure path: automatic rollback
            echo "Initiating automatic rollback for node $NODE..."
            kubectl annotate node "$NODE" --overwrite katacontainers.io/kata-lifecycle-manager-status="rolling-back"

            helm rollback "$RELEASE" -n "$NS" --wait --timeout 10m

            # Wait for kata-deploy to be ready after the rollback
            echo "Waiting for kata-deploy to be ready after rollback..."
            for i in $(seq 1 60); do
              POD=$(kubectl get pods -n "$NS" -l name=kata-deploy \
                --field-selector spec.nodeName="$NODE" \
                -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
              if [[ -n "$POD" ]]; then
                if kubectl wait pod "$POD" -n "$NS" --for=condition=Ready --timeout=10s 2>/dev/null; then
                  echo "kata-deploy pod $POD is ready after rollback"
                  break
                fi
              fi
              echo "Waiting for rollback to complete... ($i/60)"
              sleep 5
            done

            # Uncordon and mark as rolled back
            kubectl uncordon "$NODE"
            kubectl annotate node "$NODE" --overwrite katacontainers.io/kata-lifecycle-manager-status="rolled-back"
            echo "Node $NODE rolled back to previous version"

            # Exit with an error so the workflow shows the failure
            exit 1
          fi

    # =========================================================================
    # PRINT SUMMARY
    # =========================================================================
    - name: print-summary
      tolerations:
        - key: node.kubernetes.io/unschedulable
          operator: Exists
        - key: node.kubernetes.io/disk-pressure
          operator: Exists
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
      script:
        image: "{{`{{workflow.parameters.kubectl-image}}`}}"
        command: [bash]
        source: |
          echo "=== KATA UPGRADE SUMMARY ==="
          kubectl get nodes \
            -L katacontainers.io/kata-runtime \
            -L katacontainers.io/kata-lifecycle-manager-status \
            -L katacontainers.io/kata-current-version
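          # Example output (hypothetical cluster; columns abridged):
          #   NAME       STATUS   KATA-LIFECYCLE-MANAGER-STATUS   KATA-CURRENT-VERSION
          #   worker-1   Ready    completed                       3.25.0
          #   worker-2   Ready    completed                       3.25.0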

    # =========================================================================
    # ROLLBACK (can be called manually)
    # =========================================================================
    - name: rollback-node
      inputs:
        parameters:
          - name: node-name
      tolerations:
        - key: node.kubernetes.io/unschedulable
          operator: Exists
        - key: node.kubernetes.io/disk-pressure
          operator: Exists
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
      script:
        image: "{{`{{workflow.parameters.helm-image}}`}}"
        command: [sh]
        source: |
          set -e
          NODE="{{`{{inputs.parameters.node-name}}`}}"
          RELEASE="{{`{{workflow.parameters.helm-release}}`}}"
          NS="{{`{{workflow.parameters.helm-namespace}}`}}"

          apk add --no-cache -q kubectl
          kubectl annotate node "$NODE" --overwrite katacontainers.io/kata-lifecycle-manager-status="rolling-back"

          helm rollback "$RELEASE" -n "$NS" --wait --timeout 10m

          kubectl annotate node "$NODE" --overwrite katacontainers.io/kata-lifecycle-manager-status="rolled-back"
          kubectl uncordon "$NODE"
@@ -0,0 +1,63 @@
# Copyright (c) 2026 The Kata Containers Authors
# SPDX-License-Identifier: Apache-2.0

# Argo Workflows namespace where the WorkflowTemplate will be created
argoNamespace: argo

# Default workflow parameters (can be overridden when submitting workflows)
defaults:
  # Helm release name for kata-deploy
  helmRelease: kata-deploy

  # Namespace where kata-deploy is installed
  helmNamespace: kube-system

  # Label selector for nodes to upgrade (optional if using taint selection)
  nodeSelector: ""

  # Taint-based node selection (optional, alternative to label selector)
  # Select nodes that have a taint with this key
  nodeTaintKey: ""
  # Optional: filter by taint value (only used if nodeTaintKey is set)
  nodeTaintValue: ""

  # Namespace for verification pods
  verificationNamespace: default

  # Verification pod spec (REQUIRED)
  #
  # A verification pod is required to validate each node after upgrade.
  # The chart will fail to install without one.
  #
  # Provide via:
  #   helm install kata-lifecycle-manager ... \
  #     --set-file defaults.verificationPod=/path/to/your-verification-pod.yaml
  #
  # Or override at workflow submission (base64-encoded):
  #   argo submit ... -p verification-pod="$(base64 -w0 < pod.yaml)"
  #
  # Note: kata-deploy's own verification is disabled during upgrade because
  # it is cluster-wide (designed for the initial install), while
  # kata-lifecycle-manager performs per-node verification with proper
  # placeholder substitution.
  #
  # Placeholders substituted at runtime:
  #   ${NODE}     - the node being upgraded/verified
  #   ${TEST_POD} - generated unique pod name
  #
  verificationPod: ""
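
  # Example verification pod (illustrative sketch only: the runtimeClassName
  # and image below are assumptions, substitute the runtime class and test
  # image you actually deploy):
  #
  #   apiVersion: v1
  #   kind: Pod
  #   metadata:
  #     name: ${TEST_POD}
  #   spec:
  #     # nodeName pins the pod to the node under test, bypassing the
  #     # scheduler (the node is still cordoned at verification time)
  #     nodeName: ${NODE}
  #     runtimeClassName: kata-qemu
  #     restartPolicy: Never
  #     containers:
  #       - name: verify
  #         image: busybox
  #         command: ["sh", "-c", "uname -r && echo kata-ok"]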

  # Optional: drain nodes before upgrade (default: false)
  # Not required for Kata upgrades, since already-running VMs keep using the
  # binaries they loaded into memory. Enable it if you prefer to evict
  # workloads before upgrading.
  drainEnabled: false

  # Timeout for node drain (only used if drainEnabled is true)
  drainTimeout: "300s"

# Container images used by workflow steps
images:
  # Helm image for helm upgrade operations (multi-arch)
  helm: quay.io/kata-containers/helm:latest

  # Kubectl image for kubernetes operations (multi-arch)
  kubectl: quay.io/kata-containers/kubectl:latest
@@ -231,9 +231,14 @@ function _upload_helm_chart_tarball()

    RELEASE_VERSION="$(_release_version)"

    # Package and upload the kata-deploy chart
    helm dependencies update "${repo_root_dir}/tools/packaging/kata-deploy/helm-chart/kata-deploy"
    helm package "${repo_root_dir}/tools/packaging/kata-deploy/helm-chart/kata-deploy"
    gh release upload "${RELEASE_VERSION}" "kata-deploy-${RELEASE_VERSION}.tgz"

    # Package and upload the kata-lifecycle-manager chart
    helm package "${repo_root_dir}/tools/packaging/kata-deploy/helm-chart/kata-lifecycle-manager"
    gh release upload "${RELEASE_VERSION}" "kata-lifecycle-manager-${RELEASE_VERSION}.tgz"
}
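# Illustrative consumption of the uploaded chart tarball (the URL shape is an
# assumption based on standard GitHub release asset paths):
#   helm install kata-lifecycle-manager \
#     "https://github.com/kata-containers/kata-containers/releases/download/${RELEASE_VERSION}/kata-lifecycle-manager-${RELEASE_VERSION}.tgz" \
#     --set-file defaults.verificationPod=./verify-pod.yaml
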
function main()