Compare commits

..

1 Commits

Author SHA1 Message Date
Fabiano Fidêncio
ca1e365a27 tests: arm64: Increase k8s tests timeout to 60 minutes
This allows adding different runners in case the powerful one goes down
for one reason or another.

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
2026-01-17 12:49:45 +01:00
110 changed files with 9645 additions and 2851 deletions

View File

@@ -12,6 +12,7 @@ updates:
- "/src/tools/agent-ctl"
- "/src/tools/genpolicy"
- "/src/tools/kata-ctl"
- "/src/tools/runk"
- "/src/tools/trace-forwarder"
schedule:
interval: "daily"

View File

@@ -163,6 +163,42 @@ jobs:
timeout-minutes: 10
run: bash tests/integration/nydus/gha-run.sh run
run-runk:
name: run-runk
# Skip runk tests as we have no maintainers. TODO: Decide when to remove altogether
if: false
runs-on: ubuntu-22.04
env:
CONTAINERD_VERSION: lts
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
ref: ${{ inputs.commit-hash }}
fetch-depth: 0
persist-credentials: false
- name: Rebase atop of the latest target branch
run: |
./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
env:
TARGET_BRANCH: ${{ inputs.target-branch }}
- name: Install dependencies
run: bash tests/integration/runk/gha-run.sh install-dependencies
- name: get-kata-tarball
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: kata-static-tarball-amd64${{ inputs.tarball-suffix }}
path: kata-artifacts
- name: Install kata
run: bash tests/integration/runk/gha-run.sh install-kata kata-artifacts
- name: Run runk tests
timeout-minutes: 10
run: bash tests/integration/runk/gha-run.sh run
run-tracing:
name: run-tracing
strategy:

36
.github/workflows/ci-nightly-rust.yaml vendored Normal file
View File

@@ -0,0 +1,36 @@
name: Kata Containers Nightly CI (Rust)
on:
schedule:
- cron: '0 1 * * *' # Run at 1 AM UTC (1 hour after script-based nightly)
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
permissions: {}
jobs:
kata-containers-ci-on-push-rust:
permissions:
contents: read
packages: write
id-token: write
attestations: write
uses: ./.github/workflows/ci.yaml
with:
commit-hash: ${{ github.sha }}
pr-number: "nightly-rust"
tag: ${{ github.sha }}-nightly-rust
target-branch: ${{ github.ref_name }}
build-type: "rust" # Use Rust-based build
secrets:
AUTHENTICATED_IMAGE_PASSWORD: ${{ secrets.AUTHENTICATED_IMAGE_PASSWORD }}
AZ_APPID: ${{ secrets.AZ_APPID }}
AZ_TENANT_ID: ${{ secrets.AZ_TENANT_ID }}
AZ_SUBSCRIPTION_ID: ${{ secrets.AZ_SUBSCRIPTION_ID }}
CI_HKD_PATH: ${{ secrets.CI_HKD_PATH }}
ITA_KEY: ${{ secrets.ITA_KEY }}
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}

View File

@@ -19,6 +19,11 @@ on:
required: false
type: string
default: no
build-type:
description: The build type for kata-deploy. Use 'rust' for Rust-based build, empty or omit for script-based (default).
required: false
type: string
default: ""
secrets:
AUTHENTICATED_IMAGE_PASSWORD:
required: true
@@ -72,6 +77,7 @@ jobs:
target-branch: ${{ inputs.target-branch }}
runner: ubuntu-22.04
arch: amd64
build-type: ${{ inputs.build-type }}
secrets:
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
@@ -104,6 +110,7 @@ jobs:
target-branch: ${{ inputs.target-branch }}
runner: ubuntu-24.04-arm
arch: arm64
build-type: ${{ inputs.build-type }}
secrets:
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
@@ -149,6 +156,7 @@ jobs:
target-branch: ${{ inputs.target-branch }}
runner: ubuntu-24.04-s390x
arch: s390x
build-type: ${{ inputs.build-type }}
secrets:
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
@@ -167,6 +175,7 @@ jobs:
target-branch: ${{ inputs.target-branch }}
runner: ubuntu-24.04-ppc64le
arch: ppc64le
build-type: ${{ inputs.build-type }}
secrets:
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
@@ -288,7 +297,7 @@ jobs:
tarball-suffix: -${{ inputs.tag }}
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-amd64
tag: ${{ inputs.tag }}-amd64${{ inputs.build-type == 'rust' && '-rust' || '' }}
commit-hash: ${{ inputs.commit-hash }}
pr-number: ${{ inputs.pr-number }}
target-branch: ${{ inputs.target-branch }}
@@ -304,7 +313,7 @@ jobs:
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-arm64
tag: ${{ inputs.tag }}-arm64${{ inputs.build-type == 'rust' && '-rust' || '' }}
commit-hash: ${{ inputs.commit-hash }}
pr-number: ${{ inputs.pr-number }}
target-branch: ${{ inputs.target-branch }}
@@ -317,7 +326,7 @@ jobs:
tarball-suffix: -${{ inputs.tag }}
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-amd64
tag: ${{ inputs.tag }}-amd64${{ inputs.build-type == 'rust' && '-rust' || '' }}
commit-hash: ${{ inputs.commit-hash }}
pr-number: ${{ inputs.pr-number }}
target-branch: ${{ inputs.target-branch }}
@@ -339,7 +348,7 @@ jobs:
tarball-suffix: -${{ inputs.tag }}
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-amd64
tag: ${{ inputs.tag }}-amd64${{ inputs.build-type == 'rust' && '-rust' || '' }}
commit-hash: ${{ inputs.commit-hash }}
pr-number: ${{ inputs.pr-number }}
target-branch: ${{ inputs.target-branch }}
@@ -357,7 +366,7 @@ jobs:
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-s390x
tag: ${{ inputs.tag }}-s390x${{ inputs.build-type == 'rust' && '-rust' || '' }}
commit-hash: ${{ inputs.commit-hash }}
pr-number: ${{ inputs.pr-number }}
target-branch: ${{ inputs.target-branch }}
@@ -371,7 +380,7 @@ jobs:
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-ppc64le
tag: ${{ inputs.tag }}-ppc64le${{ inputs.build-type == 'rust' && '-rust' || '' }}
commit-hash: ${{ inputs.commit-hash }}
pr-number: ${{ inputs.pr-number }}
target-branch: ${{ inputs.target-branch }}
@@ -383,7 +392,7 @@ jobs:
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-amd64
tag: ${{ inputs.tag }}-amd64${{ inputs.build-type == 'rust' && '-rust' || '' }}
commit-hash: ${{ inputs.commit-hash }}
pr-number: ${{ inputs.pr-number }}
target-branch: ${{ inputs.target-branch }}

View File

@@ -82,6 +82,7 @@ jobs:
target-branch: ${{ github.ref_name }}
runner: ubuntu-22.04
arch: amd64
build-type: "" # Use script-based build (default)
secrets:
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
@@ -99,6 +100,7 @@ jobs:
target-branch: ${{ github.ref_name }}
runner: ubuntu-24.04-arm
arch: arm64
build-type: "" # Use script-based build (default)
secrets:
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
@@ -116,6 +118,7 @@ jobs:
target-branch: ${{ github.ref_name }}
runner: s390x
arch: s390x
build-type: "" # Use script-based build (default)
secrets:
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
@@ -133,6 +136,7 @@ jobs:
target-branch: ${{ github.ref_name }}
runner: ubuntu-24.04-ppc64le
arch: ppc64le
build-type: "" # Use script-based build (default)
secrets:
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}

View File

@@ -30,6 +30,11 @@ on:
description: The arch of the tarball.
required: true
type: string
build-type:
description: The build type for kata-deploy. Use 'rust' for Rust-based build, empty or omit for script-based (default).
required: false
type: string
default: ""
secrets:
QUAY_DEPLOYER_PASSWORD:
required: true
@@ -101,8 +106,10 @@ jobs:
REGISTRY: ${{ inputs.registry }}
REPO: ${{ inputs.repo }}
TAG: ${{ inputs.tag }}
BUILD_TYPE: ${{ inputs.build-type }}
run: |
./tools/packaging/kata-deploy/local-build/kata-deploy-build-and-upload-payload.sh \
"$(pwd)/kata-static.tar.zst" \
"${REGISTRY}/${REPO}" \
"${TAG}"
"${TAG}" \
"${BUILD_TYPE}"

View File

@@ -65,7 +65,7 @@ jobs:
run: bash tests/integration/kubernetes/gha-run.sh install-bats
- name: Run tests
timeout-minutes: 30
timeout-minutes: 60
run: bash tests/integration/kubernetes/gha-run.sh run-tests
- name: Report tests

View File

@@ -126,6 +126,5 @@ jobs:
- name: Delete CoCo KBS
if: always() && matrix.environment.name != 'nvidia-gpu'
timeout-minutes: 10
run: |
bash tests/integration/kubernetes/gha-run.sh delete-coco-kbs

View File

@@ -137,12 +137,10 @@ jobs:
- name: Delete kata-deploy
if: always()
timeout-minutes: 10
run: bash tests/integration/kubernetes/gha-run.sh cleanup-zvsi
- name: Delete CoCo KBS
if: always()
timeout-minutes: 10
run: |
if [ "${KBS}" == "true" ]; then
bash tests/integration/kubernetes/gha-run.sh delete-coco-kbs

View File

@@ -120,12 +120,10 @@ jobs:
- name: Delete kata-deploy
if: always()
timeout-minutes: 15
run: bash tests/integration/kubernetes/gha-run.sh cleanup
- name: Delete CoCo KBS
if: always()
timeout-minutes: 10
run: |
[[ "${KATA_HYPERVISOR}" == "qemu-tdx" ]] && echo "ITA_KEY=${GH_ITA_KEY}" >> "${GITHUB_ENV}"
bash tests/integration/kubernetes/gha-run.sh delete-coco-kbs

View File

@@ -87,4 +87,4 @@ jobs:
- name: Report tests
if: always()
run: bash tests/functional/kata-deploy/gha-run.sh report-tests
run: bash tests/integration/kubernetes/gha-run.sh report-tests

54
.github/workflows/run-runk-tests.yaml vendored Normal file
View File

@@ -0,0 +1,54 @@
name: CI | Run runk tests
on:
workflow_call:
inputs:
tarball-suffix:
required: false
type: string
commit-hash:
required: false
type: string
target-branch:
required: false
type: string
default: ""
permissions: {}
jobs:
run-runk:
name: run-runk
# Skip runk tests as we have no maintainers. TODO: Decide when to remove altogether
if: false
runs-on: ubuntu-22.04
env:
CONTAINERD_VERSION: lts
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
ref: ${{ inputs.commit-hash }}
fetch-depth: 0
persist-credentials: false
- name: Rebase atop of the latest target branch
run: |
./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
env:
TARGET_BRANCH: ${{ inputs.target-branch }}
- name: Install dependencies
run: bash tests/integration/runk/gha-run.sh install-dependencies
env:
GH_TOKEN: ${{ github.token }}
- name: get-kata-tarball
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: kata-static-tarball-amd64${{ inputs.tarball-suffix }}
path: kata-artifacts
- name: Install kata
run: bash tests/integration/runk/gha-run.sh install-kata kata-artifacts
- name: Run runk tests
run: bash tests/integration/runk/gha-run.sh run

View File

@@ -6,21 +6,14 @@ on:
permissions: {}
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
stale:
name: stale
runs-on: ubuntu-22.04
permissions:
actions: write # Needed to manage caches for state persistence across runs
pull-requests: write # Needed to add/remove labels, post comments, or close PRs
steps:
- uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
with:
stale-pr-message: 'This PR has been opened without activity for 180 days. Please comment on the issue or it will be closed in 7 days.'
stale-pr-message: 'This PR has been opened without with no activity for 180 days. Comment on the issue otherwise it will be closed in 7 days'
days-before-pr-stale: 180
days-before-pr-close: 7
days-before-issue-stale: -1

View File

@@ -1,41 +0,0 @@
name: 'Stale issues with activity before a fixed date'
on:
schedule:
- cron: '0 0 * * *'
workflow_dispatch:
inputs:
date:
description: "Date of stale cut-off. All issues not updated since this date will be marked as stale. Format: YYYY-MM-DD e.g. 2022-10-09"
default: "2022-10-09"
required: false
type: string
permissions: {}
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
stale:
name: stale
runs-on: ubuntu-24.04
permissions:
actions: write # Needed to manage caches for state persistence across runs
issues: write # Needed to add/remove labels, post comments, or close issues
steps:
- name: Calculate the age to stale
run: |
echo AGE=$(( ( $(date +%s) - $(date -d "${DATE:-2022-10-09}" +%s) ) / 86400 )) >> "$GITHUB_ENV"
env:
DATE: ${{ inputs.date }}
- name: Run the stale action
uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
with:
stale-pr-message: 'This issue has had no activity for at least ${AGE} days. Please comment on the issue, or it will be closed in 30 days'
days-before-pr-stale: -1
days-before-pr-close: -1
days-before-issue-stale: ${AGE}
days-before-issue-close: 30

View File

@@ -18,6 +18,7 @@ TOOLS =
TOOLS += agent-ctl
TOOLS += kata-ctl
TOOLS += log-parser
TOOLS += runk
TOOLS += trace-forwarder
STANDARD_TARGETS = build check clean install static-checks-build test vendor

View File

@@ -139,6 +139,7 @@ The table below lists the remaining parts of the project:
| [`agent-ctl`](src/tools/agent-ctl) | utility | Tool that provides low-level access for testing the agent. |
| [`kata-ctl`](src/tools/kata-ctl) | utility | Tool that provides advanced commands and debug facilities. |
| [`trace-forwarder`](src/tools/trace-forwarder) | utility | Agent tracing helper. |
| [`runk`](src/tools/runk) | utility | Standard OCI container runtime based on the agent. |
| [`ci`](.github/workflows) | CI | Continuous Integration configuration files and scripts. |
| [`ocp-ci`](ci/openshift-ci/README.md) | CI | Continuous Integration configuration for the OpenShift pipelines. |
| [`katacontainers.io`](https://github.com/kata-containers/www.katacontainers.io) | Source for the [`katacontainers.io`](https://www.katacontainers.io) site. |

View File

@@ -1 +1 @@
3.25.0
3.24.0

View File

@@ -103,8 +103,48 @@ $ minikube ssh "grep -c -E 'vmx|svm' /proc/cpuinfo"
## Installing Kata Containers
You can now install the Kata Containers runtime components
[following the official instructions](../../tools/packaging/kata-deploy/helm-chart).
You can now install the Kata Containers runtime components. You will need a local copy of some Kata
Containers components to help with this, and then use `kubectl` on the host (that Minikube has already
configured for you) to deploy them:
```sh
$ git clone https://github.com/kata-containers/kata-containers.git
$ cd kata-containers/tools/packaging/kata-deploy
$ kubectl apply -f kata-rbac/base/kata-rbac.yaml
$ kubectl apply -f kata-deploy/base/kata-deploy.yaml
```
This installs the Kata Containers components into `/opt/kata` inside the Minikube node. It can take
a few minutes for the operation to complete. You can check the installation has worked by checking
the status of the `kata-deploy` pod, which will be executing
[this script](../../tools/packaging/kata-deploy/scripts/kata-deploy.sh),
and will be executing a `sleep infinity` once it has successfully completed its work.
You can accomplish this by running the following:
```sh
$ podname=$(kubectl -n kube-system get pods -o=name | grep -F kata-deploy | sed 's?pod/??')
$ kubectl -n kube-system exec ${podname} -- ps -ef | grep -F infinity
```
> *NOTE:* This check only works for single node clusters, which is the default for Minikube.
> For multi-node clusters, the check would need to be adapted to check `kata-deploy` had
> completed on all nodes.
## Enabling Kata Containers
Now you have installed the Kata Containers components in the Minikube node. Next, you need to configure
Kubernetes `RuntimeClass` to know when to use Kata Containers to run a pod.
### Register the runtime
Now register the `kata qemu` runtime with that class. This should result in no errors:
```sh
$ cd kata-containers/tools/packaging/kata-deploy/runtimeclasses
$ kubectl apply -f kata-runtimeClasses.yaml
```
The Kata Containers installation process should be complete and enabled in the Minikube cluster.
## Testing Kata Containers

1
src/agent/Cargo.lock generated
View File

@@ -4305,7 +4305,6 @@ checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683"
name = "test-utils"
version = "0.1.0"
dependencies = [
"libc",
"nix 0.26.4",
]

View File

@@ -1588,11 +1588,9 @@ async fn join_namespaces(
cm.apply(p.pid)?;
}
if p.init {
if let Some(resource) = res {
info!(logger, "set properties to cgroups!");
cm.set(resource, false)?;
}
if p.init && res.is_some() {
info!(logger, "set properties to cgroups!");
cm.set(res.unwrap(), false)?;
}
info!(logger, "notify child to continue");

View File

@@ -10,7 +10,7 @@ use std::fs::File;
use std::sync::{Arc, Mutex};
use crossbeam_channel::{Receiver, Sender, TryRecvError};
use log::{debug, info, warn};
use log::{debug, error, info, warn};
use std::sync::mpsc;
use tracing::instrument;

View File

@@ -24,6 +24,7 @@ use dbs_legacy_devices::ConsoleHandler;
use dbs_pci::CAPABILITY_BAR_SIZE;
use dbs_utils::epoll_manager::EpollManager;
use kvm_ioctls::VmFd;
use log::error;
use virtio_queue::QueueSync;
#[cfg(feature = "dbs-virtio-devices")]

View File

@@ -770,11 +770,10 @@ impl MachineInfo {
}
/// Huge page type for VM RAM backend
#[derive(Clone, Debug, Deserialize_enum_str, Serialize_enum_str, PartialEq, Eq, Default)]
#[derive(Clone, Debug, Deserialize_enum_str, Serialize_enum_str, PartialEq, Eq)]
pub enum HugePageType {
/// Memory allocated using hugetlbfs backend
#[serde(rename = "hugetlbfs")]
#[default]
Hugetlbfs,
/// Memory allocated using transparent huge pages
@@ -782,6 +781,12 @@ pub enum HugePageType {
THP,
}
impl Default for HugePageType {
fn default() -> Self {
Self::Hugetlbfs
}
}
/// Virtual machine memory configuration information.
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
pub struct MemoryInfo {

View File

@@ -366,8 +366,8 @@ key = "value"
let result = add_hypervisor_initdata_overrides(&encoded);
// This might fail depending on whether algorithm is required
if let Err(error) = result {
assert!(error.to_string().contains("parse initdata"));
if result.is_err() {
assert!(result.unwrap_err().to_string().contains("parse initdata"));
}
}
@@ -386,8 +386,8 @@ key = "value"
let result = add_hypervisor_initdata_overrides(&encoded);
// This might fail depending on whether version is required
if let Err(error) = result {
assert!(error.to_string().contains("parse initdata"));
if result.is_err() {
assert!(result.unwrap_err().to_string().contains("parse initdata"));
}
}
@@ -488,7 +488,7 @@ key = "value"
let valid_toml = r#"
version = "0.1.0"
algorithm = "sha384"
[data]
valid_key = "valid_value"
"#;
@@ -497,7 +497,7 @@ key = "value"
// Invalid TOML (missing version)
let invalid_toml = r#"
algorithm = "sha256"
[data]
key = "value"
"#;

View File

@@ -136,6 +136,8 @@ macro_rules! skip_loop_by_user {
#[cfg(test)]
mod tests {
use super::{skip_if_kvm_unaccessable, skip_if_not_root, skip_if_root};
#[test]
fn test_skip_if_not_root() {
skip_if_not_root!();

View File

@@ -133,17 +133,6 @@ PKGLIBEXECDIR := $(LIBEXECDIR)/$(PROJECT_DIR)
FIRMWAREPATH :=
FIRMWAREVOLUMEPATH :=
ROOTMEASURECONFIG ?= ""
KERNELTDXPARAMS += $(ROOTMEASURECONFIG)
# TDX
DEFSHAREDFS_QEMU_TDX_VIRTIOFS := none
FIRMWARETDXPATH := $(PREFIXDEPS)/share/ovmf/OVMF.inteltdx.fd
# SEV-SNP
FIRMWARE_SNP_PATH := $(PREFIXDEPS)/share/ovmf/AMDSEV.fd
FIRMWARE_VOLUME_SNP_PATH :=
##VAR DEFVCPUS=<number> Default number of vCPUs
DEFVCPUS := 1
##VAR DEFMAXVCPUS=<number> Default maximum number of vCPUs
@@ -187,7 +176,6 @@ DEFVIRTIOFSQUEUESIZE ?= 1024
# Make sure you quote args.
DEFVIRTIOFSEXTRAARGS ?= [\"--thread-pool-size=1\", \"-o\", \"announce_submounts\"]
DEFENABLEIOTHREADS := false
DEFINDEPIOTHREADS := 0
DEFENABLEVHOSTUSERSTORE := false
DEFVHOSTUSERSTOREPATH := $(PKGRUNDIR)/vhost-user
DEFVALIDVHOSTUSERSTOREPATHS := [\"$(DEFVHOSTUSERSTOREPATH)\"]
@@ -204,8 +192,6 @@ QEMUTDXQUOTEGENERATIONSERVICESOCKETPORT := 4050
DEFCREATECONTAINERTIMEOUT ?= 30
DEFCREATECONTAINERTIMEOUT_COCO ?= 60
DEFSTATICRESOURCEMGMT_COCO = true
DEFDISABLEIMAGENVDIMM ?= false
DEFPODRESOURCEAPISOCK := ""
SED = sed
CLI_DIR = cmd
@@ -306,30 +292,6 @@ ifneq (,$(QEMUCMD))
CONFIGS += $(CONFIG_QEMU)
CONFIG_FILE_QEMU_TDX = configuration-qemu-tdx-runtime-rs.toml
CONFIG_QEMU_TDX = config/$(CONFIG_FILE_QEMU_TDX)
CONFIG_QEMU_TDX_IN = $(CONFIG_QEMU_TDX).in
CONFIG_PATH_QEMU_TDX = $(abspath $(CONFDIR)/$(CONFIG_FILE_QEMU_TDX))
CONFIG_PATHS += $(CONFIG_PATH_QEMU_TDX)
SYSCONFIG_QEMU_TDX = $(abspath $(SYSCONFDIR)/$(CONFIG_FILE_QEMU_TDX))
SYSCONFIG_PATHS += $(SYSCONFIG_QEMU_TDX)
CONFIGS += $(CONFIG_QEMU_TDX)
CONFIG_FILE_QEMU_SNP = configuration-qemu-snp-runtime-rs.toml
CONFIG_QEMU_SNP = config/$(CONFIG_FILE_QEMU_SNP)
CONFIG_QEMU_SNP_IN = $(CONFIG_QEMU_SNP).in
CONFIG_PATH_QEMU_SNP = $(abspath $(CONFDIR)/$(CONFIG_FILE_QEMU_SNP))
CONFIG_PATHS += $(CONFIG_PATH_QEMU_SNP)
SYSCONFIG_QEMU_SNP = $(abspath $(SYSCONFDIR)/$(CONFIG_FILE_QEMU_SNP))
SYSCONFIG_PATHS += $(SYSCONFIG_QEMU_SNP)
CONFIGS += $(CONFIG_QEMU_SNP)
CONFIG_FILE_QEMU_SE = configuration-qemu-se-runtime-rs.toml
CONFIG_QEMU_SE = config/$(CONFIG_FILE_QEMU_SE)
CONFIG_QEMU_SE_IN = $(CONFIG_QEMU_SE).in
@@ -560,7 +522,6 @@ USER_VARS += DEFVIRTIOFSEXTRAARGS
USER_VARS += DEFENABLEANNOTATIONS
USER_VARS += DEFENABLEANNOTATIONS_COCO
USER_VARS += DEFENABLEIOTHREADS
USER_VARS += DEFINDEPIOTHREADS
USER_VARS += DEFSECCOMPSANDBOXPARAM
USER_VARS += DEFGUESTSELINUXLABEL
USER_VARS += DEFENABLEVHOSTUSERSTORE
@@ -581,7 +542,6 @@ USER_VARS += DEFSTATICRESOURCEMGMT_FC
USER_VARS += DEFSTATICRESOURCEMGMT_CLH
USER_VARS += DEFSTATICRESOURCEMGMT_QEMU
USER_VARS += DEFSTATICRESOURCEMGMT_COCO
USER_VARS += DEFDISABLEIMAGENVDIMM
USER_VARS += DEFBINDMOUNTS
USER_VARS += DEFVFIOMODE
USER_VARS += DEFVFIOMODE_SE
@@ -602,13 +562,6 @@ USER_VARS += DEFFORCEGUESTPULL
USER_VARS += QEMUTDXQUOTEGENERATIONSERVICESOCKETPORT
USER_VARS += DEFCREATECONTAINERTIMEOUT
USER_VARS += DEFCREATECONTAINERTIMEOUT_COCO
USER_VARS += QEMUTDXEXPERIMENTALCMD
USER_VARS += FIRMWARE_SNP_PATH
USER_VARS += FIRMWARE_VOLUME_SNP_PATH
USER_VARS += KERNELTDXPARAMS
USER_VARS += DEFSHAREDFS_QEMU_TDX_VIRTIOFS
USER_VARS += FIRMWARETDXPATH
USER_VARS += DEFPODRESOURCEAPISOCK
SOURCES := \
$(shell find . 2>&1 | grep -E '.*\.rs$$') \
@@ -646,8 +599,6 @@ GENERATED_VARS = \
VERSION \
CONFIG_DB_IN \
CONFIG_FC_IN \
CONFIG_QEMU_TDX_IN \
CONFIG_QEMU_SNP_IN \
$(USER_VARS)

View File

@@ -1,770 +0,0 @@
# Copyright (c) 2017-2019 Intel Corporation
# Copyright (c) 2021 Adobe Inc.
# Copyright (c) 2024 IBM Corp.
# Copyright (c) 2025-2026 Ant Group
#
# SPDX-License-Identifier: Apache-2.0
#
# XXX: WARNING: this file is auto-generated.
# XXX:
# XXX: Source file: "@CONFIG_QEMU_IN@"
# XXX: Project:
# XXX: Name: @PROJECT_NAME@
# XXX: Type: @PROJECT_TYPE@
[hypervisor.qemu]
path = "@QEMUPATH@"
kernel = "@KERNELPATH_COCO@"
initrd = "@INITRDCONFIDENTIALPATH@"
# image = "@IMAGECONFIDENTIALPATH@"
machine_type = "@MACHINETYPE@"
# Enable confidential guest support.
# Toggling that setting may trigger different hardware features, ranging
# from memory encryption to both memory and CPU-state encryption and integrity.
# The Kata Containers runtime dynamically detects the available feature set and
# aims at enabling the largest possible one, returning an error if none is
# available, or none is supported by the hypervisor.
#
# Known limitations:
# * Does not work by design:
# - CPU Hotplug
# - Memory Hotplug
# - NVDIMM devices
#
# Default false
confidential_guest = true
# Enable AMD SEV-SNP confidential guests
# In case of using confidential guests on AMD hardware that supports SEV-SNP,
# the following enables SEV-SNP guests. Default true
sev_snp_guest = true
# SNP 'ID Block' and 'ID Authentication Information Structure'.
# If one of snp_id_block or snp_id_auth is specified, the other must be specified, too.
# Notice that the default SNP policy of QEMU (0x30000) is used by Kata, if not explicitly
# set via 'snp_guest_policy' option. The IDBlock contains the guest policy as field, and
# it must match the value from 'snp_guest_policy' or, if unset, the QEMU default policy.
#
# 96-byte, base64-encoded blob to provide the ID Block structure for the
# SNP_LAUNCH_FINISH command defined in the SEV-SNP firmware ABI (QEMU default: all-zero)
snp_id_block = ""
# 4096-byte, base64-encoded blob to provide the ID Authentication Information Structure
# for the SNP_LAUNCH_FINISH command defined in the SEV-SNP firmware ABI (QEMU default: all-zero)
snp_id_auth = ""
# SNP Guest Policy, the POLICY parameter to the SNP_LAUNCH_START command.
# If unset, the QEMU default policy (0x30000) will be used.
# Notice that the guest policy is enforced at VM launch, and your pod VMs
# won't start at all if the policy denys it. This will be indicated by a
# 'SNP_LAUNCH_START' error.
snp_guest_policy = 196608
# rootfs filesystem type:
# - ext4 (default)
# - xfs
# - erofs
rootfs_type = @DEFROOTFSTYPE@
# Block storage driver to be used for the VM rootfs is backed
# by a block device. This is virtio-blk-pci, virtio-blk-mmio or nvdimm
vm_rootfs_driver = "virtio-blk-pci"
# Enable running QEMU VMM as a non-root user.
# By default QEMU VMM run as root. When this is set to true, QEMU VMM process runs as
# a non-root random user. See documentation for the limitations of this mode.
rootless = false
# List of valid annotation names for the hypervisor
# Each member of the list is a regular expression, which is the base name
# of the annotation, e.g. "path" for io.katacontainers.config.hypervisor.path"
enable_annotations = @DEFENABLEANNOTATIONS_COCO@
# List of valid annotations values for the hypervisor
# Each member of the list is a path pattern as described by glob(3).
# The default if not set is empty (all annotations rejected.)
# Your distribution recommends: @QEMUVALIDHYPERVISORPATHS@
valid_hypervisor_paths = @QEMUVALIDHYPERVISORPATHS@
# Optional space-separated list of options to pass to the guest kernel.
# For example, use `kernel_params = "vsyscall=emulate"` if you are having
# trouble running pre-2.15 glibc.
#
# WARNING: - any parameter specified here will take priority over the default
# parameter value of the same name used to start the virtual machine.
# Do not set values here unless you understand the impact of doing so as you
# may stop the virtual machine from booting.
# To see the list of default parameters, enable hypervisor debug, create a
# container and look for 'default-kernel-parameters' log entries.
kernel_params = "@KERNELPARAMS@"
# Path to the firmware.
# If you want that qemu uses the default firmware leave this option empty
firmware = "@FIRMWARE_SNP_PATH@"
# Path to the firmware volume.
# firmware TDVF or OVMF can be split into FIRMWARE_VARS.fd (UEFI variables
# as configuration) and FIRMWARE_CODE.fd (UEFI program image). UEFI variables
# can be customized per each user while UEFI code is kept same.
firmware_volume = "@FIRMWARE_VOLUME_SNP_PATH@"
# Machine accelerators
# comma-separated list of machine accelerators to pass to the hypervisor.
# For example, `machine_accelerators = "nosmm,nosmbus,nosata,nopit,static-prt,nofw"`
machine_accelerators = "@MACHINEACCELERATORS@"
# Qemu seccomp sandbox feature
# comma-separated list of seccomp sandbox features to control the syscall access.
# For example, `seccompsandbox= "on,obsolete=deny,spawn=deny,resourcecontrol=deny"`
# Note: "elevateprivileges=deny" doesn't work with daemonize option, so it's removed from the seccomp sandbox
# Another note: enabling this feature may reduce performance, you may enable
# /proc/sys/net/core/bpf_jit_enable to reduce the impact. see https://man7.org/linux/man-pages/man8/bpfc.8.html
# Recommended value when enabling: "on,obsolete=deny,spawn=deny,resourcecontrol=deny"
seccompsandbox = "@DEFSECCOMPSANDBOXPARAM@"
# CPU features
# comma-separated list of cpu features to pass to the cpu
# For example, `cpu_features = "pmu=off,vmx=off"
cpu_features = "@CPUFEATURES@"
# Default number of vCPUs per SB/VM:
# unspecified or 0 --> will be set to @DEFVCPUS@
# < 0 --> will be set to the actual number of physical cores
# > 0 <= number of physical cores --> will be set to the specified number
# > number of physical cores --> will be set to the actual number of physical cores
default_vcpus = @DEFVCPUS_QEMU@
# Default maximum number of vCPUs per SB/VM:
# unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number
# of vCPUs supported by KVM if that number is exceeded
# > 0 <= number of physical cores --> will be set to the specified number
# > number of physical cores --> will be set to the actual number of physical cores or to the maximum number
# of vCPUs supported by KVM if that number is exceeded
# WARNING: Depending of the architecture, the maximum number of vCPUs supported by KVM is used when
# the actual number of physical cores is greater than it.
# WARNING: Be aware that this value impacts the virtual machine's memory footprint and CPU
# the hotplug functionality. For example, `default_maxvcpus = 240` specifies that until 240 vCPUs
# can be added to a SB/VM, but the memory footprint will be big. Another example, with
# `default_maxvcpus = 8` the memory footprint will be small, but 8 will be the maximum number of
# vCPUs supported by the SB/VM. In general, we recommend that you do not edit this variable,
# unless you know what are you doing.
# NOTICE: on arm platform with gicv2 interrupt controller, set it to 8.
default_maxvcpus = @DEFMAXVCPUS_QEMU@
# Bridges can be used to hot plug devices.
# Limitations:
# * Currently only pci bridges are supported
# * Until 30 devices per bridge can be hot plugged.
# * Until 5 PCI bridges can be cold plugged per VM.
# This limitation could be a bug in qemu or in the kernel
# Default number of bridges per SB/VM:
# unspecified or 0 --> will be set to @DEFBRIDGES@
# > 1 <= 5 --> will be set to the specified number
# > 5 --> will be set to 5
default_bridges = @DEFBRIDGES@
# Default memory size in MiB for SB/VM.
# If unspecified then it will be set @DEFMEMSZ@ MiB.
default_memory = @DEFMEMSZ@
#
# Default memory slots per SB/VM.
# If unspecified then it will be set @DEFMEMSLOTS@.
# This is will determine the times that memory will be hotadded to sandbox/VM.
memory_slots = @DEFMEMSLOTS@
# Default maximum memory in MiB per SB / VM
# unspecified or == 0 --> will be set to the actual amount of physical RAM
# > 0 <= amount of physical RAM --> will be set to the specified number
# > amount of physical RAM --> will be set to the actual amount of physical RAM
default_maxmemory = @DEFMAXMEMSZ@
# The size in MiB will be plused to max memory of hypervisor.
# It is the memory address space for the NVDIMM device.
# If set block storage driver (block_device_driver) to "nvdimm",
# should set memory_offset to the size of block device.
# Default 0
memory_offset = 0
# Specifies virtio-mem will be enabled or not.
# Please note that this option should be used with the command
# "echo 1 > /proc/sys/vm/overcommit_memory".
# Default false
enable_virtio_mem = false
# Disable block device from being used for a container's rootfs.
# In case of a storage driver like devicemapper where a container's
# root file system is backed by a block device, the block device is passed
# directly to the hypervisor for performance reasons.
# This flag prevents the block device from being passed to the hypervisor,
# virtio-fs is used instead to pass the rootfs.
disable_block_device_use = @DEFDISABLEBLOCK@
# Shared file system type:
# - virtio-fs (default)
# - virtio-fs-nydus
# - none
shared_fs = "none"
# Path to vhost-user-fs daemon.
virtio_fs_daemon = "@DEFVIRTIOFSDAEMON@"
# List of valid annotations values for the virtiofs daemon
# The default if not set is empty (all annotations rejected.)
# Your distribution recommends: @DEFVALIDVIRTIOFSDAEMONPATHS@
valid_virtio_fs_daemon_paths = @DEFVALIDVIRTIOFSDAEMONPATHS@
# Default size of DAX cache in MiB
virtio_fs_cache_size = @DEFVIRTIOFSCACHESIZE@
# Default size of virtqueues
virtio_fs_queue_size = @DEFVIRTIOFSQUEUESIZE@
# Extra args for virtiofsd daemon
#
# Format example:
# ["-o", "arg1=xxx,arg2", "-o", "hello world", "--arg3=yyy"]
# Examples:
# Set virtiofsd log level to debug : ["-o", "log_level=debug"] or ["-d"]
#
# see `virtiofsd -h` for possible options.
virtio_fs_extra_args = @DEFVIRTIOFSEXTRAARGS@
# Cache mode:
#
# - never
# Metadata, data, and pathname lookup are not cached in guest. They are
# always fetched from host and any changes are immediately pushed to host.
#
# - metadata
# Metadata and pathname lookup are cached in guest and never expire.
# Data is never cached in guest.
#
# - auto
# Metadata and pathname lookup cache expires after a configured amount of
# time (default is 1 second). Data is cached while the file is open (close
# to open consistency).
#
# - always
# Metadata, data, and pathname lookup are cached in guest and never expire.
virtio_fs_cache = "@DEFVIRTIOFSCACHE@"
# Block storage driver to be used for the hypervisor in case the container
# rootfs is backed by a block device. This is virtio-scsi, virtio-blk
# or nvdimm.
block_device_driver = "@DEFBLOCKSTORAGEDRIVER_QEMU@"
# aio is the I/O mechanism used by qemu
# Options:
#
# - threads
# Pthread based disk I/O.
#
# - native
# Native Linux I/O.
#
# - io_uring
# Linux io_uring API. This provides the fastest I/O operations on Linux, requires kernel>5.1 and
# qemu >=5.0.
block_device_aio = "@DEFBLOCKDEVICEAIO_QEMU@"
# Specifies cache-related options will be set to block devices or not.
# Default false
block_device_cache_set = false
# Specifies cache-related options for block devices.
# Denotes whether use of O_DIRECT (bypass the host page cache) is enabled.
# Default false
block_device_cache_direct = false
# Specifies cache-related options for block devices.
# Denotes whether flush requests for the device are ignored.
# Default false
block_device_cache_noflush = false
# Enable iothreads (data-plane) to be used. This causes IO to be
# handled in a separate IO thread. This is currently only implemented
# for SCSI.
#
enable_iothreads = @DEFENABLEIOTHREADS@
# Independent IOThreads enables IO to be processed in a separate thread, it is
# for QEMU hotplug device attach to iothread, like virtio-blk.
indep_iothreads = @DEFINDEPIOTHREADS@
# Enable pre allocation of VM RAM, default false
# Enabling this will result in lower container density
# as all of the memory will be allocated and locked
# This is useful when you want to reserve all the memory
# upfront or in the cases where you want memory latencies
# to be very predictable
# Default false
enable_mem_prealloc = false
# Reclaim guest freed memory.
# Enabling this will result in the VM balloon device having f_reporting=on set.
# Then the hypervisor will use it to reclaim guest freed memory.
# This is useful for reducing the amount of memory used by a VM.
# Enabling this feature may sometimes reduce the speed of memory access in
# the VM.
#
# Default false
reclaim_guest_freed_memory = false
# Enable huge pages for VM RAM, default false
# Enabling this will result in the VM memory
# being allocated using huge pages.
# This is useful when you want to use vhost-user network
# stacks within the container. This will automatically
# result in memory pre allocation
enable_hugepages = false
# Enable vhost-user storage device, default false
# Enabling this will result in some Linux reserved block type
# major range 240-254 being chosen to represent vhost-user devices.
enable_vhost_user_store = @DEFENABLEVHOSTUSERSTORE@
# The base directory specifically used for vhost-user devices.
# Its sub-path "block" is used for block devices; "block/sockets" is
# where we expect vhost-user sockets to live; "block/devices" is where
# simulated block device nodes for vhost-user devices to live.
vhost_user_store_path = "@DEFVHOSTUSERSTOREPATH@"
# Enable vIOMMU, default false
# Enabling this will result in the VM having a vIOMMU device
# This will also add the following options to the kernel's
# command line: intel_iommu=on,iommu=pt
enable_iommu = false
# Enable IOMMU_PLATFORM, default false
# Enabling this will result in the VM device having iommu_platform=on set
enable_iommu_platform = false
# List of valid annotations values for the vhost user store path
# The default if not set is empty (all annotations rejected.)
# Your distribution recommends: @DEFVALIDVHOSTUSERSTOREPATHS@
valid_vhost_user_store_paths = @DEFVALIDVHOSTUSERSTOREPATHS@
# The timeout for reconnecting on non-server spdk sockets when the remote end goes away.
# qemu will delay this many seconds and then attempt to reconnect.
# Zero disables reconnecting, and the default is zero.
vhost_user_reconnect_timeout_sec = 0
# Enable file based guest memory support. The default is an empty string which
# will disable this feature. In the case of virtio-fs, this is enabled
# automatically and '/dev/shm' is used as the backing folder.
# This option will be ignored if VM templating is enabled.
file_mem_backend = ""
# List of valid annotations values for the file_mem_backend annotation
# The default if not set is empty (all annotations rejected.)
# Your distribution recommends: @DEFVALIDFILEMEMBACKENDS@
valid_file_mem_backends = @DEFVALIDFILEMEMBACKENDS@
# -pflash can add image file to VM. The arguments of it should be in format
# of ["/path/to/flash0.img", "/path/to/flash1.img"]
pflashes = []
# This option changes the default hypervisor and kernel parameters
# to enable debug output where available. And Debug also enable the hmp socket.
#
# Default false
enable_debug = false
# Disable the customizations done in the runtime when it detects
# that it is running on top a VMM. This will result in the runtime
# behaving as it would when running on bare metal.
#
disable_nesting_checks = true
# If false and nvdimm is supported, use nvdimm device to plug guest image.
# Otherwise virtio-block device is used.
#
# nvdimm is not supported when `confidential_guest = true`.
disable_image_nvdimm = @DEFDISABLEIMAGENVDIMM@
# Before hot plugging a PCIe device, you need to add a pcie_root_port device.
# Use this parameter when using some large PCI bar devices, such as Nvidia GPU
# The value means the number of pcie_root_port
# Default 0
pcie_root_port = 0
# If vhost-net backend for virtio-net is not desired, set to true. Default is false, which trades off
# security (vhost-net runs ring0) for network I/O performance.
disable_vhost_net = false
# This option allows to add an extra HMP or QMP socket when `enable_debug = true`
#
# WARNING: Anyone with access to the extra socket can take full control of
# Qemu. This is for debugging purpose only and must *NEVER* be used in
# production.
#
# Valid values are :
# - "hmp"
# - "qmp"
# - "qmp-pretty" (same as "qmp" with pretty json formatting)
#
# If set to the empty string "", no extra monitor socket is added. This is
# the default.
#extra_monitor_socket = "hmp"
#
# Default entropy source.
# The path to a host source of entropy (including a real hardware RNG)
# /dev/urandom and /dev/random are two main options.
# Be aware that /dev/random is a blocking source of entropy. If the host
# runs out of entropy, the VMs boot time will increase leading to get startup
# timeouts.
# The source of entropy /dev/urandom is non-blocking and provides a
# generally acceptable source of entropy. It should work well for pretty much
# all practical purposes.
entropy_source = "@DEFENTROPYSOURCE@"
# List of valid annotations values for entropy_source
# The default if not set is empty (all annotations rejected.)
# Your distribution recommends: @DEFVALIDENTROPYSOURCES@
valid_entropy_sources = @DEFVALIDENTROPYSOURCES@
# Path to OCI hook binaries in the *guest rootfs*.
# This does not affect host-side hooks which must instead be added to
# the OCI spec passed to the runtime.
#
# You can create a rootfs with hooks by customizing the osbuilder scripts:
# https://github.com/kata-containers/kata-containers/tree/main/tools/osbuilder
#
# Hooks must be stored in a subdirectory of guest_hook_path according to their
# hook type, i.e. "guest_hook_path/{prestart,poststart,poststop}".
# The agent will scan these directories for executable files and add them, in
# lexicographical order, to the lifecycle of the guest container.
# Hooks are executed in the runtime namespace of the guest. See the official documentation:
# https://github.com/opencontainers/runtime-spec/blob/v1.0.1/config.md#posix-platform-hooks
# Warnings will be logged if any error is encountered while scanning for hooks,
# but it will not abort container execution.
# Recommended value when enabling: "/usr/share/oci/hooks"
guest_hook_path = ""
#
# Use rx Rate Limiter to control network I/O inbound bandwidth(size in bits/sec for SB/VM).
# In Qemu, we use classful qdiscs HTB(Hierarchy Token Bucket) to discipline traffic.
# Default 0-sized value means unlimited rate.
rx_rate_limiter_max_rate = 0
# Use tx Rate Limiter to control network I/O outbound bandwidth(size in bits/sec for SB/VM).
# In Qemu, we use classful qdiscs HTB(Hierarchy Token Bucket) and ifb(Intermediate Functional Block)
# to discipline traffic.
# Default 0-sized value means unlimited rate.
tx_rate_limiter_max_rate = 0
# Set where to save the guest memory dump file.
# If set, when GUEST_PANICKED event occurred,
# guest memeory will be dumped to host filesystem under guest_memory_dump_path,
# This directory will be created automatically if it does not exist.
#
# The dumped file(also called vmcore) can be processed with crash or gdb.
#
# WARNING:
# Dump guest's memory can take very long depending on the amount of guest memory
# and use much disk space.
# Recommended value when enabling: "/var/crash/kata"
guest_memory_dump_path = ""
# If enable paging.
# Basically, if you want to use "gdb" rather than "crash",
# or need the guest-virtual addresses in the ELF vmcore,
# then you should enable paging.
#
# See: https://www.qemu.org/docs/master/qemu-qmp-ref.html#Dump-guest-memory for details
guest_memory_dump_paging = false
# Enable swap in the guest. Default false.
# When enable_guest_swap is enabled, insert a raw file to the guest as the swap device
# if the swappiness of a container (set by annotation "io.katacontainers.container.resource.swappiness")
# is bigger than 0.
# The size of the swap device should be
# swap_in_bytes (set by annotation "io.katacontainers.container.resource.swap_in_bytes") - memory_limit_in_bytes.
# If swap_in_bytes is not set, the size should be memory_limit_in_bytes.
# If swap_in_bytes and memory_limit_in_bytes is not set, the size should
# be default_memory.
enable_guest_swap = false
# use legacy serial for guest console if available and implemented for architecture. Default false
use_legacy_serial = false
# disable applying SELinux on the VMM process (default false)
disable_selinux = @DEFDISABLESELINUX@
# disable applying SELinux on the container process
# If set to false, the type `container_t` is applied to the container process by default.
# Note: To enable guest SELinux, the guest rootfs must be CentOS that is created and built
# with `SELINUX=yes`.
# (default: true)
disable_guest_selinux = @DEFDISABLEGUESTSELINUX@
[factory]
# VM templating support. Once enabled, new VMs are created from template
# using vm cloning. They will share the same initial kernel, initramfs and
# agent memory by mapping it readonly. It helps speeding up new container
# creation and saves a lot of memory if there are many kata containers running
# on the same host.
#
# When disabled, new VMs are created from scratch.
#
# Note: Requires "initrd=" to be set ("image=" is not supported).
#
# Default false
enable_template = false
# Specifies the path of template.
#
# Default "/run/vc/vm/template"
template_path = "/run/vc/vm/template"
# The number of caches of VMCache:
# unspecified or == 0 --> VMCache is disabled
# > 0 --> will be set to the specified number
#
# VMCache is a function that creates VMs as caches before using it.
# It helps speed up new container creation.
# The function consists of a server and some clients communicating
# through Unix socket. The protocol is gRPC in protocols/cache/cache.proto.
# The VMCache server will create some VMs and cache them by factory cache.
# It will convert the VM to gRPC format and transport it when gets
# requestion from clients.
# Factory grpccache is the VMCache client. It will request gRPC format
# VM and convert it back to a VM. If VMCache function is enabled,
# kata-runtime will request VM from factory grpccache when it creates
# a new sandbox.
#
# Default 0
vm_cache_number = 0
# Specify the address of the Unix socket that is used by VMCache.
#
# Default /var/run/kata-containers/cache.sock
vm_cache_endpoint = "/var/run/kata-containers/cache.sock"
[agent.@PROJECT_TYPE@]
# If enabled, make the agent display debug-level messages.
# (default: disabled)
enable_debug = false
# Enable agent tracing.
#
# If enabled, the agent will generate OpenTelemetry trace spans.
#
# Notes:
#
# - If the runtime also has tracing enabled, the agent spans will be
# associated with the appropriate runtime parent span.
# - If enabled, the runtime will wait for the container to shutdown,
# increasing the container shutdown time slightly.
#
# (default: disabled)
enable_tracing = false
# Comma separated list of kernel modules and their parameters.
# These modules will be loaded in the guest kernel using modprobe(8).
# The following example can be used to load two kernel modules with parameters
# - kernel_modules=["e1000e InterruptThrottleRate=3000,3000,3000 EEE=1", "i915 enable_ppgtt=0"]
# The first word is considered as the module name and the rest as its parameters.
# Container will not be started when:
# * A kernel module is specified and the modprobe command is not installed in the guest
# or it fails loading the module.
# * The module is not available in the guest or it doesn't met the guest kernel
# requirements, like architecture and version.
#
kernel_modules = []
# Enable debug console.
# If enabled, user can connect guest OS running inside hypervisor
# through "kata-runtime exec <sandbox-id>" command
debug_console_enabled = false
# Agent dial timeout in millisecond.
# (default: 10)
dial_timeout_ms = 10
# Agent reconnect timeout in millisecond.
# Retry times = reconnect_timeout_ms / dial_timeout_ms (default: 300)
# If you find pod cannot connect to the agent when starting, please
# consider increasing this value to increase the retry times.
# You'd better not change the value of dial_timeout_ms, unless you have an
# idea of what you are doing.
# (default: 3000)
reconnect_timeout_ms = 3000
# Create Container Request Timeout
# This timeout value is used to set the maximum duration for the agent to process a CreateContainerRequest.
# It's also used to ensure that workloads, especially those involving large image pulls within the guest,
# have sufficient time to complete.
#
# Effective Timeout Determination:
# The effective timeout for a CreateContainerRequest is determined by taking the minimum of the following two values:
# - create_container_timeout: The timeout value configured for creating containers (default: 30,000 milliseconds).
# - runtime-request-timeout: The timeout value specified in the Kubelet configuration described as the link below:
# (https://kubernetes.io/docs/reference/command-line-tools-reference/kubelet/#:~:text=runtime%2Drequest%2Dtimeout)
# Defaults to @DEFCREATECONTAINERTIMEOUT_COCO@ second(s)
create_container_timeout = @DEFCREATECONTAINERTIMEOUT_COCO@
[runtime]
# If enabled, the runtime will log additional debug messages to the
# system log
# (default: disabled)
enable_debug = false
#
# Internetworking model
# Determines how the VM should be connected to the
# the container network interface
# Options:
#
# - macvtap
# Used when the Container network interface can be bridged using
# macvtap.
#
# - none
# Used when customize network. Only creates a tap device. No veth pair.
#
# - tcfilter
# Uses tc filter rules to redirect traffic from the network interface
# provided by plugin to a tap interface connected to the VM.
#
internetworking_model="@DEFNETWORKMODEL_QEMU@"
name="@RUNTIMENAME@"
hypervisor_name="@HYPERVISOR_QEMU@"
agent_name="@PROJECT_TYPE@"
# disable guest seccomp
# Determines whether container seccomp profiles are passed to the virtual
# machine and applied by the kata agent. If set to true, seccomp is not applied
# within the guest
# (default: true)
disable_guest_seccomp = @DEFDISABLEGUESTSECCOMP@
# vCPUs pinning settings
# if enabled, each vCPU thread will be scheduled to a fixed CPU
# qualified condition: num(vCPU threads) == num(CPUs in sandbox's CPUSet)
enable_vcpus_pinning = false
# Apply a custom SELinux security policy to the container process inside the VM.
# This is used when you want to apply a type other than the default `container_t`,
# so general users should not uncomment and apply it.
# (format: "user:role:type")
# Note: You cannot specify MCS policy with the label because the sensitivity levels and
# categories are determined automatically by high-level container runtimes such as containerd.
guest_selinux_label = "@DEFGUESTSELINUXLABEL@"
# If enabled, the runtime will create opentracing.io traces and spans.
# (See https://www.jaegertracing.io/docs/getting-started).
# (default: disabled)
enable_tracing = false
# Set the full url to the Jaeger HTTP Thrift collector.
# The default if not set will be "http://localhost:14268/api/traces"
jaeger_endpoint = ""
# Sets the username to be used if basic auth is required for Jaeger.
jaeger_user = ""
# Sets the password to be used if basic auth is required for Jaeger.
jaeger_password = ""
# If enabled, the runtime will not create a network namespace for shim and hypervisor processes.
# This option may have some potential impacts to your host. It should only be used when you know what you're doing.
# `disable_new_netns` conflicts with `internetworking_model=tcfilter` and `internetworking_model=macvtap`. It works only
# with `internetworking_model=none`. The tap device will be in the host network namespace and can connect to a bridge
# (like OVS) directly.
# (default: false)
disable_new_netns = false
# if enabled, the runtime will add all the kata processes inside one dedicated cgroup.
# The container cgroups in the host are not created, just one single cgroup per sandbox.
# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox.
# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation.
# The sandbox cgroup is constrained if there is no container type annotation.
# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType
sandbox_cgroup_only = @DEFSANDBOXCGROUPONLY_QEMU@
# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In
# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful
# when a hardware architecture or hypervisor solutions is utilized which does not support CPU and/or memory hotplug.
# Compatibility for determining appropriate sandbox (VM) size:
# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O
# does not yet support sandbox sizing annotations.
# - When running single containers using a tool like ctr, container sizing information will be available.
static_sandbox_resource_mgmt = @DEFSTATICRESOURCEMGMT_COCO@
# If specified, sandbox_bind_mounts identifieds host paths to be mounted (ro) into the sandboxes shared path.
# This is only valid if filesystem sharing is utilized. The provided path(s) will be bindmounted into the shared fs directory.
# If defaults are utilized, these mounts should be available in the guest at `/run/kata-containers/shared/containers/sandbox-mounts`
# These will not be exposed to the container workloads, and are only provided for potential guest services.
sandbox_bind_mounts = @DEFBINDMOUNTS@
# VFIO Mode
# Determines how VFIO devices should be be presented to the container.
# Options:
#
# - vfio
# Matches behaviour of OCI runtimes (e.g. runc) as much as
# possible. VFIO devices will appear in the container as VFIO
# character devices under /dev/vfio. The exact names may differ
# from the host (they need to match the VM's IOMMU group numbers
# rather than the host's)
#
# - guest-kernel
# This is a Kata-specific behaviour that's useful in certain cases.
# The VFIO device is managed by whatever driver in the VM kernel
# claims it. This means it will appear as one or more device nodes
# or network interfaces depending on the nature of the device.
# Using this mode requires specially built workloads that know how
# to locate the relevant device interfaces within the VM.
#
vfio_mode = "@DEFVFIOMODE@"
# If enabled, the runtime will not create Kubernetes emptyDir mounts on the guest filesystem. Instead, emptyDir mounts will
# be created on the host and shared via virtio-fs. This is potentially slower, but allows sharing of files from host to guest.
disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
# Enabled experimental feature list, format: ["a", "b"].
# Experimental features are features not stable enough for production,
# they may break compatibility, and are prepared for a big version bump.
# Supported experimental features:
# for example:
# experimental=["force_guest_pull"]
# which is for enable force_guest_pull mode in CoCo scenarios.
# (default: [])
experimental = @DEFAULTEXPFEATURES@
# If enabled, user can run pprof tools with shim v2 process through kata-monitor.
# (default: false)
enable_pprof = false
# Base directory of directly attachable network config.
# Network devices for VM-based containers are allowed to be placed in the
# host netns to eliminate as many hops as possible, which is what we
# called a "Directly Attachable Network". The config, set by special CNI
# plugins, is used to tell the Kata containers what devices are attached
# to the hypervisor.
# (default: /run/kata-containers/dans)
dan_conf = "@DEFDANCONF@"
# pod_resource_api_sock specifies the unix socket for the Kubelet's
# PodResource API endpoint. If empty, kubernetes based cold plug
# will not be attempted. In order for this feature to work, the
# KubeletPodResourcesGet featureGate must be enabled in Kubelet,
# if using Kubelet older than 1.34.
#
# The pod resource API's socket is relative to the Kubelet's root-dir,
# which is defined by the cluster admin, and its location is:
# ${KubeletRootDir}/pod-resources/kubelet.sock
#
# cold_plug_vfio(see hypervisor config) acts as a feature gate:
# cold_plug_vfio = no_port (default) => no cold plug
# cold_plug_vfio != no_port AND pod_resource_api_sock = "" => need
# explicit CDI annotation for cold plug (applies mainly
# to non-k8s cases)
# cold_plug_vfio != no_port AND pod_resource_api_sock != "" => kubelet
# based cold plug.
pod_resource_api_sock = "@DEFPODRESOURCEAPISOCK@"

View File

@@ -1,746 +0,0 @@
# Copyright (c) 2017-2019 Intel Corporation
# Copyright (c) 2021 Adobe Inc.
# Copyright (c) 2025-2026 Ant Group
#
# SPDX-License-Identifier: Apache-2.0
#
# XXX: WARNING: this file is auto-generated.
# XXX:
# XXX: Source file: "@CONFIG_QEMU_IN@"
# XXX: Project:
# XXX: Name: @PROJECT_NAME@
# XXX: Type: @PROJECT_TYPE@
[hypervisor.qemu]
path = "@QEMUPATH@"
kernel = "@KERNELPATH_COCO@"
image = "@IMAGECONFIDENTIALPATH@"
# initrd = "@INITRDPATH@"
machine_type = "@MACHINETYPE@"
tdx_quote_generation_service_socket_port = @QEMUTDXQUOTEGENERATIONSERVICESOCKETPORT@
# rootfs filesystem type:
# - ext4 (default)
# - xfs
# - erofs
rootfs_type = @DEFROOTFSTYPE@
# Block storage driver to be used for the VM rootfs is backed
# by a block device. This is virtio-blk-pci, virtio-blk-mmio or nvdimm
vm_rootfs_driver = "virtio-blk-pci"
# Enable confidential guest support.
# Toggling that setting may trigger different hardware features, ranging
# from memory encryption to both memory and CPU-state encryption and integrity.
# The Kata Containers runtime dynamically detects the available feature set and
# aims at enabling the largest possible one, returning an error if none is
# available, or none is supported by the hypervisor.
#
# Known limitations:
# * Does not work by design:
# - CPU Hotplug
# - Memory Hotplug
# - NVDIMM devices
#
# Default false
confidential_guest = true
# Enable running QEMU VMM as a non-root user.
# By default QEMU VMM run as root. When this is set to true, QEMU VMM process runs as
# a non-root random user. See documentation for the limitations of this mode.
rootless = false
# List of valid annotation names for the hypervisor
# Each member of the list is a regular expression, which is the base name
# of the annotation, e.g. "path" for io.katacontainers.config.hypervisor.path"
enable_annotations = @DEFENABLEANNOTATIONS_COCO@
# List of valid annotations values for the hypervisor
# Each member of the list is a path pattern as described by glob(3).
# The default if not set is empty (all annotations rejected.)
# Your distribution recommends: @QEMUVALIDHYPERVISORPATHS@
valid_hypervisor_paths = @QEMUVALIDHYPERVISORPATHS@
# Optional space-separated list of options to pass to the guest kernel.
# For example, use `kernel_params = "vsyscall=emulate"` if you are having
# trouble running pre-2.15 glibc.
#
# WARNING: - any parameter specified here will take priority over the default
# parameter value of the same name used to start the virtual machine.
# Do not set values here unless you understand the impact of doing so as you
# may stop the virtual machine from booting.
# To see the list of default parameters, enable hypervisor debug, create a
# container and look for 'default-kernel-parameters' log entries.
kernel_params = "@KERNELTDXPARAMS@"
# Path to the firmware.
# If you want that qemu uses the default firmware leave this option empty
firmware = "@FIRMWARETDXPATH@"
# Path to the firmware volume.
# firmware TDVF or OVMF can be split into FIRMWARE_VARS.fd (UEFI variables
# as configuration) and FIRMWARE_CODE.fd (UEFI program image). UEFI variables
# can be customized per each user while UEFI code is kept same.
firmware_volume = "@FIRMWAREVOLUMEPATH@"
# Machine accelerators
# comma-separated list of machine accelerators to pass to the hypervisor.
# For example, `machine_accelerators = "nosmm,nosmbus,nosata,nopit,static-prt,nofw"`
machine_accelerators = "@MACHINEACCELERATORS@"
# Qemu seccomp sandbox feature
# comma-separated list of seccomp sandbox features to control the syscall access.
# For example, `seccompsandbox= "on,obsolete=deny,spawn=deny,resourcecontrol=deny"`
# Note: "elevateprivileges=deny" doesn't work with daemonize option, so it's removed from the seccomp sandbox
# Another note: enabling this feature may reduce performance, you may enable
# /proc/sys/net/core/bpf_jit_enable to reduce the impact. see https://man7.org/linux/man-pages/man8/bpfc.8.html
# Recommended value when enabling: "on,obsolete=deny,spawn=deny,resourcecontrol=deny"
seccompsandbox = "@DEFSECCOMPSANDBOXPARAM@"
# CPU features
# comma-separated list of cpu features to pass to the cpu
# For example, `cpu_features = "pmu=off,vmx=off"
cpu_features = "@CPUFEATURES@"
# Default number of vCPUs per SB/VM:
# unspecified or 0 --> will be set to @DEFVCPUS@
# < 0 --> will be set to the actual number of physical cores
# > 0 <= number of physical cores --> will be set to the specified number
# > number of physical cores --> will be set to the actual number of physical cores
default_vcpus = 1
# Default maximum number of vCPUs per SB/VM:
# unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number
# of vCPUs supported by KVM if that number is exceeded
# > 0 <= number of physical cores --> will be set to the specified number
# > number of physical cores --> will be set to the actual number of physical cores or to the maximum number
# of vCPUs supported by KVM if that number is exceeded
# WARNING: Depending of the architecture, the maximum number of vCPUs supported by KVM is used when
# the actual number of physical cores is greater than it.
# WARNING: Be aware that this value impacts the virtual machine's memory footprint and CPU
# the hotplug functionality. For example, `default_maxvcpus = 240` specifies that until 240 vCPUs
# can be added to a SB/VM, but the memory footprint will be big. Another example, with
# `default_maxvcpus = 8` the memory footprint will be small, but 8 will be the maximum number of
# vCPUs supported by the SB/VM. In general, we recommend that you do not edit this variable,
# unless you know what are you doing.
# NOTICE: on arm platform with gicv2 interrupt controller, set it to 8.
default_maxvcpus = @DEFMAXVCPUS@
# Bridges can be used to hot plug devices.
# Limitations:
# * Currently only pci bridges are supported
# * Until 30 devices per bridge can be hot plugged.
# * Until 5 PCI bridges can be cold plugged per VM.
# This limitation could be a bug in qemu or in the kernel
# Default number of bridges per SB/VM:
# unspecified or 0 --> will be set to @DEFBRIDGES@
# > 1 <= 5 --> will be set to the specified number
# > 5 --> will be set to 5
default_bridges = @DEFBRIDGES@
# Default memory size in MiB for SB/VM.
# If unspecified then it will be set @DEFMEMSZ@ MiB.
default_memory = @DEFMEMSZ@
#
# Default memory slots per SB/VM.
# If unspecified then it will be set @DEFMEMSLOTS@.
# This is will determine the times that memory will be hotadded to sandbox/VM.
memory_slots = @DEFMEMSLOTS@
# Default maximum memory in MiB per SB / VM
# unspecified or == 0 --> will be set to the actual amount of physical RAM
# > 0 <= amount of physical RAM --> will be set to the specified number
# > amount of physical RAM --> will be set to the actual amount of physical RAM
default_maxmemory = @DEFMAXMEMSZ@
# The size in MiB will be plused to max memory of hypervisor.
# It is the memory address space for the NVDIMM device.
# If set block storage driver (block_device_driver) to "nvdimm",
# should set memory_offset to the size of block device.
# Default 0
memory_offset = 0
# Specifies virtio-mem will be enabled or not.
# Please note that this option should be used with the command
# "echo 1 > /proc/sys/vm/overcommit_memory".
# Default false
enable_virtio_mem = false
# Disable block device from being used for a container's rootfs.
# In case of a storage driver like devicemapper where a container's
# root file system is backed by a block device, the block device is passed
# directly to the hypervisor for performance reasons.
# This flag prevents the block device from being passed to the hypervisor,
# virtio-fs is used instead to pass the rootfs.
disable_block_device_use = @DEFDISABLEBLOCK@
# Shared file system type:
# - virtio-fs (default)
# - virtio-fs-nydus
# - none
shared_fs = "@DEFSHAREDFS_QEMU_TDX_VIRTIOFS@"
# Path to vhost-user-fs daemon.
virtio_fs_daemon = "@DEFVIRTIOFSDAEMON@"
# List of valid annotations values for the virtiofs daemon
# The default if not set is empty (all annotations rejected.)
# Your distribution recommends: @DEFVALIDVIRTIOFSDAEMONPATHS@
valid_virtio_fs_daemon_paths = @DEFVALIDVIRTIOFSDAEMONPATHS@
# Default size of DAX cache in MiB
virtio_fs_cache_size = @DEFVIRTIOFSCACHESIZE@
# Default size of virtqueues
virtio_fs_queue_size = @DEFVIRTIOFSQUEUESIZE@
# Extra args for virtiofsd daemon
#
# Format example:
# ["-o", "arg1=xxx,arg2", "-o", "hello world", "--arg3=yyy"]
# Examples:
# Set virtiofsd log level to debug : ["-o", "log_level=debug"] or ["-d"]
#
# see `virtiofsd -h` for possible options.
virtio_fs_extra_args = @DEFVIRTIOFSEXTRAARGS@
# Cache mode:
#
# - never
# Metadata, data, and pathname lookup are not cached in guest. They are
# always fetched from host and any changes are immediately pushed to host.
#
# - metadata
# Metadata and pathname lookup are cached in guest and never expire.
# Data is never cached in guest.
#
# - auto
# Metadata and pathname lookup cache expires after a configured amount of
# time (default is 1 second). Data is cached while the file is open (close
# to open consistency).
#
# - always
# Metadata, data, and pathname lookup are cached in guest and never expire.
virtio_fs_cache = "@DEFVIRTIOFSCACHE@"
# Block storage driver to be used for the hypervisor in case the container
# rootfs is backed by a block device. This is virtio-scsi, virtio-blk
# or nvdimm.
block_device_driver = "@DEFBLOCKSTORAGEDRIVER_QEMU@"
# aio is the I/O mechanism used by qemu
# Options:
#
# - threads
# Pthread based disk I/O.
#
# - native
# Native Linux I/O.
#
# - io_uring
# Linux io_uring API. This provides the fastest I/O operations on Linux, requires kernel>5.1 and
# qemu >=5.0.
block_device_aio = "@DEFBLOCKDEVICEAIO_QEMU@"
# Specifies cache-related options will be set to block devices or not.
# Default false
block_device_cache_set = false
# Specifies cache-related options for block devices.
# Denotes whether use of O_DIRECT (bypass the host page cache) is enabled.
# Default false
block_device_cache_direct = false
# Specifies cache-related options for block devices.
# Denotes whether flush requests for the device are ignored.
# Default false
block_device_cache_noflush = false
# Enable iothreads (data-plane) to be used. This causes IO to be
# handled in a separate IO thread. This is currently implemented
# for virtio-scsi and virtio-blk.
#
enable_iothreads = @DEFENABLEIOTHREADS@
# Independent IOThreads enables IO to be processed in a separate thread, it is
# for QEMU hotplug device attach to iothread, like virtio-blk.
indep_iothreads = @DEFINDEPIOTHREADS@
# Enable pre allocation of VM RAM, default false
# Enabling this will result in lower container density
# as all of the memory will be allocated and locked
# This is useful when you want to reserve all the memory
# upfront or in the cases where you want memory latencies
# to be very predictable
# Default false
enable_mem_prealloc = false
# Reclaim guest freed memory.
# Enabling this will result in the VM balloon device having f_reporting=on set.
# Then the hypervisor will use it to reclaim guest freed memory.
# This is useful for reducing the amount of memory used by a VM.
# Enabling this feature may sometimes reduce the speed of memory access in
# the VM.
#
# Default false
reclaim_guest_freed_memory = false
# Enable huge pages for VM RAM, default false
# Enabling this will result in the VM memory
# being allocated using huge pages.
# This is useful when you want to use vhost-user network
# stacks within the container. This will automatically
# result in memory pre allocation
enable_hugepages = false
# Enable vhost-user storage device, default false
# Enabling this will result in some Linux reserved block type
# major range 240-254 being chosen to represent vhost-user devices.
enable_vhost_user_store = @DEFENABLEVHOSTUSERSTORE@
# The base directory specifically used for vhost-user devices.
# Its sub-path "block" is used for block devices; "block/sockets" is
# where we expect vhost-user sockets to live; "block/devices" is where
# simulated block device nodes for vhost-user devices to live.
vhost_user_store_path = "@DEFVHOSTUSERSTOREPATH@"
# Enable vIOMMU, default false
# Enabling this will result in the VM having a vIOMMU device
# This will also add the following options to the kernel's
# command line: intel_iommu=on,iommu=pt
enable_iommu = false
# Enable IOMMU_PLATFORM, default false
# Enabling this will result in the VM device having iommu_platform=on set
enable_iommu_platform = false
# List of valid annotations values for the vhost user store path
# The default if not set is empty (all annotations rejected.)
# Your distribution recommends: @DEFVALIDVHOSTUSERSTOREPATHS@
valid_vhost_user_store_paths = @DEFVALIDVHOSTUSERSTOREPATHS@
# The timeout for reconnecting on non-server spdk sockets when the remote end goes away.
# qemu will delay this many seconds and then attempt to reconnect.
# Zero disables reconnecting, and the default is zero.
vhost_user_reconnect_timeout_sec = 0
# Enable file based guest memory support. The default is an empty string which
# will disable this feature. In the case of virtio-fs, this is enabled
# automatically and '/dev/shm' is used as the backing folder.
# This option will be ignored if VM templating is enabled.
file_mem_backend = "@DEFFILEMEMBACKEND@"
# List of valid annotations values for the file_mem_backend annotation
# The default if not set is empty (all annotations rejected.)
# Your distribution recommends: @DEFVALIDFILEMEMBACKENDS@
valid_file_mem_backends = @DEFVALIDFILEMEMBACKENDS@
# -pflash can add image file to VM. The arguments of it should be in format
# of ["/path/to/flash0.img", "/path/to/flash1.img"]
pflashes = []
# This option changes the default hypervisor and kernel parameters
# to enable debug output where available. And Debug also enable the hmp socket.
#
# Default false
enable_debug = false
# This option allows to add an extra HMP or QMP socket when `enable_debug = true`
#
# WARNING: Anyone with access to the extra socket can take full control of
# Qemu. This is for debugging purpose only and must *NEVER* be used in
# production.
#
# Valid values are :
# - "hmp"
# - "qmp"
# - "qmp-pretty" (same as "qmp" with pretty json formatting)
#
# If set to the empty string "", no extra monitor socket is added. This is
# the default.
extra_monitor_socket = ""
# Disable the customizations done in the runtime when it detects
# that it is running on top a VMM. This will result in the runtime
# behaving as it would when running on bare metal.
#
disable_nesting_checks = false
# If false and nvdimm is supported, use nvdimm device to plug guest image.
# Otherwise virtio-block device is used.
#
# nvdimm is not supported when `confidential_guest = true`.
disable_image_nvdimm = @DEFDISABLEIMAGENVDIMM@
# Before hot plugging a PCIe device, you need to add a pcie_root_port device.
# Use this parameter when using some large PCI bar devices, such as Nvidia GPU
# The value means the number of pcie_root_port
# Default 0
pcie_root_port = 0
# If vhost-net backend for virtio-net is not desired, set to true. Default is false, which trades off
# security (vhost-net runs ring0) for network I/O performance.
disable_vhost_net = false
#
# Default entropy source.
# The path to a host source of entropy (including a real hardware RNG)
# /dev/urandom and /dev/random are two main options.
# Be aware that /dev/random is a blocking source of entropy. If the host
# runs out of entropy, the VMs boot time will increase leading to get startup
# timeouts.
# The source of entropy /dev/urandom is non-blocking and provides a
# generally acceptable source of entropy. It should work well for pretty much
# all practical purposes.
entropy_source = "@DEFENTROPYSOURCE@"
# List of valid annotations values for entropy_source
# The default if not set is empty (all annotations rejected.)
# Your distribution recommends: @DEFVALIDENTROPYSOURCES@
valid_entropy_sources = @DEFVALIDENTROPYSOURCES@
# Path to OCI hook binaries in the *guest rootfs*.
# This does not affect host-side hooks which must instead be added to
# the OCI spec passed to the runtime.
#
# You can create a rootfs with hooks by customizing the osbuilder scripts:
# https://github.com/kata-containers/kata-containers/tree/main/tools/osbuilder
#
# Hooks must be stored in a subdirectory of guest_hook_path according to their
# hook type, i.e. "guest_hook_path/{prestart,poststart,poststop}".
# The agent will scan these directories for executable files and add them, in
# lexicographical order, to the lifecycle of the guest container.
# Hooks are executed in the runtime namespace of the guest. See the official documentation:
# https://github.com/opencontainers/runtime-spec/blob/v1.0.1/config.md#posix-platform-hooks
# Warnings will be logged if any error is encountered while scanning for hooks,
# but it will not abort container execution.
# Recommended value when enabling: "/usr/share/oci/hooks"
guest_hook_path = ""
#
# Use rx Rate Limiter to control network I/O inbound bandwidth(size in bits/sec for SB/VM).
# In Qemu, we use classful qdiscs HTB(Hierarchy Token Bucket) to discipline traffic.
# Default 0-sized value means unlimited rate.
rx_rate_limiter_max_rate = 0
# Use tx Rate Limiter to control network I/O outbound bandwidth(size in bits/sec for SB/VM).
# In Qemu, we use classful qdiscs HTB(Hierarchy Token Bucket) and ifb(Intermediate Functional Block)
# to discipline traffic.
# Default 0-sized value means unlimited rate.
tx_rate_limiter_max_rate = 0
# Set where to save the guest memory dump file.
# If set, when GUEST_PANICKED event occurred,
# guest memeory will be dumped to host filesystem under guest_memory_dump_path,
# This directory will be created automatically if it does not exist.
#
# The dumped file(also called vmcore) can be processed with crash or gdb.
#
# WARNING:
# Dump guest's memory can take very long depending on the amount of guest memory
# and use much disk space.
# Recommended value when enabling: "/var/crash/kata"
guest_memory_dump_path = ""
# If enable paging.
# Basically, if you want to use "gdb" rather than "crash",
# or need the guest-virtual addresses in the ELF vmcore,
# then you should enable paging.
#
# See: https://www.qemu.org/docs/master/qemu-qmp-ref.html#Dump-guest-memory for details
guest_memory_dump_paging = false
# Enable swap in the guest. Default false.
# When enable_guest_swap is enabled, insert a raw file to the guest as the swap device
# if the swappiness of a container (set by annotation "io.katacontainers.container.resource.swappiness")
# is bigger than 0.
# The size of the swap device should be
# swap_in_bytes (set by annotation "io.katacontainers.container.resource.swap_in_bytes") - memory_limit_in_bytes.
# If swap_in_bytes is not set, the size should be memory_limit_in_bytes.
# If swap_in_bytes and memory_limit_in_bytes is not set, the size should
# be default_memory.
enable_guest_swap = false
# use legacy serial for guest console if available and implemented for architecture. Default false
use_legacy_serial = false
# disable applying SELinux on the VMM process (default false)
disable_selinux = @DEFDISABLESELINUX@
# disable applying SELinux on the container process
# If set to false, the type `container_t` is applied to the container process by default.
# Note: To enable guest SELinux, the guest rootfs must be CentOS that is created and built
# with `SELINUX=yes`.
# (default: true)
disable_guest_selinux = @DEFDISABLEGUESTSELINUX@
[factory]
# VM templating support. Once enabled, new VMs are created from template
# using vm cloning. They will share the same initial kernel, initramfs and
# agent memory by mapping it readonly. It helps speeding up new container
# creation and saves a lot of memory if there are many kata containers running
# on the same host.
#
# When disabled, new VMs are created from scratch.
#
# Note: Requires "initrd=" to be set ("image=" is not supported).
#
# Default false
enable_template = false
# Specifies the path of template.
#
# Default "/run/vc/vm/template"
template_path = "/run/vc/vm/template"
# The number of caches of VMCache:
# unspecified or == 0 --> VMCache is disabled
# > 0 --> will be set to the specified number
#
# VMCache is a function that creates VMs as caches before using it.
# It helps speed up new container creation.
# The function consists of a server and some clients communicating
# through Unix socket. The protocol is gRPC in protocols/cache/cache.proto.
# The VMCache server will create some VMs and cache them by factory cache.
# It will convert the VM to gRPC format and transport it when gets
# requestion from clients.
# Factory grpccache is the VMCache client. It will request gRPC format
# VM and convert it back to a VM. If VMCache function is enabled,
# kata-runtime will request VM from factory grpccache when it creates
# a new sandbox.
#
# Default 0
vm_cache_number = 0
# Specify the address of the Unix socket that is used by VMCache.
#
# Default /var/run/kata-containers/cache.sock
vm_cache_endpoint = "/var/run/kata-containers/cache.sock"
[agent.@PROJECT_TYPE@]
# If enabled, make the agent display debug-level messages.
# (default: disabled)
enable_debug = false
# Enable agent tracing.
#
# If enabled, the agent will generate OpenTelemetry trace spans.
#
# Notes:
#
# - If the runtime also has tracing enabled, the agent spans will be
# associated with the appropriate runtime parent span.
# - If enabled, the runtime will wait for the container to shutdown,
# increasing the container shutdown time slightly.
#
# (default: disabled)
enable_tracing = false
# Comma separated list of kernel modules and their parameters.
# These modules will be loaded in the guest kernel using modprobe(8).
# The following example can be used to load two kernel modules with parameters
# - kernel_modules=["e1000e InterruptThrottleRate=3000,3000,3000 EEE=1", "i915 enable_ppgtt=0"]
# The first word is considered as the module name and the rest as its parameters.
# Container will not be started when:
# * A kernel module is specified and the modprobe command is not installed in the guest
# or it fails loading the module.
# * The module is not available in the guest or it doesn't met the guest kernel
# requirements, like architecture and version.
#
kernel_modules = []
# Enable debug console.
# If enabled, user can connect guest OS running inside hypervisor
# through "kata-runtime exec <sandbox-id>" command
debug_console_enabled = false
# Agent dial timeout in millisecond.
# (default: 10)
dial_timeout_ms = 10
# Agent reconnect timeout in millisecond.
# Retry times = reconnect_timeout_ms / dial_timeout_ms (default: 300)
# If you find pod cannot connect to the agent when starting, please
# consider increasing this value to increase the retry times.
# You'd better not change the value of dial_timeout_ms, unless you have an
# idea of what you are doing.
# (default: 3000)
reconnect_timeout_ms = 3000
# Create Container Request Timeout
# This timeout value is used to set the maximum duration for the agent to process a CreateContainerRequest.
# It's also used to ensure that workloads, especially those involving large image pulls within the guest,
# have sufficient time to complete.
#
# Effective Timeout Determination:
# The effective timeout for a CreateContainerRequest is determined by taking the minimum of the following two values:
# - create_container_timeout: The timeout value configured for creating containers (default: 30,000 milliseconds).
# - runtime-request-timeout: The timeout value specified in the Kubelet configuration described as the link below:
# (https://kubernetes.io/docs/reference/command-line-tools-reference/kubelet/#:~:text=runtime%2Drequest%2Dtimeout)
# Defaults to @DEFCREATECONTAINERTIMEOUT_COCO@ second(s)
create_container_timeout = @DEFCREATECONTAINERTIMEOUT_COCO@
[runtime]
# If enabled, the runtime will log additional debug messages to the
# system log
# (default: disabled)
enable_debug = false
#
# Internetworking model
# Determines how the VM should be connected to the
# the container network interface
# Options:
#
# - macvtap
# Used when the Container network interface can be bridged using
# macvtap.
#
# - none
# Used when customize network. Only creates a tap device. No veth pair.
#
# - tcfilter
# Uses tc filter rules to redirect traffic from the network interface
# provided by plugin to a tap interface connected to the VM.
#
internetworking_model = "@DEFNETWORKMODEL_QEMU@"
name="@RUNTIMENAME@"
hypervisor_name="@HYPERVISOR_QEMU@"
agent_name="@PROJECT_TYPE@"
# disable guest seccomp
# Determines whether container seccomp profiles are passed to the virtual
# machine and applied by the kata agent. If set to true, seccomp is not applied
# within the guest
# (default: true)
disable_guest_seccomp = @DEFDISABLEGUESTSECCOMP@
# vCPUs pinning settings
# if enabled, each vCPU thread will be scheduled to a fixed CPU
# qualified condition: num(vCPU threads) == num(CPUs in sandbox's CPUSet)
enable_vcpus_pinning = false
# Apply a custom SELinux security policy to the container process inside the VM.
# This is used when you want to apply a type other than the default `container_t`,
# so general users should not uncomment and apply it.
# (format: "user:role:type")
# Note: You cannot specify MCS policy with the label because the sensitivity levels and
# categories are determined automatically by high-level container runtimes such as containerd.
# Example value when enabling: "system_u:system_r:container_t"
guest_selinux_label = "@DEFGUESTSELINUXLABEL@"
# If enabled, the runtime will create opentracing.io traces and spans.
# (See https://www.jaegertracing.io/docs/getting-started).
# (default: disabled)
enable_tracing = false
# Set the full url to the Jaeger HTTP Thrift collector.
# The default if not set will be "http://localhost:14268/api/traces"
jaeger_endpoint = ""
# Sets the username to be used if basic auth is required for Jaeger.
jaeger_user = ""
# Sets the password to be used if basic auth is required for Jaeger.
jaeger_password = ""
# If enabled, the runtime will not create a network namespace for shim and hypervisor processes.
# This option may have some potential impacts to your host. It should only be used when you know what you're doing.
# `disable_new_netns` conflicts with `internetworking_model=tcfilter` and `internetworking_model=macvtap`. It works only
# with `internetworking_model=none`. The tap device will be in the host network namespace and can connect to a bridge
# (like OVS) directly.
# (default: false)
disable_new_netns = false
# if enabled, the runtime will add all the kata processes inside one dedicated cgroup.
# The container cgroups in the host are not created, just one single cgroup per sandbox.
# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox.
# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation.
# The sandbox cgroup is constrained if there is no container type annotation.
# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType
sandbox_cgroup_only = @DEFSANDBOXCGROUPONLY_QEMU@
# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In
# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful
# when a hardware architecture or hypervisor solutions is utilized which does not support CPU and/or memory hotplug.
# Compatibility for determining appropriate sandbox (VM) size:
# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O
# does not yet support sandbox sizing annotations.
# - When running single containers using a tool like ctr, container sizing information will be available.
static_sandbox_resource_mgmt = @DEFSTATICRESOURCEMGMT_COCO@
# If specified, sandbox_bind_mounts identifieds host paths to be mounted (ro) into the sandboxes shared path.
# This is only valid if filesystem sharing is utilized. The provided path(s) will be bindmounted into the shared fs directory.
# If defaults are utilized, these mounts should be available in the guest at `/run/kata-containers/shared/containers/sandbox-mounts`
# These will not be exposed to the container workloads, and are only provided for potential guest services.
sandbox_bind_mounts = @DEFBINDMOUNTS@
# VFIO Mode
# Determines how VFIO devices should be be presented to the container.
# Options:
#
# - vfio
# Matches behaviour of OCI runtimes (e.g. runc) as much as
# possible. VFIO devices will appear in the container as VFIO
# character devices under /dev/vfio. The exact names may differ
# from the host (they need to match the VM's IOMMU group numbers
# rather than the host's)
#
# - guest-kernel
# This is a Kata-specific behaviour that's useful in certain cases.
# The VFIO device is managed by whatever driver in the VM kernel
# claims it. This means it will appear as one or more device nodes
# or network interfaces depending on the nature of the device.
# Using this mode requires specially built workloads that know how
# to locate the relevant device interfaces within the VM.
#
vfio_mode = "@DEFVFIOMODE@"
# If enabled, the runtime will not create Kubernetes emptyDir mounts on the guest filesystem. Instead, emptyDir mounts will
# be created on the host and shared via virtio-fs. This is potentially slower, but allows sharing of files from host to guest.
disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
# Enabled experimental feature list, format: ["a", "b"].
# Experimental features are features not stable enough for production,
# they may break compatibility, and are prepared for a big version bump.
# Supported experimental features:
# for example:
# experimental=["force_guest_pull"]
# which is for enable force_guest_pull mode in CoCo scenarios.
# (default: [])
experimental = @DEFAULTEXPFEATURES@
# If enabled, user can run pprof tools with shim v2 process through kata-monitor.
# (default: false)
enable_pprof = false
# Base directory of directly attachable network config.
# Network devices for VM-based containers are allowed to be placed in the
# host netns to eliminate as many hops as possible, which is what we
# called a "Directly Attachable Network". The config, set by special CNI
# plugins, is used to tell the Kata containers what devices are attached
# to the hypervisor.
# (default: /run/kata-containers/dans)
dan_conf = "@DEFDANCONF@"
# pod_resource_api_sock specifies the unix socket for the Kubelet's
# PodResource API endpoint. If empty, kubernetes based cold plug
# will not be attempted. In order for this feature to work, the
# KubeletPodResourcesGet featureGate must be enabled in Kubelet,
# if using Kubelet older than 1.34.
#
# The pod resource API's socket is relative to the Kubelet's root-dir,
# which is defined by the cluster admin, and its location is:
# ${KubeletRootDir}/pod-resources/kubelet.sock
#
# cold_plug_vfio(see hypervisor config) acts as a feature gate:
# cold_plug_vfio = no_port (default) => no cold plug
# cold_plug_vfio != no_port AND pod_resource_api_sock = "" => need
# explicit CDI annotation for cold plug (applies mainly
# to non-k8s cases)
# cold_plug_vfio != no_port AND pod_resource_api_sock != "" => kubelet
# based cold plug.
pod_resource_api_sock = "@DEFPODRESOURCEAPISOCK@"

View File

@@ -74,21 +74,43 @@ impl KernelParams {
pub(crate) fn new_rootfs_kernel_params(rootfs_driver: &str, rootfs_type: &str) -> Result<Self> {
let mut params = vec![];
// DAX is disabled on aarch64 due to kernel panic in dax_disassociate_entry
// with virtio-pmem on kernel 6.18.x
#[cfg(target_arch = "aarch64")]
let use_dax = false;
#[cfg(not(target_arch = "aarch64"))]
let use_dax = true;
match rootfs_driver {
VM_ROOTFS_DRIVER_PMEM => {
params.push(Param::new("root", VM_ROOTFS_ROOT_PMEM));
match rootfs_type {
VM_ROOTFS_FILESYSTEM_EXT4 => {
params.push(Param::new(
"rootflags",
"dax,data=ordered,errors=remount-ro ro",
));
if use_dax {
params.push(Param::new(
"rootflags",
"dax,data=ordered,errors=remount-ro ro",
));
} else {
params.push(Param::new(
"rootflags",
"data=ordered,errors=remount-ro ro",
));
}
}
VM_ROOTFS_FILESYSTEM_XFS => {
params.push(Param::new("rootflags", "dax ro"));
if use_dax {
params.push(Param::new("rootflags", "dax ro"));
} else {
params.push(Param::new("rootflags", "ro"));
}
}
VM_ROOTFS_FILESYSTEM_EROFS => {
params.push(Param::new("rootflags", "dax ro"));
if use_dax {
params.push(Param::new("rootflags", "dax ro"));
} else {
params.push(Param::new("rootflags", "ro"));
}
}
_ => {
return Err(anyhow!("Unsupported rootfs type {}", rootfs_type));
@@ -233,6 +255,22 @@ mod tests {
#[test]
fn test_rootfs_kernel_params() {
// DAX is disabled on aarch64
#[cfg(target_arch = "aarch64")]
let ext4_pmem_rootflags = "data=ordered,errors=remount-ro ro";
#[cfg(not(target_arch = "aarch64"))]
let ext4_pmem_rootflags = "dax,data=ordered,errors=remount-ro ro";
#[cfg(target_arch = "aarch64")]
let xfs_pmem_rootflags = "ro";
#[cfg(not(target_arch = "aarch64"))]
let xfs_pmem_rootflags = "dax ro";
#[cfg(target_arch = "aarch64")]
let erofs_pmem_rootflags = "ro";
#[cfg(not(target_arch = "aarch64"))]
let erofs_pmem_rootflags = "dax ro";
let tests = &[
// EXT4
TestData {
@@ -241,7 +279,7 @@ mod tests {
expect_params: KernelParams {
params: [
Param::new("root", VM_ROOTFS_ROOT_PMEM),
Param::new("rootflags", "dax,data=ordered,errors=remount-ro ro"),
Param::new("rootflags", ext4_pmem_rootflags),
Param::new("rootfstype", VM_ROOTFS_FILESYSTEM_EXT4),
]
.to_vec(),
@@ -268,7 +306,7 @@ mod tests {
expect_params: KernelParams {
params: [
Param::new("root", VM_ROOTFS_ROOT_PMEM),
Param::new("rootflags", "dax ro"),
Param::new("rootflags", xfs_pmem_rootflags),
Param::new("rootfstype", VM_ROOTFS_FILESYSTEM_XFS),
]
.to_vec(),
@@ -295,7 +333,7 @@ mod tests {
expect_params: KernelParams {
params: [
Param::new("root", VM_ROOTFS_ROOT_PMEM),
Param::new("rootflags", "dax ro"),
Param::new("rootflags", erofs_pmem_rootflags),
Param::new("rootfstype", VM_ROOTFS_FILESYSTEM_EROFS),
]
.to_vec(),

View File

@@ -49,7 +49,7 @@ require (
github.com/safchain/ethtool v0.6.2
github.com/sirupsen/logrus v1.9.3
github.com/stretchr/testify v1.11.1
github.com/urfave/cli v1.22.17
github.com/urfave/cli v1.22.15
github.com/vishvananda/netlink v1.3.1
github.com/vishvananda/netns v0.0.5
gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20220601114329-47893b162965
@@ -85,7 +85,7 @@ require (
github.com/containerd/log v0.1.0 // indirect
github.com/containerd/platforms v0.2.1 // indirect
github.com/containernetworking/cni v1.3.0 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.6 // indirect
github.com/cyphar/filepath-securejoin v0.6.0 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/distribution/reference v0.6.0 // indirect

View File

@@ -8,6 +8,7 @@ github.com/AdaLogics/go-fuzz-headers v0.0.0-20230811130428-ced1acdcaa24/go.mod h
github.com/AdamKorcz/go-118-fuzz-build v0.0.0-20230306123547-8075edf89bb0 h1:59MxjQVfjXsBpLy+dbd2/ELV5ofnUkUZBvWSC85sheA=
github.com/AdamKorcz/go-118-fuzz-build v0.0.0-20230306123547-8075edf89bb0/go.mod h1:OahwfttHWG6eJ0clwcfBAHoDI6X/LV/15hx/wlMZSrU=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/BurntSushi/toml v1.3.2/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
github.com/BurntSushi/toml v1.5.0 h1:W5quZX/G/csjUnuI8SUYlsHs9M38FC7znL0lIO+DvMg=
github.com/BurntSushi/toml v1.5.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0=
@@ -69,8 +70,9 @@ github.com/containernetworking/plugins v1.9.0 h1:Mg3SXBdRGkdXyFC4lcwr6u2ZB2SDeL6
github.com/containernetworking/plugins v1.9.0/go.mod h1:JG3BxoJifxxHBhG3hFyxyhid7JgRVBu/wtooGEvWf1c=
github.com/coreos/go-systemd/v22 v22.6.0 h1:aGVa/v8B7hpb0TKl0MWoAavPDmHvobFe5R5zn0bCJWo=
github.com/coreos/go-systemd/v22 v22.6.0/go.mod h1:iG+pp635Fo7ZmV/j14KUcmEyWF+0X7Lua8rrTWzYgWU=
github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo=
github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/cpuguy83/go-md2man/v2 v2.0.6 h1:XJtiaUW6dEEqVuZiMTn1ldk455QWwEIsMIJlo5vtkx0=
github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
github.com/cri-o/cri-o v1.34.0 h1:ux2URwAyENy5e5hD9Z95tshdfy98eqatZk0fxx3rhuk=
github.com/cri-o/cri-o v1.34.0/go.mod h1:kP40HG+1EW5CDNHjqQBFhb6dehT5dCBKcmtO5RZAm6k=
github.com/cyphar/filepath-securejoin v0.6.0 h1:BtGB77njd6SVO6VztOHfPxKitJvd/VPT+OFBFMOi1Is=
@@ -287,13 +289,13 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 h1:kdXcSzyDtseVEc4yCz2qF8ZrQvIDBJLl4S1c3GCXmoI=
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww=
github.com/urfave/cli v1.22.17 h1:SYzXoiPfQjHBbkYxbew5prZHS1TOLT3ierW8SYLqtVQ=
github.com/urfave/cli v1.22.17/go.mod h1:b0ht0aqgH/6pBYzzxURyrM4xXNgsoT/n2ZzwQiEhNVo=
github.com/urfave/cli v1.22.15 h1:nuqt+pdC/KqswQKhETJjo7pvn/k4xMUxgW6liI7XpnM=
github.com/urfave/cli v1.22.15/go.mod h1:wSan1hmo5zeyLGBjRJbzRTNk8gwoYa2B9n4q9dmRIc0=
github.com/vishvananda/netlink v1.3.1 h1:3AEMt62VKqz90r0tmNhog0r/PpWKmrEShJU0wJW6bV0=
github.com/vishvananda/netlink v1.3.1/go.mod h1:ARtKouGSTGchR8aMwmkzC0qiNPrrWO5JS/XMVl45+b4=
github.com/vishvananda/netns v0.0.5 h1:DfiHV+j8bA32MFM7bfEunvT8IAqQ/NzSJHtcmW5zdEY=

View File

@@ -1,4 +1,3 @@
// Package md2man aims in converting markdown into roff (man pages).
package md2man
import (

View File

@@ -47,13 +47,13 @@ const (
tableStart = "\n.TS\nallbox;\n"
tableEnd = ".TE\n"
tableCellStart = "T{\n"
tableCellEnd = "\nT}"
tableCellEnd = "\nT}\n"
tablePreprocessor = `'\" t`
)
// NewRoffRenderer creates a new blackfriday Renderer for generating roff documents
// from markdown
func NewRoffRenderer() *roffRenderer {
func NewRoffRenderer() *roffRenderer { // nolint: golint
return &roffRenderer{}
}
@@ -316,8 +316,9 @@ func (r *roffRenderer) handleTableCell(w io.Writer, node *blackfriday.Node, ente
} else if nodeLiteralSize(node) > 30 {
end = tableCellEnd
}
if node.Next == nil {
// Last cell: need to carriage return if we are at the end of the header row.
if node.Next == nil && end != tableCellEnd {
// Last cell: need to carriage return if we are at the end of the
// header row and content isn't wrapped in a "tablecell"
end += crTag
}
out(w, end)
@@ -355,7 +356,7 @@ func countColumns(node *blackfriday.Node) int {
}
func out(w io.Writer, output string) {
io.WriteString(w, output) //nolint:errcheck
io.WriteString(w, output) // nolint: errcheck
}
func escapeSpecialChars(w io.Writer, text []byte) {
@@ -394,7 +395,7 @@ func escapeSpecialCharsLine(w io.Writer, text []byte) {
i++
}
if i > org {
w.Write(text[org:i]) //nolint:errcheck
w.Write(text[org:i]) // nolint: errcheck
}
// escape a character
@@ -402,7 +403,7 @@ func escapeSpecialCharsLine(w io.Writer, text []byte) {
break
}
w.Write([]byte{'\\', text[i]}) //nolint:errcheck
w.Write([]byte{'\\', text[i]}) // nolint: errcheck
}
}

View File

@@ -257,7 +257,7 @@ github.com/containernetworking/plugins/pkg/testutils
# github.com/coreos/go-systemd/v22 v22.6.0
## explicit; go 1.23
github.com/coreos/go-systemd/v22/dbus
# github.com/cpuguy83/go-md2man/v2 v2.0.7
# github.com/cpuguy83/go-md2man/v2 v2.0.6
## explicit; go 1.12
github.com/cpuguy83/go-md2man/v2/md2man
# github.com/cri-o/cri-o v1.34.0
@@ -526,7 +526,7 @@ github.com/stretchr/testify/assert/yaml
# github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635
## explicit
github.com/syndtr/gocapability/capability
# github.com/urfave/cli v1.22.17
# github.com/urfave/cli v1.22.15
## explicit; go 1.11
github.com/urfave/cli
# github.com/vishvananda/netlink v1.3.1

View File

@@ -585,7 +585,9 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net
clh.vmconfig.Cpus = chclient.NewCpusConfig(int32(clh.config.NumVCPUs()), int32(clh.config.DefaultMaxVCPUs))
disableNvdimm := (clh.config.DisableImageNvdimm || clh.config.ConfidentialGuest)
enableDax := !disableNvdimm
// DAX is disabled on aarch64 due to kernel panic in dax_disassociate_entry
// with virtio-pmem on kernel 6.18.x
enableDax := !disableNvdimm && runtime.GOARCH != "arm64"
params, err := getNonUserDefinedKernelParams(hypervisorConfig.RootfsType, disableNvdimm, enableDax, clh.config.Debug, clh.config.ConfidentialGuest, clh.config.IOMMU)
if err != nil {

View File

@@ -69,9 +69,11 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) {
kernelParamsDebug: kernelParamsDebug,
kernelParams: kernelParams,
disableNvdimm: config.DisableImageNvdimm,
dax: true,
protection: noneProtection,
legacySerial: config.LegacySerial,
// DAX is disabled on aarch64 due to kernel panic in dax_disassociate_entry
// with virtio-pmem on kernel 6.18.x
dax: false,
protection: noneProtection,
legacySerial: config.LegacySerial,
},
measurementAlgo: config.MeasurementAlgo,
}

View File

@@ -1415,13 +1415,6 @@ func (s *Sandbox) startVM(ctx context.Context, prestartHookFunc func(context.Con
if err != nil {
return err
}
// If we want the network, scan the netns again to update the network
// configuration after the prestart hooks have run.
if !s.config.NetworkConfig.DisableNewNetwork {
if _, err := s.network.AddEndpoints(ctx, s, nil, false); err != nil {
return err
}
}
}
if err := s.network.Run(ctx, func() error {

View File

@@ -3024,9 +3024,9 @@ dependencies = [
[[package]]
name = "qapi"
version = "0.15.0"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b047adab56acc4948d4b9b58693c1f33fd13efef2d6bb5f0f66a47436ceada8"
checksum = "c6412bdd014ebee03ddbbe79ac03a0b622cce4d80ba45254f6357c847f06fa38"
dependencies = [
"bytes",
"futures",
@@ -3061,9 +3061,9 @@ dependencies = [
[[package]]
name = "qapi-qmp"
version = "0.15.0"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "45303cac879d89361cad0287ae15f9ae1e7799b904b474152414aeece39b9875"
checksum = "e8b944db7e544d2fa97595e9a000a6ba5c62c426fa185e7e00aabe4b5640b538"
dependencies = [
"qapi-codegen",
"qapi-spec",

View File

@@ -81,7 +81,6 @@ pub enum Commands {
#[error("Argument is not valid")]
pub struct CheckArgument {
#[clap(subcommand)]
#[allow(unused_assignments)]
pub command: CheckSubCommand,
}

View File

@@ -486,11 +486,11 @@ mod tests {
let releases = get_kata_all_releases_by_url(KATA_GITHUB_RELEASE_URL);
// sometime in GitHub action accessing to github.com API may fail
// we can skip this test to prevent the whole test fail.
if let Err(error) = releases {
if releases.is_err() {
warn!(
sl!(),
"get kata version failed({:?}), this maybe a temporary error, just skip the test.",
error
releases.unwrap_err()
);
return;
}

1
src/tools/runk/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
/vendor/

3943
src/tools/runk/Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

38
src/tools/runk/Cargo.toml Normal file
View File

@@ -0,0 +1,38 @@
[package]
name = "runk"
version = "0.0.1"
authors = ["The Kata Containers community <kata-dev@lists.katacontainers.io>"]
description = "runk: Kata OCI container runtime based on Kata agent"
license = "Apache-2.0"
edition = "2018"
[dependencies]
libcontainer = { path = "./libcontainer" }
rustjail = { path = "../../agent/rustjail", features = [
"standard-oci-runtime",
] }
runtime-spec = { path = "../../libs/runtime-spec" }
oci-spec = { version = "0.8.1", features = ["runtime"] }
logging = { path = "../../libs/logging" }
liboci-cli = "0.5.3"
clap = { version = "4.5.40", features = ["derive", "cargo"] }
libc = "0.2.108"
nix = "0.23.0"
anyhow = "1.0.52"
slog = "2.7.0"
chrono = { version = "0.4.19", features = ["serde"] }
slog-async = "2.7.0"
tokio = { version = "1.44.2", features = ["full"] }
serde = { version = "1.0.133", features = ["derive"] }
serde_json = "1.0.74"
uzers = "0.12.1"
tabwriter = "1.2.1"
[features]
seccomp = ["rustjail/seccomp"]
[dev-dependencies]
tempfile = "3.19.1"
[workspace]
members = ["libcontainer"]

67
src/tools/runk/Makefile Normal file
View File

@@ -0,0 +1,67 @@
# Copyright 2021-2022 Sony Group Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# LIBC=musl|gnu (default: gnu)
LIBC ?= gnu
include ../../../utils.mk
TARGET = runk
TARGET_PATH = target/$(TRIPLE)/$(BUILD_TYPE)/$(TARGET)
AGENT_SOURCE_PATH = ../../agent
EXTRA_RUSTFEATURES :=
# Define if runk enables seccomp support (default: yes)
SECCOMP := yes
# BINDIR is a directory for installing executable programs
BINDIR := /usr/local/bin
ifeq ($(SECCOMP),yes)
override EXTRA_RUSTFEATURES += seccomp
endif
ifneq ($(EXTRA_RUSTFEATURES),)
override EXTRA_RUSTFEATURES := --features "$(EXTRA_RUSTFEATURES)"
endif
.DEFAULT_GOAL := default
default: build
build:
@RUSTFLAGS="$(EXTRA_RUSTFLAGS) --deny warnings" cargo build --target $(TRIPLE) --$(BUILD_TYPE) $(EXTRA_RUSTFEATURES)
static-checks-build:
@echo "INFO: static-checks-build do nothing.."
install:
install -D $(TARGET_PATH) $(BINDIR)/$(TARGET)
clean:
cargo clean
vendor:
cargo vendor
test: test-runk test-agent
test-runk:
cargo test --all --target $(TRIPLE) $(EXTRA_RUSTFEATURES) -- --nocapture
test-agent:
make test -C $(AGENT_SOURCE_PATH) STANDARD_OCI_RUNTIME=yes
check: standard_rust_check
.PHONY: \
build \
install \
clean \
clippy \
format \
vendor \
test \
check \

352
src/tools/runk/README.md Normal file
View File

@@ -0,0 +1,352 @@
# runk
## Overview
> **Warnings:**
> `runk` is currently an experimental tool.
> Only continue if you are using a non-critical system.
`runk` is a standard OCI container runtime written in Rust based on a modified version of
the [Kata Container agent](https://github.com/kata-containers/kata-containers/tree/main/src/agent), `kata-agent`.
`runk` conforms to the [OCI Container Runtime specifications](https://github.com/opencontainers/runtime-spec).
Unlike the [Kata Container runtime](https://github.com/kata-containers/kata-containers/tree/main/src/agent#features),
`kata-runtime`, `runk` spawns and runs containers on the host machine directly.
The user can run `runk` in the same way as the existing container runtimes such as `runc`,
the most used implementation of the OCI runtime specs.
## Why does `runk` exist?
The `kata-agent` is a process running inside a virtual machine (VM) as a supervisor for managing containers
and processes running within those containers.
In other words, the `kata-agent` is a kind of "low-level" container runtime inside VM because the agent
spawns and runs containers according to the OCI runtime specs.
However, the `kata-agent` does not have the OCI Command-Line Interface (CLI) that is defined in the
[runtime spec](https://github.com/opencontainers/runtime-spec/blob/main/runtime.md).
The `kata-runtime` provides the CLI part of the Kata Containers runtime component,
but the `kata-runtime` is a container runtime for creating hardware-virtualized containers running on the host.
`runk` is a Rust-based standard OCI container runtime that manages normal containers,
not hardware-virtualized containers.
`runk` aims to become one of the alternatives to existing OCI compliant container runtimes.
The `kata-agent` has most of the [features](https://github.com/kata-containers/kata-containers/tree/main/src/agent#features)
needed for the container runtime and delivers high performance with a low memory footprint owing to the
implementation by Rust language.
Therefore, `runk` leverages the mechanism of the `kata-agent` to avoid reinventing the wheel.
## Performance
`runk` is faster than `runc` and has a lower memory footprint.
This table shows the average of the elapsed time and the memory footprint (maximum resident set size)
for running sequentially 100 containers, the containers run `/bin/true` using `run` command with
[detached mode](https://github.com/opencontainers/runc/blob/main/docs/terminals.md#detached)
on 12 CPU cores (`3.8 GHz AMD Ryzen 9 3900X`) and 32 GiB of RAM.
`runk` always runs containers with detached mode currently.
Evaluation Results:
| | `runk` (v0.0.1) | `runc` (v1.0.3) | `crun` (v1.4.2) |
|-----------------------|---------------|---------------|---------------|
| time [ms] | 39.83 | 50.39 | 38.41 |
| memory footprint [MB] | 4.013 | 10.78 | 1.738 |
## Status of `runk`
We drafted the initial code here, and any contributions to `runk` and [`kata-agent`](https://github.com/kata-containers/kata-containers/tree/main/src/agent)
are welcome.
Regarding features compared to `runc`, see the `Status of runk` section in the [issue](https://github.com/kata-containers/kata-containers/issues/2784).
## Building
In order to enable seccomp support, you need to install the `libseccomp` library on
your platform.
> e.g. `libseccomp-dev` for Ubuntu, or `libseccomp-devel` for CentOS
You can build `runk`:
```bash
$ cd runk
$ make
```
If you want to build a statically linked binary of `runk`, set the environment
variables for the [`libseccomp` crate](https://github.com/libseccomp-rs/libseccomp-rs) and
set the `LIBC` to `musl`:
```bash
$ export LIBSECCOMP_LINK_TYPE=static
$ export LIBSECCOMP_LIB_PATH="the path of the directory containing libseccomp.a"
$ export LIBC=musl
$ make
```
> **Note**:
>
> - If the compilation fails when `runk` tries to link the `libseccomp` library statically
> against `musl`, you will need to build the `libseccomp` manually with `-U_FORTIFY_SOURCE`.
> For the details, see [our script](https://github.com/kata-containers/kata-containers/blob/main/ci/install_libseccomp.sh)
> to install the `libseccomp` for the agent.
> - On `ppc64le` and `s390x`, `glibc` should be used even if `LIBC=musl` is specified.
> - If you do not want to enable seccomp support, run `make SECCOMP=no`.
To install `runk` into default directory for executable program (`/usr/local/bin`):
```bash
$ sudo -E make install
```
## Using `runk` directly
Please note that `runk` is a low level tool not developed with an end user in mind.
It is mostly employed by other higher-level container software like `containerd`.
If you still want to use `runk` directly, here's how.
### Prerequisites
It is necessary to create an OCI bundle to use the tool. The simplest method is:
``` bash
$ bundle_dir="bundle"
$ rootfs_dir="$bundle_dir/rootfs"
$ image="busybox"
$ mkdir -p "$rootfs_dir" && (cd "$bundle_dir" && runk spec)
$ sudo docker export $(sudo docker create "$image") | tar -C "$rootfs_dir" -xf -
```
> **Note:**
> If you use the unmodified `runk spec` template, this should give a `sh` session inside the container.
> However, if you use `runk` directly and run a container with the unmodified template,
> `runk` cannot launch the `sh` session because `runk` does not support terminal handling yet.
> You need to edit the process field in the `config.json` should look like this below
> with `"terminal": false` and `"args": ["sleep", "10"]`.
```json
"process": {
"terminal": false,
"user": {
"uid": 0,
"gid": 0
},
"args": [
"sleep",
"10"
],
"env": [
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"TERM=xterm"
],
"cwd": "/",
[...]
}
```
If you want to launch the `sh` session inside the container, you need to run `runk` from `containerd`.
Please refer to the [Using `runk` from containerd](#using-runk-from-containerd) section
### Running a container
Now you can go through the [lifecycle operations](https://github.com/opencontainers/runtime-spec/blob/main/runtime.md)
in your shell.
You need to run `runk` as `root` because `runk` does not have the rootless feature which is the ability
to run containers without root privileges.
```bash
$ cd $bundle_dir
# Create a container
$ sudo runk create test
# View the container is created and in the "created" state
$ sudo runk state test
# Start the process inside the container
$ sudo runk start test
# After 10 seconds view that the container has exited and is now in the "stopped" state
$ sudo runk state test
# Now delete the container
$ sudo runk delete test
```
## Using `runk` from `Docker`
`runk` can run containers using [`Docker`](https://github.com/docker).
First, install `Docker` from package by following the
[`Docker` installation instructions](https://docs.docker.com/engine/install/).
### Running a container with `Docker` command line
Start the docker daemon:
```bash
$ sudo dockerd --experimental --add-runtime="runk=/usr/local/bin/runk"
```
> **Note:**
> Before starting the `dockerd`, you need to stop the normal docker daemon
> running on your environment (i.e., `systemctl stop docker`).
Launch a container in a different terminal:
```bash
$ sudo docker run -it --rm --runtime runk busybox sh
/ #
```
## Using `runk` from `Podman`
`runk` can run containers using [`Podman`](https://github.com/containers/podman).
First, install `Podman` from source code or package by following the
[`Podman` installation instructions](https://podman.io/getting-started/installation).
### Running a container with `Podman` command line
```bash
$ sudo podman --runtime /usr/local/bin/runk run -it --rm busybox sh
/ #
```
> **Note:**
> `runk` does not support some commands except
> [OCI standard operations](https://github.com/opencontainers/runtime-spec/blob/main/runtime.md#operations)
> yet, so those commands do not work in `Docker/Podman`. Regarding commands currently
> implemented in `runk`, see the [Status of `runk`](#status-of-runk) section.
## Using `runk` from `containerd`
`runk` can run containers with the containerd runtime handler support on `containerd`.
### Prerequisites for `runk` with containerd
* `containerd` v1.2.4 or above
* `cri-tools`
> **Note:**
> [`cri-tools`](https://github.com/kubernetes-sigs/cri-tools) is a set of tools for CRI
> used for development and testing.
Install `cri-tools` from source code:
```bash
$ go get github.com/kubernetes-sigs/cri-tools
$ pushd $GOPATH/src/github.com/kubernetes-sigs/cri-tools
$ make
$ sudo -E make install
$ popd
```
Write the `crictl` configuration file:
``` bash
$ cat <<EOF | sudo tee /etc/crictl.yaml
runtime-endpoint: unix:///run/containerd/containerd.sock
EOF
```
### Configure `containerd` to use `runk`
Update `/etc/containerd/config.toml`:
```bash
$ cat <<EOF | sudo tee /etc/containerd/config.toml
version = 2
[plugins."io.containerd.runtime.v1.linux"]
shim_debug = true
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runk]
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runk.options]
BinaryName = "/usr/local/bin/runk"
EOF
```
Restart `containerd`:
```bash
$ sudo systemctl restart containerd
```
### Running a container with `crictl` command line
You can run containers in `runk` via containerd's CRI.
Pull the `busybox` image:
``` bash
$ sudo crictl pull busybox
```
Create the sandbox configuration:
``` bash
$ cat <<EOF | tee sandbox.json
{
"metadata": {
"name": "busybox-sandbox",
"namespace": "default",
"attempt": 1,
"uid": "hdishd83djaidwnduwk28bcsb"
},
"log_directory": "/tmp",
"linux": {
}
}
EOF
```
Create the container configuration:
``` bash
$ cat <<EOF | tee container.json
{
"metadata": {
"name": "busybox"
},
"image": {
"image": "docker.io/busybox"
},
"command": [
"sh"
],
"envs": [
{
"key": "PATH",
"value": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
},
{
"key": "TERM",
"value": "xterm"
}
],
"log_path": "busybox.0.log",
"stdin": true,
"stdin_once": true,
"tty": true
}
EOF
```
With the `crictl` command line of `cri-tools`, you can specify runtime class with `-r` or `--runtime` flag.
Launch a sandbox and container using the `crictl`:
```bash
# Run a container inside a sandbox
$ sudo crictl run -r runk container.json sandbox.json
f492eee753887ba3dfbba9022028975380739aba1269df431d097b73b23c3871
# Attach to the running container
$ sudo crictl attach --stdin --tty f492eee753887ba3dfbba9022028975380739aba1269df431d097b73b23c3871
/ #
```

View File

@@ -0,0 +1,32 @@
[package]
name = "libcontainer"
version = "0.0.1"
authors = ["The Kata Containers community <kata-dev@lists.katacontainers.io>"]
description = "Library for runk container"
license = "Apache-2.0"
edition = "2018"
[dependencies]
rustjail = { path = "../../../agent/rustjail", features = [
"standard-oci-runtime",
] }
runtime-spec = { path = "../../../libs/runtime-spec" }
oci-spec = { version = "0.8.1", features = ["runtime"] }
kata-sys-util = { path = "../../../libs/kata-sys-util" }
logging = { path = "../../../libs/logging" }
derive_builder = "0.10.2"
libc = "0.2.108"
nix = "0.23.0"
anyhow = "1.0.52"
slog = "2.7.0"
chrono = { version = "0.4.19", features = ["serde"] }
serde = { version = "1.0.133", features = ["derive"] }
serde_json = "1.0.74"
scopeguard = "1.1.0"
cgroups = { package = "cgroups-rs", git = "https://github.com/kata-containers/cgroups-rs", rev = "v0.3.5" }
procfs = "0.14.0"
[dev-dependencies]
tempfile = "3.19.1"
test-utils = { path = "../../../libs/test-utils" }
protocols = { path = "../../../libs/protocols" }

View File

@@ -0,0 +1,336 @@
// Copyright 2021-2022 Sony Group Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
use crate::container::{load_linux_container, Container, ContainerLauncher};
use crate::status::Status;
use crate::utils::validate_spec;
use anyhow::{anyhow, Result};
use derive_builder::Builder;
use oci::{Process as OCIProcess, Spec};
use oci_spec::runtime as oci;
use runtime_spec::ContainerState;
use rustjail::container::update_namespaces;
use slog::{debug, Logger};
use std::fs::File;
use std::path::{Path, PathBuf};
/// Used for exec command. It will prepare the options for joining an existing container.
#[derive(Default, Builder, Debug, Clone)]
#[builder(build_fn(validate = "Self::validate"))]
pub struct ActivatedContainer {
pub id: String,
pub root: PathBuf,
pub console_socket: Option<PathBuf>,
pub pid_file: Option<PathBuf>,
pub tty: bool,
pub cwd: Option<PathBuf>,
pub env: Vec<(String, String)>,
pub no_new_privs: bool,
pub args: Vec<String>,
pub process: Option<PathBuf>,
}
impl ActivatedContainerBuilder {
/// pre-validate before building ActivatedContainer
fn validate(&self) -> Result<(), String> {
// ensure container exists
let id = self.id.as_ref().unwrap();
let root = self.root.as_ref().unwrap();
let status_path = Status::get_dir_path(root, id);
if !status_path.exists() {
return Err(format!(
"container {} does not exist at path {:?}",
id, root
));
}
// ensure argv will not be empty in process exec phase later
let process = self.process.as_ref().unwrap();
let args = self.args.as_ref().unwrap();
if process.is_none() && args.is_empty() {
return Err("process and args cannot be all empty".to_string());
}
Ok(())
}
}
impl ActivatedContainer {
/// Create ContainerLauncher that can be used to spawn a process in an existing container.
/// This reads the spec from status file of an existing container and adapts it with given process file
/// or other options like args, env, etc. It also changes the namespace in spec to join the container.
pub fn create_launcher(self, logger: &Logger) -> Result<ContainerLauncher> {
debug!(
logger,
"enter ActivatedContainer::create_launcher {:?}", self
);
let mut container = Container::load(&self.root, &self.id)?;
// If state is Created or Running, we can execute the process.
if container.state != ContainerState::Created && container.state != ContainerState::Running
{
return Err(anyhow!(
"cannot exec in a stopped or paused container, state: {:?}",
container.state
));
}
let spec = container
.status
.config
.spec
.as_mut()
.ok_or_else(|| anyhow!("spec config was not present"))?;
self.adapt_exec_spec(spec, container.status.pid, logger)?;
debug!(logger, "adapted spec: {:?}", spec);
validate_spec(spec, &self.console_socket)?;
debug!(
logger,
"load LinuxContainer with config: {:?}", &container.status.config
);
let runner = load_linux_container(&container.status, self.console_socket, logger)?;
Ok(ContainerLauncher::new(
&self.id,
&container.status.bundle,
&self.root,
false,
runner,
self.pid_file,
))
}
/// Adapt spec to execute a new process which will join the container.
fn adapt_exec_spec(&self, spec: &mut Spec, pid: i32, logger: &Logger) -> Result<()> {
// If with --process, load process from file.
// Otherwise, update process with args and other options.
if let Some(process_path) = self.process.as_ref() {
spec.set_process(Some(Self::get_process(process_path)?));
} else if let Some(process) = spec.process_mut().as_mut() {
self.update_process(process)?;
} else {
return Err(anyhow!("process is empty in spec"));
};
// Exec process will join the container's namespaces
update_namespaces(logger, spec, pid)?;
Ok(())
}
/// Update process with args and other options.
fn update_process(&self, process: &mut OCIProcess) -> Result<()> {
process.set_args(Some(self.args.clone()));
process.set_no_new_privileges(Some(self.no_new_privs));
process.set_terminal(Some(self.tty));
if let Some(cwd) = self.cwd.as_ref() {
process.set_cwd(cwd.as_path().to_path_buf());
}
if let Some(process_env) = process.env_mut() {
process_env.extend(self.env.iter().map(|kv| format!("{}={}", kv.0, kv.1)));
}
Ok(())
}
/// Read and parse OCI Process from path
fn get_process(process_path: &Path) -> Result<OCIProcess> {
let f = File::open(process_path)?;
Ok(serde_json::from_reader(f)?)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::status::Status;
use crate::utils::test_utils::*;
use nix::unistd::getpid;
use oci_spec::runtime::{LinuxBuilder, LinuxNamespaceBuilder, ProcessBuilder, User};
use rustjail::container::TYPETONAME;
use scopeguard::defer;
use slog::o;
use std::{
fs::{create_dir_all, File},
path::PathBuf,
};
use tempfile::tempdir;
use test_utils::skip_if_not_root;
fn create_activated_dirs(root: &Path, id: &str, bundle: &Path) {
Status::create_dir(root, id).unwrap();
create_dir_all(bundle.join(TEST_ROOTFS_PATH)).unwrap();
}
#[test]
fn test_activated_container_validate() {
let root = tempdir().unwrap();
let id = TEST_CONTAINER_ID.to_string();
Status::create_dir(root.path(), &id).unwrap();
let result = ActivatedContainerBuilder::default()
.id(id)
.root(root.into_path())
.console_socket(None)
.pid_file(None)
.tty(false)
.cwd(None)
.env(Vec::new())
.no_new_privs(false)
.process(None)
.args(vec!["sleep".to_string(), "10".to_string()])
.build();
assert!(result.is_ok());
}
#[test]
fn test_activated_container_create() {
// create cgroup directory needs root permission
skip_if_not_root!();
let logger = slog::Logger::root(slog::Discard, o!());
let bundle_dir = tempdir().unwrap();
let root = tempdir().unwrap();
// Since tests are executed concurrently, container_id must be unique in tests with cgroup.
// Or the cgroup directory may be removed by other tests in advance.
let id = "test_activated_container_create".to_string();
create_activated_dirs(root.path(), &id, bundle_dir.path());
let pid = getpid().as_raw();
let mut spec = create_dummy_spec();
spec.root_mut()
.as_mut()
.unwrap()
.set_path(bundle_dir.path().join(TEST_ROOTFS_PATH));
let status = create_custom_dummy_status(&id, pid, root.path(), &spec);
status.save().unwrap();
// create empty cgroup directory to avoid is_pause failing
let cgroup = create_dummy_cgroup(Path::new(id.as_str()));
defer!(cgroup.delete().unwrap());
let result = ActivatedContainerBuilder::default()
.id(id)
.root(root.into_path())
.console_socket(Some(PathBuf::from(TEST_CONSOLE_SOCKET_PATH)))
.pid_file(Some(PathBuf::from(TEST_PID_FILE_PATH)))
.tty(true)
.cwd(Some(PathBuf::from(TEST_BUNDLE_PATH)))
.env(vec![
("K1".to_string(), "V1".to_string()),
("K2".to_string(), "V2".to_string()),
])
.no_new_privs(true)
.process(None)
.args(vec!["sleep".to_string(), "10".to_string()])
.build()
.unwrap();
let linux = LinuxBuilder::default()
.namespaces(
TYPETONAME
.iter()
.filter(|&(_, &name)| name != "user")
.map(|ns| {
LinuxNamespaceBuilder::default()
.typ(ns.0.clone())
.path(PathBuf::from(&format!("/proc/{}/ns/{}", pid, ns.1)))
.build()
.unwrap()
})
.collect::<Vec<_>>(),
)
.build()
.unwrap();
spec.set_linux(Some(linux));
let process = ProcessBuilder::default()
.terminal(result.tty)
.user(User::default())
.args(result.args.clone())
.cwd(result.cwd.clone().unwrap().to_string_lossy().to_string())
.env(vec![
"PATH=/bin:/usr/bin".to_string(),
"K1=V1".to_string(),
"K2=V2".to_string(),
])
.no_new_privileges(result.no_new_privs)
.build()
.unwrap();
spec.set_process(Some(process));
let launcher = result.clone().create_launcher(&logger).unwrap();
assert!(!launcher.init);
assert_eq!(launcher.runner.config.spec.unwrap(), spec);
assert_eq!(
launcher.runner.console_socket,
result.console_socket.unwrap()
);
assert_eq!(launcher.pid_file, result.pid_file);
}
#[test]
fn test_activated_container_create_with_process() {
// create cgroup directory needs root permission
skip_if_not_root!();
let bundle_dir = tempdir().unwrap();
let process_file = bundle_dir.path().join(TEST_PROCESS_FILE_NAME);
let mut process_template = OCIProcess::default();
process_template.set_args(Some(vec!["sleep".to_string(), "10".to_string()]));
process_template.set_cwd(PathBuf::from("/"));
let file = File::create(process_file.clone()).unwrap();
serde_json::to_writer(&file, &process_template).unwrap();
let logger = slog::Logger::root(slog::Discard, o!());
let root = tempdir().unwrap();
// Since tests are executed concurrently, container_id must be unique in tests with cgroup.
// Or the cgroup directory may be removed by other tests in advance.
let id = "test_activated_container_create_with_process".to_string();
let pid = getpid().as_raw();
let mut spec = create_dummy_spec();
spec.root_mut()
.as_mut()
.unwrap()
.set_path(bundle_dir.path().join(TEST_ROOTFS_PATH));
create_activated_dirs(root.path(), &id, bundle_dir.path());
let status = create_custom_dummy_status(&id, pid, root.path(), &spec);
status.save().unwrap();
// create empty cgroup directory to avoid is_pause failing
let cgroup = create_dummy_cgroup(Path::new(id.as_str()));
defer!(cgroup.delete().unwrap());
let launcher = ActivatedContainerBuilder::default()
.id(id)
.root(root.into_path())
.console_socket(Some(PathBuf::from(TEST_CONSOLE_SOCKET_PATH)))
.pid_file(None)
.tty(true)
.cwd(Some(PathBuf::from(TEST_BUNDLE_PATH)))
.env(vec![
("K1".to_string(), "V1".to_string()),
("K2".to_string(), "V2".to_string()),
])
.no_new_privs(true)
.process(Some(process_file))
.args(vec!["sleep".to_string(), "10".to_string()])
.build()
.unwrap()
.create_launcher(&logger)
.unwrap();
assert!(!launcher.init);
assert_eq!(
launcher
.runner
.config
.spec
.unwrap()
.process()
.clone()
.unwrap(),
process_template
);
}
}

View File

@@ -0,0 +1,77 @@
// Copyright 2021-2022 Sony Group Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
use anyhow::anyhow;
use anyhow::Result;
use cgroups;
use cgroups::freezer::{FreezerController, FreezerState};
use std::{thread, time};
// Try to remove the provided cgroups path five times with increasing delay between tries.
// If after all there are not removed cgroups, an appropriate error will be returned.
pub fn remove_cgroup_dir(cgroup: &cgroups::Cgroup) -> Result<()> {
let mut retries = 5;
let mut delay = time::Duration::from_millis(10);
while retries != 0 {
if retries != 5 {
delay *= 2;
thread::sleep(delay);
}
if cgroup.delete().is_ok() {
return Ok(());
}
retries -= 1;
}
Err(anyhow!("failed to remove cgroups paths"))
}
// Make sure we get a stable freezer state, so retry if the cgroup is still undergoing freezing.
pub fn get_freezer_state(freezer: &FreezerController) -> Result<FreezerState> {
let mut retries = 10;
while retries != 0 {
let state = freezer.state()?;
match state {
FreezerState::Thawed => return Ok(FreezerState::Thawed),
FreezerState::Frozen => return Ok(FreezerState::Frozen),
FreezerState::Freezing => {
// sleep for 10 ms, wait for the cgroup to finish freezing
thread::sleep(time::Duration::from_millis(10));
retries -= 1;
}
}
}
Ok(FreezerState::Freezing)
}
// check whether freezer state is frozen
pub fn is_paused(cgroup: &cgroups::Cgroup) -> Result<bool> {
let freezer_controller: &FreezerController = cgroup
.controller_of()
.ok_or_else(|| anyhow!("failed to get freezer controller"))?;
let freezer_state = get_freezer_state(freezer_controller)?;
match freezer_state {
FreezerState::Frozen => Ok(true),
_ => Ok(false),
}
}
pub fn freeze(cgroup: &cgroups::Cgroup, state: FreezerState) -> Result<()> {
let freezer_controller: &FreezerController = cgroup
.controller_of()
.ok_or_else(|| anyhow!("failed to get freezer controller"))?;
match state {
FreezerState::Frozen => {
freezer_controller.freeze()?;
}
FreezerState::Thawed => {
freezer_controller.thaw()?;
}
_ => return Err(anyhow!("invalid freezer state")),
}
Ok(())
}

View File

@@ -0,0 +1,437 @@
// Copyright 2021-2022 Sony Group Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
use crate::cgroup::{freeze, remove_cgroup_dir};
use crate::status::{self, get_current_container_state, Status};
use anyhow::{anyhow, Result};
use cgroups;
use cgroups::freezer::FreezerState;
use cgroups::hierarchies::is_cgroup2_unified_mode;
use nix::sys::signal::kill;
use nix::{
sys::signal::Signal,
sys::signal::SIGKILL,
unistd::{chdir, unlink, Pid},
};
use procfs;
use runtime_spec::{ContainerState, State as OCIState};
use rustjail::cgroups::fs::Manager as CgroupManager;
use rustjail::{
container::{BaseContainer, LinuxContainer, EXEC_FIFO_FILENAME},
process::{Process, ProcessOperations},
specconv::CreateOpts,
};
use scopeguard::defer;
use slog::{debug, info, Logger};
use std::{
env::current_dir,
fs,
path::{Path, PathBuf},
};
use kata_sys_util::hooks::HookStates;
pub const CONFIG_FILE_NAME: &str = "config.json";
#[derive(Debug, Copy, Clone, PartialEq)]
pub enum ContainerAction {
Create,
Start,
Run,
}
#[derive(Debug)]
pub struct Container {
pub status: Status,
pub state: ContainerState,
pub cgroup: cgroups::Cgroup,
}
// Container represents a container that is created by the container runtime.
impl Container {
pub fn load(state_root: &Path, id: &str) -> Result<Self> {
let status = Status::load(state_root, id)?;
let spec = status
.config
.spec
.as_ref()
.ok_or_else(|| anyhow!("spec config was not present"))?;
let linux = spec
.linux()
.as_ref()
.ok_or_else(|| anyhow!("linux config was not present"))?;
let cpath = if linux.cgroups_path().is_none() {
id.to_string()
} else {
linux
.cgroups_path()
.clone()
.unwrap_or_default()
.display()
.to_string()
.trim_start_matches('/')
.to_string()
};
let cgroup = cgroups::Cgroup::load(cgroups::hierarchies::auto(), cpath);
let state = get_current_container_state(&status, &cgroup)?;
Ok(Self {
status,
state,
cgroup,
})
}
pub fn processes(&self) -> Result<Vec<Pid>> {
let pids = self.cgroup.tasks();
let result = pids.iter().map(|x| Pid::from_raw(x.pid as i32)).collect();
Ok(result)
}
pub fn kill(&self, signal: Signal, all: bool) -> Result<()> {
if all {
let pids = self.processes()?;
for pid in pids {
if !status::is_process_running(pid)? {
continue;
}
kill(pid, signal)?;
}
} else {
// If --all option is not specified and the container is stopped,
// kill operation generates an error in accordance with the OCI runtime spec.
if self.state == ContainerState::Stopped {
return Err(anyhow!(
"container {} can't be killed because it is {:?}",
self.status.id,
self.state
)
// This error message mustn't be chagned because the containerd integration tests
// expect that OCI container runtimes return the message.
// Ref. https://github.com/containerd/containerd/blob/release/1.7/pkg/process/utils.go#L135
.context("container not running"));
}
let pid = Pid::from_raw(self.status.pid);
if status::is_process_running(pid)? {
kill(pid, signal)?;
}
}
// For cgroup v1, killing a process in a frozen cgroup does nothing until it's thawed.
// Only thaw the cgroup for SIGKILL.
// Ref: https://github.com/opencontainers/runc/pull/3217
if !is_cgroup2_unified_mode() && self.state == ContainerState::Paused && signal == SIGKILL {
freeze(&self.cgroup, FreezerState::Thawed)?;
}
Ok(())
}
pub async fn delete(&self, force: bool, logger: &Logger) -> Result<()> {
let status = &self.status;
let spec = status
.config
.spec
.as_ref()
.ok_or_else(|| anyhow!("spec config was not present in the status"))?;
let oci_state = OCIState {
version: status.oci_version.clone(),
id: status.id.clone(),
status: self.state,
pid: status.pid,
bundle: status
.bundle
.to_str()
.ok_or_else(|| anyhow!("invalid bundle path"))?
.to_string(),
annotations: spec.annotations().clone().unwrap_or_default(),
};
if let Some(hooks) = spec.hooks().as_ref() {
info!(&logger, "Poststop Hooks");
let mut poststop_hookstates = HookStates::new();
poststop_hookstates.execute_hooks(
&hooks.poststop().clone().unwrap_or_default(),
Some(oci_state.clone()),
)?;
}
match oci_state.status {
ContainerState::Stopped => {
self.destroy()?;
}
ContainerState::Created => {
// Kill an init process
self.kill(SIGKILL, false)?;
self.destroy()?;
}
_ => {
if force {
self.kill(SIGKILL, true)?;
self.destroy()?;
} else {
return Err(anyhow!(
"cannot delete container {} that is not stopped",
&status.id
));
}
}
}
Ok(())
}
pub fn pause(&self) -> Result<()> {
if self.state != ContainerState::Running && self.state != ContainerState::Created {
return Err(anyhow!(
"failed to pause container: current status is: {:?}",
self.state
));
}
freeze(&self.cgroup, FreezerState::Frozen)?;
Ok(())
}
pub fn resume(&self) -> Result<()> {
if self.state != ContainerState::Paused {
return Err(anyhow!(
"failed to resume container: current status is: {:?}",
self.state
));
}
freeze(&self.cgroup, FreezerState::Thawed)?;
Ok(())
}
pub fn destroy(&self) -> Result<()> {
remove_cgroup_dir(&self.cgroup)?;
self.status.remove_dir()
}
}
/// Used to run a process. If init is set, it will create a container and run the process in it.
/// If init is not set, it will run the process in an existing container.
#[derive(Debug)]
pub struct ContainerLauncher {
pub id: String,
pub bundle: PathBuf,
pub state_root: PathBuf,
pub init: bool,
pub runner: LinuxContainer,
pub pid_file: Option<PathBuf>,
}
impl ContainerLauncher {
pub fn new(
id: &str,
bundle: &Path,
state_root: &Path,
init: bool,
runner: LinuxContainer,
pid_file: Option<PathBuf>,
) -> Self {
ContainerLauncher {
id: id.to_string(),
bundle: bundle.to_path_buf(),
state_root: state_root.to_path_buf(),
init,
runner,
pid_file,
}
}
/// Launch a process. For init containers, we will create a container. For non-init, it will join an existing container.
pub async fn launch(&mut self, action: ContainerAction, logger: &Logger) -> Result<()> {
if self.init {
self.spawn_container(action, logger).await?;
} else {
if action == ContainerAction::Create {
return Err(anyhow!(
"ContainerAction::Create is used for init-container only"
));
}
self.spawn_process(action, logger).await?;
}
if let Some(pid_file) = self.pid_file.as_ref() {
fs::write(
pid_file,
format!("{}", self.runner.get_process(self.id.as_str())?.pid()),
)?;
}
Ok(())
}
/// Create the container by invoking runner to spawn the first process and save status.
async fn spawn_container(&mut self, action: ContainerAction, logger: &Logger) -> Result<()> {
// State root path root/id has been created in LinuxContainer::new(),
// so we don't have to create it again.
// Spawn a new process in the container by using the agent's codes.
self.spawn_process(action, logger).await?;
let status = self.get_status()?;
status.save()?;
debug!(logger, "saved status is {:?}", status);
// Clean up the fifo file created by LinuxContainer, which is used for block the created process.
if action == ContainerAction::Run || action == ContainerAction::Start {
let fifo_path = get_fifo_path(&status);
if fifo_path.exists() {
unlink(&fifo_path)?;
}
}
Ok(())
}
/// Generate rustjail::Process from OCI::Process
fn get_process(&self, logger: &Logger) -> Result<Process> {
let spec = self.runner.config.spec.as_ref().unwrap();
if spec.process().is_some() {
Ok(Process::new(
logger,
spec.process().as_ref().unwrap(),
// rustjail::LinuxContainer use the exec_id to identify processes in a container,
// so we can get the spawned process by ctr.get_process(exec_id) later.
// Since LinuxContainer is temporarily created to spawn one process in each runk invocation,
// we can use arbitrary string as the exec_id. Here we choose the container id.
&self.id,
self.init,
0,
None,
)?)
} else {
Err(anyhow!("no process configuration"))
}
}
/// Spawn a new process in the container by invoking runner.
async fn spawn_process(&mut self, action: ContainerAction, logger: &Logger) -> Result<()> {
// Agent will chdir to bundle_path before creating LinuxContainer. Just do the same as agent.
let current_dir = current_dir()?;
chdir(&self.bundle)?;
defer! {
chdir(&current_dir).unwrap();
}
let process = self.get_process(logger)?;
match action {
ContainerAction::Create => {
self.runner.start(process).await?;
}
ContainerAction::Start => {
self.runner.exec().await?;
}
ContainerAction::Run => {
self.runner.run(process).await?;
}
}
Ok(())
}
/// Generate runk specified Status
fn get_status(&self) -> Result<Status> {
let oci_state = self.runner.oci_state()?;
// read start time from /proc/<pid>/stat
let proc = procfs::process::Process::new(self.runner.init_process_pid)?;
let process_start_time = proc.stat()?.starttime;
Status::new(
&self.state_root,
&self.bundle,
oci_state,
process_start_time,
self.runner.created,
self.runner
.cgroup_manager
.as_ref()
.as_any()?
.downcast_ref::<CgroupManager>()
.unwrap()
.clone(),
self.runner.config.clone(),
)
}
}
pub fn create_linux_container(
id: &str,
root: &Path,
config: CreateOpts,
console_socket: Option<PathBuf>,
logger: &Logger,
) -> Result<LinuxContainer> {
let mut container = LinuxContainer::new(
id,
root.to_str()
.map(|s| s.to_string())
.ok_or_else(|| anyhow!("failed to convert bundle path"))?
.as_str(),
None,
config,
logger,
)?;
if let Some(socket_path) = console_socket.as_ref() {
container.set_console_socket(socket_path)?;
}
Ok(container)
}
// Load rustjail's Linux container.
// "uid_map_path" and "gid_map_path" are always empty, so they are not set.
pub fn load_linux_container(
status: &Status,
console_socket: Option<PathBuf>,
logger: &Logger,
) -> Result<LinuxContainer> {
let mut container = LinuxContainer::new(
&status.id,
&status
.root
.to_str()
.map(|s| s.to_string())
.ok_or_else(|| anyhow!("failed to convert a root path"))?,
None,
status.config.clone(),
logger,
)?;
if let Some(socket_path) = console_socket.as_ref() {
container.set_console_socket(socket_path)?;
}
container.init_process_pid = status.pid;
container.init_process_start_time = status.process_start_time;
container.created = status.created.into();
Ok(container)
}
pub fn get_config_path<P: AsRef<Path>>(bundle: P) -> PathBuf {
bundle.as_ref().join(CONFIG_FILE_NAME)
}
pub fn get_fifo_path(status: &Status) -> PathBuf {
status.root.join(&status.id).join(EXEC_FIFO_FILENAME)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::utils::test_utils::*;
use rustjail::container::EXEC_FIFO_FILENAME;
use std::path::PathBuf;
#[test]
fn test_get_config_path() {
let test_data = PathBuf::from(TEST_BUNDLE_PATH).join(CONFIG_FILE_NAME);
assert_eq!(get_config_path(TEST_BUNDLE_PATH), test_data);
}
#[test]
fn test_get_fifo_path() {
let test_data = PathBuf::from(TEST_STATE_ROOT_PATH)
.join(TEST_CONTAINER_ID)
.join(EXEC_FIFO_FILENAME);
let status = create_dummy_status();
assert_eq!(get_fifo_path(&status), test_data);
}
}

View File

@@ -0,0 +1,140 @@
// Copyright 2022 Sony Group Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
use crate::container::{load_linux_container, Container, ContainerLauncher};
use anyhow::{anyhow, Result};
use derive_builder::Builder;
use runtime_spec::ContainerState;
use slog::{debug, Logger};
use std::path::PathBuf;
/// Used for start command. It will prepare the options used for starting a new container.
#[derive(Default, Builder, Debug, Clone)]
#[builder(build_fn(validate = "Self::validate"))]
pub struct CreatedContainer {
id: String,
root: PathBuf,
}
impl CreatedContainerBuilder {
/// pre-validate before building CreatedContainer
fn validate(&self) -> Result<(), String> {
// ensure container exists
let id = self.id.as_ref().unwrap();
let root = self.root.as_ref().unwrap();
let path = root.join(id);
if !path.as_path().exists() {
return Err(format!("container {} does not exist", id));
}
Ok(())
}
}
impl CreatedContainer {
/// Create ContainerLauncher that can be used to start a process from an existing init container.
/// It reads the spec from status file of the init container.
pub fn create_launcher(self, logger: &Logger) -> Result<ContainerLauncher> {
debug!(logger, "enter CreatedContainer::create_launcher {:?}", self);
let container = Container::load(&self.root, &self.id)?;
if container.state != ContainerState::Created {
return Err(anyhow!(
"cannot start a container in the {:?} state",
container.state
));
}
let config = container.status.config.clone();
debug!(
logger,
"Prepare LinuxContainer for starting with config: {:?}", config
);
let runner = load_linux_container(&container.status, None, logger)?;
Ok(ContainerLauncher::new(
&self.id,
&container.status.bundle,
&self.root,
true,
runner,
None,
))
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::status::Status;
use crate::utils::test_utils::*;
use nix::sys::stat::Mode;
use nix::unistd::{self, getpid};
use rustjail::container::EXEC_FIFO_FILENAME;
use scopeguard::defer;
use slog::o;
use std::fs::create_dir_all;
use std::path::Path;
use tempfile::tempdir;
use test_utils::skip_if_not_root;
fn create_created_container_dirs(root: &Path, id: &str, bundle: &Path) {
Status::create_dir(root, id).unwrap();
let fifo = root.join(id).join(EXEC_FIFO_FILENAME);
unistd::mkfifo(&fifo, Mode::from_bits(0o644).unwrap()).unwrap();
create_dir_all(bundle.join(TEST_ROOTFS_PATH)).unwrap();
}
#[test]
fn test_created_container_validate() {
let root = tempdir().unwrap();
let id = TEST_CONTAINER_ID.to_string();
let result = CreatedContainerBuilder::default()
.id(id)
.root(root.path().to_path_buf())
.build();
assert!(result.is_err());
}
#[test]
fn test_created_container_create_launcher() {
// create cgroup directory needs root permission
skip_if_not_root!();
let logger = slog::Logger::root(slog::Discard, o!());
let bundle_dir = tempdir().unwrap();
let root = tempdir().unwrap();
// Since tests are executed concurrently, container_id must be unique in tests with cgroup.
// Or the cgroup directory may be removed by other tests in advance.
let id = "test_created_container_create".to_string();
create_created_container_dirs(root.path(), &id, bundle_dir.path());
let pid = getpid().as_raw();
let mut spec = create_dummy_spec();
spec.root_mut()
.as_mut()
.unwrap()
.set_path(bundle_dir.path().join(TEST_ROOTFS_PATH));
let status = create_custom_dummy_status(&id, pid, root.path(), &spec);
status.save().unwrap();
// create empty cgroup directory to avoid is_pause failing
let cgroup = create_dummy_cgroup(Path::new(id.as_str()));
defer!(cgroup.delete().unwrap());
let launcher = CreatedContainerBuilder::default()
.id(id.clone())
.root(root.into_path())
.build()
.unwrap()
.create_launcher(&logger)
.unwrap();
assert!(launcher.init);
assert_eq!(launcher.runner.config.spec.unwrap(), spec);
assert_eq!(launcher.runner.id, id);
}
}

View File

@@ -0,0 +1,215 @@
// Copyright 2021-2022 Sony Group Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
use crate::container::{create_linux_container, get_config_path, ContainerLauncher};
use crate::status::Status;
use crate::utils::{canonicalize_spec_root, validate_spec};
use anyhow::{anyhow, Result};
use derive_builder::Builder;
use oci_spec::runtime::Spec;
use rustjail::specconv::CreateOpts;
use slog::{debug, Logger};
use std::path::PathBuf;
/// Used for create and run commands. It will prepare the options used for creating a new container.
#[derive(Default, Builder, Debug, Clone)]
#[builder(build_fn(validate = "Self::validate"))]
pub struct InitContainer {
id: String,
bundle: PathBuf,
root: PathBuf,
console_socket: Option<PathBuf>,
pid_file: Option<PathBuf>,
}
impl InitContainerBuilder {
/// pre-validate before building InitContainer
fn validate(&self) -> Result<(), String> {
// ensure container hasn't already been created
let id = self.id.as_ref().unwrap();
let root = self.root.as_ref().unwrap();
let status_path = Status::get_dir_path(root, id);
if status_path.exists() {
return Err(format!(
"container {} already exists at path {:?}",
id, root
));
}
Ok(())
}
}
impl InitContainer {
/// Create ContainerLauncher that can be used to launch a new container.
/// It will read the spec under bundle path.
pub fn create_launcher(self, logger: &Logger) -> Result<ContainerLauncher> {
debug!(logger, "enter InitContainer::create_launcher {:?}", self);
let bundle_canon = self.bundle.canonicalize()?;
let config_path = get_config_path(&bundle_canon);
let mut spec = Spec::load(
config_path
.to_str()
.ok_or_else(|| anyhow!("invalid config path"))?,
)?;
// Only absolute rootfs path is valid when creating LinuxContainer later.
canonicalize_spec_root(&mut spec, &bundle_canon)?;
debug!(logger, "load spec from config file: {:?}", spec);
validate_spec(&spec, &self.console_socket)?;
let config = CreateOpts {
cgroup_name: "".to_string(),
use_systemd_cgroup: false,
// TODO: liboci-cli does not support --no-pivot option for create and run command.
// After liboci-cli supports the option, we will change the following code.
// no_pivot_root: self.no_pivot,
no_pivot_root: false,
no_new_keyring: false,
spec: Some(spec),
rootless_euid: false,
rootless_cgroup: false,
container_name: "".to_string(),
};
debug!(logger, "create LinuxContainer with config: {:?}", config);
let container =
create_linux_container(&self.id, &self.root, config, self.console_socket, logger)?;
Ok(ContainerLauncher::new(
&self.id,
&bundle_canon,
&self.root,
true,
container,
self.pid_file,
))
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::container::CONFIG_FILE_NAME;
use crate::utils::test_utils::*;
use oci_spec::runtime::Process;
use slog::o;
use std::fs::{create_dir, File};
use std::path::Path;
use tempfile::tempdir;
#[test]
fn test_init_container_validate() {
let root = tempdir().unwrap();
let id = TEST_CONTAINER_ID.to_string();
Status::create_dir(root.path(), id.as_str()).unwrap();
let result = InitContainerBuilder::default()
.id(id)
.root(root.path().to_path_buf())
.bundle(PathBuf::from(TEST_BUNDLE_PATH))
.pid_file(None)
.console_socket(None)
.build();
assert!(result.is_err());
}
#[test]
fn test_init_container_create_launcher() {
#[cfg(all(target_arch = "powerpc64", target_endian = "little"))]
skip_if_not_root!();
let logger = slog::Logger::root(slog::Discard, o!());
let root_dir = tempdir().unwrap();
let bundle_dir = tempdir().unwrap();
// create dummy rootfs
create_dir(bundle_dir.path().join(TEST_ROOTFS_PATH)).unwrap();
let config_file = bundle_dir.path().join(CONFIG_FILE_NAME);
let mut spec = create_dummy_spec();
let file = File::create(config_file).unwrap();
serde_json::to_writer(&file, &spec).unwrap();
spec.root_mut()
.as_mut()
.unwrap()
.set_path(bundle_dir.path().join(TEST_ROOTFS_PATH));
let test_data = TestContainerData {
// Since tests are executed concurrently, container_id must be unique in tests with cgroup.
// Or the cgroup directory may be removed by other tests in advance.
id: String::from("test_init_container_create_launcher"),
bundle: bundle_dir.path().to_path_buf(),
root: root_dir.into_path(),
console_socket: Some(PathBuf::from(TEST_CONSOLE_SOCKET_PATH)),
config: CreateOpts {
spec: Some(spec),
..Default::default()
},
pid_file: Some(PathBuf::from(TEST_PID_FILE_PATH)),
};
let launcher = InitContainerBuilder::default()
.id(test_data.id.clone())
.bundle(test_data.bundle.clone())
.root(test_data.root.clone())
.console_socket(test_data.console_socket.clone())
.pid_file(test_data.pid_file.clone())
.build()
.unwrap()
.create_launcher(&logger)
.unwrap();
// LinuxContainer doesn't impl PartialEq, so we need to compare the fields manually.
assert!(launcher.init);
assert_eq!(launcher.bundle, test_data.bundle);
assert_eq!(launcher.state_root, test_data.root);
assert_eq!(launcher.pid_file, test_data.pid_file);
assert_eq!(launcher.runner.id, test_data.id);
assert_eq!(launcher.runner.config.spec, test_data.config.spec);
assert_eq!(
Some(launcher.runner.console_socket),
test_data.console_socket
);
// If it is run by root, create_launcher will create cgroup dirs successfully. So we need to do some cleanup stuff.
if nix::unistd::Uid::effective().is_root() {
clean_up_cgroup(Path::new(&test_data.id));
}
}
#[test]
fn test_init_container_tty_err() {
let logger = slog::Logger::root(slog::Discard, o!());
let bundle_dir = tempdir().unwrap();
let config_file = bundle_dir.path().join(CONFIG_FILE_NAME);
let mut spec = Spec::default();
spec.set_process(Some(Process::default()));
spec.process_mut()
.as_mut()
.unwrap()
.set_terminal(Some(true));
let file = File::create(config_file).unwrap();
serde_json::to_writer(&file, &spec).unwrap();
let test_data = TestContainerData {
id: String::from(TEST_CONTAINER_ID),
bundle: bundle_dir.into_path(),
root: tempdir().unwrap().into_path(),
console_socket: None,
config: CreateOpts {
spec: Some(spec),
..Default::default()
},
pid_file: None,
};
let result = InitContainerBuilder::default()
.id(test_data.id.clone())
.bundle(test_data.bundle.clone())
.root(test_data.root.clone())
.console_socket(test_data.console_socket.clone())
.pid_file(test_data.pid_file)
.build()
.unwrap()
.create_launcher(&logger);
assert!(result.is_err());
}
}

View File

@@ -0,0 +1,12 @@
// Copyright 2021-2022 Sony Group Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
pub mod activated_builder;
pub mod cgroup;
pub mod container;
pub mod created_builder;
pub mod init_builder;
pub mod status;
pub mod utils;

View File

@@ -0,0 +1,236 @@
// Copyright 2021-2022 Sony Group Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
use crate::cgroup::is_paused;
use crate::container::get_fifo_path;
use crate::utils::*;
use anyhow::{anyhow, Result};
use chrono::{DateTime, Utc};
use libc::pid_t;
use nix::{
errno::Errno,
sys::{signal::kill, stat::Mode},
unistd::Pid,
};
use procfs::process::ProcState;
use runtime_spec::{ContainerState, State as OCIState};
use rustjail::{cgroups::fs::Manager as CgroupManager, specconv::CreateOpts};
use serde::{Deserialize, Serialize};
use std::{
fs::{self, File, OpenOptions},
path::{Path, PathBuf},
time::SystemTime,
};
const STATUS_FILE: &str = "status.json";
#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct Status {
pub oci_version: String,
pub id: String,
pub pid: pid_t,
pub root: PathBuf,
pub bundle: PathBuf,
pub rootfs: String,
pub process_start_time: u64,
pub created: DateTime<Utc>,
// Methods of Manager traits in rustjail are invisible, and CgroupManager.cgroup can't be serialized.
// So it is cumbersome to manage cgroups by this field. Instead, we use cgroups-rs::cgroup directly in Container to manager cgroups.
// Another solution is making some methods public outside rustjail and adding getter/setter for CgroupManager.cgroup.
// Temporarily keep this field for compatibility.
pub cgroup_manager: CgroupManager,
pub config: CreateOpts,
}
impl Status {
pub fn new(
root: &Path,
bundle: &Path,
oci_state: OCIState,
process_start_time: u64,
created_time: SystemTime,
cgroup_mg: CgroupManager,
config: CreateOpts,
) -> Result<Self> {
let created = DateTime::from(created_time);
let rootfs = config
.clone()
.spec
.ok_or_else(|| anyhow!("spec config was not present"))?
.root()
.as_ref()
.ok_or_else(|| anyhow!("root config was not present in the spec"))?
.path()
.clone();
Ok(Self {
oci_version: oci_state.version,
id: oci_state.id,
pid: oci_state.pid,
root: root.to_path_buf(),
bundle: bundle.to_path_buf(),
rootfs: rootfs.display().to_string(),
process_start_time,
created,
cgroup_manager: cgroup_mg,
config,
})
}
pub fn save(&self) -> Result<()> {
let state_file_path = Self::get_file_path(&self.root, &self.id);
if !&self.root.exists() {
create_dir_with_mode(&self.root, Mode::S_IRWXU, true)?;
}
let file = OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.open(state_file_path)?;
serde_json::to_writer(&file, self)?;
Ok(())
}
pub fn load(state_root: &Path, id: &str) -> Result<Self> {
let state_file_path = Self::get_file_path(state_root, id);
if !state_file_path.exists() {
return Err(anyhow!("container \"{}\" does not exist", id));
}
let file = File::open(&state_file_path)?;
let state: Self = serde_json::from_reader(&file)?;
Ok(state)
}
pub fn create_dir(state_root: &Path, id: &str) -> Result<()> {
let state_dir_path = Self::get_dir_path(state_root, id);
if !state_dir_path.exists() {
create_dir_with_mode(state_dir_path, Mode::S_IRWXU, true)?;
} else {
return Err(anyhow!("container with id exists: \"{}\"", id));
}
Ok(())
}
pub fn remove_dir(&self) -> Result<()> {
let state_dir_path = Self::get_dir_path(&self.root, &self.id);
fs::remove_dir_all(state_dir_path)?;
Ok(())
}
pub fn get_dir_path(state_root: &Path, id: &str) -> PathBuf {
state_root.join(id)
}
pub fn get_file_path(state_root: &Path, id: &str) -> PathBuf {
state_root.join(id).join(STATUS_FILE)
}
}
pub fn is_process_running(pid: Pid) -> Result<bool> {
match kill(pid, None) {
Err(errno) => {
if errno != Errno::ESRCH {
return Err(anyhow!("failed to kill process {}: {:?}", pid, errno));
}
Ok(false)
}
Ok(()) => Ok(true),
}
}
// Returns the current state of a container. It will read cgroupfs and procfs to determine the state.
// https://github.com/opencontainers/runc/blob/86d6898f3052acba1ebcf83aa2eae3f6cc5fb471/libcontainer/container_linux.go#L1953
pub fn get_current_container_state(
status: &Status,
cgroup: &cgroups::Cgroup,
) -> Result<ContainerState> {
if is_paused(cgroup)? {
return Ok(ContainerState::Paused);
}
let proc = procfs::process::Process::new(status.pid);
// if reading /proc/<pid> occurs error, then the process is not running
if proc.is_err() {
return Ok(ContainerState::Stopped);
}
let proc_stat = proc.unwrap().stat()?;
// if start time is not equal, then the pid is reused, and the process is not running
if proc_stat.starttime != status.process_start_time {
return Ok(ContainerState::Stopped);
}
match proc_stat.state()? {
ProcState::Zombie | ProcState::Dead => Ok(ContainerState::Stopped),
_ => {
let fifo = get_fifo_path(status);
if fifo.exists() {
return Ok(ContainerState::Created);
}
Ok(ContainerState::Running)
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::utils::test_utils::*;
use ::test_utils::skip_if_not_root;
use chrono::{DateTime, Utc};
use nix::unistd::getpid;
use runtime_spec::ContainerState;
use rustjail::cgroups::fs::Manager as CgroupManager;
use scopeguard::defer;
use std::path::Path;
use std::time::SystemTime;
#[test]
fn test_status() {
let cgm: CgroupManager = serde_json::from_str(TEST_CGM_DATA).unwrap();
let oci_state = create_dummy_oci_state();
let created = SystemTime::now();
let status = Status::new(
Path::new(TEST_STATE_ROOT_PATH),
Path::new(TEST_BUNDLE_PATH),
oci_state.clone(),
1,
created,
cgm,
create_dummy_opts(),
)
.unwrap();
assert_eq!(status.id, oci_state.id);
assert_eq!(status.pid, oci_state.pid);
assert_eq!(status.process_start_time, 1);
assert_eq!(status.created, DateTime::<Utc>::from(created));
}
#[test]
fn test_is_process_running() {
let pid = getpid();
let ret = is_process_running(pid).unwrap();
assert!(ret);
}
#[test]
fn test_get_current_container_state() {
skip_if_not_root!();
let mut status = create_dummy_status();
status.id = "test_get_current_container_state".to_string();
// crete a dummy cgroup to make sure is_pause doesn't return error
let cgroup = create_dummy_cgroup(Path::new(&status.id));
defer!(cgroup.delete().unwrap());
let state = get_current_container_state(&status, &cgroup).unwrap();
assert_eq!(state, ContainerState::Running);
}
}

View File

@@ -0,0 +1,294 @@
// Copyright 2021-2022 Sony Group Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
use anyhow::{anyhow, Result};
use nix::sys::stat::Mode;
use oci_spec::runtime::{Process, Spec};
use std::{
fs::{DirBuilder, File},
io::{prelude::*, BufReader},
os::unix::fs::DirBuilderExt,
path::{Path, PathBuf},
};
pub fn lines_from_file<P: AsRef<Path>>(path: P) -> Result<Vec<String>> {
let file = File::open(&path)?;
let buf = BufReader::new(file);
Ok(buf
.lines()
.map(|v| v.expect("could not parse line"))
.collect())
}
pub fn create_dir_with_mode<P: AsRef<Path>>(path: P, mode: Mode, recursive: bool) -> Result<()> {
let path = path.as_ref();
if path.exists() {
return Err(anyhow!("{} already exists", path.display()));
}
Ok(DirBuilder::new()
.recursive(recursive)
.mode(mode.bits())
.create(path)?)
}
/// If root in spec is a relative path, make it absolute.
pub fn canonicalize_spec_root(spec: &mut Spec, bundle_canon: &Path) -> Result<()> {
let spec_root = spec
.root_mut()
.as_mut()
.ok_or_else(|| anyhow!("root config was not present in the spec file"))?;
let rootfs_path = &spec_root.path();
if !rootfs_path.is_absolute() {
let bundle_canon_path = bundle_canon.join(rootfs_path).canonicalize()?;
spec_root.set_path(bundle_canon_path);
}
Ok(())
}
/// Check whether spec is valid. Now runk only support detach mode.
pub fn validate_spec(spec: &Spec, console_socket: &Option<PathBuf>) -> Result<()> {
validate_process_spec(spec.process())?;
if let Some(process) = spec.process().as_ref() {
// runk always launches containers with detached mode, so users have to
// use a console socket with run or create operation when a terminal is used.
if process.terminal().is_some() && console_socket.is_none() {
return Err(anyhow!(
"cannot allocate a pseudo-TTY without setting a console socket"
));
}
}
Ok(())
}
// Validate process just like runc, https://github.com/opencontainers/runc/pull/623
pub fn validate_process_spec(process: &Option<Process>) -> Result<()> {
let process = process
.as_ref()
.ok_or_else(|| anyhow!("process property must not be empty"))?;
if process.cwd().as_os_str().is_empty() {
return Err(anyhow!("cwd property must not be empty"));
}
let cwd = process.cwd();
if !cwd.is_absolute() {
return Err(anyhow!("cwd must be an absolute path"));
}
if process.args().is_none() {
return Err(anyhow!("args must not be empty"));
}
Ok(())
}
#[cfg(test)]
pub(crate) mod test_utils {
use super::*;
use crate::status::Status;
use chrono::DateTime;
use nix::unistd::getpid;
use oci::{LinuxBuilder, LinuxNamespaceBuilder, Process, Root, Spec};
use oci_spec::runtime as oci;
use runtime_spec::{ContainerState, State as OCIState};
use rustjail::{
cgroups::fs::Manager as CgroupManager, container::TYPETONAME, specconv::CreateOpts,
};
use std::{fs::create_dir_all, path::Path, time::SystemTime};
use tempfile::tempdir;
pub const TEST_CONTAINER_ID: &str = "test";
pub const TEST_STATE_ROOT_PATH: &str = "/state";
pub const TEST_BUNDLE_PATH: &str = "/bundle";
pub const TEST_ROOTFS_PATH: &str = "rootfs";
pub const TEST_ANNOTATION: &str = "test-annotation";
pub const TEST_CONSOLE_SOCKET_PATH: &str = "/test-console-sock";
pub const TEST_PROCESS_FILE_NAME: &str = "process.json";
pub const TEST_PID_FILE_PATH: &str = "/test-pid";
pub const TEST_HOST_NAME: &str = "test-host";
pub const TEST_OCI_SPEC_VERSION: &str = "1.0.2";
pub const TEST_CGM_DATA: &str = r#"{
"paths": {
"devices": "/sys/fs/cgroup/devices"
},
"mounts": {
"devices": "/sys/fs/cgroup/devices"
},
"cpath": "test"
}"#;
#[derive(Debug)]
pub struct TestContainerData {
pub id: String,
pub bundle: PathBuf,
pub root: PathBuf,
pub console_socket: Option<PathBuf>,
pub pid_file: Option<PathBuf>,
pub config: CreateOpts,
}
pub fn create_dummy_spec() -> Spec {
let linux = LinuxBuilder::default()
.namespaces(
TYPETONAME
.iter()
.filter(|&(_, &name)| name != "user")
.map(|ns| {
LinuxNamespaceBuilder::default()
.typ(ns.0.clone())
.path(PathBuf::from(""))
.build()
.unwrap()
})
.collect::<Vec<_>>(),
)
.build()
.unwrap();
let mut process = Process::default();
process.set_args(Some(vec!["sleep".to_string(), "10".to_string()]));
process.set_env(Some(vec!["PATH=/bin:/usr/bin".to_string()]));
process.set_cwd(PathBuf::from("/"));
let mut root = Root::default();
root.set_path(PathBuf::from(TEST_ROOTFS_PATH));
root.set_readonly(Some(false));
let mut spec = Spec::default();
spec.set_version(TEST_OCI_SPEC_VERSION.to_string());
spec.set_process(Some(process));
spec.set_hostname(Some(TEST_HOST_NAME.to_string()));
spec.set_root(Some(root));
spec.set_linux(Some(linux));
spec
}
pub fn create_dummy_opts() -> CreateOpts {
let mut spec = Spec::default();
spec.set_root(Some(Root::default()));
CreateOpts {
cgroup_name: "".to_string(),
use_systemd_cgroup: false,
no_pivot_root: false,
no_new_keyring: false,
spec: Some(spec),
rootless_euid: false,
rootless_cgroup: false,
container_name: "".to_string(),
}
}
pub fn create_dummy_oci_state() -> OCIState {
OCIState {
version: TEST_OCI_SPEC_VERSION.to_string(),
id: TEST_CONTAINER_ID.to_string(),
status: ContainerState::Running,
pid: getpid().as_raw(),
bundle: TEST_BUNDLE_PATH.to_string(),
annotations: [(TEST_ANNOTATION.to_string(), TEST_ANNOTATION.to_string())]
.iter()
.cloned()
.collect(),
}
}
pub fn create_dummy_status() -> Status {
let cgm: CgroupManager = serde_json::from_str(TEST_CGM_DATA).unwrap();
let oci_state = create_dummy_oci_state();
let created = SystemTime::now();
let start_time = procfs::process::Process::new(oci_state.pid)
.unwrap()
.stat()
.unwrap()
.starttime;
let status = Status::new(
Path::new(TEST_STATE_ROOT_PATH),
Path::new(TEST_BUNDLE_PATH),
oci_state,
start_time,
created,
cgm,
create_dummy_opts(),
)
.unwrap();
status
}
pub fn create_custom_dummy_status(id: &str, pid: i32, root: &Path, spec: &Spec) -> Status {
let start_time = procfs::process::Process::new(pid)
.unwrap()
.stat()
.unwrap()
.starttime;
Status {
oci_version: spec.version().clone(),
id: id.to_string(),
pid,
root: root.to_path_buf(),
bundle: PathBuf::from(TEST_BUNDLE_PATH),
rootfs: TEST_ROOTFS_PATH.to_string(),
process_start_time: start_time,
created: DateTime::from(SystemTime::now()),
cgroup_manager: serde_json::from_str(TEST_CGM_DATA).unwrap(),
config: CreateOpts {
spec: Some(spec.clone()),
..Default::default()
},
}
}
pub fn create_dummy_cgroup(cpath: &Path) -> cgroups::Cgroup {
cgroups::Cgroup::new(cgroups::hierarchies::auto(), cpath).unwrap()
}
pub fn clean_up_cgroup(cpath: &Path) {
let cgroup = cgroups::Cgroup::load(cgroups::hierarchies::auto(), cpath);
cgroup.delete().unwrap();
}
#[test]
fn test_canonicalize_spec_root() {
let gen_spec = |p: &str| -> Spec {
let mut root = Root::default();
root.set_path(PathBuf::from(p));
root.set_readonly(Some(false));
let mut spec = Spec::default();
spec.set_root(Some(root));
spec
};
let rootfs_name = TEST_ROOTFS_PATH;
let temp_dir = tempdir().unwrap();
let bundle_dir = temp_dir.path();
let abs_root = bundle_dir.join(rootfs_name);
create_dir_all(abs_root.clone()).unwrap();
let mut spec = gen_spec(abs_root.to_str().unwrap());
assert!(canonicalize_spec_root(&mut spec, bundle_dir).is_ok());
assert_eq!(spec.root_mut().clone().unwrap().path(), &abs_root);
let mut spec = gen_spec(rootfs_name);
assert!(canonicalize_spec_root(&mut spec, bundle_dir).is_ok());
assert_eq!(spec.root().clone().unwrap().path(), &abs_root);
}
#[test]
pub fn test_validate_process_spec() {
let mut valid_process = Process::default();
valid_process.set_args(Some(vec!["test".to_string()]));
valid_process.set_cwd(PathBuf::from("/"));
assert!(validate_process_spec(&None).is_err());
assert!(validate_process_spec(&Some(valid_process.clone())).is_ok());
let mut invalid_process = valid_process.clone();
invalid_process.set_args(None);
assert!(validate_process_spec(&Some(invalid_process)).is_err());
let mut invalid_process = valid_process.clone();
invalid_process.set_cwd(PathBuf::from(""));
assert!(validate_process_spec(&Some(invalid_process)).is_err());
let mut invalid_process = valid_process;
invalid_process.set_cwd(PathBuf::from("test/"));
assert!(validate_process_spec(&Some(invalid_process)).is_err());
}
}

View File

@@ -0,0 +1,28 @@
// Copyright 2021-2022 Sony Group Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
use anyhow::Result;
use libcontainer::{container::ContainerAction, init_builder::InitContainerBuilder};
use liboci_cli::Create;
use slog::{info, Logger};
use std::path::Path;
pub async fn run(opts: Create, root: &Path, logger: &Logger) -> Result<()> {
let mut launcher = InitContainerBuilder::default()
.id(opts.container_id)
.bundle(opts.bundle)
.root(root.to_path_buf())
.console_socket(opts.console_socket)
.pid_file(opts.pid_file)
.build()?
.create_launcher(logger)?;
launcher.launch(ContainerAction::Create, logger).await?;
info!(&logger, "create command finished successfully");
Ok(())
}

View File

@@ -0,0 +1,30 @@
// Copyright 2021-2022 Sony Group Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
use anyhow::{anyhow, Result};
use libcontainer::{container::Container, status::Status};
use liboci_cli::Delete;
use slog::{info, Logger};
use std::{fs, path::Path};
pub async fn run(opts: Delete, root: &Path, logger: &Logger) -> Result<()> {
let container_id = &opts.container_id;
let status_dir = Status::get_dir_path(root, container_id);
if !status_dir.exists() {
return Err(anyhow!("container {} does not exist", container_id));
}
let container = if let Ok(value) = Container::load(root, container_id) {
value
} else {
fs::remove_dir_all(status_dir)?;
return Ok(());
};
container.delete(opts.force, logger).await?;
info!(&logger, "delete command finished successfully");
Ok(())
}

View File

@@ -0,0 +1,32 @@
// Copyright 2021-2022 Kata Contributors
//
// SPDX-License-Identifier: Apache-2.0
//
use anyhow::Result;
use libcontainer::activated_builder::ActivatedContainerBuilder;
use libcontainer::container::ContainerAction;
use liboci_cli::Exec;
use slog::{info, Logger};
use std::path::Path;
pub async fn run(opts: Exec, root: &Path, logger: &Logger) -> Result<()> {
let mut launcher = ActivatedContainerBuilder::default()
.id(opts.container_id)
.root(root.to_path_buf())
.console_socket(opts.console_socket)
.pid_file(opts.pid_file)
.tty(opts.tty)
.cwd(opts.cwd)
.env(opts.env)
.no_new_privs(opts.no_new_privs)
.process(opts.process)
.args(opts.command)
.build()?
.create_launcher(logger)?;
launcher.launch(ContainerAction::Run, logger).await?;
info!(&logger, "exec command finished successfully");
Ok(())
}

View File

@@ -0,0 +1,58 @@
// Copyright 2021-2022 Sony Group Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
use anyhow::Result;
use libcontainer::container::Container;
use liboci_cli::Kill;
use nix::sys::signal::Signal;
use slog::{info, Logger};
use std::{convert::TryFrom, path::Path, str::FromStr};
pub fn run(opts: Kill, state_root: &Path, logger: &Logger) -> Result<()> {
let container_id = &opts.container_id;
let container = Container::load(state_root, container_id)?;
let sig = parse_signal(&opts.signal)?;
let all = opts.all;
container.kill(sig, all)?;
info!(&logger, "kill command finished successfully");
Ok(())
}
fn parse_signal(signal: &str) -> Result<Signal> {
if let Ok(num) = signal.parse::<i32>() {
return Ok(Signal::try_from(num)?);
}
let mut signal_upper = signal.to_uppercase();
if !signal_upper.starts_with("SIG") {
signal_upper = "SIG".to_string() + &signal_upper;
}
Ok(Signal::from_str(&signal_upper)?)
}
#[cfg(test)]
mod tests {
use super::*;
use nix::sys::signal::Signal;
#[test]
fn test_parse_signal() {
assert_eq!(Signal::SIGHUP, parse_signal("1").unwrap());
assert_eq!(Signal::SIGHUP, parse_signal("sighup").unwrap());
assert_eq!(Signal::SIGHUP, parse_signal("hup").unwrap());
assert_eq!(Signal::SIGHUP, parse_signal("SIGHUP").unwrap());
assert_eq!(Signal::SIGHUP, parse_signal("HUP").unwrap());
assert_eq!(Signal::SIGKILL, parse_signal("9").unwrap());
assert_eq!(Signal::SIGKILL, parse_signal("sigkill").unwrap());
assert_eq!(Signal::SIGKILL, parse_signal("kill").unwrap());
assert_eq!(Signal::SIGKILL, parse_signal("SIGKILL").unwrap());
assert_eq!(Signal::SIGKILL, parse_signal("KILL").unwrap());
}
}

View File

@@ -0,0 +1,68 @@
// Copyright 2021-2022 Kata Contributors
//
// SPDX-License-Identifier: Apache-2.0
//
use super::state::get_container_state_name;
use anyhow::Result;
use libcontainer::container::Container;
use liboci_cli::List;
use runtime_spec::ContainerState;
use slog::{info, Logger};
use std::fmt::Write as _;
use std::{fs, os::unix::prelude::MetadataExt, path::Path};
use std::{io, io::Write};
use tabwriter::TabWriter;
use uzers::get_user_by_uid;
pub fn run(_: List, root: &Path, logger: &Logger) -> Result<()> {
let mut content = String::new();
for entry in fs::read_dir(root)? {
let entry = entry?;
// Possibly race with other command of runk, so continue loop when any error occurs below
let metadata = match entry.metadata() {
Ok(metadata) => metadata,
Err(_) => continue,
};
if !metadata.is_dir() {
continue;
}
let container_id = match entry.file_name().into_string() {
Ok(id) => id,
Err(_) => continue,
};
let container = match Container::load(root, &container_id) {
Ok(container) => container,
Err(_) => continue,
};
let state = container.state;
// Just like runc, pid of stopped container is 0
let pid = match state {
ContainerState::Stopped => 0,
_ => container.status.pid,
};
// May replace get_user_by_uid with getpwuid(3)
let owner = match get_user_by_uid(metadata.uid()) {
Some(user) => String::from(user.name().to_string_lossy()),
None => format!("#{}", metadata.uid()),
};
let _ = writeln!(
content,
"{}\t{}\t{}\t{}\t{}\t{}",
container_id,
pid,
get_container_state_name(state),
container.status.bundle.display(),
container.status.created,
owner
);
}
let mut tab_writer = TabWriter::new(io::stdout());
writeln!(&mut tab_writer, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\tOWNER")?;
write!(&mut tab_writer, "{}", content)?;
tab_writer.flush()?;
info!(&logger, "list command finished successfully");
Ok(())
}

View File

@@ -0,0 +1,17 @@
// Copyright 2021-2022 Sony Group Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
pub mod create;
pub mod delete;
pub mod exec;
pub mod kill;
pub mod list;
pub mod pause;
pub mod ps;
pub mod resume;
pub mod run;
pub mod spec;
pub mod start;
pub mod state;

View File

@@ -0,0 +1,18 @@
// Copyright 2021-2022 Kata Contributors
//
// SPDX-License-Identifier: Apache-2.0
//
use anyhow::Result;
use libcontainer::container::Container;
use liboci_cli::Pause;
use slog::{info, Logger};
use std::path::Path;
pub fn run(opts: Pause, root: &Path, logger: &Logger) -> Result<()> {
let container = Container::load(root, &opts.container_id)?;
container.pause()?;
info!(&logger, "pause command finished successfully");
Ok(())
}

View File

@@ -0,0 +1,63 @@
// Copyright 2021-2022 Kata Contributors
//
// SPDX-License-Identifier: Apache-2.0
//
use anyhow::anyhow;
use anyhow::Result;
use libcontainer::container::Container;
use liboci_cli::Ps;
use slog::{info, Logger};
use std::path::Path;
use std::process::Command;
use std::str;
pub fn run(opts: Ps, root: &Path, logger: &Logger) -> Result<()> {
let container = Container::load(root, opts.container_id.as_str())?;
let pids = container
.processes()?
.iter()
.map(|pid| pid.as_raw())
.collect::<Vec<_>>();
match opts.format.as_str() {
"json" => println!("{}", serde_json::to_string(&pids)?),
"table" => {
let ps_options = if opts.ps_options.is_empty() {
vec!["-ef".to_string()]
} else {
opts.ps_options
};
let output = Command::new("ps").args(ps_options).output()?;
if !output.status.success() {
return Err(anyhow!("{}", std::str::from_utf8(&output.stderr)?));
}
let lines = str::from_utf8(&output.stdout)?.lines().collect::<Vec<_>>();
if lines.is_empty() {
return Err(anyhow!("no processes found"));
}
let pid_index = lines[0]
.split_whitespace()
.position(|field| field == "PID")
.ok_or_else(|| anyhow!("could't find PID field in ps output"))?;
println!("{}", lines[0]);
for &line in &lines[1..] {
if line.is_empty() {
continue;
}
let fields = line.split_whitespace().collect::<Vec<_>>();
if pid_index >= fields.len() {
continue;
}
let pid: i32 = fields[pid_index].parse()?;
if pids.contains(&pid) {
println!("{}", line);
}
}
}
_ => return Err(anyhow!("unknown format: {}", opts.format)),
}
info!(&logger, "ps command finished successfully");
Ok(())
}

View File

@@ -0,0 +1,18 @@
// Copyright 2021-2022 Kata Contributors
//
// SPDX-License-Identifier: Apache-2.0
//
use anyhow::Result;
use libcontainer::container::Container;
use liboci_cli::Resume;
use slog::{info, Logger};
use std::path::Path;
pub fn run(opts: Resume, root: &Path, logger: &Logger) -> Result<()> {
let container = Container::load(root, &opts.container_id)?;
container.resume()?;
info!(&logger, "pause command finished successfully");
Ok(())
}

View File

@@ -0,0 +1,27 @@
// Copyright 2021-2022 Sony Group Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
use anyhow::Result;
use libcontainer::{container::ContainerAction, init_builder::InitContainerBuilder};
use liboci_cli::Run;
use slog::{info, Logger};
use std::path::Path;
pub async fn run(opts: Run, root: &Path, logger: &Logger) -> Result<()> {
let mut launcher = InitContainerBuilder::default()
.id(opts.container_id)
.bundle(opts.bundle)
.root(root.to_path_buf())
.console_socket(opts.console_socket)
.pid_file(opts.pid_file)
.build()?
.create_launcher(logger)?;
launcher.launch(ContainerAction::Run, logger).await?;
info!(&logger, "run command finished successfully");
Ok(())
}

View File

@@ -0,0 +1,207 @@
// Copyright 2021-2022 Sony Group Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
//use crate::container::get_config_path;
use anyhow::Result;
use libcontainer::container::CONFIG_FILE_NAME;
use liboci_cli::Spec;
use slog::{info, Logger};
use std::{fs::File, io::Write, path::Path};
pub const DEFAULT_SPEC: &str = r#"{
"ociVersion": "1.0.2-dev",
"process": {
"terminal": true,
"user": {
"uid": 0,
"gid": 0
},
"args": [
"sh"
],
"env": [
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"TERM=xterm"
],
"cwd": "/",
"capabilities": {
"bounding": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
],
"effective": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
],
"inheritable": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
],
"permitted": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
],
"ambient": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
]
},
"rlimits": [
{
"type": "RLIMIT_NOFILE",
"hard": 1024,
"soft": 1024
}
],
"noNewPrivileges": true
},
"root": {
"path": "rootfs",
"readonly": true
},
"hostname": "runk",
"mounts": [
{
"destination": "/proc",
"type": "proc",
"source": "proc"
},
{
"destination": "/dev",
"type": "tmpfs",
"source": "tmpfs",
"options": [
"nosuid",
"strictatime",
"mode=755",
"size=65536k"
]
},
{
"destination": "/dev/pts",
"type": "devpts",
"source": "devpts",
"options": [
"nosuid",
"noexec",
"newinstance",
"ptmxmode=0666",
"mode=0620",
"gid=5"
]
},
{
"destination": "/dev/shm",
"type": "tmpfs",
"source": "shm",
"options": [
"nosuid",
"noexec",
"nodev",
"mode=1777",
"size=65536k"
]
},
{
"destination": "/dev/mqueue",
"type": "mqueue",
"source": "mqueue",
"options": [
"nosuid",
"noexec",
"nodev"
]
},
{
"destination": "/sys",
"type": "sysfs",
"source": "sysfs",
"options": [
"nosuid",
"noexec",
"nodev",
"ro"
]
},
{
"destination": "/sys/fs/cgroup",
"type": "cgroup",
"source": "cgroup",
"options": [
"nosuid",
"noexec",
"nodev",
"relatime",
"ro"
]
}
],
"linux": {
"resources": {
"devices": [
{
"allow": false,
"access": "rwm"
}
]
},
"namespaces": [
{
"type": "pid"
},
{
"type": "network"
},
{
"type": "ipc"
},
{
"type": "uts"
},
{
"type": "mount"
}
],
"maskedPaths": [
"/proc/acpi",
"/proc/asound",
"/proc/kcore",
"/proc/keys",
"/proc/latency_stats",
"/proc/timer_list",
"/proc/timer_stats",
"/proc/sched_debug",
"/sys/firmware",
"/proc/scsi"
],
"readonlyPaths": [
"/proc/bus",
"/proc/fs",
"/proc/irq",
"/proc/sys",
"/proc/sysrq-trigger"
]
}
}"#;
pub fn run(_opts: Spec, logger: &Logger) -> Result<()> {
// TODO: liboci-cli does not support --bundle option for spec command.
// After liboci-cli supports the option, we will change the following code.
// let config_path = get_config_path(&opts.bundle);
let config_path = Path::new(".").join(CONFIG_FILE_NAME);
let config_data = DEFAULT_SPEC;
let mut file = File::create(config_path)?;
file.write_all(config_data.as_bytes())?;
info!(&logger, "spec command finished successfully");
Ok(())
}

View File

@@ -0,0 +1,24 @@
// Copyright 2021-2022 Sony Group Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
use anyhow::Result;
use libcontainer::{container::ContainerAction, created_builder::CreatedContainerBuilder};
use liboci_cli::Start;
use slog::{info, Logger};
use std::path::Path;
pub async fn run(opts: Start, root: &Path, logger: &Logger) -> Result<()> {
let mut launcher = CreatedContainerBuilder::default()
.id(opts.container_id)
.root(root.to_path_buf())
.build()?
.create_launcher(logger)?;
launcher.launch(ContainerAction::Start, logger).await?;
info!(&logger, "start command finished successfully");
Ok(())
}

View File

@@ -0,0 +1,78 @@
// Copyright 2021-2022 Sony Group Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
use anyhow::Result;
use chrono::{DateTime, Utc};
use libcontainer::{container::Container, status::Status};
use liboci_cli::State;
use runtime_spec::ContainerState;
use serde::{Deserialize, Serialize};
use slog::{info, Logger};
use std::path::{Path, PathBuf};
#[derive(Serialize, Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct RuntimeState {
pub oci_version: String,
pub id: String,
pub pid: i32,
pub status: String,
pub bundle: PathBuf,
pub created: DateTime<Utc>,
}
impl RuntimeState {
pub fn new(status: Status, state: ContainerState) -> Self {
Self {
oci_version: status.oci_version,
id: status.id,
pid: status.pid,
status: get_container_state_name(state),
bundle: status.bundle,
created: status.created,
}
}
}
pub fn run(opts: State, state_root: &Path, logger: &Logger) -> Result<()> {
let container = Container::load(state_root, &opts.container_id)?;
let oci_state = RuntimeState::new(container.status, container.state);
let json_state = &serde_json::to_string_pretty(&oci_state)?;
println!("{}", json_state);
info!(&logger, "state command finished successfully");
Ok(())
}
pub fn get_container_state_name(state: ContainerState) -> String {
match state {
ContainerState::Creating => "creating",
ContainerState::Created => "created",
ContainerState::Running => "running",
ContainerState::Stopped => "stopped",
ContainerState::Paused => "paused",
}
.into()
}
#[cfg(test)]
mod tests {
use super::*;
use runtime_spec::ContainerState;
#[test]
fn test_get_container_state_name() {
assert_eq!(
"creating",
get_container_state_name(ContainerState::Creating)
);
assert_eq!("created", get_container_state_name(ContainerState::Created));
assert_eq!("running", get_container_state_name(ContainerState::Running));
assert_eq!("stopped", get_container_state_name(ContainerState::Stopped));
assert_eq!("paused", get_container_state_name(ContainerState::Paused));
}
}

133
src/tools/runk/src/main.rs Normal file
View File

@@ -0,0 +1,133 @@
// Copyright 2021-2022 Sony Group Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
use anyhow::{anyhow, Result};
use clap::{crate_description, crate_name, Parser};
use liboci_cli::{CommonCmd, GlobalOpts};
use liboci_cli::{Create, Delete, Kill, Start, State};
use slog::{o, Logger};
use slog_async::AsyncGuard;
use std::{
fs::OpenOptions,
path::{Path, PathBuf},
process::exit,
};
const DEFAULT_ROOT_DIR: &str = "/run/runk";
const DEFAULT_LOG_LEVEL: slog::Level = slog::Level::Info;
mod commands;
#[derive(Parser, Debug)]
enum SubCommand {
#[clap(flatten)]
Standard(StandardCmd),
#[clap(flatten)]
Common(CommonCmd),
/// Launch an init process (do not call it outside of runk)
Init {},
}
// Copy from https://github.com/containers/youki/blob/v0.0.3/crates/liboci-cli/src/lib.rs#L38-L44
#[derive(Parser, Debug)]
pub enum StandardCmd {
Create(Create),
Start(Start),
State(State),
Delete(Delete),
Kill(Kill),
}
#[derive(Parser, Debug)]
#[clap(version, author, about = crate_description!())]
struct Cli {
#[clap(flatten)]
global: GlobalOpts,
#[clap(subcommand)]
subcmd: SubCommand,
}
async fn cmd_run(subcmd: SubCommand, root_path: &Path, logger: &Logger) -> Result<()> {
match subcmd {
SubCommand::Standard(cmd) => match cmd {
StandardCmd::Create(create) => commands::create::run(create, root_path, logger).await,
StandardCmd::Start(start) => commands::start::run(start, root_path, logger).await,
StandardCmd::Delete(delete) => commands::delete::run(delete, root_path, logger).await,
StandardCmd::State(state) => commands::state::run(state, root_path, logger),
StandardCmd::Kill(kill) => commands::kill::run(kill, root_path, logger),
},
SubCommand::Common(cmd) => match cmd {
CommonCmd::Run(run) => commands::run::run(run, root_path, logger).await,
CommonCmd::Spec(spec) => commands::spec::run(spec, logger),
CommonCmd::List(list) => commands::list::run(list, root_path, logger),
CommonCmd::Exec(exec) => commands::exec::run(exec, root_path, logger).await,
CommonCmd::Ps(ps) => commands::ps::run(ps, root_path, logger),
CommonCmd::Pause(pause) => commands::pause::run(pause, root_path, logger),
CommonCmd::Resume(resume) => commands::resume::run(resume, root_path, logger),
_ => Err(anyhow!("command is not implemented yet")),
},
_ => unreachable!(),
}
}
fn setup_logger(
log_file: Option<PathBuf>,
log_level: slog::Level,
) -> Result<(Logger, Option<AsyncGuard>)> {
if let Some(ref file) = log_file {
let log_writer = OpenOptions::new()
.write(true)
.read(true)
.create(true)
.truncate(true)
.open(file)?;
// TODO: Support 'text' log format.
let (logger_local, logger_async_guard_local) =
logging::create_logger(crate_name!(), crate_name!(), log_level, log_writer);
Ok((logger_local, Some(logger_async_guard_local)))
} else {
let logger = slog::Logger::root(slog::Discard, o!());
Ok((logger, None))
}
}
async fn real_main() -> Result<()> {
let cli = Cli::parse();
if let SubCommand::Init {} = cli.subcmd {
rustjail::container::init_child();
exit(0);
}
let root_path = if let Some(path) = cli.global.root {
path
} else {
PathBuf::from(DEFAULT_ROOT_DIR)
};
let log_level = if cli.global.debug {
slog::Level::Debug
} else {
DEFAULT_LOG_LEVEL
};
let (logger, _async_guard) = setup_logger(cli.global.log, log_level)?;
cmd_run(cli.subcmd, &root_path, &logger).await?;
Ok(())
}
#[tokio::main]
async fn main() {
if let Err(e) = real_main().await {
eprintln!("ERROR: {}", e);
exit(1);
}
exit(0);
}

View File

@@ -14,6 +14,7 @@ and with different container managers.
- [Docker](https://github.com/kata-containers/kata-containers/tree/main/tests/integration/docker)
- [`Nerdctl`](https://github.com/kata-containers/kata-containers/tree/main/tests/integration/nerdctl)
- [`Nydus`](https://github.com/kata-containers/kata-containers/tree/main/tests/integration/nydus)
- [`Runk`](https://github.com/kata-containers/kata-containers/tree/main/tests/integration/runk)
2. [Stability tests](https://github.com/kata-containers/kata-containers/tree/main/tests/stability)
3. [Metrics](https://github.com/kata-containers/kata-containers/tree/main/tests/metrics)
4. [Functional](https://github.com/kata-containers/kata-containers/tree/main/tests/functional)

View File

@@ -1022,113 +1022,3 @@ function version_greater_than_equal() {
return 1
fi
}
# Run bats tests with proper reporting
#
# This function provides consistent test execution and reporting across
# all test suites (k8s, nvidia, kata-deploy, etc.)
#
# Parameters:
# $1 - Test directory (where tests are located and reports will be saved)
# $2 - Array name containing test files (passed by reference)
#
# Environment variables:
# BATS_TEST_FAIL_FAST - Set to "yes" to stop at first failure (default: "no")
#
# Example usage:
# tests=("test1.bats" "test2.bats")
# run_bats_tests "/path/to/tests" tests
#
function run_bats_tests() {
local test_dir="$1"
local -n test_array=$2
local fail_fast="${BATS_TEST_FAIL_FAST:-no}"
local report_dir="${test_dir}/reports/$(date +'%F-%T')"
mkdir -p "${report_dir}"
info "Running tests with bats version: $(bats --version). Save outputs to ${report_dir}"
local tests_fail=()
for test_entry in "${test_array[@]}"; do
test_entry=$(echo "${test_entry}" | tr -d '[:space:][:cntrl:]')
[ -z "${test_entry}" ] && continue
info "Executing ${test_entry}"
# Output file will be prefixed with "ok" or "not_ok" based on the result
local out_file="${report_dir}/${test_entry}.out"
pushd "${test_dir}" > /dev/null
if ! bats --timing --show-output-of-passing-tests "${test_entry}" | tee "${out_file}"; then
tests_fail+=("${test_entry}")
mv "${out_file}" "$(dirname "${out_file}")/not_ok-$(basename "${out_file}")"
[[ "${fail_fast}" == "yes" ]] && break
else
mv "${out_file}" "$(dirname "${out_file}")/ok-$(basename "${out_file}")"
fi
popd > /dev/null
done
if [[ ${#tests_fail[@]} -ne 0 ]]; then
die "Tests FAILED from suites: ${tests_fail[*]}"
fi
info "All tests SUCCEEDED"
}
# Report bats test results from the reports directory
#
# This function displays a summary of test results and outputs from
# the reports directory created by run_bats_tests().
#
# Parameters:
# $1 - Test directory (where reports subdirectory is located)
#
# Example usage:
# report_bats_tests "/path/to/tests"
#
function report_bats_tests() {
local test_dir="$1"
local reports_dir="${test_dir}/reports"
if [[ ! -d "${reports_dir}" ]]; then
warn "No reports directory found: ${reports_dir}"
return 1
fi
for report_dir in "${reports_dir}"/*; do
[[ ! -d "${report_dir}" ]] && continue
local ok=()
local not_ok=()
mapfile -t ok < <(find "${report_dir}" -name "ok-*.out" 2>/dev/null)
mapfile -t not_ok < <(find "${report_dir}" -name "not_ok-*.out" 2>/dev/null)
cat <<-EOF
SUMMARY ($(basename "${report_dir}")):
Pass: ${#ok[*]}
Fail: ${#not_ok[*]}
EOF
echo -e "\nSTATUSES:"
for out in "${not_ok[@]}" "${ok[@]}"; do
[[ -z "${out}" ]] && continue
local status
local bats
status=$(basename "${out}" | cut -d '-' -f1)
bats=$(basename "${out}" | cut -d '-' -f2- | sed 's/.out$//')
echo " ${status} ${bats}"
done
echo -e "\nOUTPUTS:"
for out in "${not_ok[@]}" "${ok[@]}"; do
[[ -z "${out}" ]] && continue
local bats
bats=$(basename "${out}" | cut -d '-' -f2- | sed 's/.out$//')
echo "::group::${bats}"
cat "${out}"
echo "::endgroup::"
done
done
}

View File

@@ -20,10 +20,6 @@ function run_tests() {
popd
}
function report_tests() {
report_bats_tests "${kata_deploy_dir}"
}
function cleanup_runtimeclasses() {
# Cleanup any runtime class that was left behind in the cluster, in
# case of a test failure, apart from the default one that comes from
@@ -63,7 +59,6 @@ function main() {
install-kubectl) install_kubectl ;;
get-cluster-credentials) get_cluster_credentials "kata-deploy" ;;
run-tests) run_tests ;;
report-tests) report_tests ;;
delete-cluster) cleanup "aks" "kata-deploy" ;;
*) >&2 echo "Invalid argument"; exit 2 ;;
esac

View File

@@ -4,35 +4,15 @@
#
# SPDX-License-Identifier: Apache-2.0
#
# Kata Deploy Functional Tests
#
# This test validates that kata-deploy successfully installs and configures
# Kata Containers on a Kubernetes cluster using Helm.
#
# Required environment variables:
# DOCKER_REGISTRY - Container registry for kata-deploy image
# DOCKER_REPO - Repository name for kata-deploy image
# DOCKER_TAG - Image tag to test
# KATA_HYPERVISOR - Hypervisor to test (qemu, clh, etc.)
# KUBERNETES - K8s distribution (microk8s, k3s, rke2, etc.)
#
# Optional timeout configuration (increase for slow networks or large images):
# KATA_DEPLOY_TIMEOUT - Overall helm timeout (default: 30m)
# KATA_DEPLOY_DAEMONSET_TIMEOUT - DaemonSet rollout timeout in seconds (default: 1200 = 20m)
# Includes time to pull kata-deploy image
# KATA_DEPLOY_VERIFICATION_TIMEOUT - Verification pod timeout in seconds (default: 180 = 3m)
# Time for verification pod to run
#
# Example with custom timeouts for slow network:
# KATA_DEPLOY_DAEMONSET_TIMEOUT=3600 bats kata-deploy.bats
#
load "${BATS_TEST_DIRNAME}/../../common.bash"
repo_root_dir="${BATS_TEST_DIRNAME}/../../../"
load "${repo_root_dir}/tests/gha-run-k8s-common.sh"
setup() {
ensure_helm
ensure_yq
pushd "${repo_root_dir}"
# We expect 2 runtime classes because:
# * `kata` is the default runtimeclass created by Helm, basically an alias for `kata-${KATA_HYPERVISOR}`.
@@ -46,113 +26,50 @@ setup() {
"kata\s+kata-${KATA_HYPERVISOR}" \
"kata-${KATA_HYPERVISOR}\s+kata-${KATA_HYPERVISOR}" \
)
}
@test "Test runtimeclasses are being properly created and container runtime is not broken" {
pushd "${repo_root_dir}"
# Create verification pod spec
local verification_yaml
verification_yaml=$(mktemp)
cat > "${verification_yaml}" << EOF
apiVersion: v1
kind: Pod
metadata:
name: kata-deploy-verify
spec:
runtimeClassName: kata-${KATA_HYPERVISOR}
restartPolicy: Never
nodeSelector:
katacontainers.io/kata-runtime: "true"
containers:
- name: verify
image: quay.io/kata-containers/alpine-bash-curl:latest
imagePullPolicy: Always
command:
- sh
- -c
- |
echo "=== Kata Verification ==="
echo "Kernel: \$(uname -r)"
echo "SUCCESS: Pod running with Kata runtime"
EOF
# Install kata-deploy via Helm
echo "Installing kata-deploy with Helm..."
local helm_chart_dir="tools/packaging/kata-deploy/helm-chart/kata-deploy"
# Timeouts can be customized via environment variables:
# - KATA_DEPLOY_TIMEOUT: Overall helm timeout (includes all hooks)
# Default: 600s (10 minutes)
# - KATA_DEPLOY_DAEMONSET_TIMEOUT: Time to wait for kata-deploy DaemonSet rollout (image pull + pod start)
# Default: 300s (5 minutes) - accounts for large image downloads
# - KATA_DEPLOY_VERIFICATION_TIMEOUT: Time to wait for verification pod to complete
# Default: 120s (2 minutes) - verification pod execution time
local helm_timeout="${KATA_DEPLOY_TIMEOUT:-600s}"
local daemonset_timeout="${KATA_DEPLOY_DAEMONSET_TIMEOUT:-300}"
local verification_timeout="${KATA_DEPLOY_VERIFICATION_TIMEOUT:-120}"
echo "Timeout configuration:"
echo " Helm overall: ${helm_timeout}"
echo " DaemonSet rollout: ${daemonset_timeout}s (includes image pull)"
echo " Verification pod: ${verification_timeout}s (pod execution)"
helm dependency build "${helm_chart_dir}"
# Disable all shims except the one being tested
helm upgrade --install kata-deploy "${helm_chart_dir}" \
--set image.reference="${DOCKER_REGISTRY}/${DOCKER_REPO}" \
--set image.tag="${DOCKER_TAG}" \
--set debug=true \
--set k8sDistribution="${KUBERNETES}" \
--set shims.clh.enabled=false \
--set shims.cloud-hypervisor.enabled=false \
--set shims.dragonball.enabled=false \
--set shims.fc.enabled=false \
--set shims.qemu.enabled=false \
--set shims.qemu-runtime-rs.enabled=false \
--set shims.qemu-cca.enabled=false \
--set shims.qemu-se.enabled=false \
--set shims.qemu-se-runtime-rs.enabled=false \
--set shims.qemu-nvidia-gpu.enabled=false \
--set shims.qemu-nvidia-gpu-snp.enabled=false \
--set shims.qemu-nvidia-gpu-tdx.enabled=false \
--set shims.qemu-sev.enabled=false \
--set shims.qemu-snp.enabled=false \
--set shims.qemu-snp-runtime-rs.enabled=false \
--set shims.qemu-tdx.enabled=false \
--set shims.qemu-tdx-runtime-rs.enabled=false \
--set shims.qemu-coco-dev.enabled=false \
--set shims.qemu-coco-dev-runtime-rs.enabled=false \
--set "shims.${KATA_HYPERVISOR}.enabled=true" \
--set "defaultShim.amd64=${KATA_HYPERVISOR}" \
--set "defaultShim.arm64=${KATA_HYPERVISOR}" \
--set runtimeClasses.enabled=true \
--set runtimeClasses.createDefault=true \
--set-file verification.pod="${verification_yaml}" \
--set verification.timeout="${verification_timeout}" \
--set verification.daemonsetTimeout="${daemonset_timeout}" \
--namespace kube-system \
--wait --timeout "${helm_timeout}"
rm -f "${verification_yaml}"
echo ""
# Set the latest image, the one generated as part of the PR, to be used as part of the tests
export HELM_IMAGE_REFERENCE="${DOCKER_REGISTRY}/${DOCKER_REPO}"
export HELM_IMAGE_TAG="${DOCKER_TAG}"
# Enable debug for Kata Containers
export HELM_DEBUG="true"
# Create the runtime class only for the shim that's being tested
export HELM_SHIMS="${KATA_HYPERVISOR}"
# Set the tested hypervisor as the default `kata` shim
export HELM_DEFAULT_SHIM="${KATA_HYPERVISOR}"
# Let the Helm chart create the default `kata` runtime class
export HELM_CREATE_DEFAULT_RUNTIME_CLASS="true"
HOST_OS=""
if [[ "${KATA_HOST_OS}" = "cbl-mariner" ]]; then
HOST_OS="${KATA_HOST_OS}"
fi
export HELM_HOST_OS="${HOST_OS}"
export HELM_K8S_DISTRIBUTION="${KUBERNETES}"
# Enable deployment verification (verifies Kata Containers
# VM kernel isolation by comparing node vs pod kernel)
export HELM_VERIFY_DEPLOYMENT="true"
helm_helper
echo "::group::kata-deploy logs"
kubectl -n kube-system logs --tail=200 -l name=kata-deploy
kubectl -n kube-system logs --tail=100 -l name=kata-deploy
echo "::endgroup::"
echo ""
echo "::group::Runtime classes"
kubectl get runtimeclass
echo "::endgroup::"
# helm --wait already waits for post-install hooks to complete
# If helm returns successfully, the verification job passed
# The job is deleted after success (hook-delete-policy: hook-succeeded)
echo ""
echo "Helm install completed successfully - verification passed"
popd
}
@test "Test runtimeclasses are being properly created and container runtime is not broken" {
# We filter `kata-mshv-vm-isolation` out as that's present on AKS clusters, but that's not coming from kata-deploy
current_runtime_classes=$(kubectl get runtimeclasses | grep -v "kata-mshv-vm-isolation" | grep "kata" | wc -l)
[[ ${current_runtime_classes} -eq ${expected_runtime_classes} ]]
@@ -174,8 +91,6 @@ EOF
# Check that the container runtime verison doesn't have unknown, which happens when containerd can't start properly
container_runtime_version=$(kubectl get nodes --no-headers -o custom-columns=CONTAINER_RUNTIME:.status.nodeInfo.containerRuntimeVersion)
[[ ${container_runtime_version} != *"containerd://Unknown"* ]]
popd
}
teardown() {

View File

@@ -6,14 +6,10 @@
#
set -e
set -o pipefail
kata_deploy_dir=$(dirname "$(readlink -f "$0")")
source "${kata_deploy_dir}/../../common.bash"
# Setting to "yes" enables fail fast, stopping execution at the first failed test.
export BATS_TEST_FAIL_FAST="${BATS_TEST_FAIL_FAST:-no}"
if [[ -n "${KATA_DEPLOY_TEST_UNION:-}" ]]; then
KATA_DEPLOY_TEST_UNION=("${KATA_DEPLOY_TEST_UNION}")
else
@@ -22,4 +18,8 @@ else
)
fi
run_bats_tests "${kata_deploy_dir}" KATA_DEPLOY_TEST_UNION
info "Run tests"
for KATA_DEPLOY_TEST_ENTRY in "${KATA_DEPLOY_TEST_UNION[@]}"
do
bats --show-output-of-passing-tests "${KATA_DEPLOY_TEST_ENTRY}"
done

View File

@@ -600,7 +600,7 @@ function helm_helper() {
yq -i ".shims.${shim}.enabled = true" "${values_yaml}"
yq -i ".shims.${shim}.supportedArches = [\"arm64\"]" "${values_yaml}"
;;
qemu-snp|qemu-snp-runtime-rs|qemu-tdx|qemu-tdx-runtime-rs|qemu-nvidia-gpu-snp|qemu-nvidia-gpu-tdx)
qemu-snp|qemu-tdx|qemu-nvidia-gpu-snp|qemu-nvidia-gpu-tdx)
yq -i ".shims.${shim}.enabled = true" "${values_yaml}"
yq -i ".shims.${shim}.supportedArches = [\"amd64\"]" "${values_yaml}"
;;
@@ -876,7 +876,7 @@ VERIFICATION_POD_EOF
max_tries=3
interval=10
i=0
i=10
# Retry loop for helm install to prevent transient failures due to instantly unreachable cluster
set +e # Disable immediate exit on failure
@@ -890,16 +890,15 @@ VERIFICATION_POD_EOF
fi
i=$((i+1))
if [[ ${i} -lt ${max_tries} ]]; then
echo "Retrying after ${interval} seconds (Attempt ${i} of ${max_tries})"
echo "Retrying after ${interval} seconds (Attempt ${i} of $((max_tries - 1)))"
else
break
fi
sleep "${interval}"
done
set -e # Re-enable immediate exit on failure
if [[ ${i} -ge ${max_tries} ]]; then
echo "ERROR: Failed to deploy kata-deploy after ${max_tries} tries"
return 1
if [[ ${i} -eq ${max_tries} ]]; then
die "Failed to deploy kata-deploy after ${max_tries} tries"
fi
# `helm install --wait` does not take effect on single replicas and maxUnavailable=1 DaemonSets

View File

@@ -326,7 +326,41 @@ function run_tests() {
# directory.
#
function report_tests() {
report_bats_tests "${kubernetes_dir}"
local reports_dir="${kubernetes_dir}/reports"
local ok
local not_ok
local status
if [[ ! -d "${reports_dir}" ]]; then
info "no reports directory found: ${reports_dir}"
return
fi
for report_dir in "${reports_dir}"/*; do
mapfile -t ok < <(find "${report_dir}" -name "ok-*.out")
mapfile -t not_ok < <(find "${report_dir}" -name "not_ok-*.out")
cat <<-EOF
SUMMARY ($(basename "${report_dir}")):
Pass: ${#ok[*]}
Fail: ${#not_ok[*]}
EOF
echo -e "\nSTATUSES:"
for out in "${not_ok[@]}" "${ok[@]}"; do
status=$(basename "${out}" | cut -d '-' -f1)
bats=$(basename "${out}" | cut -d '-' -f2- | sed 's/.out$//')
echo " ${status} ${bats}"
done
echo -e "\nOUTPUTS:"
for out in "${not_ok[@]}" "${ok[@]}"; do
bats=$(basename "${out}" | cut -d '-' -f2- | sed 's/.out$//')
echo "::group::${bats}"
cat "${out}"
echo "::endgroup::"
done
done
}
function collect_artifacts() {

View File

@@ -6,7 +6,6 @@
#
set -e
set -o pipefail
kubernetes_dir=$(dirname "$(readlink -f "$0")")
# shellcheck disable=SC1091 # import based on variable
@@ -54,6 +53,20 @@ if [[ "${ENABLE_NVRC_TRACE:-true}" == "true" ]]; then
enable_nvrc_trace
fi
# Use common bats test runner with proper reporting
export BATS_TEST_FAIL_FAST="${K8S_TEST_FAIL_FAST}"
run_bats_tests "${kubernetes_dir}" K8S_TEST_NV
info "Running tests with bats version: $(bats --version)"
tests_fail=()
for K8S_TEST_ENTRY in "${K8S_TEST_NV[@]}"
do
K8S_TEST_ENTRY=$(echo "${K8S_TEST_ENTRY}" | tr -d '[:space:][:cntrl:]')
info "$(kubectl get pods --all-namespaces 2>&1)"
info "Executing ${K8S_TEST_ENTRY}"
if ! bats --show-output-of-passing-tests "${K8S_TEST_ENTRY}"; then
tests_fail+=("${K8S_TEST_ENTRY}")
[[ "${K8S_TEST_FAIL_FAST}" = "yes" ]] && break
fi
done
[[ ${#tests_fail[@]} -ne 0 ]] && die "Tests FAILED from suites: ${tests_fail[*]}"
info "All tests SUCCEEDED"

View File

@@ -135,6 +135,28 @@ fi
ensure_yq
# Use common bats test runner with proper reporting
export BATS_TEST_FAIL_FAST="${K8S_TEST_FAIL_FAST}"
run_bats_tests "${kubernetes_dir}" K8S_TEST_UNION
report_dir="${kubernetes_dir}/reports/$(date +'%F-%T')"
mkdir -p "${report_dir}"
info "Running tests with bats version: $(bats --version). Save outputs to ${report_dir}"
tests_fail=()
for K8S_TEST_ENTRY in "${K8S_TEST_UNION[@]}"
do
K8S_TEST_ENTRY=$(echo "$K8S_TEST_ENTRY" | tr -d '[:space:][:cntrl:]')
time info "$(kubectl get pods --all-namespaces 2>&1)"
info "Executing ${K8S_TEST_ENTRY}"
# Output file will be prefixed with "ok" or "not_ok" based on the result
out_file="${report_dir}/${K8S_TEST_ENTRY}.out"
if ! bats --timing --show-output-of-passing-tests "${K8S_TEST_ENTRY}" | tee "${out_file}"; then
tests_fail+=("${K8S_TEST_ENTRY}")
mv "${out_file}" "$(dirname "${out_file}")/not_ok-$(basename "${out_file}")"
[ "${K8S_TEST_FAIL_FAST}" = "yes" ] && break
else
mv "${out_file}" "$(dirname "${out_file}")/ok-$(basename "${out_file}")"
fi
done
[ ${#tests_fail[@]} -ne 0 ] && die "Tests FAILED from suites: ${tests_fail[*]}"
info "All tests SUCCEEDED"

View File

@@ -0,0 +1,65 @@
#!/bin/bash
#
# Copyright (c) 2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
set -o errexit
set -o nounset
set -o pipefail
kata_tarball_dir="${2:-kata-artifacts}"
runk_dir="$(dirname "$(readlink -f "$0")")"
source "${runk_dir}/../../common.bash"
source "${runk_dir}/../../gha-run-k8s-common.sh"
function install_dependencies() {
info "Installing the dependencies needed for running the runk tests"
# Dependency list of projects that we can rely on the system packages
# - jq
declare -a system_deps=(
jq
)
sudo apt-get update
sudo apt-get -y install "${system_deps[@]}"
ensure_yq
# Dependency list of projects that we can install them
# directly from their releases on GitHub:
# - containerd
# - cri-container-cni release tarball already includes CNI plugins
declare -a github_deps
github_deps[0]="cri_containerd:$(get_from_kata_deps ".externals.containerd.${CONTAINERD_VERSION}")"
github_deps[1]="runc:$(get_from_kata_deps ".externals.runc.latest")"
github_deps[2]="cni_plugins:$(get_from_kata_deps ".externals.cni-plugins.version")"
for github_dep in "${github_deps[@]}"; do
IFS=":" read -r -a dep <<< "${github_dep}"
install_${dep[0]} "${dep[1]}"
done
# Requires bats to run the tests
install_bats
}
function run() {
info "Running runk tests using"
bats "${runk_dir}/runk-tests.bats"
}
function main() {
action="${1:-}"
case "${action}" in
install-dependencies) install_dependencies ;;
install-kata) install_kata ;;
run) run ;;
*) >&2 die "Invalid argument" ;;
esac
}
main "$@"

View File

@@ -0,0 +1,127 @@
#!/usr/bin/env bats
#
# Copyright (c) 2023,2024 Kata Contributors
#
# SPDX-License-Identifier: Apache-2.0
#
# This test will validate runk with containerd
load "${BATS_TEST_DIRNAME}/../../common.bash"
load "${BATS_TEST_DIRNAME}/../../metrics/lib/common.bash"
setup_file() {
export RUNK_BIN_PATH="/usr/local/bin/runk"
export TEST_IMAGE="quay.io/prometheus/busybox:latest"
export CONTAINER_ID="id1"
export PID_FILE="${CONTAINER_ID}.pid"
export WORK_DIR="${BATS_FILE_TMPDIR}"
echo "pull container image"
check_images ${TEST_IMAGE}
}
setup() {
# Bind mount ${WORK_DIR}:/tmp. Tests below will store files in this dir and check them when container is frozon.
sudo ctr run --pid-file ${PID_FILE} -d \
--mount type=bind,src=${WORK_DIR},dst=/tmp,options=rbind:rw \
--runc-binary ${RUNK_BIN_PATH} \
${TEST_IMAGE} \
${CONTAINER_ID}
read CID PID STATUS <<< $(sudo ctr t ls | grep ${CONTAINER_ID})
# Check the pid is consistent
[ "${PID}" == "$(cat "${PID_FILE}")" ]
# Check the container status is RUNNING
[ "${STATUS}" == "RUNNING" ]
}
teardown() {
echo "delete the container"
if sudo ctr t list -q | grep -q "${CONTAINER_ID}"; then
stop_container
fi
sudo ctr c rm "${CONTAINER_ID}"
sudo rm -f "${PID_FILE}"
}
stop_container() {
local cmd
sudo ctr t kill --signal SIGKILL --all "${CONTAINER_ID}"
# poll for a while until the task receives signal and exit
cmd='[ "STOPPED" == "$(sudo ctr t ls | grep ${CONTAINER_ID} | awk "{print \$3}")" ]'
waitForProcess 10 1 "${cmd}"
echo "check the container is stopped"
# there is only title line of ps command
[ "1" == "$(sudo ctr t ps ${CONTAINER_ID} | wc -l)" ]
}
@test "start container with runk" {
}
@test "exec process in a container" {
sudo ctr t exec --exec-id id1 "${CONTAINER_ID}" sh -c "echo hello > /tmp/foo"
# Check exec succeeded
[ "hello" == "$(sudo ctr t exec --exec-id id1 "${CONTAINER_ID}" cat /tmp/foo)" ]
}
@test "run ps command" {
sudo ctr t exec --detach --exec-id id1 "${CONTAINER_ID}" sh
return_code=$?
echo "ctr t exec sh return: ${return_code}"
# Give some time for the sh process to start within the container.
sleep 5
ps_out="$(sudo ctr t ps ${CONTAINER_ID})" || die "ps command failed"
printf "ps output:\n%s\n" "${ps_out}"
lines_no="$(printf "%s\n" "${ps_out}" | wc -l)"
echo "ps output lines: ${lines_no}"
# one line is the titles, and the other 2 lines are process info
[ "3" == "${lines_no}" ]
}
@test "pause and resume the container" {
# The process outputs lines into /tmp/{CONTAINER_ID}, which can be read in host when it's frozon.
sudo ctr t exec --detach --exec-id id2 ${CONTAINER_ID} \
sh -c "while true; do echo hello >> /tmp/${CONTAINER_ID}; sleep 0.1; done"
# sleep for 1s to make sure the process outputs some lines
sleep 1
sudo ctr t pause "${CONTAINER_ID}"
# Check the status is PAUSED
[ "PAUSED" == "$(sudo ctr t ls | grep ${CONTAINER_ID} | grep -o PAUSED)" ]
echo "container is paused"
local TMP_FILE="${WORK_DIR}/${CONTAINER_ID}"
local lines1=$(cat ${TMP_FILE} | wc -l)
# sleep for a while and check the lines are not changed.
sleep 1
local lines2=$(cat ${TMP_FILE} | wc -l)
# Check the paused container is not running the process (paused indeed)
[ ${lines1} == ${lines2} ]
sudo ctr t resume ${CONTAINER_ID}
# Check the resumed container has status of RUNNING
[ "RUNNING" == "$(sudo ctr t ls | grep ${CONTAINER_ID} | grep -o RUNNING)" ]
echo "container is resumed"
# sleep for a while and check the lines are changed.
sleep 1
local lines3=$(cat ${TMP_FILE} | wc -l)
# Check the process is running again
[ ${lines2} -lt ${lines3} ]
}
@test "kill the container and poll until it is stopped" {
stop_container
}
@test "kill --all is allowed regardless of the container state" {
# High-level container runtimes such as containerd call the kill command with
# --all option in order to terminate all processes inside the container
# even if the container already is stopped. Hence, a low-level runtime
# should allow kill --all regardless of the container state like runc.
echo "test kill --all is allowed regardless of the container state"
# Check kill should fail because the container is paused
stop_container
run sudo ctr t kill --signal SIGKILL ${CONTAINER_ID}
[ $status -eq 1 ]
# Check kill --all should not fail
sudo ctr t kill --signal SIGKILL --all "${CONTAINER_ID}"
}

View File

@@ -74,6 +74,8 @@ Extra environment variables:
AGENT_BIN: Use it to change the expected agent binary name
AGENT_INIT: Use kata agent as init process
BLOCK_SIZE: Use to specify the size of blocks in bytes. DEFAULT: 4096
DAX_DISABLE: If set to "yes", skip DAX metadata header (for kernels without FS_DAX support).
DEFAULT: not set
IMAGE_REGISTRY: Hostname for the image registry used to pull down the rootfs build image.
NSDAX_BIN: Use to specify path to pre-compiled 'nsdax' tool.
USE_DOCKER: If set will build image in a Docker Container (requries docker)
@@ -169,6 +171,7 @@ build_with_container() {
--env BLOCK_SIZE="${block_size}" \
--env ROOT_FREE_SPACE="${root_free_space}" \
--env NSDAX_BIN="${nsdax_bin}" \
--env DAX_DISABLE="${DAX_DISABLE:-no}" \
--env MEASURED_ROOTFS="${MEASURED_ROOTFS}" \
--env SELINUX="${SELINUX}" \
--env DEBUG="${DEBUG}" \
@@ -304,8 +307,12 @@ calculate_img_size() {
local fs_type="$3"
local block_size="$4"
# rootfs start + DAX header size + rootfs end
local reserved_size_mb=$((rootfs_start + dax_header_sz + rootfs_end))
# rootfs start + DAX header size (if enabled) + rootfs end
local dax_sz=0
if [ "${DAX_DISABLE:-no}" != "yes" ]; then
dax_sz="${dax_header_sz}"
fi
local reserved_size_mb=$((rootfs_start + dax_sz + rootfs_end))
disk_size="$(calculate_required_disk_size "${rootfs}" "${fs_type}" "${block_size}")"
@@ -624,25 +631,35 @@ main() {
die "Invalid rootfs"
fi
# Determine DAX header size based on DAX_DISABLE setting
local dax_sz=0
if [ "${DAX_DISABLE:-no}" != "yes" ]; then
dax_sz="${dax_header_sz}"
fi
if [ "${fs_type}" == 'erofs' ]; then
# mkfs.erofs accepts an src root dir directory as an input
# rather than some device, so no need to guess the device dest size first.
create_erofs_rootfs_image "${rootfs}" "${image}" \
"${block_size}" "${agent_bin}"
rootfs_img_size=$?
img_size=$((rootfs_img_size + dax_header_sz))
img_size=$((rootfs_img_size + dax_sz))
else
img_size=$(calculate_img_size "${rootfs}" "${root_free_space}" \
"${fs_type}" "${block_size}")
# the first 2M are for the first MBR + NVDIMM metadata and were already
# consider in calculate_img_size
rootfs_img_size=$((img_size - dax_header_sz))
# consider in calculate_img_size (if DAX is enabled)
rootfs_img_size=$((img_size - dax_sz))
create_rootfs_image "${rootfs}" "${image}" "${rootfs_img_size}" \
"${fs_type}" "${block_size}" "${agent_bin}"
fi
# insert at the beginning of the image the MBR + DAX header
set_dax_header "${image}" "${img_size}" "${fs_type}" "${nsdax_bin}"
if [ "${DAX_DISABLE:-no}" != "yes" ]; then
set_dax_header "${image}" "${img_size}" "${fs_type}" "${nsdax_bin}"
else
info "Skipping DAX header (DAX_DISABLE=yes)"
fi
chown "${USER}:${GROUP}" "${image}"
}

View File

@@ -18,9 +18,12 @@ die() {
exit 1
}
arch_target=$1
nvidia_gpu_stack="$2"
base_os="$3"
run_file_name=$2
run_fm_file_name=$3
arch_target=$4
nvidia_gpu_stack="$5"
driver_version=""
base_os="noble"
APT_INSTALL="apt -o Dpkg::Options::='--force-confdef' -o Dpkg::Options::='--force-confold' -yqq --no-install-recommends install"
@@ -31,6 +34,31 @@ is_feature_enabled() {
[[ ",${nvidia_gpu_stack}," == *",${feature},"* ]]
}
set_driver_version() {
# Extract the driver=XXX part first, then get the value
if [[ "${nvidia_gpu_stack}" =~ driver=([^,]+) ]]; then
driver_version="${BASH_REMATCH[1]}"
fi
echo "chroot: driver_version: ${driver_version}"
echo "chroot: TODO remove with new NVRC"
cat <<-CHROOT_EOF > "/supported-gpu.devids"
0x230E
0x2321
0x2322
0x2324
0x2329
0x232C
0x2330
0x2331
0x2335
0x2339
0x233A
0x233B
0x2342
0x2348
CHROOT_EOF
}
install_nvidia_ctk() {
echo "chroot: Installing NVIDIA GPU container runtime"
# Base gives a nvidia-ctk and the nvidia-container-runtime
@@ -48,19 +76,13 @@ install_nvidia_fabricmanager() {
}
install_userspace_components() {
# Extract the driver=XXX part first, then get the value
if [[ "${nvidia_gpu_stack}" =~ driver=([^,]+) ]]; then
driver_version="${BASH_REMATCH[1]}"
fi
echo "chroot: driver_version: ${driver_version}"
eval "${APT_INSTALL}" nvidia-driver-pinning-"${driver_version}"
eval "${APT_INSTALL}" nvidia-imex nvidia-firmware \
eval "${APT_INSTALL}" nvidia-imex nvidia-firmware \
libnvidia-cfg1 libnvidia-gl libnvidia-extra \
libnvidia-decode libnvidia-fbc1 libnvidia-encode \
libnvidia-nscq
apt-mark hold nvidia-imex nvidia-firmware \
apt-mark hold nvidia-imex nvidia-firmware \
libnvidia-cfg1 libnvidia-gl libnvidia-extra \
libnvidia-decode libnvidia-fbc1 libnvidia-encode \
libnvidia-nscq
@@ -89,13 +111,12 @@ setup_apt_repositories() {
rm -f /etc/apt/sources.list.d/*
key="/usr/share/keyrings/ubuntu-archive-keyring.gpg"
comp="main restricted universe multiverse"
cat <<-CHROOT_EOF > /etc/apt/sources.list.d/"${base_os}".list
deb [arch=${deb_arch} signed-by=${key}] http://${mirror} ${base_os} ${comp}
deb [arch=${deb_arch} signed-by=${key}] http://${mirror} ${base_os}-updates ${comp}
deb [arch=${deb_arch} signed-by=${key}] http://${mirror} ${base_os}-security ${comp}
deb [arch=${deb_arch} signed-by=${key}] http://${mirror} ${base_os}-backports ${comp}
deb [arch=${deb_arch} signed-by=${key}] http://${mirror} ${base_os} main restricted universe multiverse
deb [arch=${deb_arch} signed-by=${key}] http://${mirror} ${base_os}-updates main restricted universe multiverse
deb [arch=${deb_arch} signed-by=${key}] http://${mirror} ${base_os}-security main restricted universe multiverse
deb [arch=${deb_arch} signed-by=${key}] http://${mirror} ${base_os}-backports main restricted universe multiverse
CHROOT_EOF
local arch="${arch_target}"
@@ -108,7 +129,7 @@ setup_apt_repositories() {
curl -fsSL -O "https://developer.download.nvidia.com/compute/cuda/repos/${osver}/${arch}/${keyring}"
dpkg -i "${keyring}" && rm -f "${keyring}"
# Set priorities: CUDA repos highest, Ubuntu non-driver next, Ubuntu blocked for driver packages
# Set priorities: Ubuntu repos highest, NVIDIA Container Toolkit next, CUDA repo blocked for driver packages
cat <<-CHROOT_EOF > /etc/apt/preferences.d/nvidia-priority
Package: *
Pin: $(dirname "${mirror}")
@@ -160,6 +181,7 @@ cleanup_rootfs() {
# Start of script
echo "chroot: Setup NVIDIA GPU rootfs stage one"
set_driver_version
setup_apt_repositories
install_userspace_components
install_nvidia_fabricmanager

View File

@@ -94,11 +94,25 @@ setup_nvidia_gpu_rootfs_stage_one() {
mkdir -p ./lib/modules/
tar --zstd -xvf "${BUILD_DIR}"/kata-static-kernel-nvidia-gpu"${appendix}"-modules.tar.zst -C ./lib/modules/
# If we find a local downloaded run file build the kernel modules
# with it, otherwise use the distribution packages. Run files may have
# more recent drivers available then the distribution packages.
local run_file_name="nvidia-driver.run"
if [[ -f ${BUILD_DIR}/${run_file_name} ]]; then
cp -L "${BUILD_DIR}"/"${run_file_name}" ./"${run_file_name}"
fi
local run_fm_file_name="nvidia-fabricmanager.run"
if [[ -f ${BUILD_DIR}/${run_fm_file_name} ]]; then
cp -L "${BUILD_DIR}"/"${run_fm_file_name}" ./"${run_fm_file_name}"
fi
mount --rbind /dev ./dev
mount --make-rslave ./dev
mount -t proc /proc ./proc
chroot . /bin/bash -c "/nvidia_chroot.sh ${machine_arch} ${NVIDIA_GPU_STACK} noble"
chroot . /bin/bash -c "/nvidia_chroot.sh $(uname -r) ${run_file_name} \
${run_fm_file_name} ${machine_arch} ${NVIDIA_GPU_STACK}"
umount -R ./dev
umount ./proc
@@ -245,6 +259,7 @@ chisseled_init() {
cp -a "${stage_one}"/etc/kata-opa etc/.
fi
cp -a "${stage_one}"/etc/resolv.conf etc/.
cp -a "${stage_one}"/supported-gpu.devids .
cp -a "${stage_one}"/lib/firmware/nvidia lib/firmware/.
cp -a "${stage_one}"/sbin/ldconfig.real sbin/ldconfig

View File

@@ -1,8 +1,11 @@
# Copyright Intel Corporation, 2022 IBM Corp.
# Copyright (c) 2025 NVIDIA Corporation
#
# SPDX-License-Identifier: Apache-2.0
ARG BASE_IMAGE_NAME=alpine
ARG BASE_IMAGE_TAG=3.22
FROM ${BASE_IMAGE_NAME}:${BASE_IMAGE_TAG} AS base
#### Nydus snapshotter & nydus image
FROM golang:1.24-alpine AS nydus-binary-downloader
@@ -14,219 +17,53 @@ ARG NYDUS_SNAPSHOTTER_REPO=https://github.com/containerd/nydus-snapshotter
RUN \
mkdir -p /opt/nydus-snapshotter && \
ARCH="$(uname -m)" && \
if [ "${ARCH}" = "x86_64" ]; then ARCH=amd64 ; fi && \
if [ "${ARCH}" = "aarch64" ]; then ARCH=arm64; fi && \
ARCH=$(uname -m) && \
if [[ "${ARCH}" == "x86_64" ]]; then ARCH=amd64 ; fi && \
if [[ "${ARCH}" == "aarch64" ]]; then ARCH=arm64; fi && \
apk add --no-cache curl && \
curl -fOL --progress-bar "${NYDUS_SNAPSHOTTER_REPO}/releases/download/${NYDUS_SNAPSHOTTER_VERSION}/nydus-snapshotter-${NYDUS_SNAPSHOTTER_VERSION}-linux-${ARCH}.tar.gz" && \
tar xvzpf "nydus-snapshotter-${NYDUS_SNAPSHOTTER_VERSION}-linux-${ARCH}.tar.gz" -C /opt/nydus-snapshotter && \
rm "nydus-snapshotter-${NYDUS_SNAPSHOTTER_VERSION}-linux-${ARCH}.tar.gz"
curl -fOL --progress-bar ${NYDUS_SNAPSHOTTER_REPO}/releases/download/${NYDUS_SNAPSHOTTER_VERSION}/nydus-snapshotter-${NYDUS_SNAPSHOTTER_VERSION}-linux-${ARCH}.tar.gz && \
tar xvzpf nydus-snapshotter-${NYDUS_SNAPSHOTTER_VERSION}-linux-${ARCH}.tar.gz -C /opt/nydus-snapshotter && \
rm nydus-snapshotter-${NYDUS_SNAPSHOTTER_VERSION}-linux-${ARCH}.tar.gz
#### Build binary package
FROM ubuntu:22.04 AS rust-builder
# Default to Rust 1.90.0
ARG RUST_TOOLCHAIN=1.90.0
ENV DEBIAN_FRONTEND=noninteractive
ENV RUSTUP_HOME="/opt/rustup"
ENV CARGO_HOME="/opt/cargo"
ENV PATH="/opt/cargo/bin/:${PATH}"
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
RUN \
mkdir ${RUSTUP_HOME} ${CARGO_HOME} && \
chmod -R a+rwX ${RUSTUP_HOME} ${CARGO_HOME}
RUN \
apt-get update && \
apt-get --no-install-recommends -y install \
ca-certificates \
curl \
gcc \
libc6-dev \
musl-tools && \
apt-get clean && rm -rf /var/lib/apt/lists/ && \
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain ${RUST_TOOLCHAIN}
WORKDIR /kata-deploy
# Copy standalone binary project
COPY binary /kata-deploy
# Install target and run tests based on architecture
# - AMD64/arm64: use musl for fully static binaries
# - PPC64le/s390x: use glibc (musl has issues on these platforms)
RUN \
HOST_ARCH="$(uname -m)"; \
rust_arch=""; \
rust_target=""; \
case "${HOST_ARCH}" in \
"x86_64") \
rust_arch="x86_64"; \
rust_target="${rust_arch}-unknown-linux-musl"; \
echo "Installing musl target for ${rust_target}"; \
rustup target add "${rust_target}"; \
;; \
"aarch64") \
rust_arch="aarch64"; \
rust_target="${rust_arch}-unknown-linux-musl"; \
echo "Installing musl target for ${rust_target}"; \
rustup target add "${rust_target}"; \
;; \
"ppc64le") \
rust_arch="powerpc64le"; \
rust_target="${rust_arch}-unknown-linux-gnu"; \
echo "Using glibc target for ${rust_target} (musl is not well supported on ppc64le)"; \
;; \
"s390x") \
rust_arch="s390x"; \
rust_target="${rust_arch}-unknown-linux-gnu"; \
echo "Using glibc target for ${rust_target} (musl is not well supported on s390x)"; \
;; \
*) echo "Unsupported architecture: ${HOST_ARCH}" && exit 1 ;; \
esac; \
echo "${rust_target}" > /tmp/rust_target
# Run tests using --test-threads=1 to prevent environment variable pollution between tests,
# and this is fine as we'll never ever have multiple binaries running at the same time.
RUN \
rust_target="$(cat /tmp/rust_target)"; \
echo "Running binary tests with target ${rust_target}..." && \
RUSTFLAGS="-D warnings" cargo test --target "${rust_target}" -- --test-threads=1 && \
echo "All tests passed!"
RUN \
rust_target="$(cat /tmp/rust_target)"; \
echo "Building kata-deploy binary for ${rust_target}..." && \
RUSTFLAGS="-D warnings" cargo build --release --target "${rust_target}" && \
mkdir -p /kata-deploy/bin && \
cp "/kata-deploy/target/${rust_target}/release/kata-deploy" /kata-deploy/bin/kata-deploy && \
echo "Cleaning up build artifacts to save disk space..." && \
rm -rf /kata-deploy/target && \
cargo clean
#### Extract kata artifacts
FROM alpine:3.22 AS artifact-extractor
ARG KATA_ARTIFACTS=kata-static.tar.zst
ARG DESTINATION=/opt/kata-artifacts
COPY ${KATA_ARTIFACTS} /tmp/
RUN \
apk add --no-cache tar zstd util-linux-misc && \
mkdir -p "${DESTINATION}" && \
tar --zstd -xf "/tmp/$(basename "${KATA_ARTIFACTS}")" -C "${DESTINATION}" && \
rm -f "/tmp/$(basename "${KATA_ARTIFACTS}")"
#### Prepare runtime dependencies (nsenter and required libraries)
# This stage assembles all runtime dependencies based on architecture
# using ldd to find exact library dependencies
FROM debian:bookworm-slim AS runtime-assembler
ARG DESTINATION=/opt/kata-artifacts
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
RUN \
apt-get update && \
apt-get --no-install-recommends -y install \
util-linux && \
apt-get clean && rm -rf /var/lib/apt/lists/
# Copy the built binary to analyze its dependencies
COPY --from=rust-builder /kata-deploy/bin/kata-deploy /tmp/kata-deploy
# Create output directories
RUN mkdir -p /output/lib /output/lib64 /output/usr/bin
# Use ldd to find and copy all required libraries for the kata-deploy binary and nsenter
RUN \
HOST_ARCH="$(uname -m)"; \
echo "Preparing runtime dependencies for ${HOST_ARCH}"; \
case "${HOST_ARCH}" in \
"ppc64le"|"s390x") \
echo "Using glibc - copying libraries based on ldd output"; \
\
# Copy nsenter \
cp /usr/bin/nsenter /output/usr/bin/nsenter; \
\
# Show what the binaries need \
echo "Libraries needed by kata-deploy:"; \
ldd /tmp/kata-deploy || echo "ldd failed"; \
echo "Libraries needed by nsenter:"; \
ldd /usr/bin/nsenter || echo "ldd failed"; \
\
# Extract and copy all library paths from both binaries \
for binary in /tmp/kata-deploy /usr/bin/nsenter; do \
echo "Processing ${binary}..."; \
# Get libraries with "=>" (shared libs) \
ldd "${binary}" 2>/dev/null | grep "=>" | awk '{print $3}' | sort -u | while read -r lib; do \
if [ -n "${lib}" ] && [ -f "${lib}" ]; then \
dest_dir="/output$(dirname "${lib}")"; \
mkdir -p "${dest_dir}"; \
cp -Ln "${lib}" "${dest_dir}/" 2>/dev/null || true; \
echo " Copied lib: ${lib}"; \
fi; \
done; \
done; \
\
# Copy the dynamic linker - it's at /lib/ld64.so.1 (not /lib64/) \
echo "Copying dynamic linker:"; \
mkdir -p /output/lib; \
cp -Ln /lib/ld64.so* /output/lib/ 2>/dev/null || true; \
cp -Ln /lib64/ld64.so* /output/lib64/ 2>/dev/null || true; \
\
echo "glibc" > /output/.libc-type; \
;; \
*) \
echo "amd64/arm64: will use musl-based static binaries"; \
echo "musl" > /output/.libc-type; \
# Create placeholder so COPY doesn't fail \
touch /output/lib/.placeholder; \
touch /output/lib64/.placeholder; \
touch /output/usr/bin/.placeholder; \
;; \
esac
# Copy musl nsenter from alpine for amd64/arm64
COPY --from=artifact-extractor /usr/bin/nsenter /output/usr/bin/nsenter-musl
COPY --from=artifact-extractor /lib/ld-musl-*.so.1 /output/lib/
# For amd64/arm64, use the musl nsenter; for ppc64le/s390x, keep the glibc one
RUN \
HOST_ARCH="$(uname -m)"; \
case "${HOST_ARCH}" in \
"x86_64"|"aarch64") \
mv /output/usr/bin/nsenter-musl /output/usr/bin/nsenter; \
;; \
*) \
rm -f /output/usr/bin/nsenter-musl; \
;; \
esac
#### kata-deploy main image
FROM gcr.io/distroless/static-debian12@sha256:87bce11be0af225e4ca761c40babb06d6d559f5767fbf7dc3c47f0f1a466b92c
# kata-deploy args
FROM base
ARG KATA_ARTIFACTS=./kata-static.tar.zst
ARG DESTINATION=/opt/kata-artifacts
# Copy extracted kata artifacts
COPY --from=artifact-extractor ${DESTINATION} ${DESTINATION}
COPY ${KATA_ARTIFACTS} /
# Copy Rust binary
COPY --from=rust-builder /kata-deploy/bin/kata-deploy /usr/bin/kata-deploy
# I understand that in order to be on the safer side, it'd
# be good to have the alpine packages pointing to a very
# specific version, but this may break anyone else trying
# to use a different version of alpine for one reason or
# another. With this in mind, let's ignore DL3018.
# SC2086 is about using double quotes to prevent globbing and
# word splitting, which can also be ignored for now.
# hadolint ignore=DL3018,SC2086
RUN \
apk --no-cache add bash curl tar zstd && \
ARCH=$(uname -m) && \
if [ "${ARCH}" = "x86_64" ]; then ARCH=amd64; fi && \
if [ "${ARCH}" = "aarch64" ]; then ARCH=arm64; fi && \
DEBIAN_ARCH=${ARCH} && \
if [ "${DEBIAN_ARCH}" = "ppc64le" ]; then DEBIAN_ARCH=ppc64el; fi && \
curl -fL --progress-bar -o /usr/bin/kubectl https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/${ARCH}/kubectl && \
chmod +x /usr/bin/kubectl && \
curl -fL --progress-bar -o /usr/bin/jq https://github.com/jqlang/jq/releases/download/jq-1.7.1/jq-linux-${DEBIAN_ARCH} && \
chmod +x /usr/bin/jq && \
mkdir -p ${DESTINATION} && \
tar --zstd -xvf ${WORKDIR}/${KATA_ARTIFACTS} -C ${DESTINATION} && \
rm -f ${WORKDIR}/${KATA_ARTIFACTS} && \
apk del curl tar zstd && \
apk --no-cache add py3-pip && \
pip install --no-cache-dir yq==3.2.3 --break-system-packages
# Copy nsenter and required libraries (assembled based on architecture)
COPY --from=runtime-assembler /output/usr/bin/nsenter /usr/bin/nsenter
COPY --from=runtime-assembler /output/lib/ /lib/
COPY --from=runtime-assembler /output/lib64/ /lib64/
# Copy nydus snapshotter
COPY scripts ${DESTINATION}/scripts
COPY nydus-snapshotter ${DESTINATION}/nydus-snapshotter
COPY --from=nydus-binary-downloader /opt/nydus-snapshotter/bin/containerd-nydus-grpc ${DESTINATION}/nydus-snapshotter/
COPY --from=nydus-binary-downloader /opt/nydus-snapshotter/bin/nydus-overlayfs ${DESTINATION}/nydus-snapshotter/
# Copy runtimeclasses and node-feature-rules
COPY node-feature-rules ${DESTINATION}/node-feature-rules
ENTRYPOINT ["/usr/bin/kata-deploy"]

View File

@@ -0,0 +1,232 @@
# Copyright Intel Corporation, 2022 IBM Corp.
# Copyright (c) 2025 NVIDIA Corporation
#
# SPDX-License-Identifier: Apache-2.0
#### Nydus snapshotter & nydus image
FROM golang:1.24-alpine AS nydus-binary-downloader
# Keep the version here aligned with "ndyus-snapshotter.version"
# in versions.yaml
ARG NYDUS_SNAPSHOTTER_VERSION=v0.15.10
ARG NYDUS_SNAPSHOTTER_REPO=https://github.com/containerd/nydus-snapshotter
RUN \
mkdir -p /opt/nydus-snapshotter && \
ARCH="$(uname -m)" && \
if [ "${ARCH}" = "x86_64" ]; then ARCH=amd64 ; fi && \
if [ "${ARCH}" = "aarch64" ]; then ARCH=arm64; fi && \
apk add --no-cache curl && \
curl -fOL --progress-bar "${NYDUS_SNAPSHOTTER_REPO}/releases/download/${NYDUS_SNAPSHOTTER_VERSION}/nydus-snapshotter-${NYDUS_SNAPSHOTTER_VERSION}-linux-${ARCH}.tar.gz" && \
tar xvzpf "nydus-snapshotter-${NYDUS_SNAPSHOTTER_VERSION}-linux-${ARCH}.tar.gz" -C /opt/nydus-snapshotter && \
rm "nydus-snapshotter-${NYDUS_SNAPSHOTTER_VERSION}-linux-${ARCH}.tar.gz"
#### Build binary package
FROM ubuntu:22.04 AS rust-builder
# Default to Rust 1.90.0
ARG RUST_TOOLCHAIN=1.90.0
ENV DEBIAN_FRONTEND=noninteractive
ENV RUSTUP_HOME="/opt/rustup"
ENV CARGO_HOME="/opt/cargo"
ENV PATH="/opt/cargo/bin/:${PATH}"
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
RUN \
mkdir ${RUSTUP_HOME} ${CARGO_HOME} && \
chmod -R a+rwX ${RUSTUP_HOME} ${CARGO_HOME}
RUN \
apt-get update && \
apt-get --no-install-recommends -y install \
ca-certificates \
curl \
gcc \
libc6-dev \
musl-tools && \
apt-get clean && rm -rf /var/lib/apt/lists/ && \
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain ${RUST_TOOLCHAIN}
WORKDIR /kata-deploy
# Copy standalone binary project
COPY binary /kata-deploy
# Install target and run tests based on architecture
# - AMD64/arm64: use musl for fully static binaries
# - PPC64le/s390x: use glibc (musl has issues on these platforms)
RUN \
HOST_ARCH="$(uname -m)"; \
rust_arch=""; \
rust_target=""; \
case "${HOST_ARCH}" in \
"x86_64") \
rust_arch="x86_64"; \
rust_target="${rust_arch}-unknown-linux-musl"; \
echo "Installing musl target for ${rust_target}"; \
rustup target add "${rust_target}"; \
;; \
"aarch64") \
rust_arch="aarch64"; \
rust_target="${rust_arch}-unknown-linux-musl"; \
echo "Installing musl target for ${rust_target}"; \
rustup target add "${rust_target}"; \
;; \
"ppc64le") \
rust_arch="powerpc64le"; \
rust_target="${rust_arch}-unknown-linux-gnu"; \
echo "Using glibc target for ${rust_target} (musl is not well supported on ppc64le)"; \
;; \
"s390x") \
rust_arch="s390x"; \
rust_target="${rust_arch}-unknown-linux-gnu"; \
echo "Using glibc target for ${rust_target} (musl is not well supported on s390x)"; \
;; \
*) echo "Unsupported architecture: ${HOST_ARCH}" && exit 1 ;; \
esac; \
echo "${rust_target}" > /tmp/rust_target
# Run tests using --test-threads=1 to prevent environment variable pollution between tests,
# and this is fine as we'll never ever have multiple binaries running at the same time.
RUN \
rust_target="$(cat /tmp/rust_target)"; \
echo "Running binary tests with target ${rust_target}..." && \
RUSTFLAGS="-D warnings" cargo test --target "${rust_target}" -- --test-threads=1 && \
echo "All tests passed!"
RUN \
rust_target="$(cat /tmp/rust_target)"; \
echo "Building kata-deploy binary for ${rust_target}..." && \
RUSTFLAGS="-D warnings" cargo build --release --target "${rust_target}" && \
mkdir -p /kata-deploy/bin && \
cp "/kata-deploy/target/${rust_target}/release/kata-deploy" /kata-deploy/bin/kata-deploy && \
echo "Cleaning up build artifacts to save disk space..." && \
rm -rf /kata-deploy/target && \
cargo clean
#### Extract kata artifacts
FROM alpine:3.22 AS artifact-extractor
ARG KATA_ARTIFACTS=kata-static.tar.zst
ARG DESTINATION=/opt/kata-artifacts
COPY ${KATA_ARTIFACTS} /tmp/
RUN \
apk add --no-cache tar zstd util-linux-misc && \
mkdir -p "${DESTINATION}" && \
tar --zstd -xf "/tmp/$(basename "${KATA_ARTIFACTS}")" -C "${DESTINATION}" && \
rm -f "/tmp/$(basename "${KATA_ARTIFACTS}")"
#### Prepare runtime dependencies (nsenter and required libraries)
# This stage assembles all runtime dependencies based on architecture
# using ldd to find exact library dependencies
FROM debian:bookworm-slim AS runtime-assembler
ARG DESTINATION=/opt/kata-artifacts
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
RUN \
apt-get update && \
apt-get --no-install-recommends -y install \
util-linux && \
apt-get clean && rm -rf /var/lib/apt/lists/
# Copy the built binary to analyze its dependencies
COPY --from=rust-builder /kata-deploy/bin/kata-deploy /tmp/kata-deploy
# Create output directories
RUN mkdir -p /output/lib /output/lib64 /output/usr/bin
# Use ldd to find and copy all required libraries for the kata-deploy binary and nsenter
RUN \
HOST_ARCH="$(uname -m)"; \
echo "Preparing runtime dependencies for ${HOST_ARCH}"; \
case "${HOST_ARCH}" in \
"ppc64le"|"s390x") \
echo "Using glibc - copying libraries based on ldd output"; \
\
# Copy nsenter \
cp /usr/bin/nsenter /output/usr/bin/nsenter; \
\
# Show what the binaries need \
echo "Libraries needed by kata-deploy:"; \
ldd /tmp/kata-deploy || echo "ldd failed"; \
echo "Libraries needed by nsenter:"; \
ldd /usr/bin/nsenter || echo "ldd failed"; \
\
# Extract and copy all library paths from both binaries \
for binary in /tmp/kata-deploy /usr/bin/nsenter; do \
echo "Processing ${binary}..."; \
# Get libraries with "=>" (shared libs) \
ldd "${binary}" 2>/dev/null | grep "=>" | awk '{print $3}' | sort -u | while read -r lib; do \
if [ -n "${lib}" ] && [ -f "${lib}" ]; then \
dest_dir="/output$(dirname "${lib}")"; \
mkdir -p "${dest_dir}"; \
cp -Ln "${lib}" "${dest_dir}/" 2>/dev/null || true; \
echo " Copied lib: ${lib}"; \
fi; \
done; \
done; \
\
# Copy the dynamic linker - it's at /lib/ld64.so.1 (not /lib64/) \
echo "Copying dynamic linker:"; \
mkdir -p /output/lib; \
cp -Ln /lib/ld64.so* /output/lib/ 2>/dev/null || true; \
cp -Ln /lib64/ld64.so* /output/lib64/ 2>/dev/null || true; \
\
echo "glibc" > /output/.libc-type; \
;; \
*) \
echo "amd64/arm64: will use musl-based static binaries"; \
echo "musl" > /output/.libc-type; \
# Create placeholder so COPY doesn't fail \
touch /output/lib/.placeholder; \
touch /output/lib64/.placeholder; \
touch /output/usr/bin/.placeholder; \
;; \
esac
# Copy musl nsenter from alpine for amd64/arm64
COPY --from=artifact-extractor /usr/bin/nsenter /output/usr/bin/nsenter-musl
COPY --from=artifact-extractor /lib/ld-musl-*.so.1 /output/lib/
# For amd64/arm64, use the musl nsenter; for ppc64le/s390x, keep the glibc one
RUN \
HOST_ARCH="$(uname -m)"; \
case "${HOST_ARCH}" in \
"x86_64"|"aarch64") \
mv /output/usr/bin/nsenter-musl /output/usr/bin/nsenter; \
;; \
*) \
rm -f /output/usr/bin/nsenter-musl; \
;; \
esac
#### kata-deploy main image
FROM gcr.io/distroless/static-debian12@sha256:87bce11be0af225e4ca761c40babb06d6d559f5767fbf7dc3c47f0f1a466b92c
ARG DESTINATION=/opt/kata-artifacts
# Copy extracted kata artifacts
COPY --from=artifact-extractor ${DESTINATION} ${DESTINATION}
# Copy Rust binary
COPY --from=rust-builder /kata-deploy/bin/kata-deploy /usr/bin/kata-deploy
# Copy nsenter and required libraries (assembled based on architecture)
COPY --from=runtime-assembler /output/usr/bin/nsenter /usr/bin/nsenter
COPY --from=runtime-assembler /output/lib/ /lib/
COPY --from=runtime-assembler /output/lib64/ /lib64/
# Copy nydus snapshotter
COPY nydus-snapshotter ${DESTINATION}/nydus-snapshotter
COPY --from=nydus-binary-downloader /opt/nydus-snapshotter/bin/containerd-nydus-grpc ${DESTINATION}/nydus-snapshotter/
COPY --from=nydus-binary-downloader /opt/nydus-snapshotter/bin/nydus-overlayfs ${DESTINATION}/nydus-snapshotter/
# Copy runtimeclasses and node-feature-rules
COPY node-feature-rules ${DESTINATION}/node-feature-rules
ENTRYPOINT ["/usr/bin/kata-deploy"]

View File

@@ -36,9 +36,7 @@ const ALL_SHIMS: &[&str] = &[
"qemu-se",
"qemu-se-runtime-rs",
"qemu-snp",
"qemu-snp-runtime-rs",
"qemu-tdx",
"qemu-tdx-runtime-rs",
];
/// Check if a shim is a QEMU-based shim (all QEMU shims start with "qemu")
@@ -826,8 +824,6 @@ VERSION_ID="24.04"
"qemu"
);
assert_eq!(get_hypervisor_name("qemu-se-runtime-rs").unwrap(), "qemu");
assert_eq!(get_hypervisor_name("qemu-snp-runtime-rs").unwrap(), "qemu");
assert_eq!(get_hypervisor_name("qemu-tdx-runtime-rs").unwrap(), "qemu");
}
#[test]

View File

@@ -85,11 +85,7 @@ pub async fn configure_snapshotter(
runtime: &str,
config: &Config,
) -> Result<()> {
// Get all paths and drop-in capability in one call
let paths = config.get_containerd_paths(runtime).await?;
// Read containerd version from config_file to determine pluginid
let pluginid = if fs::read_to_string(&paths.config_file)
let pluginid = if fs::read_to_string(&config.containerd_conf_file)
.unwrap_or_default()
.contains("version = 3")
{
@@ -98,21 +94,28 @@ pub async fn configure_snapshotter(
"\"io.containerd.grpc.v1.cri\".containerd"
};
let configuration_file: std::path::PathBuf = if paths.use_drop_in {
// Only add /host prefix if path is not in /etc/containerd (which is mounted from host)
let base_path = if paths.drop_in_file.starts_with("/etc/containerd/") {
Path::new(&paths.drop_in_file).to_path_buf()
let use_drop_in =
crate::runtime::is_containerd_capable_of_using_drop_in_files(config, runtime).await?;
let configuration_file: std::path::PathBuf = if use_drop_in {
// Ensure we have the absolute path with /host prefix
let base_path = if config.containerd_drop_in_conf_file.starts_with("/host") {
// Already has /host prefix
Path::new(&config.containerd_drop_in_conf_file).to_path_buf()
} else {
// Need to add /host prefix for paths outside /etc/containerd
let drop_in_path = paths.drop_in_file.trim_start_matches('/');
// Need to add /host prefix
let drop_in_path = config.containerd_drop_in_conf_file.trim_start_matches('/');
Path::new("/host").join(drop_in_path)
};
log::debug!("Snapshotter using drop-in config file: {:?}", base_path);
base_path
} else {
log::debug!("Snapshotter using main config file: {}", paths.config_file);
Path::new(&paths.config_file).to_path_buf()
log::debug!(
"Snapshotter using main config file: {}",
config.containerd_conf_file
);
Path::new(&config.containerd_conf_file).to_path_buf()
};
match snapshotter {

View File

@@ -7,22 +7,6 @@ use anyhow::{Context, Result};
use log::info;
use std::env;
/// Containerd configuration paths and capabilities for a specific runtime
#[derive(Debug, Clone)]
pub struct ContainerdPaths {
/// File to read containerd version from and write to (non-drop-in mode)
pub config_file: String,
/// Backup file path before modification
pub backup_file: String,
/// File to add/remove drop-in imports from (drop-in mode)
/// None if imports are not needed (e.g., k0s auto-loads from containerd.d/)
pub imports_file: Option<String>,
/// Path to the drop-in configuration file
pub drop_in_file: String,
/// Whether drop-in files can be used (based on containerd version)
pub use_drop_in: bool,
}
#[derive(Debug, Clone)]
pub struct Config {
pub node_name: String,
@@ -375,50 +359,6 @@ impl Config {
self.experimental_force_guest_pull_for_arch.join(",")
);
}
/// Get containerd configuration file paths based on runtime type and containerd version
pub async fn get_containerd_paths(&self, runtime: &str) -> Result<ContainerdPaths> {
use crate::runtime::manager;
// Check if drop-in files can be used based on containerd version
let use_drop_in = manager::is_containerd_capable_of_using_drop_in_files(self, runtime).await?;
let paths = match runtime {
"k0s-worker" | "k0s-controller" => ContainerdPaths {
config_file: "/etc/containerd/containerd.toml".to_string(),
backup_file: "/etc/containerd/containerd.toml.bak".to_string(), // Never used, but needed for consistency
imports_file: None, // k0s auto-loads from containerd.d/, imports not needed
drop_in_file: "/etc/containerd/containerd.d/kata-deploy.toml".to_string(),
use_drop_in,
},
"microk8s" => ContainerdPaths {
// microk8s uses containerd-template.toml instead of config.toml
config_file: "/etc/containerd/containerd-template.toml".to_string(),
backup_file: "/etc/containerd/containerd-template.toml.bak".to_string(),
imports_file: Some("/etc/containerd/containerd-template.toml".to_string()),
drop_in_file: self.containerd_drop_in_conf_file.clone(),
use_drop_in,
},
"k3s" | "k3s-agent" | "rke2-agent" | "rke2-server" => ContainerdPaths {
// k3s/rke2 generates config.toml from config.toml.tmpl on each restart
// We must modify the template file so our changes persist
config_file: "/etc/containerd/config.toml.tmpl".to_string(),
backup_file: "/etc/containerd/config.toml.tmpl.bak".to_string(),
imports_file: Some("/etc/containerd/config.toml.tmpl".to_string()),
drop_in_file: self.containerd_drop_in_conf_file.clone(),
use_drop_in,
},
_ => ContainerdPaths {
config_file: self.containerd_conf_file.clone(),
backup_file: self.containerd_conf_file_backup.clone(),
imports_file: Some(self.containerd_conf_file.clone()),
drop_in_file: self.containerd_drop_in_conf_file.clone(),
use_drop_in,
},
};
Ok(paths)
}
}
fn get_arch() -> Result<String> {
@@ -439,7 +379,7 @@ fn get_arch() -> Result<String> {
/// Returns only shims that are supported for that architecture
fn get_default_shims_for_arch(arch: &str) -> &'static str {
match arch {
"x86_64" => "clh cloud-hypervisor dragonball fc qemu qemu-coco-dev qemu-coco-dev-runtime-rs qemu-runtime-rs qemu-nvidia-gpu qemu-nvidia-gpu-snp qemu-nvidia-gpu-tdx qemu-snp qemu-snp-runtime-rs qemu-tdx qemu-tdx-runtime-rs",
"x86_64" => "clh cloud-hypervisor dragonball fc qemu qemu-coco-dev qemu-coco-dev-runtime-rs qemu-runtime-rs qemu-nvidia-gpu qemu-nvidia-gpu-snp qemu-nvidia-gpu-tdx qemu-snp qemu-tdx",
"aarch64" => "clh cloud-hypervisor dragonball fc qemu qemu-nvidia-gpu qemu-cca",
"s390x" => "qemu qemu-runtime-rs qemu-se qemu-se-runtime-rs qemu-coco-dev qemu-coco-dev-runtime-rs",
"ppc64le" => "qemu",

View File

@@ -459,48 +459,13 @@ impl K8sClient {
}
}
/// Split a JSONPath string by dots, but respect escaped dots (\.)
/// Example: "metadata.labels.microk8s\.io/cluster" -> ["metadata", "labels", "microk8s.io/cluster"]
fn split_jsonpath(path: &str) -> Vec<String> {
let mut parts = Vec::new();
let mut current = String::new();
let mut chars = path.chars().peekable();
while let Some(c) = chars.next() {
if c == '\\' {
// Check if next char is a dot (escaped dot)
if chars.peek() == Some(&'.') {
current.push(chars.next().unwrap()); // Add the dot literally
} else {
current.push(c); // Keep the backslash
}
} else if c == '.' {
if !current.is_empty() {
parts.push(current);
current = String::new();
}
} else {
current.push(c);
}
}
if !current.is_empty() {
parts.push(current);
}
parts
}
/// Get value from JSON using JSONPath-like syntax (simplified)
fn get_jsonpath_value(obj: &serde_json::Value, jsonpath: &str) -> Result<String> {
// Simple JSONPath implementation for common cases
// Supports: .field, .field.subfield, [index], escaped dots (\.)
// Supports: .field, .field.subfield, [index]
let mut current = serde_json::to_value(obj)?;
// Split by unescaped dots only
let parts = split_jsonpath(jsonpath.trim_start_matches('.'));
for part in parts {
for part in jsonpath.trim_start_matches('.').split('.') {
if part.is_empty() {
continue;
}
@@ -524,7 +489,7 @@ fn get_jsonpath_value(obj: &serde_json::Value, jsonpath: &str) -> Result<String>
.clone();
} else {
current = current
.get(&part)
.get(part)
.ok_or_else(|| anyhow::anyhow!("Field '{part}' not found"))?
.clone();
}
@@ -615,25 +580,6 @@ pub async fn update_runtimeclass(
mod tests {
use super::*;
#[test]
fn test_split_jsonpath_simple() {
let parts = split_jsonpath("metadata.labels.foo");
assert_eq!(parts, vec!["metadata", "labels", "foo"]);
}
#[test]
fn test_split_jsonpath_escaped_dot() {
// microk8s\.io/cluster should become a single key: microk8s.io/cluster
let parts = split_jsonpath(r"metadata.labels.microk8s\.io/cluster");
assert_eq!(parts, vec!["metadata", "labels", "microk8s.io/cluster"]);
}
#[test]
fn test_split_jsonpath_multiple_escaped_dots() {
let parts = split_jsonpath(r"a\.b\.c.d");
assert_eq!(parts, vec!["a.b.c", "d"]);
}
#[test]
fn test_get_jsonpath_value() {
let json = json!({
@@ -647,18 +593,4 @@ mod tests {
let result = get_jsonpath_value(&json, ".status.nodeInfo.containerRuntimeVersion").unwrap();
assert_eq!(result, "containerd://1.7.0");
}
#[test]
fn test_get_jsonpath_value_with_escaped_dot() {
let json = json!({
"metadata": {
"labels": {
"microk8s.io/cluster": "true"
}
}
});
let result = get_jsonpath_value(&json, r".metadata.labels.microk8s\.io/cluster").unwrap();
assert_eq!(result, "true");
}
}

View File

@@ -25,29 +25,34 @@ pub async fn configure_containerd_runtime(
let runtime_name = format!("kata-{adjusted_shim}");
let configuration = format!("configuration-{shim}");
log::info!("configure_containerd_runtime: Getting containerd paths");
let paths = config.get_containerd_paths(runtime).await?;
log::info!("configure_containerd_runtime: use_drop_in={}", paths.use_drop_in);
log::info!("configure_containerd_runtime: Checking drop-in support");
let use_drop_in =
super::manager::is_containerd_capable_of_using_drop_in_files(config, runtime).await?;
log::info!("configure_containerd_runtime: use_drop_in={}", use_drop_in);
let configuration_file: std::path::PathBuf = if paths.use_drop_in {
// Only add /host prefix if path is not in /etc/containerd (which is mounted from host)
let base_path = if paths.drop_in_file.starts_with("/etc/containerd/") {
Path::new(&paths.drop_in_file).to_path_buf()
let configuration_file: std::path::PathBuf = if use_drop_in {
// Ensure we have the absolute path with /host prefix
let base_path = if config.containerd_drop_in_conf_file.starts_with("/host") {
// Already has /host prefix
Path::new(&config.containerd_drop_in_conf_file).to_path_buf()
} else {
// Need to add /host prefix for paths outside /etc/containerd
let drop_in_path = paths.drop_in_file.trim_start_matches('/');
// Need to add /host prefix
let drop_in_path = config.containerd_drop_in_conf_file.trim_start_matches('/');
Path::new("/host").join(drop_in_path)
};
log::debug!("Using drop-in config file: {:?}", base_path);
base_path
} else {
log::debug!("Using main config file: {}", paths.config_file);
Path::new(&paths.config_file).to_path_buf()
log::debug!("Using main config file: {}", config.containerd_conf_file);
Path::new(&config.containerd_conf_file).to_path_buf()
};
// Use config_file to read containerd version from
let containerd_root_conf_file = &paths.config_file;
let containerd_root_conf_file = if matches!(runtime, "k0s-worker" | "k0s-controller") {
"/etc/containerd/containerd.toml"
} else {
&config.containerd_conf_file
};
let pluginid = if fs::read_to_string(containerd_root_conf_file)
.unwrap_or_default()
@@ -160,22 +165,21 @@ pub async fn configure_containerd(config: &Config, runtime: &str) -> Result<()>
fs::create_dir_all("/etc/containerd/")?;
// Get all paths and drop-in capability in one call
let paths = config.get_containerd_paths(runtime).await?;
let use_drop_in =
super::manager::is_containerd_capable_of_using_drop_in_files(config, runtime).await?;
if !paths.use_drop_in {
// For non-drop-in, backup the correct config file for each runtime
if Path::new(&paths.config_file).exists() && !Path::new(&paths.backup_file).exists() {
fs::copy(&paths.config_file, &paths.backup_file)?;
if !use_drop_in {
if Path::new(&config.containerd_conf_file).exists()
&& !Path::new(&config.containerd_conf_file_backup).exists()
{
fs::copy(
&config.containerd_conf_file,
&config.containerd_conf_file_backup,
)?;
}
} else {
// Create the drop-in file directory and file
// Only add /host prefix if path is not in /etc/containerd (which is mounted from host)
let drop_in_file = if paths.drop_in_file.starts_with("/etc/containerd/") {
paths.drop_in_file.clone()
} else {
format!("/host{}", paths.drop_in_file)
};
let drop_in_file = format!("/host{}", config.containerd_drop_in_conf_file);
log::info!("Creating drop-in file at: {}", drop_in_file);
if let Some(parent) = Path::new(&drop_in_file).parent() {
@@ -196,20 +200,20 @@ pub async fn configure_containerd(config: &Config, runtime: &str) -> Result<()>
}
// Add the drop-in file to the imports array in the main config
if let Some(imports_file) = &paths.imports_file {
log::info!("Adding drop-in to imports in: {}", imports_file);
let imports_path = ".imports";
let drop_in_path = format!("\"{}\"", paths.drop_in_file);
// The append_to_toml_array function is idempotent and will not add duplicates
log::info!(
"Adding drop-in to imports in: {}",
config.containerd_conf_file
);
let imports_path = ".imports";
let drop_in_path = format!("\"{}\"", config.containerd_drop_in_conf_file);
toml_utils::append_to_toml_array(
Path::new(imports_file),
imports_path,
&drop_in_path,
)?;
log::info!("Successfully added drop-in to imports array");
} else {
log::info!("Runtime auto-loads drop-in files, skipping imports");
}
toml_utils::append_to_toml_array(
Path::new(&config.containerd_conf_file),
imports_path,
&drop_in_path,
)?;
log::info!("Successfully added drop-in to imports array");
}
log::info!("Configuring {} shim(s)", config.shims_for_arch.len());
@@ -224,27 +228,27 @@ pub async fn configure_containerd(config: &Config, runtime: &str) -> Result<()>
}
pub async fn cleanup_containerd(config: &Config, runtime: &str) -> Result<()> {
// Get all paths and drop-in capability in one call
let paths = config.get_containerd_paths(runtime).await?;
let use_drop_in =
super::manager::is_containerd_capable_of_using_drop_in_files(config, runtime).await?;
if paths.use_drop_in {
// Remove drop-in from imports array (if imports are used)
if let Some(imports_file) = &paths.imports_file {
toml_utils::remove_from_toml_array(
Path::new(imports_file),
".imports",
&format!("\"{}\"", paths.drop_in_file),
)?;
}
if use_drop_in {
let drop_in_path = config.containerd_drop_in_conf_file.clone();
toml_utils::remove_from_toml_array(
Path::new(&config.containerd_conf_file),
".imports",
&format!("\"{drop_in_path}\""),
)?;
return Ok(());
}
// For non-drop-in, restore from backup
if Path::new(&paths.backup_file).exists() {
fs::remove_file(&paths.config_file)?;
fs::rename(&paths.backup_file, &paths.config_file)?;
if Path::new(&config.containerd_conf_file_backup).exists() {
fs::remove_file(&config.containerd_conf_file)?;
fs::rename(
&config.containerd_conf_file_backup,
&config.containerd_conf_file,
)?;
} else {
fs::remove_file(&paths.config_file).ok();
fs::remove_file(&config.containerd_conf_file).ok();
}
Ok(())
@@ -260,13 +264,11 @@ pub fn setup_containerd_config_files(runtime: &str, config: &Config) -> Result<(
}
}
"k0s-worker" | "k0s-controller" => {
// k0s uses /etc/containerd/containerd.d/ for drop-ins (no /host prefix needed)
// Path is fixed for k0s, so we can hardcode it here
let drop_in_file_path = "/etc/containerd/containerd.d/kata-deploy.toml";
if let Some(parent) = Path::new(drop_in_file_path).parent() {
let drop_in_file = format!("/host{}", config.containerd_drop_in_conf_file);
if let Some(parent) = Path::new(&drop_in_file).parent() {
fs::create_dir_all(parent)?;
}
fs::File::create(drop_in_file_path)?;
fs::File::create(&drop_in_file)?;
}
"containerd" => {
if !Path::new(&config.containerd_conf_file).exists() {

View File

@@ -26,7 +26,12 @@ const CONTAINERD_BASED_RUNTIMES: &[&str] = &[
];
/// Runtimes that don't support containerd drop-in configuration files
const RUNTIMES_WITHOUT_CONTAINERD_DROP_IN_SUPPORT: &[&str] = &["crio"];
const RUNTIMES_WITHOUT_CONTAINERD_DROP_IN_SUPPORT: &[&str] = &[
"crio",
"k0s-worker",
"k0s-controller",
"microk8s",
];
fn is_containerd_based(runtime: &str) -> bool {
CONTAINERD_BASED_RUNTIMES.contains(&runtime)
@@ -109,11 +114,6 @@ pub async fn is_containerd_capable_of_using_drop_in_files(
return Ok(false);
}
// k0s always supports drop-in files (auto-loads from containerd.d/)
if runtime == "k0s-worker" || runtime == "k0s-controller" {
return Ok(true);
}
// Check containerd version - only 2.0+ supports drop-in files properly
let runtime_version =
k8s::get_node_field(config, ".status.nodeInfo.containerRuntimeVersion").await?;

View File

@@ -12,8 +12,6 @@ pub const RUST_SHIMS: &[&str] = &[
"qemu-runtime-rs",
"qemu-coco-dev-runtime-rs",
"qemu-se-runtime-rs",
"qemu-snp-runtime-rs",
"qemu-tdx-runtime-rs",
];
pub fn is_rust_shim(shim: &str) -> bool {

View File

@@ -158,7 +158,7 @@ All values can be overridden with --set key=value or a custom `-f myvalues.yaml`
| `env.installationPrefix` | Prefix where to install the Kata artifacts | `/opt/kata` |
| `env.hostOS` | Provide host-OS setting, e.g. `cbl-mariner` to do additional configurations | `""` |
| `env.multiInstallSuffix` | Enable multiple Kata installation on the same node with suffix e.g. `/opt/kata-PR12232` | `""` |
| `env._experimentalSetupSnapshotter` | Deploys (`nydus`) and/or sets up (`erofs`, `nydus`) the snapshotter(s) specified as the value (supports multiple snapshotters, separated by commas; e.g., `nydus,erofs`) | `""` |
| `env._experimentalSetupSnapshotter` | Deploys (nydus) and/or sets up (erofs, nydus) the snapshotter(s) specified as the value (supports multiple snapshotters, separated by commas; e.g., `nydus,erofs`) | `""` |
| `env._experimentalForceGuestPull` | Enables `experimental_force_guest_pull` for the shim(s) specified as the value (supports multiple shims, separated by commas; e.g., `qemu-tdx,qemu-snp`) | `""` |
| `env._experimentalForceGuestPull_x86_64` | Enables `experimental_force_guest_pull` for the shim(s) specified as the value for x86_64 (if set, overrides `_experimentalForceGuestPull`) | `""` |
| `env._experimentalForceGuestPull_aarch64` | Enables `experimental_force_guest_pull` for the shim(s) specified as the value for aarch64 (if set, overrides `_experimentalForceGuestPull`) | `""` |
@@ -267,7 +267,7 @@ helm install kata-deploy oci://ghcr.io/kata-containers/kata-deploy-charts/kata-d
This includes all available Kata Containers shims:
- Standard shims: `qemu`, `qemu-runtime-rs`, `clh`, `cloud-hypervisor`, `dragonball`, `fc`
- TEE shims: `qemu-snp`, `qemu-snp-runtime-rs`, `qemu-tdx`, `qemu-tdx-runtime-rs`, `qemu-se`, `qemu-se-runtime-rs`, `qemu-cca`, `qemu-coco-dev`, `qemu-coco-dev-runtime-rs`
- TEE shims: `qemu-snp`, `qemu-tdx`, `qemu-se`, `qemu-se-runtime-rs`, `qemu-cca`, `qemu-coco-dev`, `qemu-coco-dev-runtime-rs`
- NVIDIA GPU shims: `qemu-nvidia-gpu`, `qemu-nvidia-gpu-snp`, `qemu-nvidia-gpu-tdx`
- Remote shims: `remote` (for `peer-pods`/`cloud-api-adaptor`, disabled by default)
@@ -401,12 +401,8 @@ shims:
enabled: false
qemu-snp:
enabled: false
qemu-snp-runtime-rs:
enabled: false
qemu-tdx:
enabled: false
qemu-tdx-runtime-rs:
enabled: false
qemu-se:
enabled: false
qemu-se-runtime-rs:

View File

@@ -15,13 +15,13 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: "3.25.0"
version: "3.24.0"
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "3.25.0"
appVersion: "3.24.0"
dependencies:
- name: node-feature-discovery

View File

@@ -290,3 +290,18 @@ Note: EXPERIMENTAL_FORCE_GUEST_PULL only checks containerd.forceGuestPull, not c
{{- end -}}
{{- join "," $shimNames -}}
{{- end -}}
{{/*
Detect if this is a Rust-based build by checking the image tag
Returns "true" if the tag contains "-rust", otherwise returns "false"
This is a temporary helper for dual script/rust support
*/}}
{{- define "kata-deploy.isRustBuild" -}}
{{- $tag := default .Chart.AppVersion .Values.image.tag -}}
{{- if or (contains "-rust" $tag) (contains "nightly-rust" $tag) -}}
true
{{- else -}}
false
{{- end -}}
{{- end -}}

View File

@@ -133,7 +133,11 @@ spec:
- name: kube-kata
image: {{ .Values.image.reference }}:{{ default .Chart.AppVersion .Values.image.tag }}
imagePullPolicy: {{ .Values.imagePullPolicy }}
{{- if eq (include "kata-deploy.isRustBuild" .) "true" }}
command: ["/usr/bin/kata-deploy", "install"]
{{- else }}
command: ["/opt/kata-artifacts/scripts/kata-deploy.sh", "install"]
{{- end }}
env:
- name: NODE_NAME
valueFrom:

View File

@@ -104,7 +104,11 @@ spec:
- name: kube-kata-cleanup
image: {{ .Values.image.reference }}:{{ default .Chart.AppVersion .Values.image.tag }}
imagePullPolicy: {{ .Values.imagePullPolicy }}
{{- if eq (include "kata-deploy.isRustBuild" .) "true" }}
command: ["/usr/bin/kata-deploy", "cleanup"]
{{- else }}
command: ["/opt/kata-artifacts/scripts/kata-deploy.sh", "cleanup"]
{{- end }}
env:
- name: NODE_NAME
valueFrom:

View File

@@ -24,9 +24,7 @@
"qemu-se-runtime-rs" (dict "memory" "1024Mi" "cpu" "1.0")
"qemu-se" (dict "memory" "1024Mi" "cpu" "1.0")
"qemu-snp" (dict "memory" "2048Mi" "cpu" "1.0")
"qemu-snp-runtime-rs" (dict "memory" "2048Mi" "cpu" "1.0")
"qemu-tdx" (dict "memory" "2048Mi" "cpu" "1.0")
"qemu-tdx-runtime-rs" (dict "memory" "2048Mi" "cpu" "1.0")
"qemu-nvidia-gpu" (dict "memory" "4096Mi" "cpu" "1.0")
"qemu-nvidia-gpu-snp" (dict "memory" "20480Mi" "cpu" "1.0")
"qemu-nvidia-gpu-tdx" (dict "memory" "20480Mi" "cpu" "1.0")

View File

@@ -13,6 +13,10 @@ metadata:
namespace: {{ .Release.Namespace }}
labels:
{{- include "kata-deploy.labels" . | nindent 4 }}
annotations:
"helm.sh/hook": post-install,post-upgrade
"helm.sh/hook-weight": "-5"
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
data:
pod-spec.yaml: |
{{- .Values.verification.pod | nindent 4 }}
@@ -28,9 +32,9 @@ metadata:
annotations:
"helm.sh/hook": post-install,post-upgrade
"helm.sh/hook-weight": "0"
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded,hook-failed
spec:
backoffLimit: 0
backoffLimit: 3
ttlSecondsAfterFinished: 3600
template:
metadata:
@@ -57,132 +61,19 @@ spec:
echo "Timeout: ${TIMEOUT}s"
echo ""
# First, wait for kata-deploy DaemonSet to exist (it's created by Helm, not a hook)
echo "Waiting for kata-deploy DaemonSet to be created..."
# Wait for kata-deploy DaemonSet to be ready
echo "Waiting for kata-deploy DaemonSet to be ready..."
{{- if .Values.env.multiInstallSuffix }}
DAEMONSET_NAME="{{ .Chart.Name }}-{{ .Values.env.multiInstallSuffix }}"
kubectl rollout status daemonset/{{ .Chart.Name }}-{{ .Values.env.multiInstallSuffix }} -n {{ .Release.Namespace }} --timeout=600s
{{- else }}
DAEMONSET_NAME="{{ .Chart.Name }}"
kubectl rollout status daemonset/{{ .Chart.Name }} -n {{ .Release.Namespace }} --timeout=600s
{{- end }}
max_wait=120
elapsed=0
while true; do
if kubectl get daemonset "${DAEMONSET_NAME}" -n {{ .Release.Namespace }} &>/dev/null; then
echo "DaemonSet ${DAEMONSET_NAME} exists"
break
fi
if [[ ${elapsed} -ge ${max_wait} ]]; then
echo "ERROR: Timeout waiting for DaemonSet to be created after ${max_wait}s"
kubectl get daemonset -n {{ .Release.Namespace }} || true
exit 1
fi
echo "Waiting for DaemonSet to be created... (${elapsed}s/${max_wait}s)"
sleep 2
elapsed=$((elapsed + 2))
done
# Now wait for kata-deploy DaemonSet pods to exist
echo ""
echo "Waiting for kata-deploy DaemonSet pods to be scheduled..."
max_wait=120
elapsed=0
while true; do
pod_count=$(kubectl get pods -n {{ .Release.Namespace }} -l name=kata-deploy --no-headers 2>/dev/null | wc -l)
if [[ ${pod_count} -gt 0 ]]; then
echo "Found ${pod_count} kata-deploy pod(s)"
break
fi
if [[ ${elapsed} -ge ${max_wait} ]]; then
echo "ERROR: Timeout waiting for kata-deploy pods after ${max_wait}s"
kubectl get pods -n {{ .Release.Namespace }} -l name=kata-deploy || true
exit 1
fi
echo "Waiting for pods to be scheduled... (${elapsed}s/${max_wait}s)"
sleep 2
elapsed=$((elapsed + 2))
done
# Wait for kata-deploy DaemonSet to be ready (all pods running)
# This includes waiting for the kata-deploy image to be pulled, which can take 20+ minutes
echo ""
echo "Waiting for kata-deploy DaemonSet to be ready..."
DAEMONSET_TIMEOUT="{{ .Values.verification.daemonsetTimeout }}"
echo "DaemonSet timeout: ${DAEMONSET_TIMEOUT}s"
kubectl rollout status daemonset/"${DAEMONSET_NAME}" -n {{ .Release.Namespace }} --timeout="${DAEMONSET_TIMEOUT}s"
# Wait for nodes to be labeled with katacontainers.io/kata-runtime=true
# This label is set by kata-deploy when installation is complete
# This is a quick internal operation (copying artifacts), so use a fixed timeout
echo ""
echo "Waiting for nodes to be labeled with kata-runtime..."
max_wait=60
echo "Node label timeout: ${max_wait}s"
elapsed=0
while true; do
labeled_nodes=$(kubectl get nodes -l katacontainers.io/kata-runtime=true --no-headers 2>/dev/null | wc -l)
total_nodes=$(kubectl get nodes --no-headers 2>/dev/null | wc -l)
if [[ ${labeled_nodes} -gt 0 ]] && [[ ${labeled_nodes} -eq ${total_nodes} ]]; then
echo "All ${total_nodes} node(s) labeled with kata-runtime=true"
kubectl get nodes -L katacontainers.io/kata-runtime || true
break
fi
if [[ ${elapsed} -ge ${max_wait} ]]; then
echo "ERROR: Timeout waiting for nodes to be labeled after ${max_wait}s"
echo "Labeled nodes: ${labeled_nodes}/${total_nodes}"
echo "Node labels:"
kubectl get nodes -L katacontainers.io/kata-runtime || true
exit 1
fi
echo "Labeled nodes: ${labeled_nodes}/${total_nodes} (${elapsed}s/${max_wait}s)"
sleep 5
elapsed=$((elapsed + 5))
done
# Give kubelet time to pick up the new runtime configuration after containerd restart
echo ""
echo "Waiting 15s for kubelet to refresh runtime information..."
sleep 15
# Retry pod creation if it fails due to runtime not being ready yet
POD_NAME="kata-deploy-verify"
MAX_POD_RETRIES=3
POD_RETRY_DELAY=10
for pod_attempt in $(seq 1 ${MAX_POD_RETRIES}); do
echo ""
echo "Creating verification pod (attempt ${pod_attempt}/${MAX_POD_RETRIES})..."
# Clean up any existing pod first
kubectl delete pod "${POD_NAME}" -n "${VERIFY_NS}" --ignore-not-found --wait=true 2>/dev/null || true
kubectl apply -n "${VERIFY_NS}" -f /config/pod-spec.yaml
echo "Created: ${POD_NAME}"
# Wait a moment for pod to be scheduled and initial status
sleep 5
# Check for immediate sandbox creation failure
sandbox_error=$(kubectl get events -n "${VERIFY_NS}" --field-selector involvedObject.name="${POD_NAME}",type=Warning 2>/dev/null | grep -c "FailedCreatePodSandBox" || echo 0)
if [[ ${sandbox_error} -gt 0 ]] && [[ ${pod_attempt} -lt ${MAX_POD_RETRIES} ]]; then
echo "Pod sandbox creation failed, runtime may not be ready yet"
echo "Waiting ${POD_RETRY_DELAY}s before retry..."
kubectl delete pod "${POD_NAME}" -n "${VERIFY_NS}" --ignore-not-found --wait=true 2>/dev/null || true
sleep ${POD_RETRY_DELAY}
continue
fi
# Pod created successfully or we're on last attempt, proceed to wait for completion
break
done
echo "Creating verification pod..."
POD_RESOURCE=$(kubectl apply -n "${VERIFY_NS}" -f /config/pod-spec.yaml -o name)
POD_NAME="${POD_RESOURCE#pod/}"
echo "Created: ${POD_NAME}"
# Ensure cleanup runs on any exit (success, failure, or signal)
cleanup() {
@@ -194,70 +85,23 @@ spec:
echo ""
echo "Waiting for verification pod to complete..."
# Wait for pod to either succeed or fail, checking every few seconds
start_time=$(date +%s)
while true; do
phase=$(kubectl get pod "${POD_NAME}" -n "${VERIFY_NS}" -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown")
if [[ "${phase}" == "Succeeded" ]]; then
echo ""
echo "=== Verification Pod Logs ==="
kubectl logs "${POD_NAME}" -n "${VERIFY_NS}" || true
echo ""
echo "SUCCESS: Verification passed"
exit 0
fi
if [[ "${phase}" == "Failed" ]]; then
echo ""
echo "=== Verification Failed - Pod phase is Failed ==="
echo "Pod status:"
kubectl describe pod "${POD_NAME}" -n "${VERIFY_NS}" || true
echo ""
echo "Pod logs:"
kubectl logs "${POD_NAME}" -n "${VERIFY_NS}" || true
exit 1
fi
# Check if pod is stuck - look for events indicating it can't start
if [[ "${phase}" == "Pending" ]]; then
# Look for FailedCreatePodSandBox or other error events
error_count=$(kubectl get events -n "${VERIFY_NS}" --field-selector involvedObject.name="${POD_NAME}",type=Warning 2>/dev/null | grep -c "FailedCreatePodSandBox\|Failed\|Error" || echo 0)
# Only fail on errors if we've waited at least 30s (give time for transient issues)
current_time=$(date +%s)
elapsed=$((current_time - start_time))
if [[ ${error_count} -gt 0 ]] && [[ ${elapsed} -gt 30 ]]; then
echo ""
echo "=== Verification Failed - Pod stuck with ${error_count} error events ==="
echo "Events:"
kubectl get events -n "${VERIFY_NS}" --field-selector involvedObject.name="${POD_NAME}" || true
echo ""
echo "Pod status:"
kubectl describe pod "${POD_NAME}" -n "${VERIFY_NS}" || true
exit 1
fi
fi
# Check timeout
current_time=$(date +%s)
elapsed=$((current_time - start_time))
if [[ ${elapsed} -ge ${TIMEOUT} ]]; then
echo ""
echo "=== Verification Failed - Timeout after ${TIMEOUT}s ==="
echo "Pod phase: ${phase}"
echo "Pod status:"
kubectl describe pod "${POD_NAME}" -n "${VERIFY_NS}" || true
echo ""
echo "Pod logs:"
kubectl logs "${POD_NAME}" -n "${VERIFY_NS}" 2>/dev/null || echo "No logs available"
exit 1
fi
echo "Pod phase: ${phase}, elapsed: ${elapsed}s/${TIMEOUT}s"
sleep 5
done
if kubectl wait pod "${POD_NAME}" -n "${VERIFY_NS}" --for=jsonpath='{.status.phase}'=Succeeded --timeout="${TIMEOUT}s"; then
echo ""
echo "=== Verification Pod Logs ==="
kubectl logs "${POD_NAME}" -n "${VERIFY_NS}" || true
echo ""
echo "SUCCESS: Verification passed"
exit 0
else
echo ""
echo "=== Verification Failed ==="
echo "Pod status:"
kubectl describe pod "${POD_NAME}" -n "${VERIFY_NS}" || true
echo ""
echo "Pod logs:"
kubectl logs "${POD_NAME}" -n "${VERIFY_NS}" || true
exit 1
fi
volumeMounts:
- name: pod-spec
mountPath: /config
@@ -292,12 +136,6 @@ rules:
- apiGroups: [""]
resources: ["pods", "pods/log"]
verbs: ["get", "list", "watch", "create", "delete"]
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "list"]
- apiGroups: [""]
resources: ["events"]
verbs: ["get", "list"]
- apiGroups: ["apps"]
resources: ["daemonsets"]
verbs: ["get", "list", "watch"]

View File

@@ -33,7 +33,7 @@ shims:
qemu-nvidia-gpu-tdx:
enabled: false
# Now enable TEE shims (qemu-snp, qemu-snp-runtime-rs, qemu-tdx, qemu-tdx-runtime-rs, qemu-se, qemu-se-runtime-rs, qemu-cca, qemu-coco-dev, qemu-coco-dev-runtime-rs)
# Now enable TEE shims (qemu-snp, qemu-tdx, qemu-se, qemu-se-runtime-rs, qemu-cca, qemu-coco-dev, qemu-coco-dev-runtime-rs)
qemu-snp:
enabled: true
supportedArches:
@@ -48,20 +48,6 @@ shims:
httpsProxy: ""
noProxy: ""
qemu-snp-runtime-rs:
enabled: true
supportedArches:
- amd64
allowedHypervisorAnnotations: []
containerd:
snapshotter: nydus
forceGuestPull: false
crio:
guestPull: true
agent:
httpsProxy: ""
noProxy: ""
qemu-tdx:
enabled: true
supportedArches:
@@ -76,20 +62,6 @@ shims:
httpsProxy: ""
noProxy: ""
qemu-tdx-runtime-rs:
enabled: true
supportedArches:
- amd64
allowedHypervisorAnnotations: []
containerd:
snapshotter: nydus
forceGuestPull: false
crio:
guestPull: true
agent:
httpsProxy: ""
noProxy: ""
qemu-se:
enabled: true
supportedArches:

View File

@@ -134,20 +134,6 @@ shims:
httpsProxy: ""
noProxy: ""
qemu-snp-runtime-rs:
enabled: true
supportedArches:
- amd64
allowedHypervisorAnnotations: []
containerd:
snapshotter: nydus
forceGuestPull: false
crio:
guestPull: true
agent:
httpsProxy: ""
noProxy: ""
qemu-tdx:
enabled: true
supportedArches:
@@ -162,20 +148,6 @@ shims:
httpsProxy: ""
noProxy: ""
qemu-tdx-runtime-rs:
enabled: true
supportedArches:
- amd64
allowedHypervisorAnnotations: []
containerd:
snapshotter: nydus
forceGuestPull: false
crio:
guestPull: true
agent:
httpsProxy: ""
noProxy: ""
qemu-se:
enabled: true
supportedArches:
@@ -312,17 +284,9 @@ verification:
# Namespace where verification pod will be created
namespace: default
# Timeout for the verification pod itself to complete (seconds)
# This is how long to wait for the verification pod to run and finish successfully.
# Default: 180s (3 minutes)
# Timeout for verification pod to complete (seconds)
timeout: 180
# Timeout for kata-deploy DaemonSet rollout (seconds)
# This includes waiting for the kata-deploy image to be pulled from the registry
# and pods to start. Large images over slow networks may need more time.
# Default: 1200s (20 minutes)
daemonsetTimeout: 1200
# Pod spec for verification (optional)
# If provided, a verification job will run after install/upgrade.
# If empty, no verification is performed.

Some files were not shown because too many files have changed in this diff Show More