mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-04-09 13:32:08 +00:00
Compare commits
30 Commits
topic/dock
...
fsmerged-e
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dd0c24d775 | ||
|
|
47a4816c43 | ||
|
|
25edd51ab5 | ||
|
|
fde3272825 | ||
|
|
c06fc44775 | ||
|
|
35d867c5a5 | ||
|
|
0a5d95988a | ||
|
|
30303960c2 | ||
|
|
d98f5fb3af | ||
|
|
d980dbbb0a | ||
|
|
4707bd26c9 | ||
|
|
92248ba6b8 | ||
|
|
feaec78ad0 | ||
|
|
461907918d | ||
|
|
9e1f595160 | ||
|
|
46a7b9e75d | ||
|
|
9b770793ba | ||
|
|
47770daa3b | ||
|
|
1300145f7a | ||
|
|
0a739b3b55 | ||
|
|
cb7c790dc7 | ||
|
|
2a024f55d0 | ||
|
|
9a2825a429 | ||
|
|
e1fae11509 | ||
|
|
35cafe8715 | ||
|
|
57e42b10f1 | ||
|
|
a762b136de | ||
|
|
43489f6d56 | ||
|
|
9923f251f5 | ||
|
|
086c0ed18e |
13
.dockerignore
Normal file
13
.dockerignore
Normal file
@@ -0,0 +1,13 @@
|
||||
# Context for tools/packaging/kata-deploy/Dockerfile (build from repo root: -f tools/packaging/kata-deploy/Dockerfile .)
|
||||
#
|
||||
# The Dockerfile only needs: Cargo.toml, Cargo.lock, src/, tools/packaging/kata-deploy/,
|
||||
# and versions.yaml. Exclude heavy or irrelevant trees to keep context small.
|
||||
.git
|
||||
.github
|
||||
target
|
||||
kata-artifacts
|
||||
docs
|
||||
tests
|
||||
utils
|
||||
tools/packaging/kata-deploy/local-build
|
||||
tools/packaging/kata-deploy/binary/target
|
||||
1
.github/workflows/basic-ci-amd64.yaml
vendored
1
.github/workflows/basic-ci-amd64.yaml
vendored
@@ -279,7 +279,6 @@ jobs:
|
||||
matrix:
|
||||
vmm:
|
||||
- qemu
|
||||
- qemu-runtime-rs
|
||||
runs-on: ubuntu-22.04
|
||||
env:
|
||||
KATA_HYPERVISOR: ${{ matrix.vmm }}
|
||||
|
||||
4
.github/workflows/basic-ci-s390x.yaml
vendored
4
.github/workflows/basic-ci-s390x.yaml
vendored
@@ -132,9 +132,7 @@ jobs:
|
||||
# all due to a single flaky instance.
|
||||
fail-fast: false
|
||||
matrix:
|
||||
vmm:
|
||||
- qemu
|
||||
- qemu-runtime-rs
|
||||
vmm: ['qemu']
|
||||
runs-on: s390x-large
|
||||
env:
|
||||
KATA_HYPERVISOR: ${{ matrix.vmm }}
|
||||
|
||||
@@ -1,35 +0,0 @@
|
||||
name: nydus-snapshotter-version-sync
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
types:
|
||||
- opened
|
||||
- edited
|
||||
- reopened
|
||||
- synchronize
|
||||
|
||||
permissions: {}
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
nydus-snapshotter-version-check:
|
||||
name: nydus-snapshotter-version-check
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
persist-credentials: false
|
||||
- name: Ensure nydus-snapshotter-version is in sync inside our repo
|
||||
run: |
|
||||
dockerfile_version=$(grep "ARG NYDUS_SNAPSHOTTER_VERSION" tools/packaging/kata-deploy/Dockerfile | cut -f2 -d'=')
|
||||
versions_version=$(yq ".externals.nydus-snapshotter.version | explode(.)" versions.yaml)
|
||||
if [[ "${dockerfile_version}" != "${versions_version}" ]]; then
|
||||
echo "nydus-snapshotter version must be the same in the following places: "
|
||||
echo "- versions.yaml: ${versions_version}"
|
||||
echo "- tools/packaging/kata-deploy/Dockerfile: ${dockerfile_version}"
|
||||
exit 1
|
||||
fi
|
||||
1
.github/workflows/run-kata-coco-tests.yaml
vendored
1
.github/workflows/run-kata-coco-tests.yaml
vendored
@@ -378,6 +378,7 @@ jobs:
|
||||
matrix:
|
||||
vmm:
|
||||
- qemu-coco-dev
|
||||
- qemu-coco-dev-runtime-rs
|
||||
snapshotter:
|
||||
- erofs
|
||||
pull-type:
|
||||
|
||||
@@ -0,0 +1,70 @@
|
||||
From 6936ab1bac4567095ad2a62e5871af1a7982c616 Mon Sep 17 00:00:00 2001
|
||||
From: Alex Lyn <alex.lyn@antgroup.com>
|
||||
Date: Thu, 9 Apr 2026 15:47:22 +0800
|
||||
Subject: [PATCH] kata-deploy: Complete containerd config for erofs snapshotter
|
||||
|
||||
Add missing containerd configuration items for erofs snapshotter to
|
||||
enable fsmerged erofs feature:
|
||||
|
||||
- Add differ plugin configuration:
|
||||
- mkfs_options: ["-T0","--mkfs-time","--sort=none"]
|
||||
- enable_tar_index: false
|
||||
|
||||
- Add snapshotter plugin configuration:
|
||||
- default_size: "10G"
|
||||
- max_unmerged_layers: 1
|
||||
|
||||
These configurations align with the documentation in
|
||||
docs/how-to/how-to-use-fsmerged-erofs-with-kata.md Step 2,
|
||||
ensuring the CI workflow run-k8s-tests-coco-nontee-with-erofs-snapshotter
|
||||
can properly configure containerd for erofs fsmerged rootfs.
|
||||
|
||||
Signed-off-by: Alex Lyn <alex.lyn@antgroup.com>
|
||||
---
|
||||
.../binary/src/artifacts/snapshotters.rs | 23 +++++++++++++++++++
|
||||
1 file changed, 23 insertions(+)
|
||||
|
||||
diff --git a/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs b/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs
|
||||
index fb49f35d5..998a881d2 100644
|
||||
--- a/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs
|
||||
+++ b/tools/packaging/kata-deploy/binary/src/artifacts/snapshotters.rs
|
||||
@@ -30,6 +30,19 @@ pub async fn configure_erofs_snapshotter(
|
||||
"[\"erofs\",\"walking\"]",
|
||||
)?;
|
||||
|
||||
+ // Configure erofs differ plugin
|
||||
+ toml_utils::set_toml_value(
|
||||
+ configuration_file,
|
||||
+ ".plugins.\"io.containerd.differ.v1.erofs\".mkfs_options",
|
||||
+ "[\"-T0\",\"--mkfs-time\",\"--sort=none\"]",
|
||||
+ )?;
|
||||
+ toml_utils::set_toml_value(
|
||||
+ configuration_file,
|
||||
+ ".plugins.\"io.containerd.differ.v1.erofs\".enable_tar_index",
|
||||
+ "false",
|
||||
+ )?;
|
||||
+
|
||||
+ // Configure erofs snapshotter plugin
|
||||
toml_utils::set_toml_value(
|
||||
configuration_file,
|
||||
".plugins.\"io.containerd.snapshotter.v1.erofs\".enable_fsverity",
|
||||
@@ -40,6 +53,16 @@ pub async fn configure_erofs_snapshotter(
|
||||
".plugins.\"io.containerd.snapshotter.v1.erofs\".set_immutable",
|
||||
"true",
|
||||
)?;
|
||||
+ toml_utils::set_toml_value(
|
||||
+ configuration_file,
|
||||
+ ".plugins.\"io.containerd.snapshotter.v1.erofs\".default_size",
|
||||
+ "\"10G\"",
|
||||
+ )?;
|
||||
+ toml_utils::set_toml_value(
|
||||
+ configuration_file,
|
||||
+ ".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers",
|
||||
+ "1",
|
||||
+ )?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
--
|
||||
2.34.0
|
||||
|
||||
479
Cargo.lock
generated
479
Cargo.lock
generated
@@ -94,6 +94,12 @@ dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "allocator-api2"
|
||||
version = "0.2.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
|
||||
|
||||
[[package]]
|
||||
name = "android_system_properties"
|
||||
version = "0.1.5"
|
||||
@@ -409,6 +415,28 @@ dependencies = [
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-stream"
|
||||
version = "0.3.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476"
|
||||
dependencies = [
|
||||
"async-stream-impl",
|
||||
"futures-core",
|
||||
"pin-project-lite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-stream-impl"
|
||||
version = "0.3.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "async-task"
|
||||
version = "4.7.1"
|
||||
@@ -511,6 +539,17 @@ dependencies = [
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "backon"
|
||||
version = "1.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cffb0e931875b666fc4fcb20fee52e9bbd1ef836fd9e9e04ec21555f9f85f7ef"
|
||||
dependencies = [
|
||||
"fastrand 2.3.0",
|
||||
"gloo-timers",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "backtrace"
|
||||
version = "0.3.76"
|
||||
@@ -1272,6 +1311,16 @@ dependencies = [
|
||||
"darling_macro 0.20.11",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling"
|
||||
version = "0.21.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0"
|
||||
dependencies = [
|
||||
"darling_core 0.21.3",
|
||||
"darling_macro 0.21.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling_core"
|
||||
version = "0.14.4"
|
||||
@@ -1299,6 +1348,20 @@ dependencies = [
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling_core"
|
||||
version = "0.21.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4"
|
||||
dependencies = [
|
||||
"fnv",
|
||||
"ident_case",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"strsim",
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling_macro"
|
||||
version = "0.14.4"
|
||||
@@ -1321,6 +1384,17 @@ dependencies = [
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling_macro"
|
||||
version = "0.21.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81"
|
||||
dependencies = [
|
||||
"darling_core 0.21.3",
|
||||
"quote",
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dashmap"
|
||||
version = "5.5.3"
|
||||
@@ -1600,6 +1674,27 @@ dependencies = [
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_more"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d751e9e49156b02b44f9c1815bcb94b984cdcc4396ecc32521c739452808b134"
|
||||
dependencies = [
|
||||
"derive_more-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_more-impl"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "799a97264921d8623a957f6c3b9011f3b5492f557bbb7a5a19b7fa6d06ba8dcb"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"rustc_version",
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "device_tree"
|
||||
version = "1.1.0"
|
||||
@@ -1726,6 +1821,18 @@ version = "1.0.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555"
|
||||
|
||||
[[package]]
|
||||
name = "educe"
|
||||
version = "0.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d7bc049e1bd8cdeb31b68bbd586a9464ecf9f3944af3958a7a9d0f8b9799417"
|
||||
dependencies = [
|
||||
"enum-ordinalize",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.15.0"
|
||||
@@ -1774,6 +1881,26 @@ dependencies = [
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "enum-ordinalize"
|
||||
version = "4.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4a1091a7bb1f8f2c4b28f1fe2cef4980ca2d410a3d727d67ecc3178c9b0800f0"
|
||||
dependencies = [
|
||||
"enum-ordinalize-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "enum-ordinalize-derive"
|
||||
version = "4.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8ca9601fb2d62598ee17836250842873a413586e5d7ed88b356e38ddbb0ec631"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "enumflags2"
|
||||
version = "0.7.12"
|
||||
@@ -2338,6 +2465,18 @@ version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
|
||||
|
||||
[[package]]
|
||||
name = "gloo-timers"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bbb143cf96099802033e0d4f4963b19fd2e0b728bcf076cd9cf7f6634f092994"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "go-flag"
|
||||
version = "0.1.0"
|
||||
@@ -2406,6 +2545,8 @@ version = "0.15.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
|
||||
dependencies = [
|
||||
"allocator-api2",
|
||||
"equivalent",
|
||||
"foldhash",
|
||||
]
|
||||
|
||||
@@ -2506,6 +2647,17 @@ dependencies = [
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hostname"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "617aaa3557aef3810a6369d0a99fac8a080891b68bd9f9812a1eeda0c0730cbd"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.4",
|
||||
"libc",
|
||||
"windows-link",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "http"
|
||||
version = "0.2.12"
|
||||
@@ -2643,7 +2795,9 @@ dependencies = [
|
||||
"http 1.4.0",
|
||||
"hyper 1.8.1",
|
||||
"hyper-util",
|
||||
"log",
|
||||
"rustls",
|
||||
"rustls-native-certs",
|
||||
"rustls-pki-types",
|
||||
"tokio",
|
||||
"tokio-rustls",
|
||||
@@ -2662,6 +2816,19 @@ dependencies = [
|
||||
"tokio-io-timeout",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-timeout"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0"
|
||||
dependencies = [
|
||||
"hyper 1.8.1",
|
||||
"hyper-util",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-tls"
|
||||
version = "0.6.0"
|
||||
@@ -3127,6 +3294,19 @@ dependencies = [
|
||||
"thiserror 1.0.69",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jsonpath-rust"
|
||||
version = "0.7.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c00ae348f9f8fd2d09f82a98ca381c60df9e0820d8d79fce43e649b4dc3128b"
|
||||
dependencies = [
|
||||
"pest",
|
||||
"pest_derive",
|
||||
"regex",
|
||||
"serde_json",
|
||||
"thiserror 2.0.18",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jsonptr"
|
||||
version = "0.4.7"
|
||||
@@ -3201,6 +3381,18 @@ dependencies = [
|
||||
"tonic-build 0.8.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "k8s-openapi"
|
||||
version = "0.26.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "06d9e5e61dd037cdc51da0d7e2b2be10f497478ea7e120d85dad632adb99882b"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"chrono",
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kata-agent"
|
||||
version = "0.1.0"
|
||||
@@ -3285,6 +3477,28 @@ dependencies = [
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kata-deploy"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
"env_logger",
|
||||
"k8s-openapi",
|
||||
"kube",
|
||||
"libc",
|
||||
"log",
|
||||
"regex",
|
||||
"rstest",
|
||||
"serde_json",
|
||||
"serde_yaml 0.9.34+deprecated",
|
||||
"serial_test 0.10.0",
|
||||
"tempfile",
|
||||
"tokio",
|
||||
"toml_edit 0.22.27",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kata-sys-util"
|
||||
version = "0.1.0"
|
||||
@@ -3307,6 +3521,8 @@ dependencies = [
|
||||
"slog",
|
||||
"slog-scope",
|
||||
"subprocess",
|
||||
"tempfile",
|
||||
"test-utils",
|
||||
"thiserror 1.0.69",
|
||||
]
|
||||
|
||||
@@ -3325,6 +3541,7 @@ dependencies = [
|
||||
"num_cpus",
|
||||
"oci-spec 0.8.4",
|
||||
"regex",
|
||||
"rstest",
|
||||
"safe-path 0.1.0",
|
||||
"serde",
|
||||
"serde-enum-str",
|
||||
@@ -3334,6 +3551,8 @@ dependencies = [
|
||||
"slog-scope",
|
||||
"sysctl",
|
||||
"sysinfo",
|
||||
"tempfile",
|
||||
"test-utils",
|
||||
"thiserror 1.0.69",
|
||||
"toml",
|
||||
]
|
||||
@@ -3358,6 +3577,115 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kube"
|
||||
version = "2.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "48e7bb0b6a46502cc20e4575b6ff401af45cfea150b34ba272a3410b78aa014e"
|
||||
dependencies = [
|
||||
"k8s-openapi",
|
||||
"kube-client",
|
||||
"kube-core",
|
||||
"kube-derive",
|
||||
"kube-runtime",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kube-client"
|
||||
version = "2.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4987d57a184d2b5294fdad3d7fc7f278899469d21a4da39a8f6ca16426567a36"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"bytes 1.11.1",
|
||||
"chrono",
|
||||
"either",
|
||||
"futures",
|
||||
"home",
|
||||
"http 1.4.0",
|
||||
"http-body 1.0.1",
|
||||
"http-body-util",
|
||||
"hyper 1.8.1",
|
||||
"hyper-rustls",
|
||||
"hyper-timeout 0.5.2",
|
||||
"hyper-util",
|
||||
"jsonpath-rust",
|
||||
"k8s-openapi",
|
||||
"kube-core",
|
||||
"pem",
|
||||
"rustls",
|
||||
"secrecy",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_yaml 0.9.34+deprecated",
|
||||
"thiserror 2.0.18",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tower 0.5.3",
|
||||
"tower-http",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kube-core"
|
||||
version = "2.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "914bbb770e7bb721a06e3538c0edd2babed46447d128f7c21caa68747060ee73"
|
||||
dependencies = [
|
||||
"chrono",
|
||||
"derive_more",
|
||||
"form_urlencoded",
|
||||
"http 1.4.0",
|
||||
"json-patch 4.1.0",
|
||||
"k8s-openapi",
|
||||
"schemars",
|
||||
"serde",
|
||||
"serde-value",
|
||||
"serde_json",
|
||||
"thiserror 2.0.18",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kube-derive"
|
||||
version = "2.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "03dee8252be137772a6ab3508b81cd797dee62ee771112a2453bc85cbbe150d2"
|
||||
dependencies = [
|
||||
"darling 0.21.3",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kube-runtime"
|
||||
version = "2.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6aea4de4b562c5cc89ab10300bb63474ae1fa57ff5a19275f2e26401a323e3fd"
|
||||
dependencies = [
|
||||
"ahash 0.8.12",
|
||||
"async-broadcast 0.7.2",
|
||||
"async-stream",
|
||||
"backon",
|
||||
"educe",
|
||||
"futures",
|
||||
"hashbrown 0.15.5",
|
||||
"hostname",
|
||||
"json-patch 4.1.0",
|
||||
"k8s-openapi",
|
||||
"kube-client",
|
||||
"parking_lot",
|
||||
"pin-project",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"thiserror 2.0.18",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kvm-bindings"
|
||||
version = "0.14.0"
|
||||
@@ -3539,6 +3867,7 @@ dependencies = [
|
||||
"slog-json",
|
||||
"slog-scope",
|
||||
"slog-term",
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3584,11 +3913,16 @@ version = "0.2.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
"lazy_static",
|
||||
"maplit",
|
||||
"nix 0.30.1",
|
||||
"once_cell",
|
||||
"page_size",
|
||||
"slog",
|
||||
"slog-async",
|
||||
"slog-scope",
|
||||
"slog-term",
|
||||
"test-utils",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
@@ -4493,6 +4827,15 @@ dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ordered-float"
|
||||
version = "2.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ordered-multimap"
|
||||
version = "0.4.3"
|
||||
@@ -4602,6 +4945,16 @@ dependencies = [
|
||||
"quote",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pem"
|
||||
version = "3.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d30c53c26bc5b31a98cd02d20f25a7c8567146caf63ed593a9d87b2775291be"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"serde_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "percent-encoding"
|
||||
version = "2.3.2"
|
||||
@@ -4621,6 +4974,49 @@ dependencies = [
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pest"
|
||||
version = "2.8.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e0848c601009d37dfa3430c4666e147e49cdcf1b92ecd3e63657d8a5f19da662"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
"ucd-trie",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pest_derive"
|
||||
version = "2.8.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "11f486f1ea21e6c10ed15d5a7c77165d0ee443402f0780849d1768e7d9d6fe77"
|
||||
dependencies = [
|
||||
"pest",
|
||||
"pest_generator",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pest_generator"
|
||||
version = "2.8.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8040c4647b13b210a963c1ed407c1ff4fdfa01c31d6d2a098218702e6664f94f"
|
||||
dependencies = [
|
||||
"pest",
|
||||
"pest_meta",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pest_meta"
|
||||
version = "2.8.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "89815c69d36021a140146f26659a81d6c2afa33d216d736dd4be5381a7362220"
|
||||
dependencies = [
|
||||
"pest",
|
||||
"sha2 0.10.9",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "petgraph"
|
||||
version = "0.5.1"
|
||||
@@ -5824,7 +6220,6 @@ dependencies = [
|
||||
"protobuf",
|
||||
"protocols",
|
||||
"resource",
|
||||
"rstest",
|
||||
"runtime-spec",
|
||||
"serde_json",
|
||||
"shim-interface",
|
||||
@@ -5975,7 +6370,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4"
|
||||
dependencies = [
|
||||
"aws-lc-rs",
|
||||
"log",
|
||||
"once_cell",
|
||||
"ring",
|
||||
"rustls-pki-types",
|
||||
"rustls-webpki",
|
||||
"subtle",
|
||||
@@ -6074,6 +6471,7 @@ name = "safe-path"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6120,10 +6518,23 @@ checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc"
|
||||
dependencies = [
|
||||
"dyn-clone",
|
||||
"ref-cast",
|
||||
"schemars_derive",
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "schemars_derive"
|
||||
version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7d115b50f4aaeea07e79c1912f645c7513d81715d0420f8bc77a18c6260b307f"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"serde_derive_internals",
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "scientific"
|
||||
version = "0.5.3"
|
||||
@@ -6165,6 +6576,15 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "secrecy"
|
||||
version = "0.10.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e891af845473308773346dc847b2c23ee78fe442e0472ac50e22a18a93d3ae5a"
|
||||
dependencies = [
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "security-framework"
|
||||
version = "3.7.0"
|
||||
@@ -6244,6 +6664,16 @@ version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b8a059d895f1a31dd928f40abbea4e7177e3d8ff3aa4152fdb7a396ae1ef63a3"
|
||||
|
||||
[[package]]
|
||||
name = "serde-value"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f3a1a3341211875ef120e117ea7fd5228530ae7e7036a779fdc9117be6b3282c"
|
||||
dependencies = [
|
||||
"ordered-float 2.10.1",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_core"
|
||||
version = "1.0.228"
|
||||
@@ -6264,6 +6694,17 @@ dependencies = [
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive_internals"
|
||||
version = "0.29.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_ignored"
|
||||
version = "0.1.14"
|
||||
@@ -6497,6 +6938,8 @@ dependencies = [
|
||||
"kata-sys-util",
|
||||
"kata-types",
|
||||
"nix 0.26.4",
|
||||
"tempfile",
|
||||
"test-utils",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
@@ -7016,7 +7459,7 @@ dependencies = [
|
||||
"byteorder",
|
||||
"integer-encoding",
|
||||
"log",
|
||||
"ordered-float",
|
||||
"ordered-float 1.1.1",
|
||||
"threadpool",
|
||||
]
|
||||
|
||||
@@ -7164,6 +7607,7 @@ dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
"pin-project-lite",
|
||||
"slab",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
@@ -7228,6 +7672,18 @@ dependencies = [
|
||||
"winnow 0.5.40",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_edit"
|
||||
version = "0.22.27"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a"
|
||||
dependencies = [
|
||||
"indexmap 2.13.0",
|
||||
"toml_datetime 0.6.11",
|
||||
"toml_write",
|
||||
"winnow 0.7.15",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_edit"
|
||||
version = "0.25.4+spec-1.1.0"
|
||||
@@ -7249,6 +7705,12 @@ dependencies = [
|
||||
"winnow 0.7.15",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_write"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801"
|
||||
|
||||
[[package]]
|
||||
name = "tonic"
|
||||
version = "0.9.2"
|
||||
@@ -7265,7 +7727,7 @@ dependencies = [
|
||||
"http 0.2.12",
|
||||
"http-body 0.4.6",
|
||||
"hyper 0.14.32",
|
||||
"hyper-timeout",
|
||||
"hyper-timeout 0.4.1",
|
||||
"percent-encoding",
|
||||
"pin-project",
|
||||
"prost 0.11.9",
|
||||
@@ -7334,8 +7796,10 @@ dependencies = [
|
||||
"pin-project-lite",
|
||||
"sync_wrapper 1.0.2",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7344,16 +7808,19 @@ version = "0.6.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"bitflags 2.11.0",
|
||||
"bytes 1.11.1",
|
||||
"futures-util",
|
||||
"http 1.4.0",
|
||||
"http-body 1.0.1",
|
||||
"iri-string",
|
||||
"mime",
|
||||
"pin-project-lite",
|
||||
"tower 0.5.3",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7526,6 +7993,12 @@ version = "1.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
|
||||
|
||||
[[package]]
|
||||
name = "ucd-trie"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971"
|
||||
|
||||
[[package]]
|
||||
name = "uds_windows"
|
||||
version = "1.2.0"
|
||||
|
||||
18
Cargo.toml
18
Cargo.toml
@@ -6,6 +6,17 @@ rust-version = "1.88"
|
||||
|
||||
[workspace]
|
||||
members = [
|
||||
# libs
|
||||
"src/libs/kata-sys-util",
|
||||
"src/libs/kata-types",
|
||||
"src/libs/logging",
|
||||
"src/libs/mem-agent",
|
||||
"src/libs/protocols",
|
||||
"src/libs/runtime-spec",
|
||||
"src/libs/safe-path",
|
||||
"src/libs/shim-interface",
|
||||
"src/libs/test-utils",
|
||||
|
||||
# kata-agent
|
||||
"src/agent",
|
||||
"src/agent/rustjail",
|
||||
@@ -31,6 +42,9 @@ members = [
|
||||
# genpolicy
|
||||
"src/tools/genpolicy",
|
||||
|
||||
# kata-deploy (Kubernetes installer binary)
|
||||
"tools/packaging/kata-deploy/binary",
|
||||
|
||||
# runtime-rs
|
||||
"src/runtime-rs",
|
||||
"src/runtime-rs/crates/agent",
|
||||
@@ -48,10 +62,6 @@ resolver = "2"
|
||||
# TODO: Add all excluded crates to root workspace
|
||||
exclude = [
|
||||
"src/tools",
|
||||
"src/libs",
|
||||
|
||||
# kata-deploy binary is standalone and has its own Cargo.toml for now
|
||||
"tools/packaging/kata-deploy/binary",
|
||||
|
||||
# We are cloning and building rust packages under
|
||||
# "tools/packaging/kata-deploy/local-build/build" folder, which may mislead
|
||||
|
||||
336
docs/how-to/how-to-use-fsmerged-erofs-with-kata.md
Normal file
336
docs/how-to/how-to-use-fsmerged-erofs-with-kata.md
Normal file
@@ -0,0 +1,336 @@
|
||||
# Use EROFS Snapshotter with Kata Containers (runtime-rs)
|
||||
|
||||
## Project Overview
|
||||
|
||||
The [EROFS snapshotter](https://erofs.docs.kernel.org) is a native containerd
|
||||
snapshotter that converts OCI container image layers into EROFS-formatted blobs.
|
||||
When used with Kata Containers `runtime-rs`, the EROFS snapshotter enables
|
||||
**block-level image pass-through** to the guest VM, bypassing virtio-fs / 9p
|
||||
entirely. This delivers lower overhead, better performance, and smaller memory
|
||||
footprints compared to traditional shared-filesystem approaches.
|
||||
|
||||
## Quick Start Guide
|
||||
|
||||
This section provides a quick overview of the steps to get started with EROFS snapshotter and Kata Containers. For detailed instructions, see the [Installation Guide](#installation-guide) section.
|
||||
|
||||
### Quick Steps
|
||||
|
||||
1. **Install erofs-utils**: Install erofs-utils (version >= 1.7) on your host system
|
||||
2. **Configure containerd**: Enable EROFS snapshotter and differ in containerd configuration
|
||||
3. **Configure Kata Containers**: Set up runtime-rs with appropriate hypervisor settings
|
||||
4. **Run a container**: Use `ctr` or Kubernetes to run containers with EROFS snapshotter
|
||||
|
||||
### Prerequisites
|
||||
|
||||
| Component | Version Requirement |
|
||||
|-----------|-------------------|
|
||||
| Linux kernel | >= 5.4 (with `erofs` module) |
|
||||
| erofs-utils | >= 1.7 (>= 1.8 recommended) |
|
||||
| containerd | >= 2.2 (with EROFS snapshotter and differ support) |
|
||||
| Kata Containers | Latest `main` branch with runtime-rs |
|
||||
| QEMU | >= 5.0 (with VMDK flat-extent support; >= 8.0 recommended) |
|
||||
|
||||
## Installation Guide
|
||||
|
||||
This section provides detailed step-by-step instructions for installing and configuring EROFS snapshotter with Kata Containers.
|
||||
|
||||
### Step 1: Install erofs-utils
|
||||
|
||||
```bash
|
||||
# Debian/Ubuntu
|
||||
$ sudo apt install erofs-utils
|
||||
|
||||
# Fedora
|
||||
$ sudo dnf install erofs-utils
|
||||
```
|
||||
|
||||
Verify the version:
|
||||
|
||||
```bash
|
||||
$ mkfs.erofs --version
|
||||
# Should show 1.7 or higher
|
||||
```
|
||||
|
||||
Load the kernel module:
|
||||
|
||||
```bash
|
||||
$ sudo modprobe erofs
|
||||
```
|
||||
|
||||
### Step 2: Configure containerd
|
||||
|
||||
#### Enable the EROFS snapshotter and differ
|
||||
|
||||
Edit your containerd configuration (typically `/etc/containerd/config.toml`):
|
||||
|
||||
```toml
|
||||
version = 3
|
||||
...
|
||||
[plugins.'io.containerd.cri.v1.runtime']
|
||||
...
|
||||
[plugins.'io.containerd.cri.v1.runtime'.containerd]
|
||||
...
|
||||
[plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes]
|
||||
[plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.kata]
|
||||
runtime_type = 'io.containerd.kata.v2'
|
||||
pod_annotations = ["*"]
|
||||
container_annotations = ["*"]
|
||||
privileged_without_host_devices = false
|
||||
sandboxer = 'podsandbox'
|
||||
...
|
||||
|
||||
[plugins.'io.containerd.differ.v1.erofs']
|
||||
mkfs_options = ["-T0", "--mkfs-time", "--sort=none"]
|
||||
enable_tar_index = false
|
||||
|
||||
[plugins.'io.containerd.service.v1.diff-service']
|
||||
default = ['erofs', 'walking']
|
||||
|
||||
[plugins.'io.containerd.snapshotter.v1.erofs']
|
||||
default_size = '<SIZE>' # SIZE=6G or 10G or other size
|
||||
max_unmerged_layers = 1
|
||||
```
|
||||
|
||||
#### Verify the EROFS plugins are loaded
|
||||
|
||||
Check whether the EROFS kernel module is loaded:
|
||||
|
||||
```bash
|
||||
$ lsmod | grep erofs
|
||||
erofs 188416 0
|
||||
netfs 614400 1 erofs
|
||||
```
|
||||
|
||||
If not loaded:
|
||||
|
||||
```bash
|
||||
$ sudo modprobe erofs
|
||||
```
|
||||
|
||||
Restart containerd and check:
|
||||
|
||||
```bash
|
||||
$ sudo systemctl restart containerd
|
||||
$ sudo ctr plugins ls | grep erofs
|
||||
io.containerd.snapshotter.v1 erofs linux/amd64 ok
|
||||
io.containerd.differ.v1 erofs linux/amd64 ok
|
||||
```
|
||||
|
||||
Check the status of the containerd EROFS plugins:
|
||||
|
||||
```bash
|
||||
$ sudo ctr plugins ls | grep erofs
|
||||
io.containerd.mount-handler.v1 erofs linux/amd64 ok
|
||||
io.containerd.snapshotter.v1 erofs linux/amd64 ok
|
||||
io.containerd.differ.v1 erofs linux/amd64 ok
|
||||
```
|
||||
|
||||
Both `snapshotter` and `differ` should show `ok`.
|
||||
|
||||
### Step 3: Configure Kata Containers (runtime-rs)
|
||||
|
||||
Edit the Kata configuration file (e.g.,
|
||||
`configuration-qemu-runtime-rs.toml`):
|
||||
|
||||
```toml
|
||||
[hypervisor.qemu]
|
||||
# shared_fs can be set to "none" since EROFS layers are passed via
|
||||
# block devices, not via virtio-fs. If you still need virtio-fs for
|
||||
# other purposes (e.g., file sharing), keep "virtio-fs".
|
||||
# For pure block-device EROFS mode:
|
||||
shared_fs = "none"
|
||||
```
|
||||
|
||||
> **Note**: The `shared_fs = "none"` setting is for the case where all
|
||||
> container images use the EROFS snapshotter. If you have a mixed environment,
|
||||
> keep `shared_fs = "virtio-fs"` so that non-EROFS containers can still use
|
||||
> virtio-fs.
|
||||
|
||||
|
||||
### Quick Test
|
||||
|
||||
Once the installation is complete, you can quickly test with:
|
||||
|
||||
For example, using `ctr`:
|
||||
|
||||
```bash
|
||||
# Pull the image
|
||||
$ sudo ctr image pull docker.io/library/wordpress:latest
|
||||
|
||||
# Run with EROFS snapshotter and Kata runtime-rs
|
||||
$ sudo ctr run --runtime io.containerd.kata.v2 --snapshotter=erofs --rm -t docker.io/library/wordpress:latest test001 date
|
||||
Wed Apr 1 07:10:53 UTC 2026
|
||||
|
||||
$ sudo ctr run --runtime io.containerd.kata.v2 --snapshotter=erofs --rm -t docker.io/library/wordpress:latest test001 lsblk
|
||||
NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINTS
|
||||
vda 254:0 0 256M 0 disk
|
||||
`-vda1 254:1 0 253M 0 part
|
||||
vdb 254:16 0 6G 0 disk
|
||||
vdc 254:32 0 759.7M 0 disk
|
||||
```
|
||||
|
||||
> **Note**: Ensure that the containerd CRI configuration maps the `kata`
|
||||
> handler to the Kata runtime with `snapshotter = "erofs"` as shown in
|
||||
> [Step 2](#step-2-configure-containerd).
|
||||
|
||||
### Architecture
|
||||
|
||||
The following diagram illustrates the data flow:
|
||||
|
||||
```
|
||||
Host Guest VM
|
||||
==== ========
|
||||
|
||||
containerd kata-agent
|
||||
| |
|
||||
v v
|
||||
EROFS snapshotter 1. mount ext4 /dev/vdX
|
||||
| (writable upper)
|
||||
|-- Mount[0]: ext4 rw layer |
|
||||
| (block device on host) 2. mount erofs /dev/vdY
|
||||
| (read-only lower)
|
||||
|-- Mount[1]: erofs layers |
|
||||
| source: layer.erofs 3. overlay mount
|
||||
| device=extra1.erofs lowerdir=<erofs_mount>
|
||||
| device=extra2.erofs upperdir=<ext4_mount>/upper
|
||||
| workdir=<ext4_mount>/work
|
||||
v |
|
||||
runtime-rs v
|
||||
| container rootfs
|
||||
|-- single erofs: attach as Raw ready
|
||||
|-- multi erofs: generate VMDK
|
||||
| descriptor + attach as Vmdk
|
||||
|
|
||||
v
|
||||
QEMU (virtio-blk)
|
||||
|-- /dev/vdX: ext4 rw layer
|
||||
|-- /dev/vdY: erofs layer(s)
|
||||
```
|
||||
|
||||
### VMDK flat-extent descriptor (multi-layer case)
|
||||
|
||||
VMDK Descriptor Format (twoGbMaxExtentFlat)
|
||||
The descriptor follows the [VMware Virtual Disk Format specification](https://github.com/libyal/libvmdk/blob/main/documentation/VMWare%20Virtual%20Disk%20Format%20(VMDK).asciidoc):
|
||||
|
||||
- Header: `# Disk DescriptorFile` marker and version info
|
||||
- Extent descriptions: `RW <sectors> FLAT "<filename>" <offset>`
|
||||
- `sectors`: number of 512-byte sectors for this extent
|
||||
- `filename`: absolute path to the backing file
|
||||
- `offset`: starting sector offset within the file (0-based)
|
||||
- DDB (Disk Data Base): virtual hardware and geometry metadata
|
||||
|
||||
Files larger than 2GB are automatically split into multiple extents
|
||||
(MAX_2GB_EXTENT_SECTORS per extent) as required by the twoGbMaxExtentFlat format.
|
||||
|
||||
When multiple EROFS layers are merged, `runtime-rs` generates a VMDK
|
||||
descriptor file (`twoGbMaxExtentFlat` format):
|
||||
|
||||
```
|
||||
# Disk DescriptorFile
|
||||
version=1
|
||||
CID=fffffffe
|
||||
parentCID=ffffffff
|
||||
createType="twoGbMaxExtentFlat"
|
||||
|
||||
# Extent description
|
||||
RW 2048 FLAT "/path/to/fsmeta.erofs" 0
|
||||
RW 4096 FLAT "/path/to/layer1.erofs" 0
|
||||
RW 8192 FLAT "/path/to/layer2.erofs" 0
|
||||
|
||||
# The Disk Data Base
|
||||
#DDB
|
||||
|
||||
ddb.virtualHWVersion = "4"
|
||||
ddb.geometry.cylinders = "15"
|
||||
ddb.geometry.heads = "16"
|
||||
ddb.geometry.sectors = "63"
|
||||
ddb.adapterType = "ide"
|
||||
```
|
||||
|
||||
QEMU's VMDK driver reads this descriptor and presents all extents as a
|
||||
single contiguous block device to the guest. The guest kernel's EROFS driver
|
||||
then mounts this combined device with multi-device support.
|
||||
|
||||
### How it works
|
||||
|
||||
The containerd EROFS snapshotter prepares a multi-layer rootfs layout:
|
||||
|
||||
```
|
||||
Mount[0]: ext4 rw layer --> virtio-blk device (writable upper layer)
|
||||
Mount[1]: erofs layers --> virtio-blk device (read-only, via VMDK for multi-extent)
|
||||
Mount[2]: overlay --> guest agent mounts overlay combining upper + lower
|
||||
```
|
||||
|
||||
For the EROFS read-only layers:
|
||||
|
||||
- **Single layer**: The single `.erofs` blob is attached directly as a raw
|
||||
virtio-blk device.
|
||||
- **Multiple layers**: Multiple `.erofs` blobs (the base layer + `device=`
|
||||
extra layers) are merged into a single virtual block device using a VMDK
|
||||
flat-extent descriptor (`twoGbMaxExtentFlat` format). QEMU's VMDK driver
|
||||
parses the descriptor and concatenates all extents transparently.
|
||||
|
||||
Inside the guest VM, the kata-agent:
|
||||
|
||||
1. Mounts the ext4 block device as the writable upper layer.
|
||||
2. Mounts the erofs block device as the read-only lower layer.
|
||||
3. Creates an overlay filesystem combining the two.
|
||||
|
||||
### Verify QEMU VMDK support
|
||||
|
||||
The multi-layer EROFS rootfs relies on QEMU's VMDK block driver to present
|
||||
a VMDK flat-extent descriptor as a single virtual disk. QEMU must be compiled
|
||||
with VMDK format support enabled (this is typically on by default, but some
|
||||
minimal or custom builds may disable it).
|
||||
|
||||
Run the following command to check:
|
||||
|
||||
```bash
|
||||
$ qemu-system-x86_64 -drive format=help 2>&1 | grep vmdk
|
||||
```
|
||||
|
||||
You should see `vmdk` in the `Supported formats` list, for example:
|
||||
|
||||
```
|
||||
Supported formats: blkdebug blklogwrites blkreplay blkverify bochs cloop
|
||||
compress copy-before-write copy-on-read dmg file ftp ftps host_cdrom
|
||||
host_device http https luks nbd null-aio null-co nvme parallels preallocate
|
||||
qcow qcow2 qed quorum raw replication snapshot-access ssh throttle vdi vhdx
|
||||
vmdk vpc vvfat
|
||||
```
|
||||
|
||||
If `vmdk` does not appear, you need to rebuild QEMU with VMDK support enabled.
|
||||
|
||||
#### Build the guest components
|
||||
|
||||
The guest kernel must have `CONFIG_EROFS_FS=y` (or `=m` with the module
|
||||
auto-loaded). The kata-agent in the guest image must include multi-layer
|
||||
EROFS support.
|
||||
|
||||
Refer to [how-to-use-erofs-build-rootfs.md](how-to-use-erofs-build-rootfs.md)
|
||||
for building a guest rootfs with EROFS support.
|
||||
|
||||
### Limitations
|
||||
|
||||
> **Hypervisor support**: The fsmerged EROFS rootfs feature currently **only
|
||||
> supports QEMU** as the hypervisor, because it depends on the VMDK
|
||||
> flat-extent descriptor format for merging multiple EROFS layers into a
|
||||
> single block device. The following hypervisors do **not** support VMDK
|
||||
> format block devices at this time, and therefore **cannot** be used with
|
||||
> fsmerged EROFS rootfs:
|
||||
>
|
||||
> - **Cloud Hypervisor (CLH)** — no VMDK block device support ([WIP](https://github.com/cloud-hypervisor/cloud-hypervisor/issues/7167))
|
||||
> - **Firecracker** — no VMDK block device support ([WIP](https://github.com/firecracker-microvm/firecracker/pull/5741))
|
||||
> - **Dragonball** — no VMDK block device support (TODO)
|
||||
>
|
||||
> For single-layer EROFS (only one `.erofs` blob, no `device=` extra layers),
|
||||
> the blob is attached as a raw block device without a VMDK descriptor. This
|
||||
> mode may work with other hypervisors that support raw virtio-blk devices,
|
||||
> but has not been fully tested.
|
||||
|
||||
## References
|
||||
|
||||
- [EROFS documentation](https://erofs.docs.kernel.org)
|
||||
- [Containerd EROFS snapshotter](https://github.com/containerd/containerd/blob/main/docs/snapshotters/erofs.md)
|
||||
- [Configure Kata to use EROFS build rootfs](how-to-use-erofs-build-rootfs.md)
|
||||
- [Kata Containers architecture](../design/architecture)
|
||||
@@ -4,7 +4,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fs;
|
||||
use std::os::unix::fs::{MetadataExt, PermissionsExt};
|
||||
use std::path::Path;
|
||||
@@ -26,6 +26,7 @@ use self::ephemeral_handler::EphemeralHandler;
|
||||
use self::fs_handler::{OverlayfsHandler, Virtio9pHandler, VirtioFsHandler};
|
||||
use self::image_pull_handler::ImagePullHandler;
|
||||
use self::local_handler::LocalHandler;
|
||||
use self::multi_layer_erofs::{handle_multi_layer_erofs_group, is_multi_layer_storage};
|
||||
use crate::mount::{baremount, is_mounted, remove_mounts};
|
||||
use crate::sandbox::Sandbox;
|
||||
|
||||
@@ -37,6 +38,7 @@ mod ephemeral_handler;
|
||||
mod fs_handler;
|
||||
mod image_pull_handler;
|
||||
mod local_handler;
|
||||
mod multi_layer_erofs;
|
||||
|
||||
const RW_MASK: u32 = 0o660;
|
||||
const RO_MASK: u32 = 0o440;
|
||||
@@ -147,6 +149,7 @@ lazy_static! {
|
||||
#[cfg(target_arch = "s390x")]
|
||||
Arc::new(self::block_handler::VirtioBlkCcwHandler {}),
|
||||
Arc::new(ImagePullHandler {}),
|
||||
Arc::new(self::multi_layer_erofs::MultiLayerErofsHandler {}),
|
||||
];
|
||||
|
||||
for handler in handlers {
|
||||
@@ -157,6 +160,88 @@ lazy_static! {
|
||||
};
|
||||
}
|
||||
|
||||
/// Result of multi-layer storage handling
|
||||
struct MultiLayerProcessResult {
|
||||
/// The primary device created
|
||||
device: Arc<dyn StorageDevice>,
|
||||
/// All mount points that were processed as part of this group
|
||||
processed_mount_points: Vec<String>,
|
||||
}
|
||||
|
||||
/// Handle multi-layer storage by creating the overlay device.
|
||||
/// Returns None if the storage is not a multi-layer storage.
|
||||
/// Returns Some(Ok(result)) if successfully processed.
|
||||
/// Returns Some(Err(e)) if there was an error.
|
||||
async fn handle_multi_layer_storage(
|
||||
logger: &Logger,
|
||||
storage: &Storage,
|
||||
storages: &[Storage],
|
||||
sandbox: &Arc<Mutex<Sandbox>>,
|
||||
cid: &Option<String>,
|
||||
processed_mount_points: &HashSet<String>,
|
||||
) -> Result<Option<MultiLayerProcessResult>> {
|
||||
if !is_multi_layer_storage(storage) {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// Skip if already processed as part of a previous multi-layer group
|
||||
if processed_mount_points.contains(&storage.mount_point) {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
slog::info!(
|
||||
logger,
|
||||
"processing multi-layer EROFS storage";
|
||||
"mount-point" => &storage.mount_point,
|
||||
"source" => &storage.source,
|
||||
"driver" => &storage.driver,
|
||||
"fstype" => &storage.fstype,
|
||||
);
|
||||
|
||||
let result = handle_multi_layer_erofs_group(storage, storages, cid, sandbox, logger).await?;
|
||||
|
||||
// Create device for the mount point
|
||||
let device = new_device(result.mount_point.clone())?;
|
||||
|
||||
Ok(Some(MultiLayerProcessResult {
|
||||
device,
|
||||
processed_mount_points: result.processed_mount_points,
|
||||
}))
|
||||
}
|
||||
|
||||
/// Update sandbox storage with the created device.
|
||||
/// Handles cleanup on failure.
|
||||
async fn update_storage_device(
|
||||
sandbox: &Arc<Mutex<Sandbox>>,
|
||||
mount_point: &str,
|
||||
device: Arc<dyn StorageDevice>,
|
||||
logger: &Logger,
|
||||
) -> Result<()> {
|
||||
if let Err(device) = sandbox
|
||||
.lock()
|
||||
.await
|
||||
.update_sandbox_storage(mount_point, device)
|
||||
{
|
||||
error!(logger, "failed to update device for storage"; "mount-point" => mount_point);
|
||||
if let Err(e) = sandbox
|
||||
.lock()
|
||||
.await
|
||||
.remove_sandbox_storage(mount_point)
|
||||
.await
|
||||
{
|
||||
warn!(logger, "failed to remove dummy sandbox storage"; "error" => ?e);
|
||||
}
|
||||
if let Err(e) = device.cleanup() {
|
||||
error!(logger, "failed to clean state for storage device"; "mount-point" => mount_point, "error" => ?e);
|
||||
}
|
||||
return Err(anyhow!(
|
||||
"failed to update device for storage: {}",
|
||||
mount_point
|
||||
));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// add_storages takes a list of storages passed by the caller, and performs the
|
||||
// associated operations such as waiting for the device to show up, and mount
|
||||
// it to a specific location, according to the type of handler chosen, and for
|
||||
@@ -169,8 +254,54 @@ pub async fn add_storages(
|
||||
cid: Option<String>,
|
||||
) -> Result<Vec<String>> {
|
||||
let mut mount_list = Vec::new();
|
||||
let mut processed_mount_points = HashSet::new();
|
||||
|
||||
for storage in storages {
|
||||
for storage in &storages {
|
||||
// Try multi-layer storage handling first
|
||||
if let Some(result) = handle_multi_layer_storage(
|
||||
&logger,
|
||||
storage,
|
||||
&storages,
|
||||
sandbox,
|
||||
&cid,
|
||||
&processed_mount_points,
|
||||
)
|
||||
.await?
|
||||
{
|
||||
// Register all processed mount points
|
||||
for mp in &result.processed_mount_points {
|
||||
processed_mount_points.insert(mp.clone());
|
||||
}
|
||||
|
||||
// Add sandbox storage for each mount point in the group
|
||||
for mp in &result.processed_mount_points {
|
||||
let state = sandbox
|
||||
.lock()
|
||||
.await
|
||||
.add_sandbox_storage(mp, storage.shared)
|
||||
.await;
|
||||
|
||||
// Only update device for the first occurrence
|
||||
if state.ref_count().await == 1 {
|
||||
update_storage_device(sandbox, mp, result.device.clone(), &logger).await?;
|
||||
}
|
||||
}
|
||||
|
||||
// Add the primary mount point to the list
|
||||
if let Some(path) = result.device.path() {
|
||||
if !path.is_empty() {
|
||||
mount_list.push(path.to_string());
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip if already processed as part of multi-layer group
|
||||
if processed_mount_points.contains(&storage.mount_point) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Standard storage handling
|
||||
let path = storage.mount_point.clone();
|
||||
let state = sandbox
|
||||
.lock()
|
||||
@@ -178,68 +309,48 @@ pub async fn add_storages(
|
||||
.add_sandbox_storage(&path, storage.shared)
|
||||
.await;
|
||||
if state.ref_count().await > 1 {
|
||||
if let Some(path) = state.path() {
|
||||
if !path.is_empty() {
|
||||
mount_list.push(path.to_string());
|
||||
if let Some(p) = state.path() {
|
||||
if !p.is_empty() {
|
||||
mount_list.push(p.to_string());
|
||||
}
|
||||
}
|
||||
// The device already exists.
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(handler) = STORAGE_HANDLERS.handler(&storage.driver) {
|
||||
// Create device using handler
|
||||
let device = if let Some(handler) = STORAGE_HANDLERS.handler(&storage.driver) {
|
||||
let logger =
|
||||
logger.new(o!( "subsystem" => "storage", "storage-type" => storage.driver.clone()));
|
||||
logger.new(o!("subsystem" => "storage", "storage-type" => storage.driver.clone()));
|
||||
let mut ctx = StorageContext {
|
||||
cid: &cid,
|
||||
logger: &logger,
|
||||
sandbox,
|
||||
};
|
||||
|
||||
match handler.create_device(storage, &mut ctx).await {
|
||||
Ok(device) => {
|
||||
match sandbox
|
||||
.lock()
|
||||
.await
|
||||
.update_sandbox_storage(&path, device.clone())
|
||||
{
|
||||
Ok(d) => {
|
||||
if let Some(path) = device.path() {
|
||||
if !path.is_empty() {
|
||||
mount_list.push(path.to_string());
|
||||
}
|
||||
}
|
||||
drop(d);
|
||||
}
|
||||
Err(device) => {
|
||||
error!(logger, "failed to update device for storage");
|
||||
if let Err(e) = sandbox.lock().await.remove_sandbox_storage(&path).await
|
||||
{
|
||||
warn!(logger, "failed to remove dummy sandbox storage {:?}", e);
|
||||
}
|
||||
if let Err(e) = device.cleanup() {
|
||||
error!(
|
||||
logger,
|
||||
"failed to clean state for storage device {}, {}", path, e
|
||||
);
|
||||
}
|
||||
return Err(anyhow!("failed to update device for storage"));
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
error!(logger, "failed to create device for storage, error: {e:?}");
|
||||
if let Err(e) = sandbox.lock().await.remove_sandbox_storage(&path).await {
|
||||
warn!(logger, "failed to remove dummy sandbox storage {e:?}");
|
||||
}
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
handler.create_device(storage.clone(), &mut ctx).await
|
||||
} else {
|
||||
return Err(anyhow!(
|
||||
"Failed to find the storage handler {}",
|
||||
storage.driver
|
||||
));
|
||||
};
|
||||
|
||||
match device {
|
||||
Ok(device) => {
|
||||
update_storage_device(sandbox, &path, device.clone(), &logger).await?;
|
||||
if let Some(p) = device.path() {
|
||||
if !p.is_empty() {
|
||||
mount_list.push(p.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
error!(logger, "failed to create device for storage"; "error" => ?e);
|
||||
if let Err(e) = sandbox.lock().await.remove_sandbox_storage(&path).await {
|
||||
warn!(logger, "failed to remove dummy sandbox storage"; "error" => ?e);
|
||||
}
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
492
src/agent/src/storage/multi_layer_erofs.rs
Normal file
492
src/agent/src/storage/multi_layer_erofs.rs
Normal file
@@ -0,0 +1,492 @@
|
||||
// Copyright (c) 2026 Ant Group
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
//! Multi-layer EROFS storage handler
|
||||
//!
|
||||
//! This handler implements the guest-side processing of multi-layer EROFS rootfs:
|
||||
//! - Storage with X-kata.overlay-upper: ext4 rw layer (upperdir)
|
||||
//! - Storage with X-kata.overlay-lower: erofs layers (lowerdir)
|
||||
//! - Creates overlay to combine them
|
||||
//! - Supports X-kata.mkdir.path options to create directories in upper layer before overlay mount
|
||||
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use kata_sys_util::mount::create_mount_destination;
|
||||
use kata_types::mount::StorageDevice;
|
||||
use protocols::agent::Storage;
|
||||
use regex::Regex;
|
||||
use slog::Logger;
|
||||
use tokio::sync::Mutex;
|
||||
|
||||
use crate::device::BLOCK;
|
||||
use crate::mount::baremount;
|
||||
use crate::sandbox::Sandbox;
|
||||
use crate::storage::{StorageContext, StorageHandler};
|
||||
use crate::uevent::{wait_for_uevent, Uevent, UeventMatcher};
|
||||
|
||||
/// EROFS Type
|
||||
const EROFS_TYPE: &str = "erofs";
|
||||
/// ext4 Type
|
||||
const EXT4_TYPE: &str = "ext4";
|
||||
/// Overlay Type
|
||||
const OVERLAY_TYPE: &str = "overlay";
|
||||
|
||||
/// Driver type for multi-layer EROFS
|
||||
pub const DRIVER_MULTI_LAYER_EROFS: &str = "erofs.multi-layer";
|
||||
|
||||
/// Custom storage option markers
|
||||
const OPT_OVERLAY_UPPER: &str = "X-kata.overlay-upper";
|
||||
const OPT_OVERLAY_LOWER: &str = "X-kata.overlay-lower";
|
||||
const OPT_MULTI_LAYER: &str = "X-kata.multi-layer=true";
|
||||
const OPT_MKDIR_PATH: &str = "X-kata.mkdir.path=";
|
||||
|
||||
#[derive(Debug)]
|
||||
struct VirtioBlkMatcher {
|
||||
rex: Regex,
|
||||
}
|
||||
|
||||
impl VirtioBlkMatcher {
|
||||
fn new(devname: &str) -> Self {
|
||||
let re = format!(r"/virtio[0-9]+/block/{}$", devname);
|
||||
VirtioBlkMatcher {
|
||||
rex: Regex::new(&re).expect("Failed to compile VirtioBlkMatcher regex"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl UeventMatcher for VirtioBlkMatcher {
|
||||
fn is_match(&self, uev: &Uevent) -> bool {
|
||||
uev.subsystem == BLOCK && self.rex.is_match(&uev.devpath) && !uev.devname.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct MultiLayerErofsHandler {}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct MultiLayerErofsResult {
|
||||
pub mount_point: String,
|
||||
pub processed_mount_points: Vec<String>,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug)]
|
||||
struct MkdirDirective {
|
||||
raw_path: String,
|
||||
mode: Option<String>,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl StorageHandler for MultiLayerErofsHandler {
|
||||
fn driver_types(&self) -> &[&str] {
|
||||
&[DRIVER_MULTI_LAYER_EROFS]
|
||||
}
|
||||
|
||||
async fn create_device(
|
||||
&self,
|
||||
storage: Storage,
|
||||
ctx: &mut StorageContext,
|
||||
) -> Result<Arc<dyn StorageDevice>> {
|
||||
// This is called when a single storage has driver="erofs.multi-layer"
|
||||
// For now, treat it as a regular mount point
|
||||
slog::info!(
|
||||
ctx.logger,
|
||||
"multi-layer EROFS handler invoked for single storage";
|
||||
"driver" => &storage.driver,
|
||||
"source" => &storage.source,
|
||||
"fstype" => &storage.fstype,
|
||||
"mount-point" => &storage.mount_point,
|
||||
);
|
||||
|
||||
let path = crate::storage::common_storage_handler(ctx.logger, &storage)?;
|
||||
crate::storage::new_device(path)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_multi_layer_storage(storage: &Storage) -> bool {
|
||||
storage.options.iter().any(|o| o == OPT_MULTI_LAYER)
|
||||
|| storage.driver == DRIVER_MULTI_LAYER_EROFS
|
||||
}
|
||||
|
||||
pub async fn handle_multi_layer_erofs_group(
|
||||
trigger: &Storage,
|
||||
storages: &[Storage],
|
||||
cid: &Option<String>,
|
||||
sandbox: &Arc<Mutex<Sandbox>>,
|
||||
logger: &Logger,
|
||||
) -> Result<MultiLayerErofsResult> {
|
||||
let logger = logger.new(o!(
|
||||
"subsystem" => "multi-layer-erofs",
|
||||
"trigger-mount-point" => trigger.mount_point.clone(),
|
||||
));
|
||||
|
||||
let multi_layer_storages: Vec<&Storage> = storages
|
||||
.iter()
|
||||
.filter(|s| is_multi_layer_storage(s))
|
||||
.collect();
|
||||
|
||||
if multi_layer_storages.is_empty() {
|
||||
return Err(anyhow!("no multi-layer storages found"));
|
||||
}
|
||||
|
||||
let mut ext4_storage: Option<&Storage> = None;
|
||||
let mut erofs_storages: Vec<&Storage> = Vec::new();
|
||||
let mut mkdir_dirs: Vec<MkdirDirective> = Vec::new();
|
||||
|
||||
for storage in &multi_layer_storages {
|
||||
if is_upper_storage(storage) {
|
||||
if ext4_storage.is_some() {
|
||||
return Err(anyhow!(
|
||||
"multi-layer erofs currently supports exactly one ext4 upper layer"
|
||||
));
|
||||
}
|
||||
ext4_storage = Some(*storage);
|
||||
|
||||
// Extract mkdir directories from X-kata.mkdir.path options
|
||||
for opt in &storage.options {
|
||||
if let Some(mkdir_spec) = opt.strip_prefix(OPT_MKDIR_PATH) {
|
||||
mkdir_dirs.push(parse_mkdir_directive(mkdir_spec)?);
|
||||
}
|
||||
}
|
||||
} else if is_lower_storage(storage) {
|
||||
erofs_storages.push(*storage);
|
||||
}
|
||||
}
|
||||
|
||||
let ext4 = ext4_storage
|
||||
.ok_or_else(|| anyhow!("multi-layer erofs missing ext4 upper layer storage"))?;
|
||||
|
||||
if erofs_storages.is_empty() {
|
||||
return Err(anyhow!(
|
||||
"multi-layer erofs missing erofs lower layer storage"
|
||||
));
|
||||
}
|
||||
|
||||
slog::info!(
|
||||
logger,
|
||||
"handling multi-layer erofs group";
|
||||
"ext4-device" => &ext4.source,
|
||||
"erofs-devices" => erofs_storages
|
||||
.iter()
|
||||
.map(|s| s.source.as_str())
|
||||
.collect::<Vec<_>>()
|
||||
.join(","),
|
||||
"mount-point" => &ext4.mount_point,
|
||||
"mkdir-dirs-count" => mkdir_dirs.len(),
|
||||
);
|
||||
|
||||
// Create temporary mount points for upper and lower layers
|
||||
let cid_str = cid.as_deref().unwrap_or("sandbox");
|
||||
let temp_base = PathBuf::from(format!("/run/kata-containers/{}/multi-layer", cid_str));
|
||||
fs::create_dir_all(&temp_base).context("failed to create temp mount base")?;
|
||||
|
||||
let upper_mount = temp_base.join("upper");
|
||||
fs::create_dir_all(&upper_mount).context("failed to create upper mount dir")?;
|
||||
|
||||
wait_and_mount_upper(ext4, &upper_mount, sandbox, &logger).await?;
|
||||
|
||||
for mkdir_dir in &mkdir_dirs {
|
||||
// As {{ mount 1 }} refers to the first lower layer, which is not available until we mount it.
|
||||
// Just skip it for now and handle it in a second pass after mounting the lower layers.
|
||||
if mkdir_dir.raw_path.contains("{{ mount 1 }}") {
|
||||
continue;
|
||||
}
|
||||
let resolved_path = resolve_mkdir_path(&mkdir_dir.raw_path, &upper_mount, None);
|
||||
slog::info!(
|
||||
logger,
|
||||
"creating mkdir directory in upper layer";
|
||||
"raw-path" => &mkdir_dir.raw_path,
|
||||
"resolved-path" => &resolved_path,
|
||||
);
|
||||
|
||||
fs::create_dir_all(&resolved_path)
|
||||
.with_context(|| format!("failed to create mkdir directory: {}", resolved_path))?;
|
||||
}
|
||||
|
||||
let mut lower_mounts = Vec::new();
|
||||
for (index, erofs) in erofs_storages.iter().enumerate() {
|
||||
let lower_mount = temp_base.join(format!("lower-{}", index));
|
||||
fs::create_dir_all(&lower_mount).with_context(|| {
|
||||
format!("failed to create lower mount dir {}", lower_mount.display())
|
||||
})?;
|
||||
|
||||
wait_and_mount_lower(erofs, &lower_mount, sandbox, &logger).await?;
|
||||
lower_mounts.push(lower_mount);
|
||||
}
|
||||
|
||||
// If any mkdir directive refers to {{ mount 1 }}, resolve it now using the first lower mount.
|
||||
// This matches current supported placeholder behavior without inventing a broader template scheme.
|
||||
for mkdir_dir in &mkdir_dirs {
|
||||
if mkdir_dir.raw_path.contains("{{ mount 1 }}") {
|
||||
let first_lower = lower_mounts
|
||||
.first()
|
||||
.ok_or_else(|| anyhow!("lower mount is missing while resolving mkdir path"))?;
|
||||
let resolved_path =
|
||||
resolve_mkdir_path(&mkdir_dir.raw_path, &upper_mount, Some(first_lower));
|
||||
slog::info!(
|
||||
logger,
|
||||
"creating deferred mkdir directory";
|
||||
"raw-path" => &mkdir_dir.raw_path,
|
||||
"resolved-path" => &resolved_path,
|
||||
);
|
||||
|
||||
fs::create_dir_all(&resolved_path).with_context(|| {
|
||||
format!(
|
||||
"failed to create deferred mkdir directory: {}",
|
||||
resolved_path
|
||||
)
|
||||
})?;
|
||||
}
|
||||
}
|
||||
|
||||
let upperdir = upper_mount.join("upper");
|
||||
let workdir = upper_mount.join("work");
|
||||
|
||||
if !upperdir.exists() {
|
||||
fs::create_dir_all(&upperdir).context("failed to create upperdir")?;
|
||||
}
|
||||
fs::create_dir_all(&workdir).context("failed to create workdir")?;
|
||||
|
||||
let lowerdir = lower_mounts
|
||||
.iter()
|
||||
.map(|p| p.display().to_string())
|
||||
.collect::<Vec<_>>()
|
||||
.join(":");
|
||||
|
||||
slog::info!(
|
||||
logger,
|
||||
"creating overlay mount";
|
||||
"upperdir" => upperdir.display(),
|
||||
"lowerdir" => &lowerdir,
|
||||
"workdir" => workdir.display(),
|
||||
"target" => &ext4.mount_point,
|
||||
);
|
||||
|
||||
create_mount_destination(
|
||||
Path::new(OVERLAY_TYPE),
|
||||
Path::new(&ext4.mount_point),
|
||||
"",
|
||||
OVERLAY_TYPE,
|
||||
)
|
||||
.context("failed to create overlay mount destination")?;
|
||||
|
||||
let overlay_options = format!(
|
||||
"upperdir={},lowerdir={},workdir={}",
|
||||
upperdir.display(),
|
||||
lowerdir,
|
||||
workdir.display()
|
||||
);
|
||||
|
||||
baremount(
|
||||
Path::new(OVERLAY_TYPE),
|
||||
Path::new(&ext4.mount_point),
|
||||
OVERLAY_TYPE,
|
||||
nix::mount::MsFlags::empty(),
|
||||
&overlay_options,
|
||||
&logger,
|
||||
)
|
||||
.context("failed to mount overlay")?;
|
||||
|
||||
slog::info!(
|
||||
logger,
|
||||
"multi-layer erofs overlay mounted successfully";
|
||||
"mount-point" => &ext4.mount_point,
|
||||
);
|
||||
|
||||
// Collect all unique mount points to maintain a clean resource state.
|
||||
//
|
||||
// In multi-layer EROFS configurations, upper and lower storages may share
|
||||
// the same mount point.
|
||||
// We must deduplicate these entries before processing to prevent:
|
||||
// 1. Double-incrementing sandbox refcounts for the same resource.
|
||||
// 2. Redundant bookkeeping operations that could lead to state inconsistency.
|
||||
//
|
||||
// Note: We maintain the original order of insertion, which is essential for
|
||||
// ensuring a predictable and correct sequence during resource cleanup.
|
||||
let processed_mount_points = multi_layer_storages.iter().fold(Vec::new(), |mut acc, s| {
|
||||
if !acc.contains(&s.mount_point) {
|
||||
acc.push(s.mount_point.clone());
|
||||
}
|
||||
acc
|
||||
});
|
||||
|
||||
Ok(MultiLayerErofsResult {
|
||||
mount_point: ext4.mount_point.clone(),
|
||||
processed_mount_points,
|
||||
})
|
||||
}
|
||||
|
||||
fn is_upper_storage(storage: &Storage) -> bool {
|
||||
storage.options.iter().any(|o| o == OPT_OVERLAY_UPPER)
|
||||
|| (storage.fstype == EXT4_TYPE && storage.options.iter().any(|o| o == OPT_MULTI_LAYER))
|
||||
}
|
||||
|
||||
fn is_lower_storage(storage: &Storage) -> bool {
|
||||
storage.options.iter().any(|o| o == OPT_OVERLAY_LOWER)
|
||||
|| (storage.fstype == EROFS_TYPE && storage.options.iter().any(|o| o == OPT_MULTI_LAYER))
|
||||
}
|
||||
|
||||
fn parse_mkdir_directive(spec: &str) -> Result<MkdirDirective> {
|
||||
let parts: Vec<&str> = spec.splitn(2, ':').collect();
|
||||
if parts.is_empty() || parts[0].is_empty() {
|
||||
return Err(anyhow!("invalid X-kata.mkdir.path directive: '{}'", spec));
|
||||
}
|
||||
|
||||
Ok(MkdirDirective {
|
||||
raw_path: parts[0].to_string(),
|
||||
mode: parts.get(1).map(|s| s.to_string()),
|
||||
})
|
||||
}
|
||||
|
||||
/// Expand the mount placeholders embedded in a mkdir directive path.
///
/// Every `{{ mount 0 }}` is replaced with `upper_mount`. When
/// `first_lower_mount` is provided, every `{{ mount 1 }}` is replaced with it;
/// otherwise that placeholder is left untouched. A mount path that is not
/// valid UTF-8 is substituted as an empty string.
fn resolve_mkdir_path(
    raw_path: &str,
    upper_mount: &Path,
    first_lower_mount: Option<&Path>,
) -> String {
    let upper = upper_mount.to_str().unwrap_or("");
    let with_upper = raw_path.replace("{{ mount 0 }}", upper);

    match first_lower_mount {
        Some(lower) => with_upper.replace("{{ mount 1 }}", lower.to_str().unwrap_or("")),
        None => with_upper,
    }
}
|
||||
|
||||
async fn wait_and_mount_upper(
|
||||
ext4: &Storage,
|
||||
upper_mount: &Path,
|
||||
sandbox: &Arc<Mutex<Sandbox>>,
|
||||
logger: &Logger,
|
||||
) -> Result<()> {
|
||||
let ext4_devname = extract_device_name(&ext4.source)?;
|
||||
slog::info!(
|
||||
logger,
|
||||
"waiting for ext4 block device to be ready";
|
||||
"device" => &ext4.source,
|
||||
"devname" => &ext4_devname,
|
||||
);
|
||||
|
||||
let matcher = VirtioBlkMatcher::new(&ext4_devname);
|
||||
wait_for_uevent(sandbox, matcher)
|
||||
.await
|
||||
.context("timeout waiting for ext4 block device")?;
|
||||
|
||||
slog::info!(
|
||||
logger,
|
||||
"mounting ext4 upper layer";
|
||||
"device" => &ext4.source,
|
||||
"fstype" => &ext4.fstype,
|
||||
"mount-point" => upper_mount.display(),
|
||||
"options" => ext4.options.join(","),
|
||||
);
|
||||
|
||||
create_mount_destination(Path::new(&ext4.source), upper_mount, "", &ext4.fstype)
|
||||
.context("failed to create upper mount destination")?;
|
||||
|
||||
// Filter out X-kata.* custom options before mount
|
||||
// These are metadata markers, not actual mount options
|
||||
let mount_options: Vec<String> = ext4
|
||||
.options
|
||||
.iter()
|
||||
.filter(|o| !o.starts_with("X-kata."))
|
||||
.cloned()
|
||||
.collect();
|
||||
|
||||
slog::info!(
|
||||
logger,
|
||||
"filtered ext4 mount options";
|
||||
"original-options" => ext4.options.join(","),
|
||||
"mount-options" => mount_options.join(","),
|
||||
);
|
||||
|
||||
let (flags, options) = kata_sys_util::mount::parse_mount_options(&mount_options)?;
|
||||
baremount(
|
||||
Path::new(&ext4.source),
|
||||
upper_mount,
|
||||
&ext4.fstype,
|
||||
flags,
|
||||
options.as_str(),
|
||||
logger,
|
||||
)
|
||||
.context("failed to mount ext4 upper layer")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn wait_and_mount_lower(
|
||||
erofs: &Storage,
|
||||
lower_mount: &Path,
|
||||
sandbox: &Arc<Mutex<Sandbox>>,
|
||||
logger: &Logger,
|
||||
) -> Result<()> {
|
||||
let erofs_devname = extract_device_name(&erofs.source)?;
|
||||
slog::info!(
|
||||
logger,
|
||||
"waiting for erofs block device to be ready";
|
||||
"device" => &erofs.source,
|
||||
"devname" => &erofs_devname,
|
||||
);
|
||||
|
||||
let matcher = VirtioBlkMatcher::new(&erofs_devname);
|
||||
wait_for_uevent(sandbox, matcher)
|
||||
.await
|
||||
.context("timeout waiting for erofs block device")?;
|
||||
|
||||
slog::info!(
|
||||
logger,
|
||||
"mounting erofs lower layer";
|
||||
"device" => &erofs.source,
|
||||
"mount-point" => lower_mount.display(),
|
||||
);
|
||||
|
||||
create_mount_destination(Path::new(&erofs.source), lower_mount, "", EROFS_TYPE)
|
||||
.context("failed to create lower mount destination")?;
|
||||
|
||||
baremount(
|
||||
Path::new(&erofs.source),
|
||||
lower_mount,
|
||||
EROFS_TYPE,
|
||||
nix::mount::MsFlags::MS_RDONLY,
|
||||
"ro",
|
||||
logger,
|
||||
)
|
||||
.context("failed to mount erofs lower layer")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Extract device name from a device path
|
||||
///
|
||||
/// Examples:
|
||||
/// - "/dev/vda" -> "vda"
|
||||
/// - "/dev/vdb" -> "vdb"
|
||||
fn extract_device_name(device_path: &str) -> Result<String> {
|
||||
device_path
|
||||
.strip_prefix("/dev/")
|
||||
.map(|s| s.to_string())
|
||||
.ok_or_else(|| anyhow!("device path '{}' must start with /dev/", device_path))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// The handler advertises exactly the multi-layer erofs driver type.
    #[test]
    fn test_driver_types() {
        let handler = MultiLayerErofsHandler {};
        assert_eq!(handler.driver_types(), &[DRIVER_MULTI_LAYER_EROFS]);
    }

    /// Pin the literal option strings checked by the assertions below.
    #[test]
    fn test_constants() {
        assert_eq!(OPT_OVERLAY_UPPER, "X-kata.overlay-upper");
        assert_eq!(OPT_OVERLAY_LOWER, "X-kata.overlay-lower");
        assert_eq!(OPT_MULTI_LAYER, "X-kata.multi-layer=true");
        assert_eq!(OPT_MKDIR_PATH, "X-kata.mkdir.path=");
    }

    /// A directive splits at the first ':' only and needs a non-empty path.
    #[test]
    fn test_parse_mkdir_directive() {
        let with_mode = parse_mkdir_directive("{{ mount 0 }}/data:0755").unwrap();
        assert_eq!(with_mode.raw_path, "{{ mount 0 }}/data");
        assert_eq!(with_mode.mode.as_deref(), Some("0755"));

        let without_mode = parse_mkdir_directive("/data").unwrap();
        assert_eq!(without_mode.raw_path, "/data");
        assert!(without_mode.mode.is_none());

        let extra_colon = parse_mkdir_directive("/data:a:b").unwrap();
        assert_eq!(extra_colon.raw_path, "/data");
        assert_eq!(extra_colon.mode.as_deref(), Some("a:b"));

        assert!(parse_mkdir_directive("").is_err());
        assert!(parse_mkdir_directive(":0755").is_err());
    }

    /// Placeholders are expanded; `{{ mount 1 }}` only when a lower mount is given.
    #[test]
    fn test_resolve_mkdir_path() {
        let upper = std::path::Path::new("/run/upper");
        let lower = std::path::Path::new("/run/lower");

        assert_eq!(
            resolve_mkdir_path("{{ mount 0 }}/a", upper, None),
            "/run/upper/a"
        );
        assert_eq!(
            resolve_mkdir_path("{{ mount 1 }}/b", upper, Some(lower)),
            "/run/lower/b"
        );
        assert_eq!(
            resolve_mkdir_path("{{ mount 1 }}/b", upper, None),
            "{{ mount 1 }}/b"
        );
        assert_eq!(resolve_mkdir_path("/plain", upper, Some(lower)), "/plain");
    }

    /// Only paths under `/dev/` yield a device name.
    #[test]
    fn test_extract_device_name() {
        assert_eq!(extract_device_name("/dev/vda").unwrap(), "vda");
        assert_eq!(extract_device_name("/dev/vdb").unwrap(), "vdb");
        assert!(extract_device_name("vda").is_err());
        assert!(extract_device_name("").is_err());
    }
}
|
||||
@@ -1,13 +0,0 @@
|
||||
[workspace]
|
||||
members = [
|
||||
"kata-sys-util",
|
||||
"kata-types",
|
||||
"logging",
|
||||
"mem-agent",
|
||||
"protocols",
|
||||
"runtime-spec",
|
||||
"safe-path",
|
||||
"shim-interface",
|
||||
"test-utils",
|
||||
]
|
||||
resolver = "2"
|
||||
@@ -11,6 +11,17 @@ ifeq ($(USERID), 0)
|
||||
override EXTRA_TEST_FLAGS = --ignored
|
||||
endif
|
||||
|
||||
LIBS := \
|
||||
-p kata-sys-util \
|
||||
-p kata-types \
|
||||
-p logging \
|
||||
-p mem-agent \
|
||||
-p protocols \
|
||||
-p runtime-spec \
|
||||
-p safe-path \
|
||||
-p shim-interface \
|
||||
-p test-utils
|
||||
|
||||
default: build
|
||||
|
||||
build:
|
||||
@@ -23,13 +34,13 @@ check: clippy format
|
||||
|
||||
clippy:
|
||||
@echo "INFO: cargo clippy..."
|
||||
cargo clippy --all-targets --all-features --release \
|
||||
cargo clippy $(LIBS) --all-features --release \
|
||||
-- \
|
||||
-D warnings
|
||||
|
||||
format:
|
||||
@echo "INFO: cargo fmt..."
|
||||
cargo fmt -- --check
|
||||
cargo fmt $(LIBS) -- --check
|
||||
|
||||
clean:
|
||||
cargo clean
|
||||
@@ -38,8 +49,8 @@ clean:
|
||||
# See the `test_logger_levels()` test for further information.
|
||||
test:
|
||||
@echo "INFO: testing libraries for development build"
|
||||
cargo test --all $(EXTRA_RUSTFEATURES) -- --nocapture $(EXTRA_TEST_FLAGS)
|
||||
cargo test $(LIBS) $(EXTRA_RUSTFEATURES) -- --nocapture $(EXTRA_TEST_FLAGS)
|
||||
@echo "INFO: testing libraries for release build"
|
||||
cargo test --release --all $(EXTRA_RUSTFEATURES) -- --nocapture $(EXTRA_TEST_FLAGS)
|
||||
cargo test --release $(LIBS) $(EXTRA_RUSTFEATURES) -- --nocapture $(EXTRA_TEST_FLAGS)
|
||||
|
||||
.PHONY: install vendor
|
||||
|
||||
@@ -137,16 +137,12 @@ ifeq ($(ARCH), aarch64)
|
||||
EDK2_NAME := aavmf
|
||||
endif
|
||||
|
||||
# Set firmware paths from QEMUFW/QEMUFWVOL if defined
|
||||
# Set firmware path from QEMUFW if defined
|
||||
FIRMWAREPATH :=
|
||||
FIRMWAREVOLUMEPATH :=
|
||||
ifneq (,$(QEMUCMD))
|
||||
ifneq (,$(QEMUFW))
|
||||
FIRMWAREPATH := $(PREFIXDEPS)/share/$(EDK2_NAME)/$(QEMUFW)
|
||||
endif
|
||||
ifneq (,$(QEMUFWVOL))
|
||||
FIRMWAREVOLUMEPATH := $(PREFIXDEPS)/share/$(EDK2_NAME)/$(QEMUFWVOL)
|
||||
endif
|
||||
endif
|
||||
|
||||
KERNELVERITYPARAMS ?= ""
|
||||
@@ -157,7 +153,6 @@ FIRMWARETDVFPATH := $(PREFIXDEPS)/share/ovmf/OVMF.inteltdx.fd
|
||||
|
||||
# SEV-SNP
|
||||
FIRMWARE_SNP_PATH := $(PREFIXDEPS)/share/ovmf/AMDSEV.fd
|
||||
FIRMWARE_VOLUME_SNP_PATH :=
|
||||
|
||||
##VAR DEFVCPUS=<number> Default number of vCPUs
|
||||
DEFVCPUS := 1
|
||||
@@ -204,7 +199,6 @@ DEFVIRTIOFSQUEUESIZE ?= 1024
|
||||
# Make sure you quote args.
|
||||
DEFVIRTIOFSEXTRAARGS ?= [\"--thread-pool-size=1\", \"-o\", \"announce_submounts\"]
|
||||
DEFENABLEIOTHREADS := false
|
||||
DEFINDEPIOTHREADS := 0
|
||||
DEFENABLEVHOSTUSERSTORE := false
|
||||
DEFVHOSTUSERSTOREPATH := $(PKGRUNDIR)/vhost-user
|
||||
DEFVALIDVHOSTUSERSTOREPATHS := [\"$(DEFVHOSTUSERSTOREPATH)\"]
|
||||
@@ -222,7 +216,6 @@ DEFCREATECONTAINERTIMEOUT ?= 30
|
||||
DEFCREATECONTAINERTIMEOUT_COCO ?= 60
|
||||
DEFSTATICRESOURCEMGMT_COCO = true
|
||||
DEFDISABLEIMAGENVDIMM ?= false
|
||||
DEFPODRESOURCEAPISOCK := ""
|
||||
|
||||
SED = sed
|
||||
CLI_DIR = cmd
|
||||
@@ -412,9 +405,6 @@ endif
|
||||
# Most users will want to set this to "on,obsolete=deny,spawn=deny,resourcecontrol=deny"
|
||||
# for better security. Note: "elevateprivileges=deny" doesn't work with daemonize option.
|
||||
DEFSECCOMPSANDBOXPARAM := ""
|
||||
# Default is empty string "" to match Rust default None (when commented out in config).
|
||||
# Most users will want to set this to "system_u:system_r:container_t" for SELinux support.
|
||||
DEFGUESTSELINUXLABEL := ""
|
||||
endif
|
||||
|
||||
ifneq (,$(FCCMD))
|
||||
@@ -524,7 +514,6 @@ USER_VARS += KERNELPATH_COCO
|
||||
USER_VARS += KERNELPATH
|
||||
USER_VARS += KERNELVIRTIOFSPATH
|
||||
USER_VARS += FIRMWAREPATH
|
||||
USER_VARS += FIRMWAREVOLUMEPATH
|
||||
USER_VARS += MACHINEACCELERATORS
|
||||
USER_VARS += CPUFEATURES
|
||||
USER_VARS += DEFMACHINETYPE_CLH
|
||||
@@ -584,9 +573,7 @@ USER_VARS += DEFVIRTIOFSEXTRAARGS
|
||||
USER_VARS += DEFENABLEANNOTATIONS
|
||||
USER_VARS += DEFENABLEANNOTATIONS_COCO
|
||||
USER_VARS += DEFENABLEIOTHREADS
|
||||
USER_VARS += DEFINDEPIOTHREADS
|
||||
USER_VARS += DEFSECCOMPSANDBOXPARAM
|
||||
USER_VARS += DEFGUESTSELINUXLABEL
|
||||
USER_VARS += DEFENABLEVHOSTUSERSTORE
|
||||
USER_VARS += DEFVHOSTUSERSTOREPATH
|
||||
USER_VARS += DEFVALIDVHOSTUSERSTOREPATHS
|
||||
@@ -628,11 +615,9 @@ USER_VARS += DEFCREATECONTAINERTIMEOUT
|
||||
USER_VARS += DEFCREATECONTAINERTIMEOUT_COCO
|
||||
USER_VARS += QEMUTDXEXPERIMENTALCMD
|
||||
USER_VARS += FIRMWARE_SNP_PATH
|
||||
USER_VARS += FIRMWARE_VOLUME_SNP_PATH
|
||||
USER_VARS += KERNELTDXPARAMS
|
||||
USER_VARS += DEFSHAREDFS_QEMU_TDX_VIRTIOFS
|
||||
USER_VARS += FIRMWARETDVFPATH
|
||||
USER_VARS += DEFPODRESOURCEAPISOCK
|
||||
|
||||
SOURCES := \
|
||||
$(shell find . 2>&1 | grep -E '.*\.rs$$') \
|
||||
|
||||
@@ -13,7 +13,6 @@ CPUFEATURES := pmu=off
|
||||
|
||||
QEMUCMD := qemu-system-aarch64
|
||||
QEMUFW := AAVMF_CODE.fd
|
||||
QEMUFWVOL := AAVMF_VARS.fd
|
||||
|
||||
# dragonball binary name
|
||||
DBCMD := dragonball
|
||||
|
||||
@@ -76,12 +76,6 @@ kernel_params = "@KERNELPARAMS@"
|
||||
# If you want qemu to use the default firmware, leave this option empty
|
||||
firmware = "@FIRMWAREPATH@"
|
||||
|
||||
# Path to the firmware volume.
|
||||
# firmware TDVF or OVMF can be split into FIRMWARE_VARS.fd (UEFI variables
|
||||
# as configuration) and FIRMWARE_CODE.fd (UEFI program image). UEFI variables
|
||||
# can be customized per each user while UEFI code is kept same.
|
||||
firmware_volume = "@FIRMWAREVOLUMEPATH@"
|
||||
|
||||
# Machine accelerators
|
||||
# comma-separated list of machine accelerators to pass to the hypervisor.
|
||||
# For example, `machine_accelerators = "nosmm,nosmbus,nosata,nopit,static-prt,nofw"`
|
||||
@@ -89,12 +83,12 @@ machine_accelerators = "@MACHINEACCELERATORS@"
|
||||
|
||||
# Qemu seccomp sandbox feature
|
||||
# comma-separated list of seccomp sandbox features to control the syscall access.
|
||||
# For example, `seccompsandbox= "on,obsolete=deny,spawn=deny,resourcecontrol=deny"`
|
||||
# For example, `seccomp_sandbox = "on,obsolete=deny,spawn=deny,resourcecontrol=deny"`
|
||||
# Note: "elevateprivileges=deny" doesn't work with daemonize option, so it's removed from the seccomp sandbox
|
||||
# Another note: enabling this feature may reduce performance, you may enable
|
||||
# /proc/sys/net/core/bpf_jit_enable to reduce the impact. see https://man7.org/linux/man-pages/man8/bpfc.8.html
|
||||
# Recommended value when enabling: "on,obsolete=deny,spawn=deny,resourcecontrol=deny"
|
||||
seccompsandbox = "@DEFSECCOMPSANDBOXPARAM@"
|
||||
seccomp_sandbox = "@DEFSECCOMPSANDBOXPARAM@"
|
||||
|
||||
# CPU features
|
||||
# comma-separated list of cpu features to pass to the cpu
|
||||
@@ -311,11 +305,6 @@ enable_iommu_platform = false
|
||||
# Your distribution recommends: @DEFVALIDVHOSTUSERSTOREPATHS@
|
||||
valid_vhost_user_store_paths = @DEFVALIDVHOSTUSERSTOREPATHS@
|
||||
|
||||
# The timeout for reconnecting on non-server spdk sockets when the remote end goes away.
|
||||
# qemu will delay this many seconds and then attempt to reconnect.
|
||||
# Zero disables reconnecting, and the default is zero.
|
||||
vhost_user_reconnect_timeout_sec = 0
|
||||
|
||||
# Enable file based guest memory support. The default is an empty string which
|
||||
# will disable this feature. In the case of virtio-fs, this is enabled
|
||||
# automatically and '/dev/shm' is used as the backing folder.
|
||||
@@ -350,7 +339,7 @@ enable_debug = false
|
||||
#
|
||||
# If set to the empty string "", no extra monitor socket is added. This is
|
||||
# the default.
|
||||
extra_monitor_socket = ""
|
||||
dbg_monitor_socket = ""
|
||||
|
||||
# Disable the customizations done in the runtime when it detects
|
||||
# that it is running on top of a VMM. This will result in the runtime
|
||||
@@ -378,18 +367,6 @@ disable_image_nvdimm = false
|
||||
# Default false
|
||||
hotplug_vfio_on_root_bus = false
|
||||
|
||||
# Enable hot-plugging of VFIO devices to a bridge-port,
|
||||
# root-port or switch-port.
|
||||
# The default setting is "no-port"
|
||||
hot_plug_vfio = "no-port"
|
||||
|
||||
# In a confidential compute environment hot-plugging can compromise
|
||||
# security.
|
||||
# Enable cold-plugging of VFIO devices to a bridge-port,
|
||||
# root-port or switch-port.
|
||||
# The default setting is "no-port", which means disabled.
|
||||
cold_plug_vfio = "no-port"
|
||||
|
||||
# Before hot plugging a PCIe device, you need to add a pcie_root_port device.
|
||||
# Use this parameter when using some large PCI bar devices, such as Nvidia GPU
|
||||
# The value means the number of pcie_root_port
|
||||
@@ -483,9 +460,6 @@ guest_memory_dump_path = ""
|
||||
# See: https://www.qemu.org/docs/master/qemu-qmp-ref.html#Dump-guest-memory for details
|
||||
#guest_memory_dump_paging=false
|
||||
|
||||
# use legacy serial for guest console if available and implemented for architecture. Default false
|
||||
use_legacy_serial = false
|
||||
|
||||
# disable applying SELinux on the VMM process (default false)
|
||||
disable_selinux = @DEFDISABLESELINUX@
|
||||
|
||||
@@ -497,7 +471,7 @@ disable_selinux = @DEFDISABLESELINUX@
|
||||
disable_guest_selinux = @DEFDISABLEGUESTSELINUX@
|
||||
|
||||
|
||||
[factory]
|
||||
[hypervisor.qemu.factory]
|
||||
# VM templating support. Once enabled, new VMs are created from template
|
||||
# using vm cloning. They will share the same initial kernel, initramfs and
|
||||
# agent memory by mapping it readonly. It helps speeding up new container
|
||||
@@ -726,20 +700,6 @@ agent_name = "@PROJECT_TYPE@"
|
||||
# (default: true)
|
||||
disable_guest_seccomp = @DEFDISABLEGUESTSECCOMP@
|
||||
|
||||
# vCPUs pinning settings
|
||||
# if enabled, each vCPU thread will be scheduled to a fixed CPU
|
||||
# qualified condition: num(vCPU threads) == num(CPUs in sandbox's CPUSet)
|
||||
enable_vcpus_pinning = false
|
||||
|
||||
# Apply a custom SELinux security policy to the container process inside the VM.
|
||||
# This is used when you want to apply a type other than the default `container_t`,
|
||||
# so general users should not uncomment and apply it.
|
||||
# (format: "user:role:type")
|
||||
# Note: You cannot specify MCS policy with the label because the sensitivity levels and
|
||||
# categories are determined automatically by high-level container runtimes such as containerd.
|
||||
# Example value when enabling: "system_u:system_r:container_t"
|
||||
guest_selinux_label = "@DEFGUESTSELINUXLABEL@"
|
||||
|
||||
# If enabled, the runtime will create opentracing.io traces and spans.
|
||||
# (See https://www.jaegertracing.io/docs/getting-started).
|
||||
# (default: disabled)
|
||||
|
||||
@@ -60,12 +60,6 @@ kernel_params = "@KERNELPARAMS@"
|
||||
# If you want qemu to use the default firmware, leave this option empty
|
||||
firmware = "@FIRMWAREPATH@"
|
||||
|
||||
# Path to the firmware volume.
|
||||
# firmware TDVF or OVMF can be split into FIRMWARE_VARS.fd (UEFI variables
|
||||
# as configuration) and FIRMWARE_CODE.fd (UEFI program image). UEFI variables
|
||||
# can be customized per each user while UEFI code is kept same.
|
||||
firmware_volume = "@FIRMWAREVOLUMEPATH@"
|
||||
|
||||
# Machine accelerators
|
||||
# comma-separated list of machine accelerators to pass to the hypervisor.
|
||||
# For example, `machine_accelerators = "nosmm,nosmbus,nosata,nopit,static-prt,nofw"`
|
||||
@@ -73,12 +67,12 @@ machine_accelerators = "@MACHINEACCELERATORS@"
|
||||
|
||||
# Qemu seccomp sandbox feature
|
||||
# comma-separated list of seccomp sandbox features to control the syscall access.
|
||||
# For example, `seccompsandbox= "on,obsolete=deny,spawn=deny,resourcecontrol=deny"`
|
||||
# For example, `seccomp_sandbox = "on,obsolete=deny,spawn=deny,resourcecontrol=deny"`
|
||||
# Note: "elevateprivileges=deny" doesn't work with daemonize option, so it's removed from the seccomp sandbox
|
||||
# Another note: enabling this feature may reduce performance, you may enable
|
||||
# /proc/sys/net/core/bpf_jit_enable to reduce the impact. see https://man7.org/linux/man-pages/man8/bpfc.8.html
|
||||
# Recommended value when enabling: "on,obsolete=deny,spawn=deny,resourcecontrol=deny"
|
||||
seccompsandbox = "@DEFSECCOMPSANDBOXPARAM@"
|
||||
seccomp_sandbox = "@DEFSECCOMPSANDBOXPARAM@"
|
||||
|
||||
# CPU features
|
||||
# comma-separated list of cpu features to pass to the cpu
|
||||
@@ -307,11 +301,6 @@ enable_iommu_platform = false
|
||||
# Your distribution recommends: @DEFVALIDVHOSTUSERSTOREPATHS@
|
||||
valid_vhost_user_store_paths = @DEFVALIDVHOSTUSERSTOREPATHS@
|
||||
|
||||
# The timeout for reconnecting on non-server spdk sockets when the remote end goes away.
|
||||
# qemu will delay this many seconds and then attempt to reconnect.
|
||||
# Zero disables reconnecting, and the default is zero.
|
||||
vhost_user_reconnect_timeout_sec = 0
|
||||
|
||||
# Enable file based guest memory support. The default is an empty string which
|
||||
# will disable this feature. In the case of virtio-fs, this is enabled
|
||||
# automatically and '/dev/shm' is used as the backing folder.
|
||||
@@ -346,7 +335,7 @@ enable_debug = false
|
||||
#
|
||||
# If set to the empty string "", no extra monitor socket is added. This is
|
||||
# the default.
|
||||
extra_monitor_socket = ""
|
||||
dbg_monitor_socket = ""
|
||||
|
||||
# Disable the customizations done in the runtime when it detects
|
||||
# that it is running on top of a VMM. This will result in the runtime
|
||||
@@ -373,18 +362,6 @@ disable_image_nvdimm = false
|
||||
# Default false
|
||||
hotplug_vfio_on_root_bus = false
|
||||
|
||||
# Enable hot-plugging of VFIO devices to a bridge-port,
|
||||
# root-port or switch-port.
|
||||
# The default setting is "no-port"
|
||||
hot_plug_vfio = "no-port"
|
||||
|
||||
# In a confidential compute environment hot-plugging can compromise
|
||||
# security.
|
||||
# Enable cold-plugging of VFIO devices to a bridge-port,
|
||||
# root-port or switch-port.
|
||||
# The default setting is "no-port", which means disabled.
|
||||
cold_plug_vfio = "no-port"
|
||||
|
||||
# Before hot plugging a PCIe device, you need to add a pcie_root_port device.
|
||||
# Use this parameter when using some large PCI bar devices, such as Nvidia GPU
|
||||
# The value means the number of pcie_root_port
|
||||
@@ -489,9 +466,6 @@ guest_memory_dump_path = ""
|
||||
# See: https://www.qemu.org/docs/master/qemu-qmp-ref.html#Dump-guest-memory for details
|
||||
guest_memory_dump_paging = false
|
||||
|
||||
# use legacy serial for guest console if available and implemented for architecture. Default false
|
||||
use_legacy_serial = false
|
||||
|
||||
# disable applying SELinux on the VMM process (default false)
|
||||
disable_selinux = @DEFDISABLESELINUX@
|
||||
|
||||
@@ -720,20 +694,6 @@ agent_name = "@PROJECT_TYPE@"
|
||||
# (default: true)
|
||||
disable_guest_seccomp = @DEFDISABLEGUESTSECCOMP@
|
||||
|
||||
# vCPUs pinning settings
|
||||
# if enabled, each vCPU thread will be scheduled to a fixed CPU
|
||||
# qualified condition: num(vCPU threads) == num(CPUs in sandbox's CPUSet)
|
||||
enable_vcpus_pinning = false
|
||||
|
||||
# Apply a custom SELinux security policy to the container process inside the VM.
|
||||
# This is used when you want to apply a type other than the default `container_t`,
|
||||
# so general users should not uncomment and apply it.
|
||||
# (format: "user:role:type")
|
||||
# Note: You cannot specify MCS policy with the label because the sensitivity levels and
|
||||
# categories are determined automatically by high-level container runtimes such as containerd.
|
||||
# Example value when enabling: "system_u:system_r:container_t"
|
||||
guest_selinux_label = "@DEFGUESTSELINUXLABEL@"
|
||||
|
||||
# If enabled, the runtime will create opentracing.io traces and spans.
|
||||
# (See https://www.jaegertracing.io/docs/getting-started).
|
||||
# (default: disabled)
|
||||
|
||||
@@ -69,12 +69,6 @@ kernel_params = "@KERNELPARAMS@"
|
||||
# If you want qemu to use the default firmware, leave this option empty
|
||||
firmware = "@FIRMWAREPATH@"
|
||||
|
||||
# Path to the firmware volume.
|
||||
# firmware TDVF or OVMF can be split into FIRMWARE_VARS.fd (UEFI variables
|
||||
# as configuration) and FIRMWARE_CODE.fd (UEFI program image). UEFI variables
|
||||
# can be customized per each user while UEFI code is kept same.
|
||||
firmware_volume = "@FIRMWAREVOLUMEPATH@"
|
||||
|
||||
# Machine accelerators
|
||||
# comma-separated list of machine accelerators to pass to the hypervisor.
|
||||
# For example, `machine_accelerators = "nosmm,nosmbus,nosata,nopit,static-prt,nofw"`
|
||||
@@ -82,12 +76,12 @@ machine_accelerators = "@MACHINEACCELERATORS@"
|
||||
|
||||
# Qemu seccomp sandbox feature
|
||||
# comma-separated list of seccomp sandbox features to control the syscall access.
|
||||
# For example, `seccompsandbox= "on,obsolete=deny,spawn=deny,resourcecontrol=deny"`
|
||||
# For example, `seccomp_sandbox = "on,obsolete=deny,spawn=deny,resourcecontrol=deny"`
|
||||
# Note: "elevateprivileges=deny" doesn't work with daemonize option, so it's removed from the seccomp sandbox
|
||||
# Another note: enabling this feature may reduce performance, you may enable
|
||||
# /proc/sys/net/core/bpf_jit_enable to reduce the impact. see https://man7.org/linux/man-pages/man8/bpfc.8.html
|
||||
# Recommended value when enabling: "on,obsolete=deny,spawn=deny,resourcecontrol=deny"
|
||||
seccompsandbox = "@DEFSECCOMPSANDBOXPARAM@"
|
||||
seccomp_sandbox = "@DEFSECCOMPSANDBOXPARAM@"
|
||||
|
||||
# CPU features
|
||||
# comma-separated list of cpu features to pass to the cpu
|
||||
@@ -345,18 +339,6 @@ msize_9p = @DEFMSIZE9P@
|
||||
# Default is false
|
||||
disable_image_nvdimm = true
|
||||
|
||||
# Enable hot-plugging of VFIO devices to a bridge-port,
|
||||
# root-port or switch-port.
|
||||
# The default setting is "no-port"
|
||||
hot_plug_vfio = "no-port"
|
||||
|
||||
# In a confidential compute environment hot-plugging can compromise
|
||||
# security.
|
||||
# Enable cold-plugging of VFIO devices to a bridge-port,
|
||||
# root-port or switch-port.
|
||||
# The default setting is "no-port", which means disabled.
|
||||
cold_plug_vfio = "no-port"
|
||||
|
||||
# VFIO devices are hotplugged on a bridge by default.
|
||||
# Enable hotplugging on root bus. This may be required for devices with
|
||||
# a large PCI bar, as this is a current limitation with hotplugging on
|
||||
@@ -460,9 +442,6 @@ guest_memory_dump_paging = false
|
||||
# be default_memory.
|
||||
enable_guest_swap = false
|
||||
|
||||
# use legacy serial for guest console if available and implemented for architecture. Default false
|
||||
use_legacy_serial = false
|
||||
|
||||
# disable applying SELinux on the VMM process (default false)
|
||||
disable_selinux = @DEFDISABLESELINUX@
|
||||
|
||||
@@ -474,7 +453,7 @@ disable_selinux = @DEFDISABLESELINUX@
|
||||
disable_guest_selinux = @DEFDISABLEGUESTSELINUX@
|
||||
|
||||
|
||||
[factory]
|
||||
[hypervisor.qemu.factory]
|
||||
# VM templating support. Once enabled, new VMs are created from template
|
||||
# using vm cloning. They will share the same initial kernel, initramfs and
|
||||
# agent memory by mapping it readonly. It helps speeding up new container
|
||||
@@ -593,20 +572,6 @@ agent_name = "@PROJECT_TYPE@"
|
||||
# (default: true)
|
||||
disable_guest_seccomp = @DEFDISABLEGUESTSECCOMP@
|
||||
|
||||
# vCPUs pinning settings
|
||||
# if enabled, each vCPU thread will be scheduled to a fixed CPU
|
||||
# qualified condition: num(vCPU threads) == num(CPUs in sandbox's CPUSet)
|
||||
enable_vcpus_pinning = false
|
||||
|
||||
# Apply a custom SELinux security policy to the container process inside the VM.
|
||||
# This is used when you want to apply a type other than the default `container_t`,
|
||||
# so general users should not uncomment and apply it.
|
||||
# (format: "user:role:type")
|
||||
# Note: You cannot specify MCS policy with the label because the sensitivity levels and
|
||||
# categories are determined automatically by high-level container runtimes such as containerd.
|
||||
# Example value when enabling: "system_u:system_r:container_t"
|
||||
guest_selinux_label = "@DEFGUESTSELINUXLABEL@"
|
||||
|
||||
# If enabled, the runtime will create opentracing.io traces and spans.
|
||||
# (See https://www.jaegertracing.io/docs/getting-started).
|
||||
# (default: disabled)
|
||||
|
||||
@@ -103,12 +103,6 @@ kernel_params = "@KERNELPARAMS@"
|
||||
# If you want qemu to use the default firmware, leave this option empty
|
||||
firmware = "@FIRMWARE_SNP_PATH@"
|
||||
|
||||
# Path to the firmware volume.
|
||||
# firmware TDVF or OVMF can be split into FIRMWARE_VARS.fd (UEFI variables
|
||||
# as configuration) and FIRMWARE_CODE.fd (UEFI program image). UEFI variables
|
||||
# can be customized per each user while UEFI code is kept same.
|
||||
firmware_volume = "@FIRMWARE_VOLUME_SNP_PATH@"
|
||||
|
||||
# Machine accelerators
|
||||
# comma-separated list of machine accelerators to pass to the hypervisor.
|
||||
# For example, `machine_accelerators = "nosmm,nosmbus,nosata,nopit,static-prt,nofw"`
|
||||
@@ -116,12 +110,12 @@ machine_accelerators = "@MACHINEACCELERATORS@"
|
||||
|
||||
# Qemu seccomp sandbox feature
|
||||
# comma-separated list of seccomp sandbox features to control the syscall access.
|
||||
# For example, `seccompsandbox= "on,obsolete=deny,spawn=deny,resourcecontrol=deny"`
|
||||
# For example, `seccomp_sandbox = "on,obsolete=deny,spawn=deny,resourcecontrol=deny"`
|
||||
# Note: "elevateprivileges=deny" doesn't work with daemonize option, so it's removed from the seccomp sandbox
|
||||
# Another note: enabling this feature may reduce performance, you may enable
|
||||
# /proc/sys/net/core/bpf_jit_enable to reduce the impact. see https://man7.org/linux/man-pages/man8/bpfc.8.html
|
||||
# Recommended value when enabling: "on,obsolete=deny,spawn=deny,resourcecontrol=deny"
|
||||
seccompsandbox = "@DEFSECCOMPSANDBOXPARAM@"
|
||||
seccomp_sandbox = "@DEFSECCOMPSANDBOXPARAM@"
|
||||
|
||||
# CPU features
|
||||
# comma-separated list of cpu features to pass to the cpu
|
||||
@@ -289,10 +283,6 @@ block_device_cache_noflush = false
|
||||
#
|
||||
enable_iothreads = @DEFENABLEIOTHREADS@
|
||||
|
||||
# Independent IOThreads enables IO to be processed in a separate thread, it is
|
||||
# for QEMU hotplug device attach to iothread, like virtio-blk.
|
||||
indep_iothreads = @DEFINDEPIOTHREADS@
|
||||
|
||||
# Enable pre allocation of VM RAM, default false
|
||||
# Enabling this will result in lower container density
|
||||
# as all of the memory will be allocated and locked
|
||||
@@ -346,11 +336,6 @@ enable_iommu_platform = false
|
||||
# Your distribution recommends: @DEFVALIDVHOSTUSERSTOREPATHS@
|
||||
valid_vhost_user_store_paths = @DEFVALIDVHOSTUSERSTOREPATHS@
|
||||
|
||||
# The timeout for reconnecting on non-server spdk sockets when the remote end goes away.
|
||||
# qemu will delay this many seconds and then attempt to reconnect.
|
||||
# Zero disables reconnecting, and the default is zero.
|
||||
vhost_user_reconnect_timeout_sec = 0
|
||||
|
||||
# Enable file based guest memory support. The default is an empty string which
|
||||
# will disable this feature. In the case of virtio-fs, this is enabled
|
||||
# automatically and '/dev/shm' is used as the backing folder.
|
||||
@@ -407,7 +392,7 @@ disable_vhost_net = false
|
||||
#
|
||||
# If set to the empty string "", no extra monitor socket is added. This is
|
||||
# the default.
|
||||
#extra_monitor_socket = "hmp"
|
||||
#dbg_monitor_socket = "hmp"
|
||||
|
||||
#
|
||||
# Default entropy source.
|
||||
@@ -495,9 +480,6 @@ guest_memory_dump_paging = false
|
||||
# be default_memory.
|
||||
enable_guest_swap = false
|
||||
|
||||
# use legacy serial for guest console if available and implemented for architecture. Default false
|
||||
use_legacy_serial = false
|
||||
|
||||
# disable applying SELinux on the VMM process (default false)
|
||||
disable_selinux = @DEFDISABLESELINUX@
|
||||
|
||||
@@ -509,7 +491,7 @@ disable_selinux = @DEFDISABLESELINUX@
|
||||
disable_guest_selinux = @DEFDISABLEGUESTSELINUX@
|
||||
|
||||
|
||||
[factory]
|
||||
[hypervisor.qemu.factory]
|
||||
# VM templating support. Once enabled, new VMs are created from template
|
||||
# using vm cloning. They will share the same initial kernel, initramfs and
|
||||
# agent memory by mapping it readonly. It helps speeding up new container
|
||||
@@ -528,30 +510,6 @@ enable_template = false
|
||||
# Default "/run/vc/vm/template"
|
||||
template_path = "/run/vc/vm/template"
|
||||
|
||||
# The number of caches of VMCache:
|
||||
# unspecified or == 0 --> VMCache is disabled
|
||||
# > 0 --> will be set to the specified number
|
||||
#
|
||||
# VMCache is a function that creates VMs as caches before using it.
|
||||
# It helps speed up new container creation.
|
||||
# The function consists of a server and some clients communicating
|
||||
# through Unix socket. The protocol is gRPC in protocols/cache/cache.proto.
|
||||
# The VMCache server will create some VMs and cache them by factory cache.
|
||||
# It will convert the VM to gRPC format and transport it when it gets
|
||||
# requests from clients.
|
||||
# Factory grpccache is the VMCache client. It will request gRPC format
|
||||
# VM and convert it back to a VM. If VMCache function is enabled,
|
||||
# kata-runtime will request VM from factory grpccache when it creates
|
||||
# a new sandbox.
|
||||
#
|
||||
# Default 0
|
||||
vm_cache_number = 0
|
||||
|
||||
# Specify the address of the Unix socket that is used by VMCache.
|
||||
#
|
||||
# Default /var/run/kata-containers/cache.sock
|
||||
vm_cache_endpoint = "/var/run/kata-containers/cache.sock"
|
||||
|
||||
[agent.@PROJECT_TYPE@]
|
||||
# If enabled, make the agent display debug-level messages.
|
||||
# (default: disabled)
|
||||
@@ -651,19 +609,6 @@ agent_name="@PROJECT_TYPE@"
|
||||
# (default: true)
|
||||
disable_guest_seccomp = @DEFDISABLEGUESTSECCOMP@
|
||||
|
||||
# vCPUs pinning settings
|
||||
# if enabled, each vCPU thread will be scheduled to a fixed CPU
|
||||
# qualified condition: num(vCPU threads) == num(CPUs in sandbox's CPUSet)
|
||||
enable_vcpus_pinning = false
|
||||
|
||||
# Apply a custom SELinux security policy to the container process inside the VM.
|
||||
# This is used when you want to apply a type other than the default `container_t`,
|
||||
# so general users should not uncomment and apply it.
|
||||
# (format: "user:role:type")
|
||||
# Note: You cannot specify MCS policy with the label because the sensitivity levels and
|
||||
# categories are determined automatically by high-level container runtimes such as containerd.
|
||||
guest_selinux_label = "@DEFGUESTSELINUXLABEL@"
|
||||
|
||||
# If enabled, the runtime will create opentracing.io traces and spans.
|
||||
# (See https://www.jaegertracing.io/docs/getting-started).
|
||||
# (default: disabled)
|
||||
@@ -757,22 +702,3 @@ enable_pprof = false
|
||||
# to the hypervisor.
|
||||
# (default: /run/kata-containers/dans)
|
||||
dan_conf = "@DEFDANCONF@"
|
||||
|
||||
# pod_resource_api_sock specifies the unix socket for the Kubelet's
|
||||
# PodResource API endpoint. If empty, kubernetes based cold plug
|
||||
# will not be attempted. In order for this feature to work, the
|
||||
# KubeletPodResourcesGet featureGate must be enabled in Kubelet,
|
||||
# if using Kubelet older than 1.34.
|
||||
#
|
||||
# The pod resource API's socket is relative to the Kubelet's root-dir,
|
||||
# which is defined by the cluster admin, and its location is:
|
||||
# ${KubeletRootDir}/pod-resources/kubelet.sock
|
||||
#
|
||||
# cold_plug_vfio(see hypervisor config) acts as a feature gate:
|
||||
# cold_plug_vfio = no_port (default) => no cold plug
|
||||
# cold_plug_vfio != no_port AND pod_resource_api_sock = "" => need
|
||||
# explicit CDI annotation for cold plug (applies mainly
|
||||
# to non-k8s cases)
|
||||
# cold_plug_vfio != no_port AND pod_resource_api_sock != "" => kubelet
|
||||
# based cold plug.
|
||||
pod_resource_api_sock = "@DEFPODRESOURCEAPISOCK@"
|
||||
|
||||
@@ -83,12 +83,6 @@ kernel_verity_params = "@KERNELVERITYPARAMS@"
|
||||
# If you want that qemu uses the default firmware leave this option empty
|
||||
firmware = "@FIRMWARETDVFPATH@"
|
||||
|
||||
# Path to the firmware volume.
|
||||
# firmware TDVF or OVMF can be split into FIRMWARE_VARS.fd (UEFI variables
|
||||
# as configuration) and FIRMWARE_CODE.fd (UEFI program image). UEFI variables
|
||||
# can be customized per each user while UEFI code is kept same.
|
||||
firmware_volume = "@FIRMWAREVOLUMEPATH@"
|
||||
|
||||
# Machine accelerators
|
||||
# comma-separated list of machine accelerators to pass to the hypervisor.
|
||||
# For example, `machine_accelerators = "nosmm,nosmbus,nosata,nopit,static-prt,nofw"`
|
||||
@@ -96,12 +90,12 @@ machine_accelerators = "@MACHINEACCELERATORS@"
|
||||
|
||||
# Qemu seccomp sandbox feature
|
||||
# comma-separated list of seccomp sandbox features to control the syscall access.
|
||||
# For example, `seccompsandbox= "on,obsolete=deny,spawn=deny,resourcecontrol=deny"`
|
||||
# For example, `seccomp_sandbox = "on,obsolete=deny,spawn=deny,resourcecontrol=deny"`
|
||||
# Note: "elevateprivileges=deny" doesn't work with daemonize option, so it's removed from the seccomp sandbox
|
||||
# Another note: enabling this feature may reduce performance, you may enable
|
||||
# /proc/sys/net/core/bpf_jit_enable to reduce the impact. see https://man7.org/linux/man-pages/man8/bpfc.8.html
|
||||
# Recommended value when enabling: "on,obsolete=deny,spawn=deny,resourcecontrol=deny"
|
||||
seccompsandbox = "@DEFSECCOMPSANDBOXPARAM@"
|
||||
seccomp_sandbox = "@DEFSECCOMPSANDBOXPARAM@"
|
||||
|
||||
# CPU features
|
||||
# comma-separated list of cpu features to pass to the cpu
|
||||
@@ -268,10 +262,6 @@ block_device_cache_noflush = false
|
||||
#
|
||||
enable_iothreads = @DEFENABLEIOTHREADS@
|
||||
|
||||
# Independent IOThreads enables IO to be processed in a separate thread, it is
|
||||
# for QEMU hotplug device attach to iothread, like virtio-blk.
|
||||
indep_iothreads = @DEFINDEPIOTHREADS@
|
||||
|
||||
# Enable pre allocation of VM RAM, default false
|
||||
# Enabling this will result in lower container density
|
||||
# as all of the memory will be allocated and locked
|
||||
@@ -325,11 +315,6 @@ enable_iommu_platform = false
|
||||
# Your distribution recommends: @DEFVALIDVHOSTUSERSTOREPATHS@
|
||||
valid_vhost_user_store_paths = @DEFVALIDVHOSTUSERSTOREPATHS@
|
||||
|
||||
# The timeout for reconnecting on non-server spdk sockets when the remote end goes away.
|
||||
# qemu will delay this many seconds and then attempt to reconnect.
|
||||
# Zero disables reconnecting, and the default is zero.
|
||||
vhost_user_reconnect_timeout_sec = 0
|
||||
|
||||
# Enable file based guest memory support. The default is an empty string which
|
||||
# will disable this feature. In the case of virtio-fs, this is enabled
|
||||
# automatically and '/dev/shm' is used as the backing folder.
|
||||
@@ -364,7 +349,7 @@ enable_debug = false
|
||||
#
|
||||
# If set to the empty string "", no extra monitor socket is added. This is
|
||||
# the default.
|
||||
extra_monitor_socket = ""
|
||||
dbg_monitor_socket = ""
|
||||
|
||||
# Disable the customizations done in the runtime when it detects
|
||||
# that it is running on top of a VMM. This will result in the runtime
|
||||
@@ -474,9 +459,6 @@ guest_memory_dump_paging = false
|
||||
# be default_memory.
|
||||
enable_guest_swap = false
|
||||
|
||||
# use legacy serial for guest console if available and implemented for architecture. Default false
|
||||
use_legacy_serial = false
|
||||
|
||||
# disable applying SELinux on the VMM process (default false)
|
||||
disable_selinux = @DEFDISABLESELINUX@
|
||||
|
||||
@@ -488,7 +470,7 @@ disable_selinux = @DEFDISABLESELINUX@
|
||||
disable_guest_selinux = @DEFDISABLEGUESTSELINUX@
|
||||
|
||||
|
||||
[factory]
|
||||
[hypervisor.qemu.factory]
|
||||
# VM templating support. Once enabled, new VMs are created from template
|
||||
# using vm cloning. They will share the same initial kernel, initramfs and
|
||||
# agent memory by mapping it readonly. It helps speeding up new container
|
||||
@@ -507,30 +489,6 @@ enable_template = false
|
||||
# Default "/run/vc/vm/template"
|
||||
template_path = "/run/vc/vm/template"
|
||||
|
||||
# The number of caches of VMCache:
|
||||
# unspecified or == 0 --> VMCache is disabled
|
||||
# > 0 --> will be set to the specified number
|
||||
#
|
||||
# VMCache is a function that creates VMs as caches before using it.
|
||||
# It helps speed up new container creation.
|
||||
# The function consists of a server and some clients communicating
|
||||
# through Unix socket. The protocol is gRPC in protocols/cache/cache.proto.
|
||||
# The VMCache server will create some VMs and cache them by factory cache.
|
||||
# It will convert the VM to gRPC format and transport it when gets
|
||||
# requests from clients.
|
||||
# Factory grpccache is the VMCache client. It will request gRPC format
|
||||
# VM and convert it back to a VM. If VMCache function is enabled,
|
||||
# kata-runtime will request VM from factory grpccache when it creates
|
||||
# a new sandbox.
|
||||
#
|
||||
# Default 0
|
||||
vm_cache_number = 0
|
||||
|
||||
# Specify the address of the Unix socket that is used by VMCache.
|
||||
#
|
||||
# Default /var/run/kata-containers/cache.sock
|
||||
vm_cache_endpoint = "/var/run/kata-containers/cache.sock"
|
||||
|
||||
[agent.@PROJECT_TYPE@]
|
||||
# If enabled, make the agent display debug-level messages.
|
||||
# (default: disabled)
|
||||
@@ -631,20 +589,6 @@ agent_name="@PROJECT_TYPE@"
|
||||
# (default: true)
|
||||
disable_guest_seccomp = @DEFDISABLEGUESTSECCOMP@
|
||||
|
||||
# vCPUs pinning settings
|
||||
# if enabled, each vCPU thread will be scheduled to a fixed CPU
|
||||
# qualified condition: num(vCPU threads) == num(CPUs in sandbox's CPUSet)
|
||||
enable_vcpus_pinning = false
|
||||
|
||||
# Apply a custom SELinux security policy to the container process inside the VM.
|
||||
# This is used when you want to apply a type other than the default `container_t`,
|
||||
# so general users should not uncomment and apply it.
|
||||
# (format: "user:role:type")
|
||||
# Note: You cannot specify MCS policy with the label because the sensitivity levels and
|
||||
# categories are determined automatically by high-level container runtimes such as containerd.
|
||||
# Example value when enabling: "system_u:system_r:container_t"
|
||||
guest_selinux_label = "@DEFGUESTSELINUXLABEL@"
|
||||
|
||||
# If enabled, the runtime will create opentracing.io traces and spans.
|
||||
# (See https://www.jaegertracing.io/docs/getting-started).
|
||||
# (default: disabled)
|
||||
@@ -739,21 +683,3 @@ enable_pprof = false
|
||||
# (default: /run/kata-containers/dans)
|
||||
dan_conf = "@DEFDANCONF@"
|
||||
|
||||
# pod_resource_api_sock specifies the unix socket for the Kubelet's
|
||||
# PodResource API endpoint. If empty, kubernetes based cold plug
|
||||
# will not be attempted. In order for this feature to work, the
|
||||
# KubeletPodResourcesGet featureGate must be enabled in Kubelet,
|
||||
# if using Kubelet older than 1.34.
|
||||
#
|
||||
# The pod resource API's socket is relative to the Kubelet's root-dir,
|
||||
# which is defined by the cluster admin, and its location is:
|
||||
# ${KubeletRootDir}/pod-resources/kubelet.sock
|
||||
#
|
||||
# cold_plug_vfio(see hypervisor config) acts as a feature gate:
|
||||
# cold_plug_vfio = no_port (default) => no cold plug
|
||||
# cold_plug_vfio != no_port AND pod_resource_api_sock = "" => need
|
||||
# explicit CDI annotation for cold plug (applies mainly
|
||||
# to non-k8s cases)
|
||||
# cold_plug_vfio != no_port AND pod_resource_api_sock != "" => kubelet
|
||||
# based cold plug.
|
||||
pod_resource_api_sock = "@DEFPODRESOURCEAPISOCK@"
|
||||
|
||||
@@ -205,15 +205,6 @@ agent_name = "kata"
|
||||
disable_guest_seccomp = true
|
||||
|
||||
|
||||
# Apply a custom SELinux security policy to the container process inside the VM.
|
||||
# This is used when you want to apply a type other than the default `container_t`,
|
||||
# so general users should not uncomment and apply it.
|
||||
# (format: "user:role:type")
|
||||
# Note: You cannot specify MCS policy with the label because the sensitivity levels and
|
||||
# categories are determined automatically by high-level container runtimes such as containerd.
|
||||
# Example value when enabling: "system_u:system_r:container_t"
|
||||
guest_selinux_label = "@DEFGUESTSELINUXLABEL@"
|
||||
|
||||
# If enabled, the runtime will create opentracing.io traces and spans.
|
||||
# (See https://www.jaegertracing.io/docs/getting-started).
|
||||
# (default: disabled)
|
||||
|
||||
@@ -24,9 +24,9 @@ pub use vfio::{
|
||||
pub use vhost_user::{VhostUserConfig, VhostUserDevice, VhostUserType};
|
||||
pub use vhost_user_net::VhostUserNetDevice;
|
||||
pub use virtio_blk::{
|
||||
BlockConfig, BlockDevice, BlockDeviceAio, KATA_BLK_DEV_TYPE, KATA_CCW_DEV_TYPE,
|
||||
KATA_MMIO_BLK_DEV_TYPE, KATA_NVDIMM_DEV_TYPE, KATA_SCSI_DEV_TYPE, VIRTIO_BLOCK_CCW,
|
||||
VIRTIO_BLOCK_MMIO, VIRTIO_BLOCK_PCI, VIRTIO_PMEM,
|
||||
BlockConfig, BlockDevice, BlockDeviceAio, BlockDeviceFormat, KATA_BLK_DEV_TYPE,
|
||||
KATA_CCW_DEV_TYPE, KATA_MMIO_BLK_DEV_TYPE, KATA_NVDIMM_DEV_TYPE, KATA_SCSI_DEV_TYPE,
|
||||
VIRTIO_BLOCK_CCW, VIRTIO_BLOCK_MMIO, VIRTIO_BLOCK_PCI, VIRTIO_PMEM,
|
||||
};
|
||||
pub use virtio_fs::{
|
||||
ShareFsConfig, ShareFsDevice, ShareFsMountConfig, ShareFsMountOperation, ShareFsMountType,
|
||||
|
||||
@@ -59,6 +59,23 @@ impl std::fmt::Display for BlockDeviceAio {
|
||||
}
|
||||
}
|
||||
|
||||
/// Image format of the file backing a block device.
///
/// `Raw` is the default variant. The `Display` implementation renders the
/// lowercase format name ("raw" / "vmdk").
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub enum BlockDeviceFormat {
    /// Raw image; rendered as "raw".
    #[default]
    Raw,
    /// VMDK image; rendered as "vmdk".
    Vmdk,
}

impl std::fmt::Display for BlockDeviceFormat {
    /// Writes the lowercase format name for this variant.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The names are constants, so write a static slice instead of
        // allocating a fresh `String` on every formatting call.
        let name = match self {
            BlockDeviceFormat::Raw => "raw",
            BlockDeviceFormat::Vmdk => "vmdk",
        };
        f.write_str(name)
    }
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct BlockConfig {
|
||||
/// Path of the drive.
|
||||
@@ -71,6 +88,9 @@ pub struct BlockConfig {
|
||||
/// Don't close `path_on_host` file when dropping the device.
|
||||
pub no_drop: bool,
|
||||
|
||||
/// raw, vmdk, etc. And default to raw if not set.
|
||||
pub format: BlockDeviceFormat,
|
||||
|
||||
/// Specifies cache-related options for block devices.
|
||||
/// Denotes whether use of O_DIRECT (bypass the host page cache) is enabled.
|
||||
/// If not set, use configuration block_device_cache_direct.
|
||||
|
||||
@@ -2610,6 +2610,7 @@ impl<'a> QemuCmdLine<'a> {
|
||||
self.devices.push(Box::new(Bios::new(firmware.to_owned())));
|
||||
|
||||
self.machine
|
||||
.set_kernel_irqchip("split")
|
||||
.set_confidential_guest_support("tdx")
|
||||
.set_nvdimm(false);
|
||||
}
|
||||
|
||||
@@ -866,6 +866,7 @@ impl QemuInner {
|
||||
),
|
||||
block_device.config.is_readonly,
|
||||
block_device.config.no_drop,
|
||||
&format!("{}", block_device.config.format),
|
||||
)
|
||||
.context("hotplug block device")?;
|
||||
|
||||
|
||||
@@ -14,7 +14,8 @@ use kata_types::rootless::is_rootless;
|
||||
use nix::sys::socket::{sendmsg, ControlMessage, MsgFlags};
|
||||
use qapi_qmp::{
|
||||
self as qmp, BlockdevAioOptions, BlockdevOptions, BlockdevOptionsBase,
|
||||
BlockdevOptionsGenericFormat, BlockdevOptionsRaw, BlockdevRef, MigrationInfo, PciDeviceInfo,
|
||||
BlockdevOptionsGenericCOWFormat, BlockdevOptionsGenericFormat, BlockdevOptionsRaw, BlockdevRef,
|
||||
MigrationInfo, PciDeviceInfo,
|
||||
};
|
||||
use qapi_qmp::{migrate, migrate_incoming, migrate_set_capabilities};
|
||||
use qapi_qmp::{MigrationCapability, MigrationCapabilityStatus};
|
||||
@@ -642,6 +643,7 @@ impl Qmp {
|
||||
is_direct: Option<bool>,
|
||||
is_readonly: bool,
|
||||
no_drop: bool,
|
||||
format: &str,
|
||||
) -> Result<(Option<PciPath>, Option<String>)> {
|
||||
// `blockdev-add`
|
||||
let node_name = format!("drive-{index}");
|
||||
@@ -690,27 +692,64 @@ impl Qmp {
|
||||
}
|
||||
};
|
||||
|
||||
let blockdev_options_raw = BlockdevOptions::raw {
|
||||
base: BlockdevOptionsBase {
|
||||
detect_zeroes: None,
|
||||
cache: None,
|
||||
discard: None,
|
||||
force_share: None,
|
||||
auto_read_only: None,
|
||||
node_name: Some(node_name.clone()),
|
||||
read_only: None,
|
||||
},
|
||||
raw: BlockdevOptionsRaw {
|
||||
base: BlockdevOptionsGenericFormat {
|
||||
file: BlockdevRef::definition(Box::new(blockdev_file)),
|
||||
},
|
||||
offset: None,
|
||||
size: None,
|
||||
},
|
||||
let blockdev_options = match format {
|
||||
"raw" => {
|
||||
// Use raw format for regular block devices
|
||||
BlockdevOptions::raw {
|
||||
base: BlockdevOptionsBase {
|
||||
detect_zeroes: None,
|
||||
cache: None,
|
||||
discard: None,
|
||||
force_share: None,
|
||||
auto_read_only: None,
|
||||
node_name: Some(node_name.clone()),
|
||||
read_only: None,
|
||||
},
|
||||
raw: BlockdevOptionsRaw {
|
||||
base: BlockdevOptionsGenericFormat {
|
||||
file: BlockdevRef::definition(Box::new(blockdev_file)),
|
||||
},
|
||||
offset: None,
|
||||
size: None,
|
||||
},
|
||||
}
|
||||
}
|
||||
"vmdk" => {
|
||||
// Use VMDK format driver for VMDK descriptor files
|
||||
// The VMDK driver will parse the descriptor and handle multi-extent files
|
||||
info!(
|
||||
sl!(),
|
||||
"hotplug_block_device: using VMDK format driver for {}", path_on_host
|
||||
);
|
||||
BlockdevOptions::vmdk {
|
||||
base: BlockdevOptionsBase {
|
||||
detect_zeroes: None,
|
||||
cache: None,
|
||||
discard: None,
|
||||
force_share: None,
|
||||
auto_read_only: None,
|
||||
node_name: Some(node_name.clone()),
|
||||
read_only: None,
|
||||
},
|
||||
vmdk: BlockdevOptionsGenericCOWFormat {
|
||||
base: BlockdevOptionsGenericFormat {
|
||||
file: BlockdevRef::definition(Box::new(blockdev_file)),
|
||||
},
|
||||
backing: None,
|
||||
},
|
||||
}
|
||||
}
|
||||
other => {
|
||||
warn!(
|
||||
sl!(),
|
||||
"unrecognized format '{}', defaulting to raw for {}", other, path_on_host
|
||||
);
|
||||
return Err(anyhow!("unrecognized block device format: {}", other));
|
||||
}
|
||||
};
|
||||
|
||||
self.qmp
|
||||
.execute(&qapi_qmp::blockdev_add(blockdev_options_raw))
|
||||
.execute(&qapi_qmp::blockdev_add(blockdev_options))
|
||||
.map_err(|e| anyhow!("blockdev-add backend {:?}", e))
|
||||
.map(|_| ())?;
|
||||
|
||||
|
||||
@@ -84,16 +84,6 @@ impl ResourceManager {
|
||||
inner.handle_network(network_config).await
|
||||
}
|
||||
|
||||
pub async fn has_network_endpoints(&self) -> bool {
|
||||
let inner = self.inner.read().await;
|
||||
inner.has_network_endpoints().await
|
||||
}
|
||||
|
||||
pub async fn setup_network_in_guest(&self) -> Result<()> {
|
||||
let inner = self.inner.read().await;
|
||||
inner.setup_network_in_guest().await
|
||||
}
|
||||
|
||||
#[instrument]
|
||||
pub async fn setup_after_start_vm(&self) -> Result<()> {
|
||||
let mut inner = self.inner.write().await;
|
||||
|
||||
@@ -296,33 +296,6 @@ impl ResourceManagerInner {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn has_network_endpoints(&self) -> bool {
|
||||
if let Some(network) = &self.network {
|
||||
match network.interfaces().await {
|
||||
std::result::Result::Ok(interfaces) => !interfaces.is_empty(),
|
||||
Err(_) => false,
|
||||
}
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn setup_network_in_guest(&self) -> Result<()> {
|
||||
if let Some(network) = self.network.as_ref() {
|
||||
let network = network.as_ref();
|
||||
self.handle_interfaces(network)
|
||||
.await
|
||||
.context("handle interfaces during network rescan")?;
|
||||
self.handle_neighbours(network)
|
||||
.await
|
||||
.context("handle neighbours during network rescan")?;
|
||||
self.handle_routes(network)
|
||||
.await
|
||||
.context("handle routes during network rescan")?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn setup_after_start_vm(&mut self) -> Result<()> {
|
||||
self.cgroups_resource
|
||||
.setup_after_start_vm(self.hypervisor.as_ref())
|
||||
|
||||
@@ -137,8 +137,8 @@ impl Rootfs for BlockRootfs {
|
||||
Ok(vec![self.mount.clone()])
|
||||
}
|
||||
|
||||
async fn get_storage(&self) -> Option<Storage> {
|
||||
self.storage.clone()
|
||||
async fn get_storage(&self) -> Option<Vec<Storage>> {
|
||||
self.storage.clone().map(|s| vec![s])
|
||||
}
|
||||
|
||||
async fn get_device_id(&self) -> Result<Option<String>> {
|
||||
|
||||
640
src/runtime-rs/crates/resource/src/rootfs/erofs_rootfs.rs
Normal file
640
src/runtime-rs/crates/resource/src/rootfs/erofs_rootfs.rs
Normal file
@@ -0,0 +1,640 @@
|
||||
// Copyright (c) 2026 Ant Group
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Handle multi-layer EROFS rootfs:
|
||||
// Mount[0]: ext4 rw layer -> virtio-blk device (writable)
|
||||
// Mount[1]: erofs with device= -> virtio-blk via VMDK (read-only)
|
||||
// Mount[2]: overlay (format/mkdir/overlay) -> host mount OR guest agent
|
||||
// The overlay mount may be handled by the guest agent if it contains "{{"
|
||||
// templates in upperdir/workdir.
|
||||
|
||||
use super::{Rootfs, ROOTFS};
|
||||
use crate::share_fs::{do_get_guest_path, do_get_host_path};
|
||||
use agent::Storage;
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use async_trait::async_trait;
|
||||
use hypervisor::{
|
||||
device::{
|
||||
device_manager::{do_handle_device, get_block_device_info, DeviceManager},
|
||||
DeviceConfig, DeviceType,
|
||||
},
|
||||
BlockConfig, BlockDeviceAio, BlockDeviceFormat,
|
||||
};
|
||||
use kata_types::config::hypervisor::{
|
||||
VIRTIO_BLK_CCW, VIRTIO_BLK_MMIO, VIRTIO_BLK_PCI, VIRTIO_PMEM, VIRTIO_SCSI,
|
||||
};
|
||||
use kata_types::mount::Mount;
|
||||
use oci_spec::runtime as oci;
|
||||
use std::fs;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
/// EROFS rootfs type identifier
|
||||
pub(crate) const EROFS_ROOTFS_TYPE: &str = "erofs";
|
||||
/// RW layer rootfs type identifier, used for multi-layer EROFS as the writable upper layer
|
||||
/// Typically ext4 format, but can be extended to other fs types in the future.
|
||||
pub(crate) const RW_LAYER_ROOTFS_TYPE: &str = "ext4";
|
||||
/// File name of the VMDK descriptor for the merged EROFS image
|
||||
const EROFS_MERGED_VMDK: &str = "merged_fs.vmdk";
|
||||
/// Maximum number of virtio-blk devices allowed
|
||||
const MAX_VIRTIO_BLK_DEVICES: usize = 10;
|
||||
/// Maximum sectors per 2GB extent (2GB / 512 bytes per sector)
|
||||
const MAX_2GB_EXTENT_SECTORS: u64 = 0x8000_0000 >> 9;
|
||||
/// Sectors per track for VMDK geometry
|
||||
const SECTORS_PER_TRACK: u64 = 63;
|
||||
/// Number of heads for VMDK geometry
|
||||
const NUMBER_HEADS: u64 = 16;
|
||||
/// VMDK subformat type (twoGbMaxExtentFlat for large files)
|
||||
const VMDK_SUBFORMAT: &str = "twoGbMaxExtentFlat";
|
||||
/// VMDK adapter type
|
||||
const VMDK_ADAPTER_TYPE: &str = "ide";
|
||||
/// VMDK hardware version
|
||||
const VMDK_HW_VERSION: &str = "4";
|
||||
/// Default shared directory for guest rootfs VMDK files (for multi-layer EROFS)
|
||||
const DEFAULT_KATA_GUEST_ROOT_SHARED_FS: &str = "/run/kata-containers/";
|
||||
/// Template for mkdir option in overlay mount (X-containerd.mkdir.path)
|
||||
const X_CONTAINERD_MKDIR_PATH: &str = "X-containerd.mkdir.path=";
|
||||
/// Template for mkdir option passed to guest agent (X-kata.mkdir.path)
|
||||
const X_KATA_MKDIR_PATH: &str = "X-kata.mkdir.path=";
|
||||
|
||||
/// Generate merged VMDK file from multiple EROFS devices
|
||||
///
|
||||
/// Creates a VMDK descriptor that combines multiple EROFS images into a single
|
||||
/// virtual block device (flatten device). For a single device, the EROFS image
|
||||
/// is used directly without a VMDK wrapper.
|
||||
///
|
||||
/// And `erofs_devices` are for host paths to EROFS image files (from `source` and `device=` options)
|
||||
async fn generate_merged_erofs_vmdk(
|
||||
sid: &str,
|
||||
cid: &str,
|
||||
erofs_devices: &[String],
|
||||
) -> Result<(String, BlockDeviceFormat)> {
|
||||
if erofs_devices.is_empty() {
|
||||
return Err(anyhow!("no EROFS devices provided"));
|
||||
}
|
||||
|
||||
// Validate all device paths exist and are regular files before proceeding.
|
||||
for dev_path in erofs_devices {
|
||||
let metadata = fs::metadata(dev_path)
|
||||
.context(format!("EROFS device path not accessible: {}", dev_path))?;
|
||||
if !metadata.is_file() {
|
||||
return Err(anyhow!(
|
||||
"EROFS device path is not a regular file: {}",
|
||||
dev_path
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
// For single device, use it directly with Raw format (no need for VMDK descriptor)
|
||||
if erofs_devices.len() == 1 {
|
||||
info!(
|
||||
sl!(),
|
||||
"single EROFS device, using directly with Raw format: {}", erofs_devices[0]
|
||||
);
|
||||
return Ok((erofs_devices[0].clone(), BlockDeviceFormat::Raw));
|
||||
}
|
||||
|
||||
// For multiple devices, create VMDK descriptor
|
||||
let sandbox_dir = PathBuf::from(DEFAULT_KATA_GUEST_ROOT_SHARED_FS).join(sid);
|
||||
let container_dir = sandbox_dir.join(cid);
|
||||
fs::create_dir_all(&container_dir).context(format!(
|
||||
"failed to create container directory: {}",
|
||||
container_dir.display()
|
||||
))?;
|
||||
|
||||
let vmdk_path = container_dir.join(EROFS_MERGED_VMDK);
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"creating VMDK descriptor for {} EROFS devices: {}",
|
||||
erofs_devices.len(),
|
||||
vmdk_path.display()
|
||||
);
|
||||
|
||||
// create_vmdk_descriptor uses atomic write (temp + rename) internally,
|
||||
// so a failure will not leave a corrupt descriptor file.
|
||||
create_vmdk_descriptor(&vmdk_path, erofs_devices)
|
||||
.context("failed to create VMDK descriptor")?;
|
||||
|
||||
Ok((vmdk_path.display().to_string(), BlockDeviceFormat::Vmdk))
|
||||
}
|
||||
|
||||
/// Create VMDK descriptor for multiple EROFS extents (flatten device)
|
||||
///
|
||||
/// Generates a VMDK descriptor file (twoGbMaxExtentFlat format) that references
|
||||
/// multiple EROFS images as flat extents, allowing them to be treated as a single
|
||||
/// contiguous block device in the VM.
|
||||
fn create_vmdk_descriptor(vmdk_path: &Path, erofs_paths: &[String]) -> Result<()> {
|
||||
if erofs_paths.is_empty() {
|
||||
return Err(anyhow!(
|
||||
"empty EROFS path list, cannot create VMDK descriptor"
|
||||
));
|
||||
}
|
||||
|
||||
// collect extent information without writing anything.
|
||||
struct ExtentInfo {
|
||||
path: String,
|
||||
total_sectors: u64,
|
||||
}
|
||||
|
||||
let mut extents: Vec<ExtentInfo> = Vec::with_capacity(erofs_paths.len());
|
||||
let mut total_sectors: u64 = 0;
|
||||
|
||||
for erofs_path in erofs_paths {
|
||||
let metadata = fs::metadata(erofs_path)
|
||||
.context(format!("failed to stat EROFS file: {}", erofs_path))?;
|
||||
|
||||
let file_size = metadata.len();
|
||||
if file_size == 0 {
|
||||
warn!(sl!(), "EROFS file {} is zero-length, skipping", erofs_path);
|
||||
continue;
|
||||
}
|
||||
|
||||
// round up to whole sectors to avoid losing tail bytes on non-aligned files.
|
||||
// VMDK extents are measured in 512-byte sectors; a file that is not sector-aligned
|
||||
// still needs the last partial sector to be addressable by the VM.
|
||||
let sectors = file_size.div_ceil(512);
|
||||
|
||||
if file_size % 512 != 0 {
|
||||
warn!(
|
||||
sl!(),
|
||||
"EROFS file {} size ({} bytes) is not 512-byte aligned, \
|
||||
rounding up to {} sectors ({} bytes addressable)",
|
||||
erofs_path,
|
||||
file_size,
|
||||
sectors,
|
||||
sectors * 512
|
||||
);
|
||||
}
|
||||
|
||||
total_sectors = total_sectors.checked_add(sectors).ok_or_else(|| {
|
||||
anyhow!(
|
||||
"total sector count overflow when adding {} ({} sectors)",
|
||||
erofs_path,
|
||||
sectors
|
||||
)
|
||||
})?;
|
||||
|
||||
extents.push(ExtentInfo {
|
||||
path: erofs_path.clone(),
|
||||
total_sectors: sectors,
|
||||
});
|
||||
}
|
||||
|
||||
if total_sectors == 0 {
|
||||
return Err(anyhow!(
|
||||
"no valid EROFS files to create VMDK descriptor (all files are empty)"
|
||||
));
|
||||
}
|
||||
|
||||
// write descriptor to a temp file, then atomically rename.
|
||||
let tmp_path = vmdk_path.with_extension("vmdk.tmp");
|
||||
|
||||
let file = fs::File::create(&tmp_path).context(format!(
|
||||
"failed to create temp VMDK file: {}",
|
||||
tmp_path.display()
|
||||
))?;
|
||||
let mut writer = BufWriter::new(file);
|
||||
|
||||
// Header
|
||||
writeln!(writer, "# Disk DescriptorFile")?;
|
||||
writeln!(writer, "version=1")?;
|
||||
writeln!(writer, "CID=fffffffe")?;
|
||||
writeln!(writer, "parentCID=ffffffff")?;
|
||||
writeln!(writer, "createType=\"{}\"", VMDK_SUBFORMAT)?;
|
||||
writeln!(writer)?;
|
||||
|
||||
// Extent descriptions
|
||||
writeln!(writer, "# Extent description")?;
|
||||
for extent in &extents {
|
||||
let mut remaining = extent.total_sectors;
|
||||
let mut file_offset: u64 = 0;
|
||||
|
||||
while remaining > 0 {
|
||||
let chunk = remaining.min(MAX_2GB_EXTENT_SECTORS);
|
||||
writeln!(
|
||||
writer,
|
||||
"RW {} FLAT \"{}\" {}",
|
||||
chunk, extent.path, file_offset
|
||||
)?;
|
||||
file_offset += chunk;
|
||||
remaining -= chunk;
|
||||
}
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"VMDK extent: {} ({} sectors, {} extent chunk(s))",
|
||||
extent.path,
|
||||
extent.total_sectors,
|
||||
extent.total_sectors.div_ceil(MAX_2GB_EXTENT_SECTORS)
|
||||
);
|
||||
}
|
||||
writeln!(writer)?;
|
||||
|
||||
// Disk Data Base (DDB)
|
||||
// Geometry: cylinders = ceil(total_sectors / (sectors_per_track * heads))
|
||||
let cylinders = total_sectors.div_ceil(SECTORS_PER_TRACK * NUMBER_HEADS);
|
||||
|
||||
writeln!(writer, "# The Disk Data Base")?;
|
||||
writeln!(writer, "#DDB")?;
|
||||
writeln!(writer)?;
|
||||
writeln!(writer, "ddb.virtualHWVersion = \"{}\"", VMDK_HW_VERSION)?;
|
||||
writeln!(writer, "ddb.geometry.cylinders = \"{}\"", cylinders)?;
|
||||
writeln!(writer, "ddb.geometry.heads = \"{}\"", NUMBER_HEADS)?;
|
||||
writeln!(writer, "ddb.geometry.sectors = \"{}\"", SECTORS_PER_TRACK)?;
|
||||
writeln!(writer, "ddb.adapterType = \"{}\"", VMDK_ADAPTER_TYPE)?;
|
||||
|
||||
// Flush the BufWriter to ensure all data is written before rename.
|
||||
writer.flush().context("failed to flush VMDK descriptor")?;
|
||||
// Explicitly drop to close the file handle before rename.
|
||||
drop(writer);
|
||||
|
||||
// atomic rename: tmp -> final path.
|
||||
fs::rename(&tmp_path, vmdk_path).context(format!(
|
||||
"failed to rename temp VMDK {} -> {}",
|
||||
tmp_path.display(),
|
||||
vmdk_path.display()
|
||||
))?;
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"VMDK descriptor created: {} (total {} sectors, {} extents, {} cylinders)",
|
||||
vmdk_path.display(),
|
||||
total_sectors,
|
||||
extents.len(),
|
||||
cylinders
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Extract block device information from hypervisor device info
|
||||
fn extract_block_device_info(
|
||||
device_info: &DeviceType,
|
||||
block_driver: &str,
|
||||
) -> Result<(String, String, String)> {
|
||||
if let DeviceType::Block(device) = device_info {
|
||||
let blk_driver = device.config.driver_option.clone();
|
||||
let device_id = device.device_id.clone();
|
||||
|
||||
// Use virt_path as guest device path (e.g., /dev/vda)
|
||||
// pci_path is PCI address (e.g., 02/00) which is not a valid mount source
|
||||
let guest_path = match block_driver {
|
||||
VIRTIO_BLK_PCI | VIRTIO_BLK_MMIO | VIRTIO_BLK_CCW => {
|
||||
// virt_path is the correct guest device path for all virtio-blk types
|
||||
if device.config.virt_path.is_empty() {
|
||||
return Err(anyhow!("virt_path is empty for block device"));
|
||||
}
|
||||
device.config.virt_path.clone()
|
||||
}
|
||||
VIRTIO_SCSI | VIRTIO_PMEM => {
|
||||
return Err(anyhow!(
|
||||
"Block driver {} not fully supported for EROFS",
|
||||
block_driver
|
||||
));
|
||||
}
|
||||
_ => {
|
||||
return Err(anyhow!("Unknown block driver: {}", block_driver));
|
||||
}
|
||||
};
|
||||
|
||||
Ok((device_id, guest_path, blk_driver))
|
||||
} else {
|
||||
Err(anyhow!("Expected block device, got {:?}", device_info))
|
||||
}
|
||||
}
|
||||
|
||||
/// EROFS Multi-Layer Rootfs with overlay support
|
||||
///
|
||||
/// Handles the EROFS Multi-Layer where rootfs consists of:
|
||||
/// - Mount[0]: ext4 rw layer (writable container layer) -> virtio-blk device
|
||||
/// - Mount[1]: erofs layers (fsmeta + flattened layers) -> virtio-blk via VMDK
|
||||
/// - Mount[2]: overlay (to combine ext4 upper + erofs lower)
|
||||
pub(crate) struct ErofsMultiLayerRootfs {
|
||||
guest_path: String,
|
||||
device_ids: Vec<String>,
|
||||
mount: oci::Mount,
|
||||
rwlayer_storage: Option<Storage>, // Writable layer storage (upper layer), typically ext4
|
||||
erofs_storage: Option<Storage>,
|
||||
/// Path to generated VMDK descriptor (only set when multiple EROFS devices are merged)
|
||||
vmdk_path: Option<PathBuf>,
|
||||
}
|
||||
|
||||
impl ErofsMultiLayerRootfs {
|
||||
pub async fn new(
|
||||
device_manager: &RwLock<DeviceManager>,
|
||||
sid: &str,
|
||||
cid: &str,
|
||||
rootfs_mounts: &[Mount],
|
||||
_share_fs: &Option<Arc<dyn crate::share_fs::ShareFs>>,
|
||||
) -> Result<Self> {
|
||||
let container_path = do_get_guest_path(ROOTFS, cid, false, false);
|
||||
let host_path = do_get_host_path(ROOTFS, sid, cid, false, false);
|
||||
|
||||
fs::create_dir_all(&host_path)
|
||||
.map_err(|e| anyhow!("failed to create rootfs dir {}: {:?}", host_path, e))?;
|
||||
|
||||
let mut device_ids = Vec::new();
|
||||
let mut rwlayer_storage: Option<Storage> = None;
|
||||
let mut erofs_storage: Option<Storage> = None;
|
||||
let mut vmdk_path: Option<PathBuf> = None;
|
||||
|
||||
// Directories to create (X-containerd.mkdir.path)
|
||||
let mut mkdir_dirs: Vec<String> = Vec::new();
|
||||
|
||||
let blkdev_info = get_block_device_info(device_manager).await;
|
||||
let block_driver = blkdev_info.block_device_driver.clone();
|
||||
|
||||
// Process each mount in rootfs_mounts to set up devices and storages
|
||||
for mount in rootfs_mounts {
|
||||
match mount.fs_type.as_str() {
|
||||
fmt if fmt.eq_ignore_ascii_case(RW_LAYER_ROOTFS_TYPE) => {
|
||||
// Mount[0]: rw layer -> virtio-blk device /dev/vdX1
|
||||
info!(
|
||||
sl!(),
|
||||
"multi-layer erofs: adding rw layer: {}", mount.source
|
||||
);
|
||||
|
||||
let device_config = &mut BlockConfig {
|
||||
driver_option: block_driver.clone(),
|
||||
format: BlockDeviceFormat::Raw, // rw layer should be raw format
|
||||
path_on_host: mount.source.clone(),
|
||||
blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let device_info = do_handle_device(
|
||||
device_manager,
|
||||
&DeviceConfig::BlockCfg(device_config.clone()),
|
||||
)
|
||||
.await
|
||||
.context("failed to attach rw block device")?;
|
||||
|
||||
let (device_id, guest_path, blk_driver) =
|
||||
extract_block_device_info(&device_info, &block_driver)?;
|
||||
info!(
|
||||
sl!(),
|
||||
"writable block device attached - device_id: {} guest_path: {}",
|
||||
device_id,
|
||||
guest_path
|
||||
);
|
||||
|
||||
// Filter out "loop" option which is not needed in VM (device is already /dev/vdX)
|
||||
let mut options: Vec<String> = mount
|
||||
.options
|
||||
.iter()
|
||||
.filter(|o| *o != "loop")
|
||||
.cloned()
|
||||
.collect();
|
||||
|
||||
// RW layer is the writable upper layer (marked with X-kata.overlay-upper)
|
||||
options.push("X-kata.overlay-upper".to_string());
|
||||
options.push("X-kata.multi-layer=true".to_string());
|
||||
|
||||
// Set up storage for rw layer (upper layer)
|
||||
rwlayer_storage = Some(Storage {
|
||||
driver: blk_driver,
|
||||
source: guest_path.clone(),
|
||||
fs_type: RW_LAYER_ROOTFS_TYPE.to_string(),
|
||||
mount_point: container_path.clone(),
|
||||
options,
|
||||
..Default::default()
|
||||
});
|
||||
|
||||
device_ids.push(device_id);
|
||||
}
|
||||
fmt if fmt.eq_ignore_ascii_case(EROFS_ROOTFS_TYPE) => {
|
||||
// Mount[1]: erofs layers -> virtio-blk via VMDK /dev/vdX2
|
||||
info!(
|
||||
sl!(),
|
||||
"multi-layer erofs: adding erofs layers: {}", mount.source
|
||||
);
|
||||
|
||||
// Collect all EROFS devices: source + `device=` options
|
||||
let mut erofs_devices = vec![mount.source.clone()];
|
||||
for opt in &mount.options {
|
||||
if let Some(device_path) = opt.strip_prefix("device=") {
|
||||
erofs_devices.push(device_path.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
info!(sl!(), "EROFS devices count: {}", erofs_devices.len());
|
||||
|
||||
// Generate merged VMDK file from all EROFS devices
|
||||
// Returns (path, format) - format is Vmdk for multiple devices, Raw for single device
|
||||
let (erofs_path, erofs_format) =
|
||||
generate_merged_erofs_vmdk(sid, cid, &erofs_devices)
|
||||
.await
|
||||
.context("failed to generate EROFS VMDK")?;
|
||||
|
||||
// Track VMDK path for cleanup (only when VMDK is actually created)
|
||||
if erofs_format == BlockDeviceFormat::Vmdk {
|
||||
vmdk_path = Some(PathBuf::from(&erofs_path));
|
||||
}
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"EROFS block device config - path: {}, format: {:?}",
|
||||
erofs_path,
|
||||
erofs_format
|
||||
);
|
||||
|
||||
let device_config = &mut BlockConfig {
|
||||
driver_option: block_driver.clone(),
|
||||
format: erofs_format, // Vmdk for multiple devices, Raw for single device
|
||||
path_on_host: erofs_path,
|
||||
blkdev_aio: BlockDeviceAio::new(&blkdev_info.block_device_aio),
|
||||
is_readonly: true, // EROFS layer is read-only
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let device_info = do_handle_device(
|
||||
device_manager,
|
||||
&DeviceConfig::BlockCfg(device_config.clone()),
|
||||
)
|
||||
.await
|
||||
.context("failed to attach erofs block device")?;
|
||||
|
||||
let (device_id, guest_path, blk_driver) =
|
||||
extract_block_device_info(&device_info, &block_driver)?;
|
||||
info!(
|
||||
sl!(),
|
||||
"erofs device attached - device_id: {} guest_path: {}",
|
||||
device_id,
|
||||
guest_path
|
||||
);
|
||||
|
||||
let mut options: Vec<String> = mount
|
||||
.options
|
||||
.iter()
|
||||
.filter(|o| {
|
||||
// Filter out options that are not valid erofs mount parameters:
|
||||
// 1. "loop" - not needed in VM, device is already /dev/vdX
|
||||
// 2. "device=" prefix - used for VMDK generation only, not for mount
|
||||
// 3. "X-kata." prefix - metadata markers for kata internals
|
||||
*o != "loop" && !o.starts_with("device=") && !o.starts_with("X-kata.")
|
||||
})
|
||||
.cloned()
|
||||
.collect();
|
||||
|
||||
// Erofs layers are read-only lower layers (marked with X-kata.overlay-lower)
|
||||
options.push("X-kata.overlay-lower".to_string());
|
||||
options.push("X-kata.multi-layer=true".to_string());
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
"erofs storage options filtered: {:?} -> {:?}", mount.options, options
|
||||
);
|
||||
|
||||
erofs_storage = Some(Storage {
|
||||
driver: blk_driver,
|
||||
source: guest_path.clone(),
|
||||
fs_type: EROFS_ROOTFS_TYPE.to_string(),
|
||||
mount_point: container_path.clone(),
|
||||
options,
|
||||
..Default::default()
|
||||
});
|
||||
|
||||
device_ids.push(device_id);
|
||||
}
|
||||
fmt if fmt.eq_ignore_ascii_case("overlay")
|
||||
|| fmt.eq_ignore_ascii_case("format/overlay")
|
||||
|| fmt.eq_ignore_ascii_case("format/mkdir/overlay") =>
|
||||
{
|
||||
// Mount[2]: overlay to combine rwlayer (upper) + erofs (lower)
|
||||
info!(
|
||||
sl!(),
|
||||
"multi-layer erofs: parsing overlay mount, options: {:?}", mount.options
|
||||
);
|
||||
|
||||
// Parse mkdir options (X-containerd.mkdir.path)
|
||||
for opt in &mount.options {
|
||||
if let Some(mkdir_spec) = opt.strip_prefix(X_CONTAINERD_MKDIR_PATH) {
|
||||
// Keep the full spec (path:mode or path:mode:uid:gid) for guest agent
|
||||
mkdir_dirs.push(mkdir_spec.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
info!(
|
||||
sl!(),
|
||||
"multi-layer erofs: ignoring unknown mount type: {}", mount.fs_type
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if device_ids.is_empty() {
|
||||
return Err(anyhow!("no devices attached for multi-layer erofs rootfs"));
|
||||
}
|
||||
|
||||
// Check device count limit
|
||||
if device_ids.len() > MAX_VIRTIO_BLK_DEVICES {
|
||||
return Err(anyhow!(
|
||||
"exceeded maximum virtio disk count: {} > {}",
|
||||
device_ids.len(),
|
||||
MAX_VIRTIO_BLK_DEVICES
|
||||
));
|
||||
}
|
||||
|
||||
// Add mkdir directives to rwlayer storage options for guest agent
|
||||
if let Some(ref mut rwlayer) = rwlayer_storage {
|
||||
rwlayer.options.extend(
|
||||
mkdir_dirs
|
||||
.iter()
|
||||
.map(|dir| format!("{}{}", X_KATA_MKDIR_PATH, dir)),
|
||||
);
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
guest_path: container_path,
|
||||
device_ids,
|
||||
mount: oci::Mount::default(),
|
||||
rwlayer_storage,
|
||||
erofs_storage,
|
||||
vmdk_path,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Rootfs for ErofsMultiLayerRootfs {
|
||||
async fn get_guest_rootfs_path(&self) -> Result<String> {
|
||||
Ok(self.guest_path.clone())
|
||||
}
|
||||
|
||||
async fn get_rootfs_mount(&self) -> Result<Vec<oci::Mount>> {
|
||||
Ok(vec![self.mount.clone()])
|
||||
}
|
||||
|
||||
async fn get_storage(&self) -> Option<Vec<Storage>> {
|
||||
// Return all storages for multi-layer EROFS (rw layer + erofs layer) to guest agent.
|
||||
// Guest agent needs both to create overlay mount
|
||||
let mut storages = Vec::new();
|
||||
|
||||
if let Some(rwlayer) = self.rwlayer_storage.clone() {
|
||||
storages.push(rwlayer);
|
||||
}
|
||||
|
||||
if let Some(erofs) = self.erofs_storage.clone() {
|
||||
storages.push(erofs);
|
||||
}
|
||||
|
||||
if storages.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(storages)
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_device_id(&self) -> Result<Option<String>> {
|
||||
Ok(self.device_ids.first().cloned())
|
||||
}
|
||||
|
||||
async fn cleanup(&self, device_manager: &RwLock<DeviceManager>) -> Result<()> {
|
||||
let mut dm = device_manager.write().await;
|
||||
for device_id in &self.device_ids {
|
||||
dm.try_remove_device(device_id).await?;
|
||||
}
|
||||
|
||||
// Clean up generated VMDK descriptor file if it exists (only for multi-device case)
|
||||
if let Some(ref vmdk) = self.vmdk_path {
|
||||
if vmdk.exists() {
|
||||
if let Err(e) = fs::remove_file(vmdk) {
|
||||
warn!(
|
||||
sl!(),
|
||||
"failed to remove VMDK descriptor {}: {}",
|
||||
vmdk.display(),
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if mounts represent multi-layer EROFS rootfs(with or without `device=` options):
|
||||
/// - Must have at least 2 mounts (rw layer + erofs layer)
|
||||
/// - Multi-layer: erofs with `device=` options
|
||||
/// - Single-layer: erofs without `device=` options (just layer.erofs)
|
||||
pub fn is_erofs_multi_layer(rootfs_mounts: &[Mount]) -> bool {
|
||||
if rootfs_mounts.len() < 2 {
|
||||
return false;
|
||||
}
|
||||
|
||||
let has_rwlayer = rootfs_mounts.iter().any(|m| {
|
||||
m.fs_type.eq_ignore_ascii_case(RW_LAYER_ROOTFS_TYPE) && m.options.iter().any(|o| o == "rw")
|
||||
});
|
||||
|
||||
let has_erofs = rootfs_mounts
|
||||
.iter()
|
||||
.any(|m| m.fs_type.eq_ignore_ascii_case(EROFS_ROOTFS_TYPE));
|
||||
|
||||
// Must have rwlayer + erofs (multi-layer or single-layer)
|
||||
has_rwlayer && has_erofs
|
||||
}
|
||||
@@ -11,6 +11,7 @@ use anyhow::{anyhow, Context, Result};
|
||||
use async_trait::async_trait;
|
||||
use kata_types::mount::Mount;
|
||||
mod block_rootfs;
|
||||
mod erofs_rootfs;
|
||||
pub mod virtual_volume;
|
||||
|
||||
use hypervisor::{device::device_manager::DeviceManager, Hypervisor};
|
||||
@@ -19,8 +20,11 @@ use virtual_volume::{is_kata_virtual_volume, VirtualVolume};
|
||||
use std::{collections::HashMap, sync::Arc, vec::Vec};
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
use self::{block_rootfs::is_block_rootfs, nydus_rootfs::NYDUS_ROOTFS_TYPE};
|
||||
use crate::share_fs::ShareFs;
|
||||
use self::{
|
||||
block_rootfs::is_block_rootfs, erofs_rootfs::ErofsMultiLayerRootfs,
|
||||
nydus_rootfs::NYDUS_ROOTFS_TYPE,
|
||||
};
|
||||
use crate::{rootfs::erofs_rootfs::is_erofs_multi_layer, share_fs::ShareFs};
|
||||
use oci_spec::runtime as oci;
|
||||
|
||||
const ROOTFS: &str = "rootfs";
|
||||
@@ -31,7 +35,7 @@ const TYPE_OVERLAY_FS: &str = "overlay";
|
||||
pub trait Rootfs: Send + Sync {
|
||||
async fn get_guest_rootfs_path(&self) -> Result<String>;
|
||||
async fn get_rootfs_mount(&self) -> Result<Vec<oci::Mount>>;
|
||||
async fn get_storage(&self) -> Option<Storage>;
|
||||
async fn get_storage(&self) -> Option<Vec<Storage>>;
|
||||
async fn cleanup(&self, device_manager: &RwLock<DeviceManager>) -> Result<()>;
|
||||
async fn get_device_id(&self) -> Result<Option<String>>;
|
||||
}
|
||||
@@ -90,9 +94,26 @@ impl RootFsResource {
|
||||
Err(anyhow!("share fs is unavailable"))
|
||||
}
|
||||
}
|
||||
mounts_vec if is_single_layer_rootfs(mounts_vec) => {
|
||||
_ if is_erofs_multi_layer(rootfs_mounts) => {
|
||||
info!(
|
||||
sl!(),
|
||||
"handling multi-layer erofs rootfs with {} mounts",
|
||||
rootfs_mounts.len()
|
||||
);
|
||||
|
||||
let multi_layer =
|
||||
ErofsMultiLayerRootfs::new(device_manager, sid, cid, rootfs_mounts, share_fs)
|
||||
.await
|
||||
.context("new multi-layer erofs rootfs")?;
|
||||
|
||||
let ret = Arc::new(multi_layer);
|
||||
let mut inner = self.inner.write().await;
|
||||
inner.rootfs.push(ret.clone());
|
||||
Ok(ret)
|
||||
}
|
||||
_ if is_single_layer_rootfs(rootfs_mounts) => {
|
||||
// Safe as single_layer_rootfs must have one layer
|
||||
let layer = &mounts_vec[0];
|
||||
let layer = &rootfs_mounts[0];
|
||||
let mut inner = self.inner.write().await;
|
||||
|
||||
if is_guest_pull_volume(share_fs, layer) {
|
||||
|
||||
@@ -149,8 +149,8 @@ impl Rootfs for NydusRootfs {
|
||||
Ok(vec![])
|
||||
}
|
||||
|
||||
async fn get_storage(&self) -> Option<Storage> {
|
||||
Some(self.rootfs.clone())
|
||||
async fn get_storage(&self) -> Option<Vec<Storage>> {
|
||||
Some(vec![self.rootfs.clone()])
|
||||
}
|
||||
|
||||
async fn get_device_id(&self) -> Result<Option<String>> {
|
||||
|
||||
@@ -73,7 +73,7 @@ impl Rootfs for ShareFsRootfs {
|
||||
todo!()
|
||||
}
|
||||
|
||||
async fn get_storage(&self) -> Option<Storage> {
|
||||
async fn get_storage(&self) -> Option<Vec<Storage>> {
|
||||
None
|
||||
}
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ use oci_spec::runtime as oci;
|
||||
use serde_json;
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
use agent::Storage;
|
||||
use hypervisor::device::device_manager::DeviceManager;
|
||||
use kata_types::{
|
||||
annotations,
|
||||
@@ -184,8 +185,8 @@ impl super::Rootfs for VirtualVolume {
|
||||
Ok(vec![])
|
||||
}
|
||||
|
||||
async fn get_storage(&self) -> Option<agent::Storage> {
|
||||
Some(self.storages[0].clone())
|
||||
async fn get_storage(&self) -> Option<Vec<Storage>> {
|
||||
Some(self.storages.clone())
|
||||
}
|
||||
|
||||
async fn get_device_id(&self) -> Result<Option<String>> {
|
||||
|
||||
@@ -53,9 +53,6 @@ linux_container = { workspace = true, optional = true }
|
||||
virt_container = { workspace = true, optional = true }
|
||||
wasm_container = { workspace = true, optional = true }
|
||||
|
||||
[dev-dependencies]
|
||||
rstest = { workspace = true }
|
||||
|
||||
[features]
|
||||
default = ["virt"]
|
||||
linux = ["linux_container"]
|
||||
|
||||
@@ -51,13 +51,6 @@ pub trait Sandbox: Send + Sync {
|
||||
shim_pid: u32,
|
||||
) -> Result<()>;
|
||||
|
||||
/// Re-scan the network namespace for late-discovered endpoints.
|
||||
/// This handles runtimes like Docker 26+ that configure networking
|
||||
/// after the Start response. The default implementation is a no-op.
|
||||
async fn rescan_network(&self) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// metrics function
|
||||
async fn agent_metrics(&self) -> Result<String>;
|
||||
async fn hypervisor_metrics(&self) -> Result<String>;
|
||||
|
||||
@@ -69,53 +69,6 @@ use crate::{
|
||||
tracer::{KataTracer, ROOTSPAN},
|
||||
};
|
||||
|
||||
const DOCKER_LIBNETWORK_SETKEY: &str = "libnetwork-setkey";
|
||||
|
||||
const DOCKER_NETNS_PREFIXES: &[&str] = &["/var/run/docker/netns/", "/run/docker/netns/"];
|
||||
|
||||
/// A Docker sandbox ID is accepted only when it is exactly 64 bytes long and
/// every byte is a lowercase hexadecimal digit (`0-9`, `a-f`).
fn is_valid_docker_sandbox_id(id: &str) -> bool {
    const SANDBOX_ID_LEN: usize = 64;

    if id.len() != SANDBOX_ID_LEN {
        return false;
    }

    id.bytes()
        .all(|b| b.is_ascii_digit() || (b'a'..=b'f').contains(&b))
}
|
||||
|
||||
/// Discover Docker's pre-created network namespace path from OCI spec hooks.
|
||||
///
|
||||
/// Docker's libnetwork-setkey hook contains the sandbox ID as its
|
||||
/// argument following "libnetwork-setkey", which maps to a netns file
|
||||
/// under /var/run/docker/netns/<sandbox_id> or /run/docker/netns/<sandbox_id>.
|
||||
fn docker_netns_path(spec: &oci::Spec) -> Option<String> {
|
||||
let hooks = spec.hooks().as_ref()?;
|
||||
|
||||
let hook_sets: [&[oci::Hook]; 2] = [
|
||||
hooks.prestart().as_deref().unwrap_or_default(),
|
||||
hooks.create_runtime().as_deref().unwrap_or_default(),
|
||||
];
|
||||
|
||||
for hooks in &hook_sets {
|
||||
for hook in *hooks {
|
||||
if let Some(args) = hook.args() {
|
||||
for (i, arg) in args.iter().enumerate() {
|
||||
if arg == DOCKER_LIBNETWORK_SETKEY && i + 1 < args.len() {
|
||||
let sandbox_id = &args[i + 1];
|
||||
if !is_valid_docker_sandbox_id(sandbox_id) {
|
||||
continue;
|
||||
}
|
||||
for prefix in DOCKER_NETNS_PREFIXES {
|
||||
let ns_path = format!("{}{}", prefix, sandbox_id);
|
||||
if let Ok(metadata) = std::fs::symlink_metadata(&ns_path) {
|
||||
if metadata.is_file() {
|
||||
return Some(ns_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
fn convert_string_to_slog_level(string_level: &str) -> slog::Level {
|
||||
match string_level {
|
||||
"trace" => slog::Level::Trace,
|
||||
@@ -424,17 +377,8 @@ impl RuntimeHandlerManager {
|
||||
if ns.path().is_some() {
|
||||
netns = ns.path().clone().map(|p| p.display().to_string());
|
||||
}
|
||||
// Docker 26+ may configure networking outside of the OCI
|
||||
// spec namespace path. Try to discover the netns from hook
|
||||
// args before falling back to creating a placeholder.
|
||||
else if let Some(docker_ns) = docker_netns_path(spec) {
|
||||
info!(
|
||||
sl!(),
|
||||
"discovered Docker network namespace from hook args";
|
||||
"netns" => &docker_ns
|
||||
);
|
||||
netns = Some(docker_ns);
|
||||
} else {
|
||||
// if we get empty netns from oci spec, we need to create netns for the VM
|
||||
else {
|
||||
let ns_name = generate_netns_name();
|
||||
let raw_netns = NetNs::new(ns_name)?;
|
||||
let path = Some(PathBuf::from(raw_netns.path()).display().to_string());
|
||||
@@ -695,7 +639,6 @@ impl RuntimeHandlerManager {
|
||||
Ok(TaskResponse::WaitProcess(exit_status))
|
||||
}
|
||||
TaskRequest::StartProcess(process_id) => {
|
||||
let is_sandbox_container = cm.is_sandbox_container(&process_id).await;
|
||||
let shim_pid = cm
|
||||
.start_process(&process_id)
|
||||
.await
|
||||
@@ -704,25 +647,6 @@ impl RuntimeHandlerManager {
|
||||
let pid = shim_pid.pid;
|
||||
let process_type = process_id.process_type;
|
||||
let container_id = process_id.container_id().to_string();
|
||||
|
||||
// Schedule an async network rescan for sandbox containers.
|
||||
// This handles runtimes that configure networking after the
|
||||
// Start response (e.g. Docker 26+). rescan_network is
|
||||
// idempotent — it returns immediately if endpoints already
|
||||
// exist.
|
||||
if is_sandbox_container {
|
||||
let sandbox_rescan = sandbox.clone();
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = sandbox_rescan.rescan_network().await {
|
||||
error!(
|
||||
sl!(),
|
||||
"async network rescan failed — container may lack networking: {:?}",
|
||||
e
|
||||
);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
tokio::spawn(async move {
|
||||
let result = sandbox.wait_process(cm, process_id, pid).await;
|
||||
if let Err(e) = result {
|
||||
@@ -996,85 +920,3 @@ fn configure_non_root_hypervisor(config: &mut Hypervisor) -> Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use oci_spec::runtime::{HookBuilder, HooksBuilder, SpecBuilder};
    use rstest::rstest;

    /// 64 lowercase hex characters: a well-formed Docker sandbox ID.
    const VALID_SANDBOX_ID: &str =
        "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2";

    #[rstest]
    #[case::all_lowercase_hex(VALID_SANDBOX_ID, true)]
    #[case::all_zeros("0000000000000000000000000000000000000000000000000000000000000000", true)]
    #[case::uppercase_hex("A1B2C3D4E5F6A1B2C3D4E5F6A1B2C3D4E5F6A1B2C3D4E5F6A1B2C3D4E5F6A1B2", false)]
    #[case::too_short("a1b2c3d4", false)]
    #[case::non_hex("zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", false)]
    #[case::path_traversal("../../../etc/passwd", false)]
    #[case::empty("", false)]
    fn test_is_valid_docker_sandbox_id(#[case] id: &str, #[case] expected: bool) {
        assert_eq!(is_valid_docker_sandbox_id(id), expected);
    }

    /// Build a hook at a dummy path that carries `args`.
    fn make_hook_with_args(args: Vec<&str>) -> oci::Hook {
        let owned_args: Vec<String> = args.iter().map(|arg| arg.to_string()).collect();
        HookBuilder::default()
            .path("/usr/bin/test")
            .args(owned_args)
            .build()
            .unwrap()
    }

    /// Hooks with a single `prestart` hook carrying `args`.
    fn prestart_hooks(args: Vec<&str>) -> oci::Hooks {
        HooksBuilder::default()
            .prestart(vec![make_hook_with_args(args)])
            .build()
            .unwrap()
    }

    /// Hooks with a single `createRuntime` hook carrying `args`.
    fn create_runtime_hooks(args: Vec<&str>) -> oci::Hooks {
        HooksBuilder::default()
            .create_runtime(vec![make_hook_with_args(args)])
            .build()
            .unwrap()
    }

    #[rstest]
    #[case::no_hooks(None, None)]
    #[case::unrelated_hooks(Some(prestart_hooks(vec!["some-hook", "arg1"])), None)]
    #[case::invalid_sandbox_id(
        Some(prestart_hooks(vec!["/usr/bin/dockerd", "libnetwork-setkey", "not-a-valid-id"])),
        None
    )]
    #[case::setkey_at_end_of_args(
        Some(prestart_hooks(vec!["/usr/bin/dockerd", "libnetwork-setkey"])),
        None
    )]
    #[case::valid_prestart_but_no_file(
        Some(prestart_hooks(vec!["/usr/bin/dockerd", "libnetwork-setkey", VALID_SANDBOX_ID])),
        None
    )]
    #[case::valid_create_runtime_but_no_file(
        Some(create_runtime_hooks(vec!["/usr/bin/dockerd", "libnetwork-setkey", VALID_SANDBOX_ID])),
        None
    )]
    fn test_docker_netns_path(
        #[case] hooks: Option<oci::Hooks>,
        #[case] expected: Option<String>,
    ) {
        let builder = match hooks {
            Some(h) => SpecBuilder::default().hooks(h),
            None => SpecBuilder::default(),
        };
        let spec = builder.build().unwrap();
        assert_eq!(docker_netns_path(&spec), expected);
    }
}
|
||||
|
||||
@@ -167,8 +167,8 @@ impl Container {
|
||||
);
|
||||
|
||||
let mut storages = vec![];
|
||||
if let Some(storage) = rootfs.get_storage().await {
|
||||
storages.push(storage);
|
||||
if let Some(mut storage_list) = rootfs.get_storage().await {
|
||||
storages.append(&mut storage_list);
|
||||
}
|
||||
inner.rootfs.push(rootfs);
|
||||
|
||||
|
||||
@@ -58,7 +58,6 @@ use resource::{ResourceConfig, ResourceManager};
|
||||
use runtime_spec as spec;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use strum::Display;
|
||||
use tokio::sync::{mpsc::Sender, Mutex, RwLock};
|
||||
use tracing::instrument;
|
||||
@@ -974,71 +973,6 @@ impl Sandbox for VirtSandbox {
|
||||
self.hypervisor.get_hypervisor_metrics().await
|
||||
}
|
||||
|
||||
async fn rescan_network(&self) -> Result<()> {
|
||||
let config = self.resource_manager.config().await;
|
||||
if config.runtime.disable_new_netns {
|
||||
return Ok(());
|
||||
}
|
||||
if dan_config_path(&config, &self.sid).exists() {
|
||||
return Ok(());
|
||||
}
|
||||
if self.resource_manager.has_network_endpoints().await {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let sandbox_config = match &self.sandbox_config {
|
||||
Some(c) => c,
|
||||
None => return Ok(()),
|
||||
};
|
||||
let netns_path = match &sandbox_config.network_env.netns {
|
||||
Some(p) => p.clone(),
|
||||
None => return Ok(()),
|
||||
};
|
||||
|
||||
const MAX_WAIT: Duration = Duration::from_secs(5);
|
||||
const POLL_INTERVAL: Duration = Duration::from_millis(50);
|
||||
let deadline = tokio::time::Instant::now() + MAX_WAIT;
|
||||
|
||||
info!(sl!(), "waiting for network interfaces in namespace");
|
||||
|
||||
loop {
|
||||
let network_config = NetworkConfig::NetNs(NetworkWithNetNsConfig {
|
||||
network_model: config.runtime.internetworking_model.clone(),
|
||||
netns_path: netns_path.clone(),
|
||||
queues: self
|
||||
.hypervisor
|
||||
.hypervisor_config()
|
||||
.await
|
||||
.network_info
|
||||
.network_queues as usize,
|
||||
network_created: sandbox_config.network_env.network_created,
|
||||
});
|
||||
|
||||
if let Err(e) = self.resource_manager.handle_network(network_config).await {
|
||||
warn!(sl!(), "network rescan attempt failed: {:?}", e);
|
||||
}
|
||||
|
||||
if self.resource_manager.has_network_endpoints().await {
|
||||
info!(sl!(), "network interfaces discovered during rescan");
|
||||
return self
|
||||
.resource_manager
|
||||
.setup_network_in_guest()
|
||||
.await
|
||||
.context("setup network in guest after rescan");
|
||||
}
|
||||
|
||||
if tokio::time::Instant::now() >= deadline {
|
||||
warn!(
|
||||
sl!(),
|
||||
"no network interfaces found after timeout — networking may be configured later"
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
tokio::time::sleep(POLL_INTERVAL).await;
|
||||
}
|
||||
}
|
||||
|
||||
async fn set_policy(&self, policy: &str) -> Result<()> {
|
||||
if policy.is_empty() {
|
||||
debug!(sl!(), "sb: set_policy skipped without policy");
|
||||
|
||||
@@ -26,6 +26,10 @@ use shim::{config, Args, Error, ShimExecutor};
|
||||
const DEFAULT_TOKIO_RUNTIME_WORKER_THREADS: usize = 2;
|
||||
// env to config tokio runtime worker threads
|
||||
const ENV_TOKIO_RUNTIME_WORKER_THREADS: &str = "TOKIO_RUNTIME_WORKER_THREADS";
|
||||
// RUNTIME_ALLOW_MOUNTS are the custom mount types allowed by the runtime. These
|
||||
// types should not be handled by the mount manager.
|
||||
// To include prepare mount types, use "/*" suffix, such as "format/*"
|
||||
pub const RUNTIME_ALLOW_MOUNTS: &str = "containerd.io/runtime-allow-mounts";
|
||||
|
||||
#[derive(Debug)]
|
||||
enum Action {
|
||||
@@ -134,6 +138,10 @@ fn show_info() -> Result<()> {
|
||||
let mut info = RuntimeInfo::new();
|
||||
info.name = config::CONTAINERD_RUNTIME_NAME.to_string();
|
||||
info.version = Some(version).into();
|
||||
info.annotations.insert(
|
||||
RUNTIME_ALLOW_MOUNTS.to_string(),
|
||||
"mkdir/*,format/*,erofs".to_string(),
|
||||
);
|
||||
|
||||
let data = info
|
||||
.write_to_bytes()
|
||||
|
||||
@@ -143,7 +143,13 @@ DEFROOTFSTYPE := $(ROOTFSTYPE_EXT4)
|
||||
FIRMWAREPATH :=
|
||||
FIRMWAREVOLUMEPATH :=
|
||||
|
||||
FIRMWAREPATH_NV = $(FIRMWAREPATH)
|
||||
FIRMWAREPATH_NV :=
|
||||
ifeq ($(ARCH),amd64)
|
||||
FIRMWAREPATH_NV := $(PREFIXDEPS)/share/$(EDK2_NAME)/OVMF.fd
|
||||
endif
|
||||
ifeq ($(ARCH),arm64)
|
||||
FIRMWAREPATH_NV := $(PREFIXDEPS)/share/$(EDK2_NAME)/AAVMF_CODE.fd
|
||||
endif
|
||||
|
||||
FIRMWARETDVFPATH := $(PREFIXDEPS)/share/ovmf/OVMF.inteltdx.fd
|
||||
FIRMWARETDVFPATH_NV := $(FIRMWARETDVFPATH)
|
||||
|
||||
@@ -166,4 +166,14 @@ impl yaml::K8sResource for CronJob {
|
||||
fn get_sysctls(&self) -> Vec<pod::Sysctl> {
|
||||
yaml::get_sysctls(&self.spec.jobTemplate.spec.template.spec.securityContext)
|
||||
}
|
||||
|
||||
/// Pod-level security context of the CronJob's job-template pod spec, if set.
fn get_pod_security_context(&self) -> Option<&pod::PodSecurityContext> {
    let pod_spec = &self.spec.jobTemplate.spec.template.spec;
    pod_spec.securityContext.as_ref()
}
|
||||
}
|
||||
|
||||
@@ -167,4 +167,8 @@ impl yaml::K8sResource for DaemonSet {
|
||||
fn get_sysctls(&self) -> Vec<pod::Sysctl> {
|
||||
yaml::get_sysctls(&self.spec.template.spec.securityContext)
|
||||
}
|
||||
|
||||
/// Pod-level security context of the DaemonSet's pod template, if set.
fn get_pod_security_context(&self) -> Option<&pod::PodSecurityContext> {
    let pod_spec = &self.spec.template.spec;
    pod_spec.securityContext.as_ref()
}
|
||||
}
|
||||
|
||||
@@ -178,4 +178,8 @@ impl yaml::K8sResource for Deployment {
|
||||
fn get_sysctls(&self) -> Vec<pod::Sysctl> {
|
||||
yaml::get_sysctls(&self.spec.template.spec.securityContext)
|
||||
}
|
||||
|
||||
/// Pod-level security context of the Deployment's pod template, if set.
fn get_pod_security_context(&self) -> Option<&pod::PodSecurityContext> {
    let pod_spec = &self.spec.template.spec;
    pod_spec.securityContext.as_ref()
}
|
||||
}
|
||||
|
||||
@@ -167,6 +167,10 @@ impl yaml::K8sResource for Job {
|
||||
fn get_sysctls(&self) -> Vec<pod::Sysctl> {
|
||||
yaml::get_sysctls(&self.spec.template.spec.securityContext)
|
||||
}
|
||||
|
||||
/// Pod-level security context of the Job's pod template, if set.
fn get_pod_security_context(&self) -> Option<&pod::PodSecurityContext> {
    let pod_spec = &self.spec.template.spec;
    pod_spec.securityContext.as_ref()
}
|
||||
}
|
||||
|
||||
pub fn pod_name_regex(job_name: String) -> String {
|
||||
|
||||
@@ -114,10 +114,12 @@ pub fn get_mount_and_storage(
|
||||
|
||||
if let Some(emptyDir) = &yaml_volume.emptyDir {
|
||||
let settings_volumes = &settings.volumes;
|
||||
let volume = match emptyDir.medium.as_deref() {
|
||||
Some("Memory") => &settings_volumes.emptyDir_memory,
|
||||
_ if settings.cluster_config.encrypted_emptydir => &settings_volumes.emptyDir_encrypted,
|
||||
_ => &settings_volumes.emptyDir,
|
||||
let (volume, block_encrypted_emptydir) = match emptyDir.medium.as_deref() {
|
||||
Some("Memory") => (&settings_volumes.emptyDir_memory, false),
|
||||
_ if settings.cluster_config.encrypted_emptydir => {
|
||||
(&settings_volumes.emptyDir_encrypted, true)
|
||||
}
|
||||
_ => (&settings_volumes.emptyDir, false),
|
||||
};
|
||||
|
||||
get_empty_dir_mount_and_storage(
|
||||
@@ -127,6 +129,7 @@ pub fn get_mount_and_storage(
|
||||
yaml_mount,
|
||||
volume,
|
||||
pod_security_context,
|
||||
block_encrypted_emptydir,
|
||||
);
|
||||
} else if yaml_volume.persistentVolumeClaim.is_some() || yaml_volume.azureFile.is_some() {
|
||||
get_shared_bind_mount(yaml_mount, p_mounts, "rprivate", "rw");
|
||||
@@ -150,18 +153,42 @@ fn get_empty_dir_mount_and_storage(
|
||||
yaml_mount: &pod::VolumeMount,
|
||||
settings_empty_dir: &settings::EmptyDirVolume,
|
||||
pod_security_context: &Option<pod::PodSecurityContext>,
|
||||
block_encrypted_emptydir: bool,
|
||||
) {
|
||||
debug!("Settings emptyDir: {:?}", settings_empty_dir);
|
||||
|
||||
if yaml_mount.subPathExpr.is_none() {
|
||||
let mut options = settings_empty_dir.options.clone();
|
||||
if let Some(gid) = pod_security_context.as_ref().and_then(|sc| sc.fsGroup) {
|
||||
// This matches the runtime behavior of only setting the fsgid if the mountpoint GID is not 0.
|
||||
// https://github.com/kata-containers/kata-containers/blob/b69da5f3ba8385c5833b31db41a846a203812675/src/runtime/virtcontainers/kata_agent.go#L1602-L1607
|
||||
if gid != 0 {
|
||||
options.push(format!("fsgid={gid}"));
|
||||
// Pod fsGroup in policy must mirror how the shim encodes it on Storage:
|
||||
// - block-encrypted host emptyDirs become virtio-blk/scsi volumes; the runtime sets
|
||||
// Storage.fs_group from mount metadata (handleDeviceBlockVolume in kata_agent.go).
|
||||
// - shared-fs / guest-local emptyDirs use Storage.options: the runtime appends
|
||||
// fsgid=<host GID> when the volume is not root-owned (handleEphemeralStorage and
|
||||
// handleLocalStorage in kata_agent.go). Genpolicy uses pod fsGroup when non-zero as
|
||||
// the usual kubelet-applied GID for that stat.
|
||||
let pod_gid = pod_security_context.as_ref().and_then(|sc| sc.fsGroup);
|
||||
let fs_group = if block_encrypted_emptydir {
|
||||
match pod_gid {
|
||||
Some(gid) if gid > 0 => protobuf::MessageField::some(agent::FSGroup {
|
||||
group_id: u32::try_from(gid).unwrap_or_else(|_| {
|
||||
panic!(
|
||||
"get_empty_dir_mount_and_storage: securityContext.fsGroup {gid} \
|
||||
must be <= {}",
|
||||
u32::MAX
|
||||
)
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
_ => protobuf::MessageField::none(),
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if let Some(gid) = pod_gid {
|
||||
if gid != 0 {
|
||||
options.push(format!("fsgid={gid}"));
|
||||
}
|
||||
}
|
||||
protobuf::MessageField::none()
|
||||
};
|
||||
storages.push(agent::Storage {
|
||||
driver: settings_empty_dir.driver.clone(),
|
||||
driver_options: settings_empty_dir.driver_options.clone(),
|
||||
@@ -173,7 +200,7 @@ fn get_empty_dir_mount_and_storage(
|
||||
} else {
|
||||
settings_empty_dir.mount_point.clone()
|
||||
},
|
||||
fs_group: protobuf::MessageField::none(),
|
||||
fs_group,
|
||||
shared: settings_empty_dir.shared,
|
||||
special_fields: ::protobuf::SpecialFields::new(),
|
||||
});
|
||||
|
||||
@@ -937,6 +937,10 @@ impl yaml::K8sResource for Pod {
|
||||
fn get_sysctls(&self) -> Vec<Sysctl> {
|
||||
yaml::get_sysctls(&self.spec.securityContext)
|
||||
}
|
||||
|
||||
/// The Pod's own pod-level security context, if set.
fn get_pod_security_context(&self) -> Option<&PodSecurityContext> {
    let pod_spec = &self.spec;
    pod_spec.securityContext.as_ref()
}
|
||||
}
|
||||
|
||||
impl Container {
|
||||
|
||||
@@ -971,6 +971,16 @@ impl AgentPolicy {
|
||||
);
|
||||
}
|
||||
|
||||
yaml::apply_pod_fs_group_and_supplemental_groups(
|
||||
&mut process,
|
||||
resource.get_pod_security_context(),
|
||||
is_pause_container,
|
||||
);
|
||||
debug!(
|
||||
"get_container_process: after apply_pod_fs_group_and_supplemental_groups: User = {:?}",
|
||||
&process.User
|
||||
);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
// Container-level settings from user's YAML.
|
||||
yaml_container.get_process_fields(&mut process);
|
||||
|
||||
@@ -128,4 +128,8 @@ impl yaml::K8sResource for ReplicaSet {
|
||||
fn get_sysctls(&self) -> Vec<pod::Sysctl> {
|
||||
yaml::get_sysctls(&self.spec.template.spec.securityContext)
|
||||
}
|
||||
|
||||
/// Borrows the pod template's `spec.securityContext`, or `None` when it is not set.
fn get_pod_security_context(&self) -> Option<&pod::PodSecurityContext> {
    let security_context = &self.spec.template.spec.securityContext;
    security_context.as_ref()
}
|
||||
}
|
||||
|
||||
@@ -131,4 +131,8 @@ impl yaml::K8sResource for ReplicationController {
|
||||
fn get_sysctls(&self) -> Vec<pod::Sysctl> {
|
||||
yaml::get_sysctls(&self.spec.template.spec.securityContext)
|
||||
}
|
||||
|
||||
/// Borrows the pod template's `spec.securityContext`, or `None` when it is not set.
fn get_pod_security_context(&self) -> Option<&pod::PodSecurityContext> {
    let security_context = &self.spec.template.spec.securityContext;
    security_context.as_ref()
}
|
||||
}
|
||||
|
||||
@@ -211,6 +211,10 @@ impl yaml::K8sResource for StatefulSet {
|
||||
fn get_sysctls(&self) -> Vec<pod::Sysctl> {
|
||||
yaml::get_sysctls(&self.spec.template.spec.securityContext)
|
||||
}
|
||||
|
||||
/// Borrows the pod template's `spec.securityContext`, or `None` when it is not set.
fn get_pod_security_context(&self) -> Option<&pod::PodSecurityContext> {
    let security_context = &self.spec.template.spec.securityContext;
    security_context.as_ref()
}
|
||||
}
|
||||
|
||||
impl StatefulSet {
|
||||
|
||||
@@ -107,6 +107,10 @@ pub trait K8sResource {
|
||||
// for some of the K8s resource types.
|
||||
}
|
||||
|
||||
/// Default implementation: reports no pod-level security context.
/// Resource types that carry a pod spec override this method.
fn get_pod_security_context(&self) -> Option<&pod::PodSecurityContext> {
    Option::None
}
|
||||
|
||||
/// Default implementation: returns an empty list of sysctls.
fn get_sysctls(&self) -> Vec<pod::Sysctl> {
    Vec::new()
}
|
||||
@@ -388,6 +392,39 @@ fn handle_unused_field(path: &str, silent_unsupported_fields: bool) {
|
||||
}
|
||||
}
|
||||
|
||||
/// Applies pod `fsGroup` and `supplementalGroups` to `AdditionalGids`.
|
||||
pub fn apply_pod_fs_group_and_supplemental_groups(
|
||||
process: &mut policy::KataProcess,
|
||||
security_context: Option<&pod::PodSecurityContext>,
|
||||
is_pause_container: bool,
|
||||
) {
|
||||
if is_pause_container {
|
||||
return;
|
||||
}
|
||||
let Some(context) = security_context else {
|
||||
return;
|
||||
};
|
||||
|
||||
if let Some(fs_group) = context.fsGroup {
|
||||
let gid: u32 = fs_group.try_into().unwrap();
|
||||
process.User.AdditionalGids.insert(gid);
|
||||
debug!(
|
||||
"apply_pod_fs_group_and_supplemental_groups: inserted fs_group = {gid} into AdditionalGids, User = {:?}",
|
||||
&process.User
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(supplemental_groups) = &context.supplementalGroups {
|
||||
supplemental_groups.iter().for_each(|g| {
|
||||
process.User.AdditionalGids.insert(*g);
|
||||
});
|
||||
debug!(
|
||||
"apply_pod_fs_group_and_supplemental_groups: inserted supplementalGroups = {:?} into AdditionalGids, User = {:?}",
|
||||
&supplemental_groups, &process.User
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_process_fields(
|
||||
process: &mut policy::KataProcess,
|
||||
must_check_passwd: &mut bool,
|
||||
@@ -447,27 +484,6 @@ pub fn get_process_fields(
|
||||
*must_check_passwd = false;
|
||||
}
|
||||
|
||||
if !is_pause_container {
|
||||
if let Some(fs_group) = context.fsGroup {
|
||||
let gid = fs_group.try_into().unwrap();
|
||||
process.User.AdditionalGids.insert(gid);
|
||||
debug!(
|
||||
"get_process_fields: inserted fs_group = {gid} into AdditionalGids, User = {:?}",
|
||||
&process.User
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(supplemental_groups) = &context.supplementalGroups {
|
||||
supplemental_groups.iter().for_each(|g| {
|
||||
process.User.AdditionalGids.insert(*g);
|
||||
});
|
||||
debug!(
|
||||
"get_process_fields: inserted supplementalGroups = {:?} into AdditionalGids, User = {:?}",
|
||||
&supplemental_groups, &process.User
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(allow) = context.allowPrivilegeEscalation {
|
||||
process.NoNewPrivileges = !allow
|
||||
}
|
||||
|
||||
@@ -345,12 +345,12 @@
|
||||
"driver_options": [
|
||||
"encryption_key=ephemeral"
|
||||
],
|
||||
"fs_group": null,
|
||||
"fs_group": {
|
||||
"group_id": 1000
|
||||
},
|
||||
"fstype": "ext4",
|
||||
"mount_point": "/run/kata-containers/sandbox/storage/MDAvMDA=",
|
||||
"options": [
|
||||
"fsgid=1000"
|
||||
],
|
||||
"options": [],
|
||||
"source": "00/00",
|
||||
"shared": true
|
||||
}
|
||||
|
||||
@@ -22,10 +22,17 @@ setup() {
|
||||
pod_name="sharevol-kata"
|
||||
pod_logs_file=""
|
||||
setup_common || die "setup_common failed"
|
||||
yaml_file="${pod_config_dir}/pod-empty-dir.yaml"
|
||||
|
||||
# Add policy to yaml
|
||||
policy_settings_dir="$(create_tmp_policy_settings_dir "${pod_config_dir}")"
|
||||
add_requests_to_policy_settings "${policy_settings_dir}" "ReadStreamRequest"
|
||||
}
|
||||
|
||||
@test "Empty dir volumes" {
|
||||
local yaml_file
|
||||
local mount_command
|
||||
local dd_command
|
||||
|
||||
yaml_file="${pod_config_dir}/pod-empty-dir.yaml"
|
||||
|
||||
mount_command=(sh -c "mount | grep cache")
|
||||
add_exec_to_policy_settings "${policy_settings_dir}" "${mount_command[@]}"
|
||||
@@ -33,11 +40,9 @@ setup() {
|
||||
dd_command=(sh -c "dd if=/dev/zero of=/tmp/cache/file1 bs=1M count=50; echo $?")
|
||||
add_exec_to_policy_settings "${policy_settings_dir}" "${dd_command[@]}"
|
||||
|
||||
add_requests_to_policy_settings "${policy_settings_dir}" "ReadStreamRequest"
|
||||
# Add policy to yaml
|
||||
auto_generate_policy "${policy_settings_dir}" "${yaml_file}"
|
||||
}
|
||||
|
||||
@test "Empty dir volumes" {
|
||||
# Create the pod
|
||||
kubectl create -f "${yaml_file}"
|
||||
|
||||
@@ -55,20 +60,25 @@ setup() {
|
||||
local agnhost_name
|
||||
local agnhost_version
|
||||
local gid
|
||||
local image
|
||||
local logs
|
||||
local pod_file
|
||||
local pod_yaml
|
||||
local pod_yaml_in
|
||||
local uid
|
||||
|
||||
# This is a reproducer of k8s e2e "[sig-storage] EmptyDir volumes when FSGroup is specified [LinuxOnly] [NodeFeature:FSGroup] new files should be created with FSGroup ownership when container is non-root" test
|
||||
pod_file="${pod_config_dir}/pod-empty-dir-fsgroup.yaml"
|
||||
pod_yaml_in="${pod_config_dir}/pod-empty-dir-fsgroup.yaml.in"
|
||||
pod_yaml="${pod_config_dir}/pod-empty-dir-fsgroup.yaml"
|
||||
agnhost_name="${container_images_agnhost_name}"
|
||||
agnhost_version="${container_images_agnhost_version}"
|
||||
image="${agnhost_name}:${agnhost_version}"
|
||||
export AGNHOST_IMAGE="${agnhost_name}:${agnhost_version}"
|
||||
|
||||
envsubst '${AGNHOST_IMAGE}' <"${pod_yaml_in}" >"${pod_yaml}"
|
||||
|
||||
# Add policy to yaml
|
||||
auto_generate_policy "${policy_settings_dir}" "${pod_yaml}"
|
||||
|
||||
# Try to avoid timeout by prefetching the image.
|
||||
sed -e "s#\${agnhost_image}#${image}#" "$pod_file" |\
|
||||
kubectl create -f -
|
||||
kubectl create -f "${pod_yaml}"
|
||||
cmd="kubectl get pods ${pod_name} | grep Completed"
|
||||
waitForProcess "${wait_time}" "${sleep_time}" "${cmd}"
|
||||
|
||||
@@ -90,6 +100,7 @@ setup() {
|
||||
|
||||
teardown() {
|
||||
[ ! -f "$pod_logs_file" ] || rm -f "$pod_logs_file"
|
||||
[[ -n "${pod_config_dir:-}" ]] && rm -f "${pod_config_dir}/pod-empty-dir-fsgroup.yaml"
|
||||
|
||||
delete_tmp_policy_settings_dir "${policy_settings_dir}"
|
||||
teardown_common "${node}" "${node_start_time:-}"
|
||||
|
||||
@@ -10,6 +10,7 @@ load "${BATS_TEST_DIRNAME}/confidential_common.sh"
|
||||
|
||||
export KATA_HYPERVISOR="${KATA_HYPERVISOR:-qemu-nvidia-gpu}"
|
||||
|
||||
# when using hostPath, ensure directory is writable by container user
|
||||
export LOCAL_NIM_CACHE="/opt/nim/.cache"
|
||||
|
||||
SKIP_MULTI_GPU_TESTS=${SKIP_MULTI_GPU_TESTS:-false}
|
||||
|
||||
@@ -16,14 +16,18 @@ metadata:
|
||||
# cc_init_data annotation will be added by genpolicy with CDH configuration
|
||||
# from the custom default-initdata.toml created by create_nim_initdata_file()
|
||||
spec:
|
||||
# Explicit user/group/supplementary groups to support nydus guest-pull.
|
||||
# See issue https://github.com/kata-containers/kata-containers/issues/11162 and
|
||||
# other references to this issue in the genpolicy source folder.
|
||||
securityContext:
|
||||
runAsUser: 1000
|
||||
runAsGroup: 1000
|
||||
fsGroup: 1000
|
||||
supplementalGroups: [4, 20, 24, 25, 27, 29, 30, 44, 46]
|
||||
restartPolicy: Never
|
||||
runtimeClassName: kata
|
||||
imagePullSecrets:
|
||||
- name: ngc-secret-instruct
|
||||
securityContext:
|
||||
runAsUser: 0
|
||||
runAsGroup: 0
|
||||
fsGroup: 0
|
||||
containers:
|
||||
- name: ${POD_NAME_INSTRUCT}
|
||||
image: nvcr.io/nim/meta/llama-3.1-8b-instruct:1.13.1
|
||||
|
||||
@@ -14,10 +14,6 @@ spec:
|
||||
runtimeClassName: kata
|
||||
imagePullSecrets:
|
||||
- name: ngc-secret-instruct
|
||||
securityContext:
|
||||
runAsUser: 0
|
||||
runAsGroup: 0
|
||||
fsGroup: 0
|
||||
containers:
|
||||
- name: ${POD_NAME_INSTRUCT}
|
||||
image: nvcr.io/nim/meta/llama-3.1-8b-instruct:1.13.1
|
||||
|
||||
@@ -16,15 +16,18 @@ metadata:
|
||||
# cc_init_data annotation will be added by genpolicy with CDH configuration
|
||||
# from the custom default-initdata.toml created by create_nim_initdata_file()
|
||||
spec:
|
||||
# Explicit user/group/supplementary groups to support nydus guest-pull.
|
||||
# See issue https://github.com/kata-containers/kata-containers/issues/11162 and
|
||||
# other references to this issue in the genpolicy source folder.
|
||||
securityContext:
|
||||
runAsUser: 1000
|
||||
runAsGroup: 1000
|
||||
fsGroup: 1000
|
||||
restartPolicy: Always
|
||||
runtimeClassName: kata
|
||||
serviceAccountName: default
|
||||
imagePullSecrets:
|
||||
- name: ngc-secret-embedqa
|
||||
securityContext:
|
||||
fsGroup: 0
|
||||
runAsGroup: 0
|
||||
runAsUser: 0
|
||||
containers:
|
||||
- name: ${POD_NAME_EMBEDQA}
|
||||
image: nvcr.io/nim/nvidia/llama-3.2-nv-embedqa-1b-v2:1.10.1
|
||||
|
||||
@@ -10,15 +10,16 @@ metadata:
|
||||
labels:
|
||||
app: ${POD_NAME_EMBEDQA}
|
||||
spec:
|
||||
# unlike the instruct manifest, this image needs securityContext to
|
||||
# avoid NVML/GPU permission failures
|
||||
securityContext:
|
||||
runAsUser: 1000
|
||||
runAsGroup: 1000
|
||||
restartPolicy: Always
|
||||
runtimeClassName: kata
|
||||
serviceAccountName: default
|
||||
imagePullSecrets:
|
||||
- name: ngc-secret-embedqa
|
||||
securityContext:
|
||||
fsGroup: 0
|
||||
runAsGroup: 0
|
||||
runAsUser: 0
|
||||
containers:
|
||||
- name: ${POD_NAME_EMBEDQA}
|
||||
image: nvcr.io/nim/nvidia/llama-3.2-nv-embedqa-1b-v2:1.10.1
|
||||
|
||||
@@ -15,7 +15,7 @@ spec:
|
||||
fsGroup: 123
|
||||
containers:
|
||||
- name: mounttest-container
|
||||
image: ${agnhost_image}
|
||||
image: ${AGNHOST_IMAGE}
|
||||
command:
|
||||
- /agnhost
|
||||
args:
|
||||
@@ -28,7 +28,7 @@ spec:
|
||||
- name: emptydir-volume
|
||||
mountPath: /test-volume
|
||||
- name: mounttest-container-2
|
||||
image: ${agnhost_image}
|
||||
image: ${AGNHOST_IMAGE}
|
||||
command:
|
||||
- /agnhost
|
||||
args:
|
||||
@@ -148,9 +148,7 @@ install_genpolicy_drop_ins() {
|
||||
# 20-* OCI version overlay
|
||||
if [[ "${KATA_HOST_OS:-}" == "cbl-mariner" ]]; then
|
||||
cp "${examples_dir}/20-oci-1.2.0-drop-in.json" "${settings_d}/"
|
||||
elif is_k3s_or_rke2; then
|
||||
cp "${examples_dir}/20-oci-1.2.1-drop-in.json" "${settings_d}/"
|
||||
elif is_nvidia_gpu_platform || [[ "${KATA_HYPERVISOR}" == "qemu-snp" ]] || [[ "${KATA_HYPERVISOR}" == "qemu-tdx" ]] || [[ -n "${CONTAINER_ENGINE_VERSION:-}" ]]; then
|
||||
elif is_k3s_or_rke2 || is_nvidia_gpu_platform || [[ "${KATA_HYPERVISOR}" == "qemu-snp" ]] || [[ "${KATA_HYPERVISOR}" == "qemu-tdx" ]] || [[ -n "${CONTAINER_ENGINE_VERSION:-}" ]]; then
|
||||
cp "${examples_dir}/20-oci-1.3.0-drop-in.json" "${settings_d}/"
|
||||
fi
|
||||
|
||||
|
||||
@@ -64,6 +64,7 @@ memdisk
|
||||
pmem
|
||||
Sharedfs
|
||||
Initdata
|
||||
fsmerged
|
||||
|
||||
# Networking & Communication
|
||||
netns
|
||||
|
||||
@@ -7,17 +7,17 @@
|
||||
|
||||
FROM golang:1.24-alpine AS nydus-binary-downloader
|
||||
|
||||
# Keep the version here aligned with "nydus-snapshotter.version"
|
||||
# in versions.yaml
|
||||
ARG NYDUS_SNAPSHOTTER_VERSION=v0.15.13
|
||||
ARG NYDUS_SNAPSHOTTER_REPO=https://github.com/containerd/nydus-snapshotter
|
||||
COPY versions.yaml /tmp/versions.yaml
|
||||
|
||||
RUN \
|
||||
set -e && \
|
||||
apk add --no-cache curl yq-go && \
|
||||
NYDUS_SNAPSHOTTER_VERSION="$(yq eval -e '.externals.nydus-snapshotter.version | explode(.)' /tmp/versions.yaml)" && \
|
||||
NYDUS_SNAPSHOTTER_REPO="$(yq eval -e '.externals.nydus-snapshotter.url | explode(.)' /tmp/versions.yaml)" && \
|
||||
mkdir -p /opt/nydus-snapshotter && \
|
||||
ARCH="$(uname -m)" && \
|
||||
if [ "${ARCH}" = "x86_64" ]; then ARCH=amd64 ; fi && \
|
||||
if [ "${ARCH}" = "aarch64" ]; then ARCH=arm64; fi && \
|
||||
apk add --no-cache curl && \
|
||||
curl -fOL --progress-bar "${NYDUS_SNAPSHOTTER_REPO}/releases/download/${NYDUS_SNAPSHOTTER_VERSION}/nydus-snapshotter-${NYDUS_SNAPSHOTTER_VERSION}-linux-${ARCH}.tar.gz" && \
|
||||
tar xvzpf "nydus-snapshotter-${NYDUS_SNAPSHOTTER_VERSION}-linux-${ARCH}.tar.gz" -C /opt/nydus-snapshotter && \
|
||||
rm "nydus-snapshotter-${NYDUS_SNAPSHOTTER_VERSION}-linux-${ARCH}.tar.gz"
|
||||
@@ -49,10 +49,13 @@ RUN \
|
||||
apt-get clean && rm -rf /var/lib/apt/lists/ && \
|
||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain ${RUST_TOOLCHAIN}
|
||||
|
||||
WORKDIR /kata-deploy
|
||||
# Build from the repository root so kata-deploy uses the root Cargo workspace:
|
||||
# docker build -f tools/packaging/kata-deploy/Dockerfile .
|
||||
WORKDIR /kata
|
||||
|
||||
# Copy standalone binary project
|
||||
COPY binary /kata-deploy
|
||||
COPY Cargo.toml Cargo.lock ./
|
||||
COPY src ./src
|
||||
COPY tools/packaging/kata-deploy/binary ./tools/packaging/kata-deploy/binary
|
||||
|
||||
# Install target and run tests based on architecture
|
||||
# - AMD64/arm64: use musl for fully static binaries
|
||||
@@ -93,23 +96,23 @@ RUN \
|
||||
RUN \
|
||||
rust_target="$(cat /tmp/rust_target)"; \
|
||||
echo "Running binary tests with target ${rust_target}..." && \
|
||||
RUSTFLAGS="-D warnings" cargo test --target "${rust_target}" -- --test-threads=1 && \
|
||||
RUSTFLAGS="-D warnings" cargo test -p kata-deploy --target "${rust_target}" -- --test-threads=1 && \
|
||||
echo "All tests passed!"
|
||||
|
||||
RUN \
|
||||
rust_target="$(cat /tmp/rust_target)"; \
|
||||
echo "Building kata-deploy binary for ${rust_target}..." && \
|
||||
RUSTFLAGS="-D warnings" cargo build --release --target "${rust_target}" && \
|
||||
RUSTFLAGS="-D warnings" cargo build --release -p kata-deploy --target "${rust_target}" && \
|
||||
mkdir -p /kata-deploy/bin && \
|
||||
cp "/kata-deploy/target/${rust_target}/release/kata-deploy" /kata-deploy/bin/kata-deploy && \
|
||||
cp "/kata/target/${rust_target}/release/kata-deploy" /kata-deploy/bin/kata-deploy && \
|
||||
echo "Cleaning up build artifacts to save disk space..." && \
|
||||
rm -rf /kata-deploy/target && \
|
||||
rm -rf /kata/target && \
|
||||
cargo clean
|
||||
|
||||
#### Extract kata artifacts
|
||||
FROM alpine:3.22 AS artifact-extractor
|
||||
|
||||
ARG KATA_ARTIFACTS=kata-static.tar.zst
|
||||
ARG KATA_ARTIFACTS=tools/packaging/kata-deploy/kata-static.tar.zst
|
||||
ARG DESTINATION=/opt/kata-artifacts
|
||||
|
||||
COPY ${KATA_ARTIFACTS} /tmp/
|
||||
@@ -222,11 +225,11 @@ COPY --from=runtime-assembler /output/lib/ /lib/
|
||||
COPY --from=runtime-assembler /output/lib64/ /lib64/
|
||||
|
||||
# Copy nydus snapshotter
|
||||
COPY nydus-snapshotter ${DESTINATION}/nydus-snapshotter
|
||||
COPY tools/packaging/kata-deploy/nydus-snapshotter ${DESTINATION}/nydus-snapshotter
|
||||
COPY --from=nydus-binary-downloader /opt/nydus-snapshotter/bin/containerd-nydus-grpc ${DESTINATION}/nydus-snapshotter/
|
||||
COPY --from=nydus-binary-downloader /opt/nydus-snapshotter/bin/nydus-overlayfs ${DESTINATION}/nydus-snapshotter/
|
||||
|
||||
# Copy runtimeclasses and node-feature-rules
|
||||
COPY node-feature-rules ${DESTINATION}/node-feature-rules
|
||||
COPY tools/packaging/kata-deploy/node-feature-rules ${DESTINATION}/node-feature-rules
|
||||
|
||||
ENTRYPOINT ["/usr/bin/kata-deploy"]
|
||||
|
||||
@@ -1,58 +1,38 @@
|
||||
[package]
|
||||
name = "kata-deploy"
|
||||
version = "0.1.0"
|
||||
authors.workspace = true
|
||||
edition = "2021"
|
||||
license.workspace = true
|
||||
rust-version = "1.90.0"
|
||||
authors = ["The Kata Containers community <kata-dev@lists.katacontainers.io>"]
|
||||
license = "Apache-2.0"
|
||||
|
||||
[[bin]]
|
||||
name = "kata-deploy"
|
||||
path = "src/main.rs"
|
||||
|
||||
[dependencies]
|
||||
# Error handling
|
||||
anyhow = "1.0"
|
||||
|
||||
# Logging
|
||||
log = "0.4"
|
||||
anyhow.workspace = true
|
||||
clap.workspace = true
|
||||
env_logger = "0.10"
|
||||
|
||||
# Command line parsing
|
||||
clap = { version = "4.5", features = ["derive"] }
|
||||
|
||||
# TOML parsing and manipulation
|
||||
toml_edit = "0.22"
|
||||
|
||||
# YAML parsing and manipulation
|
||||
serde_yaml = "0.9"
|
||||
|
||||
# Kubernetes API client
|
||||
kube = { version = "2.0", features = ["runtime", "derive"] }
|
||||
k8s-openapi = { version = "0.26", default-features = false, features = [
|
||||
"v1_33",
|
||||
] }
|
||||
|
||||
# System operations (using nsenter command instead of syscalls)
|
||||
libc = "0.2"
|
||||
|
||||
# JSON serialization
|
||||
serde_json = "1.0"
|
||||
|
||||
# File operations
|
||||
walkdir = "2"
|
||||
|
||||
# String manipulation
|
||||
regex = "1.10"
|
||||
|
||||
# Async runtime (required by kube-rs and for async main)
|
||||
tokio = { version = "1.38", features = [
|
||||
kube = { version = "2.0", features = ["runtime", "derive"] }
|
||||
libc.workspace = true
|
||||
log.workspace = true
|
||||
regex.workspace = true
|
||||
serde_json.workspace = true
|
||||
serde_yaml = "0.9"
|
||||
tokio = { workspace = true, features = [
|
||||
"rt-multi-thread",
|
||||
"macros",
|
||||
"signal",
|
||||
"time",
|
||||
] }
|
||||
toml_edit = "0.22"
|
||||
walkdir = "2"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3.8"
|
||||
rstest = "0.18"
|
||||
rstest.workspace = true
|
||||
serial_test.workspace = true
|
||||
tempfile.workspace = true
|
||||
|
||||
@@ -30,6 +30,20 @@ pub async fn configure_erofs_snapshotter(
|
||||
"[\"erofs\",\"walking\"]",
|
||||
)?;
|
||||
|
||||
//// Configure erofs differ plugin
|
||||
//// erofs-utils >= 1.8.2
|
||||
//toml_utils::set_toml_value(
|
||||
// configuration_file,
|
||||
// ".plugins.\"io.containerd.differ.v1.erofs\".mkfs_options",
|
||||
// "[\"-T0\",\"--mkfs-time\",\"--sort=none\"]",
|
||||
//)?;
|
||||
toml_utils::set_toml_value(
|
||||
configuration_file,
|
||||
".plugins.\"io.containerd.differ.v1.erofs\".enable_tar_index",
|
||||
"false",
|
||||
)?;
|
||||
|
||||
// Configure erofs snapshotter plugin
|
||||
toml_utils::set_toml_value(
|
||||
configuration_file,
|
||||
".plugins.\"io.containerd.snapshotter.v1.erofs\".enable_fsverity",
|
||||
@@ -40,6 +54,16 @@ pub async fn configure_erofs_snapshotter(
|
||||
".plugins.\"io.containerd.snapshotter.v1.erofs\".set_immutable",
|
||||
"true",
|
||||
)?;
|
||||
toml_utils::set_toml_value(
|
||||
configuration_file,
|
||||
".plugins.\"io.containerd.snapshotter.v1.erofs\".default_size",
|
||||
"\"10G\"",
|
||||
)?;
|
||||
toml_utils::set_toml_value(
|
||||
configuration_file,
|
||||
".plugins.\"io.containerd.snapshotter.v1.erofs\".max_unmerged_layers",
|
||||
"1",
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -757,18 +757,15 @@ fn get_arch_var_or_base(base_name: &str, arch: &str) -> Option<String> {
|
||||
mod tests {
|
||||
//! Tests for configuration parsing and validation.
|
||||
//!
|
||||
//! IMPORTANT: All tests in this crate MUST be run serially (--test-threads=1)
|
||||
//! because they manipulate shared environment variables. Running tests in parallel
|
||||
//! will cause race conditions and test failures.
|
||||
//!
|
||||
//! Use: cargo test --bin kata-deploy -- --test-threads=1
|
||||
//! Tests that touch environment variables use `serial_test::serial` so they do not run
|
||||
//! in parallel within this process. For extra isolation you can still use
|
||||
//! `cargo test -p kata-deploy config::tests -- --test-threads=1`.
|
||||
|
||||
use super::*;
|
||||
use rstest::rstest;
|
||||
use serial_test::serial;
|
||||
|
||||
// NOTE: These tests modify environment variables which are process-global.
|
||||
// Run with: cargo test config::tests -- --test-threads=1
|
||||
// to ensure proper test isolation.
|
||||
// NOTE: Env-var tests use #[serial] (see above) for safe parallel execution with other modules.
|
||||
|
||||
/// Helper to clean up common environment variables used in tests
|
||||
fn cleanup_env_vars() {
|
||||
@@ -867,6 +864,7 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_get_arch() {
|
||||
let arch = get_arch().unwrap();
|
||||
@@ -874,6 +872,7 @@ mod tests {
|
||||
cleanup_env_vars();
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_get_arch_var() {
|
||||
std::env::set_var("SHIMS_X86_64", "test1 test2");
|
||||
@@ -887,10 +886,12 @@ mod tests {
|
||||
#[rstest]
|
||||
#[case(false, "config.toml.d")]
|
||||
#[case(true, "config-v3.toml.d")]
|
||||
#[serial]
|
||||
fn test_k3s_rke2_drop_in_dir_name(#[case] use_v3: bool, #[case] expected: &str) {
|
||||
assert_eq!(k3s_rke2_drop_in_dir_name(use_v3), expected);
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_k3s_rke2_rendered_config_path() {
|
||||
assert_eq!(k3s_rke2_rendered_config_path(), "/etc/containerd/config.toml");
|
||||
@@ -905,6 +906,7 @@ mod tests {
|
||||
#[case("version = 2\n", false, false)]
|
||||
#[case("imports = [\"/path/config-v3.toml.d/*.toml\"]", true, true)]
|
||||
#[case("imports = [\"/path/config.toml.d/*.toml\"]", true, false)]
|
||||
#[serial]
|
||||
fn test_k3s_rke2_rendered_has_import(
|
||||
#[case] content: &str,
|
||||
#[case] use_v3: bool,
|
||||
@@ -913,6 +915,7 @@ mod tests {
|
||||
assert_eq!(k3s_rke2_rendered_has_import(content, use_v3), expected);
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_multi_install_suffix_not_set() {
|
||||
setup_minimal_env();
|
||||
@@ -929,6 +932,7 @@ mod tests {
|
||||
cleanup_env_vars();
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_multi_install_suffix_with_value() {
|
||||
setup_minimal_env();
|
||||
@@ -950,6 +954,7 @@ mod tests {
|
||||
cleanup_env_vars();
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_multi_install_suffix_different_values() {
|
||||
let suffixes = ["staging", "prod", "v2", "test123"];
|
||||
@@ -970,6 +975,7 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_multi_install_prefix_and_suffix() {
|
||||
setup_minimal_env();
|
||||
@@ -988,6 +994,7 @@ mod tests {
|
||||
cleanup_env_vars();
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_validate_empty_shims_no_custom_runtimes() {
|
||||
setup_minimal_env();
|
||||
@@ -1013,6 +1020,7 @@ mod tests {
|
||||
cleanup_env_vars();
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_validate_default_shim_not_in_shims() {
|
||||
setup_minimal_env();
|
||||
@@ -1025,6 +1033,7 @@ mod tests {
|
||||
cleanup_env_vars();
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_validate_hypervisor_annotation_invalid_shim() {
|
||||
setup_minimal_env();
|
||||
@@ -1041,6 +1050,7 @@ mod tests {
|
||||
cleanup_env_vars();
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_validate_agent_https_proxy_invalid_shim() {
|
||||
setup_minimal_env();
|
||||
@@ -1057,6 +1067,7 @@ mod tests {
|
||||
cleanup_env_vars();
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_validate_snapshotter_mapping_invalid_shim() {
|
||||
setup_minimal_env();
|
||||
@@ -1067,6 +1078,7 @@ mod tests {
|
||||
cleanup_env_vars();
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_validate_pull_type_mapping_invalid_shim() {
|
||||
setup_minimal_env();
|
||||
@@ -1077,6 +1089,7 @@ mod tests {
|
||||
cleanup_env_vars();
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_validate_force_guest_pull_invalid_shim() {
|
||||
setup_minimal_env();
|
||||
@@ -1087,6 +1100,7 @@ mod tests {
|
||||
cleanup_env_vars();
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_validate_success() {
|
||||
setup_minimal_env();
|
||||
@@ -1106,6 +1120,7 @@ mod tests {
|
||||
cleanup_env_vars();
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_missing_node_name_fails() {
|
||||
cleanup_env_vars();
|
||||
@@ -1116,6 +1131,7 @@ mod tests {
|
||||
cleanup_env_vars();
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_empty_node_name_fails() {
|
||||
setup_minimal_env();
|
||||
@@ -1125,6 +1141,7 @@ mod tests {
|
||||
cleanup_env_vars();
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_empty_default_shim_fails() {
|
||||
setup_minimal_env();
|
||||
@@ -1137,6 +1154,7 @@ mod tests {
|
||||
cleanup_env_vars();
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_whitespace_only_default_shim_fails() {
|
||||
setup_minimal_env();
|
||||
@@ -1147,6 +1165,7 @@ mod tests {
|
||||
cleanup_env_vars();
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_whitespace_only_shims_fails() {
|
||||
setup_minimal_env();
|
||||
@@ -1156,6 +1175,7 @@ mod tests {
|
||||
cleanup_env_vars();
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_agent_no_proxy_invalid_shim() {
|
||||
setup_minimal_env();
|
||||
@@ -1166,6 +1186,7 @@ mod tests {
|
||||
cleanup_env_vars();
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_multi_install_suffix_empty_treated_as_none() {
|
||||
setup_minimal_env();
|
||||
@@ -1177,6 +1198,7 @@ mod tests {
|
||||
cleanup_env_vars();
|
||||
}
|
||||
|
||||
#[serial]
|
||||
#[test]
|
||||
fn test_arch_specific_all_variables() {
|
||||
// Test ALL architecture-specific variables work without base variables
|
||||
|
||||
@@ -41,7 +41,7 @@ updateStrategy:
|
||||
debug: false
|
||||
|
||||
snapshotter:
|
||||
setup: [] # ["nydus", "erofs"] or []
|
||||
setup: ["nydus"] # ["nydus", "erofs"] or []
|
||||
|
||||
# Shim configuration
|
||||
# By default (disableAll: false), all shims with enabled: ~ (null) are enabled.
|
||||
@@ -153,8 +153,8 @@ shims:
|
||||
- amd64
|
||||
allowedHypervisorAnnotations: []
|
||||
containerd:
|
||||
snapshotter: ""
|
||||
forceGuestPull: true
|
||||
snapshotter: "nydus"
|
||||
forceGuestPull: false
|
||||
crio:
|
||||
guestPull: true
|
||||
agent:
|
||||
@@ -176,8 +176,8 @@ shims:
|
||||
- amd64
|
||||
allowedHypervisorAnnotations: []
|
||||
containerd:
|
||||
snapshotter: ""
|
||||
forceGuestPull: true
|
||||
snapshotter: "nydus"
|
||||
forceGuestPull: false
|
||||
crio:
|
||||
guestPull: true
|
||||
agent:
|
||||
|
||||
@@ -11,25 +11,48 @@ set -o nounset
|
||||
set -o pipefail
|
||||
set -o errtrace
|
||||
|
||||
KATA_DEPLOY_DIR="`dirname ${0}`/../../kata-deploy"
|
||||
SCRIPT_DIR="$(cd "$(dirname "${0}")" && pwd)"
|
||||
REPO_ROOT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"
|
||||
KATA_DEPLOY_DIR="${REPO_ROOT}/tools/packaging/kata-deploy"
|
||||
STAGED_ARTIFACT="${KATA_DEPLOY_DIR}/kata-static.tar.zst"
|
||||
KATA_DEPLOY_ARTIFACT="${1:-"kata-static.tar.zst"}"
|
||||
REGISTRY="${2:-"quay.io/kata-containers/kata-deploy"}"
|
||||
TAG="${3:-}"
|
||||
|
||||
echo "Copying ${KATA_DEPLOY_ARTIFACT} to ${KATA_DEPLOY_DIR}"
|
||||
cp ${KATA_DEPLOY_ARTIFACT} ${KATA_DEPLOY_DIR}
|
||||
# Only remove a staged copy we created (skip when source is already the staged path).
|
||||
REMOVE_STAGED_ON_EXIT=false
|
||||
# EXIT trap handler: removes the staged artifact copy, but only when
# REMOVE_STAGED_ON_EXIT is "true"; otherwise it leaves the file alone.
cleanup() {
	[ "${REMOVE_STAGED_ON_EXIT}" = true ] || return 0
	rm -f "${STAGED_ARTIFACT}"
}
|
||||
trap cleanup EXIT
|
||||
|
||||
pushd ${KATA_DEPLOY_DIR}
|
||||
src_rp="$(realpath -e "${KATA_DEPLOY_ARTIFACT}" 2>/dev/null || true)"
|
||||
dest_rp="$(realpath -e "${STAGED_ARTIFACT}" 2>/dev/null || true)"
|
||||
if [ -n "${src_rp}" ] && [ -n "${dest_rp}" ] && [ "${src_rp}" = "${dest_rp}" ]; then
|
||||
echo "Artifact already at staged path ${STAGED_ARTIFACT}; skipping copy"
|
||||
else
|
||||
echo "Copying ${KATA_DEPLOY_ARTIFACT} to ${STAGED_ARTIFACT}"
|
||||
cp "${KATA_DEPLOY_ARTIFACT}" "${STAGED_ARTIFACT}"
|
||||
REMOVE_STAGED_ON_EXIT=true
|
||||
fi
|
||||
|
||||
pushd "${REPO_ROOT}"
|
||||
|
||||
arch=$(uname -m)
|
||||
[ "$arch" = "x86_64" ] && arch="amd64"
|
||||
[ "$arch" = "aarch64" ] && arch="arm64"
|
||||
# Disable provenance and SBOM so each tag is a single image manifest. quay.io rejects
|
||||
# pushing multi-arch manifest lists that include attestation manifests ("manifest invalid").
|
||||
PLATFORM="linux/${arch}"
|
||||
IMAGE_TAG="${REGISTRY}:kata-containers-$(git rev-parse HEAD)-${arch}"
|
||||
IMAGE_TAG="${REGISTRY}:kata-containers-$(git -C "${REPO_ROOT}" rev-parse HEAD)-${arch}"
|
||||
|
||||
DOCKERFILE="${REPO_ROOT}/tools/packaging/kata-deploy/Dockerfile"
|
||||
|
||||
echo "Building the image"
|
||||
docker buildx build --platform "${PLATFORM}" --provenance false --sbom false \
|
||||
-f "${DOCKERFILE}" \
|
||||
--tag "${IMAGE_TAG}" --push .
|
||||
|
||||
if [ -n "${TAG}" ]; then
|
||||
@@ -37,6 +60,7 @@ if [ -n "${TAG}" ]; then
|
||||
|
||||
echo "Building the ${ADDITIONAL_TAG} image"
|
||||
docker buildx build --platform "${PLATFORM}" --provenance false --sbom false \
|
||||
-f "${DOCKERFILE}" \
|
||||
--tag "${ADDITIONAL_TAG}" --push .
|
||||
fi
|
||||
|
||||
|
||||
@@ -383,8 +383,6 @@ externals:
|
||||
url: "https://github.com/dragonflyoss/image-service"
|
||||
version: "v2.2.3"
|
||||
|
||||
# Keep the version here aligned with the NYDUS_SNAPSHOTTER_VERSION
|
||||
# on tools/packaging/kata-deploy/Dockerfile
|
||||
nydus-snapshotter:
|
||||
description: "Snapshotter for Nydus image acceleration service"
|
||||
url: "https://github.com/containerd/nydus-snapshotter"
|
||||
|
||||
Reference in New Issue
Block a user