mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-03-23 13:12:16 +00:00
Compare commits
38 Commits
fupan_debu
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fe817bb47b | ||
|
|
514a2b1a7c | ||
|
|
83f37f4beb | ||
|
|
e44dfccf7a | ||
|
|
1035504492 | ||
|
|
20cb65b1fb | ||
|
|
864f181faf | ||
|
|
642b5661ff | ||
|
|
4403289123 | ||
|
|
d2c2ec6e23 | ||
|
|
608f378bff | ||
|
|
f14895bdc4 | ||
|
|
fd716c017d | ||
|
|
740d380b8e | ||
|
|
6194510e90 | ||
|
|
7e3fd74779 | ||
|
|
f6319da73d | ||
|
|
91d6c39f06 | ||
|
|
5ab0744c25 | ||
|
|
e905b74267 | ||
|
|
5333e45313 | ||
|
|
795869152d | ||
|
|
8903b12d34 | ||
|
|
476f550977 | ||
|
|
4afb55154a | ||
|
|
eabb98ecab | ||
|
|
c1b7069e50 | ||
|
|
fddd1e8b6e | ||
|
|
d6178d78b1 | ||
|
|
1c7b14e282 | ||
|
|
e9bda42b01 | ||
|
|
a66c93caaa | ||
|
|
17454c0969 | ||
|
|
f8617241f4 | ||
|
|
d0f0dc2008 | ||
|
|
3e39c1fad3 | ||
|
|
a6a81124cb | ||
|
|
8d09a0e7e7 |
@@ -32,5 +32,6 @@ ignoreRegExpList:
|
||||
ignorePaths:
|
||||
- "**/vendor/**" # vendor files aren't owned by us
|
||||
- "**/src/runtime/virtcontainers/pkg/cloud-hypervisor/client/**" # Generated files
|
||||
- "**/requirements.txt"
|
||||
|
||||
useGitignore: true
|
||||
|
||||
@@ -47,6 +47,7 @@ jobs:
|
||||
- coco-guest-components
|
||||
- firecracker
|
||||
- kernel
|
||||
- kernel-debug
|
||||
- kernel-dragonball-experimental
|
||||
- kernel-nvidia-gpu
|
||||
- nydus
|
||||
|
||||
@@ -45,6 +45,7 @@ jobs:
|
||||
- cloud-hypervisor
|
||||
- firecracker
|
||||
- kernel
|
||||
- kernel-debug
|
||||
- kernel-dragonball-experimental
|
||||
- kernel-nvidia-gpu
|
||||
- kernel-cca-confidential
|
||||
|
||||
43
.github/workflows/docs.yaml
vendored
43
.github/workflows/docs.yaml
vendored
@@ -4,17 +4,18 @@ on:
|
||||
branches:
|
||||
- main
|
||||
permissions: {}
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
deploy-docs:
|
||||
name: deploy-docs
|
||||
build:
|
||||
runs-on: ubuntu-24.04
|
||||
name: Build docs
|
||||
permissions:
|
||||
contents: read
|
||||
pages: write
|
||||
id-token: write
|
||||
environment:
|
||||
name: github-pages
|
||||
url: ${{ steps.deployment.outputs.page_url }}
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/configure-pages@983d7736d9b0ae728b81ab479565c72886d7745b # v5.0.0
|
||||
- uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5.0.1
|
||||
@@ -23,10 +24,30 @@ jobs:
|
||||
- uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
|
||||
with:
|
||||
python-version: 3.x
|
||||
- run: pip install zensical
|
||||
- run: zensical build --clean
|
||||
|
||||
- run: pip install -r docs/requirements.txt
|
||||
- run: python3 -m mkdocs build --config-file ./mkdocs.yaml --site-dir site/
|
||||
id: build
|
||||
|
||||
- uses: actions/upload-pages-artifact@7b1f4a764d45c48632c6b24a0339c27f5614fb0b # v4.0.0
|
||||
with:
|
||||
path: site
|
||||
- uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4.0.5
|
||||
id: deployment
|
||||
with:
|
||||
path: site/
|
||||
name: github-pages
|
||||
|
||||
deploy:
|
||||
needs: build
|
||||
runs-on: ubuntu-24.04
|
||||
name: Deploy docs
|
||||
permissions:
|
||||
pages: write
|
||||
id-token: write
|
||||
environment:
|
||||
name: github-pages
|
||||
url: ${{ steps.deployment.outputs.page_url }}
|
||||
steps:
|
||||
- name: Deploy to GitHub Pages
|
||||
uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4.0.5
|
||||
id: deployment
|
||||
with:
|
||||
artifact_name: github-pages
|
||||
|
||||
@@ -49,6 +49,8 @@ jobs:
|
||||
KATA_HYPERVISOR: ${{ matrix.environment.vmm }}
|
||||
KUBERNETES: kubeadm
|
||||
KBS: ${{ matrix.environment.name == 'nvidia-gpu-snp' && 'true' || 'false' }}
|
||||
SNAPSHOTTER: ${{ matrix.environment.name == 'nvidia-gpu-snp' && 'nydus' || '' }}
|
||||
USE_EXPERIMENTAL_SNAPSHOTTER_SETUP: ${{ matrix.environment.name == 'nvidia-gpu-snp' && 'true' || 'false' }}
|
||||
K8S_TEST_HOST_TYPE: baremetal
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
@@ -98,7 +100,7 @@ jobs:
|
||||
run: bash tests/integration/kubernetes/gha-run.sh install-bats
|
||||
|
||||
- name: Run tests ${{ matrix.environment.vmm }}
|
||||
timeout-minutes: 30
|
||||
timeout-minutes: 60
|
||||
run: bash tests/integration/kubernetes/gha-run.sh run-nv-tests
|
||||
env:
|
||||
NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
|
||||
|
||||
1941
Cargo.lock
generated
1941
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
24
Cargo.toml
24
Cargo.toml
@@ -56,19 +56,19 @@ exclude = [
|
||||
|
||||
[workspace.dependencies]
|
||||
# Rust-VMM crates
|
||||
event-manager = "0.2.1"
|
||||
kvm-bindings = "0.6.0"
|
||||
kvm-ioctls = "=0.12.1"
|
||||
linux-loader = "0.8.0"
|
||||
event-manager = "0.4.0"
|
||||
kvm-bindings = "0.14.0"
|
||||
kvm-ioctls = "0.24.0"
|
||||
linux-loader = "0.13.0"
|
||||
seccompiler = "0.5.0"
|
||||
vfio-bindings = "0.3.0"
|
||||
vfio-ioctls = "0.1.0"
|
||||
virtio-bindings = "0.1.0"
|
||||
virtio-queue = "0.7.0"
|
||||
vm-fdt = "0.2.0"
|
||||
vm-memory = "0.10.0"
|
||||
vm-superio = "0.5.0"
|
||||
vmm-sys-util = "0.11.0"
|
||||
vfio-bindings = "0.6.1"
|
||||
vfio-ioctls = "0.5.0"
|
||||
virtio-bindings = "0.2.0"
|
||||
virtio-queue = "0.17.0"
|
||||
vm-fdt = "0.3.0"
|
||||
vm-memory = "=0.17.1"
|
||||
vm-superio = "0.8.0"
|
||||
vmm-sys-util = "0.15.0"
|
||||
|
||||
# Local dependencies from Dragonball Sandbox crates
|
||||
dragonball = { path = "src/dragonball" }
|
||||
|
||||
8
Makefile
8
Makefile
@@ -49,8 +49,11 @@ docs-url-alive-check:
|
||||
build-and-publish-kata-debug:
|
||||
bash tools/packaging/kata-debug/kata-debug-build-and-upload-payload.sh ${KATA_DEBUG_REGISTRY} ${KATA_DEBUG_TAG}
|
||||
|
||||
docs-serve:
|
||||
docker run --rm -p 8000:8000 -v ./docs:/docs:ro -v ${PWD}/zensical.toml:/zensical.toml:ro zensical/zensical serve --config-file /zensical.toml -a 0.0.0.0:8000
|
||||
docs-build:
|
||||
docker build -t kata-docs:latest -f ./docs/Dockerfile ./docs
|
||||
|
||||
docs-serve: docs-build
|
||||
docker run --rm -p 8000:8000 -v ${PWD}:/docs:ro kata-docs:latest serve --config-file /docs/mkdocs.yaml -a 0.0.0.0:8000
|
||||
|
||||
.PHONY: \
|
||||
all \
|
||||
@@ -59,4 +62,5 @@ docs-serve:
|
||||
default \
|
||||
static-checks \
|
||||
docs-url-alive-check \
|
||||
docs-build \
|
||||
docs-serve
|
||||
|
||||
18
docs/.nav.yml
Normal file
18
docs/.nav.yml
Normal file
@@ -0,0 +1,18 @@
|
||||
# https://lukasgeiter.github.io/mkdocs-awesome-nav/
|
||||
nav:
|
||||
- Home: index.md
|
||||
- Getting Started:
|
||||
- prerequisites.md
|
||||
- installation.md
|
||||
- Configuration:
|
||||
- helm-configuration.md
|
||||
- runtime-configuration.md
|
||||
- Platform Support:
|
||||
- hypervisors.md
|
||||
- Guides:
|
||||
- Use Cases:
|
||||
- NVIDIA GPU Passthrough: use-cases/NVIDIA-GPU-passthrough-and-Kata-QEMU.md
|
||||
- NVIDIA vGPU: use-cases/NVIDIA-GPU-passthrough-and-Kata.md
|
||||
- Intel Discrete GPU: use-cases/Intel-Discrete-GPU-passthrough-and-Kata.md
|
||||
- Misc:
|
||||
- Architecture: design/architecture/
|
||||
11
docs/Dockerfile
Normal file
11
docs/Dockerfile
Normal file
@@ -0,0 +1,11 @@
|
||||
# Copyright 2026 Kata Contributors
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
FROM python:3.12-slim
|
||||
|
||||
WORKDIR /
|
||||
COPY ./requirements.txt requirements.txt
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
ENTRYPOINT ["python3", "-m", "mkdocs"]
|
||||
@@ -32,23 +32,26 @@ their own validation and patch backporting from there.
|
||||
|
||||
## Release Process
|
||||
|
||||
### Bump the `VERSION` and `Chart.yaml` file
|
||||
|
||||
When the `kata-containers/kata-containers` repository is ready for a new
|
||||
release, first create a PR to set the release in the [`VERSION`](./../VERSION)
|
||||
file and update the `version` and `appVersion` in the
|
||||
[`Chart.yaml`](./../tools/packaging/kata-deploy/helm-chart/kata-deploy/Chart.yaml)
|
||||
file and have it merged.
|
||||
|
||||
### Lock the `main` branch
|
||||
### Lock the `main` branch and announce release process
|
||||
|
||||
In order to prevent any PRs getting merged during the release process, and
|
||||
slowing the release process down, by impacting the payload caches, we have
|
||||
recently trialed setting the `main` branch to read only whilst the release
|
||||
action runs.
|
||||
recently trialed setting the `main` branch to read-only.
|
||||
Once the `kata-containers/kata-containers` repository is ready for a new
|
||||
release, lock the main branch until the release action has completed.
|
||||
Notify the #kata-dev Slack channel about the ongoing release process.
|
||||
Ideally, CI usage by others should be reduced to a minimum during the
|
||||
ongoing release process.
|
||||
|
||||
> [!NOTE]
|
||||
> Admin permission is needed to complete this task.
|
||||
> Admin permission is needed to lock/unlock the `main` branch.
|
||||
|
||||
### Bump the `VERSION` and `Chart.yaml` file
|
||||
|
||||
Create a PR to set the release in the [`VERSION`](./../VERSION) file and to
|
||||
update the `version` and `appVersion` fields in the
|
||||
[`Chart.yaml`](./../tools/packaging/kata-deploy/helm-chart/kata-deploy/Chart.yaml)
|
||||
file. Temporarily unlock the main branch to merge the PR.
|
||||
|
||||
### Wait for the `VERSION` bump PR payload publish to complete
|
||||
|
||||
@@ -60,7 +63,7 @@ and are cached, so that the release process can just download these artifacts
|
||||
rather than needing to build them all, which takes time and can reveal errors in
|
||||
infra.
|
||||
|
||||
### Check GitHub Actions
|
||||
### Trigger the `Release Kata Containers` GitHub Action
|
||||
|
||||
We make use of [GitHub actions](https://github.com/features/actions) in the
|
||||
[release](https://github.com/kata-containers/kata-containers/actions/workflows/release.yaml)
|
||||
|
||||
|
Before Width: | Height: | Size: 710 B After Width: | Height: | Size: 710 B |
264
docs/helm-configuration.md
Normal file
264
docs/helm-configuration.md
Normal file
@@ -0,0 +1,264 @@
|
||||
# Helm Configuration
|
||||
|
||||
## Parameters
|
||||
|
||||
The helm chart provides a comprehensive set of configuration options. You may view the parameters and their descriptions by going to the [GitHub source](https://github.com/kata-containers/kata-containers/blob/main/tools/packaging/kata-deploy/helm-chart/kata-deploy/values.yaml) or by using helm:
|
||||
|
||||
```sh
|
||||
# List available kata-deploy chart versions:
|
||||
# helm search repo kata-deploy-charts/kata-deploy --versions
|
||||
#
|
||||
# Then replace X.Y.Z below with the desired chart version:
|
||||
helm show values --version X.Y.Z oci://ghcr.io/kata-containers/kata-deploy-charts/kata-deploy
|
||||
```
|
||||
|
||||
### shims
|
||||
|
||||
Kata ships with a number of pre-built artifacts and runtimes. You may selectively enable or disable specific shims. For example:
|
||||
|
||||
```yaml title="values.yaml"
|
||||
shims:
|
||||
disableAll: true
|
||||
qemu:
|
||||
enabled: true
|
||||
qemu-nvidia-gpu:
|
||||
enabled: true
|
||||
qemu-nvidia-gpu-snp:
|
||||
enabled: false
|
||||
|
||||
```
|
||||
|
||||
Shims can also have configuration options specific to them:
|
||||
|
||||
```yaml
|
||||
qemu-nvidia-gpu:
|
||||
enabled: ~
|
||||
supportedArches:
|
||||
- amd64
|
||||
- arm64
|
||||
allowedHypervisorAnnotations: []
|
||||
containerd:
|
||||
snapshotter: ""
|
||||
runtimeClass:
|
||||
# This label is automatically added by gpu-operator. Override it
|
||||
# if you want to use a different label.
|
||||
# Uncomment once GPU Operator v26.3 is out
|
||||
# nodeSelector:
|
||||
# nvidia.com/cc.ready.state: "false"
|
||||
```
|
||||
|
||||
It's best to reference the default `values.yaml` file above for more details.
|
||||
|
||||
### Custom Runtimes
|
||||
|
||||
Kata allows you to create custom runtime configurations. This is done by overlaying one of the pre-existing runtime configs with user-provided configs. For example, we can use the `qemu-nvidia-gpu` as a base config and overlay our own parameters to it:
|
||||
|
||||
```yaml
|
||||
customRuntimes:
|
||||
enabled: false
|
||||
runtimes:
|
||||
my-gpu-runtime:
|
||||
baseConfig: "qemu-nvidia-gpu" # Required: existing config to use as base
|
||||
dropIn: | # Optional: overrides via config.d mechanism
|
||||
[hypervisor.qemu]
|
||||
default_memory = 1024
|
||||
default_vcpus = 4
|
||||
runtimeClass: |
|
||||
kind: RuntimeClass
|
||||
apiVersion: node.k8s.io/v1
|
||||
metadata:
|
||||
name: kata-my-gpu-runtime
|
||||
labels:
|
||||
app.kubernetes.io/managed-by: kata-deploy
|
||||
handler: kata-my-gpu-runtime
|
||||
overhead:
|
||||
podFixed:
|
||||
memory: "640Mi"
|
||||
cpu: "500m"
|
||||
scheduling:
|
||||
nodeSelector:
|
||||
katacontainers.io/kata-runtime: "true"
|
||||
# Optional: CRI-specific configuration
|
||||
containerd:
|
||||
snapshotter: "nydus" # Configure containerd snapshotter (nydus, erofs, etc.)
|
||||
crio:
|
||||
pullType: "guest-pull" # Configure CRI-O runtime_pull_image = true
|
||||
```
|
||||
|
||||
Again, view the default [`values.yaml`](#parameters) file for more details.
|
||||
|
||||
## Examples
|
||||
|
||||
We provide a few examples that you can pass to helm via the `-f`/`--values` flag.
|
||||
|
||||
### [`try-kata-tee.values.yaml`](https://github.com/kata-containers/kata-containers/blob/main/tools/packaging/kata-deploy/helm-chart/kata-deploy/try-kata-tee.values.yaml)
|
||||
|
||||
This file enables only the TEE (Trusted Execution Environment) shims for confidential computing:
|
||||
|
||||
```sh
|
||||
helm install kata-deploy oci://ghcr.io/kata-containers/kata-deploy-charts/kata-deploy \
|
||||
--version VERSION \
|
||||
-f try-kata-tee.values.yaml
|
||||
```
|
||||
|
||||
Includes:
|
||||
|
||||
- `qemu-snp` - AMD SEV-SNP (amd64)
|
||||
- `qemu-tdx` - Intel TDX (amd64)
|
||||
- `qemu-se` - IBM Secure Execution for Linux (SEL) (s390x)
|
||||
- `qemu-se-runtime-rs` - IBM Secure Execution for Linux (SEL) Rust runtime (s390x)
|
||||
- `qemu-cca` - Arm Confidential Compute Architecture (arm64)
|
||||
- `qemu-coco-dev` - Confidential Containers development (amd64, s390x)
|
||||
- `qemu-coco-dev-runtime-rs` - Confidential Containers development Rust runtime (amd64, s390x)
|
||||
|
||||
### [`try-kata-nvidia-gpu.values.yaml`](https://github.com/kata-containers/kata-containers/blob/main/tools/packaging/kata-deploy/helm-chart/kata-deploy/try-kata-nvidia-gpu.values.yaml)
|
||||
|
||||
This file enables only the NVIDIA GPU-enabled shims:
|
||||
|
||||
```sh
|
||||
helm install kata-deploy oci://ghcr.io/kata-containers/kata-deploy-charts/kata-deploy \
|
||||
--version VERSION \
|
||||
-f try-kata-nvidia-gpu.values.yaml
|
||||
```
|
||||
|
||||
Includes:
|
||||
|
||||
- `qemu-nvidia-gpu` - Standard NVIDIA GPU support (amd64, arm64)
|
||||
- `qemu-nvidia-gpu-snp` - NVIDIA GPU with AMD SEV-SNP (amd64)
|
||||
- `qemu-nvidia-gpu-tdx` - NVIDIA GPU with Intel TDX (amd64)
|
||||
|
||||
### `nodeSelector`
|
||||
|
||||
We can deploy Kata only to specific nodes using `nodeSelector`:
|
||||
|
||||
```sh
|
||||
# First, label the nodes where you want kata-containers to be installed
|
||||
$ kubectl label nodes worker-node-1 kata-containers=enabled
|
||||
$ kubectl label nodes worker-node-2 kata-containers=enabled
|
||||
|
||||
# Then install the chart with `nodeSelector`
|
||||
$ helm install kata-deploy \
|
||||
--set nodeSelector.kata-containers="enabled" \
|
||||
"${CHART}" --version "${VERSION}"
|
||||
```
|
||||
|
||||
You can also use a values file:
|
||||
|
||||
```yaml title="values.yaml"
|
||||
nodeSelector:
|
||||
kata-containers: "enabled"
|
||||
node-type: "worker"
|
||||
```
|
||||
|
||||
```sh
|
||||
$ helm install kata-deploy -f values.yaml "${CHART}" --version "${VERSION}"
|
||||
```
|
||||
|
||||
### Multiple Kata installations on the Same Node
|
||||
|
||||
For debugging, testing, and other use cases, it is possible to deploy multiple
|
||||
versions of Kata on the very same node. All the needed artifacts are getting the
|
||||
`multiInstallSuffix` appended to distinguish each installation. **BEWARE** that one
|
||||
needs at least **containerd-2.0** since this version has drop-in conf support
|
||||
which is a prerequisite for the `multiInstallSuffix` to work properly.
|
||||
|
||||
```sh
|
||||
$ helm install kata-deploy-cicd \
|
||||
-n kata-deploy-cicd \
|
||||
--set env.multiInstallSuffix=cicd \
|
||||
--set env.debug=true \
|
||||
"${CHART}" --version "${VERSION}"
|
||||
```
|
||||
|
||||
Note: `runtimeClasses` are automatically created by Helm (via
|
||||
`runtimeClasses.enabled=true`, which is the default).
|
||||
|
||||
Now verify the installation by examining the `runtimeClasses`:
|
||||
|
||||
```sh
|
||||
$ kubectl get runtimeClasses
|
||||
NAME HANDLER AGE
|
||||
kata-clh-cicd kata-clh-cicd 77s
|
||||
kata-cloud-hypervisor-cicd kata-cloud-hypervisor-cicd 77s
|
||||
kata-dragonball-cicd kata-dragonball-cicd 77s
|
||||
kata-fc-cicd kata-fc-cicd 77s
|
||||
kata-qemu-cicd kata-qemu-cicd 77s
|
||||
kata-qemu-coco-dev-cicd kata-qemu-coco-dev-cicd 77s
|
||||
kata-qemu-nvidia-gpu-cicd kata-qemu-nvidia-gpu-cicd 77s
|
||||
kata-qemu-nvidia-gpu-snp-cicd kata-qemu-nvidia-gpu-snp-cicd 77s
|
||||
kata-qemu-nvidia-gpu-tdx-cicd kata-qemu-nvidia-gpu-tdx-cicd 76s
|
||||
kata-qemu-runtime-rs-cicd kata-qemu-runtime-rs-cicd 77s
|
||||
kata-qemu-se-runtime-rs-cicd kata-qemu-se-runtime-rs-cicd 77s
|
||||
kata-qemu-snp-cicd kata-qemu-snp-cicd 77s
|
||||
kata-qemu-tdx-cicd kata-qemu-tdx-cicd 77s
|
||||
kata-stratovirt-cicd kata-stratovirt-cicd 77s
|
||||
```
|
||||
|
||||
## RuntimeClass Node Selectors for TEE Shims
|
||||
|
||||
**Manual configuration:** Any `nodeSelector` you set under `shims.<shim>.runtimeClass.nodeSelector`
|
||||
is **always applied** to that shim's RuntimeClass, whether or not NFD is present. Use this when
|
||||
you want to pin TEE workloads to specific nodes (e.g. without NFD, or with custom labels).
|
||||
|
||||
**Auto-inject when NFD is present:** If you do *not* set a `runtimeClass.nodeSelector` for a
|
||||
TEE shim, the chart can **automatically inject** NFD-based labels when NFD is detected in the
|
||||
cluster (deployed by this chart with `node-feature-discovery.enabled=true` or found externally):
|
||||
|
||||
- AMD SEV-SNP shims: `amd.feature.node.kubernetes.io/snp: "true"`
|
||||
- Intel TDX shims: `intel.feature.node.kubernetes.io/tdx: "true"`
|
||||
- IBM Secure Execution for Linux (SEL) shims (s390x): `feature.node.kubernetes.io/cpu-security.se.enabled: "true"`
|
||||
|
||||
The chart uses Helm's `lookup` function to detect NFD (by looking for the
|
||||
`node-feature-discovery-worker` DaemonSet). Auto-inject only runs when NFD is detected and
|
||||
no manual `runtimeClass.nodeSelector` is set for that shim.
|
||||
|
||||
**Note**: NFD detection requires cluster access. During `helm template` (dry-run without a
|
||||
cluster), external NFD is not seen, so auto-injected labels are not added. Manual
|
||||
`runtimeClass.nodeSelector` values are still applied in all cases.
|
||||
|
||||
## Customizing Configuration with Drop-in Files
|
||||
|
||||
When kata-deploy installs Kata Containers, the base configuration files should not
|
||||
be modified directly. Instead, use drop-in configuration files to customize
|
||||
settings. This approach ensures your customizations survive kata-deploy upgrades.
|
||||
|
||||
### How Drop-in Files Work
|
||||
|
||||
The Kata runtime reads the base configuration file and then applies any `.toml`
|
||||
files found in the `config.d/` directory alongside it. Files are processed in
|
||||
alphabetical order, with later files overriding earlier settings.
|
||||
|
||||
### Creating Custom Drop-in Files
|
||||
|
||||
To add custom settings, create a `.toml` file in the appropriate `config.d/`
|
||||
directory. Use a numeric prefix to control the order of application.
|
||||
|
||||
**Reserved prefixes** (used by kata-deploy):
|
||||
|
||||
- `10-*`: Core kata-deploy settings
|
||||
- `20-*`: Debug settings
|
||||
- `30-*`: Kernel parameters
|
||||
|
||||
**Recommended prefixes for custom settings**: `50-89`
|
||||
|
||||
### Drop-In Config Examples
|
||||
|
||||
#### Adding Custom Kernel Parameters
|
||||
|
||||
```bash
|
||||
# SSH into the node or use kubectl exec
|
||||
sudo mkdir -p /opt/kata/share/defaults/kata-containers/runtimes/qemu/config.d/
|
||||
sudo cat > /opt/kata/share/defaults/kata-containers/runtimes/qemu/config.d/50-custom.toml << 'EOF'
|
||||
[hypervisor.qemu]
|
||||
kernel_params = "my_param=value"
|
||||
EOF
|
||||
```
|
||||
|
||||
#### Changing Default Memory Size
|
||||
|
||||
```bash
|
||||
sudo cat > /opt/kata/share/defaults/kata-containers/runtimes/qemu/config.d/50-memory.toml << 'EOF'
|
||||
[hypervisor.qemu]
|
||||
default_memory = 4096
|
||||
EOF
|
||||
```
|
||||
@@ -16,83 +16,38 @@ which hypervisors you may wish to investigate further.
|
||||
|
||||
## Types
|
||||
|
||||
| Hypervisor | Written in | Architectures | Type |
|
||||
|-|-|-|-|
|
||||
|[Cloud Hypervisor] | rust | `aarch64`, `x86_64` | Type 2 ([KVM]) |
|
||||
|[Firecracker] | rust | `aarch64`, `x86_64` | Type 2 ([KVM]) |
|
||||
|[QEMU] | C | all | Type 2 ([KVM]) | `configuration-qemu.toml` |
|
||||
|[`Dragonball`] | rust | `aarch64`, `x86_64` | Type 2 ([KVM]) |
|
||||
|[StratoVirt] | rust | `aarch64`, `x86_64` | Type 2 ([KVM]) |
|
||||
| Hypervisor | Written in | Architectures | GPU Support | Intel TDX | AMD SEV-SNP |
|
||||
|-|-|-|-|-|-|
|
||||
|[Cloud Hypervisor](#cloud-hypervisor) | rust | `aarch64`, `x86_64` | :x: | :x: | :x: |
|
||||
|[Firecracker](#firecracker) | rust | `aarch64`, `x86_64` | :x: | :x: | :x: |
|
||||
|[QEMU](#qemu) | C | all | :white_check_mark: | :white_check_mark: | :white_check_mark: |
|
||||
|[Dragonball](#dragonball) | rust | `aarch64`, `x86_64` | :x: | :x: | :x: |
|
||||
|StratoVirt | rust | `aarch64`, `x86_64` | :x: | :x: | :x: |
|
||||
|
||||
## Determine currently configured hypervisor
|
||||
Each Kata runtime is configured for a specific hypervisor through the runtime's configuration file. For example:
|
||||
|
||||
```bash
|
||||
$ kata-runtime kata-env | awk -v RS= '/\[Hypervisor\]/' | grep Path
|
||||
```toml title="/opt/kata/share/defaults/kata-containers/configuration.toml"
|
||||
[hypervisor.qemu]
|
||||
path = "/opt/kata/bin/qemu-system-x86_64"
|
||||
```
|
||||
|
||||
## Choose a Hypervisor
|
||||
```toml title="/opt/kata/share/defaults/kata-containers/configuration-clh.toml"
|
||||
[hypervisor.clh]
|
||||
path = "/opt/kata/bin/cloud-hypervisor"
|
||||
```
|
||||
|
||||
The table below provides a brief summary of some of the differences between
|
||||
the hypervisors:
|
||||
## Cloud Hypervisor
|
||||
|
||||
| Hypervisor | Summary | Features | Limitations | Container Creation speed | Memory density | Use cases | Comment |
|
||||
|-|-|-|-|-|-|-|-|
|
||||
|[Cloud Hypervisor] | Low latency, small memory footprint, small attack surface | Minimal | | excellent | excellent | High performance modern cloud workloads | |
|
||||
|[Firecracker] | Very slimline | Extremely minimal | Doesn't support all device types | excellent | excellent | Serverless / FaaS | |
|
||||
|[QEMU] | Lots of features | Lots | | good | good | Good option for most users | |
|
||||
|[`Dragonball`] | Built-in VMM, low CPU and memory overhead| Minimal | | excellent | excellent | Optimized for most container workloads | `out-of-the-box` Kata Containers experience |
|
||||
|[StratoVirt] | Unified architecture supporting three scenarios: VM, container, and serverless | Extremely minimal(`MicroVM`) to Lots(`StandardVM`) | | excellent | excellent | Common container workloads | `StandardVM` type of StratoVirt for Kata is under development |
|
||||
[Cloud Hypervisor](https://www.cloudhypervisor.org/) is a more modern hypervisor written in Rust.
|
||||
|
||||
For further details, see the [Virtualization in Kata Containers](design/virtualization.md) document and the official documentation for each hypervisor.
|
||||
## Firecracker
|
||||
|
||||
## Hypervisor configuration files
|
||||
[Firecracker](https://firecracker-microvm.github.io/) is a minimal and lightweight hypervisor created for the AWS Lambda product.
|
||||
|
||||
Since each hypervisor offers different features and options, Kata Containers
|
||||
provides a separate
|
||||
[configuration file](../src/runtime/README.md#configuration)
|
||||
for each. The configuration files contain comments explaining which options
|
||||
are available, their default values and how each setting can be used.
|
||||
## QEMU
|
||||
|
||||
| Hypervisor | Golang runtime config file | golang runtime short name | golang runtime default | rust runtime config file | rust runtime short name | rust runtime default |
|
||||
|-|-|-|-|-|-|-|
|
||||
| [Cloud Hypervisor] | [`configuration-clh.toml`](../src/runtime/config/configuration-clh.toml.in) | `clh` | | [`configuration-cloud-hypervisor.toml`](../src/runtime-rs/config/configuration-cloud-hypervisor.toml.in) | `cloud-hypervisor` | |
|
||||
| [Firecracker] | [`configuration-fc.toml`](../src/runtime/config/configuration-fc.toml.in) | `fc` | | | | |
|
||||
| [QEMU] | [`configuration-qemu.toml`](../src/runtime/config/configuration-qemu.toml.in) | `qemu` | yes | [`configuration-qemu.toml`](../src/runtime-rs/config/configuration-qemu-runtime-rs.toml.in) | `qemu` | |
|
||||
| [`Dragonball`] | | | | [`configuration-dragonball.toml`](../src/runtime-rs/config/configuration-dragonball.toml.in) | `dragonball` | yes |
|
||||
| [StratoVirt] | [`configuration-stratovirt.toml`](../src/runtime/config/configuration-stratovirt.toml.in) | `stratovirt` | | | | |
|
||||
QEMU is the best supported hypervisor for NVIDIA-based GPUs and for confidential computing use-cases (such as Intel TDX and AMD SEV-SNP). Runtimes that use this are normally named `kata-qemu-nvidia-gpu-*`. The Kata project focuses primarily on QEMU runtimes for GPU support.
|
||||
|
||||
> **Notes:**
|
||||
>
|
||||
> - The short names specified are used by the [`kata-manager`](../utils/README.md) tool.
|
||||
> - As shown by the default columns, each runtime type has its own default hypervisor.
|
||||
> - The [golang runtime](../src/runtime) is the current default runtime.
|
||||
> - The [rust runtime](../src/runtime-rs), also known as `runtime-rs`,
|
||||
> is the newer runtime written in the rust language.
|
||||
> - See the [Configuration](../README.md#configuration) for further details.
|
||||
> - The configuration file links in the table link to the "source"
|
||||
> versions: these are not usable configuration files as they contain
|
||||
> variables that need to be expanded:
|
||||
> - The links are provided for reference only.
|
||||
> - The final (installed) versions, where all variables have been
|
||||
> expanded, are built from these source configuration files.
|
||||
> - The pristine configuration files are usually installed in the
|
||||
> `/opt/kata/share/defaults/kata-containers/` or
|
||||
> `/usr/share/defaults/kata-containers/` directories.
|
||||
> - Some hypervisors may have the same name for both golang and rust
|
||||
> runtimes, but the file contents may differ.
|
||||
> - If there is no configuration file listed for the golang or
|
||||
> rust runtimes, this either means the hypervisor cannot be run with
|
||||
> a particular runtime, or that a driver has not yet been made
|
||||
> available for that runtime.
|
||||
## Dragonball
|
||||
|
||||
## Switch configured hypervisor
|
||||
|
||||
To switch the configured hypervisor, you only need to run a single command.
|
||||
See [the `kata-manager` documentation](../utils/README.md#choose-a-hypervisor) for further details.
|
||||
|
||||
[Cloud Hypervisor]: https://github.com/cloud-hypervisor/cloud-hypervisor
|
||||
[Firecracker]: https://github.com/firecracker-microvm/firecracker
|
||||
[KVM]: https://en.wikipedia.org/wiki/Kernel-based_Virtual_Machine
|
||||
[QEMU]: http://www.qemu.org
|
||||
[`Dragonball`]: https://github.com/kata-containers/kata-containers/blob/main/src/dragonball
|
||||
[StratoVirt]: https://gitee.com/openeuler/stratovirt
|
||||
Dragonball is a special hypervisor created by the Ant Group that runs in the same process as the Rust-based containerd shim.
|
||||
|
||||
94
docs/index.md
Normal file
94
docs/index.md
Normal file
@@ -0,0 +1,94 @@
|
||||
# Kata Containers
|
||||
|
||||
Kata Containers is an open source community working to build a secure container runtime with lightweight virtual machines (VMs) that feel and perform like standard Linux containers, but provide stronger workload isolation using hardware virtualization technology as a second layer of defense.
|
||||
|
||||
## How it Works
|
||||
|
||||
Kata implements the [Open Containers Runtime Specification](https://github.com/opencontainers/runtime-spec). More specifically, it implements a containerd shim that implements the expected interface for managing container lifecycles. The default containerd runtime of `runc` spawns a container like this:
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
subgraph Host
|
||||
containerd
|
||||
runc
|
||||
process[Container Process]
|
||||
containerd --> runc --> process
|
||||
end
|
||||
```
|
||||
|
||||
When containerd receives a request to spawn a container, it will pull the container image down and then call out to the runc shim (usually located at `/usr/local/bin/containerd-shim-runc-v2`). runc will then create various process isolation resources like Linux namespaces (networking, PIDs, mounts, etc.), seccomp filters, and Linux capability reductions, and then spawn the process inside of those resources. This process runs in the host kernel.
|
||||
|
||||
Kata spawns containers like this:
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
subgraph Host
|
||||
containerdOuter[containerd]
|
||||
kata
|
||||
|
||||
containerdOuter --> kata
|
||||
kata --> kataAgent
|
||||
|
||||
subgraph VM
|
||||
kataAgent[Kata Agent]
|
||||
process[Container Process]
|
||||
kataAgent --> process
|
||||
end
|
||||
end
|
||||
```
|
||||
|
||||
Because the container process is spawned inside of the VM, it runs on a guest kernel that is isolated from the host system. This is the fundamental principle of how Kata achieves its isolation boundaries.
|
||||
|
||||
## Example
|
||||
|
||||
When Kata is installed in a system, a number of artifacts are laid down. containerd's config will be modified as such:
|
||||
|
||||
```toml title="/etc/containerd/config.toml"
|
||||
imports = ["/opt/kata/containerd/config.d/kata-deploy.toml"]
|
||||
```
|
||||
|
||||
This file will contain configuration for various flavors of Kata runtimes. We can see the vanilla CPU runtime config here:
|
||||
|
||||
```toml title="/opt/kata/containerd/config.d/kata-deploy.toml"
|
||||
[plugins."io.containerd.cri.v1.runtime".containerd.runtimes.kata-qemu]
|
||||
runtime_type = "io.containerd.kata-qemu.v2"
|
||||
runtime_path = "/opt/kata/bin/containerd-shim-kata-v2"
|
||||
privileged_without_host_devices = true
|
||||
pod_annotations = ["io.katacontainers.*"]
|
||||
|
||||
[plugins."io.containerd.cri.v1.runtime".containerd.runtimes.kata-qemu.options]
|
||||
ConfigPath = "/opt/kata/share/defaults/kata-containers/configuration-qemu.toml"
|
||||
```
|
||||
|
||||
Because containerd's CRI is aware of the Kata runtimes, we can spawn Kubernetes pods:
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: test
|
||||
spec:
|
||||
runtimeClassName: kata-qemu
|
||||
containers:
|
||||
- name: test
|
||||
image: "quay.io/libpod/ubuntu:latest"
|
||||
command: ["/bin/bash", "-c"]
|
||||
args: ["echo hello"]
|
||||
```
|
||||
|
||||
We can also spawn a Kata container by submitting a request to containerd like so:
|
||||
|
||||
<div class="annotate" markdown>
|
||||
|
||||
```sh
|
||||
$ ctr image pull quay.io/libpod/ubuntu:latest
|
||||
$ ctr run --runtime "io.containerd.kata.v2" --runtime-config-path /opt/kata/share/defaults/kata-containers/configuration-qemu.toml --rm -t "quay.io/libpod/ubuntu:latest" foo sh
|
||||
# echo hello
|
||||
hello
|
||||
```
|
||||
|
||||
</div>
|
||||
|
||||
!!! tip
|
||||
|
||||
`ctr` is not aware of the CRI config in `/etc/containerd/config.toml`. This is why you must specify the `--runtime-config-path`. Additionally, the `--runtime` value is converted into a specific binary name which containerd then searches for in its `PATH`. See the [containerd docs](https://github.com/containerd/containerd/blob/release/2.2/core/runtime/v2/README.md#usage) for more details.
|
||||
64
docs/installation.md
Normal file
64
docs/installation.md
Normal file
@@ -0,0 +1,64 @@
|
||||
# Installation
|
||||
|
||||
## Helm Chart
|
||||
|
||||
[helm](https://helm.sh/docs/intro/install/) can be used to install templated kubernetes manifests.
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- **Kubernetes ≥ v1.22** – v1.22 is the first release where the CRI v1 API
|
||||
became the default and `RuntimeClass` left alpha. The chart depends on those
|
||||
stable interfaces; earlier clusters need `feature-gates` or CRI shims that are
|
||||
out of scope.
|
||||
|
||||
- **Kata Release 3.12** - v3.12.0 introduced publishing the helm-chart on the
|
||||
release page for easier consumption; since v3.8.0 the helm-chart has been shipped
|
||||
via source code in the kata-containers `GitHub` repository.
|
||||
|
||||
- CRI-compatible runtime (containerd or CRI-O). If one wants to use the
|
||||
`multiInstallSuffix` feature one needs at least **containerd-2.0** which
|
||||
supports drop-in config files
|
||||
|
||||
- Nodes must allow loading kernel modules and installing Kata artifacts (the
|
||||
chart runs privileged containers to do so)
|
||||
|
||||
### `helm install`
|
||||
|
||||
```sh
|
||||
# Install directly from the official ghcr.io OCI registry
|
||||
# update the VERSION X.YY.Z to your needs or just use the latest
|
||||
|
||||
export VERSION=$(curl -sSL https://api.github.com/repos/kata-containers/kata-containers/releases/latest | jq .tag_name | tr -d '"')
|
||||
export CHART="oci://ghcr.io/kata-containers/kata-deploy-charts/kata-deploy"
|
||||
|
||||
$ helm install kata-deploy -n kube-system "${CHART}" --version "${VERSION}"
|
||||
|
||||
# See everything you can configure
|
||||
$ helm show values "${CHART}" --version "${VERSION}"
|
||||
```
|
||||
|
||||
This installs the `kata-deploy` DaemonSet and the default Kata `RuntimeClass`
|
||||
resources on your cluster.
|
||||
|
||||
To see what versions of the chart are available:
|
||||
|
||||
```sh
|
||||
$ helm show chart oci://ghcr.io/kata-containers/kata-deploy-charts/kata-deploy
|
||||
```
|
||||
|
||||
### `helm uninstall`
|
||||
|
||||
```sh
|
||||
$ helm uninstall kata-deploy -n kube-system
|
||||
```
|
||||
|
||||
During uninstall, Helm will report that some resources were kept due to the
|
||||
resource policy (`ServiceAccount`, `ClusterRole`, `ClusterRoleBinding`). This
|
||||
is **normal**. A post-delete hook Job runs after uninstall and removes those
|
||||
resources so no cluster-wide `RBAC` is left behind.
|
||||
|
||||
## Pre-Built Release
|
||||
|
||||
Kata can also be installed using the pre-built releases: https://github.com/kata-containers/kata-containers/releases
|
||||
|
||||
This method does not have any facilities for artifact lifecycle management.
|
||||
116
docs/prerequisites.md
Normal file
116
docs/prerequisites.md
Normal file
@@ -0,0 +1,116 @@
|
||||
# Prerequisites
|
||||
|
||||
## Kubernetes
|
||||
|
||||
If using Kubernetes, at least version `v1.22` is recommended. This is the first release in which the CRI v1 API became the default and `RuntimeClass` left alpha.
|
||||
|
||||
## containerd
|
||||
|
||||
Kata requires a [CRI](https://kubernetes.io/docs/concepts/containers/cri/)-compatible container runtime. containerd is commonly used for Kata. We recommend installing containerd using your platform's package distribution mechanism. We recommend at least the latest version of containerd v2.1.x.[^1]
|
||||
|
||||
|
||||
### Debian/Ubuntu
|
||||
|
||||
To install on Debian-based systems:
|
||||
|
||||
```sh
|
||||
$ apt update
|
||||
$ apt install containerd
|
||||
$ systemctl status containerd
|
||||
● containerd.service - containerd container runtime
|
||||
Loaded: loaded (/etc/systemd/system/containerd.service; enabled; preset: enabled)
|
||||
Drop-In: /etc/systemd/system/containerd.service.d
|
||||
└─http-proxy.conf
|
||||
Active: active (running) since Wed 2026-02-25 22:58:13 UTC; 5 days ago
|
||||
Docs: https://containerd.io
|
||||
Main PID: 3767885 (containerd)
|
||||
Tasks: 540
|
||||
Memory: 70.7G (peak: 70.8G)
|
||||
CPU: 4h 9min 26.153s
|
||||
CGroup: /runtime.slice/containerd.service
|
||||
├─ 12694 /usr/local/bin/container
|
||||
```
|
||||
|
||||
### Fedora/RedHat
|
||||
|
||||
To install on Fedora-based systems:
|
||||
|
||||
```sh
|
||||
$ yum install containerd
|
||||
```
|
||||
|
||||
??? help
|
||||
|
||||
Documentation assistance is requested for more specific instructions on Fedora systems.
|
||||
|
||||
### Pre-Built Releases
|
||||
|
||||
Many Linux distributions will not package the latest versions of containerd. If you find that your distribution provides very old versions of containerd, it's recommended to upgrade with the [pre-built releases](https://github.com/containerd/containerd/releases).
|
||||
|
||||
#### Executable
|
||||
|
||||
Download the latest release of containerd:
|
||||
|
||||
```sh
|
||||
$ wget https://github.com/containerd/containerd/releases/download/v${VERSION}/containerd-${VERSION}-linux-${PLATFORM}.tar.gz
|
||||
|
||||
# Extract to the current directory
|
||||
$ tar -xf ./containerd*.tar.gz
|
||||
|
||||
# Extract to /usr/local if you want it installed to its final location (/usr/local/bin).
|
||||
$ tar -C /usr/local -xf ./containerd*.tar.gz
|
||||
```
|
||||
|
||||
### Containerd Config
|
||||
|
||||
Containerd requires a config file at `/etc/containerd/config.toml`. This needs to be populated with a simple default config:
|
||||
|
||||
```sh
|
||||
$ /usr/local/bin/containerd config default > /etc/containerd/config.toml
|
||||
```
|
||||
|
||||
### Systemd Unit File
|
||||
|
||||
Install the systemd unit file:
|
||||
|
||||
```sh
|
||||
$ wget -O /etc/systemd/system/containerd.service https://raw.githubusercontent.com/containerd/containerd/main/containerd.service
|
||||
```
|
||||
|
||||
!!! info
|
||||
|
||||
- You must modify the `ExecStart` line to the location of the installed containerd executable.
|
||||
- containerd's `PATH` variable must allow it to find `containerd-shim-kata-v2`. You can do this by either creating a symlink from `/usr/local/bin/containerd-shim-kata-v2` to `/opt/kata/bin/containerd-shim-kata-v2` or by modifying containerd's `PATH` variable to search in `/opt/kata/bin/`. See the `Environment=` directive in `systemd.exec(5)` for further details.
|
||||
|
||||
|
||||
Reload systemd and start containerd:
|
||||
|
||||
```sh
|
||||
$ systemctl daemon-reload
|
||||
$ systemctl enable --now containerd
|
||||
$ systemctl start containerd
|
||||
$ systemctl status containerd
|
||||
```
|
||||
|
||||
More details can be found on the [containerd installation docs](https://github.com/containerd/containerd/blob/main/docs/getting-started.md).
|
||||
|
||||
### Enable CRI
|
||||
|
||||
If you're using Kubernetes, the containerd Container Runtime Interface (CRI) plugin must be enabled. Verify that it is:
|
||||
|
||||
```sh
|
||||
$ ctr plugins ls | grep cri
|
||||
io.containerd.cri.v1 images - ok
|
||||
io.containerd.cri.v1 runtime linux/amd64 ok
|
||||
io.containerd.grpc.v1 cri - ok
|
||||
```
|
||||
|
||||
If these plugins are not listed as `ok`, you'll need to remove the CRI plugin from the `disabled_plugins` list in the containerd config.
|
||||
|
||||
|
||||
[^1]: Kata makes use of containerd's drop-in config merging in `/etc/containerd/config.d/` which is only available starting from containerd v2. containerd v1 may work, but some Kata features will not work as expected.
|
||||
|
||||
|
||||
## runc
|
||||
|
||||
The default `runc` runtime needs to be installed for non-kata containers. More details can be found at the [containerd docs](https://github.com/containerd/containerd/blob/979c80d8a5d7fc7be34102a1ada53ae5a0ff09e8/docs/RUNC.md).
|
||||
9
docs/requirements.txt
Normal file
9
docs/requirements.txt
Normal file
@@ -0,0 +1,9 @@
|
||||
mkdocs-materialx==10.0.9
|
||||
mkdocs-glightbox==0.4.0
|
||||
mkdocs-macros-plugin==1.5.0
|
||||
mkdocs-awesome-nav==3.3.0
|
||||
mkdocs-open-in-new-tab==1.0.8
|
||||
mkdocs-redirects==1.2.2
|
||||
CairoSVG==2.9.0
|
||||
pillow==12.1.1
|
||||
click==8.2.1
|
||||
56
docs/runtime-configuration.md
Normal file
56
docs/runtime-configuration.md
Normal file
@@ -0,0 +1,56 @@
|
||||
# Runtime Configuration
|
||||
|
||||
The containerd shims (both the Rust and Go implementations) take configuration files to control their behavior. These files are in `/opt/kata/share/defaults/kata-containers/`. An example excerpt:
|
||||
|
||||
```toml title="/opt/kata/share/defaults/kata-containers/configuration.toml"
|
||||
[hypervisor.qemu]
|
||||
path = "/opt/kata/bin/qemu-system-x86_64"
|
||||
kernel = "/opt/kata/share/kata-containers/vmlinux.container"
|
||||
image = "/opt/kata/share/kata-containers/kata-containers.img"
|
||||
machine_type = "q35"
|
||||
|
||||
# rootfs filesystem type:
|
||||
# - ext4 (default)
|
||||
# - xfs
|
||||
# - erofs
|
||||
rootfs_type = "ext4"
|
||||
|
||||
# Enable running QEMU VMM as a non-root user.
|
||||
# By default QEMU VMM run as root. When this is set to true, QEMU VMM process runs as
|
||||
# a non-root random user. See documentation for the limitations of this mode.
|
||||
rootless = false
|
||||
|
||||
# List of valid annotation names for the hypervisor
|
||||
# Each member of the list is a regular expression, which is the base name
|
||||
# of the annotation, e.g. "path" for io.katacontainers.config.hypervisor.path"
|
||||
enable_annotations = ["enable_iommu", "virtio_fs_extra_args", "kernel_params"]
|
||||
```
|
||||
|
||||
These files should never be modified directly. If you wish to create a modified version of these files, you may create your own [custom runtime](helm-configuration.md#custom-runtimes). For example, to modify the image path, we provide these values to helm:
|
||||
|
||||
```yaml title="values.yaml"
|
||||
customRuntimes:
|
||||
enabled: true
|
||||
runtimes:
|
||||
my-gpu-runtime:
|
||||
baseConfig: "qemu-nvidia-gpu"
|
||||
dropIn: |
|
||||
[hypervisor.qemu]
|
||||
image = "/path/to/custom-image.img"
|
||||
runtimeClass: |
|
||||
kind: RuntimeClass
|
||||
apiVersion: node.k8s.io/v1
|
||||
metadata:
|
||||
name: kata-my-gpu-runtime
|
||||
labels:
|
||||
app.kubernetes.io/managed-by: kata-deploy
|
||||
handler: kata-my-gpu-runtime
|
||||
overhead:
|
||||
podFixed:
|
||||
memory: "640Mi"
|
||||
cpu: "500m"
|
||||
scheduling:
|
||||
nodeSelector:
|
||||
katacontainers.io/kata-runtime: "true"
|
||||
```
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
# Enabling NVIDIA GPU workloads using GPU passthrough with Kata Containers
|
||||
|
||||
This page provides:
|
||||
|
||||
1. A description of the components involved when running GPU workloads with
|
||||
Kata Containers using the NVIDIA TEE and non-TEE GPU runtime classes.
|
||||
1. An explanation of the orchestration flow on a Kubernetes node for this
|
||||
scenario.
|
||||
1. A deployment guide enabling to utilize these runtime classes.
|
||||
1. A deployment guide to utilize these runtime classes.
|
||||
|
||||
The goal is to educate readers familiar with Kubernetes and Kata Containers
|
||||
on NVIDIA's reference implementation which is reflected in Kata CI's build
|
||||
@@ -18,58 +19,56 @@ Confidential Containers.
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
> The current supported mode for enabling GPU workloads in the TEE scenario
|
||||
> is single GPU passthrough (one GPU per pod) on AMD64 platforms (AMD SEV-SNP
|
||||
> being the only supported TEE scenario so far with support for Intel TDX being
|
||||
> on the way).
|
||||
> The currently supported modes for enabling GPU workloads in the TEE
|
||||
> scenario are: (1) single-GPU passthrough (one physical GPU per pod) and
|
||||
> (2) multi-GPU passthrough on NVSwitch (NVLink) based HGX systems
|
||||
> (for example, HGX Hopper (SXM) and HGX Blackwell / HGX B200).
|
||||
|
||||
## Component Overview
|
||||
|
||||
Before providing deployment guidance, we describe the components involved to
|
||||
support running GPU workloads. We start from a top to bottom perspective
|
||||
from the NVIDIA GPU operator via the Kata runtime to the components within
|
||||
from the NVIDIA GPU Operator via the Kata runtime to the components within
|
||||
the NVIDIA GPU Utility Virtual Machine (UVM) root filesystem.
|
||||
|
||||
### NVIDIA GPU Operator
|
||||
|
||||
A central component is the
|
||||
[NVIDIA GPU operator](https://github.com/NVIDIA/gpu-operator) which can be
|
||||
deployed onto your cluster as a helm chart. Installing the GPU operator
|
||||
[NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-operator) which can be
|
||||
deployed onto your cluster as a helm chart. Installing the GPU Operator
|
||||
delivers various operands on your nodes in the form of Kubernetes DaemonSets.
|
||||
These operands are vital to support the flow of orchestrating pod manifests
|
||||
using NVIDIA GPU runtime classes with GPU passthrough on your nodes. Without
|
||||
getting into the details, the most important operands and their
|
||||
responsibilities are:
|
||||
|
||||
- **nvidia-vfio-manager:** Binding discovered NVIDIA GPUs to the `vfio-pci`
|
||||
driver for VFIO passthrough.
|
||||
- **nvidia-vfio-manager:** Binding discovered NVIDIA GPUs and nvswitches to
|
||||
the `vfio-pci` driver for VFIO passthrough.
|
||||
- **nvidia-cc-manager:** Transitioning GPUs into confidential computing (CC)
|
||||
and non-CC mode (see the
|
||||
[NVIDIA/k8s-cc-manager](https://github.com/NVIDIA/k8s-cc-manager)
|
||||
repository).
|
||||
- **nvidia-kata-manager:** Creating host-side CDI specifications for GPU
|
||||
passthrough, resulting in the file `/var/run/cdi/nvidia.yaml`, containing
|
||||
`kind: nvidia.com/pgpu` (see the
|
||||
[NVIDIA/k8s-kata-manager](https://github.com/NVIDIA/k8s-kata-manager)
|
||||
repository).
|
||||
- **nvidia-sandbox-device-plugin** (see the
|
||||
[NVIDIA/sandbox-device-plugin](https://github.com/NVIDIA/sandbox-device-plugin)
|
||||
repository):
|
||||
- Creating host-side CDI specifications for GPU passthrough,
|
||||
resulting in the file `/var/run/cdi/nvidia.yaml`, containing
|
||||
`kind: nvidia.com/pgpu`
|
||||
- Allocating GPUs during pod deployment.
|
||||
- Discovering NVIDIA GPUs, their capabilities, and advertising these to
|
||||
the Kubernetes control plane (allocatable resources as type
|
||||
`nvidia.com/pgpu` resources will appear for the node and GPU Device IDs
|
||||
will be registered with Kubelet). These GPUs can thus be allocated as
|
||||
container resources in your pod manifests. See below GPU operator
|
||||
container resources in your pod manifests. See below GPU Operator
|
||||
deployment instructions for the use of the key `pgpu`, controlled via a
|
||||
variable.
|
||||
|
||||
To summarize, the GPU operator manages the GPUs on each node, allowing for
|
||||
To summarize, the GPU Operator manages the GPUs on each node, allowing for
|
||||
simple orchestration of pod manifests using Kata Containers. Once the cluster
|
||||
with GPU operator and Kata bits is up and running, the end user can schedule
|
||||
with GPU Operator and Kata bits is up and running, the end user can schedule
|
||||
Kata NVIDIA GPU workloads, using resource limits and the
|
||||
`kata-qemu-nvidia-gpu` or `kata-qemu-nvidia-gpu-snp` runtime classes, for
|
||||
example:
|
||||
`kata-qemu-nvidia-gpu`, `kata-qemu-nvidia-gpu-tdx` or
|
||||
`kata-qemu-nvidia-gpu-snp` runtime classes, for example:
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
@@ -213,7 +212,7 @@ API and kernel drivers, interacting with the pass-through GPU device.
|
||||
|
||||
An additional step is exercised in our CI samples: when using images from an
|
||||
authenticated registry, the guest-pull mechanism triggers attestation using
|
||||
trustee's Key Broker Service (KBS) for secure release of the NGC API
|
||||
Trustee's Key Broker Service (KBS) for secure release of the NGC API
|
||||
authentication key used to access the NVCR container registry. As part of
|
||||
this, the attestation agent exercises composite attestation and transitions
|
||||
the GPU into `Ready` state (without this, the GPU has to explicitly be
|
||||
@@ -232,24 +231,40 @@ NVIDIA GPU CI validation jobs. Note that, this setup:
|
||||
- uses the genpolicy tool to attach Kata agent security policies to the pod
|
||||
manifest
|
||||
- has dedicated (composite) attestation tests, a CUDA vectorAdd test, and a
|
||||
NIM/RA test sample with secure API key release
|
||||
NIM/RAG test sample with secure API key release using sealed secrets.
|
||||
|
||||
A similar deployment guide and scenario description can be found in NVIDIA resources
|
||||
under
|
||||
[Early Access: NVIDIA GPU Operator with Confidential Containers based on Kata](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/confidential-containers.html).
|
||||
[NVIDIA Confidential Containers Overview (Early Access)](https://docs.nvidia.com/datacenter/cloud-native/confidential-containers/latest/overview.html).
|
||||
|
||||
### Feature Set
|
||||
|
||||
The NVIDIA stack for Kata Containers leverages features for the confidential
|
||||
computing scenario from both the confidential containers open source project
|
||||
and from the Kata Containers source tree, such as:
|
||||
- composite attestation using Trustee and the NVIDIA Remote Attestation
|
||||
Service (NRAS)
|
||||
- generating Kata agent security policies using the genpolicy tool
|
||||
- use of signed sealed secrets
|
||||
- access to authenticated registries for container image guest-pull
|
||||
- container image signature verification and encrypted container images
|
||||
- ephemeral container data and image layer storage
|
||||
|
||||
### Requirements
|
||||
|
||||
The requirements for the TEE scenario are:
|
||||
|
||||
- Ubuntu 25.10 as host OS
|
||||
- CPU with AMD SEV-SNP support with proper BIOS/UEFI version and settings
|
||||
- CPU with AMD SEV-SNP or Intel TDX support with proper BIOS/UEFI version
|
||||
and settings
|
||||
- CC-capable Hopper/Blackwell GPU with proper VBIOS version.
|
||||
|
||||
BIOS and VBIOS configuration is out of scope for this guide. Other resources,
|
||||
such as the documentation found on the
|
||||
[NVIDIA Trusted Computing Solutions](https://docs.nvidia.com/nvtrust/index.html)
|
||||
page and the above linked NVIDIA documentation, provide guidance on
|
||||
page, on the
|
||||
[Secure AI Compatibility Matrix](https://www.nvidia.com/en-us/data-center/solutions/confidential-computing/secure-ai-compatibility-matrix/)
|
||||
page, and in the above-linked NVIDIA documentation, provide guidance on
|
||||
selecting proper hardware and on properly configuring its firmware and OS.
|
||||
|
||||
### Installation
|
||||
@@ -257,12 +272,16 @@ selecting proper hardware and on properly configuring its firmware and OS.
|
||||
#### Containerd and Kubernetes
|
||||
|
||||
First, set up your Kubernetes cluster. For instance, in Kata CI, our NVIDIA
|
||||
jobs use a single-node vanilla Kubernetes cluster with a 2.x containerd
|
||||
version and Kata's current supported Kubernetes version. We set this cluster
|
||||
up using the `deploy_k8s` function from `tests/integration/kubernetes/gha-run.sh`
|
||||
as follows:
|
||||
|
||||
jobs use a single-node vanilla Kubernetes cluster with a 2.1 containerd
|
||||
version and Kata's current supported Kubernetes version. This cluster is
|
||||
being set up using the `deploy_k8s` function from the script file
|
||||
`tests/integration/kubernetes/gha-run.sh`. If you intend to run this script,
|
||||
follow these steps, and make sure you have `yq` and `helm` installed. Note
|
||||
that these scripts query the GitHub API, so creating and declaring a
|
||||
personal access token prevents rate limiting issues.
|
||||
You can execute the function as follows:
|
||||
```bash
|
||||
$ export GH_TOKEN="<your-gh-pat>"
|
||||
$ export KUBERNETES="vanilla"
|
||||
$ export CONTAINER_ENGINE="containerd"
|
||||
$ export CONTAINER_ENGINE_VERSION="v2.1"
|
||||
@@ -276,8 +295,11 @@ $ deploy_k8s
|
||||
> `runtimeRequestTimeout` timeout value than the two minute default timeout.
|
||||
> Using the guest-pull mechanism, pulling large images may take a significant
|
||||
> amount of time and may delay container start, possibly leading your Kubelet
|
||||
> to de-allocate your pod before it transitions from the *container created*
|
||||
> to the *container running* state.
|
||||
> to de-allocate your pod before it transitions from the *container creating*
|
||||
> to the *container running* state. The NVIDIA shim configurations use a
|
||||
> `create_container_timeout` of 1200s, which is the equivalent value on the shim
|
||||
> side, controlling the time the shim allows for a container to remain in
|
||||
> *container creating* state.
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
@@ -291,7 +313,7 @@ $ deploy_k8s
|
||||
#### GPU Operator
|
||||
|
||||
Assuming you have the helm tools installed, deploy the latest version of the
|
||||
GPU Operator as a helm chart (minimum version: `v25.10.0`):
|
||||
GPU Operator as a helm chart (minimum version: `v26.3.0`):
|
||||
|
||||
```bash
|
||||
$ helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update
|
||||
@@ -300,33 +322,27 @@ $ helm install --wait --generate-name \
|
||||
nvidia/gpu-operator \
|
||||
--set sandboxWorkloads.enabled=true \
|
||||
--set sandboxWorkloads.defaultWorkload=vm-passthrough \
|
||||
--set kataManager.enabled=true \
|
||||
--set kataManager.config.runtimeClasses=null \
|
||||
--set kataManager.repository=nvcr.io/nvidia/cloud-native \
|
||||
--set kataManager.image=k8s-kata-manager \
|
||||
--set kataManager.version=v0.2.4 \
|
||||
--set ccManager.enabled=true \
|
||||
--set ccManager.defaultMode=on \
|
||||
--set ccManager.repository=nvcr.io/nvidia/cloud-native \
|
||||
--set ccManager.image=k8s-cc-manager \
|
||||
--set ccManager.version=v0.2.0 \
|
||||
--set sandboxDevicePlugin.repository=nvcr.io/nvidia/cloud-native \
|
||||
--set sandboxDevicePlugin.image=nvidia-sandbox-device-plugin \
|
||||
--set sandboxDevicePlugin.version=v0.0.1 \
|
||||
--set 'sandboxDevicePlugin.env[0].name=P_GPU_ALIAS' \
|
||||
--set 'sandboxDevicePlugin.env[0].value=pgpu' \
|
||||
--set sandboxWorkloads.mode=kata \
|
||||
--set nfd.enabled=true \
|
||||
--set nfd.nodefeaturerules=true
|
||||
```
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
> For heterogeneous clusters with different GPU types, you can omit
|
||||
> the `P_GPU_ALIAS` environment variable lines. This will cause the sandbox
|
||||
> device plugin to create GPU model-specific resource types (e.g.,
|
||||
> `nvidia.com/GH100_H100L_94GB`) instead of the generic `nvidia.com/pgpu`,
|
||||
> which in turn can be used by pods through respective resource limits.
|
||||
> For simplicity, this guide uses the generic alias.
|
||||
> For heterogeneous clusters with different GPU types, you can specify an
|
||||
> empty `P_GPU_ALIAS` environment variable for the sandbox device plugin:
|
||||
> `--set 'sandboxDevicePlugin.env[0].name=P_GPU_ALIAS' \`
|
||||
> `--set 'sandboxDevicePlugin.env[0].value=""' \`
|
||||
> This will cause the sandbox device plugin to create GPU model-specific
|
||||
> resource types (e.g., `nvidia.com/GH100_H100L_94GB`) instead of the
|
||||
> default `pgpu` type, which usually results in advertising a resource of
|
||||
> type `nvidia.com/pgpu`.
|
||||
> The exposed device resource types can be used for pods by specifying
|
||||
> respective resource limits.
|
||||
> Your node's nvswitches are exposed as resources of type
|
||||
> `nvidia.com/nvswitch` by default. Using the variable `NVSWITCH_ALIAS`
|
||||
> allows you to control the advertising behavior, similar to the `P_GPU_ALIAS`
|
||||
> variable.
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
@@ -351,8 +367,7 @@ $ helm install kata-deploy \
|
||||
--create-namespace \
|
||||
-f "https://raw.githubusercontent.com/kata-containers/kata-containers/refs/tags/${VERSION}/tools/packaging/kata-deploy/helm-chart/kata-deploy/try-kata-nvidia-gpu.values.yaml" \
|
||||
--set nfd.enabled=false \
|
||||
--set shims.qemu-nvidia-gpu-tdx.enabled=false \
|
||||
--wait --timeout 10m --atomic \
|
||||
--wait --timeout 10m \
|
||||
"${CHART}" --version "${VERSION}"
|
||||
```
|
||||
|
||||
@@ -382,31 +397,22 @@ mode which requires entering a licensing agreement with NVIDIA, see the
|
||||
### Cluster validation and preparation
|
||||
|
||||
If you did not use the `sandboxWorkloads.defaultWorkload=vm-passthrough`
|
||||
parameter during GPU operator deployment, label your nodes for GPU VM
|
||||
parameter during GPU Operator deployment, label your nodes for GPU VM
|
||||
passthrough. For example, to use all nodes for GPU passthrough, run:
|
||||
|
||||
```bash
|
||||
$ kubectl label nodes --all nvidia.com/gpu.workload.config=vm-passthrough --overwrite
|
||||
```
|
||||
|
||||
Check if the `nvidia-cc-manager` pod is running if you intend to run GPU TEE
|
||||
scenarios. If not, you need to manually label the node as CC capable. Current
|
||||
GPU Operator node feature rules do not yet recognize all CC capable GPU PCI
|
||||
IDs. Run the following command:
|
||||
|
||||
```bash
|
||||
$ kubectl label nodes --all nvidia.com/cc.capable=true
|
||||
```
|
||||
|
||||
After this, assure the `nvidia-cc-manager` pod is running. With the suggested
|
||||
parameters for GPU Operator deployment, the `nvidia-cc-manager` will
|
||||
automatically transition the GPU into CC mode.
|
||||
With the suggested parameters for GPU Operator deployment, the
|
||||
`nvidia-cc-manager` operand will automatically transition the GPU into CC
|
||||
mode.
|
||||
|
||||
After deployment, you can transition your node(s) to the desired CC state,
|
||||
using either the `on` or `off` value, depending on your scenario. For the
|
||||
non-CC scenario, transition to the `off` state via:
|
||||
using either the `on`, `ppcie`, or `off` value, depending on your scenario.
|
||||
For the non-CC scenario, transition to the `off` state via:
|
||||
`kubectl label nodes --all nvidia.com/cc.mode=off` and wait until all pods
|
||||
are back running. When an actual change is exercised, various GPU operator
|
||||
are back running. When an actual change is exercised, various GPU Operator
|
||||
operands will be restarted.
|
||||
|
||||
Ensure all pods are running:
|
||||
@@ -425,9 +431,10 @@ $ lspci -nnk -d 10de:
|
||||
|
||||
### Run the CUDA vectorAdd sample
|
||||
|
||||
Create the following file:
|
||||
Create the pod manifest with:
|
||||
|
||||
```yaml
|
||||
```bash
|
||||
$ cat > cuda-vectoradd-kata.yaml.in << 'EOF'
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
@@ -445,6 +452,7 @@ spec:
|
||||
limits:
|
||||
nvidia.com/pgpu: "1"
|
||||
memory: 16Gi
|
||||
EOF
|
||||
```
|
||||
|
||||
Depending on your scenario and on the CC state, export your desired runtime
|
||||
@@ -477,6 +485,17 @@ To stop the pod, run: `kubectl delete pod cuda-vectoradd-kata`.
|
||||
|
||||
### Next steps
|
||||
|
||||
#### Use multi-GPU passthrough
|
||||
|
||||
If you have machines supporting multi-GPU passthrough, use a pod deployment
|
||||
manifest which uses 8 pgpu and 4 nvswitch resources.
|
||||
On the NVIDIA Hopper architecture, multi-GPU passthrough uses protected PCIe
|
||||
(PPCIE) which claims exclusive use of the nvswitches for a single CVM. In
|
||||
this case, transition the GPU mode of your relevant node(s) to `ppcie`.
|
||||
The NVIDIA Blackwell architecture uses NVLink encryption which places the
|
||||
switches outside of the Trusted Computing Base (TCB) and so does not
|
||||
require a separate switch setting.
|
||||
|
||||
#### Transition between CC and non-CC mode
|
||||
|
||||
Use the previously described node labeling approach to transition between
|
||||
@@ -492,7 +511,7 @@ and a basic NIM/RAG deployment. Running CI tests for the TEE GPU scenario
|
||||
requires KBS to be deployed (except for the CUDA vectorAdd test). The best
|
||||
place to get started running these tests locally is to look into our
|
||||
[NVIDIA CI workflow manifest](https://github.com/kata-containers/kata-containers/blob/main/.github/workflows/run-k8s-tests-on-nvidia-gpu.yaml)
|
||||
and into the underling
|
||||
and into the underlying
|
||||
[run_kubernetes_nv_tests.sh](https://github.com/kata-containers/kata-containers/blob/main/tests/integration/kubernetes/run_kubernetes_nv_tests.sh)
|
||||
script. For example, to run the CUDA vectorAdd scenario against the TEE GPU
|
||||
runtime class use the following commands:
|
||||
@@ -547,6 +566,22 @@ With GPU passthrough being supported by the
|
||||
you can use the tool to create a Kata agent security policy. Our CI deploys
|
||||
all sample pod manifests with a Kata agent security policy.
|
||||
|
||||
Note that, using containerd 2.1 in upstream's CI, we use the following
|
||||
modification to the genpolicy default settings:
|
||||
```json
|
||||
[
|
||||
{
|
||||
"op": "replace",
|
||||
"path": "/kata_config/oci_version",
|
||||
"value": "1.2.1"
|
||||
}
|
||||
]
|
||||
```
|
||||
This modification is applied via the genpolicy drop-in configuration file
|
||||
`src/tools/genpolicy/drop-in-examples/20-oci-1.2.1-drop-in.json`.
|
||||
When using a newer containerd version, such as containerd 2.2, the OCI
|
||||
version field needs to be adjusted to "1.3.0", for instance.
|
||||
|
||||
#### Deploy pods using your own containers and manifests
|
||||
|
||||
You can author pod manifests leveraging your own containers, for instance,
|
||||
@@ -564,6 +599,3 @@ following annotation in the manifest:
|
||||
>
|
||||
> - musl-based container images (e.g., using Alpine), or distro-less
|
||||
> containers are not supported.
|
||||
> - for the TEE scenario, only single-GPU passthrough per pod is supported,
|
||||
> so your pod resource limit must be: `nvidia.com/pgpu: "1"` (on a system
|
||||
> with multiple GPUs, you can thus pass through one GPU per pod).
|
||||
|
||||
91
mkdocs.yaml
Normal file
91
mkdocs.yaml
Normal file
@@ -0,0 +1,91 @@
|
||||
site_name: "Kata Containers Docs"
|
||||
site_description: "Developer and user documentation for the Kata Containers project."
|
||||
site_author: "Kata Containers Community"
|
||||
|
||||
repo_url: "https://github.com/kata-containers/kata-containers"
|
||||
site_url: "https://kata-containers.github.io/kata-containers"
|
||||
edit_uri: "edit/main/docs/"
|
||||
repo_name: kata-containers
|
||||
|
||||
theme:
|
||||
name: materialx
|
||||
favicon: "assets/images/favicon.svg"
|
||||
logo: "assets/images/favicon.svg"
|
||||
topbar_style: glass
|
||||
palette:
|
||||
- media: "(prefers-color-scheme)"
|
||||
toggle:
|
||||
icon: material/brightness-auto
|
||||
name: Switch to light mode
|
||||
- media: "(prefers-color-scheme: light)"
|
||||
scheme: default
|
||||
primary: blue
|
||||
accent: light blue
|
||||
toggle:
|
||||
icon: material/weather-sunny
|
||||
name: Switch to dark mode
|
||||
- media: "(prefers-color-scheme: dark)"
|
||||
scheme: slate
|
||||
primary: cyan
|
||||
accent: cyan
|
||||
toggle:
|
||||
icon: material/brightness-4
|
||||
name: Switch to system preference
|
||||
features:
|
||||
- content.action.edit
|
||||
- content.action.view
|
||||
- content.code.annotate
|
||||
- content.code.copy
|
||||
- content.code.select
|
||||
- content.footnote.tooltips
|
||||
- content.tabs.link
|
||||
- content.tooltips
|
||||
- navigation.expand
|
||||
- navigation.indexes
|
||||
- navigation.path
|
||||
- navigation.sections
|
||||
- navigation.tabs
|
||||
- navigation.tracking
|
||||
- navigation.top
|
||||
- navigation.instant
|
||||
- navigation.instant.prefetch
|
||||
- navigation.instant.progress
|
||||
- toc.follow
|
||||
markdown_extensions:
|
||||
- abbr
|
||||
- admonition
|
||||
- attr_list
|
||||
- def_list
|
||||
- footnotes
|
||||
- md_in_html
|
||||
- pymdownx.arithmatex:
|
||||
generic: true
|
||||
- pymdownx.emoji:
|
||||
emoji_index: !!python/name:material.extensions.emoji.twemoji
|
||||
emoji_generator: !!python/name:material.extensions.emoji.to_svg
|
||||
- pymdownx.details
|
||||
- pymdownx.highlight:
|
||||
anchor_linenums: true
|
||||
line_spans: __span
|
||||
pygments_lang_class: true
|
||||
auto_title: true
|
||||
- pymdownx.keys
|
||||
- pymdownx.magiclink
|
||||
- pymdownx.superfences:
|
||||
custom_fences:
|
||||
- name: mermaid
|
||||
class: mermaid
|
||||
format: !!python/name:pymdownx.superfences.fence_code_format
|
||||
- pymdownx.inlinehilite
|
||||
- pymdownx.tabbed:
|
||||
alternate_style: true
|
||||
- pymdownx.tilde
|
||||
- pymdownx.caret
|
||||
- pymdownx.mark
|
||||
- toc:
|
||||
permalink: true
|
||||
|
||||
plugins:
|
||||
- search
|
||||
- awesome-nav
|
||||
|
||||
@@ -50,6 +50,7 @@ vm-memory = { workspace = true, features = ["backend-mmap"] }
|
||||
crossbeam-channel = "0.5.6"
|
||||
vfio-bindings = { workspace = true, optional = true }
|
||||
vfio-ioctls = { workspace = true, optional = true }
|
||||
kata-sys-util = { path = "../libs/kata-sys-util" }
|
||||
|
||||
[dev-dependencies]
|
||||
slog-async = "2.7.0"
|
||||
|
||||
@@ -1,16 +1,15 @@
|
||||
// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
use std::io::{Read, Write};
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
|
||||
use vm_memory::bitmap::{Bitmap, BS};
|
||||
use vm_memory::guest_memory::GuestMemoryIterator;
|
||||
use vm_memory::mmap::{Error, NewBitmap};
|
||||
use vm_memory::mmap::NewBitmap;
|
||||
use vm_memory::{
|
||||
guest_memory, AtomicAccess, Bytes, FileOffset, GuestAddress, GuestMemory, GuestMemoryRegion,
|
||||
GuestRegionMmap, GuestUsize, MemoryRegionAddress, VolatileSlice,
|
||||
GuestRegionCollectionError, GuestRegionMmap, GuestUsize, MemoryRegionAddress, ReadVolatile,
|
||||
VolatileSlice, WriteVolatile,
|
||||
};
|
||||
|
||||
use crate::GuestRegionRaw;
|
||||
@@ -67,63 +66,63 @@ impl<B: Bitmap> Bytes<MemoryRegionAddress> for GuestRegionHybrid<B> {
|
||||
}
|
||||
}
|
||||
|
||||
fn read_from<F>(
|
||||
fn read_volatile_from<F>(
|
||||
&self,
|
||||
addr: MemoryRegionAddress,
|
||||
src: &mut F,
|
||||
count: usize,
|
||||
) -> guest_memory::Result<usize>
|
||||
where
|
||||
F: Read,
|
||||
F: ReadVolatile,
|
||||
{
|
||||
match self {
|
||||
GuestRegionHybrid::Mmap(region) => region.read_from(addr, src, count),
|
||||
GuestRegionHybrid::Raw(region) => region.read_from(addr, src, count),
|
||||
GuestRegionHybrid::Mmap(region) => region.read_volatile_from(addr, src, count),
|
||||
GuestRegionHybrid::Raw(region) => region.read_volatile_from(addr, src, count),
|
||||
}
|
||||
}
|
||||
|
||||
fn read_exact_from<F>(
|
||||
fn read_exact_volatile_from<F>(
|
||||
&self,
|
||||
addr: MemoryRegionAddress,
|
||||
src: &mut F,
|
||||
count: usize,
|
||||
) -> guest_memory::Result<()>
|
||||
where
|
||||
F: Read,
|
||||
F: ReadVolatile,
|
||||
{
|
||||
match self {
|
||||
GuestRegionHybrid::Mmap(region) => region.read_exact_from(addr, src, count),
|
||||
GuestRegionHybrid::Raw(region) => region.read_exact_from(addr, src, count),
|
||||
GuestRegionHybrid::Mmap(region) => region.read_exact_volatile_from(addr, src, count),
|
||||
GuestRegionHybrid::Raw(region) => region.read_exact_volatile_from(addr, src, count),
|
||||
}
|
||||
}
|
||||
|
||||
fn write_to<F>(
|
||||
fn write_volatile_to<F>(
|
||||
&self,
|
||||
addr: MemoryRegionAddress,
|
||||
dst: &mut F,
|
||||
count: usize,
|
||||
) -> guest_memory::Result<usize>
|
||||
where
|
||||
F: Write,
|
||||
F: WriteVolatile,
|
||||
{
|
||||
match self {
|
||||
GuestRegionHybrid::Mmap(region) => region.write_to(addr, dst, count),
|
||||
GuestRegionHybrid::Raw(region) => region.write_to(addr, dst, count),
|
||||
GuestRegionHybrid::Mmap(region) => region.write_volatile_to(addr, dst, count),
|
||||
GuestRegionHybrid::Raw(region) => region.write_volatile_to(addr, dst, count),
|
||||
}
|
||||
}
|
||||
|
||||
fn write_all_to<F>(
|
||||
fn write_all_volatile_to<F>(
|
||||
&self,
|
||||
addr: MemoryRegionAddress,
|
||||
dst: &mut F,
|
||||
count: usize,
|
||||
) -> guest_memory::Result<()>
|
||||
where
|
||||
F: Write,
|
||||
F: WriteVolatile,
|
||||
{
|
||||
match self {
|
||||
GuestRegionHybrid::Mmap(region) => region.write_all_to(addr, dst, count),
|
||||
GuestRegionHybrid::Raw(region) => region.write_all_to(addr, dst, count),
|
||||
GuestRegionHybrid::Mmap(region) => region.write_all_volatile_to(addr, dst, count),
|
||||
GuestRegionHybrid::Raw(region) => region.write_all_volatile_to(addr, dst, count),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -168,7 +167,7 @@ impl<B: Bitmap> GuestMemoryRegion for GuestRegionHybrid<B> {
|
||||
}
|
||||
}
|
||||
|
||||
fn bitmap(&self) -> &Self::B {
|
||||
fn bitmap(&self) -> BS<'_, Self::B> {
|
||||
match self {
|
||||
GuestRegionHybrid::Mmap(region) => region.bitmap(),
|
||||
GuestRegionHybrid::Raw(region) => region.bitmap(),
|
||||
@@ -189,20 +188,6 @@ impl<B: Bitmap> GuestMemoryRegion for GuestRegionHybrid<B> {
|
||||
}
|
||||
}
|
||||
|
||||
unsafe fn as_slice(&self) -> Option<&[u8]> {
|
||||
match self {
|
||||
GuestRegionHybrid::Mmap(region) => region.as_slice(),
|
||||
GuestRegionHybrid::Raw(region) => region.as_slice(),
|
||||
}
|
||||
}
|
||||
|
||||
unsafe fn as_mut_slice(&self) -> Option<&mut [u8]> {
|
||||
match self {
|
||||
GuestRegionHybrid::Mmap(region) => region.as_mut_slice(),
|
||||
GuestRegionHybrid::Raw(region) => region.as_mut_slice(),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_slice(
|
||||
&self,
|
||||
offset: MemoryRegionAddress,
|
||||
@@ -223,6 +208,39 @@ impl<B: Bitmap> GuestMemoryRegion for GuestRegionHybrid<B> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<B: Bitmap> GuestRegionHybrid<B> {
|
||||
/// Returns a slice corresponding to the region.
|
||||
///
|
||||
/// # Safety
|
||||
/// This is safe because we mapped the area at addr ourselves, so this slice will not
|
||||
/// overflow. However, it is possible to alias.
|
||||
pub unsafe fn as_slice(&self) -> Option<&[u8]> {
|
||||
match self {
|
||||
GuestRegionHybrid::Mmap(region) => {
|
||||
let addr = region.get_host_address(MemoryRegionAddress(0)).ok()?;
|
||||
Some(std::slice::from_raw_parts(addr, region.len() as usize))
|
||||
}
|
||||
GuestRegionHybrid::Raw(region) => region.as_slice(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a mutable slice corresponding to the region.
|
||||
///
|
||||
/// # Safety
|
||||
/// This is safe because we mapped the area at addr ourselves, so this slice will not
|
||||
/// overflow. However, it is possible to alias.
|
||||
#[allow(clippy::mut_from_ref)]
|
||||
pub unsafe fn as_mut_slice(&self) -> Option<&mut [u8]> {
|
||||
match self {
|
||||
GuestRegionHybrid::Mmap(region) => {
|
||||
let addr = region.get_host_address(MemoryRegionAddress(0)).ok()?;
|
||||
Some(std::slice::from_raw_parts_mut(addr, region.len() as usize))
|
||||
}
|
||||
GuestRegionHybrid::Raw(region) => region.as_mut_slice(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// [`GuestMemory`](trait.GuestMemory.html) implementation that manage hybrid types of guest memory
|
||||
/// regions.
|
||||
///
|
||||
@@ -248,7 +266,9 @@ impl<B: Bitmap> GuestMemoryHybrid<B> {
|
||||
/// * `regions` - The vector of regions.
|
||||
/// The regions shouldn't overlap and they should be sorted
|
||||
/// by the starting address.
|
||||
pub fn from_regions(mut regions: Vec<GuestRegionHybrid<B>>) -> Result<Self, Error> {
|
||||
pub fn from_regions(
|
||||
mut regions: Vec<GuestRegionHybrid<B>>,
|
||||
) -> Result<Self, GuestRegionCollectionError> {
|
||||
Self::from_arc_regions(regions.drain(..).map(Arc::new).collect())
|
||||
}
|
||||
|
||||
@@ -264,9 +284,11 @@ impl<B: Bitmap> GuestMemoryHybrid<B> {
|
||||
/// * `regions` - The vector of `Arc` regions.
|
||||
/// The regions shouldn't overlap and they should be sorted
|
||||
/// by the starting address.
|
||||
pub fn from_arc_regions(regions: Vec<Arc<GuestRegionHybrid<B>>>) -> Result<Self, Error> {
|
||||
pub fn from_arc_regions(
|
||||
regions: Vec<Arc<GuestRegionHybrid<B>>>,
|
||||
) -> Result<Self, GuestRegionCollectionError> {
|
||||
if regions.is_empty() {
|
||||
return Err(Error::NoMemoryRegion);
|
||||
return Err(GuestRegionCollectionError::NoMemoryRegion);
|
||||
}
|
||||
|
||||
for window in regions.windows(2) {
|
||||
@@ -274,11 +296,11 @@ impl<B: Bitmap> GuestMemoryHybrid<B> {
|
||||
let next = &window[1];
|
||||
|
||||
if prev.start_addr() > next.start_addr() {
|
||||
return Err(Error::UnsortedMemoryRegions);
|
||||
return Err(GuestRegionCollectionError::UnsortedMemoryRegions);
|
||||
}
|
||||
|
||||
if prev.last_addr() >= next.start_addr() {
|
||||
return Err(Error::MemoryRegionOverlap);
|
||||
return Err(GuestRegionCollectionError::MemoryRegionOverlap);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -292,7 +314,7 @@ impl<B: Bitmap> GuestMemoryHybrid<B> {
|
||||
pub fn insert_region(
|
||||
&self,
|
||||
region: Arc<GuestRegionHybrid<B>>,
|
||||
) -> Result<GuestMemoryHybrid<B>, Error> {
|
||||
) -> Result<GuestMemoryHybrid<B>, GuestRegionCollectionError> {
|
||||
let mut regions = self.regions.clone();
|
||||
regions.push(region);
|
||||
regions.sort_by_key(|x| x.start_addr());
|
||||
@@ -310,7 +332,7 @@ impl<B: Bitmap> GuestMemoryHybrid<B> {
|
||||
&self,
|
||||
base: GuestAddress,
|
||||
size: GuestUsize,
|
||||
) -> Result<(GuestMemoryHybrid<B>, Arc<GuestRegionHybrid<B>>), Error> {
|
||||
) -> Result<(GuestMemoryHybrid<B>, Arc<GuestRegionHybrid<B>>), GuestRegionCollectionError> {
|
||||
if let Ok(region_index) = self.regions.binary_search_by_key(&base, |x| x.start_addr()) {
|
||||
if self.regions.get(region_index).unwrap().len() as GuestUsize == size {
|
||||
let mut regions = self.regions.clone();
|
||||
@@ -319,32 +341,13 @@ impl<B: Bitmap> GuestMemoryHybrid<B> {
|
||||
}
|
||||
}
|
||||
|
||||
Err(Error::InvalidGuestRegion)
|
||||
Err(GuestRegionCollectionError::NoMemoryRegion)
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over the elements of `GuestMemoryHybrid`.
|
||||
///
|
||||
/// This struct is created by `GuestMemory::iter()`. See its documentation for more.
|
||||
pub struct Iter<'a, B>(std::slice::Iter<'a, Arc<GuestRegionHybrid<B>>>);
|
||||
|
||||
impl<'a, B> Iterator for Iter<'a, B> {
|
||||
type Item = &'a GuestRegionHybrid<B>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.0.next().map(AsRef::as_ref)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, B: 'a> GuestMemoryIterator<'a, GuestRegionHybrid<B>> for GuestMemoryHybrid<B> {
|
||||
type Iter = Iter<'a, B>;
|
||||
}
|
||||
|
||||
impl<B: Bitmap + 'static> GuestMemory for GuestMemoryHybrid<B> {
|
||||
type R = GuestRegionHybrid<B>;
|
||||
|
||||
type I = Self;
|
||||
|
||||
fn num_regions(&self) -> usize {
|
||||
self.regions.len()
|
||||
}
|
||||
@@ -359,15 +362,15 @@ impl<B: Bitmap + 'static> GuestMemory for GuestMemoryHybrid<B> {
|
||||
index.map(|x| self.regions[x].as_ref())
|
||||
}
|
||||
|
||||
fn iter(&self) -> Iter<'_, B> {
|
||||
Iter(self.regions.iter())
|
||||
fn iter(&self) -> impl Iterator<Item = &GuestRegionHybrid<B>> {
|
||||
self.regions.iter().map(AsRef::as_ref)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::io::Seek;
|
||||
use std::io::{Read, Seek, Write};
|
||||
use vm_memory::{GuestMemoryError, MmapRegion};
|
||||
use vmm_sys_util::tempfile::TempFile;
|
||||
|
||||
@@ -654,14 +657,14 @@ mod tests {
|
||||
// Rewind file pointer after write operation.
|
||||
file_to_write_mmap_region.rewind().unwrap();
|
||||
guest_region
|
||||
.read_from(write_addr, &mut file_to_write_mmap_region, size_of_file)
|
||||
.read_volatile_from(write_addr, &mut file_to_write_mmap_region, size_of_file)
|
||||
.unwrap();
|
||||
let mut file_read_from_mmap_region = TempFile::new().unwrap().into_file();
|
||||
file_read_from_mmap_region
|
||||
.set_len(size_of_file as u64)
|
||||
.unwrap();
|
||||
guest_region
|
||||
.write_all_to(write_addr, &mut file_read_from_mmap_region, size_of_file)
|
||||
.write_all_volatile_to(write_addr, &mut file_read_from_mmap_region, size_of_file)
|
||||
.unwrap();
|
||||
// Rewind file pointer after write operation.
|
||||
file_read_from_mmap_region.rewind().unwrap();
|
||||
@@ -679,7 +682,7 @@ mod tests {
|
||||
let invalid_addr = MemoryRegionAddress(0x900);
|
||||
assert!(matches!(
|
||||
guest_region
|
||||
.read_from(invalid_addr, &mut file_to_write_mmap_region, size_of_file)
|
||||
.read_volatile_from(invalid_addr, &mut file_to_write_mmap_region, size_of_file)
|
||||
.err()
|
||||
.unwrap(),
|
||||
GuestMemoryError::InvalidBackendAddress
|
||||
@@ -689,7 +692,7 @@ mod tests {
|
||||
let invalid_addr = MemoryRegionAddress(0x900);
|
||||
assert!(matches!(
|
||||
guest_region
|
||||
.write_to(invalid_addr, &mut file_read_from_mmap_region, size_of_file)
|
||||
.write_volatile_to(invalid_addr, &mut file_read_from_mmap_region, size_of_file)
|
||||
.err()
|
||||
.unwrap(),
|
||||
GuestMemoryError::InvalidBackendAddress
|
||||
@@ -719,14 +722,14 @@ mod tests {
|
||||
// Rewind file pointer after write operation.
|
||||
file_to_write_mmap_region.rewind().unwrap();
|
||||
guest_region
|
||||
.read_from(write_addr, &mut file_to_write_mmap_region, size_of_file)
|
||||
.read_volatile_from(write_addr, &mut file_to_write_mmap_region, size_of_file)
|
||||
.unwrap();
|
||||
let mut file_read_from_mmap_region = TempFile::new().unwrap().into_file();
|
||||
file_read_from_mmap_region
|
||||
.set_len(size_of_file as u64)
|
||||
.unwrap();
|
||||
guest_region
|
||||
.write_all_to(write_addr, &mut file_read_from_mmap_region, size_of_file)
|
||||
.write_all_volatile_to(write_addr, &mut file_read_from_mmap_region, size_of_file)
|
||||
.unwrap();
|
||||
// Rewind file pointer after write operation.
|
||||
file_read_from_mmap_region.rewind().unwrap();
|
||||
@@ -744,7 +747,7 @@ mod tests {
|
||||
let invalid_addr = MemoryRegionAddress(0x900);
|
||||
assert!(matches!(
|
||||
guest_region
|
||||
.read_from(invalid_addr, &mut file_to_write_mmap_region, size_of_file)
|
||||
.read_volatile_from(invalid_addr, &mut file_to_write_mmap_region, size_of_file)
|
||||
.err()
|
||||
.unwrap(),
|
||||
GuestMemoryError::InvalidBackendAddress
|
||||
@@ -754,7 +757,7 @@ mod tests {
|
||||
let invalid_addr = MemoryRegionAddress(0x900);
|
||||
assert!(matches!(
|
||||
guest_region
|
||||
.write_to(invalid_addr, &mut file_read_from_mmap_region, size_of_file)
|
||||
.write_volatile_to(invalid_addr, &mut file_read_from_mmap_region, size_of_file)
|
||||
.err()
|
||||
.unwrap(),
|
||||
GuestMemoryError::InvalidBackendAddress
|
||||
@@ -788,14 +791,14 @@ mod tests {
|
||||
.unwrap();
|
||||
file_to_write_mmap_region.rewind().unwrap();
|
||||
guest_mmap_region
|
||||
.read_exact_from(write_addr, &mut file_to_write_mmap_region, size_of_file)
|
||||
.read_exact_volatile_from(write_addr, &mut file_to_write_mmap_region, size_of_file)
|
||||
.unwrap();
|
||||
let mut file_read_from_mmap_region = TempFile::new().unwrap().into_file();
|
||||
file_read_from_mmap_region
|
||||
.set_len(size_of_file as u64)
|
||||
.unwrap();
|
||||
guest_mmap_region
|
||||
.write_all_to(write_addr, &mut file_read_from_mmap_region, size_of_file)
|
||||
.write_all_volatile_to(write_addr, &mut file_read_from_mmap_region, size_of_file)
|
||||
.unwrap();
|
||||
file_read_from_mmap_region.rewind().unwrap();
|
||||
let mut content = String::new();
|
||||
@@ -818,14 +821,14 @@ mod tests {
|
||||
.unwrap();
|
||||
file_to_write_raw_region.rewind().unwrap();
|
||||
guest_raw_region
|
||||
.read_exact_from(write_addr, &mut file_to_write_raw_region, size_of_file)
|
||||
.read_exact_volatile_from(write_addr, &mut file_to_write_raw_region, size_of_file)
|
||||
.unwrap();
|
||||
let mut file_read_from_raw_region = TempFile::new().unwrap().into_file();
|
||||
file_read_from_raw_region
|
||||
.set_len(size_of_file as u64)
|
||||
.unwrap();
|
||||
guest_raw_region
|
||||
.write_all_to(write_addr, &mut file_read_from_raw_region, size_of_file)
|
||||
.write_all_volatile_to(write_addr, &mut file_read_from_raw_region, size_of_file)
|
||||
.unwrap();
|
||||
file_read_from_raw_region.rewind().unwrap();
|
||||
let mut content = String::new();
|
||||
@@ -842,7 +845,11 @@ mod tests {
|
||||
let invalid_addr = MemoryRegionAddress(0x900);
|
||||
assert!(matches!(
|
||||
guest_mmap_region
|
||||
.read_exact_from(invalid_addr, &mut file_to_write_mmap_region, size_of_file)
|
||||
.read_exact_volatile_from(
|
||||
invalid_addr,
|
||||
&mut file_to_write_mmap_region,
|
||||
size_of_file
|
||||
)
|
||||
.err()
|
||||
.unwrap(),
|
||||
GuestMemoryError::InvalidBackendAddress
|
||||
@@ -852,7 +859,7 @@ mod tests {
|
||||
let invalid_addr = MemoryRegionAddress(0x900);
|
||||
assert!(matches!(
|
||||
guest_mmap_region
|
||||
.write_all_to(invalid_addr, &mut file_read_from_mmap_region, size_of_file)
|
||||
.write_all_volatile_to(invalid_addr, &mut file_read_from_mmap_region, size_of_file)
|
||||
.err()
|
||||
.unwrap(),
|
||||
GuestMemoryError::InvalidBackendAddress
|
||||
@@ -862,7 +869,7 @@ mod tests {
|
||||
let invalid_addr = MemoryRegionAddress(0x900);
|
||||
assert!(matches!(
|
||||
guest_raw_region
|
||||
.read_exact_from(invalid_addr, &mut file_to_write_raw_region, size_of_file)
|
||||
.read_exact_volatile_from(invalid_addr, &mut file_to_write_raw_region, size_of_file)
|
||||
.err()
|
||||
.unwrap(),
|
||||
GuestMemoryError::InvalidBackendAddress
|
||||
@@ -872,7 +879,7 @@ mod tests {
|
||||
let invalid_addr = MemoryRegionAddress(0x900);
|
||||
assert!(matches!(
|
||||
guest_raw_region
|
||||
.write_all_to(invalid_addr, &mut file_read_from_raw_region, size_of_file)
|
||||
.write_all_volatile_to(invalid_addr, &mut file_read_from_raw_region, size_of_file)
|
||||
.err()
|
||||
.unwrap(),
|
||||
GuestMemoryError::InvalidBackendAddress
|
||||
@@ -1076,13 +1083,16 @@ mod tests {
|
||||
let guest_region = GuestMemoryHybrid::<()>::from_regions(regions);
|
||||
assert!(matches!(
|
||||
guest_region.err().unwrap(),
|
||||
Error::UnsortedMemoryRegions
|
||||
GuestRegionCollectionError::UnsortedMemoryRegions
|
||||
));
|
||||
|
||||
// Error no memory region case.
|
||||
let regions = Vec::<GuestRegionHybrid<()>>::new();
|
||||
let guest_region = GuestMemoryHybrid::<()>::from_regions(regions);
|
||||
assert!(matches!(guest_region.err().unwrap(), Error::NoMemoryRegion));
|
||||
assert!(matches!(
|
||||
guest_region.err().unwrap(),
|
||||
GuestRegionCollectionError::NoMemoryRegion
|
||||
));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
use std::io::{Read, Write};
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
use vm_memory::bitmap::{Bitmap, BS};
|
||||
@@ -9,7 +8,7 @@ use vm_memory::mmap::NewBitmap;
|
||||
use vm_memory::volatile_memory::compute_offset;
|
||||
use vm_memory::{
|
||||
guest_memory, volatile_memory, Address, AtomicAccess, Bytes, FileOffset, GuestAddress,
|
||||
GuestMemoryRegion, GuestUsize, MemoryRegionAddress, VolatileSlice,
|
||||
GuestMemoryRegion, GuestUsize, MemoryRegionAddress, ReadVolatile, VolatileSlice, WriteVolatile,
|
||||
};
|
||||
|
||||
/// Guest memory region for virtio-fs DAX window.
|
||||
@@ -73,67 +72,67 @@ impl<B: Bitmap> Bytes<MemoryRegionAddress> for GuestRegionRaw<B> {
|
||||
.map_err(Into::into)
|
||||
}
|
||||
|
||||
fn read_from<F>(
|
||||
fn read_volatile_from<F>(
|
||||
&self,
|
||||
addr: MemoryRegionAddress,
|
||||
src: &mut F,
|
||||
count: usize,
|
||||
) -> guest_memory::Result<usize>
|
||||
where
|
||||
F: Read,
|
||||
F: ReadVolatile,
|
||||
{
|
||||
let maddr = addr.raw_value() as usize;
|
||||
self.as_volatile_slice()
|
||||
.unwrap()
|
||||
.read_from::<F>(maddr, src, count)
|
||||
.read_volatile_from::<F>(maddr, src, count)
|
||||
.map_err(Into::into)
|
||||
}
|
||||
|
||||
fn read_exact_from<F>(
|
||||
fn read_exact_volatile_from<F>(
|
||||
&self,
|
||||
addr: MemoryRegionAddress,
|
||||
src: &mut F,
|
||||
count: usize,
|
||||
) -> guest_memory::Result<()>
|
||||
where
|
||||
F: Read,
|
||||
F: ReadVolatile,
|
||||
{
|
||||
let maddr = addr.raw_value() as usize;
|
||||
self.as_volatile_slice()
|
||||
.unwrap()
|
||||
.read_exact_from::<F>(maddr, src, count)
|
||||
.read_exact_volatile_from::<F>(maddr, src, count)
|
||||
.map_err(Into::into)
|
||||
}
|
||||
|
||||
fn write_to<F>(
|
||||
fn write_volatile_to<F>(
|
||||
&self,
|
||||
addr: MemoryRegionAddress,
|
||||
dst: &mut F,
|
||||
count: usize,
|
||||
) -> guest_memory::Result<usize>
|
||||
where
|
||||
F: Write,
|
||||
F: WriteVolatile,
|
||||
{
|
||||
let maddr = addr.raw_value() as usize;
|
||||
self.as_volatile_slice()
|
||||
.unwrap()
|
||||
.write_to::<F>(maddr, dst, count)
|
||||
.write_volatile_to::<F>(maddr, dst, count)
|
||||
.map_err(Into::into)
|
||||
}
|
||||
|
||||
fn write_all_to<F>(
|
||||
fn write_all_volatile_to<F>(
|
||||
&self,
|
||||
addr: MemoryRegionAddress,
|
||||
dst: &mut F,
|
||||
count: usize,
|
||||
) -> guest_memory::Result<()>
|
||||
where
|
||||
F: Write,
|
||||
F: WriteVolatile,
|
||||
{
|
||||
let maddr = addr.raw_value() as usize;
|
||||
self.as_volatile_slice()
|
||||
.unwrap()
|
||||
.write_all_to::<F>(maddr, dst, count)
|
||||
.write_all_volatile_to::<F>(maddr, dst, count)
|
||||
.map_err(Into::into)
|
||||
}
|
||||
|
||||
@@ -170,8 +169,8 @@ impl<B: Bitmap> GuestMemoryRegion for GuestRegionRaw<B> {
|
||||
self.guest_base
|
||||
}
|
||||
|
||||
fn bitmap(&self) -> &Self::B {
|
||||
&self.bitmap
|
||||
fn bitmap(&self) -> BS<'_, Self::B> {
|
||||
self.bitmap.slice_at(0)
|
||||
}
|
||||
|
||||
fn get_host_address(&self, addr: MemoryRegionAddress) -> guest_memory::Result<*mut u8> {
|
||||
@@ -186,18 +185,6 @@ impl<B: Bitmap> GuestMemoryRegion for GuestRegionRaw<B> {
|
||||
None
|
||||
}
|
||||
|
||||
unsafe fn as_slice(&self) -> Option<&[u8]> {
|
||||
// This is safe because we mapped the area at addr ourselves, so this slice will not
|
||||
// overflow. However, it is possible to alias.
|
||||
Some(std::slice::from_raw_parts(self.addr, self.size))
|
||||
}
|
||||
|
||||
unsafe fn as_mut_slice(&self) -> Option<&mut [u8]> {
|
||||
// This is safe because we mapped the area at addr ourselves, so this slice will not
|
||||
// overflow. However, it is possible to alias.
|
||||
Some(std::slice::from_raw_parts_mut(self.addr, self.size))
|
||||
}
|
||||
|
||||
fn get_slice(
|
||||
&self,
|
||||
offset: MemoryRegionAddress,
|
||||
@@ -216,6 +203,7 @@ impl<B: Bitmap> GuestMemoryRegion for GuestRegionRaw<B> {
|
||||
(self.addr as usize + offset) as *mut _,
|
||||
count,
|
||||
self.bitmap.slice_at(offset),
|
||||
None,
|
||||
)
|
||||
})
|
||||
}
|
||||
@@ -226,6 +214,27 @@ impl<B: Bitmap> GuestMemoryRegion for GuestRegionRaw<B> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<B: Bitmap> GuestRegionRaw<B> {
|
||||
/// Returns a slice corresponding to the region.
|
||||
///
|
||||
/// # Safety
|
||||
/// This is safe because we mapped the area at addr ourselves, so this slice will not
|
||||
/// overflow. However, it is possible to alias.
|
||||
pub unsafe fn as_slice(&self) -> Option<&[u8]> {
|
||||
Some(std::slice::from_raw_parts(self.addr, self.size))
|
||||
}
|
||||
|
||||
/// Returns a mutable slice corresponding to the region.
|
||||
///
|
||||
/// # Safety
|
||||
/// This is safe because we mapped the area at addr ourselves, so this slice will not
|
||||
/// overflow. However, it is possible to alias.
|
||||
#[allow(clippy::mut_from_ref)]
|
||||
pub unsafe fn as_mut_slice(&self) -> Option<&mut [u8]> {
|
||||
Some(std::slice::from_raw_parts_mut(self.addr, self.size))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
extern crate vmm_sys_util;
|
||||
@@ -348,7 +357,7 @@ mod tests {
|
||||
unsafe { GuestRegionRaw::<()>::new(GuestAddress(0x10_0000), &mut buf as *mut _, 1024) };
|
||||
|
||||
let s = m.get_slice(MemoryRegionAddress(2), 3).unwrap();
|
||||
assert_eq!(s.as_ptr(), &mut buf[2] as *mut _);
|
||||
assert_eq!(s.ptr_guard().as_ptr(), &buf[2] as *const _);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -600,7 +609,7 @@ mod tests {
|
||||
File::open(Path::new("c:\\Windows\\system32\\ntoskrnl.exe")).unwrap()
|
||||
};
|
||||
gm.write_obj(!0u32, addr).unwrap();
|
||||
gm.read_exact_from(addr, &mut file, mem::size_of::<u32>())
|
||||
gm.read_exact_volatile_from(addr, &mut file, mem::size_of::<u32>())
|
||||
.unwrap();
|
||||
let value: u32 = gm.read_obj(addr).unwrap();
|
||||
if cfg!(unix) {
|
||||
@@ -610,7 +619,7 @@ mod tests {
|
||||
}
|
||||
|
||||
let mut sink = Vec::new();
|
||||
gm.write_all_to(addr, &mut sink, mem::size_of::<u32>())
|
||||
gm.write_all_volatile_to(addr, &mut sink, mem::size_of::<u32>())
|
||||
.unwrap();
|
||||
if cfg!(unix) {
|
||||
assert_eq!(sink, vec![0; mem::size_of::<u32>()]);
|
||||
|
||||
@@ -113,20 +113,23 @@ arm64_sys_reg!(MPIDR_EL1, 3, 0, 0, 0, 5);
|
||||
/// * `mem` - Reserved DRAM for current VM.
|
||||
pub fn setup_regs(vcpu: &VcpuFd, cpu_id: u8, boot_ip: u64, fdt_address: u64) -> Result<()> {
|
||||
// Get the register index of the PSTATE (Processor State) register.
|
||||
vcpu.set_one_reg(arm64_core_reg!(pstate), PSTATE_FAULT_BITS_64 as u128)
|
||||
.map_err(Error::SetCoreRegister)?;
|
||||
vcpu.set_one_reg(
|
||||
arm64_core_reg!(pstate),
|
||||
&(PSTATE_FAULT_BITS_64 as u128).to_le_bytes(),
|
||||
)
|
||||
.map_err(Error::SetCoreRegister)?;
|
||||
|
||||
// Other vCPUs are powered off initially awaiting PSCI wakeup.
|
||||
if cpu_id == 0 {
|
||||
// Setting the PC (Processor Counter) to the current program address (kernel address).
|
||||
vcpu.set_one_reg(arm64_core_reg!(pc), boot_ip as u128)
|
||||
vcpu.set_one_reg(arm64_core_reg!(pc), &(boot_ip as u128).to_le_bytes())
|
||||
.map_err(Error::SetCoreRegister)?;
|
||||
|
||||
// Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
|
||||
// "The device tree blob (dtb) must be placed on an 8-byte boundary and must
|
||||
// not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
|
||||
// We are choosing to place it the end of DRAM. See `get_fdt_addr`.
|
||||
vcpu.set_one_reg(arm64_core_reg!(regs), fdt_address as u128)
|
||||
vcpu.set_one_reg(arm64_core_reg!(regs), &(fdt_address as u128).to_le_bytes())
|
||||
.map_err(Error::SetCoreRegister)?;
|
||||
}
|
||||
Ok(())
|
||||
@@ -157,9 +160,10 @@ pub fn is_system_register(regid: u64) -> bool {
|
||||
///
|
||||
/// * `vcpu` - Structure for the VCPU that holds the VCPU's fd.
|
||||
pub fn read_mpidr(vcpu: &VcpuFd) -> Result<u64> {
|
||||
vcpu.get_one_reg(MPIDR_EL1)
|
||||
.map(|value| value as u64)
|
||||
.map_err(Error::GetSysRegister)
|
||||
let mut reg_data = 0u128.to_le_bytes();
|
||||
vcpu.get_one_reg(MPIDR_EL1, &mut reg_data)
|
||||
.map_err(Error::GetSysRegister)?;
|
||||
Ok(u128::from_le_bytes(reg_data) as u64)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
|
||||
use libc::c_char;
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
use std::mem;
|
||||
use std::result;
|
||||
use std::slice;
|
||||
@@ -205,7 +204,7 @@ pub fn setup_mptable<M: GuestMemory>(
|
||||
return Err(Error::AddressOverflow);
|
||||
}
|
||||
|
||||
mem.read_from(base_mp, &mut io::repeat(0), mp_size)
|
||||
mem.write_slice(&vec![0u8; mp_size], base_mp)
|
||||
.map_err(|_| Error::Clear)?;
|
||||
|
||||
{
|
||||
@@ -452,23 +451,11 @@ mod tests {
|
||||
let mpc_offset = GuestAddress(u64::from(mpf_intel.0.physptr));
|
||||
let mpc_table: MpcTableWrapper = mem.read_obj(mpc_offset).unwrap();
|
||||
|
||||
struct Sum(u8);
|
||||
impl io::Write for Sum {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
for v in buf.iter() {
|
||||
self.0 = self.0.wrapping_add(*v);
|
||||
}
|
||||
Ok(buf.len())
|
||||
}
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
let mut sum = Sum(0);
|
||||
mem.write_to(mpc_offset, &mut sum, mpc_table.0.length as usize)
|
||||
let mut buf = Vec::new();
|
||||
mem.write_volatile_to(mpc_offset, &mut buf, mpc_table.0.length as usize)
|
||||
.unwrap();
|
||||
assert_eq!(sum.0, 0);
|
||||
let sum: u8 = buf.iter().fold(0u8, |acc, &v| acc.wrapping_add(v));
|
||||
assert_eq!(sum, 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -25,7 +25,7 @@ use std::collections::HashMap;
|
||||
use std::io::{Error, ErrorKind};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use kvm_bindings::{kvm_irq_routing, kvm_irq_routing_entry};
|
||||
use kvm_bindings::{kvm_irq_routing_entry, KvmIrqRouting as KvmIrqRoutingWrapper};
|
||||
use kvm_ioctls::VmFd;
|
||||
|
||||
use super::*;
|
||||
@@ -196,26 +196,18 @@ impl KvmIrqRouting {
|
||||
}
|
||||
|
||||
fn set_routing(&self, routes: &HashMap<u64, kvm_irq_routing_entry>) -> Result<()> {
|
||||
// Allocate enough buffer memory.
|
||||
let elem_sz = std::mem::size_of::<kvm_irq_routing>();
|
||||
let total_sz = std::mem::size_of::<kvm_irq_routing_entry>() * routes.len() + elem_sz;
|
||||
let elem_cnt = total_sz.div_ceil(elem_sz);
|
||||
let mut irq_routings = Vec::<kvm_irq_routing>::with_capacity(elem_cnt);
|
||||
irq_routings.resize_with(elem_cnt, Default::default);
|
||||
let mut irq_routing = KvmIrqRoutingWrapper::new(routes.len())
|
||||
.map_err(|_| Error::other("Failed to create KvmIrqRouting"))?;
|
||||
|
||||
// Prepare the irq_routing header.
|
||||
let irq_routing = &mut irq_routings[0];
|
||||
irq_routing.nr = routes.len() as u32;
|
||||
irq_routing.flags = 0;
|
||||
|
||||
// Safe because we have just allocated enough memory above.
|
||||
let irq_routing_entries = unsafe { irq_routing.entries.as_mut_slice(routes.len()) };
|
||||
for (idx, entry) in routes.values().enumerate() {
|
||||
irq_routing_entries[idx] = *entry;
|
||||
{
|
||||
let irq_routing_entries = irq_routing.as_mut_slice();
|
||||
for (idx, entry) in routes.values().enumerate() {
|
||||
irq_routing_entries[idx] = *entry;
|
||||
}
|
||||
}
|
||||
|
||||
self.vm_fd
|
||||
.set_gsi_routing(irq_routing)
|
||||
.set_gsi_routing(&irq_routing)
|
||||
.map_err(from_sys_util_errno)?;
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -11,7 +11,7 @@ use kvm_bindings::{CpuId, __IncompleteArrayField, KVMIO};
|
||||
use thiserror::Error;
|
||||
use vmm_sys_util::fam::{FamStruct, FamStructWrapper};
|
||||
use vmm_sys_util::ioctl::ioctl_with_val;
|
||||
use vmm_sys_util::{generate_fam_struct_impl, ioctl_ioc_nr, ioctl_iowr_nr};
|
||||
use vmm_sys_util::{generate_fam_struct_impl, ioctl_iowr_nr};
|
||||
|
||||
/// Tdx capability list.
|
||||
pub type TdxCaps = FamStructWrapper<TdxCapabilities>;
|
||||
|
||||
@@ -13,7 +13,7 @@ use std::os::raw::*;
|
||||
use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
|
||||
|
||||
use vmm_sys_util::ioctl::{ioctl_with_mut_ref, ioctl_with_ref, ioctl_with_val};
|
||||
use vmm_sys_util::{ioctl_ioc_nr, ioctl_iow_nr};
|
||||
use vmm_sys_util::ioctl_iow_nr;
|
||||
|
||||
use crate::net::net_gen;
|
||||
|
||||
|
||||
@@ -23,15 +23,15 @@ dbs-address-space = { workspace = true }
|
||||
dbs-boot = { workspace = true }
|
||||
epoll = ">=4.3.1, <4.3.2"
|
||||
io-uring = "0.5.2"
|
||||
fuse-backend-rs = { version = "0.10.5", optional = true }
|
||||
fuse-backend-rs = { version = "0.14.0", optional = true }
|
||||
kvm-bindings = { workspace = true }
|
||||
kvm-ioctls = { workspace = true }
|
||||
libc = "0.2.119"
|
||||
log = "0.4.14"
|
||||
nix = "0.24.3"
|
||||
nydus-api = "0.3.1"
|
||||
nydus-rafs = "0.3.2"
|
||||
nydus-storage = "0.6.4"
|
||||
nydus-api = "0.4.1"
|
||||
nydus-rafs = "0.4.1"
|
||||
nydus-storage = "0.7.2"
|
||||
rlimit = "0.7.0"
|
||||
serde = "1.0.27"
|
||||
serde_json = "1.0.9"
|
||||
@@ -42,8 +42,9 @@ virtio-queue = { workspace = true }
|
||||
vmm-sys-util = { workspace = true }
|
||||
vm-memory = { workspace = true, features = ["backend-mmap"] }
|
||||
sendfd = "0.4.3"
|
||||
vhost-rs = { version = "0.6.1", package = "vhost", optional = true }
|
||||
vhost-rs = { version = "0.15.0", package = "vhost", optional = true }
|
||||
timerfd = "1.0"
|
||||
kata-sys-util = { workspace = true}
|
||||
|
||||
[dev-dependencies]
|
||||
vm-memory = { workspace = true, features = ["backend-mmap", "backend-atomic"] }
|
||||
@@ -63,7 +64,7 @@ virtio-fs-pro = [
|
||||
]
|
||||
virtio-mem = ["virtio-mmio"]
|
||||
virtio-balloon = ["virtio-mmio"]
|
||||
vhost = ["virtio-mmio", "vhost-rs/vhost-user-master", "vhost-rs/vhost-kern"]
|
||||
vhost = ["virtio-mmio", "vhost-rs/vhost-user-frontend", "vhost-rs/vhost-kern"]
|
||||
vhost-net = ["vhost", "vhost-rs/vhost-net"]
|
||||
vhost-user = ["vhost"]
|
||||
vhost-user-fs = ["vhost-user"]
|
||||
|
||||
@@ -34,7 +34,7 @@ use dbs_utils::epoll_manager::{
|
||||
use dbs_utils::metric::{IncMetric, SharedIncMetric, SharedStoreMetric, StoreMetric};
|
||||
use log::{debug, error, info, trace};
|
||||
use serde::Serialize;
|
||||
use virtio_bindings::bindings::virtio_blk::VIRTIO_F_VERSION_1;
|
||||
use virtio_bindings::bindings::virtio_config::VIRTIO_F_VERSION_1;
|
||||
use virtio_queue::{QueueOwnedT, QueueSync, QueueT};
|
||||
use vm_memory::{
|
||||
ByteValued, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryRegion,
|
||||
|
||||
@@ -20,6 +20,7 @@ use dbs_utils::{
|
||||
};
|
||||
use log::{debug, error, info, warn};
|
||||
use virtio_bindings::bindings::virtio_blk::*;
|
||||
use virtio_bindings::bindings::virtio_config::VIRTIO_F_VERSION_1;
|
||||
use virtio_queue::QueueT;
|
||||
use vm_memory::GuestMemoryRegion;
|
||||
use vmm_sys_util::eventfd::{EventFd, EFD_NONBLOCK};
|
||||
|
||||
@@ -2,13 +2,13 @@
|
||||
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
use std::io::{self, Seek, SeekFrom, Write};
|
||||
use std::io::{self, Read, Seek, SeekFrom, Write};
|
||||
use std::ops::Deref;
|
||||
use std::result;
|
||||
|
||||
use log::error;
|
||||
use virtio_bindings::bindings::virtio_blk::*;
|
||||
use virtio_queue::{Descriptor, DescriptorChain};
|
||||
use virtio_queue::{desc::split::Descriptor, DescriptorChain};
|
||||
use vm_memory::{ByteValued, Bytes, GuestAddress, GuestMemory, GuestMemoryError};
|
||||
|
||||
use crate::{
|
||||
@@ -231,13 +231,19 @@ impl Request {
|
||||
for io in data_descs {
|
||||
match self.request_type {
|
||||
RequestType::In => {
|
||||
mem.read_from(GuestAddress(io.data_addr), disk, io.data_len)
|
||||
let mut buf = vec![0u8; io.data_len];
|
||||
disk.read_exact(&mut buf)
|
||||
.map_err(|e| ExecuteError::Read(GuestMemoryError::IOError(e)))?;
|
||||
mem.write_slice(&buf, GuestAddress(io.data_addr))
|
||||
.map_err(ExecuteError::Read)?;
|
||||
len += io.data_len;
|
||||
}
|
||||
RequestType::Out => {
|
||||
mem.write_to(GuestAddress(io.data_addr), disk, io.data_len)
|
||||
let mut buf = vec![0u8; io.data_len];
|
||||
mem.read_slice(&mut buf, GuestAddress(io.data_addr))
|
||||
.map_err(ExecuteError::Write)?;
|
||||
disk.write_all(&buf)
|
||||
.map_err(|e| ExecuteError::Write(GuestMemoryError::IOError(e)))?;
|
||||
}
|
||||
RequestType::Flush => match disk.flush() {
|
||||
Ok(_) => {}
|
||||
|
||||
@@ -2,10 +2,10 @@
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
|
||||
|
||||
use kata_sys_util::netns::NetnsGuard;
|
||||
use std::any::Any;
|
||||
use std::collections::HashMap;
|
||||
use std::ffi::CString;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader, Read};
|
||||
use std::marker::PhantomData;
|
||||
@@ -30,7 +30,7 @@ use nydus_api::ConfigV2;
|
||||
use nydus_rafs::blobfs::{BlobFs, Config as BlobfsConfig};
|
||||
use nydus_rafs::{fs::Rafs, RafsIoRead};
|
||||
use rlimit::Resource;
|
||||
use virtio_bindings::bindings::virtio_blk::VIRTIO_F_VERSION_1;
|
||||
use virtio_bindings::bindings::virtio_config::VIRTIO_F_VERSION_1;
|
||||
use virtio_queue::QueueT;
|
||||
use vm_memory::{
|
||||
FileOffset, GuestAddress, GuestAddressSpace, GuestRegionMmap, GuestUsize, MmapRegion,
|
||||
@@ -234,6 +234,7 @@ impl<AS: GuestAddressSpace> VirtioFs<AS> {
|
||||
CachePolicy::Always => Duration::from_secs(CACHE_ALWAYS_TIMEOUT),
|
||||
CachePolicy::Never => Duration::from_secs(CACHE_NONE_TIMEOUT),
|
||||
CachePolicy::Auto => Duration::from_secs(CACHE_AUTO_TIMEOUT),
|
||||
CachePolicy::Metadata => Duration::from_secs(CACHE_AUTO_TIMEOUT),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -454,16 +455,10 @@ impl<AS: GuestAddressSpace> VirtioFs<AS> {
|
||||
prefetch_list_path: Option<String>,
|
||||
) -> FsResult<()> {
|
||||
debug!("http_server rafs");
|
||||
let currentnetns = fs::read_link("/proc/self/ns/net").unwrap_or_default();
|
||||
info!("========fupan====1==netns={:?}", currentnetns);
|
||||
|
||||
let tid = unsafe { libc::syscall(libc::SYS_gettid) as i32 };
|
||||
|
||||
let netnspath = format!("/proc/{}/ns/net", tid);
|
||||
let netns = fs::read_link(netnspath.as_str()).unwrap_or_default();
|
||||
info!("========fupan====2==netns={:?}", netns);
|
||||
|
||||
info!("========fupan====3==config={:?}", config);
|
||||
// We need to make sure the nydus worker thread in the runD main process's network namespace
|
||||
// instead of the vmm thread's netns, which wouldn't access the host network.
|
||||
let _netns_guard =
|
||||
NetnsGuard::new("/proc/self/ns/net").map_err(|e| FsError::BackendFs(e.to_string()))?;
|
||||
|
||||
let file = Path::new(&source);
|
||||
let (mut rafs, rafs_cfg) = match config.as_ref() {
|
||||
@@ -553,7 +548,7 @@ impl<AS: GuestAddressSpace> VirtioFs<AS> {
|
||||
)));
|
||||
}
|
||||
};
|
||||
let any_fs = rootfs.deref().as_any();
|
||||
let any_fs = rootfs.0.deref().as_any();
|
||||
if let Some(fs_swap) = any_fs.downcast_ref::<Rafs>() {
|
||||
let mut file = <dyn RafsIoRead>::from_file(&source)
|
||||
.map_err(|e| FsError::BackendFs(format!("RafsIoRead failed: {e:?}")))?;
|
||||
@@ -623,8 +618,7 @@ impl<AS: GuestAddressSpace> VirtioFs<AS> {
|
||||
};
|
||||
|
||||
let region = Arc::new(
|
||||
GuestRegionMmap::new(mmap_region, GuestAddress(guest_addr))
|
||||
.map_err(Error::InsertMmap)?,
|
||||
GuestRegionMmap::new(mmap_region, GuestAddress(guest_addr)).ok_or(Error::InsertMmap)?,
|
||||
);
|
||||
self.handler.insert_region(region.clone())?;
|
||||
|
||||
|
||||
@@ -245,8 +245,8 @@ pub enum Error {
|
||||
#[error("set user memory region failed: {0}")]
|
||||
SetUserMemoryRegion(kvm_ioctls::Error),
|
||||
/// Inserting mmap region failed.
|
||||
#[error("inserting mmap region failed: {0}")]
|
||||
InsertMmap(vm_memory::mmap::Error),
|
||||
#[error("inserting mmap region failed")]
|
||||
InsertMmap,
|
||||
/// Failed to set madvise on guest memory region.
|
||||
#[error("failed to set madvice() on guest memory region")]
|
||||
Madvise(#[source] nix::Error),
|
||||
|
||||
@@ -30,7 +30,7 @@ use dbs_utils::epoll_manager::{
|
||||
};
|
||||
use kvm_ioctls::VmFd;
|
||||
use log::{debug, error, info, trace, warn};
|
||||
use virtio_bindings::bindings::virtio_blk::VIRTIO_F_VERSION_1;
|
||||
use virtio_bindings::bindings::virtio_config::VIRTIO_F_VERSION_1;
|
||||
use virtio_queue::{DescriptorChain, QueueOwnedT, QueueSync, QueueT};
|
||||
use vm_memory::{
|
||||
ByteValued, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryError,
|
||||
@@ -1389,7 +1389,7 @@ pub(crate) mod tests {
|
||||
.map_err(Error::NewMmapRegion)?;
|
||||
|
||||
let region =
|
||||
Arc::new(GuestRegionMmap::new(mmap_region, guest_addr).map_err(Error::InsertMmap)?);
|
||||
Arc::new(GuestRegionMmap::new(mmap_region, guest_addr).ok_or(Error::InsertMmap)?);
|
||||
|
||||
Ok(region)
|
||||
}
|
||||
|
||||
@@ -22,6 +22,7 @@ use dbs_utils::net::{net_gen, MacAddr, Tap};
|
||||
use dbs_utils::rate_limiter::{BucketUpdate, RateLimiter, TokenType};
|
||||
use libc;
|
||||
use log::{debug, error, info, trace, warn};
|
||||
use virtio_bindings::bindings::virtio_config::VIRTIO_F_VERSION_1;
|
||||
use virtio_bindings::bindings::virtio_net::*;
|
||||
use virtio_queue::{QueueOwnedT, QueueSync, QueueT};
|
||||
use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryRegion, GuestRegionMmap};
|
||||
|
||||
@@ -6,7 +6,7 @@ use log::{debug, error, warn};
|
||||
use virtio_bindings::bindings::virtio_net::{
|
||||
virtio_net_ctrl_hdr, virtio_net_ctrl_mq, VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET,
|
||||
};
|
||||
use virtio_queue::{Descriptor, DescriptorChain};
|
||||
use virtio_queue::{desc::split::Descriptor, DescriptorChain};
|
||||
use vm_memory::{Bytes, GuestMemory};
|
||||
|
||||
use crate::{DbsGuestAddressSpace, Error as VirtioError, Result as VirtioResult};
|
||||
|
||||
@@ -26,6 +26,7 @@ use vhost_rs::vhost_user::message::VhostUserVringAddrFlags;
|
||||
#[cfg(not(test))]
|
||||
use vhost_rs::VhostBackend;
|
||||
use vhost_rs::{VhostUserMemoryRegionInfo, VringConfigData};
|
||||
use virtio_bindings::bindings::virtio_config::{VIRTIO_F_NOTIFY_ON_EMPTY, VIRTIO_F_VERSION_1};
|
||||
use virtio_bindings::bindings::virtio_net::*;
|
||||
use virtio_bindings::bindings::virtio_ring::*;
|
||||
use virtio_queue::{DescriptorChain, QueueT};
|
||||
|
||||
@@ -25,7 +25,7 @@ use vhost_rs::vhost_user::message::{
|
||||
VhostUserConfigFlags, VhostUserProtocolFeatures, VhostUserVirtioFeatures,
|
||||
VHOST_USER_CONFIG_OFFSET,
|
||||
};
|
||||
use vhost_rs::vhost_user::{Master, VhostUserMaster};
|
||||
use vhost_rs::vhost_user::{Frontend, VhostUserFrontend};
|
||||
use vhost_rs::{Error as VhostError, VhostBackend};
|
||||
use virtio_bindings::bindings::virtio_blk::{VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_SEG_MAX};
|
||||
use virtio_queue::QueueT;
|
||||
@@ -231,7 +231,7 @@ impl VhostUserBlockDevice {
|
||||
|
||||
info!("vhost-user-blk: try to connect to {vhost_socket:?}");
|
||||
// Connect to the vhost-user socket.
|
||||
let mut master = Master::connect(&vhost_socket, 1).map_err(VirtIoError::VhostError)?;
|
||||
let mut master = Frontend::connect(&vhost_socket, 1).map_err(VirtIoError::VhostError)?;
|
||||
|
||||
info!("vhost-user-blk: get features");
|
||||
let avail_features = master.get_features().map_err(VirtIoError::VhostError)?;
|
||||
@@ -290,11 +290,11 @@ impl VhostUserBlockDevice {
|
||||
})
|
||||
}
|
||||
|
||||
fn reconnect_to_server(&mut self) -> VirtIoResult<Master> {
|
||||
fn reconnect_to_server(&mut self) -> VirtIoResult<Frontend> {
|
||||
if !Path::new(self.vhost_socket.as_str()).exists() {
|
||||
return Err(VirtIoError::InternalError);
|
||||
}
|
||||
let master = Master::connect(&self.vhost_socket, 1).map_err(VirtIoError::VhostError)?;
|
||||
let master = Frontend::connect(&self.vhost_socket, 1).map_err(VirtIoError::VhostError)?;
|
||||
|
||||
Ok(master)
|
||||
}
|
||||
@@ -360,7 +360,7 @@ impl VhostUserBlockDevice {
|
||||
if !Path::new(self.vhost_socket.as_str()).exists() {
|
||||
return Err(ActivateError::InternalError);
|
||||
}
|
||||
let master = Master::connect(String::from(self.vhost_socket.as_str()), 1)
|
||||
let master = Frontend::connect(String::from(self.vhost_socket.as_str()), 1)
|
||||
.map_err(VirtIoError::VhostError)?;
|
||||
|
||||
self.endpoint.set_master(master);
|
||||
@@ -388,7 +388,7 @@ impl VhostUserBlockDevice {
|
||||
R: GuestMemoryRegion + Send + Sync + 'static,
|
||||
>(
|
||||
&mut self,
|
||||
master: Master,
|
||||
master: Frontend,
|
||||
config: EndpointParam<AS, Q, R>,
|
||||
ops: &mut EventOps,
|
||||
) -> std::result::Result<(), VirtIoError> {
|
||||
|
||||
@@ -10,10 +10,10 @@ use dbs_utils::epoll_manager::{EventOps, EventSet, Events};
|
||||
use log::*;
|
||||
use vhost_rs::vhost_user::message::{VhostUserProtocolFeatures, VhostUserVringAddrFlags};
|
||||
use vhost_rs::vhost_user::{
|
||||
Error as VhostUserError, Listener as VhostUserListener, Master, VhostUserMaster,
|
||||
Error as VhostUserError, Frontend, Listener as VhostUserListener, VhostUserFrontend,
|
||||
};
|
||||
use vhost_rs::{Error as VhostError, VhostBackend, VhostUserMemoryRegionInfo, VringConfigData};
|
||||
use virtio_bindings::bindings::virtio_net::VIRTIO_F_RING_PACKED;
|
||||
use virtio_bindings::bindings::virtio_config::VIRTIO_F_RING_PACKED;
|
||||
use virtio_queue::QueueT;
|
||||
use vm_memory::{
|
||||
Address, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryRegion, MemoryRegionAddress,
|
||||
@@ -50,7 +50,7 @@ impl Listener {
|
||||
}
|
||||
|
||||
// Wait for an incoming connection until success.
|
||||
pub fn accept(&self) -> VirtioResult<(Master, u64)> {
|
||||
pub fn accept(&self) -> VirtioResult<(Frontend, u64)> {
|
||||
loop {
|
||||
match self.try_accept() {
|
||||
Ok(Some((master, mut feature))) => {
|
||||
@@ -65,14 +65,14 @@ impl Listener {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn try_accept(&self) -> VirtioResult<Option<(Master, u64)>> {
|
||||
pub fn try_accept(&self) -> VirtioResult<Option<(Frontend, u64)>> {
|
||||
let sock = match self.listener.accept() {
|
||||
Ok(Some(conn)) => conn,
|
||||
Ok(None) => return Ok(None),
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
|
||||
let mut master = Master::from_stream(sock, 1);
|
||||
let mut master = Frontend::from_stream(sock, 1);
|
||||
info!("{}: try to get virtio features from slave.", self.name);
|
||||
match Endpoint::initialize(&mut master) {
|
||||
Ok(Some(features)) => Ok(Some((master, features))),
|
||||
@@ -159,8 +159,8 @@ impl<AS: GuestAddressSpace, Q: QueueT, R: GuestMemoryRegion> EndpointParam<'_, A
|
||||
/// Caller needs to ensure mutual exclusive access to the object.
|
||||
pub(super) struct Endpoint {
|
||||
/// Underlying vhost-user communication endpoint.
|
||||
conn: Option<Master>,
|
||||
old: Option<Master>,
|
||||
conn: Option<Frontend>,
|
||||
old: Option<Frontend>,
|
||||
/// Token to register epoll event for the underlying socket.
|
||||
slot: u32,
|
||||
/// Identifier string for logs.
|
||||
@@ -168,7 +168,7 @@ pub(super) struct Endpoint {
|
||||
}
|
||||
|
||||
impl Endpoint {
|
||||
pub fn new(master: Master, slot: u32, name: String) -> Self {
|
||||
pub fn new(master: Frontend, slot: u32, name: String) -> Self {
|
||||
Endpoint {
|
||||
conn: Some(master),
|
||||
old: None,
|
||||
@@ -186,7 +186,7 @@ impl Endpoint {
|
||||
/// * - Ok(Some(avial_features)): virtio features from the slave
|
||||
/// * - Ok(None): underlying communicaiton channel gets broken during negotiation
|
||||
/// * - Err(e): error conditions
|
||||
fn initialize(master: &mut Master) -> VirtioResult<Option<u64>> {
|
||||
fn initialize(master: &mut Frontend) -> VirtioResult<Option<u64>> {
|
||||
// 1. Seems that some vhost-user slaves depend on the get_features request to driver its
|
||||
// internal state machine.
|
||||
// N.B. it's really TDD, we just found it works in this way. Any spec about this?
|
||||
@@ -242,7 +242,7 @@ impl Endpoint {
|
||||
pub fn negotiate<AS: GuestAddressSpace, Q: QueueT, R: GuestMemoryRegion>(
|
||||
&mut self,
|
||||
config: &EndpointParam<AS, Q, R>,
|
||||
mut old: Option<&mut Master>,
|
||||
mut old: Option<&mut Frontend>,
|
||||
) -> VirtioResult<()> {
|
||||
let guard = config.virtio_config.lock_guest_memory();
|
||||
let mem = guard.deref();
|
||||
@@ -286,19 +286,19 @@ impl Endpoint {
|
||||
);
|
||||
|
||||
// Setup slave channel if SLAVE_REQ protocol feature is set
|
||||
if protocol_features.contains(VhostUserProtocolFeatures::SLAVE_REQ) {
|
||||
if protocol_features.contains(VhostUserProtocolFeatures::BACKEND_REQ) {
|
||||
match config.slave_req_fd {
|
||||
Some(fd) => master.set_slave_request_fd(&fd)?,
|
||||
Some(fd) => master.set_backend_request_fd(&fd)?,
|
||||
None => {
|
||||
error!(
|
||||
"{}: Protocol feature SLAVE_REQ is set but not slave channel fd",
|
||||
"{}: Protocol feature BACKEND_REQ is set but not slave channel fd",
|
||||
self.name
|
||||
);
|
||||
return Err(VhostError::VhostUserProtocol(VhostUserError::InvalidParam).into());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
info!("{}: has no SLAVE_REQ protocol feature set", self.name);
|
||||
info!("{}: has no BACKEND_REQ protocol feature set", self.name);
|
||||
}
|
||||
|
||||
// 6. check number of queues supported
|
||||
@@ -454,7 +454,7 @@ impl Endpoint {
|
||||
/// Restore communication with the vhost-user slave on reconnect.
|
||||
pub fn reconnect<AS: GuestAddressSpace, Q: QueueT, R: GuestMemoryRegion>(
|
||||
&mut self,
|
||||
master: Master,
|
||||
master: Frontend,
|
||||
config: &EndpointParam<AS, Q, R>,
|
||||
ops: &mut EventOps,
|
||||
) -> VirtioResult<()> {
|
||||
@@ -515,7 +515,11 @@ impl Endpoint {
|
||||
}
|
||||
|
||||
/// Deregister the underlying socket from the epoll controller.
|
||||
pub fn deregister_epoll_event(&self, master: &Master, ops: &mut EventOps) -> VirtioResult<()> {
|
||||
pub fn deregister_epoll_event(
|
||||
&self,
|
||||
master: &Frontend,
|
||||
ops: &mut EventOps,
|
||||
) -> VirtioResult<()> {
|
||||
info!(
|
||||
"{}: unregister epoll event for fd {}.",
|
||||
self.name,
|
||||
@@ -529,7 +533,7 @@ impl Endpoint {
|
||||
.map_err(VirtioError::EpollMgr)
|
||||
}
|
||||
|
||||
pub fn set_master(&mut self, master: Master) {
|
||||
pub fn set_master(&mut self, master: Frontend) {
|
||||
self.conn = Some(master);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,10 +3,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
use std::any::Any;
|
||||
use std::io;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::Deref;
|
||||
use std::os::fd::AsRawFd;
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
|
||||
use dbs_device::resources::{DeviceResources, ResourceConstraint};
|
||||
@@ -15,18 +12,15 @@ use dbs_utils::epoll_manager::{
|
||||
};
|
||||
use kvm_bindings::kvm_userspace_memory_region;
|
||||
use kvm_ioctls::VmFd;
|
||||
use libc::{c_void, off64_t, pread64, pwrite64};
|
||||
use log::*;
|
||||
use vhost_rs::vhost_user::message::{
|
||||
VhostUserFSSlaveMsg, VhostUserFSSlaveMsgFlags, VhostUserProtocolFeatures,
|
||||
VhostUserVirtioFeatures, VHOST_USER_FS_SLAVE_ENTRIES,
|
||||
use vhost_rs::vhost_user::message::{VhostUserProtocolFeatures, VhostUserVirtioFeatures};
|
||||
use vhost_rs::vhost_user::{
|
||||
Frontend, FrontendReqHandler, HandlerResult, VhostUserFrontendReqHandler,
|
||||
};
|
||||
use vhost_rs::vhost_user::{HandlerResult, Master, MasterReqHandler, VhostUserMasterReqHandler};
|
||||
use vhost_rs::VhostBackend;
|
||||
use virtio_queue::QueueT;
|
||||
use vm_memory::{
|
||||
GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryRegion, GuestRegionMmap, GuestUsize,
|
||||
MmapRegion,
|
||||
GuestAddress, GuestAddressSpace, GuestMemoryRegion, GuestRegionMmap, GuestUsize, MmapRegion,
|
||||
};
|
||||
|
||||
use crate::ConfigResult;
|
||||
@@ -50,6 +44,7 @@ const NUM_QUEUE_OFFSET: usize = 1;
|
||||
const MASTER_SLOT: u32 = 0;
|
||||
const SLAVE_REQ_SLOT: u32 = 1;
|
||||
|
||||
#[allow(dead_code)]
|
||||
struct SlaveReqHandler<AS: GuestAddressSpace> {
|
||||
/// the address of memory region allocated for virtiofs
|
||||
cache_offset: u64,
|
||||
@@ -69,6 +64,7 @@ struct SlaveReqHandler<AS: GuestAddressSpace> {
|
||||
|
||||
impl<AS: GuestAddressSpace> SlaveReqHandler<AS> {
|
||||
// Make sure request is within cache range
|
||||
#[allow(dead_code)]
|
||||
fn is_req_valid(&self, offset: u64, len: u64) -> bool {
|
||||
// TODO: do we need to validate alignment here?
|
||||
match offset.checked_add(len) {
|
||||
@@ -78,274 +74,24 @@ impl<AS: GuestAddressSpace> SlaveReqHandler<AS> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<AS: GuestAddressSpace> VhostUserMasterReqHandler for SlaveReqHandler<AS> {
|
||||
impl<AS: GuestAddressSpace> VhostUserFrontendReqHandler for SlaveReqHandler<AS> {
|
||||
fn handle_config_change(&self) -> HandlerResult<u64> {
|
||||
trace!(target: "vhost-fs", "{}: SlaveReqHandler::handle_config_change()", self.id);
|
||||
debug!("{}: unhandle device_config_change event", self.id);
|
||||
|
||||
Ok(0)
|
||||
}
|
||||
|
||||
fn fs_slave_map(&self, fs: &VhostUserFSSlaveMsg, fd: &dyn AsRawFd) -> HandlerResult<u64> {
|
||||
trace!(target: "vhost-fs", "{}: SlaveReqHandler::fs_slave_map()", self.id);
|
||||
|
||||
for i in 0..VHOST_USER_FS_SLAVE_ENTRIES {
|
||||
let offset = fs.cache_offset[i];
|
||||
let len = fs.len[i];
|
||||
|
||||
// Ignore if the length is 0.
|
||||
if len == 0 {
|
||||
continue;
|
||||
}
|
||||
|
||||
debug!(
|
||||
"{}: fs_slave_map: offset={:x} len={:x} cache_size={:x}",
|
||||
self.id, offset, len, self.cache_size
|
||||
);
|
||||
|
||||
if !self.is_req_valid(offset, len) {
|
||||
debug!(
|
||||
"{}: fs_slave_map: Wrong offset or length, offset={:x} len={:x} cache_size={:x}",
|
||||
self.id, offset, len, self.cache_size
|
||||
);
|
||||
return Err(std::io::Error::from_raw_os_error(libc::EINVAL));
|
||||
}
|
||||
|
||||
let addr = self.mmap_cache_addr + offset;
|
||||
let flags = fs.flags[i];
|
||||
let ret = unsafe {
|
||||
libc::mmap(
|
||||
addr as *mut libc::c_void,
|
||||
len as usize,
|
||||
flags.bits() as i32,
|
||||
libc::MAP_SHARED | libc::MAP_FIXED,
|
||||
fd.as_raw_fd(),
|
||||
fs.fd_offset[i] as libc::off_t,
|
||||
)
|
||||
};
|
||||
if ret == libc::MAP_FAILED {
|
||||
let e = std::io::Error::last_os_error();
|
||||
error!("{}: fs_slave_map: mmap failed, {}", self.id, e);
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
let ret = unsafe { libc::close(fd.as_raw_fd()) };
|
||||
if ret == -1 {
|
||||
let e = std::io::Error::last_os_error();
|
||||
error!("{}: fs_slave_map: close failed, {}", self.id, e);
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(0)
|
||||
}
|
||||
|
||||
fn fs_slave_unmap(&self, fs: &VhostUserFSSlaveMsg) -> HandlerResult<u64> {
|
||||
trace!(target: "vhost-fs", "{}: SlaveReqHandler::fs_slave_map()", self.id);
|
||||
|
||||
for i in 0..VHOST_USER_FS_SLAVE_ENTRIES {
|
||||
let offset = fs.cache_offset[i];
|
||||
let mut len = fs.len[i];
|
||||
|
||||
// Ignore if the length is 0.
|
||||
if len == 0 {
|
||||
continue;
|
||||
}
|
||||
|
||||
debug!(
|
||||
"{}: fs_slave_unmap: offset={:x} len={:x} cache_size={:x}",
|
||||
self.id, offset, len, self.cache_size
|
||||
);
|
||||
|
||||
// Need to handle a special case where the slave ask for the unmapping
|
||||
// of the entire mapping.
|
||||
if len == 0xffff_ffff_ffff_ffff {
|
||||
len = self.cache_size;
|
||||
}
|
||||
|
||||
if !self.is_req_valid(offset, len) {
|
||||
error!(
|
||||
"{}: fs_slave_map: Wrong offset or length, offset={:x} len={:x} cache_size={:x}",
|
||||
self.id, offset, len, self.cache_size
|
||||
);
|
||||
return Err(std::io::Error::from_raw_os_error(libc::EINVAL));
|
||||
}
|
||||
|
||||
let addr = self.mmap_cache_addr + offset;
|
||||
#[allow(clippy::unnecessary_cast)]
|
||||
let ret = unsafe {
|
||||
libc::mmap(
|
||||
addr as *mut libc::c_void,
|
||||
len as usize,
|
||||
libc::PROT_NONE,
|
||||
libc::MAP_ANONYMOUS | libc::MAP_PRIVATE | libc::MAP_FIXED,
|
||||
-1,
|
||||
0 as libc::off_t,
|
||||
)
|
||||
};
|
||||
if ret == libc::MAP_FAILED {
|
||||
let e = std::io::Error::last_os_error();
|
||||
error!("{}: fs_slave_map: mmap failed, {}", self.id, e);
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(0)
|
||||
}
|
||||
|
||||
fn fs_slave_sync(&self, fs: &VhostUserFSSlaveMsg) -> HandlerResult<u64> {
|
||||
trace!(target: "vhost-fs", "{}: SlaveReqHandler::fs_slave_sync()", self.id);
|
||||
|
||||
for i in 0..VHOST_USER_FS_SLAVE_ENTRIES {
|
||||
let offset = fs.cache_offset[i];
|
||||
let len = fs.len[i];
|
||||
|
||||
// Ignore if the length is 0.
|
||||
if len == 0 {
|
||||
continue;
|
||||
}
|
||||
|
||||
debug!(
|
||||
"{}: fs_slave_sync: offset={:x} len={:x} cache_size={:x}",
|
||||
self.id, offset, len, self.cache_size
|
||||
);
|
||||
|
||||
if !self.is_req_valid(offset, len) {
|
||||
error!(
|
||||
"{}: fs_slave_map: Wrong offset or length, offset={:x} len={:x} cache_size={:x}",
|
||||
self.id, offset, len, self.cache_size
|
||||
);
|
||||
return Err(std::io::Error::from_raw_os_error(libc::EINVAL));
|
||||
}
|
||||
|
||||
let addr = self.mmap_cache_addr + offset;
|
||||
let ret =
|
||||
unsafe { libc::msync(addr as *mut libc::c_void, len as usize, libc::MS_SYNC) };
|
||||
if ret == -1 {
|
||||
let e = std::io::Error::last_os_error();
|
||||
error!("{}: fs_slave_sync: msync failed, {}", self.id, e);
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(0)
|
||||
}
|
||||
|
||||
fn fs_slave_io(&self, fs: &VhostUserFSSlaveMsg, fd: &dyn AsRawFd) -> HandlerResult<u64> {
|
||||
trace!(target: "vhost-fs", "{}: SlaveReqHandler::fs_slave_io()", self.id);
|
||||
|
||||
let guard = self.mem.memory();
|
||||
let mem = guard.deref();
|
||||
let mut done: u64 = 0;
|
||||
for i in 0..VHOST_USER_FS_SLAVE_ENTRIES {
|
||||
// Ignore if the length is 0.
|
||||
if fs.len[i] == 0 {
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut foffset = fs.fd_offset[i];
|
||||
let mut len = fs.len[i] as usize;
|
||||
let gpa = fs.cache_offset[i];
|
||||
let cache_end = self.cache_offset + self.cache_size;
|
||||
let efault = libc::EFAULT;
|
||||
|
||||
debug!(
|
||||
"{}: fs_slave_io: gpa={:x} len={:x} foffset={:x} cache_offset={:x} cache_size={:x}",
|
||||
self.id, gpa, len, foffset, self.cache_offset, self.cache_size
|
||||
);
|
||||
|
||||
let mut ptr = if gpa >= self.cache_offset && gpa < cache_end {
|
||||
let offset = gpa
|
||||
.checked_sub(self.cache_offset)
|
||||
.ok_or_else(|| io::Error::from_raw_os_error(efault))?;
|
||||
let end = gpa
|
||||
.checked_add(fs.len[i])
|
||||
.ok_or_else(|| io::Error::from_raw_os_error(efault))?;
|
||||
|
||||
if end >= cache_end {
|
||||
error!( "{}: fs_slave_io: Wrong gpa or len (gpa={:x} len={:x} cache_offset={:x}, cache_size={:x})", self.id, gpa, len, self.cache_offset, self.cache_size );
|
||||
return Err(io::Error::from_raw_os_error(efault));
|
||||
}
|
||||
self.mmap_cache_addr + offset
|
||||
} else {
|
||||
// gpa is a RAM addr.
|
||||
mem.get_host_address(GuestAddress(gpa))
|
||||
.map_err(|e| {
|
||||
error!(
|
||||
"{}: fs_slave_io: Failed to find RAM region associated with gpa 0x{:x}: {:?}",
|
||||
self.id, gpa, e
|
||||
);
|
||||
io::Error::from_raw_os_error(efault)
|
||||
})? as u64
|
||||
};
|
||||
|
||||
while len > 0 {
|
||||
let ret = if (fs.flags[i] & VhostUserFSSlaveMsgFlags::MAP_W)
|
||||
== VhostUserFSSlaveMsgFlags::MAP_W
|
||||
{
|
||||
debug!("{}: write: foffset={:x}, len={:x}", self.id, foffset, len);
|
||||
unsafe {
|
||||
pwrite64(
|
||||
fd.as_raw_fd(),
|
||||
ptr as *const c_void,
|
||||
len,
|
||||
foffset as off64_t,
|
||||
)
|
||||
}
|
||||
} else {
|
||||
debug!("{}: read: foffset={:x}, len={:x}", self.id, foffset, len);
|
||||
unsafe { pread64(fd.as_raw_fd(), ptr as *mut c_void, len, foffset as off64_t) }
|
||||
};
|
||||
|
||||
if ret < 0 {
|
||||
let e = std::io::Error::last_os_error();
|
||||
if (fs.flags[i] & VhostUserFSSlaveMsgFlags::MAP_W)
|
||||
== VhostUserFSSlaveMsgFlags::MAP_W
|
||||
{
|
||||
error!("{}: fs_slave_io: pwrite failed, {}", self.id, e);
|
||||
} else {
|
||||
error!("{}: fs_slave_io: pread failed, {}", self.id, e);
|
||||
}
|
||||
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
if ret == 0 {
|
||||
// EOF
|
||||
let e = io::Error::new(
|
||||
io::ErrorKind::UnexpectedEof,
|
||||
"failed to access whole buffer",
|
||||
);
|
||||
error!("{}: fs_slave_io: IO error, {}", self.id, e);
|
||||
return Err(e);
|
||||
}
|
||||
len -= ret as usize;
|
||||
foffset += ret as u64;
|
||||
ptr += ret as u64;
|
||||
done += ret as u64;
|
||||
}
|
||||
|
||||
let ret = unsafe { libc::close(fd.as_raw_fd()) };
|
||||
if ret == -1 {
|
||||
let e = std::io::Error::last_os_error();
|
||||
error!("{}: fs_slave_io: close failed, {}", self.id, e);
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(done)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct VhostUserFsHandler<
|
||||
AS: GuestAddressSpace,
|
||||
Q: QueueT,
|
||||
R: GuestMemoryRegion,
|
||||
S: VhostUserMasterReqHandler,
|
||||
S: VhostUserFrontendReqHandler,
|
||||
> {
|
||||
config: VirtioDeviceConfig<AS, Q, R>,
|
||||
device: Arc<Mutex<VhostUserFsDevice>>,
|
||||
slave_req_handler: Option<MasterReqHandler<S>>,
|
||||
slave_req_handler: Option<FrontendReqHandler<S>>,
|
||||
id: String,
|
||||
}
|
||||
|
||||
@@ -354,7 +100,7 @@ where
|
||||
AS: 'static + GuestAddressSpace + Send + Sync,
|
||||
Q: QueueT + Send + 'static,
|
||||
R: GuestMemoryRegion + Send + Sync + 'static,
|
||||
S: 'static + Send + VhostUserMasterReqHandler,
|
||||
S: 'static + Send + VhostUserFrontendReqHandler,
|
||||
{
|
||||
fn process(&mut self, events: Events, _ops: &mut EventOps) {
|
||||
trace!(target: "vhost-fs", "{}: VhostUserFsHandler::process({})", self.id, events.data());
|
||||
@@ -425,7 +171,7 @@ impl VhostUserFsDevice {
|
||||
// Connect to the vhost-user socket.
|
||||
info!("{VHOST_USER_FS_NAME}: try to connect to {path:?}");
|
||||
let num_queues = NUM_QUEUE_OFFSET + req_num_queues;
|
||||
let master = Master::connect(path, num_queues as u64).map_err(VirtioError::VhostError)?;
|
||||
let master = Frontend::connect(path, num_queues as u64).map_err(VirtioError::VhostError)?;
|
||||
|
||||
info!("{VHOST_USER_FS_NAME}: get features");
|
||||
let avail_features = master.get_features().map_err(VirtioError::VhostError)?;
|
||||
@@ -475,7 +221,7 @@ impl VhostUserFsDevice {
|
||||
let mut features = VhostUserProtocolFeatures::MQ | VhostUserProtocolFeatures::REPLY_ACK;
|
||||
if self.is_dax_on() {
|
||||
features |=
|
||||
VhostUserProtocolFeatures::SLAVE_REQ | VhostUserProtocolFeatures::SLAVE_SEND_FD;
|
||||
VhostUserProtocolFeatures::BACKEND_REQ | VhostUserProtocolFeatures::BACKEND_SEND_FD;
|
||||
}
|
||||
features
|
||||
}
|
||||
@@ -484,7 +230,7 @@ impl VhostUserFsDevice {
|
||||
AS: GuestAddressSpace,
|
||||
Q: QueueT,
|
||||
R: GuestMemoryRegion,
|
||||
S: VhostUserMasterReqHandler,
|
||||
S: VhostUserFrontendReqHandler,
|
||||
>(
|
||||
&mut self,
|
||||
handler: &VhostUserFsHandler<AS, Q, R, S>,
|
||||
@@ -621,7 +367,7 @@ where
|
||||
mem: config.vm_as.clone(),
|
||||
id: device.device_info.driver_name.clone(),
|
||||
});
|
||||
let req_handler = MasterReqHandler::new(vu_master_req_handler)
|
||||
let req_handler = FrontendReqHandler::new(vu_master_req_handler)
|
||||
.map_err(|e| ActivateError::VhostActivate(vhost_rs::Error::VhostUserProtocol(e)))?;
|
||||
|
||||
Some(req_handler)
|
||||
@@ -748,7 +494,7 @@ where
|
||||
|
||||
let guest_mmap_region = Arc::new(
|
||||
GuestRegionMmap::new(mmap_region, GuestAddress(guest_addr))
|
||||
.map_err(VirtioError::InsertMmap)?,
|
||||
.ok_or(VirtioError::InsertMmap)?,
|
||||
);
|
||||
|
||||
Ok(Some(VirtioSharedMemoryList {
|
||||
|
||||
@@ -12,7 +12,7 @@ use dbs_utils::epoll_manager::{EpollManager, EventOps, Events, MutEventSubscribe
|
||||
use dbs_utils::net::MacAddr;
|
||||
use log::{debug, error, info, trace, warn};
|
||||
use vhost_rs::vhost_user::{
|
||||
Error as VhostUserError, Master, VhostUserProtocolFeatures, VhostUserVirtioFeatures,
|
||||
Error as VhostUserError, Frontend, VhostUserProtocolFeatures, VhostUserVirtioFeatures,
|
||||
};
|
||||
use vhost_rs::Error as VhostError;
|
||||
use virtio_bindings::bindings::virtio_net::{
|
||||
@@ -59,7 +59,7 @@ struct VhostUserNetDevice {
|
||||
|
||||
impl VhostUserNetDevice {
|
||||
fn new(
|
||||
master: Master,
|
||||
master: Frontend,
|
||||
mut avail_features: u64,
|
||||
listener: Listener,
|
||||
guest_mac: Option<&MacAddr>,
|
||||
|
||||
@@ -14,13 +14,14 @@ use vhost_rs::vhost_user::message::{
|
||||
VhostUserVringAddr, VhostUserVringState, MAX_MSG_SIZE,
|
||||
};
|
||||
use vhost_rs::vhost_user::Error;
|
||||
use vm_memory::ByteValued;
|
||||
use vmm_sys_util::sock_ctrl_msg::ScmSocket;
|
||||
use vmm_sys_util::tempfile::TempFile;
|
||||
|
||||
pub const MAX_ATTACHED_FD_ENTRIES: usize = 32;
|
||||
|
||||
pub(crate) trait Req:
|
||||
Clone + Copy + Debug + PartialEq + Eq + PartialOrd + Ord + Into<u32>
|
||||
Clone + Copy + Debug + PartialEq + Eq + PartialOrd + Ord + Into<u32> + Send + Sync
|
||||
{
|
||||
fn is_valid(&self) -> bool;
|
||||
}
|
||||
@@ -215,6 +216,10 @@ impl<R: Req> Default for VhostUserMsgHeader<R> {
|
||||
}
|
||||
}
|
||||
|
||||
// SAFETY: VhostUserMsgHeader is a packed struct with only primitive (u32) fields and PhantomData.
|
||||
// All bit patterns are valid, and it has no padding bytes.
|
||||
unsafe impl<R: Req> ByteValued for VhostUserMsgHeader<R> {}
|
||||
|
||||
/// Unix domain socket endpoint for vhost-user connection.
|
||||
pub(crate) struct Endpoint<R: Req> {
|
||||
sock: UnixStream,
|
||||
|
||||
@@ -99,13 +99,13 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_tcp_backend_bind() {
|
||||
let tcp_sock_addr = String::from("127.0.0.2:9000");
|
||||
let tcp_sock_addr = String::from("127.0.0.1:9000");
|
||||
assert!(VsockTcpBackend::new(tcp_sock_addr).is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tcp_backend_accept() {
|
||||
let tcp_sock_addr = String::from("127.0.0.2:9001");
|
||||
let tcp_sock_addr = String::from("127.0.0.1:9001");
|
||||
|
||||
let mut vsock_backend = VsockTcpBackend::new(tcp_sock_addr.clone()).unwrap();
|
||||
let _stream = TcpStream::connect(&tcp_sock_addr).unwrap();
|
||||
@@ -115,7 +115,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_tcp_backend_communication() {
|
||||
let tcp_sock_addr = String::from("127.0.0.2:9002");
|
||||
let tcp_sock_addr = String::from("127.0.0.1:9002");
|
||||
let test_string = String::from("TEST");
|
||||
let mut buffer = [0; 10];
|
||||
|
||||
@@ -139,7 +139,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_tcp_backend_connect() {
|
||||
let tcp_sock_addr = String::from("127.0.0.2:9003");
|
||||
let tcp_sock_addr = String::from("127.0.0.1:9003");
|
||||
let vsock_backend = VsockTcpBackend::new(tcp_sock_addr).unwrap();
|
||||
// tcp backend don't support peer connection
|
||||
assert!(vsock_backend.connect(0).is_err());
|
||||
@@ -147,14 +147,14 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_tcp_backend_type() {
|
||||
let tcp_sock_addr = String::from("127.0.0.2:9004");
|
||||
let tcp_sock_addr = String::from("127.0.0.1:9004");
|
||||
let vsock_backend = VsockTcpBackend::new(tcp_sock_addr).unwrap();
|
||||
assert_eq!(vsock_backend.r#type(), VsockBackendType::Tcp);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tcp_backend_vsock_stream() {
|
||||
let tcp_sock_addr = String::from("127.0.0.2:9005");
|
||||
let tcp_sock_addr = String::from("127.0.0.1:9005");
|
||||
let _vsock_backend = VsockTcpBackend::new(tcp_sock_addr.clone()).unwrap();
|
||||
let vsock_stream = TcpStream::connect(&tcp_sock_addr).unwrap();
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
/// backend.
|
||||
use std::ops::{Deref, DerefMut};
|
||||
|
||||
use virtio_queue::{Descriptor, DescriptorChain};
|
||||
use virtio_queue::{desc::split::Descriptor, DescriptorChain};
|
||||
use vm_memory::{Address, GuestMemory};
|
||||
|
||||
use super::defs;
|
||||
|
||||
@@ -118,11 +118,15 @@ pub enum AddressManagerError {
|
||||
|
||||
/// Failure in accessing the memory located at some address.
|
||||
#[error("address manager failed to access guest memory located at 0x{0:x}")]
|
||||
AccessGuestMemory(u64, #[source] vm_memory::mmap::Error),
|
||||
AccessGuestMemory(u64, #[source] vm_memory::GuestMemoryError),
|
||||
|
||||
/// Failed to create GuestMemory
|
||||
#[error("address manager failed to create guest memory object")]
|
||||
CreateGuestMemory(#[source] vm_memory::Error),
|
||||
CreateGuestMemory(#[source] vm_memory::GuestMemoryError),
|
||||
|
||||
/// Failed to insert/manage guest memory region collection
|
||||
#[error("address manager failed to manage guest memory region collection")]
|
||||
GuestRegionCollection(#[source] vm_memory::GuestRegionCollectionError),
|
||||
|
||||
/// Failure in initializing guest memory.
|
||||
#[error("address manager failed to initialize guest memory")]
|
||||
@@ -328,7 +332,7 @@ impl AddressSpaceMgr {
|
||||
|
||||
vm_memory = vm_memory
|
||||
.insert_region(mmap_reg.clone())
|
||||
.map_err(AddressManagerError::CreateGuestMemory)?;
|
||||
.map_err(AddressManagerError::GuestRegionCollection)?;
|
||||
self.map_to_kvm(res_mgr, ¶m, reg, mmap_reg)?;
|
||||
}
|
||||
|
||||
@@ -488,8 +492,11 @@ impl AddressSpaceMgr {
|
||||
self.configure_thp_and_prealloc(®ion, &mmap_reg)?;
|
||||
}
|
||||
|
||||
let reg = GuestRegionImpl::new(mmap_reg, region.start_addr())
|
||||
.map_err(AddressManagerError::CreateGuestMemory)?;
|
||||
let reg = GuestRegionImpl::new(mmap_reg, region.start_addr()).ok_or(
|
||||
AddressManagerError::GuestRegionCollection(
|
||||
vm_memory::GuestRegionCollectionError::NoMemoryRegion,
|
||||
),
|
||||
)?;
|
||||
Ok(Arc::new(reg))
|
||||
}
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ pub enum BalloonDeviceError {
|
||||
|
||||
/// guest memory error
|
||||
#[error("failed to access guest memory, {0}")]
|
||||
GuestMemoryError(#[source] vm_memory::mmap::Error),
|
||||
GuestMemoryError(#[source] vm_memory::GuestMemoryError),
|
||||
|
||||
/// create balloon device error
|
||||
#[error("failed to create virtio-balloon device, {0}")]
|
||||
|
||||
@@ -557,15 +557,14 @@ impl MemRegionFactory for MemoryRegionFactory {
|
||||
);
|
||||
|
||||
// All value should be valid.
|
||||
let memory_region = Arc::new(
|
||||
GuestRegionMmap::new(mmap_region, guest_addr).map_err(VirtioError::InsertMmap)?,
|
||||
);
|
||||
let memory_region =
|
||||
Arc::new(GuestRegionMmap::new(mmap_region, guest_addr).ok_or(VirtioError::InsertMmap)?);
|
||||
|
||||
let vm_as_new = self
|
||||
.vm_as
|
||||
.memory()
|
||||
.insert_region(memory_region.clone())
|
||||
.map_err(VirtioError::InsertMmap)?;
|
||||
.map_err(|_| VirtioError::InsertMmap)?;
|
||||
self.vm_as.lock().unwrap().replace(vm_as_new);
|
||||
self.address_space.insert_region(region).map_err(|e| {
|
||||
error!(self.logger, "failed to insert address space region: {}", e);
|
||||
|
||||
@@ -78,7 +78,7 @@ impl DeviceVirtioRegionHandler {
|
||||
) -> std::result::Result<(), VirtioError> {
|
||||
let vm_as_new = self.vm_as.memory().insert_region(region).map_err(|e| {
|
||||
error!("DeviceVirtioRegionHandler failed to insert guest memory region: {e:?}.");
|
||||
VirtioError::InsertMmap(e)
|
||||
VirtioError::InsertMmap
|
||||
})?;
|
||||
// Do not expect poisoned lock here, so safe to unwrap().
|
||||
self.vm_as.lock().unwrap().replace(vm_as_new);
|
||||
|
||||
@@ -13,6 +13,7 @@ use arc_swap::ArcSwap;
|
||||
use dbs_address_space::AddressSpace;
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
use dbs_arch::{DeviceType, MMIODeviceInfo};
|
||||
#[cfg(feature = "host-device")]
|
||||
use dbs_boot::layout::MMIO_LOW_END;
|
||||
use dbs_device::device_manager::{Error as IoManagerError, IoManager, IoManagerContext};
|
||||
use dbs_device::resources::DeviceResources;
|
||||
@@ -24,7 +25,6 @@ use dbs_legacy_devices::ConsoleHandler;
|
||||
use dbs_pci::CAPABILITY_BAR_SIZE;
|
||||
use dbs_utils::epoll_manager::EpollManager;
|
||||
use kvm_ioctls::VmFd;
|
||||
use virtio_queue::QueueSync;
|
||||
|
||||
#[cfg(feature = "dbs-virtio-devices")]
|
||||
use dbs_device::resources::ResourceConstraint;
|
||||
@@ -41,6 +41,7 @@ use dbs_virtio_devices::{
|
||||
|
||||
#[cfg(feature = "host-device")]
|
||||
use dbs_pci::VfioPciDevice;
|
||||
#[cfg(feature = "host-device")]
|
||||
use dbs_pci::VirtioPciDevice;
|
||||
#[cfg(all(feature = "hotplug", feature = "dbs-upcall"))]
|
||||
use dbs_upcall::{
|
||||
@@ -59,6 +60,7 @@ use crate::resource_manager::ResourceManager;
|
||||
use crate::vm::{KernelConfigInfo, Vm, VmConfigInfo};
|
||||
use crate::IoManagerCached;
|
||||
|
||||
#[cfg(feature = "host-device")]
|
||||
use vm_memory::GuestRegionMmap;
|
||||
|
||||
/// Virtual machine console device manager.
|
||||
@@ -187,18 +189,23 @@ pub enum DeviceMgrError {
|
||||
/// Error from Vfio Pci
|
||||
#[error("failed to do vfio pci operation: {0:?}")]
|
||||
VfioPci(#[source] dbs_pci::VfioPciError),
|
||||
#[cfg(feature = "host-device")]
|
||||
/// Error from Virtio Pci
|
||||
#[error("failed to do virtio pci operation")]
|
||||
VirtioPci,
|
||||
#[cfg(feature = "host-device")]
|
||||
/// PCI system manager error
|
||||
#[error("Pci system manager error")]
|
||||
PciSystemManager,
|
||||
#[cfg(feature = "host-device")]
|
||||
/// Dragonball pci system error
|
||||
#[error("pci error: {0:?}")]
|
||||
PciError(#[source] dbs_pci::Error),
|
||||
#[cfg(feature = "host-device")]
|
||||
/// Virtio Pci system error
|
||||
#[error("virtio pci error: {0:?}")]
|
||||
VirtioPciError(#[source] dbs_pci::VirtioPciDeviceError),
|
||||
#[cfg(feature = "host-device")]
|
||||
/// Unsupported pci device type
|
||||
#[error("unsupported pci device type")]
|
||||
InvalidPciDeviceType,
|
||||
@@ -315,6 +322,7 @@ pub struct DeviceOpContext {
|
||||
virtio_devices: Vec<Arc<dyn DeviceIo>>,
|
||||
#[cfg(feature = "host-device")]
|
||||
vfio_manager: Option<Arc<Mutex<VfioDeviceMgr>>>,
|
||||
#[cfg(feature = "host-device")]
|
||||
pci_system_manager: Arc<Mutex<PciSystemManager>>,
|
||||
vm_config: Option<VmConfigInfo>,
|
||||
shared_info: Arc<RwLock<InstanceInfo>>,
|
||||
@@ -366,6 +374,7 @@ impl DeviceOpContext {
|
||||
shared_info,
|
||||
#[cfg(feature = "host-device")]
|
||||
vfio_manager: None,
|
||||
#[cfg(feature = "host-device")]
|
||||
pci_system_manager: device_mgr.pci_system_manager.clone(),
|
||||
}
|
||||
}
|
||||
@@ -659,6 +668,7 @@ pub struct DeviceManager {
|
||||
vhost_user_net_manager: VhostUserNetDeviceMgr,
|
||||
#[cfg(feature = "host-device")]
|
||||
pub(crate) vfio_manager: Arc<Mutex<VfioDeviceMgr>>,
|
||||
#[cfg(feature = "host-device")]
|
||||
pub(crate) pci_system_manager: Arc<Mutex<PciSystemManager>>,
|
||||
}
|
||||
|
||||
@@ -674,15 +684,21 @@ impl DeviceManager {
|
||||
let irq_manager = Arc::new(KvmIrqManager::new(vm_fd.clone()));
|
||||
let io_manager = Arc::new(ArcSwap::new(Arc::new(IoManager::new())));
|
||||
let io_lock = Arc::new(Mutex::new(()));
|
||||
#[cfg(feature = "host-device")]
|
||||
let io_context = DeviceManagerContext::new(io_manager.clone(), io_lock.clone());
|
||||
#[cfg(feature = "host-device")]
|
||||
let mut mgr = PciSystemManager::new(irq_manager.clone(), io_context, res_manager.clone())?;
|
||||
|
||||
#[cfg(feature = "host-device")]
|
||||
let requirements = mgr.resource_requirements();
|
||||
#[cfg(feature = "host-device")]
|
||||
let resources = res_manager
|
||||
.allocate_device_resources(&requirements, USE_SHARED_IRQ)
|
||||
.map_err(DeviceMgrError::ResourceError)?;
|
||||
#[cfg(feature = "host-device")]
|
||||
mgr.activate(resources)?;
|
||||
|
||||
#[cfg(feature = "host-device")]
|
||||
let pci_system_manager = Arc::new(Mutex::new(mgr));
|
||||
|
||||
Ok(DeviceManager {
|
||||
@@ -720,6 +736,7 @@ impl DeviceManager {
|
||||
pci_system_manager.clone(),
|
||||
logger,
|
||||
))),
|
||||
#[cfg(feature = "host-device")]
|
||||
pci_system_manager,
|
||||
})
|
||||
}
|
||||
@@ -1251,6 +1268,7 @@ impl DeviceManager {
|
||||
}
|
||||
|
||||
/// Create an Virtio PCI transport layer device for the virtio backend device.
|
||||
#[cfg(feature = "host-device")]
|
||||
pub fn create_virtio_pci_device(
|
||||
mut device: DbsVirtioDevice,
|
||||
ctx: &mut DeviceOpContext,
|
||||
@@ -1366,6 +1384,7 @@ impl DeviceManager {
|
||||
}
|
||||
|
||||
/// Create an Virtio PCI transport layer device for the virtio backend device.
|
||||
#[cfg(feature = "host-device")]
|
||||
pub fn register_virtio_pci_device(
|
||||
device: Arc<dyn DeviceIo>,
|
||||
ctx: &DeviceOpContext,
|
||||
@@ -1385,6 +1404,7 @@ impl DeviceManager {
|
||||
}
|
||||
|
||||
/// Deregister Virtio device from IoManager
|
||||
#[cfg(feature = "host-device")]
|
||||
pub fn deregister_virtio_device(
|
||||
device: &Arc<dyn DeviceIo>,
|
||||
ctx: &mut DeviceOpContext,
|
||||
@@ -1405,11 +1425,15 @@ impl DeviceManager {
|
||||
}
|
||||
|
||||
/// Destroy/Deregister resources for a Virtio PCI
|
||||
#[cfg(feature = "host-device")]
|
||||
fn destroy_pci_device(
|
||||
device: Arc<dyn DeviceIo>,
|
||||
ctx: &mut DeviceOpContext,
|
||||
dev_id: u8,
|
||||
) -> std::result::Result<(), DeviceMgrError> {
|
||||
use virtio_queue::QueueSync;
|
||||
use vm_memory::GuestRegionMmap;
|
||||
|
||||
// unregister IoManager
|
||||
Self::deregister_virtio_device(&device, ctx)?;
|
||||
// unregister Resource manager
|
||||
@@ -1489,6 +1513,7 @@ impl DeviceManager {
|
||||
}
|
||||
|
||||
/// Teardown the Virtio PCI or MMIO transport layer device associated with the virtio backend device.
|
||||
#[cfg(feature = "dbs-virtio-devices")]
|
||||
pub fn destroy_virtio_device(
|
||||
device: Arc<dyn DeviceIo>,
|
||||
ctx: &mut DeviceOpContext,
|
||||
@@ -1496,12 +1521,18 @@ impl DeviceManager {
|
||||
if let Some(mmio_dev) = device.as_any().downcast_ref::<DbsMmioV2Device>() {
|
||||
Self::destroy_mmio_device(device.clone(), ctx)?;
|
||||
mmio_dev.remove();
|
||||
} else if let Some(pci_dev) = device.as_any().downcast_ref::<VirtioPciDevice<
|
||||
GuestAddressSpaceImpl,
|
||||
QueueSync,
|
||||
GuestRegionMmap,
|
||||
>>() {
|
||||
Self::destroy_pci_device(device.clone(), ctx, pci_dev.device_id())?;
|
||||
}
|
||||
#[cfg(feature = "host-device")]
|
||||
{
|
||||
use virtio_queue::QueueSync;
|
||||
use vm_memory::GuestRegionMmap;
|
||||
if let Some(pci_dev) = device.as_any().downcast_ref::<VirtioPciDevice<
|
||||
GuestAddressSpaceImpl,
|
||||
QueueSync,
|
||||
GuestRegionMmap,
|
||||
>>() {
|
||||
Self::destroy_pci_device(device.clone(), ctx, pci_dev.device_id())?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -1572,18 +1603,25 @@ mod tests {
|
||||
let irq_manager = Arc::new(KvmIrqManager::new(vm_fd.clone()));
|
||||
let io_manager = Arc::new(ArcSwap::new(Arc::new(IoManager::new())));
|
||||
let io_lock = Arc::new(Mutex::new(()));
|
||||
|
||||
#[cfg(feature = "host-device")]
|
||||
let io_context = DeviceManagerContext::new(io_manager.clone(), io_lock.clone());
|
||||
#[cfg(feature = "host-device")]
|
||||
let mut mgr =
|
||||
PciSystemManager::new(irq_manager.clone(), io_context, res_manager.clone())
|
||||
.unwrap();
|
||||
|
||||
#[cfg(feature = "host-device")]
|
||||
let requirements = mgr.resource_requirements();
|
||||
#[cfg(feature = "host-device")]
|
||||
let resources = res_manager
|
||||
.allocate_device_resources(&requirements, USE_SHARED_IRQ)
|
||||
.map_err(DeviceMgrError::ResourceError)
|
||||
.unwrap();
|
||||
#[cfg(feature = "host-device")]
|
||||
mgr.activate(resources).unwrap();
|
||||
|
||||
#[cfg(feature = "host-device")]
|
||||
let pci_system_manager = Arc::new(Mutex::new(mgr));
|
||||
|
||||
DeviceManager {
|
||||
@@ -1619,6 +1657,7 @@ mod tests {
|
||||
pci_system_manager.clone(),
|
||||
&logger,
|
||||
))),
|
||||
#[cfg(feature = "host-device")]
|
||||
pci_system_manager,
|
||||
|
||||
logger,
|
||||
|
||||
@@ -406,9 +406,11 @@ impl VfioDeviceMgr {
|
||||
if let Some(vfio_container) = self.vfio_container.as_ref() {
|
||||
Ok(vfio_container.clone())
|
||||
} else {
|
||||
let kvm_dev_fd = Arc::new(self.get_kvm_dev_fd()?);
|
||||
let vfio_container =
|
||||
Arc::new(VfioContainer::new(kvm_dev_fd).map_err(VfioDeviceError::VfioIoctlError)?);
|
||||
let kvm_dev_fd = self.get_kvm_dev_fd()?;
|
||||
let vfio_dev_fd = Arc::new(vfio_ioctls::VfioDeviceFd::new_from_kvm(kvm_dev_fd));
|
||||
let vfio_container = Arc::new(
|
||||
VfioContainer::new(Some(vfio_dev_fd)).map_err(VfioDeviceError::VfioIoctlError)?,
|
||||
);
|
||||
self.vfio_container = Some(vfio_container.clone());
|
||||
|
||||
Ok(vfio_container)
|
||||
|
||||
@@ -43,7 +43,7 @@ impl Vcpu {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn new_aarch64(
|
||||
id: u8,
|
||||
vcpu_fd: Arc<VcpuFd>,
|
||||
vcpu_fd: VcpuFd,
|
||||
io_mgr: IoManagerCached,
|
||||
exit_evt: EventFd,
|
||||
vcpu_state_event: EventFd,
|
||||
|
||||
@@ -274,7 +274,7 @@ enum VcpuEmulation {
|
||||
/// A wrapper around creating and using a kvm-based VCPU.
|
||||
pub struct Vcpu {
|
||||
// vCPU fd used by the vCPU
|
||||
fd: Arc<VcpuFd>,
|
||||
fd: VcpuFd,
|
||||
// vCPU id info
|
||||
id: u8,
|
||||
// Io manager Cached for facilitating IO operations
|
||||
@@ -317,7 +317,7 @@ pub struct Vcpu {
|
||||
}
|
||||
|
||||
// Using this for easier explicit type-casting to help IDEs interpret the code.
|
||||
type VcpuCell = Cell<Option<*const Vcpu>>;
|
||||
type VcpuCell = Cell<Option<*mut Vcpu>>;
|
||||
|
||||
impl Vcpu {
|
||||
thread_local!(static TLS_VCPU_PTR: VcpuCell = const { Cell::new(None) });
|
||||
@@ -332,7 +332,7 @@ impl Vcpu {
|
||||
if cell.get().is_some() {
|
||||
return Err(VcpuError::VcpuTlsInit);
|
||||
}
|
||||
cell.set(Some(self as *const Vcpu));
|
||||
cell.set(Some(self as *mut Vcpu));
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
@@ -369,13 +369,13 @@ impl Vcpu {
|
||||
/// dereferencing from pointer an already borrowed `Vcpu`.
|
||||
unsafe fn run_on_thread_local<F>(func: F) -> Result<()>
|
||||
where
|
||||
F: FnOnce(&Vcpu),
|
||||
F: FnOnce(&mut Vcpu),
|
||||
{
|
||||
Self::TLS_VCPU_PTR.with(|cell: &VcpuCell| {
|
||||
if let Some(vcpu_ptr) = cell.get() {
|
||||
// Dereferencing here is safe since `TLS_VCPU_PTR` is populated/non-empty,
|
||||
// and it is being cleared on `Vcpu::drop` so there is no dangling pointer.
|
||||
let vcpu_ref: &Vcpu = &*vcpu_ptr;
|
||||
let vcpu_ref: &mut Vcpu = &mut *vcpu_ptr;
|
||||
func(vcpu_ref);
|
||||
Ok(())
|
||||
} else {
|
||||
@@ -436,7 +436,7 @@ impl Vcpu {
|
||||
|
||||
/// Extract the vcpu running logic for test mocking.
|
||||
#[cfg(not(test))]
|
||||
pub fn emulate(fd: &VcpuFd) -> std::result::Result<VcpuExit<'_>, kvm_ioctls::Error> {
|
||||
pub fn emulate(fd: &mut VcpuFd) -> std::result::Result<VcpuExit<'_>, kvm_ioctls::Error> {
|
||||
fd.run()
|
||||
}
|
||||
|
||||
@@ -444,7 +444,7 @@ impl Vcpu {
|
||||
///
|
||||
/// Returns error or enum specifying whether emulation was handled or interrupted.
|
||||
fn run_emulation(&mut self) -> Result<VcpuEmulation> {
|
||||
match Vcpu::emulate(&self.fd) {
|
||||
match Vcpu::emulate(&mut self.fd) {
|
||||
Ok(run) => {
|
||||
match run {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
@@ -455,8 +455,9 @@ impl Vcpu {
|
||||
}
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
VcpuExit::IoOut(addr, data) => {
|
||||
if !self.check_io_port_info(addr, data)? {
|
||||
let _ = self.io_mgr.pio_write(addr, data);
|
||||
let data = data.to_vec();
|
||||
if !self.check_io_port_info(addr, &data)? {
|
||||
let _ = self.io_mgr.pio_write(addr, &data);
|
||||
}
|
||||
self.metrics.exit_io_out.inc();
|
||||
Ok(VcpuEmulation::Handled)
|
||||
@@ -493,14 +494,14 @@ impl Vcpu {
|
||||
VcpuExit::SystemEvent(event_type, event_flags) => match event_type {
|
||||
KVM_SYSTEM_EVENT_RESET | KVM_SYSTEM_EVENT_SHUTDOWN => {
|
||||
info!(
|
||||
"Received KVM_SYSTEM_EVENT: type: {event_type}, event: {event_flags}"
|
||||
"Received KVM_SYSTEM_EVENT: type: {event_type}, event: {event_flags:?}"
|
||||
);
|
||||
Ok(VcpuEmulation::Stopped)
|
||||
}
|
||||
_ => {
|
||||
self.metrics.failures.inc();
|
||||
error!(
|
||||
"Received KVM_SYSTEM_EVENT signal type: {event_type}, flag: {event_flags}"
|
||||
"Received KVM_SYSTEM_EVENT signal type: {event_type}, flag: {event_flags:?}"
|
||||
);
|
||||
Err(VcpuError::VcpuUnhandledKvmExit)
|
||||
}
|
||||
@@ -765,7 +766,7 @@ impl Vcpu {
|
||||
|
||||
/// Get vcpu file descriptor.
|
||||
pub fn vcpu_fd(&self) -> &VcpuFd {
|
||||
self.fd.as_ref()
|
||||
&self.fd
|
||||
}
|
||||
|
||||
pub fn metrics(&self) -> Arc<VcpuMetrics> {
|
||||
@@ -804,7 +805,7 @@ pub mod tests {
|
||||
FailEntry(u64, u32),
|
||||
InternalError,
|
||||
Unknown,
|
||||
SystemEvent(u32, u64),
|
||||
SystemEvent(u32, Vec<u64>),
|
||||
Error(i32),
|
||||
}
|
||||
|
||||
@@ -813,7 +814,7 @@ pub mod tests {
|
||||
}
|
||||
|
||||
impl Vcpu {
|
||||
pub fn emulate(_fd: &VcpuFd) -> std::result::Result<VcpuExit<'_>, kvm_ioctls::Error> {
|
||||
pub fn emulate(_fd: &mut VcpuFd) -> std::result::Result<VcpuExit<'_>, kvm_ioctls::Error> {
|
||||
let res = &*EMULATE_RES.lock().unwrap();
|
||||
match res {
|
||||
EmulationCase::IoIn => Ok(VcpuExit::IoIn(0, &mut [])),
|
||||
@@ -828,7 +829,8 @@ pub mod tests {
|
||||
EmulationCase::InternalError => Ok(VcpuExit::InternalError),
|
||||
EmulationCase::Unknown => Ok(VcpuExit::Unknown),
|
||||
EmulationCase::SystemEvent(event_type, event_flags) => {
|
||||
Ok(VcpuExit::SystemEvent(*event_type, *event_flags))
|
||||
let flags = event_flags.clone().into_boxed_slice();
|
||||
Ok(VcpuExit::SystemEvent(*event_type, Box::leak(flags)))
|
||||
}
|
||||
EmulationCase::Error(e) => Err(kvm_ioctls::Error::new(*e)),
|
||||
}
|
||||
@@ -839,7 +841,7 @@ pub mod tests {
|
||||
fn create_vcpu() -> (Vcpu, Receiver<VcpuStateEvent>) {
|
||||
let kvm_context = KvmContext::new(None).unwrap();
|
||||
let vm = kvm_context.kvm().create_vm().unwrap();
|
||||
let vcpu_fd = Arc::new(vm.create_vcpu(0).unwrap());
|
||||
let vcpu_fd = vm.create_vcpu(0).unwrap();
|
||||
let io_manager = IoManagerCached::new(Arc::new(ArcSwap::new(Arc::new(IoManager::new()))));
|
||||
let supported_cpuid = kvm_context
|
||||
.supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
|
||||
@@ -875,7 +877,7 @@ pub mod tests {
|
||||
let kvm = Kvm::new().unwrap();
|
||||
let vm = Arc::new(kvm.create_vm().unwrap());
|
||||
let _kvm_context = KvmContext::new(Some(kvm.as_raw_fd())).unwrap();
|
||||
let vcpu_fd = Arc::new(vm.create_vcpu(0).unwrap());
|
||||
let vcpu_fd = vm.create_vcpu(0).unwrap();
|
||||
let io_manager = IoManagerCached::new(Arc::new(ArcSwap::new(Arc::new(IoManager::new()))));
|
||||
let reset_event_fd = EventFd::new(libc::EFD_NONBLOCK).unwrap();
|
||||
let vcpu_state_event = EventFd::new(libc::EFD_NONBLOCK).unwrap();
|
||||
@@ -947,17 +949,19 @@ pub mod tests {
|
||||
assert!(matches!(res, Err(VcpuError::VcpuUnhandledKvmExit)));
|
||||
|
||||
// KVM_SYSTEM_EVENT_RESET
|
||||
*(EMULATE_RES.lock().unwrap()) = EmulationCase::SystemEvent(KVM_SYSTEM_EVENT_RESET, 0);
|
||||
*(EMULATE_RES.lock().unwrap()) =
|
||||
EmulationCase::SystemEvent(KVM_SYSTEM_EVENT_RESET, vec![0]);
|
||||
let res = vcpu.run_emulation();
|
||||
assert!(matches!(res, Ok(VcpuEmulation::Stopped)));
|
||||
|
||||
// KVM_SYSTEM_EVENT_SHUTDOWN
|
||||
*(EMULATE_RES.lock().unwrap()) = EmulationCase::SystemEvent(KVM_SYSTEM_EVENT_SHUTDOWN, 0);
|
||||
*(EMULATE_RES.lock().unwrap()) =
|
||||
EmulationCase::SystemEvent(KVM_SYSTEM_EVENT_SHUTDOWN, vec![0]);
|
||||
let res = vcpu.run_emulation();
|
||||
assert!(matches!(res, Ok(VcpuEmulation::Stopped)));
|
||||
|
||||
// Other system event
|
||||
*(EMULATE_RES.lock().unwrap()) = EmulationCase::SystemEvent(0, 0);
|
||||
*(EMULATE_RES.lock().unwrap()) = EmulationCase::SystemEvent(0, vec![0]);
|
||||
let res = vcpu.run_emulation();
|
||||
assert!(matches!(res, Err(VcpuError::VcpuUnhandledKvmExit)));
|
||||
|
||||
|
||||
@@ -189,7 +189,7 @@ pub struct VcpuResizeInfo {
|
||||
#[derive(Default)]
|
||||
pub(crate) struct VcpuInfo {
|
||||
pub(crate) vcpu: Option<Vcpu>,
|
||||
vcpu_fd: Option<Arc<VcpuFd>>,
|
||||
vcpu_fd: Option<VcpuFd>,
|
||||
handle: Option<VcpuHandle>,
|
||||
tid: u32,
|
||||
}
|
||||
@@ -541,18 +541,13 @@ impl VcpuManager {
|
||||
}
|
||||
// We will reuse the kvm's vcpufd after first creation, for we can't
|
||||
// create vcpufd with same id in one kvm instance.
|
||||
let kvm_vcpu = match &self.vcpu_infos[cpu_index as usize].vcpu_fd {
|
||||
Some(vcpu_fd) => vcpu_fd.clone(),
|
||||
None => {
|
||||
let vcpu_fd = Arc::new(
|
||||
self.vm_fd
|
||||
.create_vcpu(cpu_index as u64)
|
||||
.map_err(VcpuError::VcpuFd)
|
||||
.map_err(VcpuManagerError::Vcpu)?,
|
||||
);
|
||||
self.vcpu_infos[cpu_index as usize].vcpu_fd = Some(vcpu_fd.clone());
|
||||
vcpu_fd
|
||||
}
|
||||
let kvm_vcpu = match self.vcpu_infos[cpu_index as usize].vcpu_fd.take() {
|
||||
Some(vcpu_fd) => vcpu_fd,
|
||||
None => self
|
||||
.vm_fd
|
||||
.create_vcpu(cpu_index as u64)
|
||||
.map_err(VcpuError::VcpuFd)
|
||||
.map_err(VcpuManagerError::Vcpu)?,
|
||||
};
|
||||
|
||||
let mut vcpu = self.create_vcpu_arch(cpu_index, kvm_vcpu, request_ts)?;
|
||||
@@ -777,7 +772,7 @@ impl VcpuManager {
|
||||
fn create_vcpu_arch(
|
||||
&self,
|
||||
cpu_index: u8,
|
||||
vcpu_fd: Arc<VcpuFd>,
|
||||
vcpu_fd: VcpuFd,
|
||||
request_ts: TimestampUs,
|
||||
) -> Result<Vcpu> {
|
||||
// It's safe to unwrap because guest_kernel always exist until vcpu manager done
|
||||
@@ -806,7 +801,7 @@ impl VcpuManager {
|
||||
fn create_vcpu_arch(
|
||||
&self,
|
||||
cpu_index: u8,
|
||||
vcpu_fd: Arc<VcpuFd>,
|
||||
vcpu_fd: VcpuFd,
|
||||
request_ts: TimestampUs,
|
||||
) -> Result<Vcpu> {
|
||||
Vcpu::new_aarch64(
|
||||
|
||||
@@ -45,7 +45,7 @@ impl Vcpu {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn new_x86_64(
|
||||
id: u8,
|
||||
vcpu_fd: Arc<VcpuFd>,
|
||||
vcpu_fd: VcpuFd,
|
||||
io_mgr: IoManagerCached,
|
||||
cpuid: CpuId,
|
||||
exit_evt: EventFd,
|
||||
|
||||
@@ -642,7 +642,7 @@ impl Vm {
|
||||
image: &mut F,
|
||||
) -> std::result::Result<InitrdConfig, LoadInitrdError>
|
||||
where
|
||||
F: Read + Seek,
|
||||
F: Read + Seek + vm_memory::ReadVolatile,
|
||||
{
|
||||
use crate::error::LoadInitrdError::*;
|
||||
|
||||
@@ -666,7 +666,7 @@ impl Vm {
|
||||
|
||||
// Load the image into memory
|
||||
vm_memory
|
||||
.read_from(GuestAddress(address), image, size)
|
||||
.read_volatile_from(GuestAddress(address), image, size)
|
||||
.map_err(|_| LoadInitrd)?;
|
||||
|
||||
Ok(InitrdConfig {
|
||||
@@ -1132,7 +1132,7 @@ pub mod tests {
|
||||
let vm_memory = vm.address_space.vm_memory().unwrap();
|
||||
vm_memory.write_obj(code, load_addr).unwrap();
|
||||
|
||||
let vcpu_fd = vm.vm_fd().create_vcpu(0).unwrap();
|
||||
let mut vcpu_fd = vm.vm_fd().create_vcpu(0).unwrap();
|
||||
let mut vcpu_sregs = vcpu_fd.get_sregs().unwrap();
|
||||
assert_ne!(vcpu_sregs.cs.base, 0);
|
||||
assert_ne!(vcpu_sregs.cs.selector, 0);
|
||||
|
||||
@@ -63,8 +63,6 @@ else
|
||||
include $(ARCH_FILE)
|
||||
endif
|
||||
|
||||
|
||||
|
||||
ifeq ($(PREFIX),)
|
||||
PREFIX := /usr
|
||||
EXEC_PREFIX := $(PREFIX)/local
|
||||
|
||||
@@ -22,7 +22,7 @@ slog = { workspace = true }
|
||||
slog-scope = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
tokio = { workspace = true, features = ["sync", "fs", "process", "io-util"] }
|
||||
vmm-sys-util = "0.11.0"
|
||||
vmm-sys-util = "0.15.0"
|
||||
rand = { workspace = true }
|
||||
path-clean = "1.0.1"
|
||||
lazy_static = { workspace = true }
|
||||
|
||||
@@ -15,7 +15,7 @@ use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
|
||||
|
||||
use libc::ifreq;
|
||||
use vmm_sys_util::ioctl::{ioctl_with_mut_ref, ioctl_with_ref, ioctl_with_val};
|
||||
use vmm_sys_util::{ioctl_ioc_nr, ioctl_iow_nr};
|
||||
use vmm_sys_util::ioctl_iow_nr;
|
||||
// As defined in the Linux UAPI:
|
||||
// https://elixir.bootlin.com/linux/v4.17/source/include/uapi/linux/if.h#L33
|
||||
pub(crate) const IFACE_NAME_MAX_LEN: usize = 16;
|
||||
|
||||
16
src/tools/agent-ctl/Cargo.lock
generated
16
src/tools/agent-ctl/Cargo.lock
generated
@@ -1140,7 +1140,7 @@ dependencies = [
|
||||
"serde",
|
||||
"thiserror 1.0.40",
|
||||
"timerfd",
|
||||
"vmm-sys-util 0.11.2",
|
||||
"vmm-sys-util 0.15.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1469,9 +1469,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "event-manager"
|
||||
version = "0.2.1"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "377fa591135fbe23396a18e2655a6d5481bf7c5823cdfa3cc81b01a229cbe640"
|
||||
checksum = "13bdac971eb2efaceffca0976058ab80c715945cc565c8a4aa1ed3bb0dc8d0e4"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"vmm-sys-util 0.14.0",
|
||||
@@ -2119,7 +2119,7 @@ dependencies = [
|
||||
"tracing",
|
||||
"ttrpc",
|
||||
"ttrpc-codegen",
|
||||
"vmm-sys-util 0.11.2",
|
||||
"vmm-sys-util 0.15.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5709,9 +5709,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "vmm-sys-util"
|
||||
version = "0.11.2"
|
||||
version = "0.14.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "48b7b084231214f7427041e4220d77dfe726897a6d41fddee450696e66ff2a29"
|
||||
checksum = "d21f366bf22bfba3e868349978766a965cbe628c323d58e026be80b8357ab789"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
"libc",
|
||||
@@ -5719,9 +5719,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "vmm-sys-util"
|
||||
version = "0.14.0"
|
||||
version = "0.15.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d21f366bf22bfba3e868349978766a965cbe628c323d58e026be80b8357ab789"
|
||||
checksum = "506c62fdf617a5176827c2f9afbcf1be155b03a9b4bf9617a60dbc07e3a1642f"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
"libc",
|
||||
|
||||
@@ -689,7 +689,11 @@ fn build_auth(reference: &Reference) -> RegistryAuth {
|
||||
Err(CredentialRetrievalError::ConfigReadError) => {
|
||||
debug!("build_auth: Cannot read docker credentials - using anonymous access.");
|
||||
}
|
||||
Err(CredentialRetrievalError::HelperFailure { stdout, stderr }) => {
|
||||
Err(CredentialRetrievalError::HelperFailure {
|
||||
helper: _,
|
||||
stdout,
|
||||
stderr,
|
||||
}) => {
|
||||
if stdout == "credentials not found in native keychain\n" {
|
||||
// On WSL, this error is generated when credentials are not
|
||||
// available in ~/.docker/config.json.
|
||||
|
||||
@@ -264,7 +264,11 @@ pub fn build_auth(reference: &Reference) -> Option<AuthConfig> {
|
||||
Err(CredentialRetrievalError::ConfigReadError) => {
|
||||
debug!("build_auth: Cannot read docker credentials - using anonymous access.");
|
||||
}
|
||||
Err(CredentialRetrievalError::HelperFailure { stdout, stderr }) => {
|
||||
Err(CredentialRetrievalError::HelperFailure {
|
||||
helper: _,
|
||||
stdout,
|
||||
stderr,
|
||||
}) => {
|
||||
if stdout == "credentials not found in native keychain\n" {
|
||||
// On WSL, this error is generated when credentials are not
|
||||
// available in ~/.docker/config.json.
|
||||
|
||||
22
src/tools/kata-ctl/Cargo.lock
generated
22
src/tools/kata-ctl/Cargo.lock
generated
@@ -905,7 +905,7 @@ dependencies = [
|
||||
"serde",
|
||||
"thiserror 1.0.50",
|
||||
"timerfd",
|
||||
"vmm-sys-util 0.11.2",
|
||||
"vmm-sys-util 0.15.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1126,12 +1126,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "event-manager"
|
||||
version = "0.2.1"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "377fa591135fbe23396a18e2655a6d5481bf7c5823cdfa3cc81b01a229cbe640"
|
||||
checksum = "13bdac971eb2efaceffca0976058ab80c715945cc565c8a4aa1ed3bb0dc8d0e4"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"vmm-sys-util 0.11.2",
|
||||
"vmm-sys-util 0.15.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1624,7 +1624,7 @@ dependencies = [
|
||||
"tracing",
|
||||
"ttrpc",
|
||||
"ttrpc-codegen",
|
||||
"vmm-sys-util 0.11.2",
|
||||
"vmm-sys-util 0.15.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1910,7 +1910,7 @@ dependencies = [
|
||||
"toml",
|
||||
"url",
|
||||
"virt_container",
|
||||
"vmm-sys-util 0.11.2",
|
||||
"vmm-sys-util 0.15.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4614,6 +4614,16 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "vmm-sys-util"
|
||||
version = "0.15.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "506c62fdf617a5176827c2f9afbcf1be155b03a9b4bf9617a60dbc07e3a1642f"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "vsock"
|
||||
version = "0.3.0"
|
||||
|
||||
@@ -34,7 +34,7 @@ kata-sys-util = { path = "../../../src/libs/kata-sys-util/" }
|
||||
agent = { path = "../../runtime-rs/crates/agent" }
|
||||
virt_container = { path = "../../runtime-rs/crates/runtimes/virt_container" }
|
||||
serial_test = "0.10.0"
|
||||
vmm-sys-util = "0.11.0"
|
||||
vmm-sys-util = "0.15.0"
|
||||
epoll = "4.0.1"
|
||||
libc = "0.2.138"
|
||||
|
||||
|
||||
@@ -795,7 +795,7 @@ function helm_helper() {
|
||||
disable_snapshotter_setup=false
|
||||
for shim in ${HELM_SHIMS}; do
|
||||
case "${shim}" in
|
||||
qemu-tdx|qemu-snp|qemu-nvidia-gpu-tdx|qemu-nvidia-gpu-snp)
|
||||
qemu-tdx|qemu-snp)
|
||||
disable_snapshotter_setup=true
|
||||
break
|
||||
;;
|
||||
|
||||
@@ -174,6 +174,17 @@ function deploy_kata() {
|
||||
|
||||
set_default_cluster_namespace
|
||||
|
||||
# Workaround to avoid modifying the workflow yaml files
|
||||
case "${KATA_HYPERVISOR}" in
|
||||
qemu-nvidia-gpu-*)
|
||||
USE_EXPERIMENTAL_SETUP_SNAPSHOTTER=true
|
||||
SNAPSHOTTER="nydus"
|
||||
EXPERIMENTAL_FORCE_GUEST_PULL=false
|
||||
;;
|
||||
*)
|
||||
;;
|
||||
esac
|
||||
|
||||
ANNOTATIONS="default_vcpus"
|
||||
if [[ "${KATA_HOST_OS}" = "cbl-mariner" ]]; then
|
||||
ANNOTATIONS="image kernel default_vcpus cc_init_data"
|
||||
@@ -207,8 +218,11 @@ function deploy_kata() {
|
||||
# as they are running on a GitHub runner (and not on a BM machine),
|
||||
# and there the snapshotter is deployed on every run (rather than
|
||||
# deployed when the machine is configured, as on the BM machines).
|
||||
if [[ "${KATA_HYPERVISOR}" == qemu-coco-dev* ]] && [[ ${ARCH} == "x86_64" ]]; then
|
||||
EXPERIMENTAL_SETUP_SNAPSHOTTER="${SNAPSHOTTER}"
|
||||
if [[ ${ARCH} == "x86_64" ]]; then
|
||||
case "${KATA_HYPERVISOR}" in
|
||||
qemu-coco-dev*|qemu-nvidia-gpu-*) EXPERIMENTAL_SETUP_SNAPSHOTTER="${SNAPSHOTTER}" ;;
|
||||
*) ;;
|
||||
esac
|
||||
fi
|
||||
;;
|
||||
*) ;;
|
||||
@@ -216,10 +230,6 @@ function deploy_kata() {
|
||||
fi
|
||||
|
||||
EXPERIMENTAL_FORCE_GUEST_PULL="${EXPERIMENTAL_FORCE_GUEST_PULL:-}"
|
||||
if [[ "${KATA_HYPERVISOR}" == "qemu-nvidia-gpu-"* ]]; then
|
||||
EXPERIMENTAL_FORCE_GUEST_PULL="${KATA_HYPERVISOR}"
|
||||
fi
|
||||
export EXPERIMENTAL_FORCE_GUEST_PULL
|
||||
|
||||
export HELM_K8S_DISTRIBUTION="${KUBERNETES}"
|
||||
export HELM_IMAGE_REFERENCE="${DOCKER_REGISTRY}/${DOCKER_REPO}"
|
||||
|
||||
@@ -29,8 +29,10 @@ setup() {
|
||||
kubectl expose deployment/${deployment}
|
||||
|
||||
busybox_pod="test-nginx"
|
||||
# We need to use `-O index.html` as the busybox' wget has a different behaviour
|
||||
# than GNU's wget, which would just append a .n to the file name instead of bailing.
|
||||
kubectl run $busybox_pod --restart=Never -it --image="$busybox_image" \
|
||||
-- sh -c 'i=1; while [ $i -le '"$wait_time"' ]; do wget --timeout=5 '"$deployment"' && break; sleep 1; i=$(expr $i + 1); done'
|
||||
-- sh -c 'i=1; while [ $i -le '"$wait_time"' ]; do wget -O index.html --timeout=5 '"$deployment"' && break; sleep 1; i=$(expr $i + 1); done'
|
||||
|
||||
# check pod's status, it should be Succeeded.
|
||||
# or {.status.containerStatuses[0].state.terminated.reason} = "Completed"
|
||||
|
||||
@@ -148,9 +148,9 @@ install_genpolicy_drop_ins() {
|
||||
# 20-* OCI version overlay
|
||||
if [[ "${KATA_HOST_OS:-}" == "cbl-mariner" ]]; then
|
||||
cp "${examples_dir}/20-oci-1.2.0-drop-in.json" "${settings_d}/"
|
||||
elif is_k3s_or_rke2 || is_nvidia_gpu_platform; then
|
||||
elif is_k3s_or_rke2; then
|
||||
cp "${examples_dir}/20-oci-1.2.1-drop-in.json" "${settings_d}/"
|
||||
elif [[ -n "${CONTAINER_ENGINE_VERSION:-}" ]]; then
|
||||
elif is_nvidia_gpu_platform || [[ -n "${CONTAINER_ENGINE_VERSION:-}" ]]; then
|
||||
cp "${examples_dir}/20-oci-1.3.0-drop-in.json" "${settings_d}/"
|
||||
fi
|
||||
|
||||
|
||||
@@ -203,8 +203,6 @@ function teardown() {
|
||||
echo "Running teardown"
|
||||
local rc=0
|
||||
|
||||
journalctl -x -t kata --since "10 minutes ago" || true
|
||||
|
||||
local pid
|
||||
for bin in containerd-nydus-grpc nydusd; do
|
||||
pid=$(pidof $bin)
|
||||
|
||||
@@ -42,10 +42,15 @@ Gramine
|
||||
Mellanox
|
||||
MOFED
|
||||
NCCL
|
||||
NRAS
|
||||
NVCR
|
||||
NVLINK
|
||||
nvswitch
|
||||
nvswitches
|
||||
NVRC
|
||||
nvlib
|
||||
pgpu
|
||||
PPCIE
|
||||
Quadro
|
||||
|
||||
# Storage & Filesystems
|
||||
@@ -82,3 +87,5 @@ StratoVirt
|
||||
TOCTOU
|
||||
unbootable
|
||||
userspace
|
||||
eBPF
|
||||
dwarves
|
||||
|
||||
@@ -1443,9 +1443,10 @@ main()
|
||||
test_path="${test_path:-"${repo}/tests"}"
|
||||
test_dir="${GOPATH}/src/${test_path}"
|
||||
|
||||
if [ -z "$repo_path" ]
|
||||
then
|
||||
if [ -z "$repo_path" ]; then
|
||||
repo_path=$GOPATH/src/$repo
|
||||
else
|
||||
test_dir=$repo_path/$test_path
|
||||
fi
|
||||
|
||||
announce
|
||||
|
||||
@@ -1,541 +1,3 @@
|
||||
# Kata Containers Deploy – Helm Chart
|
||||
|
||||
A Helm chart that installs the kata-deploy DaemonSet and its helper assets,
|
||||
enabling Kata Containers runtimes on your Kubernetes, K3s, RKE2, or K0s cluster.
|
||||
|
||||
## TL;DR
|
||||
|
||||
```sh
|
||||
# Install directly from the official ghcr.io OCI registry
|
||||
# update the VERSION X.YY.Z to your needs or just use the latest
|
||||
|
||||
export VERSION=$(curl -sSL https://api.github.com/repos/kata-containers/kata-containers/releases/latest | jq .tag_name | tr -d '"')
|
||||
export CHART="oci://ghcr.io/kata-containers/kata-deploy-charts/kata-deploy"
|
||||
|
||||
$ helm install kata-deploy "${CHART}" --version "${VERSION}"
|
||||
|
||||
# See everything you can configure
|
||||
$ helm show values "${CHART}" --version "${VERSION}"
|
||||
```
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- **Kubernetes ≥ v1.22** – v1.22 is the first release where the CRI v1 API
|
||||
became the default and `RuntimeClass` left alpha. The chart depends on those
|
||||
stable interfaces; earlier clusters need `feature‑gates` or CRI shims that are
|
||||
out of scope.
|
||||
|
||||
- **Kata Release 3.12** - v3.12.0 introduced publishing the helm-chart on the
|
||||
release page for easier consumption, since v3.8.0 we shipped the helm-chart
|
||||
via source code in the kata-containers `Github` repository.
|
||||
|
||||
- CRI‑compatible runtime (containerd or CRI‑O). If one wants to use the
|
||||
`multiInstallSuffix` feature one needs at least **containerd-2.0** which
|
||||
supports drop-in config files
|
||||
|
||||
- Nodes must allow loading kernel modules and installing Kata artifacts (the
|
||||
chart runs privileged containers to do so)
|
||||
|
||||
## Installing Helm
|
||||
|
||||
If Helm is not yet on your workstation or CI runner, install Helm v3 (v3.9 or
|
||||
newer recommended):
|
||||
|
||||
```sh
|
||||
# Quick one‑liner (Linux/macOS)
|
||||
$ curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||
|
||||
# Or via your package manager
|
||||
$ sudo apt-get install helm # Debian/Ubuntu
|
||||
$ brew install helm # Homebrew on macOS / Linuxbrew
|
||||
```
|
||||
|
||||
Verify the installation:
|
||||
|
||||
```sh
|
||||
$ helm version
|
||||
```
|
||||
|
||||
## Installing the Chart
|
||||
|
||||
Before attempting to install the chart, one may first consult the table below
|
||||
[Configuration Reference](#configuration-reference) for all the default values.
|
||||
Some default values may not fit all use-cases so update as needed. A prime example
|
||||
may be the `k8sDistribution`, which by default is set to `k8s`.
|
||||
|
||||
To see which chart versions are available either use the CLI
|
||||
|
||||
```sh
|
||||
$ helm show chart oci://ghcr.io/kata-containers/kata-deploy-charts/kata-deploy
|
||||
```
|
||||
|
||||
or visit
|
||||
[kata-deploy-charts](https://github.com/orgs/kata-containers/packages/container/package/kata-deploy-charts%2Fkata-deploy)
|
||||
|
||||
If one wants to wait until the Helm chart has deployed every object in the chart
|
||||
one can use `--wait --timeout 10m --atomic`. If the timeout expires or anything
|
||||
fails, Helm rolls the release back to its previous state.
|
||||
|
||||
```sh
|
||||
$ helm install kata-deploy \ # release name
|
||||
--namespace kube-system \ # recommended namespace
|
||||
--wait --timeout 10m --atomic \
|
||||
"${CHART}" --version "${VERSION}"
|
||||
```
|
||||
|
||||
If one does not want to wait for the objects via Helm, or would rather use
|
||||
`kubectl`, use Helm like this:
|
||||
|
||||
```sh
|
||||
$ helm install kata-deploy \ # release name
|
||||
--namespace kube-system \ # recommended namespace
|
||||
"${CHART}" --version "${VERSION}"
|
||||
```
|
||||
|
||||
## Updating Settings
|
||||
|
||||
Forgot to enable an option? Re‑use the values already on the cluster and only
|
||||
mutate what you need:
|
||||
|
||||
```sh
|
||||
# List existing releases
|
||||
$ helm ls -A
|
||||
|
||||
# Upgrade in‑place, keeping everything else the same
|
||||
$ helm upgrade kata-deploy -n kube-system \
|
||||
--reuse-values \
|
||||
--set env.defaultShim=qemu-runtime-rs \
|
||||
"${CHART}" --version "${VERSION}"
|
||||
```
|
||||
|
||||
## Uninstalling
|
||||
|
||||
```sh
|
||||
$ helm uninstall kata-deploy -n kube-system
|
||||
```
|
||||
|
||||
During uninstall, Helm will report that some resources were kept due to the
|
||||
resource policy (`ServiceAccount`, `ClusterRole`, `ClusterRoleBinding`). This
|
||||
is **normal**. A post-delete hook Job runs after uninstall and removes those
|
||||
resources so no cluster-wide `RBAC` is left behind.
|
||||
|
||||
## Configuration Reference
|
||||
|
||||
All values can be overridden with --set key=value or a custom `-f myvalues.yaml`.
|
||||
|
||||
| Key | Description | Default |
|
||||
|-----|-------------|---------|
|
||||
| `imagePullPolicy` | Set the DaemonSet pull policy | `Always` |
|
||||
| `imagePullSecrets` | Enable pulling from a private registry via pull secret | `""` |
|
||||
| `image.reference` | Fully qualified image reference (for digest pinning use the full image e.g. `…@sha256:...`; tag is ignored) | `quay.io/kata-containers/kata-deploy` |
|
||||
| `image.tag` | Tag of the image reference (defaults to chart `AppVersion` when empty) | `""` |
|
||||
| `kubectlImage.reference` | Fully qualified `kubectl` image reference (for digest pinning use the full image e.g. `…@sha256:...` and leave `kubectlImage.tag` empty) | `quay.io/kata-containers/kubectl` |
|
||||
| `kubectlImage.tag` | Tag of the `kubectl` image reference | `latest` |
|
||||
| `k8sDistribution` | Set the k8s distribution to use: `k8s`, `k0s`, `k3s`, `rke2`, `microk8s` | `k8s` |
|
||||
| `nodeSelector` | Node labels for pod assignment. Allows restricting deployment to specific nodes | `{}` |
|
||||
| `runtimeClasses.enabled` | Enable Helm-managed `runtimeClass` creation (recommended) | `true` |
|
||||
| `runtimeClasses.createDefault` | Create a default `runtimeClass` alias for the default shim | `false` |
|
||||
| `runtimeClasses.defaultName` | Name for the default `runtimeClass` | `kata` |
|
||||
| `env.debug` | Enable debugging in the `configuration.toml` | `false` |
|
||||
| `env.shims` | List of shims to deploy | `clh cloud-hypervisor dragonball fc qemu qemu-coco-dev qemu-coco-dev-runtime-rs qemu-runtime-rs qemu-se-runtime-rs qemu-snp qemu-tdx stratovirt qemu-nvidia-gpu qemu-nvidia-gpu-snp qemu-nvidia-gpu-tdx qemu-cca` |
|
||||
| `env.shims_x86_64` | List of shims to deploy for x86_64 (if set, overrides `shims`) | `""` |
|
||||
| `env.shims_aarch64` | List of shims to deploy for aarch64 (if set, overrides `shims`) | `""` |
|
||||
| `env.shims_s390x` | List of shims to deploy for s390x (if set, overrides `shims`) | `""` |
|
||||
| `env.shims_ppc64le` | List of shims to deploy for ppc64le (if set, overrides `shims`) | `""` |
|
||||
| `env.defaultShim` | The default shim to use if none specified | `qemu` |
|
||||
| `env.defaultShim_x86_64` | The default shim to use if none specified for x86_64 (if set, overrides `defaultShim`) | `""` |
|
||||
| `env.defaultShim_aarch64` | The default shim to use if none specified for aarch64 (if set, overrides `defaultShim`) | `""` |
|
||||
| `env.defaultShim_s390x` | The default shim to use if none specified for s390x (if set, overrides `defaultShim`) | `""` |
|
||||
| `env.defaultShim_ppc64le` | The default shim to use if none specified for ppc64le (if set, overrides `defaultShim`) | `""` |
|
||||
| `env.createRuntimeClasses` | **DEPRECATED** - Use `runtimeClasses.enabled` instead. Script-based `runtimeClass` creation | `false` |
|
||||
| `env.createDefaultRuntimeClass` | **DEPRECATED** - Use `runtimeClasses.createDefault` instead | `false` |
|
||||
| `env.allowedHypervisorAnnotations` | Allow the listed annotations to be used when launching a Container or Pod; by default the annotations are disabled | `""` |
|
||||
| `env.snapshotterHandlerMapping` | Provide the snapshotter handler for each shim | `""` |
|
||||
| `env.snapshotterHandlerMapping_x86_64` | Provide the snapshotter handler for each shim for x86_64 (if set, overrides `snapshotterHandlerMapping`) | `""` |
|
||||
| `env.snapshotterHandlerMapping_aarch64` | Provide the snapshotter handler for each shim for aarch64 (if set, overrides `snapshotterHandlerMapping`) | `""` |
|
||||
| `env.snapshotterHandlerMapping_s390x` | Provide the snapshotter handler for each shim for s390x (if set, overrides `snapshotterHandlerMapping`) | `""` |
|
||||
| `env.snapshotterHandlerMapping_ppc64le` | Provide the snapshotter handler for each shim for ppc64le (if set, overrides `snapshotterHandlerMapping`) | `""` |
|
||||
| `env.agentHttpsProxy` | HTTPS_PROXY=... | `""` |
|
||||
| `env.agentNoProxy` | Specifies a list of addresses that should bypass a configured proxy server | `""` |
|
||||
| `env.pullTypeMapping` | Type of container image pulling, examples are guest-pull or default | `""` |
|
||||
| `env.pullTypeMapping_x86_64` | Type of container image pulling for x86_64 (if set, overrides `pullTypeMapping`) | `""` |
|
||||
| `env.pullTypeMapping_aarch64` | Type of container image pulling for aarch64 (if set, overrides `pullTypeMapping`) | `""` |
|
||||
| `env.pullTypeMapping_s390x` | Type of container image pulling for s390x (if set, overrides `pullTypeMapping`) | `""` |
|
||||
| `env.pullTypeMapping_ppc64le` | Type of container image pulling for ppc64le (if set, overrides `pullTypeMapping`) | `""` |
|
||||
| `env.installationPrefix` | Prefix where to install the Kata artifacts | `/opt/kata` |
|
||||
| `env.hostOS` | Provide host-OS setting, e.g. `cbl-mariner` to do additional configurations | `""` |
|
||||
| `env.multiInstallSuffix` | Enable multiple Kata installation on the same node with suffix e.g. `/opt/kata-PR12232` | `""` |
|
||||
| `env._experimentalSetupSnapshotter` | Deploys (`nydus`) and/or sets up (`erofs`, `nydus`) the snapshotter(s) specified as the value (supports multiple snapshotters, separated by commas; e.g., `nydus,erofs`) | `""` |
|
||||
| `env._experimentalForceGuestPull` | Enables `experimental_force_guest_pull` for the shim(s) specified as the value (supports multiple shims, separated by commas; e.g., `qemu-tdx,qemu-snp`) | `""` |
|
||||
| `env._experimentalForceGuestPull_x86_64` | Enables `experimental_force_guest_pull` for the shim(s) specified as the value for x86_64 (if set, overrides `_experimentalForceGuestPull`) | `""` |
|
||||
| `env._experimentalForceGuestPull_aarch64` | Enables `experimental_force_guest_pull` for the shim(s) specified as the value for aarch64 (if set, overrides `_experimentalForceGuestPull`) | `""` |
|
||||
| `env._experimentalForceGuestPull_s390x` | Enables `experimental_force_guest_pull` for the shim(s) specified as the value for s390x (if set, overrides `_experimentalForceGuestPull`) | `""` |
|
||||
| `env._experimentalForceGuestPull_ppc64le` | Enables `experimental_force_guest_pull` for the shim(s) specified as the value for ppc64le (if set, overrides `_experimentalForceGuestPull`) | `""` |
|
||||
|
||||
## Structured Configuration
|
||||
|
||||
**NEW**: Starting with Kata Containers v3.23.0, a new structured configuration format is available for configuring shims. This provides better type safety, clearer organization, and per-shim configuration options.
|
||||
|
||||
### Migration from Legacy Format
|
||||
|
||||
The legacy `env.*` configuration format is **deprecated** and will be removed in 2 releases. Users are encouraged to migrate to the new structured format.
|
||||
|
||||
**Deprecated fields** (will be removed in 2 releases):
|
||||
- `env.shims`, `env.shims_x86_64`, `env.shims_aarch64`, `env.shims_s390x`, `env.shims_ppc64le`
|
||||
- `env.defaultShim`, `env.defaultShim_x86_64`, `env.defaultShim_aarch64`, `env.defaultShim_s390x`, `env.defaultShim_ppc64le`
|
||||
- `env.allowedHypervisorAnnotations`
|
||||
- `env.snapshotterHandlerMapping`, `env.snapshotterHandlerMapping_x86_64`, etc.
|
||||
- `env.pullTypeMapping`, `env.pullTypeMapping_x86_64`, etc.
|
||||
- `env.agentHttpsProxy`, `env.agentNoProxy`
|
||||
- `env._experimentalSetupSnapshotter`
|
||||
- `env._experimentalForceGuestPull`, `env._experimentalForceGuestPull_x86_64`, etc.
|
||||
- `env.debug`
|
||||
|
||||
### New Structured Format
|
||||
|
||||
The new format uses a `shims` section where each shim can be configured individually:
|
||||
|
||||
```yaml
|
||||
# Enable debug mode globally
|
||||
debug: false
|
||||
|
||||
# Configure snapshotter setup
|
||||
snapshotter:
|
||||
setup: [] # ["nydus", "erofs"] or []
|
||||
|
||||
# Configure shims
|
||||
shims:
|
||||
# Disable all shims at once (useful when enabling only specific shims or custom runtimes)
|
||||
disableAll: false
|
||||
|
||||
qemu:
|
||||
enabled: true
|
||||
supportedArches:
|
||||
- amd64
|
||||
- arm64
|
||||
- s390x
|
||||
- ppc64le
|
||||
allowedHypervisorAnnotations: []
|
||||
containerd:
|
||||
snapshotter: ""
|
||||
qemu-snp:
|
||||
enabled: true
|
||||
supportedArches:
|
||||
- amd64
|
||||
allowedHypervisorAnnotations: []
|
||||
containerd:
|
||||
snapshotter: nydus
|
||||
forceGuestPull: false
|
||||
crio:
|
||||
guestPull: true
|
||||
agent:
|
||||
httpsProxy: ""
|
||||
noProxy: ""
|
||||
# Optional: set runtimeClass.nodeSelector to pin TEE to specific nodes (always applied). If unset, NFD TEE labels are auto-injected when NFD is detected.
|
||||
|
||||
# Default shim per architecture
|
||||
defaultShim:
|
||||
amd64: qemu
|
||||
arm64: qemu
|
||||
s390x: qemu
|
||||
ppc64le: qemu
|
||||
```
|
||||
|
||||
### Key Benefits
|
||||
|
||||
1. **Per-shim configuration**: Each shim can have its own settings for snapshotter, guest pull, agent proxy, etc.
|
||||
2. **Architecture-aware**: Shims declare which architectures they support
|
||||
3. **Type safety**: Structured format reduces configuration errors
|
||||
4. **Easy to use**: All shims are enabled by default in `values.yaml`, so you can use the chart directly without modification
|
||||
5. **Disable all at once**: Use `shims.disableAll: true` to disable all standard shims, useful when enabling only specific shims or using custom runtimes only
|
||||
|
||||
### Example: Enable `qemu` shim with new format
|
||||
|
||||
```yaml
|
||||
shims:
|
||||
qemu:
|
||||
enabled: true
|
||||
supportedArches:
|
||||
- amd64
|
||||
- arm64
|
||||
|
||||
defaultShim:
|
||||
amd64: qemu
|
||||
arm64: qemu
|
||||
```
|
||||
|
||||
### Backward Compatibility
|
||||
|
||||
The chart maintains full backward compatibility with the legacy `env.*` format. If legacy values are set, they take precedence over the new structured format. This allows for gradual migration.
|
||||
|
||||
### Default Configuration
|
||||
|
||||
The default `values.yaml` file has **all shims enabled by default**, making it easy to use the chart directly without modification:
|
||||
|
||||
```sh
|
||||
helm install kata-deploy oci://ghcr.io/kata-containers/kata-deploy-charts/kata-deploy \
|
||||
--version VERSION
|
||||
```
|
||||
|
||||
This includes all available Kata Containers shims:
|
||||
- Standard shims: `qemu`, `qemu-runtime-rs`, `clh`, `cloud-hypervisor`, `dragonball`, `fc`
|
||||
- TEE shims: `qemu-snp`, `qemu-snp-runtime-rs`, `qemu-tdx`, `qemu-tdx-runtime-rs`, `qemu-se`, `qemu-se-runtime-rs`, `qemu-cca`, `qemu-coco-dev`, `qemu-coco-dev-runtime-rs`
|
||||
- NVIDIA GPU shims: `qemu-nvidia-gpu`, `qemu-nvidia-gpu-snp`, `qemu-nvidia-gpu-tdx`
|
||||
- Remote shims: `remote` (for `peer-pods`/`cloud-api-adaptor`, disabled by default)
|
||||
|
||||
To enable only specific shims, you can override the configuration:
|
||||
|
||||
```yaml
|
||||
# Custom values file - enable only qemu shim
|
||||
shims:
|
||||
qemu:
|
||||
enabled: true
|
||||
clh:
|
||||
enabled: false
|
||||
cloud-hypervisor:
|
||||
enabled: false
|
||||
# ... disable other shims as needed
|
||||
```
|
||||
|
||||
### Example Values Files
|
||||
|
||||
For convenience, we also provide example values files that demonstrate specific use cases:
|
||||
|
||||
#### `try-kata-tee.values.yaml` - Trusted Execution Environment Shims
|
||||
|
||||
This file enables only the TEE (Trusted Execution Environment) shims for confidential computing:
|
||||
|
||||
```sh
|
||||
helm install kata-deploy oci://ghcr.io/kata-containers/kata-deploy-charts/kata-deploy \
|
||||
--version VERSION \
|
||||
-f try-kata-tee.values.yaml
|
||||
```
|
||||
|
||||
Includes:
|
||||
- `qemu-snp` - AMD SEV-SNP (amd64)
|
||||
- `qemu-tdx` - Intel TDX (amd64)
|
||||
- `qemu-se` - IBM Secure Execution for Linux (SEL) (s390x)
|
||||
- `qemu-se-runtime-rs` - IBM Secure Execution for Linux (SEL) Rust runtime (s390x)
|
||||
- `qemu-cca` - Arm Confidential Compute Architecture (arm64)
|
||||
- `qemu-coco-dev` - Confidential Containers development (amd64, s390x)
|
||||
- `qemu-coco-dev-runtime-rs` - Confidential Containers development Rust runtime (amd64, s390x)
|
||||
|
||||
#### `try-kata-nvidia-gpu.values.yaml` - NVIDIA GPU Shims
|
||||
|
||||
This file enables only the NVIDIA GPU-enabled shims:
|
||||
|
||||
```sh
|
||||
helm install kata-deploy oci://ghcr.io/kata-containers/kata-deploy-charts/kata-deploy \
|
||||
--version VERSION \
|
||||
-f try-kata-nvidia-gpu.values.yaml
|
||||
```
|
||||
|
||||
Includes:
|
||||
- `qemu-nvidia-gpu` - Standard NVIDIA GPU support (amd64, arm64)
|
||||
- `qemu-nvidia-gpu-snp` - NVIDIA GPU with AMD SEV-SNP (amd64)
|
||||
- `qemu-nvidia-gpu-tdx` - NVIDIA GPU with Intel TDX (amd64)
|
||||
|
||||
**Note**: These example files are located in the chart directory. When installing from the OCI registry, you'll need to download them separately or clone the repository to access them.
|
||||
|
||||
### RuntimeClass Node Selectors for TEE Shims
|
||||
|
||||
**Manual configuration:** Any `nodeSelector` you set under `shims.<shim>.runtimeClass.nodeSelector`
|
||||
is **always applied** to that shim's RuntimeClass, whether or not NFD is present. Use this when
|
||||
you want to pin TEE workloads to specific nodes (e.g. without NFD, or with custom labels).
|
||||
|
||||
**Auto-inject when NFD is present:** If you do *not* set a `runtimeClass.nodeSelector` for a
|
||||
TEE shim, the chart can **automatically inject** NFD-based labels when NFD is detected in the
|
||||
cluster (deployed by this chart with `node-feature-discovery.enabled=true` or found externally):
|
||||
- AMD SEV-SNP shims: `amd.feature.node.kubernetes.io/snp: "true"`
|
||||
- Intel TDX shims: `intel.feature.node.kubernetes.io/tdx: "true"`
|
||||
- IBM Secure Execution for Linux (SEL) shims (s390x): `feature.node.kubernetes.io/cpu-security.se.enabled: "true"`
|
||||
|
||||
The chart uses Helm's `lookup` function to detect NFD (by looking for the
|
||||
`node-feature-discovery-worker` DaemonSet). Auto-inject only runs when NFD is detected and
|
||||
no manual `runtimeClass.nodeSelector` is set for that shim.
|
||||
|
||||
**Note**: NFD detection requires cluster access. During `helm template` (dry-run without a
|
||||
cluster), external NFD is not seen, so auto-injected labels are not added. Manual
|
||||
`runtimeClass.nodeSelector` values are still applied in all cases.
|
||||
|
||||
## `RuntimeClass` Management
|
||||
|
||||
**NEW**: Starting with Kata Containers v3.23.0, `runtimeClasses` are managed by
|
||||
Helm by default, providing better lifecycle management and integration.
|
||||
|
||||
### Features:
|
||||
- **Automatic Creation**: `runtimeClasses` are automatically created for all configured shims
|
||||
- **Lifecycle Management**: Helm manages creation, updates, and deletion of `runtimeClasses`
|
||||
|
||||
### Configuration:
|
||||
```yaml
|
||||
runtimeClasses:
|
||||
enabled: true # Enable Helm-managed `runtimeClasses` (default)
|
||||
createDefault: false # Create a default "kata" `runtimeClass`
|
||||
defaultName: "kata" # Name for the default `runtimeClass`
|
||||
```
|
||||
|
||||
When `runtimeClasses.enabled: true` (default), the Helm chart creates
|
||||
`runtimeClass` resources for all enabled shims (either from the new structured
|
||||
`shims` configuration or from the legacy `env.shims` format).
|
||||
|
||||
The kata-deploy script will no longer create `runtimeClasses`
|
||||
(`env.createRuntimeClasses` defaults to `"false"`).
|
||||
|
||||
## Example: only `qemu` shim and debug enabled
|
||||
|
||||
Use `shims.disableAll=true` to disable all shims at once, then enable only the ones you need:
|
||||
|
||||
```sh
|
||||
# Using --set flags (disable all, then enable qemu)
|
||||
$ helm install kata-deploy \
|
||||
--set shims.disableAll=true \
|
||||
--set shims.qemu.enabled=true \
|
||||
--set debug=true \
|
||||
"${CHART}" --version "${VERSION}"
|
||||
```
|
||||
|
||||
Or use a custom values file:
|
||||
|
||||
```yaml
|
||||
# custom-values.yaml
|
||||
debug: true
|
||||
shims:
|
||||
disableAll: true
|
||||
qemu:
|
||||
enabled: true
|
||||
```
|
||||
|
||||
```sh
|
||||
$ helm install kata-deploy \
|
||||
-f custom-values.yaml \
|
||||
"${CHART}" --version "${VERSION}"
|
||||
```
|
||||
|
||||
## Example: Deploy only to specific nodes using `nodeSelector`
|
||||
|
||||
```sh
|
||||
# First, label the nodes where you want kata-containers to be installed
|
||||
$ kubectl label nodes worker-node-1 kata-containers=enabled
|
||||
$ kubectl label nodes worker-node-2 kata-containers=enabled
|
||||
|
||||
# Then install the chart with `nodeSelector`
|
||||
$ helm install kata-deploy \
|
||||
--set nodeSelector.kata-containers="enabled" \
|
||||
"${CHART}" --version "${VERSION}"
|
||||
```
|
||||
|
||||
You can also use a values file:
|
||||
|
||||
```yaml
|
||||
# values.yaml
|
||||
nodeSelector:
|
||||
kata-containers: "enabled"
|
||||
node-type: "worker"
|
||||
```
|
||||
|
||||
```sh
|
||||
$ helm install kata-deploy -f values.yaml "${CHART}" --version "${VERSION}"
|
||||
```
|
||||
|
||||
## Example: Multiple Kata installations on the same node
|
||||
|
||||
For debugging, testing and other use cases it is possible to deploy multiple
|
||||
versions of Kata on the very same node. All the needed artifacts are getting the
|
||||
`multiInstallSuffix` appended to distinguish each installation. **BEWARE** that one
|
||||
needs at least **containerd-2.0** since this version has drop-in conf support
|
||||
which is a prerequisite for the `multiInstallSuffix` to work properly.
|
||||
|
||||
```sh
|
||||
$ helm install kata-deploy-cicd \
|
||||
-n kata-deploy-cicd \
|
||||
--set env.multiInstallSuffix=cicd \
|
||||
--set env.debug=true \
|
||||
"${CHART}" --version "${VERSION}"
|
||||
```
|
||||
|
||||
Note: `runtimeClasses` are automatically created by Helm (via
|
||||
`runtimeClasses.enabled=true`, which is the default).
|
||||
|
||||
Now verify the installation by examining the `runtimeClasses`:
|
||||
|
||||
```sh
|
||||
$ kubectl get runtimeClasses
|
||||
NAME HANDLER AGE
|
||||
kata-clh-cicd kata-clh-cicd 77s
|
||||
kata-cloud-hypervisor-cicd kata-cloud-hypervisor-cicd 77s
|
||||
kata-dragonball-cicd kata-dragonball-cicd 77s
|
||||
kata-fc-cicd kata-fc-cicd 77s
|
||||
kata-qemu-cicd kata-qemu-cicd 77s
|
||||
kata-qemu-coco-dev-cicd kata-qemu-coco-dev-cicd 77s
|
||||
kata-qemu-nvidia-gpu-cicd kata-qemu-nvidia-gpu-cicd 77s
|
||||
kata-qemu-nvidia-gpu-snp-cicd kata-qemu-nvidia-gpu-snp-cicd 77s
|
||||
kata-qemu-nvidia-gpu-tdx-cicd kata-qemu-nvidia-gpu-tdx-cicd 76s
|
||||
kata-qemu-runtime-rs-cicd kata-qemu-runtime-rs-cicd 77s
|
||||
kata-qemu-se-runtime-rs-cicd kata-qemu-se-runtime-rs-cicd 77s
|
||||
kata-qemu-snp-cicd kata-qemu-snp-cicd 77s
|
||||
kata-qemu-tdx-cicd kata-qemu-tdx-cicd 77s
|
||||
kata-stratovirt-cicd kata-stratovirt-cicd 77s
|
||||
```
|
||||
|
||||
## Customizing Configuration with Drop-in Files
|
||||
|
||||
When kata-deploy installs Kata Containers, the base configuration files should not
|
||||
be modified directly. Instead, use drop-in configuration files to customize
|
||||
settings. This approach ensures your customizations survive kata-deploy upgrades.
|
||||
|
||||
### How Drop-in Files Work
|
||||
|
||||
The Kata runtime reads the base configuration file and then applies any `.toml`
|
||||
files found in the `config.d/` directory alongside it. Files are processed in
|
||||
alphabetical order, with later files overriding earlier settings.
|
||||
|
||||
### Creating Custom Drop-in Files
|
||||
|
||||
To add custom settings, create a `.toml` file in the appropriate `config.d/`
|
||||
directory. Use a numeric prefix to control the order of application.
|
||||
|
||||
**Reserved prefixes** (used by kata-deploy):
|
||||
- `10-*`: Core kata-deploy settings
|
||||
- `20-*`: Debug settings
|
||||
- `30-*`: Kernel parameters
|
||||
|
||||
**Recommended prefixes for custom settings**: `50-89`
|
||||
|
||||
### Example: Adding Custom Kernel Parameters
|
||||
|
||||
```bash
|
||||
# SSH into the node or use kubectl exec
|
||||
sudo mkdir -p /opt/kata/share/defaults/kata-containers/runtimes/qemu/config.d/
|
||||
sudo tee /opt/kata/share/defaults/kata-containers/runtimes/qemu/config.d/50-custom.toml > /dev/null << 'EOF'
|
||||
[hypervisor.qemu]
|
||||
kernel_params = "my_param=value"
|
||||
EOF
|
||||
```
|
||||
|
||||
### Example: Changing Default Memory Size
|
||||
|
||||
```bash
|
||||
sudo tee /opt/kata/share/defaults/kata-containers/runtimes/qemu/config.d/50-memory.toml > /dev/null << 'EOF'
|
||||
[hypervisor.qemu]
|
||||
default_memory = 4096
|
||||
EOF
|
||||
```
|
||||
|
||||
### Custom Runtimes
|
||||
|
||||
For more complex customizations, you can define custom runtimes in your Helm
|
||||
values. Custom runtimes create isolated configuration directories with their
|
||||
own drop-in files:
|
||||
|
||||
```yaml
|
||||
customRuntimes:
|
||||
enabled: true
|
||||
runtimes:
|
||||
- handler: kata-custom
|
||||
baseConfig: qemu
|
||||
dropInFile: /path/to/your/config.toml
|
||||
```
|
||||
|
||||
This creates a new Runtime Class `kata-custom` that extends the `qemu`
|
||||
configuration with your custom settings.
|
||||
The documentation for the helm chart can be found at https://kata-containers.github.io/kata-containers/installation/#helm-chart.
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
debug: false
|
||||
|
||||
snapshotter:
|
||||
setup: []
|
||||
setup: ["nydus"]
|
||||
|
||||
# Enable NVIDIA GPU shims
|
||||
# Disable all shims at once, then enable only the ones we need
|
||||
@@ -32,8 +32,7 @@ shims:
|
||||
- amd64
|
||||
allowedHypervisorAnnotations: []
|
||||
containerd:
|
||||
snapshotter: ""
|
||||
forceGuestPull: true
|
||||
snapshotter: "nydus"
|
||||
crio:
|
||||
guestPull: true
|
||||
agent:
|
||||
@@ -46,8 +45,7 @@ shims:
|
||||
- amd64
|
||||
allowedHypervisorAnnotations: []
|
||||
containerd:
|
||||
snapshotter: ""
|
||||
forceGuestPull: true
|
||||
snapshotter: "nydus"
|
||||
crio:
|
||||
guestPull: true
|
||||
agent:
|
||||
|
||||
@@ -20,6 +20,7 @@ endif
|
||||
ifeq ($(ARCH), x86_64)
|
||||
BASE_TARBALLS = serial-targets \
|
||||
firecracker-tarball \
|
||||
kernel-debug-tarball \
|
||||
kernel-dragonball-experimental-tarball \
|
||||
kernel-nvidia-gpu-tarball \
|
||||
kernel-tarball \
|
||||
@@ -42,6 +43,7 @@ BASE_SERIAL_TARBALLS = rootfs-image-tarball \
|
||||
cloud-hypervisor-glibc-tarball
|
||||
else ifeq ($(ARCH), s390x)
|
||||
BASE_TARBALLS = serial-targets \
|
||||
kernel-debug-tarball \
|
||||
kernel-tarball \
|
||||
qemu-tarball \
|
||||
shim-v2-tarball \
|
||||
@@ -51,6 +53,7 @@ BASE_SERIAL_TARBALLS = rootfs-image-tarball \
|
||||
else ifeq ($(ARCH), aarch64)
|
||||
BASE_TARBALLS = serial-targets \
|
||||
kernel-cca-confidential-tarball \
|
||||
kernel-debug-tarball \
|
||||
kernel-tarball \
|
||||
qemu-tarball \
|
||||
qemu-cca-experimental-tarball \
|
||||
@@ -137,6 +140,9 @@ kernel-nvidia-gpu-tarball:
|
||||
kernel-tarball:
|
||||
${MAKE} $@-build
|
||||
|
||||
kernel-debug-tarball:
|
||||
${MAKE} $@-build
|
||||
|
||||
kernel-cca-confidential-tarball:
|
||||
${MAKE} $@-build
|
||||
|
||||
|
||||
@@ -114,6 +114,7 @@ options:
|
||||
kata-manager
|
||||
kernel
|
||||
kernel-cca-confidential
|
||||
kernel-debug
|
||||
kernel-dragonball-experimental
|
||||
kernel-experimental
|
||||
kernel-nvidia-gpu
|
||||
@@ -716,6 +717,15 @@ install_kernel() {
|
||||
"${extra_cmd}"
|
||||
}
|
||||
|
||||
install_kernel_debug() {
|
||||
export KERNEL_DEBUG_ENABLED="yes"
|
||||
|
||||
install_kernel_helper \
|
||||
"assets.kernel" \
|
||||
"kernel-debug" \
|
||||
""
|
||||
}
|
||||
|
||||
install_kernel_cca_confidential() {
|
||||
export CONFIDENTIAL_GUEST="yes"
|
||||
export MEASURED_ROOTFS="yes"
|
||||
@@ -1319,6 +1329,8 @@ handle_build() {
|
||||
|
||||
kernel) install_kernel ;;
|
||||
|
||||
kernel-debug) install_kernel_debug ;;
|
||||
|
||||
kernel-cca-confidential) install_kernel_cca_confidential ;;
|
||||
|
||||
kernel-dragonball-experimental) install_kernel_dragonball_experimental ;;
|
||||
|
||||
@@ -12,7 +12,8 @@ It also requires [yq](https://github.com/mikefarah/yq) version v4.40.7.
|
||||
> **Hint**: `go install github.com/mikefarah/yq/v4@latest`
|
||||
|
||||
|
||||
The Linux kernel scripts further require a few packages (flex, bison, and libelf-dev)
|
||||
The Linux kernel scripts further require a few packages (flex, bison, libelf-dev, and
|
||||
dwarves for BTF generation in debug kernels).
|
||||
|
||||
|
||||
## Usage
|
||||
@@ -73,6 +74,18 @@ containers path (`/usr/share/kata-containers/`).
|
||||
$ sudo ./build-kernel.sh install
|
||||
```
|
||||
|
||||
## Debug Kernel
|
||||
|
||||
Kata Containers provides a kernel with debug and eBPF configs enabled.
|
||||
|
||||
To build this debug kernel, set `KERNEL_DEBUG_ENABLED=yes` for **all phases**:
|
||||
|
||||
```bash
|
||||
$ KERNEL_DEBUG_ENABLED=yes ./build-kernel.sh setup
|
||||
$ KERNEL_DEBUG_ENABLED=yes ./build-kernel.sh build
|
||||
$ sudo KERNEL_DEBUG_ENABLED=yes ./build-kernel.sh install
|
||||
```
|
||||
|
||||
## Submit Kernel Changes
|
||||
|
||||
Kata Containers packaging repository holds the kernel configs and patches. The
|
||||
|
||||
@@ -322,7 +322,8 @@ get_kernel_frag_path() {
|
||||
|
||||
if [[ ${KERNEL_DEBUG_ENABLED} == "yes" ]]; then
|
||||
info "Enable kernel debug"
|
||||
local debug_configs="$(ls ${common_path}/common/debug.conf)"
|
||||
local debug_path="${arch_path}/../debug"
|
||||
local debug_configs="$(ls ${debug_path}/*.conf)"
|
||||
all_configs="${all_configs} ${debug_configs}"
|
||||
fi
|
||||
|
||||
@@ -583,6 +584,10 @@ install_kata() {
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ ${KERNEL_DEBUG_ENABLED} == "yes" ]]; then
|
||||
suffix="-debug${suffix}"
|
||||
fi
|
||||
|
||||
vmlinuz="vmlinuz-${kernel_version}-${config_version}${suffix}"
|
||||
vmlinux="vmlinux-${kernel_version}-${config_version}${suffix}"
|
||||
|
||||
|
||||
@@ -9,4 +9,4 @@ CONFIG_IKCONFIG_PROC=y
|
||||
CONFIG_DEBUG_INFO=y
|
||||
CONFIG_DEBUG_KERNEL=y
|
||||
CONFIG_DEBUG_FS=y
|
||||
CONFIG_DEBUG_INFO_DWARF4=y
|
||||
CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
|
||||
22
tools/packaging/kernel/configs/fragments/debug/ebpf.conf
Normal file
22
tools/packaging/kernel/configs/fragments/debug/ebpf.conf
Normal file
@@ -0,0 +1,22 @@
|
||||
# Generate BTF type info
|
||||
CONFIG_DEBUG_INFO_BTF=y
|
||||
|
||||
# JIT compile the program
|
||||
CONFIG_BPF_JIT=y
|
||||
|
||||
# Allow the user to attach BPF programs to kprobe, uprobe, and tracepoint events
|
||||
CONFIG_BPF_EVENTS=y
|
||||
|
||||
# Kernel probes
|
||||
CONFIG_KPROBES=y
|
||||
CONFIG_KPROBE_EVENTS=y
|
||||
|
||||
# Userspace probes
|
||||
CONFIG_UPROBES=y
|
||||
CONFIG_UPROBE_EVENTS=y
|
||||
|
||||
# ftrace infrastructure
|
||||
CONFIG_DYNAMIC_FTRACE=y
|
||||
CONFIG_FTRACE=y
|
||||
CONFIG_FTRACE_SYSCALLS=y
|
||||
CONFIG_FUNCTION_TRACER=y
|
||||
@@ -1 +1 @@
|
||||
186
|
||||
187
|
||||
|
||||
@@ -19,6 +19,7 @@ RUN apt-get update && \
|
||||
ca-certificates \
|
||||
curl \
|
||||
debhelper \
|
||||
dwarves \
|
||||
flex \
|
||||
git \
|
||||
iptables \
|
||||
|
||||
@@ -77,6 +77,7 @@ container_build+=" --build-arg ARCH=${ARCH:-}"
|
||||
|
||||
"${container_engine}" run --rm -i -v "${repo_root_dir}:${repo_root_dir}" \
|
||||
-w "${PWD}" \
|
||||
--env KERNEL_DEBUG_ENABLED="${KERNEL_DEBUG_ENABLED}" \
|
||||
--user "$(id -u)":"$(id -g)" \
|
||||
"${container_image}" \
|
||||
bash -c "${kernel_builder} ${kernel_builder_args} build"
|
||||
@@ -84,6 +85,7 @@ container_build+=" --build-arg ARCH=${ARCH:-}"
|
||||
"${container_engine}" run --rm -i -v "${repo_root_dir}:${repo_root_dir}" \
|
||||
-w "${PWD}" \
|
||||
--env DESTDIR="${DESTDIR}" --env PREFIX="${PREFIX}" \
|
||||
--env KERNEL_DEBUG_ENABLED="${KERNEL_DEBUG_ENABLED}" \
|
||||
--user "$(id -u)":"$(id -g)" \
|
||||
"${container_image}" \
|
||||
bash -c "${kernel_builder} ${kernel_builder_args} install"
|
||||
|
||||
318
zensical.toml
318
zensical.toml
@@ -1,318 +0,0 @@
|
||||
[project]
|
||||
|
||||
# The site_name is shown in the page header and the browser window title
|
||||
#
|
||||
# Read more: https://zensical.org/docs/setup/basics/#site_name
|
||||
site_name = "Kata Containers Docs"
|
||||
|
||||
# The site_description is included in the HTML head and should contain a
|
||||
# meaningful description of the site content for use by search engines.
|
||||
#
|
||||
# Read more: https://zensical.org/docs/setup/basics/#site_description
|
||||
site_description = "Developer and user documentation for the Kata Containers project."
|
||||
|
||||
# The site_author attribute. This is used in the HTML head element.
|
||||
#
|
||||
# Read more: https://zensical.org/docs/setup/basics/#site_author
|
||||
site_author = "Kata Containers Community"
|
||||
|
||||
repo_url = "https://github.com/kata-containers/kata-containers"
|
||||
|
||||
# The site_url is the canonical URL for your site. When building online
|
||||
# documentation you should set this.
|
||||
# Read more: https://zensical.org/docs/setup/basics/#site_url
|
||||
site_url = "https://kata-containers.github.io/kata-containers"
|
||||
|
||||
edit_uri = "edit/main/docs/"
|
||||
|
||||
# The copyright notice appears in the page footer and can contain an HTML
|
||||
# fragment.
|
||||
#
|
||||
# Read more: https://zensical.org/docs/setup/basics/#copyright
|
||||
copyright = """
|
||||
Copyright © 2026 Kata Containers
|
||||
"""
|
||||
|
||||
# Zensical supports both implicit navigation and explicitly defined navigation.
|
||||
# If you decide not to define a navigation here then Zensical will simply
|
||||
# derive the navigation structure from the directory structure of your
|
||||
# "docs_dir". The definition below demonstrates how a navigation structure
|
||||
# can be defined using TOML syntax.
|
||||
#
|
||||
# Read more: https://zensical.org/docs/setup/navigation/
|
||||
# nav = [
|
||||
# { "Get started" = "index.md" },
|
||||
# { "Markdown in 5min" = "markdown.md" },
|
||||
# ]
|
||||
|
||||
# With the "extra_css" option you can add your own CSS styling to customize
|
||||
# your Zensical project according to your needs. You can add any number of
|
||||
# CSS files.
|
||||
#
|
||||
# The path provided should be relative to the "docs_dir".
|
||||
#
|
||||
# Read more: https://zensical.org/docs/customization/#additional-css
|
||||
#
|
||||
#extra_css = ["stylesheets/extra.css"]
|
||||
|
||||
# With the `extra_javascript` option you can add your own JavaScript to your
|
||||
# project to customize the behavior according to your needs.
|
||||
#
|
||||
# The path provided should be relative to the "docs_dir".
|
||||
#
|
||||
# Read more: https://zensical.org/docs/customization/#additional-javascript
|
||||
#extra_javascript = ["javascripts/extra.js"]
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Section for configuring theme options
|
||||
# ----------------------------------------------------------------------------
|
||||
[project.theme]
|
||||
|
||||
# Change this to "classic" to use the traditional Material for MkDocs look.
|
||||
#variant = "classic"
|
||||
|
||||
# Zensical allows you to override specific blocks, partials, or whole
|
||||
# templates as well as to define your own templates. To do this, uncomment
|
||||
# the custom_dir setting below and set it to a directory in which you
|
||||
# keep your template overrides.
|
||||
#
|
||||
# Read more:
|
||||
# - https://zensical.org/docs/customization/#extending-the-theme
|
||||
#
|
||||
#custom_dir = "overrides"
|
||||
|
||||
# With the "favicon" option you can set your own image to use as the icon
|
||||
# browsers will use in the browser title bar or tab bar. The path provided
|
||||
# must be relative to the "docs_dir".
|
||||
#
|
||||
# Read more:
|
||||
# - https://zensical.org/docs/setup/logo-and-icons/#favicon
|
||||
# - https://developer.mozilla.org/en-US/docs/Glossary/Favicon
|
||||
#
|
||||
favicon = "assets/favicon.svg"
|
||||
logo = "assets/favicon.svg"
|
||||
|
||||
# Zensical supports more than 60 different languages. This means that the
|
||||
# labels and tooltips that Zensical's templates produce are translated.
|
||||
# The "language" option allows you to set the language used. This language
|
||||
# is also indicated in the HTML head element to help with accessibility
|
||||
# and guide search engines and translation tools.
|
||||
#
|
||||
# The default language is "en" (English). It is possible to create
|
||||
# sites with multiple languages and configure a language selector. See
|
||||
# the documentation for details.
|
||||
#
|
||||
# Read more:
|
||||
# - https://zensical.org/docs/setup/language/
|
||||
#
|
||||
language = "en"
|
||||
|
||||
# Zensical provides a number of feature toggles that change the behavior
|
||||
# of the documentation site.
|
||||
features = [
|
||||
# Zensical includes an announcement bar. This feature allows users to
|
||||
# dismiss it when they have read the announcement.
|
||||
# https://zensical.org/docs/setup/header/#announcement-bar
|
||||
"announce.dismiss",
|
||||
|
||||
# If you have a repository configured and turn this feature on, Zensical
|
||||
# will generate an edit button for the page. This works for common
|
||||
# repository hosting services.
|
||||
# https://zensical.org/docs/setup/repository/#code-actions
|
||||
"content.action.edit",
|
||||
|
||||
# If you have a repository configured and turn this feature on, Zensical
|
||||
# will generate a button that allows the user to view the Markdown
|
||||
# code for the current page.
|
||||
# https://zensical.org/docs/setup/repository/#code-actions
|
||||
"content.action.view",
|
||||
|
||||
# Code annotations allow you to add an icon with a tooltip to your
|
||||
# code blocks to provide explanations at crucial points.
|
||||
# https://zensical.org/docs/authoring/code-blocks/#code-annotations
|
||||
"content.code.annotate",
|
||||
|
||||
# This feature turns on a button in code blocks that allow users to
|
||||
# copy the content to their clipboard without first selecting it.
|
||||
# https://zensical.org/docs/authoring/code-blocks/#code-copy-button
|
||||
"content.code.copy",
|
||||
|
||||
# Code blocks can include a button to allow for the selection of line
|
||||
# ranges by the user.
|
||||
# https://zensical.org/docs/authoring/code-blocks/#code-selection-button
|
||||
"content.code.select",
|
||||
|
||||
# Zensical can render footnotes as inline tooltips, so the user can read
|
||||
# the footnote without leaving the context of the document.
|
||||
# https://zensical.org/docs/authoring/footnotes/#footnote-tooltips
|
||||
"content.footnote.tooltips",
|
||||
|
||||
# If you have many content tabs that have the same titles (e.g., "Python",
|
||||
# "JavaScript", "Cobol"), this feature causes all of them to switch to
|
||||
# the same tab at the same time when the user chooses their language in one.
|
||||
# https://zensical.org/docs/authoring/content-tabs/#linked-content-tabs
|
||||
"content.tabs.link",
|
||||
|
||||
# TODO: not sure I understand this one? Is there a demo of this in the docs?
|
||||
# https://zensical.org/docs/authoring/tooltips/#improved-tooltips
|
||||
"content.tooltips",
|
||||
|
||||
# With this feature enabled, Zensical will automatically hide parts
|
||||
# of the header when the user scrolls past a certain point.
|
||||
# https://zensical.org/docs/setup/header/#automatic-hiding
|
||||
# "header.autohide",
|
||||
|
||||
# Turn on this feature to expand all collapsible sections in the
|
||||
# navigation sidebar by default.
|
||||
# https://zensical.org/docs/setup/navigation/#navigation-expansion
|
||||
# "navigation.expand",
|
||||
|
||||
# This feature turns on navigation elements in the footer that allow the
|
||||
# user to navigate to a next or previous page.
|
||||
# https://zensical.org/docs/setup/footer/#navigation
|
||||
"navigation.footer",
|
||||
|
||||
# When section index pages are enabled, documents can be directly attached
|
||||
# to sections, which is particularly useful for providing overview pages.
|
||||
# https://zensical.org/docs/setup/navigation/#section-index-pages
|
||||
"navigation.indexes",
|
||||
|
||||
# When instant navigation is enabled, clicks on all internal links will be
|
||||
# intercepted and dispatched via XHR without fully reloading the page.
|
||||
# https://zensical.org/docs/setup/navigation/#instant-navigation
|
||||
"navigation.instant",
|
||||
|
||||
# With instant prefetching, your site will start to fetch a page once the
|
||||
# user hovers over a link. This will reduce the perceived loading time
|
||||
# for the user.
|
||||
# https://zensical.org/docs/setup/navigation/#instant-prefetching
|
||||
"navigation.instant.prefetch",
|
||||
|
||||
# In order to provide a better user experience on slow connections when
|
||||
# using instant navigation, a progress indicator can be enabled.
|
||||
# https://zensical.org/docs/setup/navigation/#progress-indicator
|
||||
#"navigation.instant.progress",
|
||||
|
||||
# When navigation paths are activated, a breadcrumb navigation is rendered
|
||||
# above the title of each page.
|
||||
# https://zensical.org/docs/setup/navigation/#navigation-path
|
||||
"navigation.path",
|
||||
|
||||
# When pruning is enabled, only the visible navigation items are included
|
||||
# in the rendered HTML, reducing the size of the built site by 33% or more.
|
||||
# https://zensical.org/docs/setup/navigation/#navigation-pruning
|
||||
#"navigation.prune",
|
||||
|
||||
# When sections are enabled, top-level sections are rendered as groups in
|
||||
# the sidebar for viewports above 1220px, but remain as-is on mobile.
|
||||
# https://zensical.org/docs/setup/navigation/#navigation-sections
|
||||
"navigation.sections",
|
||||
|
||||
# When tabs are enabled, top-level sections are rendered in a menu layer
|
||||
# below the header for viewports above 1220px, but remain as-is on mobile.
|
||||
# https://zensical.org/docs/setup/navigation/#navigation-tabs
|
||||
#"navigation.tabs",
|
||||
|
||||
# When sticky tabs are enabled, navigation tabs will lock below the header
|
||||
# and always remain visible when scrolling down.
|
||||
# https://zensical.org/docs/setup/navigation/#sticky-navigation-tabs
|
||||
#"navigation.tabs.sticky",
|
||||
|
||||
# A back-to-top button can be shown when the user, after scrolling down,
|
||||
# starts to scroll up again.
|
||||
# https://zensical.org/docs/setup/navigation/#back-to-top-button
|
||||
"navigation.top",
|
||||
|
||||
# When anchor tracking is enabled, the URL in the address bar is
|
||||
# automatically updated with the active anchor as highlighted in the table
|
||||
# of contents.
|
||||
# https://zensical.org/docs/setup/navigation/#anchor-tracking
|
||||
"navigation.tracking",
|
||||
|
||||
# When search highlighting is enabled and a user clicks on a search result,
|
||||
# Zensical will highlight all occurrences after following the link.
|
||||
# https://zensical.org/docs/setup/search/#search-highlighting
|
||||
"search.highlight",
|
||||
|
||||
# When anchor following for the table of contents is enabled, the sidebar
|
||||
# is automatically scrolled so that the active anchor is always visible.
|
||||
# https://zensical.org/docs/setup/navigation/#anchor-following
|
||||
"toc.follow",
|
||||
|
||||
# When navigation integration for the table of contents is enabled, it is
|
||||
# always rendered as part of the navigation sidebar on the left.
|
||||
# https://zensical.org/docs/setup/navigation/#navigation-integration
|
||||
#"toc.integrate",
|
||||
]
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# In the "palette" subsection you can configure options for the color scheme.
|
||||
# You can configure different color schemes, e.g., to turn on dark mode,
|
||||
# that the user can switch between. Each color scheme can be further
|
||||
# customized.
|
||||
#
|
||||
# Read more:
|
||||
# - https://zensical.org/docs/setup/colors/
|
||||
# ----------------------------------------------------------------------------
|
||||
[[project.theme.palette]]
|
||||
scheme = "slate"
|
||||
toggle.icon = "lucide/moon"
|
||||
toggle.name = "Switch to light mode"
|
||||
primary = "indigo"
|
||||
accent = "orange"
|
||||
|
||||
[[project.theme.palette]]
|
||||
scheme = "default"
|
||||
toggle.icon = "lucide/sun"
|
||||
toggle.name = "Switch to dark mode"
|
||||
primary = "indigo"
|
||||
accent = "orange"
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# In the "font" subsection you can configure the fonts used. By default, fonts
|
||||
# are loaded from Google Fonts, giving you a wide range of choices from a set
|
||||
# of suitably licensed fonts. There are options for a normal text font and for
|
||||
# a monospaced font used in code blocks.
|
||||
# ----------------------------------------------------------------------------
|
||||
#[project.theme.font]
|
||||
#text = "Inter"
|
||||
#code = "Jetbrains Mono"
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# You can configure your own logo to be shown in the header using the "logo"
|
||||
# option in the "icons" subsection. The logo can be a path to a file in your
|
||||
# "docs_dir" or it can be a path to an icon.
|
||||
#
|
||||
# Likewise, you can customize the logo used for the repository section of the
|
||||
# header. Zensical derives the default logo for this from the repository URL.
|
||||
# See below...
|
||||
#
|
||||
# There are other icons you can customize. See the documentation for details.
|
||||
#
|
||||
# Read more:
|
||||
# - https://zensical.org/docs/setup/logo-and-icons
|
||||
# - https://zensical.org/docs/authoring/icons-emojis/#search
|
||||
# ----------------------------------------------------------------------------
|
||||
[project.theme.icon]
|
||||
#logo = "./images/logo.png"
|
||||
#repo = "lucide/smile"
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# The "extra" section contains miscellaneous settings.
|
||||
# ----------------------------------------------------------------------------
|
||||
#[[project.extra.social]]
|
||||
#icon = "fontawesome/brands/github"
|
||||
#link = "https://github.com/user/repo"
|
||||
|
||||
|
||||
[project.markdown_extensions.toc]
|
||||
permalink = true
|
||||
|
||||
[project.markdown_extensions.admonition]
|
||||
[project.markdown_extensions.pymdownx.highlight]
|
||||
anchor_linenums = true
|
||||
line_spans = "__span"
|
||||
pygments_lang_class = true
|
||||
[project.markdown_extensions.pymdownx.inlinehilite]
|
||||
[project.markdown_extensions.pymdownx.snippets]
|
||||
[project.markdown_extensions.pymdownx.superfences]
|
||||
Reference in New Issue
Block a user