mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-03-30 16:42:48 +00:00
Compare commits
55 Commits
dependabot
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a0e99a86cf | ||
|
|
012bf4b333 | ||
|
|
7dce05b5fc | ||
|
|
3c584a474f | ||
|
|
48ef2220e8 | ||
|
|
c96b2034dc | ||
|
|
b8576ef476 | ||
|
|
a747b9f774 | ||
|
|
302b2c8d75 | ||
|
|
7fa68ffd52 | ||
|
|
119a145923 | ||
|
|
9f6bce9517 | ||
|
|
b04260f926 | ||
|
|
26d41b8f6e | ||
|
|
004333ed71 | ||
|
|
8dae67794a | ||
|
|
65b2a75aca | ||
|
|
75ecfe3fe2 | ||
|
|
a923bb2917 | ||
|
|
1163b6581f | ||
|
|
29e5d5d951 | ||
|
|
0cf3243801 | ||
|
|
64735222c6 | ||
|
|
30e030e18e | ||
|
|
8cebcf0113 | ||
|
|
237729d728 | ||
|
|
f0ad9f1709 | ||
|
|
1b8189731a | ||
|
|
4fad88499c | ||
|
|
fb77c357f4 | ||
|
|
de3afd3076 | ||
|
|
cd931d4905 | ||
|
|
911aee5ad7 | ||
|
|
858620d2e7 | ||
|
|
8c2b7ed619 | ||
|
|
af7fdd5cd1 | ||
|
|
0d8186ae16 | ||
|
|
7e0f5e533a | ||
|
|
bcfb2354e0 | ||
|
|
caf6b244e6 | ||
|
|
fb5482f647 | ||
|
|
46aa318b74 | ||
|
|
ec9c57c595 | ||
|
|
8950f1caeb | ||
|
|
814ae53d77 | ||
|
|
27dfb0d06f | ||
|
|
7ae2282a99 | ||
|
|
fd583d833b | ||
|
|
eb4ce0e98b | ||
|
|
6a832dd1f3 | ||
|
|
79efe3e041 | ||
|
|
2728b493d5 | ||
|
|
5765bc97b4 | ||
|
|
ce65d17276 | ||
|
|
27bebfb438 |
1438
Cargo.lock
generated
1438
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
42
Cargo.toml
42
Cargo.toml
@@ -6,6 +6,12 @@ rust-version = "1.88"
|
||||
|
||||
[workspace]
|
||||
members = [
|
||||
# kata-agent
|
||||
"src/agent",
|
||||
"src/agent/rustjail",
|
||||
"src/agent/policy",
|
||||
"src/agent/vsock-exporter",
|
||||
|
||||
# Dragonball
|
||||
"src/dragonball",
|
||||
"src/dragonball/dbs_acpi",
|
||||
@@ -41,7 +47,6 @@ resolver = "2"
|
||||
|
||||
# TODO: Add all excluded crates to root workspace
|
||||
exclude = [
|
||||
"src/agent",
|
||||
"src/tools",
|
||||
"src/libs",
|
||||
|
||||
@@ -104,6 +109,7 @@ wasm_container = { path = "src/runtime-rs/crates/runtimes/wasm_container" }
|
||||
kata-sys-util = { path = "src/libs/kata-sys-util" }
|
||||
kata-types = { path = "src/libs/kata-types", features = ["safe-path"] }
|
||||
logging = { path = "src/libs/logging" }
|
||||
mem-agent = { path = "src/libs/mem-agent" }
|
||||
protocols = { path = "src/libs/protocols", features = ["async"] }
|
||||
runtime-spec = { path = "src/libs/runtime-spec" }
|
||||
safe-path = { path = "src/libs/safe-path" }
|
||||
@@ -112,35 +118,65 @@ test-utils = { path = "src/libs/test-utils" }
|
||||
|
||||
# Local dependencies from `src/agent`
|
||||
kata-agent-policy = { path = "src/agent/policy" }
|
||||
rustjail = { path = "src/agent/rustjail" }
|
||||
vsock-exporter = { path = "src/agent/vsock-exporter" }
|
||||
|
||||
# Outside dependencies
|
||||
actix-rt = "2.7.0"
|
||||
anyhow = "1.0"
|
||||
async-recursion = "0.3.2"
|
||||
async-trait = "0.1.48"
|
||||
capctl = "0.2.0"
|
||||
cfg-if = "1.0.0"
|
||||
cgroups = { package = "cgroups-rs", git = "https://github.com/kata-containers/cgroups-rs", rev = "v0.3.5" }
|
||||
clap = { version = "4.5.40", features = ["derive"] }
|
||||
const_format = "0.2.30"
|
||||
containerd-shim = { version = "0.10.0", features = ["async"] }
|
||||
containerd-shim-protos = { version = "0.10.0", features = ["async"] }
|
||||
derivative = "2.2.0"
|
||||
futures = "0.3.30"
|
||||
go-flag = "0.1.0"
|
||||
hyper = "0.14.20"
|
||||
hyperlocal = "0.8.0"
|
||||
ipnetwork = "0.17.0"
|
||||
lazy_static = "1.4"
|
||||
libc = "0.2"
|
||||
libc = "0.2.94"
|
||||
log = "0.4.14"
|
||||
netlink-packet-core = "0.7.0"
|
||||
netlink-packet-route = "0.19.0"
|
||||
netlink-sys = { version = "0.7.0", features = ["tokio_socket"] }
|
||||
netns-rs = "0.1.0"
|
||||
# Note: nix needs to stay sync'd with libs versions
|
||||
nix = "0.26.4"
|
||||
oci-spec = { version = "0.8.1", features = ["runtime"] }
|
||||
opentelemetry = { version = "0.17.0", features = ["rt-tokio"] }
|
||||
procfs = "0.12.0"
|
||||
prometheus = { version = "0.14.0", features = ["process"] }
|
||||
protobuf = "3.7.2"
|
||||
rand = "0.8.4"
|
||||
regex = "1.10.5"
|
||||
rstest = "0.18.0"
|
||||
rtnetlink = "0.14.0"
|
||||
scan_fmt = "0.2.6"
|
||||
scopeguard = "1.0.0"
|
||||
serde = { version = "1.0.145", features = ["derive"] }
|
||||
serde_json = "1.0.91"
|
||||
serial_test = "0.10.0"
|
||||
sha2 = "0.10.9"
|
||||
slog = "2.5.2"
|
||||
slog-scope = "4.4.0"
|
||||
slog-stdlog = "4.0.0"
|
||||
slog-term = "2.9.0"
|
||||
strum = { version = "0.24.0", features = ["derive"] }
|
||||
strum_macros = "0.26.2"
|
||||
tempfile = "3.19.1"
|
||||
thiserror = "1.0"
|
||||
thiserror = "1.0.26"
|
||||
tokio = "1.46.1"
|
||||
tokio-vsock = "0.3.4"
|
||||
toml = "0.5.8"
|
||||
tracing = "0.1.41"
|
||||
tracing-opentelemetry = "0.18.0"
|
||||
tracing-subscriber = "0.3.20"
|
||||
ttrpc = "0.8.4"
|
||||
url = "2.5.4"
|
||||
which = "4.3.0"
|
||||
|
||||
@@ -74,7 +74,7 @@ See the [official documentation](docs) including:
|
||||
- [Developer guide](docs/Developer-Guide.md)
|
||||
- [Design documents](docs/design)
|
||||
- [Architecture overview](docs/design/architecture)
|
||||
- [Architecture 3.0 overview](docs/design/architecture_3.0/)
|
||||
- [Architecture 4.0 overview](docs/design/architecture_4.0/)
|
||||
|
||||
## Configuration
|
||||
|
||||
|
||||
@@ -522,10 +522,18 @@ $ sudo kata-runtime check
|
||||
If your system is *not* able to run Kata Containers, the previous command will error out and explain why.
|
||||
|
||||
# Run Kata Containers with Containerd
|
||||
|
||||
Refer to the [How to use Kata Containers and Containerd](how-to/containerd-kata.md) how-to guide.
|
||||
|
||||
# Run Kata Containers with Kubernetes
|
||||
Refer to the [Run Kata Containers with Kubernetes](how-to/run-kata-with-k8s.md) how-to guide.
|
||||
|
||||
- Containerd
|
||||
|
||||
Refer to the [How to use Kata Containers and Containerd with Kubernetes](how-to/how-to-use-k8s-with-containerd-and-kata.md) how-to guide.
|
||||
|
||||
- CRI-O
|
||||
|
||||
Refer to the [How to use Kata Containers and CRI-O with Kubernetes](how-to/how-to-use-k8s-with-crio-and-kata.md) how-to guide.
|
||||
|
||||
# Troubleshoot Kata Containers
|
||||
|
||||
|
||||
@@ -32,4 +32,4 @@ runtime. Refer to the following guides on how to set up Kata
|
||||
Containers with Kubernetes:
|
||||
|
||||
- [How to use Kata Containers and containerd](../../how-to/containerd-kata.md)
|
||||
- [Run Kata Containers with Kubernetes](../../how-to/run-kata-with-k8s.md)
|
||||
- [How to use Kata Containers and CRI-O with Kubernetes](../../how-to/how-to-use-k8s-with-crio-and-kata.md)
|
||||
|
||||
@@ -1,168 +0,0 @@
|
||||
# Kata 3.0 Architecture
|
||||
## Overview
|
||||
In cloud-native scenarios, there is an increased demand for container startup speed, resource consumption, stability, and security, areas where the present Kata Containers runtime is challenged relative to other runtimes. To achieve this, we propose a solid, field-tested and secure Rust version of the kata-runtime.
|
||||
|
||||
Also, we provide the following designs:
|
||||
|
||||
- Turn key solution with builtin `Dragonball` Sandbox
|
||||
- Async I/O to reduce resource consumption
|
||||
- Extensible framework for multiple services, runtimes and hypervisors
|
||||
- Lifecycle management for sandbox and container associated resources
|
||||
|
||||
### Rationale for choosing Rust
|
||||
|
||||
We chose Rust because it is designed as a system language with a focus on efficiency.
|
||||
In contrast to Go, Rust makes a variety of design trade-offs in order to obtain
|
||||
good execution performance, with innovative techniques that, in contrast to C or
|
||||
C++, provide reasonable protection against common memory errors (buffer
|
||||
overflow, invalid pointers, range errors), error checking (ensuring errors are
|
||||
dealt with), thread safety, ownership of resources, and more.
|
||||
|
||||
These benefits were verified in our project when the Kata Containers guest agent
|
||||
was rewritten in Rust. We notably saw a significant reduction in memory usage
|
||||
with the Rust-based implementation.
|
||||
|
||||
|
||||
## Design
|
||||
### Architecture
|
||||

|
||||
### Built-in VMM
|
||||
#### Current Kata 2.x architecture
|
||||

|
||||
As shown in the figure, the runtime and the VMM are separate processes. The runtime process forks the VMM process and interacts with it through inter-process RPC. Typically, process interaction consumes more resources than interaction between peers within the same process, and it results in relatively low efficiency. At the same time, the cost of resource operation and maintenance should be considered. For example, when performing resource recovery under abnormal conditions, the exception of any process must be detected by the others, which must then activate the appropriate resource recovery process. If there are additional processes, the recovery becomes even more difficult.
|
||||
#### How To Support Built-in VMM
|
||||
We provide `Dragonball` Sandbox to enable built-in VMM by integrating VMM's function into the Rust library. We could perform VMM-related functionalities by using the library. Because runtime and VMM are in the same process, there is a benefit in terms of message processing speed and API synchronization. It can also guarantee the consistency of the runtime and the VMM life cycle, reducing resource recovery and exception handling maintenance, as shown in the figure:
|
||||

|
||||
### Async Support
|
||||
#### Why Need Async
|
||||
**Async is already in stable Rust and allows us to write async code**
|
||||
|
||||
- Async provides significantly reduced CPU and memory overhead, especially for workloads with a large amount of IO-bound tasks
|
||||
- Async is zero-cost in Rust, which means that you only pay for what you use. Specifically, you can use async without heap allocations and dynamic dispatch, which greatly improves efficiency
|
||||
- For more (see [Why Async?](https://rust-lang.github.io/async-book/01_getting_started/02_why_async.html) and [The State of Asynchronous Rust](https://rust-lang.github.io/async-book/01_getting_started/03_state_of_async_rust.html)).
|
||||
|
||||
**There may be several problems if implementing kata-runtime with Sync Rust**
|
||||
|
||||
- Too many threads with a new TTRPC connection
|
||||
- TTRPC threads: reaper thread(1) + listener thread(1) + client handler(2)
|
||||
- Add 3 I/O threads with a new container
|
||||
- In Sync mode, implementing a timeout mechanism is challenging. For example, in TTRPC API interaction, the timeout mechanism is difficult to align with Golang
|
||||
#### How To Support Async
|
||||
The kata-runtime uses `TOKIO_RUNTIME_WORKER_THREADS` to control the number of OS threads, which is 2 threads by default. TTRPC and container-related threads run in `tokio` threads in a unified manner, and related dependencies need to be switched to Async, such as Timer, File, Netlink, etc. With the help of Async, we can easily support non-blocking I/O and timers. Currently, we only utilize Async for the kata-runtime. The built-in VMM keeps the OS thread model because it can ensure that the threads are controllable.
|
||||
|
||||
**For N `tokio` worker threads and M containers**
|
||||
|
||||
- Sync runtime (both OS threads and `tokio` tasks are OS threads, without `tokio` worker threads) OS thread number: 4 + 12*M
|
||||
- Async runtime(only OS thread is OS thread) OS thread number: 2 + N
|
||||
```shell
|
||||
├─ main(OS thread)
|
||||
├─ async-logger(OS thread)
|
||||
└─ tokio worker(N * OS thread)
|
||||
├─ agent log forwarder(1 * tokio task)
|
||||
├─ health check thread(1 * tokio task)
|
||||
├─ TTRPC reaper thread(M * tokio task)
|
||||
├─ TTRPC listener thread(M * tokio task)
|
||||
├─ TTRPC client handler thread(7 * M * tokio task)
|
||||
├─ container stdin io thread(M * tokio task)
|
||||
├─ container stdout io thread(M * tokio task)
|
||||
└─ container stderr io thread(M * tokio task)
|
||||
```
|
||||
### Extensible Framework
|
||||
The Kata 3.x runtime is designed with the extension of service, runtime, and hypervisor, combined with configuration to meet the needs of different scenarios. At present, the service provides a register mechanism to support multiple services. Services could interact with runtime through messages. In addition, the runtime handler handles messages from services. To meet the needs of a binary that supports multiple runtimes and hypervisors, the startup must obtain the runtime handler type and hypervisor type through configuration.
|
||||
|
||||

|
||||
### Resource Manager
|
||||
In our case, there will be a variety of resources, and every resource has several subtypes. Especially for `Virt-Container`, every subtype of resource has different operations. And there may be dependencies, such as the share-fs rootfs and the share-fs volume will use share-fs resources to share files to the VM. Currently, network and share-fs are regarded as sandbox resources, while rootfs, volume, and cgroup are regarded as container resources. Also, we abstract a common interface for each resource and use subclass operations to evaluate the differences between different subtypes.
|
||||

|
||||
|
||||
## Roadmap
|
||||
|
||||
- Stage 1 (June): provide basic features (currently delivered)
|
||||
- Stage 2 (September): support common features
|
||||
- Stage 3: support full features
|
||||
|
||||
| **Class** | **Sub-Class** | **Development Stage** | **Status** |
|
||||
| -------------------------- | ------------------- | --------------------- |------------|
|
||||
| Service | task service | Stage 1 | ✅ |
|
||||
| | extend service | Stage 3 | 🚫 |
|
||||
| | image service | Stage 3 | 🚫 |
|
||||
| Runtime handler | `Virt-Container` | Stage 1 | ✅ |
|
||||
| Endpoint | VETH Endpoint | Stage 1 | ✅ |
|
||||
| | Physical Endpoint | Stage 2 | ✅ |
|
||||
| | Tap Endpoint | Stage 2 | ✅ |
|
||||
| | `Tuntap` Endpoint | Stage 2 | ✅ |
|
||||
| | `IPVlan` Endpoint | Stage 2 | ✅ |
|
||||
| | `MacVlan` Endpoint | Stage 2 | ✅ |
|
||||
| | MACVTAP Endpoint | Stage 3 | 🚫 |
|
||||
| | `VhostUserEndpoint` | Stage 3 | 🚫 |
|
||||
| Network Interworking Model | Tc filter | Stage 1 | ✅ |
|
||||
| | `MacVtap` | Stage 3 | 🚧 |
|
||||
| Storage | Virtio-fs | Stage 1 | ✅ |
|
||||
| | `nydus` | Stage 2 | 🚧 |
|
||||
| | `device mapper` | Stage 2 | 🚫 |
|
||||
| `Cgroup V2` | | Stage 2 | 🚧 |
|
||||
| Hypervisor | `Dragonball` | Stage 1 | 🚧 |
|
||||
| | QEMU | Stage 2 | 🚫 |
|
||||
| | Cloud Hypervisor | Stage 3 | 🚫 |
|
||||
| | Firecracker | Stage 3 | 🚫 |
|
||||
|
||||
## FAQ
|
||||
|
||||
- Are the "service", "message dispatcher" and "runtime handler" all part of the single Kata 3.x runtime binary?
|
||||
|
||||
Yes. They are components in Kata 3.x runtime. And they will be packed into one binary.
|
||||
1. Service is an interface, which is responsible for handling multiple services such as the task service, image service, etc.
|
||||
2. The message dispatcher is used to match multiple requests from the service module and route them to the appropriate handler.
|
||||
3. Runtime handler is used to deal with the operation for sandbox and container.
|
||||
- What is the name of the Kata 3.x runtime binary?
|
||||
|
||||
Apparently we can't use `containerd-shim-v2-kata` because it's already used. We are facing the hardest issue of "naming" again. Any suggestions are welcome.
|
||||
Internally we use `containerd-shim-v2-rund`.
|
||||
|
||||
- Is the Kata 3.x design compatible with the containerd shimv2 architecture?
|
||||
|
||||
Yes. It is designed to follow the functionality of the Go version of Kata, and it implements the `containerd shim v2` interface/protocol.
|
||||
|
||||
- How will users migrate to the Kata 3.x architecture?
|
||||
|
||||
The migration plan will be provided before Kata 3.x is merged into the main branch.
|
||||
|
||||
- Is `Dragonball` limited to its own built-in VMM? Can the `Dragonball` system be configured to work using an external `Dragonball` VMM/hypervisor?
|
||||
|
||||
`Dragonball` could work as an external hypervisor. However, stability and performance are challenging in this case. The built-in VMM can optimise the container overhead, and it is easier to maintain stability.
|
||||
|
||||
`runD` is the `containerd-shim-v2` counterpart of `runC` and can run a pod/containers. `Dragonball` is a `microvm`/VMM that is designed to run container workloads. Instead of `microvm`/VMM, we sometimes refer to it as secure sandbox.
|
||||
|
||||
- QEMU, Cloud Hypervisor and Firecracker support are planned, but how that would work. Are they working in separate process?
|
||||
|
||||
Yes. They are unable to work as a built-in VMM.
|
||||
|
||||
- What is `upcall`?
|
||||
|
||||
The `upcall` is used to hotplug CPU/memory/MMIO devices, and it solves two issues.
|
||||
1. avoid dependency on PCI/ACPI
|
||||
2. avoid dependency on `udevd` within the guest and get deterministic results for hotplug operations. So `upcall` is an alternative to ACPI-based CPU/memory/device hotplug. And we may cooperate with the community to add support for ACPI-based CPU/memory/device hotplug if needed.
|
||||
|
||||
`Dbs-upcall` is a `vsock-based` direct communication tool between the VMM and guests. The server side of the `upcall` is a driver in the guest kernel (kernel patches are needed for this feature) and it'll start to serve requests once the kernel has started. And the client side is in the VMM; it'll be a thread that communicates with VSOCK through `uds`. We have accomplished device hotplug / hot-unplug directly through `upcall` in order to avoid virtualization of ACPI and to minimize the virtual machine's overhead. And there could be many other usages of this direct communication channel. It's already open source.
|
||||
https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-upcall
|
||||
|
||||
- The URL below says the kernel patches work with 4.19, but do they also work with 5.15+ ?
|
||||
|
||||
Forward compatibility should be achievable; we have ported it to a 5.10-based kernel.
|
||||
|
||||
- Are these patches platform-specific or would they work for any architecture that supports VSOCK?
|
||||
|
||||
It's almost platform-independent, but some messages related to CPU hotplug are platform-dependent.
|
||||
|
||||
- Could the kernel driver be replaced with a userland daemon in the guest using loopback VSOCK?
|
||||
|
||||
We need to create device nodes for hot-added CPU/memory/devices, so it's not easy for a userspace daemon to do these tasks.
|
||||
|
||||
- The fact that `upcall` allows communication between the VMM and the guest suggests that this architecture might be incompatible with https://github.com/confidential-containers where the VMM should have no knowledge of what happens inside the VM.
|
||||
|
||||
1. `TDX` doesn't support CPU/memory hotplug yet.
|
||||
2. ACPI-based device hotplug depends on the ACPI `DSDT` table, and the guest kernel will execute `ASL` code to handle those hotplug events. And it should be easier to audit VSOCK-based communication than ACPI `ASL` methods.
|
||||
|
||||
- What is the security boundary for the monolithic / "Built-in VMM" case?
|
||||
|
||||
It has the security boundary of virtualization. More details will be provided in the next stage.
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 95 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 66 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 136 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 72 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 139 KiB |
File diff suppressed because one or more lines are too long
433
docs/design/architecture_4.0/README.md
Normal file
433
docs/design/architecture_4.0/README.md
Normal file
@@ -0,0 +1,433 @@
|
||||
# Kata Containers 4.0 Architecture (Rust Runtime)
|
||||
|
||||
## Overview
|
||||
|
||||
Kata Containers 4.0 represents a significant architectural evolution, moving beyond the limitations of legacy multi-process container runtimes. Driven by a modern Rust-based stack, this release transitions to an asynchronous, unified architecture that drastically reduces resource consumption and latency.
|
||||
|
||||
By consolidating the entire runtime into a single, high-performance binary, Kata 4.0 eliminates the overhead of cross-process communication and streamlines the container lifecycle. The result is a secure, production-tested runtime capable of handling high-density workloads with efficiency. With built-in support for diverse container abstractions and optimized hypervisor integration, Kata 4.0 delivers the agility and robustness required by modern, cloud-native infrastructure.
|
||||
|
||||
---
|
||||
|
||||
## 1. Architecture Overview
|
||||
|
||||
The Kata Containers Rust Runtime is designed to minimize resource overhead and startup latency. It achieves this by shifting from traditional process-based management to a more integrated, Rust-native control flow.
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
containerd["containerd"] --> shimv2["containerd-shim-kata-v2 (shimv2)"]
|
||||
|
||||
subgraph BuiltIn["Built-in VMM (Integrated Mode)"]
|
||||
direction TB
|
||||
subgraph shimv2_bi["shimv2 process (Single Process)"]
|
||||
runtime_bi["shimv2 runtime"]
|
||||
subgraph dragonball["Dragonball VMM (library)"]
|
||||
helpers_bi["virtiofs / nydus\n(BuiltIn)"]
|
||||
end
|
||||
runtime_bi -->|"direct function calls"| dragonball
|
||||
end
|
||||
subgraph guestvm_bi["Guest VM"]
|
||||
agent_bi["kata-agent"]
|
||||
end
|
||||
shimv2_bi -->|"hybrid-vsock"| guestvm_bi
|
||||
end
|
||||
|
||||
subgraph OptionalVMM["Optional VMM (External Mode)"]
|
||||
direction TB
|
||||
shimv2_ext["shimv2 process"]
|
||||
imagesrvd_ext["virtiofsd / nydusd\n(Independent Process)"]
|
||||
ext_vmm["External VMM process\n(QEMU / Cloud-Hypervisor / Firecracker)"]
|
||||
subgraph guestvm_ext["Guest VM"]
|
||||
agent_ext["kata-agent"]
|
||||
end
|
||||
shimv2_ext -->|"fork + IPC/RPC"| ext_vmm
|
||||
shimv2_ext -->|"manages"| imagesrvd_ext
|
||||
ext_vmm -->|"vsock / hybrid-vsock"| guestvm_ext
|
||||
end
|
||||
|
||||
shimv2 --> BuiltIn
|
||||
shimv2 --> OptionalVMM
|
||||
|
||||
classDef process fill:#d0e8ff,stroke:#336,stroke-width:1px
|
||||
classDef vm fill:#d4edda,stroke:#155724,stroke-width:1px
|
||||
classDef agent fill:#fff3cd,stroke:#856404,stroke-width:1px
|
||||
class shimv2,runtime_bi,shimv2_ext,helpers_bi,imagesrvd_ext,ext_vmm process
|
||||
class guestvm_bi,guestvm_ext vm
|
||||
class agent_bi,agent_ext agent
|
||||
```
|
||||
|
||||
The runtime employs a **flexible VMM strategy**, supporting both `built-in` and `optional` VMMs. This allows users to choose between a tightly integrated VMM (e.g., Dragonball) for peak performance, or external options (e.g., QEMU, Cloud-Hypervisor, Firecracker) for enhanced compatibility and modularity.
|
||||
|
||||
### A. Built-in VMM (Integrated Mode)
|
||||
|
||||
The built-in VMM mode is the default and recommended configuration for users, as it offers superior performance and resource efficiency.
|
||||
|
||||
In this mode, the VMM (`Dragonball`) is **deeply integrated** into the `shimv2`'s lifecycle. This eliminates the overhead of IPC, enabling lower-latency message processing and tight API synchronization. Moreover, it ensures the runtime and VMM share a unified lifecycle, simplifying exception handling and resource cleanup.
|
||||
|
||||
* **Integrated Management**: The `shimv2` directly controls the VMM and its critical helper services (`virtiofsd` or `nydusd`).
|
||||
* **Performance**: By eliminating external process overhead and complex inter-process communication (IPC), this mode achieves faster container startup and higher resource density.
|
||||
* **Core Technology**: Primarily utilizes **Dragonball**, the native Rust-based VMM optimized and dedicated for cloud-native scenarios.
|
||||
|
||||
> **Note**: The built-in VMM mode is the default and recommended configuration for users, as it offers superior performance and resource efficiency.
|
||||
|
||||
### B. Optional VMM (External Mode)
|
||||
|
||||
The optional VMM mode is available for users with specific requirements that necessitate external hypervisor support.
|
||||
|
||||
In this mode, the runtime and the VMM operate as separate, decoupled processes. The runtime forks the VMM process and interacts with it via inter-process RPC, and the `containerd-shim-kata-v2` (`shimv2` for short) manages the VMM as an **external process**.
|
||||
|
||||
* **Decoupled Lifecycle**: The `shimv2` communicates with the VMM (e.g., QEMU, Cloud-Hypervisor, or Firecracker) via vsock/hybrid vsock.
|
||||
* **Flexibility**: Ideal for environments that require specific hypervisor hardware emulation or legacy compatibility.
|
||||
|
||||
> **Note**: This approach (Optional VMM) introduces overhead due to context switching and cross-process communication. Furthermore, managing resources across process boundaries—especially during abnormal conditions—introduces significant complexity in error detection and recovery.
|
||||
|
||||
---
|
||||
|
||||
## Core Architectural Principles
|
||||
|
||||
* **Safety via Rust**: Leveraging Rust's ownership and type systems to eliminate memory-related vulnerabilities (buffer overflows, dangling pointers) by design.
|
||||
* **Performance via Async**: Utilizing Tokio to handle high-concurrency I/O, reducing the OS thread footprint by an order of magnitude.
|
||||
* **Built-in VMM**: A modular, library-based approach to virtualization, enabling tighter integration with the runtime.
|
||||
* **Pluggable Framework**: A clean abstraction layer allowing seamless swapping of hypervisors, network interfaces, and storage backends.
|
||||
|
||||
---
|
||||
|
||||
## Design Deep Dive
|
||||
|
||||
### Built-in VMM Integration (Dragonball)
|
||||
|
||||
The legacy Kata 2.x architecture relied on inter-process communication (IPC) between the runtime and the VMM. This introduced context-switching latency and complex error-recovery requirements across process boundaries. In contrast, the built-in VMM approach embeds the VMM directly within the runtime's process space. This eliminates IPC overhead, allowing for direct function calls and shared memory access, resulting in significantly reduced startup times and improved performance.
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
subgraph HostProcess["Host Process:containerd-shim-kata-v2 (shimv2)"]
|
||||
shimv2["shimv2 runtime"]
|
||||
end
|
||||
|
||||
imagesrvd["virtiofsd / nydusd\n(Independent Process)"]
|
||||
|
||||
subgraph ExtVMMProc["External VMM Process (e.g., QEMU)"]
|
||||
vmm["VMM\n(QEMU / Cloud-Hypervisor\n/ Firecracker)"]
|
||||
end
|
||||
|
||||
subgraph GuestVM["Guest VM"]
|
||||
agent["kata-agent"]
|
||||
end
|
||||
|
||||
shimv2 -->|"fork + IPC / RPC"| vmm
|
||||
shimv2 -->|"manages"| imagesrvd
|
||||
vmm -->|"vsock / hybrid-vsock"| GuestVM
|
||||
|
||||
classDef proc fill:#d0e8ff,stroke:#336,stroke-width:1px
|
||||
classDef vm fill:#d4edda,stroke:#155724,stroke-width:1px
|
||||
classDef ag fill:#fff3cd,stroke:#856404,stroke-width:1px
|
||||
class shimv2,imagesrvd,vmm proc
|
||||
class agent ag
|
||||
```
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
subgraph SingleProcess["Single Process: containerd-shim-kata-v2 (shimv2)"]
|
||||
shimv2["shimv2 runtime"]
|
||||
subgraph dragonball["Dragonball VMM (library)"]
|
||||
helpers["virtiofs / nydus\n(BuiltIn)"]
|
||||
end
|
||||
shimv2 -->|"direct function calls"| dragonball
|
||||
end
|
||||
|
||||
subgraph GuestVM["Guest VM"]
|
||||
agent["kata-agent"]
|
||||
end
|
||||
|
||||
dragonball -->|"hybrid-vsock"| GuestVM
|
||||
|
||||
classDef proc fill:#d0e8ff,stroke:#336,stroke-width:1px
|
||||
classDef vm fill:#d4edda,stroke:#155724,stroke-width:1px
|
||||
classDef ag fill:#fff3cd,stroke:#856404,stroke-width:1px
|
||||
class shimv2,helpers proc
|
||||
class agent ag
|
||||
```
|
||||
|
||||
By integrating Dragonball directly as a library, we eliminate the need for heavy IPC.
|
||||
|
||||
* **API Synchronization**: Direct function calls replace RPCs, reducing latency.
|
||||
* **Unified Lifecycle**: The runtime and VMM share a single process lifecycle, significantly simplifying resource cleanup and fault isolation.
|
||||
|
||||
### Layered Architecture
|
||||
|
||||
The Kata 4.0 runtime utilizes a highly modular, layered architecture designed to decouple high-level service requests from low-level infrastructure execution. This design facilitates extensibility, allowing the system to support diverse container types and the built-in `Dragonball` VMM within a single, unified Rust binary, while also supporting other hypervisors as optional VMMs.
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
subgraph L1["Layer 1 — Service & Orchestration Layer"]
|
||||
TaskSvc["Task Service"]
|
||||
ImageSvc["Image Service"]
|
||||
OtherSvc["Other Services"]
|
||||
Dispatcher["Message Dispatcher"]
|
||||
TaskSvc --> Dispatcher
|
||||
ImageSvc --> Dispatcher
|
||||
OtherSvc --> Dispatcher
|
||||
end
|
||||
|
||||
subgraph L2["Layer 2 — Management & Handler Layer"]
|
||||
subgraph RuntimeHandler["Runtime Handler"]
|
||||
SandboxMgr["Sandbox Manager"]
|
||||
ContainerMgr["Container Manager"]
|
||||
end
|
||||
subgraph ContainerAbstractions["Container Abstractions"]
|
||||
LinuxContainer["LinuxContainer"]
|
||||
VirtContainer["VirtContainer"]
|
||||
WasmContainer["WasmContainer"]
|
||||
end
|
||||
end
|
||||
|
||||
subgraph L3["Layer 3 — Infrastructure Abstraction Layer"]
|
||||
subgraph HypervisorIface["Hypervisor Interface"]
|
||||
Qemu["Qemu"]
|
||||
CloudHV["Cloud Hypervisor"]
|
||||
Firecracker["Firecracker"]
|
||||
Dragonball["Dragonball"]
|
||||
end
|
||||
subgraph ResourceMgr["Resource Manager"]
|
||||
Sharedfs["Sharedfs"]
|
||||
Network["Network"]
|
||||
Rootfs["Rootfs"]
|
||||
Volume["Volume"]
|
||||
Cgroup["Cgroup"]
|
||||
end
|
||||
end
|
||||
|
||||
subgraph L4["Layer 4 — Built-in Dragonball VMM Layer"]
|
||||
BuiltinDB["Builtin Dragonball"]
|
||||
end
|
||||
|
||||
Dispatcher --> RuntimeHandler
|
||||
RuntimeHandler --> ContainerAbstractions
|
||||
ContainerAbstractions --> HypervisorIface
|
||||
ContainerAbstractions --> ResourceMgr
|
||||
Dragonball --> BuiltinDB
|
||||
|
||||
classDef svc fill:#cce5ff,stroke:#004085,stroke-width:1px
|
||||
classDef handler fill:#d4edda,stroke:#155724,stroke-width:1px
|
||||
classDef infra fill:#fff3cd,stroke:#856404,stroke-width:1px
|
||||
classDef builtin fill:#f8d7da,stroke:#721c24,stroke-width:1px
|
||||
class TaskSvc,ImageSvc,OtherSvc,Dispatcher svc
|
||||
class SandboxMgr,ContainerMgr,LinuxContainer,VirtContainer,WasmContainer handler
|
||||
class Qemu,CloudHV,Firecracker,Dragonball,Sharedfs,Network,Rootfs,Volume,Cgroup infra
|
||||
class BuiltinDB builtin
|
||||
```
|
||||
|
||||
#### Service & Orchestration Layer
|
||||
|
||||
* **Service Layer**: The entry point for the runtime, providing specialized interfaces for external callers (e.g., `containerd`). It includes:
|
||||
* **Task Service**: Manages the lifecycle of containerized processes.
|
||||
* **Image Service**: Handles container image operations.
|
||||
* **Other Services**: An extensible framework allowing for custom modules.
|
||||
|
||||
* **Message Dispatcher**: Acts as a centralized traffic controller. It parses requests from the Service layer and routes them to the appropriate **Runtime Handler**, ensuring efficient message multiplexing.
|
||||
|
||||
#### Management & Handler Layer
|
||||
|
||||
* **Runtime Handler**: The core processing engine. It abstracts the underlying workload, enabling the runtime to handle various container types through:
|
||||
* **Sandbox Manager**: Orchestrates the lifecycle of the entire Pod (Sandbox).
|
||||
* **Container Manager**: Manages individual containers within a Sandbox.
|
||||
|
||||
* **Container Abstractions**: The framework is agnostic to the container implementation, with explicit support paths for:
|
||||
* **LinuxContainer** (Standard/OCI)
|
||||
* **VirtContainer** (Virtualization-based)
|
||||
* **WasmContainer** (WebAssembly-based)
|
||||
|
||||
#### Infrastructure Abstraction Layer
|
||||
|
||||
This layer provides standardized interfaces for hardware and resource management, regardless of the underlying backend.
|
||||
|
||||
* **Hypervisor Interface**: A pluggable architecture supporting multiple virtualization backends, including **Qemu**, **Cloud Hypervisor**, **Firecracker**, and **Dragonball**.
|
||||
|
||||
* **Resource Manager**: A unified interface for managing critical infrastructure components:
|
||||
* **Sharedfs, Network, Rootfs, Volume, and cgroup management**.
|
||||
|
||||
#### Built-in Dragonball VMM Layer
|
||||
|
||||
Representing the core of the high-performance runtime, the `Builtin Dragonball` block demonstrates deep integration between the runtime and the hypervisor.
|
||||
|
||||
#### Key Architectural Advantages
|
||||
|
||||
* **Uniformity**: By consolidating these layers into a single binary, the runtime ensures a consistent state across all sub-modules, preventing the "split-brain" scenarios common in multi-process runtimes.
|
||||
* **Modularity**: The clear separation between the **Message Dispatcher** and the **Runtime Handler** allows developers to introduce new container types (e.g., WASM) or hypervisors without modifying existing core logic.
|
||||
* **Efficiency**: The direct integration of `Dragonball` as a library allows for "Zero-Copy" resource management and direct API access, which drastically improves performance compared to traditional RPC-based hypervisor interaction.
|
||||
|
||||
### Extensible Framework
|
||||
|
||||
The Kata Rust runtime features a modular design that supports diverse services, runtimes, and hypervisors. We utilize a registration mechanism to decouple service logic from the core runtime. At startup, the runtime resolves the required runtime handler and hypervisor types based on configuration.
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
API["API"]
|
||||
|
||||
subgraph Services["Configurable Services"]
|
||||
TaskSvc["Task Service"]
|
||||
ImageSvc["Image Service"]
|
||||
OtherSvc["Other Service"]
|
||||
end
|
||||
|
||||
Msg(["Message Dispatcher"])
|
||||
|
||||
subgraph Handlers["Configurable Runtime Handlers"]
|
||||
WasmC["WasmContainer"]
|
||||
VirtC["VirtContainer"]
|
||||
LinuxC["LinuxContainer"]
|
||||
end
|
||||
|
||||
subgraph HVs["Configurable Hypervisors"]
|
||||
DB["Dragonball"]
|
||||
QEMU["QEMU"]
|
||||
CH["Cloud Hypervisor"]
|
||||
FC["Firecracker"]
|
||||
end
|
||||
|
||||
API --> Services
|
||||
Services --> Msg
|
||||
Msg --> Handlers
|
||||
Handlers --> HVs
|
||||
|
||||
classDef api fill:#d0e8ff,stroke:#336,stroke-width:1px
|
||||
classDef svc fill:#e2d9f3,stroke:#6610f2,stroke-width:1px
|
||||
classDef msg fill:#fff3cd,stroke:#856404,stroke-width:1px
|
||||
classDef handler fill:#d4edda,stroke:#155724,stroke-width:1px
|
||||
classDef hv fill:#f8d7da,stroke:#721c24,stroke-width:1px
|
||||
class API api
|
||||
class TaskSvc,ImageSvc,OtherSvc svc
|
||||
class Msg msg
|
||||
class WasmC,VirtC,LinuxC handler
|
||||
class DB,QEMU,CH,FC hv
|
||||
```
|
||||
|
||||
### Modular Resource Manager
|
||||
|
||||
Managing diverse resources—from Virtio-fs volumes to Cgroup V2—is handled by an abstracted resource manager. Each resource type implements a common trait, enabling uniform lifecycle hooks and deterministic dependency resolution.
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
RM["Resource Manager"]
|
||||
|
||||
subgraph SandboxRes["Sandbox Resources"]
|
||||
Network["Network Entity"]
|
||||
SharedFs["Shared FS"]
|
||||
end
|
||||
|
||||
subgraph ContainerRes["Container Resources"]
|
||||
Rootfs["Rootfs"]
|
||||
Cgroup["Cgroup"]
|
||||
Volume["Volume"]
|
||||
end
|
||||
|
||||
RM --> Network
|
||||
RM --> SharedFs
|
||||
RM --> Rootfs
|
||||
RM --> Cgroup
|
||||
RM --> Volume
|
||||
|
||||
Network --> Endpoint["endpoint\n(veth / physical)"]
|
||||
Network --> NetModel["model\n(tcfilter / route)"]
|
||||
SharedFs --> InlineVirtioFs["inline virtiofs"]
|
||||
SharedFs --> StandaloneVirtioFs["standalone virtiofs"]
|
||||
|
||||
Rootfs --> RootfsTypes["block / virtiofs / nydus"]
|
||||
Cgroup --> CgroupVers["v1 / v2"]
|
||||
Volume --> VolumeTypes["sharefs / shm / local\nephemeral / direct / block"]
|
||||
|
||||
classDef rm fill:#e2d9f3,stroke:#6610f2,stroke-width:2px
|
||||
classDef sandbox fill:#d0e8ff,stroke:#336,stroke-width:1px
|
||||
classDef container fill:#d4edda,stroke:#155724,stroke-width:1px
|
||||
classDef impl fill:#fff3cd,stroke:#856404,stroke-width:1px
|
||||
class RM rm
|
||||
class Network,SharedFs sandbox
|
||||
class Rootfs,Cgroup,Volume container
|
||||
class Endpoint,NetModel,InlineVirtioFs,StandaloneVirtioFs,RootfsTypes,CgroupVers,VolumeTypes impl
|
||||
```
|
||||
|
||||
### Asynchronous I/O Model
|
||||
|
||||
Synchronous runtimes are often limited by "thread bloat," where each container or connection spawns multiple OS threads.
|
||||
|
||||
#### Why Async Rust?
|
||||
|
||||
**The Rust async ecosystem is stable and highly efficient, providing several key benefits:**
|
||||
|
||||
- Reduced Overhead: Significantly lower CPU and memory consumption, particularly for I/O-bound workloads.
|
||||
- Zero-Cost Abstractions: Rust's async model allows developers to "pay only for what they use," avoiding heap allocations and dynamic dispatch where possible.
|
||||
- For further reading, see [Why Async?](https://rust-lang.github.io/async-book/01_getting_started/02_why_async.html) and [The State of Asynchronous Rust](https://rust-lang.github.io/async-book/01_getting_started/03_state_of_async_rust.html).
|
||||
|
||||
**Limitations of Synchronous Rust in kata-runtime:**
|
||||
|
||||
- Thread Proliferation: Every TTRPC connection creates multiple threads (Reaper, Listener, Handler), and each container adds 3 additional I/O threads, leading to high thread count and memory pressure.
|
||||
- Timeout Complexity: Implementing reliable, cross-platform timeout mechanisms in synchronous code is difficult, especially when aligning with Golang-based components.
|
||||
|
||||
#### Implementation
|
||||
|
||||
The kata-runtime utilizes Tokio to manage asynchronous tasks. By offloading TTRPC and container-related I/O to a unified Tokio executor and switching dependencies (Timer, File, Netlink) to their asynchronous counterparts, we achieve non-blocking I/O. The built-in VMM remains on a dedicated OS thread to ensure control and real-time performance.
|
||||
|
||||
**Comparison of OS Thread usage (for N tokio worker threads and M containers)**
|
||||
|
||||
- Sync Runtime: OS thread count scales as 4 + 12*M.
|
||||
- Async Runtime: OS thread count scales as 2 + N.
|
||||
|
||||
```shell
|
||||
├─ main(OS thread)
|
||||
├─ async-logger(OS thread)
|
||||
└─ tokio worker(N * OS thread)
|
||||
├─ agent log forwarder(1 * tokio task)
|
||||
├─ health check thread(1 * tokio task)
|
||||
├─ TTRPC reaper thread(M * tokio task)
|
||||
├─ TTRPC listener thread(M * tokio task)
|
||||
├─ TTRPC client handler thread(7 * M * tokio task)
|
||||
├─ container stdin io thread(M * tokio task)
|
||||
├─ container stdout io thread(M * tokio task)
|
||||
└─ container stderr io thread(M * tokio task)
|
||||
```
|
||||
|
||||
The Async Advantage:
|
||||
We move away from thread-per-task to a Tokio-driven task model.
|
||||
|
||||
* **Scalability**: The OS thread count is reduced from 4 + 12*M (Sync) to 2 + N (Async), where N is the worker thread count.
|
||||
* **Efficiency**: Non-blocking I/O allows a single thread to handle multiplexed container operations, significantly lowering memory consumption for high-density pod deployments.
|
||||
|
||||
---
|
||||
|
||||
## 2. Getting Started
|
||||
To configure your preferred VMM strategy, locate the `[hypervisor]` block in your runtime configuration file:
|
||||
|
||||
- Install Kata Containers with the Rust Runtime and Dragonball as the built-in VMM by following the [containerd-kata](../../how-to/containerd-kata.md).
|
||||
- Run a Kata container using the built-in Dragonball VMM
|
||||
|
||||
```shell
|
||||
$ sudo ctr run --runtime io.containerd.kata.v2 -d docker.io/library/ubuntu:latest hello
|
||||
```
|
||||
|
||||
Because the VMM and its image service are built into the runtime, you should see only a single `containerd-shim-kata-v2` process.
|
||||
|
||||
---
|
||||
|
||||
## FAQ
|
||||
|
||||
* **Q1**: Is the architecture compatible with containerd?
|
||||
|
||||
Yes. It implements the containerd-shim-v2 interface, ensuring drop-in compatibility with standard cloud-native tooling.
|
||||
|
||||
* **Q2**: What is the security boundary for the "Built-in VMM" model?
|
||||
|
||||
The security boundary remains established by the hypervisor (hardware virtualization). The shift to a monolithic process model does not compromise isolation; rather, it improves the integrity of the control plane by reducing the attack surface typically associated with complex IPC mechanisms.
|
||||
|
||||
* **Q3**: What is the migration path?
|
||||
|
||||
Migration is managed via configuration policies. The containerd shim configuration will allow users to toggle between the legacy runtime and the runtime-rs (internally `RunD`) binary, facilitating canary deployments and gradual migration.
|
||||
|
||||
* **Q4**: Why upcall instead of ACPI?
|
||||
|
||||
Standard ACPI-based hotplugging requires heavy guest-side kernel emulation and udevd interaction. Dbs-upcall utilizes a vsock-based direct channel to trigger hotplug events, providing:
|
||||
|
||||
Deterministic execution: Bypassing complex guest-side ACPI state machines.
|
||||
Lower overhead: Minimizing guest kernel footprint.
|
||||
|
||||
* **Q5**: How does upcall work?
|
||||
|
||||
The `Dbs-upcall` architecture consists of a server-side driver in the guest kernel and a client-side thread within the VMM. Once the guest kernel initializes, it establishes a communication channel via vsock (using uds). This allows the VMM to directly request device hot-add/hot-remove operations. We have already open-sourced this implementation: [dbs-upcall](https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-upcall).
|
||||
@@ -1,137 +1,324 @@
|
||||
# Virtualization in Kata Containers
|
||||
|
||||
In Kata Containers, a second layer of isolation is created on top of those provided by traditional namespace-based containers. The
|
||||
hardware virtualization interface is the basis of this additional layer. Kata will launch a lightweight virtual machine,
|
||||
and use the guest’s Linux kernel to create a container workload, or workloads in the case of multi-container pods. In Kubernetes
|
||||
and in the Kata implementation, the sandbox is carried out at the pod level. In Kata, this sandbox is created using a virtual machine.
|
||||
## Overview
|
||||
|
||||
This document describes how Kata Containers maps container technologies to virtual machines technologies, and how this is realized in
|
||||
the multiple hypervisors and virtual machine monitors that Kata supports.
|
||||
Kata Containers creates a second layer of isolation on top of traditional namespace-based containers using hardware virtualization. Kata launches a lightweight virtual machine (VM) and uses the guest Linux kernel to create container workloads. In Kubernetes, the sandbox is implemented at the pod level using VMs.
|
||||
|
||||
## Mapping container concepts to virtual machine technologies
|
||||
This document describes:
|
||||
|
||||
A typical deployment of Kata Containers will be in Kubernetes by way of a Container Runtime Interface (CRI) implementation. On every node,
|
||||
Kubelet will interact with a CRI implementer (such as containerd or CRI-O), which will in turn interface with Kata Containers (an OCI based runtime).
|
||||
- How Kata Containers maps container technologies to virtualization technologies
|
||||
- The multiple hypervisors and Virtual Machine Monitors (VMMs) supported by Kata
|
||||
- Guidance for selecting the appropriate hypervisor for your use case
|
||||
|
||||
The CRI API, as defined at the [Kubernetes CRI-API repo](https://github.com/kubernetes/cri-api/), implies a few constructs being supported by the
|
||||
CRI implementation, and ultimately in Kata Containers. In order to support the full [API](https://github.com/kubernetes/cri-api/blob/a6f63f369f6d50e9d0886f2eda63d585fbd1ab6a/pkg/apis/runtime/v1alpha2/api.proto#L34-L110) with the CRI-implementer, Kata must provide the following constructs:
|
||||
### Architecture
|
||||
|
||||

|
||||
A typical Kata Containers deployment integrates with Kubernetes through a Container Runtime Interface (CRI) implementation:
|
||||
|
||||
These constructs can then be further mapped to what devices are necessary for interfacing with the virtual machine:
|
||||
```
|
||||
Kubelet → CRI (containerd/CRI-O) → Kata Containers (OCI runtime) → VM → Containers
|
||||
```
|
||||
|
||||

|
||||
The CRI API requires Kata to support the following constructs:
|
||||
|
||||
Ultimately, these concepts map to specific para-virtualized devices or virtualization technologies.
|
||||
| CRI Construct | VM Equivalent | Virtualization Technology |
|
||||
|---------------|---------------|---------------------------|
|
||||
| Pod Sandbox | VM | Hypervisor/VMM |
|
||||
| Container | Process in VM | Namespace/Cgroup in guest |
|
||||
| Network | Network Interface | virtio-net, vhost-net, physical, etc. |
|
||||
| Storage | Block/File Device | virtio-block, virtio-scsi, virtio-fs |
|
||||
| Compute | vCPU/Memory | KVM, ACPI hotplug |
|
||||
|
||||

|
||||
### Mapping Container Concepts to Virtualization Technologies
|
||||
|
||||
Each hypervisor or VMM varies in how, or whether, it handles each of these.
|
||||
Kata Containers implements the Kubernetes Container Runtime Interface (CRI) to provide pod and container lifecycle management. The CRI API defines abstractions that Kata must translate into virtualization primitives.
|
||||
|
||||
## Kata Containers Hypervisor and VMM support
|
||||
The mapping from CRI constructs to virtualization technologies follows a three-layer model:
|
||||
|
||||
Kata Containers [supports multiple hypervisors](../hypervisors.md).
|
||||
```
|
||||
CRI API Constructs → VM Abstractions → Para-virtualized Devices
|
||||
```
|
||||
|
||||
Details of each solution and a summary are provided below.
|
||||
**Layer 1: CRI API Constructs**
|
||||
|
||||
The CRI API ([kubernetes/cri-api](https://github.com/kubernetes/cri-api)) defines the following abstractions that Kata must implement:
|
||||
|
||||
| Construct | Description |
|
||||
|-----------|-------------|
|
||||
| Pod Sandbox | Isolated execution environment for containers |
|
||||
| Container | Process workload within a sandbox |
|
||||
| Network | Pod and container networking interfaces |
|
||||
| Storage | Volume mounts and image storage |
|
||||
| RuntimeConfig | Resource constraints (CPU, memory, cgroups) |
|
||||
|
||||

|
||||
|
||||
**Layer 2: VM Abstractions**
|
||||
|
||||
Kata translates CRI constructs into VM-level concepts:
|
||||
|
||||
| CRI Construct | VM Equivalent |
|
||||
|---------------|---------------|
|
||||
| Pod Sandbox | Virtual Machine |
|
||||
| Container | Process/namespace in guest OS |
|
||||
| Network | Virtual NIC (vNIC) |
|
||||
| Storage | Virtual block device or filesystem |
|
||||
| RuntimeConfig | VM resources (vCPU, memory) |
|
||||
|
||||

|
||||
|
||||
**Layer 3: Para-virtualized Devices**
|
||||
|
||||
VM abstractions are realized through para-virtualized drivers for optimal performance:
|
||||
|
||||
| VM Concept | Device Technology |
|
||||
|------------|-------------------|
|
||||
| vNIC | virtio-net, vhost-net, macvtap |
|
||||
| Block Storage | virtio-block, virtio-scsi |
|
||||
| Shared Filesystem | virtio-fs |
|
||||
| Agent Communication | virtio-vsock |
|
||||
| Device Passthrough | VFIO with IOMMU |
|
||||
|
||||

|
||||
|
||||
> **Note:** Each hypervisor implements these mappings differently based on its device model and feature set. See the [Hypervisor Details](#hypervisor-details) section for specific implementations.
|
||||
|
||||
### Device Mapping
|
||||
|
||||
Container constructs map to para-virtualized devices:
|
||||
|
||||
| Construct | Device Type | Technology |
|
||||
|-----------|-------------|------------|
|
||||
| Network | Network Interface | virtio-net, vhost-net |
|
||||
| Storage (ephemeral) | Block Device | virtio-block, virtio-scsi |
|
||||
| Storage (shared) | Filesystem | virtio-fs |
|
||||
| Communication | Socket | virtio-vsock |
|
||||
| GPU/Passthrough | PCI Device | VFIO, IOMMU |
|
||||
|
||||
## Supported Hypervisors and VMMs
|
||||
|
||||
Kata Containers supports multiple hypervisors, each with different characteristics:
|
||||
|
||||
| Hypervisor | Language | Architectures | Type |
|
||||
|------------|----------|---------------|------|
|
||||
| [QEMU] | C | x86_64, aarch64, ppc64le, s390x, risc-v | Type 2 (KVM) |
|
||||
| [Cloud Hypervisor] | Rust | x86_64, aarch64 | Type 2 (KVM) |
|
||||
| [Firecracker] | Rust | x86_64, aarch64 | Type 2 (KVM) |
|
||||
| `Dragonball` | Rust | x86_64, aarch64 | Type 2 (KVM) Built-in |
|
||||
|
||||
> **Note:** All supported hypervisors use KVM (Kernel-based Virtual Machine) as the underlying hardware virtualization interface on Linux.
|
||||
|
||||
## Hypervisor Details
|
||||
|
||||
### QEMU/KVM
|
||||
|
||||
Kata Containers with QEMU has complete compatibility with Kubernetes.
|
||||
QEMU is the most mature and feature-complete hypervisor option for Kata Containers.
|
||||
|
||||
Depending on the host architecture, Kata Containers supports various machine types,
|
||||
for example `q35` on x86 systems, `virt` on ARM systems and `pseries` on IBM Power systems. The default Kata Containers
|
||||
machine type is `q35`. The machine type and its [`Machine accelerators`](#machine-accelerators) can
|
||||
be changed by editing the runtime [`configuration`](architecture/README.md#configuration) file.
|
||||
**Machine Types:**
|
||||
|
||||
Devices and features used:
|
||||
- virtio VSOCK or virtio serial
|
||||
- virtio block or virtio SCSI
|
||||
- [virtio net](https://www.redhat.com/en/virtio-networking-series)
|
||||
- virtio fs or virtio 9p (recommend: virtio fs)
|
||||
- VFIO
|
||||
- hotplug
|
||||
- machine accelerators
|
||||
- `q35` (x86_64, default)
|
||||
- `s390x` (s390x)
|
||||
- `virt` (aarch64)
|
||||
- `pseries` (ppc64le)
|
||||
- `risc-v` (riscv64, experimental)
|
||||
|
||||
Machine accelerators and hotplug are used in Kata Containers to manage resource constraints, improve boot time and reduce memory footprint. These are documented below.
|
||||
**Devices and Features:**
|
||||
|
||||
#### Machine accelerators
|
||||
- virtio-vsock (agent communication)
|
||||
- virtio-block or virtio-scsi (storage)
|
||||
- virtio-net/vhost-net/vhost-user-net (networking)
|
||||
- virtio-fs (shared filesystem, virtio-fs recommended)
|
||||
- VFIO (device passthrough)
|
||||
- CPU and memory hotplug
|
||||
- NVDIMM (x86_64, for rootfs as persistent memory)
|
||||
|
||||
Machine accelerators are architecture specific and can be used to improve the performance
|
||||
and enable specific features of the machine types. The following machine accelerators
|
||||
are used in Kata Containers:
|
||||
**Use Cases:**
|
||||
|
||||
- NVDIMM: This machine accelerator is x86 specific and only supported by `q35` machine types.
|
||||
`nvdimm` is used to provide the root filesystem as a persistent memory device to the Virtual Machine.
|
||||
- Production workloads requiring full CRI API compatibility
|
||||
- Scenarios requiring device passthrough (VFIO)
|
||||
- Multi-architecture deployments
|
||||
|
||||
#### Hotplug devices
|
||||
**Configuration:** See [`configuration-qemu.toml`](../../src/runtime/config/configuration-qemu.toml.in)
|
||||
|
||||
The Kata Containers VM starts with a minimum amount of resources, allowing for faster boot time and a reduction in memory footprint. As the container launch progresses,
|
||||
devices are hotplugged to the VM. For example, when a CPU constraint is specified which includes additional CPUs, they can be hot added. Kata Containers has support
|
||||
for hot-adding the following devices:
|
||||
- Virtio block
|
||||
- Virtio SCSI
|
||||
- VFIO
|
||||
- CPU
|
||||
### Dragonball (Built-in VMM)
|
||||
|
||||
### Firecracker/KVM
|
||||
Dragonball is a Rust-based VMM integrated directly into the Kata Containers Rust runtime as a library.
|
||||
|
||||
Firecracker, built on many rust crates that are within [rust-VMM](https://github.com/rust-vmm), has a very limited device model, providing a lighter
|
||||
footprint and attack surface, focusing on function-as-a-service like use cases. As a result, Kata Containers with Firecracker VMM supports a subset of the CRI API.
|
||||
Firecracker does not support file-system sharing, and as a result only block-based storage drivers are supported. Firecracker does not support device
|
||||
hotplug nor does it support VFIO. As a result, Kata Containers with Firecracker VMM does not support updating container resources after boot, nor
|
||||
does it support device passthrough.
|
||||
**Advantages:**
|
||||
|
||||
Devices used:
|
||||
- virtio VSOCK
|
||||
- virtio block
|
||||
- virtio net
|
||||
- **Zero IPC overhead**: VMM runs in the same process as the runtime
|
||||
- **Unified lifecycle**: Simplified resource management and error handling
|
||||
- **Optimized for containers**: Purpose-built for container workloads
|
||||
- **Upcall support**: Direct VMM-to-Guest communication for efficient hotplug operations
|
||||
- **Low resource overhead**: Minimal CPU and memory footprint
|
||||
|
||||
**Architecture:**
|
||||
```
|
||||
┌─────────────────────────────────────────┐
|
||||
│ Kata Containers Runtime (Rust) │
|
||||
│ ┌─────────────────────────────────┐ │
|
||||
│ │ Dragonball VMM Library │ │
|
||||
│ └─────────────────────────────────┘ │
|
||||
└─────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
**Features:**
|
||||
|
||||
- Built-in virtio-fs/nydus support
|
||||
- Async I/O via Tokio
|
||||
- Single binary deployment
|
||||
- Optimized startup latency
|
||||
|
||||
**Use Cases:**
|
||||
|
||||
- Default choice for most container workloads
|
||||
- High-density container deployments and low resource overhead scenarios
|
||||
- Scenarios requiring optimal startup performance
|
||||
|
||||
**Configuration:** See [`configuration-dragonball.toml`](../../src/runtime-rs/config/configuration-dragonball.toml.in)
|
||||
|
||||
### Cloud Hypervisor/KVM
|
||||
|
||||
[Cloud Hypervisor](https://github.com/cloud-hypervisor/cloud-hypervisor), based
|
||||
on [rust-vmm](https://github.com/rust-vmm), is designed to have a
|
||||
lighter footprint and smaller attack surface for running modern cloud
|
||||
workloads. Kata Containers with Cloud
|
||||
Hypervisor provides mostly complete compatibility with Kubernetes
|
||||
comparable to the QEMU configuration. As of the 1.12 and 2.0.0 release
|
||||
of Kata Containers, the Cloud Hypervisor configuration supports both CPU
|
||||
and memory resize, device hotplug (disk and VFIO), file-system sharing through virtio-fs,
|
||||
block-based volumes, booting from VM images backed by pmem device, and
|
||||
fine-grained seccomp filters for each VMM threads (e.g. all virtio
|
||||
device worker threads).
|
||||
Cloud Hypervisor is a Rust-based VMM designed for modern cloud workloads with a focus on performance and security.
|
||||
|
||||
Devices and features used:
|
||||
- virtio VSOCK or virtio serial
|
||||
- virtio block
|
||||
- virtio net
|
||||
- virtio fs
|
||||
- virtio pmem
|
||||
- VFIO
|
||||
- hotplug
|
||||
- seccomp filters
|
||||
- [HTTP OpenAPI](https://github.com/cloud-hypervisor/cloud-hypervisor/blob/main/vmm/src/api/openapi/cloud-hypervisor.yaml)
|
||||
**Features:**
|
||||
|
||||
### StratoVirt/KVM
|
||||
- CPU and memory resize
|
||||
- Device hotplug (disk, VFIO)
|
||||
- virtio-fs (shared filesystem)
|
||||
- virtio-pmem (persistent memory)
|
||||
- virtio-block (block storage)
|
||||
- virtio-vsock (agent communication)
|
||||
- Fine-grained seccomp filters per VMM thread
|
||||
- HTTP OpenAPI for management
|
||||
|
||||
[StratoVirt](https://gitee.com/openeuler/stratovirt) is an enterprise-level open source VMM oriented to cloud data centers; it implements a unified architecture supporting standard VMs, containers, and serverless (Micro-VM) workloads. StratoVirt offers several competitive advantages, such as a lightweight footprint with low resource overhead, fast boot, hardware acceleration, and the language-level safety of Rust.
|
||||
**Use Cases:**
|
||||
|
||||
Currently, StratoVirt in Kata supports the Micro-VM machine type, mainly focused on FaaS use cases, supporting device hotplug (virtio block), file-system sharing through virtio-fs, and so on. Kata Containers with StratoVirt uses the virtio-mmio bus as the driver, and doesn't support CPU/memory resize or VFIO, so container resources cannot be updated after boot.
|
||||
- High-performance cloud-native workloads
|
||||
- Applications requiring memory/CPU resizing
|
||||
- Security-sensitive deployments (seccomp isolation)
|
||||
|
||||
Devices and features used currently:
|
||||
- Micro-VM machine type for FaaS(mmio, no ACPI)
|
||||
- Virtual Socket (vhost VSOCK, virtio console)
|
||||
- Virtual Storage(virtio block, mmio)
|
||||
- Virtual Networking(virtio net, mmio)
|
||||
- Shared Filesystem(virtio fs)
|
||||
- Device Hotplugging(virtio block hotplug)
|
||||
- Entropy Source(virtio RNG)
|
||||
- QMP API
|
||||
**Configuration:** See [`configuration-cloud-hypervisor.toml`](../../src/runtime-rs/config/configuration-cloud-hypervisor.toml.in)
|
||||
|
||||
### Summary
|
||||
### Firecracker/KVM
|
||||
|
||||
| Solution | release introduced | brief summary |
|
||||
|-|-|-|
|
||||
| Cloud Hypervisor | 1.10 | upstream Cloud Hypervisor with rich feature support, e.g. hotplug, VFIO and FS sharing|
|
||||
| Firecracker | 1.5 | upstream Firecracker, rust-VMM based, no VFIO, no FS sharing, no memory/CPU hotplug |
|
||||
| QEMU | 1.0 | upstream QEMU, with support for hotplug and filesystem sharing |
|
||||
| StratoVirt | 3.3 | upstream StratoVirt with FS sharing and virtio block hotplug, no VFIO, no CPU/memory resize |
|
||||
Firecracker is a minimalist VMM built on rust-vmm crates, optimized for serverless and FaaS workloads.
|
||||
|
||||
**Devices:**
|
||||
|
||||
- virtio-vsock (agent communication)
|
||||
- virtio-block (block storage)
|
||||
- virtio-net (networking)
|
||||
|
||||
**Limitations:**
|
||||
|
||||
- No filesystem sharing (virtio-fs not supported)
|
||||
- No device hotplug
|
||||
- No VFIO/passthrough support
|
||||
- No CPU/memory hotplug
|
||||
- Limited CRI API support
|
||||
|
||||
**Use Cases:**
|
||||
|
||||
- Serverless/FaaS workloads
|
||||
- Single-tenant microVMs
|
||||
- Scenarios prioritizing minimal attack surface
|
||||
|
||||
**Configuration:** See [`configuration-fc.toml`](../../src/runtime/config/configuration-fc.toml.in)
|
||||
|
||||
## Hypervisor Comparison Summary
|
||||
|
||||
| Feature | QEMU | Cloud Hypervisor | Firecracker | Dragonball |
|
||||
|---------|------|------------------|-------------|------------|
|
||||
| Maturity | Excellent | Good | Good | Good |
|
||||
| CRI Compatibility | Full | Full | Partial | Full |
|
||||
| Filesystem Sharing | ✓ | ✓ | ✗ | ✓ |
|
||||
| Device Hotplug | ✓ | ✓ | ✗ | ✓ |
|
||||
| VFIO/Passthrough | ✓ | ✓ | ✗ | ✓ |
|
||||
| CPU/Memory Hotplug | ✓ | ✓ | ✗ | ✓ |
|
||||
| Security Isolation | Good | Excellent (seccomp) | Excellent | Excellent |
|
||||
| Startup Latency | Good | Excellent | Excellent | Best |
|
||||
| Resource Overhead | Medium | Low | Lowest | Lowest |
|
||||
|
||||
## Choosing a Hypervisor
|
||||
|
||||
### Decision Matrix
|
||||
|
||||
| Requirement | Recommended Hypervisor |
|
||||
|-------------|------------------------|
|
||||
| Full CRI API compatibility | QEMU, Cloud Hypervisor, Dragonball |
|
||||
| Device passthrough (VFIO) | QEMU, Cloud Hypervisor, Dragonball |
|
||||
| Minimal resource overhead | Dragonball, Firecracker |
|
||||
| Fastest startup time | Dragonball, Firecracker |
|
||||
| Serverless/FaaS | Dragonball, Firecracker |
|
||||
| Production workloads | Dragonball, QEMU |
|
||||
| Memory/CPU resizing | Dragonball, Cloud Hypervisor, QEMU |
|
||||
| Maximum security isolation | Cloud Hypervisor (seccomp), Firecracker, Dragonball |
|
||||
| Multi-architecture | QEMU |
|
||||
|
||||
### Recommendations
|
||||
|
||||
**For Most Users:** Use the default Dragonball VMM with the Kata Containers Rust runtime. It provides the best balance of performance, security, and container density.
|
||||
|
||||
**For Device Passthrough:** Use QEMU, Cloud Hypervisor, or Dragonball if you require VFIO device assignment.
|
||||
|
||||
**For Serverless:** Use Dragonball or Firecracker for ultra-lightweight, single-tenant microVMs.
|
||||
|
||||
**For Legacy/Ecosystem Compatibility:** Use QEMU for its extensive hardware emulation and multi-architecture support.
|
||||
|
||||
## Hypervisor Configuration
|
||||
|
||||
### Configuration Files
|
||||
|
||||
Each hypervisor has a dedicated configuration file:
|
||||
|
||||
| Hypervisor | Rust Runtime Configuration | Go Runtime Configuration |
|
||||
|------------|----------------|-----------------|
|
||||
| QEMU |`configuration-qemu-runtime-rs.toml` |`configuration-qemu.toml` |
|
||||
| Cloud Hypervisor | `configuration-cloud-hypervisor.toml` | `configuration-clh.toml` |
|
||||
| Firecracker | `configuration-rs-fc.toml` | `configuration-fc.toml` |
|
||||
| Dragonball | `configuration-dragonball.toml` (default) | `No` |
|
||||
|
||||
> **Note:** Configuration files are typically installed in `/opt/kata/share/defaults/kata-containers/` or `/opt/kata/share/defaults/kata-containers/runtime-rs/` or `/usr/share/defaults/kata-containers/`.
|
||||
|
||||
### Switching Hypervisors
|
||||
|
||||
Use the `kata-manager` tool to switch the configured hypervisor:
|
||||
|
||||
```bash
|
||||
# List available hypervisors
|
||||
$ kata-manager -L
|
||||
|
||||
# Switch to a different hypervisor
|
||||
$ sudo kata-manager -S <hypervisor-name>
|
||||
```
|
||||
|
||||
For detailed instructions, see the [`kata-manager` documentation](../../utils/README.md).
|
||||
|
||||
## Hypervisor Versions
|
||||
|
||||
The following versions are used in this release (from [versions.yaml](../../versions.yaml)):
|
||||
|
||||
| Hypervisor | Version | Repository |
|
||||
|------------|---------|------------|
|
||||
| Cloud Hypervisor | v51.1 | https://github.com/cloud-hypervisor/cloud-hypervisor |
|
||||
| Firecracker | v1.12.1 | https://github.com/firecracker-microvm/firecracker |
|
||||
| QEMU | v10.2.1 | https://github.com/qemu/qemu |
|
||||
| Dragonball | builtin | https://github.com/kata-containers/kata-containers/tree/main/src/dragonball |
|
||||
|
||||
> **Note:** Dragonball is integrated into the Kata Containers Rust runtime and does not have a separate version number.
|
||||
> For the latest hypervisor versions, see the [versions.yaml](../../versions.yaml) file in the Kata Containers repository.
|
||||
|
||||
## References
|
||||
|
||||
- [Kata Containers Architecture](./architecture/README.md)
|
||||
- [Configuration Guide](../../src/runtime/README.md#configuration)
|
||||
- [QEMU Documentation](https://www.qemu.org/documentation/)
|
||||
- [Cloud Hypervisor Documentation](https://github.com/cloud-hypervisor/cloud-hypervisor/blob/main/docs/api.md)
|
||||
- [Firecracker Documentation](https://github.com/firecracker-microvm/firecracker/tree/main/docs)
|
||||
- [Dragonball Source](https://github.com/kata-containers/kata-containers/tree/main/src/dragonball)
|
||||
|
||||
[KVM]: https://en.wikipedia.org/wiki/Kernel-based_Virtual_Machine
|
||||
[QEMU]: https://www.qemu.org
|
||||
[Cloud Hypervisor]: https://github.com/cloud-hypervisor/cloud-hypervisor
|
||||
[Firecracker]: https://github.com/firecracker-microvm/firecracker
|
||||
[`Dragonball`]: https://github.com/kata-containers/kata-containers/tree/main/src/dragonball
|
||||
|
||||
@@ -3,9 +3,9 @@
|
||||
## Kubernetes Integration
|
||||
|
||||
- [Run Kata containers with `crictl`](run-kata-with-crictl.md)
|
||||
- [Run Kata Containers with Kubernetes](run-kata-with-k8s.md)
|
||||
- [How to use Kata Containers and Containerd](containerd-kata.md)
|
||||
- [How to use Kata Containers and containerd with Kubernetes](how-to-use-k8s-with-containerd-and-kata.md)
|
||||
- [How to use Kata Containers and CRI-O with Kubernetes](how-to-use-k8s-with-crio-and-kata.md)
|
||||
- [Kata Containers and service mesh for Kubernetes](service-mesh.md)
|
||||
- [How to import Kata Containers logs into Fluentd](how-to-import-kata-logs-with-fluentd.md)
|
||||
|
||||
@@ -50,3 +50,4 @@
|
||||
- [How to pull images in the guest](how-to-pull-images-in-guest-with-kata.md)
|
||||
- [How to use mem-agent to decrease the memory usage of Kata container](how-to-use-memory-agent.md)
|
||||
- [How to use seccomp with runtime-rs](how-to-use-seccomp-with-runtime-rs.md)
|
||||
- [How to use passthroughfd-IO with runtime-rs and Dragonball](how-to-use-passthroughfd-io-within-runtime-rs.md)
|
||||
|
||||
@@ -5,7 +5,7 @@ and [Kata Containers](https://katacontainers.io). The containerd provides not on
|
||||
command line tool, but also the [CRI](https://kubernetes.io/blog/2016/12/container-runtime-interface-cri-in-kubernetes/)
|
||||
interface for [Kubernetes](https://kubernetes.io) and other CRI clients.
|
||||
|
||||
This document is primarily written for Kata Containers v1.5.0-rc2 or above, and containerd v1.2.0 or above.
|
||||
This document is primarily written for Kata Containers v3.28 or above, and containerd v1.7.0 or above.
|
||||
Previous versions are addressed here, but we suggest users upgrade to the newer versions for better support.
|
||||
|
||||
## Concepts
|
||||
@@ -14,7 +14,7 @@ Previous versions are addressed here, but we suggest users upgrade to the newer
|
||||
|
||||
[`RuntimeClass`](https://kubernetes.io/docs/concepts/containers/runtime-class/) is a Kubernetes feature first
|
||||
introduced in Kubernetes 1.12 as alpha. It is the feature for selecting the container runtime configuration to
|
||||
use to run a pod’s containers. This feature is supported in `containerd` since [v1.2.0](https://github.com/containerd/containerd/releases/tag/v1.2.0).
|
||||
use to run a pod's containers. This feature is supported in `containerd` since [v1.2.0](https://github.com/containerd/containerd/releases/tag/v1.2.0).
|
||||
|
||||
Before the `RuntimeClass` was introduced, Kubernetes was not aware of the difference of runtimes on the node. `kubelet`
|
||||
creates Pod sandboxes and containers through CRI implementations, and treats all the Pods equally. However, there
|
||||
@@ -123,18 +123,56 @@ The following sections outline how to add Kata Containers to the configurations.
|
||||
|
||||
#### Kata Containers as a `RuntimeClass`
|
||||
|
||||
For
|
||||
- Kata Containers v1.5.0 or above (including `1.5.0-rc`)
|
||||
- Containerd v1.2.0 or above
|
||||
- Kubernetes v1.12.0 or above
|
||||
For Kubernetes users, we suggest using `RuntimeClass` to select Kata Containers as the runtime for untrusted workloads. The configuration is as follows:
|
||||
|
||||
- Kata Containers v3.28.0 or above
|
||||
- Containerd v1.7.0 or above
|
||||
- Kubernetes v1.33 or above
|
||||
|
||||
The `RuntimeClass` is suggested.
|
||||
|
||||
The following example registers custom runtimes into containerd:
|
||||
|
||||
You can check the detailed information about the configuration of containerd in the [Containerd config documentation](https://github.com/containerd/containerd/blob/main/docs/cri/config.md).
|
||||
|
||||
+ In containerd 2.x
|
||||
|
||||
```toml
|
||||
version = 3
|
||||
[plugins."io.containerd.cri.v1.runtime".containerd]
|
||||
[plugins."io.containerd.cri.v1.runtime".containerd.runtimes]
|
||||
[plugins."io.containerd.cri.v1.runtime".containerd.runtimes.runc]
|
||||
runtime_type = "io.containerd.runc.v2"
|
||||
[plugins."io.containerd.cri.v1.runtime".containerd.runtimes.kata]
|
||||
runtime_type = "io.containerd.kata.v2"
|
||||
[plugins."io.containerd.cri.v1.runtime".containerd.runtimes.kata.options]
|
||||
ConfigPath = "/opt/kata/share/defaults/kata-containers/configuration.toml"
|
||||
```
|
||||
|
||||
+ In containerd 1.7.x
|
||||
|
||||
```toml
|
||||
version = 2
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd]
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
|
||||
runtime_type = "io.containerd.runc.v2"
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.kata]
|
||||
runtime_type = "io.containerd.kata.v2"
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.kata.options]
|
||||
ConfigPath = "/opt/kata/share/defaults/kata-containers/configuration.toml"
|
||||
```
|
||||
|
||||
The following configuration includes two runtime classes:
|
||||
- `plugins.cri.containerd.runtimes.runc`: the runc, and it is the default runtime.
|
||||
- `plugins.cri.containerd.runtimes.kata`: The function in containerd (reference [the document here](https://github.com/containerd/containerd/tree/main/core/runtime/v2))
|
||||
|
||||
- `plugins.<X>.containerd.runtimes.runc`: the runc, and it is the default runtime.
|
||||
- `plugins.<X>.containerd.runtimes.kata`: The function in containerd (reference [the document here](https://github.com/containerd/containerd/tree/main/core/runtime/v2))
|
||||
where the dot-connected string `io.containerd.kata.v2` is translated to `containerd-shim-kata-v2` (i.e. the
|
||||
binary name of the Kata implementation of [Containerd Runtime V2 (Shim API)](https://github.com/containerd/containerd/tree/main/core/runtime/v2)).
|
||||
binary name of the Kata implementation of [Containerd Runtime V2 (Shim API)](https://github.com/containerd/containerd/tree/main/core/runtime/v2)). By default, the `containerd-shim-kata-v2` (short of `shimv2`) binary will be installed under the path of `/usr/local/bin/`.
|
||||
|
||||
And `<X>` is `io.containerd.cri.v1.runtime` for containerd v2.x and `io.containerd.grpc.v1.cri` for containerd v1.7.x.
|
||||
|
||||
+ In containerd 1.7.x
|
||||
|
||||
```toml
|
||||
[plugins.cri.containerd]
|
||||
@@ -149,7 +187,7 @@ The following configuration includes two runtime classes:
|
||||
CriuPath = ""
|
||||
CriuWorkPath = ""
|
||||
IoGid = 0
|
||||
[plugins.cri.containerd.runtimes.kata]
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.kata]
|
||||
runtime_type = "io.containerd.kata.v2"
|
||||
privileged_without_host_devices = true
|
||||
pod_annotations = ["io.katacontainers.*"]
|
||||
@@ -158,13 +196,71 @@ The following configuration includes two runtime classes:
|
||||
ConfigPath = "/opt/kata/share/defaults/kata-containers/configuration.toml"
|
||||
```
|
||||
|
||||
+ In containerd 2.x
|
||||
|
||||
```toml
|
||||
[plugins."io.containerd.cri.v1.runtime".containerd]
|
||||
no_pivot = false
|
||||
[plugins."io.containerd.cri.v1.runtime".containerd.runtimes]
|
||||
[plugins."io.containerd.cri.v1.runtime".containerd.runtimes.runc]
|
||||
privileged_without_host_devices = false
|
||||
runtime_type = "io.containerd.runc.v2"
|
||||
[plugins."io.containerd.cri.v1.runtime".containerd.runtimes.runc.options]
|
||||
BinaryName = ""
|
||||
CriuImagePath = ""
|
||||
CriuPath = ""
|
||||
CriuWorkPath = ""
|
||||
IoGid = 0
|
||||
[plugins."io.containerd.cri.v1.runtime".containerd.runtimes.kata]
|
||||
runtime_type = "io.containerd.kata.v2"
|
||||
privileged_without_host_devices = true
|
||||
pod_annotations = ["io.katacontainers.*"]
|
||||
container_annotations = ["io.katacontainers.*"]
|
||||
[plugins."io.containerd.cri.v1.runtime".containerd.runtimes.kata.options]
|
||||
ConfigPath = "/opt/kata/share/defaults/kata-containers/configuration.toml"
|
||||
```
|
||||
|
||||
`privileged_without_host_devices` tells containerd that a privileged Kata container should not have direct access to all host devices. If unset, containerd will pass all host devices to Kata container, which may cause security issues.
|
||||
|
||||
`pod_annotations` is the list of pod annotations passed to both the pod sandbox as well as container through the OCI config.
|
||||
|
||||
`container_annotations` is the list of container annotations passed through to the OCI config of the containers.
|
||||
|
||||
This `ConfigPath` option is optional. If you do not specify it, shimv2 first tries to get the configuration file from the environment variable `KATA_CONF_FILE`. If neither are set, shimv2 will use the default Kata configuration file paths (`/etc/kata-containers/configuration.toml` and `/usr/share/defaults/kata-containers/configuration.toml`).
|
||||
This `ConfigPath` option is optional. If you want to use a different configuration file, you can specify the path of the configuration file with `ConfigPath` in the containerd configuration file. We use containerd 2.x configuration as an example here, and the configuration for containerd 1.7.x is similar, just replace `io.containerd.cri.v1.runtime` with `io.containerd.grpc.v1.cri`.
|
||||
|
||||
```toml
|
||||
[plugins."io.containerd.cri.v1.runtime".containerd.runtimes.kata.options]
|
||||
ConfigPath = "/opt/kata/share/defaults/kata-containers/configuration-qemu.toml"
|
||||
```
|
||||
|
||||
> **Note:** In this example, the specified `ConfigPath` is valid in Kubernetes/Containerd workflow with containerd v1.7+ but doesn't work with ctr and nerdctl.
|
||||
|
||||
If you do not specify it, `shimv2` first tries to get the configuration file from the environment variable `KATA_CONF_FILE`. To adopt this approach, first create a shell script named `containerd-shim-kata-v2` and place it under `/usr/local/bin/`. The following is an example of the shell script `containerd-shim-kata-qemu-v2`, which specifies the configuration file with `KATA_CONF_FILE`:
|
||||
|
||||
> **Note:** Just use containerd 2.x configuration as an example, the configuration for containerd 1.7.x is similar, just replace `io.containerd.cri.v1.runtime` with `io.containerd.grpc.v1.cri`
|
||||
|
||||
```shell
|
||||
~$ cat /usr/local/bin/containerd-shim-kata-qemu-v2
|
||||
#!/bin/bash
|
||||
KATA_CONF_FILE=/opt/kata/share/defaults/kata-containers/configuration-qemu.toml /opt/kata/bin/containerd-shim-kata-v2 "$@"
|
||||
```
|
||||
|
||||
And then just reference it in the configuration of containerd:
|
||||
|
||||
```toml
|
||||
[plugins."io.containerd.cri.v1.runtime".containerd.runtimes.kata-qemu]
|
||||
runtime_type = "io.containerd.kata-qemu.v2"
|
||||
```
|
||||
|
||||
Finally you can run a Kata container with the runtime `io.containerd.kata-qemu.v2`:
|
||||
|
||||
```shell
|
||||
$ sudo ctr run --cni --runtime io.containerd.kata-qemu.v2 -t --rm docker.io/library/busybox:latest hello sh
|
||||
```
|
||||
|
||||
> **Note:** The `KATA_CONF_FILE` environment variable is valid in both Kubernetes/Containerd workflow with containerd and containerd tools(ctr, nerdctl, etc.) scenarios.
|
||||
|
||||
If neither are set, shimv2 will use the default Kata configuration file paths (`/etc/kata-containers/configuration.toml` and `/usr/share/defaults/kata-containers/configuration.toml` and `/opt/kata/share/defaults/kata-containers/configuration.toml`).
|
||||
|
||||
#### Kata Containers as the runtime for untrusted workload
|
||||
|
||||
@@ -173,18 +269,20 @@ for an untrusted workload. With the following configuration, you can run trusted
|
||||
and then, run an untrusted workload with Kata Containers:
|
||||
|
||||
```toml
|
||||
[plugins.cri.containerd]
|
||||
# "plugins.cri.containerd.default_runtime" is the runtime to use in containerd.
|
||||
[plugins.cri.containerd.default_runtime]
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd]
|
||||
# "plugins."io.containerd.grpc.v1.cri".containerd.default_runtime" is the runtime to use in containerd.
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
|
||||
# runtime_type is the runtime type to use in containerd e.g. io.containerd.runtime.v1.linux
|
||||
runtime_type = "io.containerd.runtime.v1.linux"
|
||||
|
||||
# "plugins.cri.containerd.untrusted_workload_runtime" is a runtime to run untrusted workloads on it.
|
||||
[plugins.cri.containerd.untrusted_workload_runtime]
|
||||
# "plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime" is a runtime to run untrusted workloads on it.
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
|
||||
# runtime_type is the runtime type to use in containerd e.g. io.containerd.runtime.v1.linux
|
||||
runtime_type = "io.containerd.kata.v2"
|
||||
```
|
||||
|
||||
> **Note:** The `untrusted_workload_runtime` is deprecated since containerd v1.7.0, and it is recommended to use `RuntimeClass` instead.
|
||||
|
||||
You can find more information on the [Containerd config documentation](https://github.com/containerd/containerd/blob/main/docs/cri/config.md)
|
||||
|
||||
#### Kata Containers as the default runtime
|
||||
@@ -192,8 +290,8 @@ You can find more information on the [Containerd config documentation](https://g
|
||||
If you want to set Kata Containers as the only runtime in the deployment, you can simply configure as follows:
|
||||
|
||||
```toml
|
||||
[plugins.cri.containerd]
|
||||
[plugins.cri.containerd.default_runtime]
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd]
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
|
||||
runtime_type = "io.containerd.kata.v2"
|
||||
```
|
||||
|
||||
@@ -246,11 +344,14 @@ debug: true
|
||||
|
||||
### Launch containers with `ctr` command line
|
||||
|
||||
> **Note:** With containerd command tool `ctr`, the `ConfigPath` is not supported, and the configuration file should be explicitly specified with the option `--runtime-config-path`, otherwise, it'll use the default configurations.
|
||||
|
||||
To run a container with Kata Containers through the containerd command line, you can run the following:
|
||||
|
||||
```bash
|
||||
$ sudo ctr image pull docker.io/library/busybox:latest
|
||||
$ sudo ctr run --cni --runtime io.containerd.run.kata.v2 -t --rm docker.io/library/busybox:latest hello sh
|
||||
$ CONFIG_PATH="/opt/kata/share/defaults/kata-containers/configuration-qemu.toml"
|
||||
$ sudo ctr run --cni --runtime io.containerd.kata.v2 --runtime-config-path $CONFIG_PATH -t --rm docker.io/library/busybox:latest hello sh
|
||||
```
|
||||
|
||||
This launches a BusyBox container named `hello`, and it will be removed by `--rm` after it quits.
|
||||
@@ -260,7 +361,9 @@ loopback interface is created.
|
||||
### Launch containers using `ctr` command line with rootfs bundle
|
||||
|
||||
#### Get rootfs
|
||||
|
||||
Use the script to create rootfs
|
||||
|
||||
```bash
|
||||
ctr i pull quay.io/prometheus/busybox:latest
|
||||
ctr i export rootfs.tar quay.io/prometheus/busybox:latest
|
||||
@@ -278,7 +381,9 @@ for ((i=0;i<$(cat ${layers_dir}/manifest.json | jq -r ".[].Layers | length");i++
|
||||
tar -C ${rootfs_dir} -xf ${layers_dir}/$(cat ${layers_dir}/manifest.json | jq -r ".[].Layers[${i}]")
|
||||
done
|
||||
```
|
||||
|
||||
#### Get `config.json`
|
||||
|
||||
Use runc spec to generate `config.json`
|
||||
```bash
|
||||
cd ./bundle/rootfs
|
||||
@@ -295,10 +400,13 @@ Change the root `path` in `config.json` to the absolute path of rootfs
|
||||
```
|
||||
|
||||
#### Run container
|
||||
|
||||
```bash
|
||||
sudo ctr run -d --runtime io.containerd.run.kata.v2 --config bundle/config.json hello
|
||||
CONFIG_PATH="/opt/kata/share/defaults/kata-containers/configuration-qemu.toml"
|
||||
sudo ctr run -d --runtime io.containerd.kata.v2 --runtime-config-path $CONFIG_PATH --config bundle/config.json hello
|
||||
sudo ctr t exec --exec-id ${ID} -t hello sh
|
||||
```
|
||||
|
||||
### Launch Pods with `crictl` command line
|
||||
|
||||
With the `crictl` command line of `cri-tools`, you can specify runtime class with `-r` or `--runtime` flag.
|
||||
|
||||
@@ -96,6 +96,10 @@ path = "/path/to/qemu/build/qemu-system-x86_64"
|
||||
```toml
|
||||
shared_fs = "virtio-9p"
|
||||
```
|
||||
- Use `blockfile` snapshotter: Since virtio-fs remains unsupported due to bugs in QEMU snp-v3, and virtio-9p is no longer supported in runtime-rs, it is recommended to use the blockfile snapshotter. This allows container images to be managed via block devices without relying on a shared file system. To enable this, set the `snapshotter` to `blockfile` in the containerd config file; please refer to the [blockfile guide](https://github.com/containerd/containerd/blob/main/docs/snapshotters/blockfile.md) for more information. Additionally, `shared_fs` should be set to `"none"` since no shared file system is used.
|
||||
```toml
|
||||
shared_fs = "none"
|
||||
```
|
||||
- Disable `virtiofsd` since it is no longer required (comment out)
|
||||
```toml
|
||||
# virtio_fs_daemon = "/usr/libexec/virtiofsd"
|
||||
|
||||
@@ -12,11 +12,11 @@ Currently, there is no widely applicable and convenient method available for use
|
||||
|
||||
According to the proposal, you need to use the `kata-ctl direct-volume` command to add a direct assigned block volume device to the Kata Containers runtime.
|
||||
|
||||
And then with the help of method [get_volume_mount_info](https://github.com/kata-containers/kata-containers/blob/099b4b0d0e3db31b9054e7240715f0d7f51f9a1c/src/libs/kata-types/src/mount.rs#L95), get information from JSON file: `(mountinfo.json)` and parse them into structure [Direct Volume Info](https://github.com/kata-containers/kata-containers/blob/099b4b0d0e3db31b9054e7240715f0d7f51f9a1c/src/libs/kata-types/src/mount.rs#L70) which is used to save device-related information.
|
||||
And then with the help of method [get_volume_mount_info](https://github.com/kata-containers/kata-containers/blob/099b4b0d0e3db31b9054e7240715f0d7f51f9a1c/src/libs/kata-types/src/mount.rs#L95), get information from JSON file: `(mountInfo.json)` and parse them into structure [Direct Volume Info](https://github.com/kata-containers/kata-containers/blob/099b4b0d0e3db31b9054e7240715f0d7f51f9a1c/src/libs/kata-types/src/mount.rs#L70) which is used to save device-related information.
|
||||
|
||||
We only fill the `mountinfo.json`, such as `device` ,`volume_type`, `fs_type`, `metadata` and `options`, which correspond to the fields in [Direct Volume Info](https://github.com/kata-containers/kata-containers/blob/099b4b0d0e3db31b9054e7240715f0d7f51f9a1c/src/libs/kata-types/src/mount.rs#L70), to describe a device.
|
||||
We only need to fill in the fields of `mountInfo.json`, such as `device`, `volume-type`, `fstype`, `metadata` and `options`, which correspond to the fields in [Direct Volume Info](https://github.com/kata-containers/kata-containers/blob/099b4b0d0e3db31b9054e7240715f0d7f51f9a1c/src/libs/kata-types/src/mount.rs#L70), to describe a device.
|
||||
|
||||
The JSON file `mountinfo.json` placed in a sub-path `/kubelet/kata-test-vol-001/volume001` which under fixed path `/run/kata-containers/shared/direct-volumes/`.
|
||||
The JSON file `mountInfo.json` is placed in a sub-path `/kubelet/kata-test-vol-001/volume001`, which is under the fixed path `/run/kata-containers/shared/direct-volumes/`.
|
||||
And the full path looks like: `/run/kata-containers/shared/direct-volumes/kubelet/kata-test-vol-001/volume001`, but for security reasons it is
|
||||
encoded as `/run/kata-containers/shared/direct-volumes/L2t1YmVsZXQva2F0YS10ZXN0LXZvbC0wMDEvdm9sdW1lMDAx`.
|
||||
|
||||
@@ -47,18 +47,18 @@ $ sudo mkfs.ext4 /tmp/stor/rawdisk01.20g
|
||||
```json
|
||||
{
|
||||
"device": "/tmp/stor/rawdisk01.20g",
|
||||
"volume_type": "directvol",
|
||||
"fs_type": "ext4",
|
||||
"volume-type": "directvol",
|
||||
"fstype": "ext4",
|
||||
"metadata":"{}",
|
||||
"options": []
|
||||
}
|
||||
```
|
||||
|
||||
```bash
|
||||
$ sudo kata-ctl direct-volume add /kubelet/kata-direct-vol-002/directvol002 "{\"device\": \"/tmp/stor/rawdisk01.20g\", \"volume_type\": \"directvol\", \"fs_type\": \"ext4\", \"metadata\":"{}", \"options\": []}"
|
||||
$ sudo kata-ctl direct-volume add /kubelet/kata-direct-vol-002/directvol002 "{\"device\": \"/tmp/stor/rawdisk01.20g\", \"volume-type\": \"directvol\", \"fstype\": \"ext4\", \"metadata\":"{}", \"options\": []}"
|
||||
$# /kubelet/kata-direct-vol-002/directvol002 <==> /run/kata-containers/shared/direct-volumes/W1lMa2F0ZXQva2F0YS10a2F0DAxvbC0wMDEvdm9sdW1lMDAx
|
||||
$ cat W1lMa2F0ZXQva2F0YS10a2F0DAxvbC0wMDEvdm9sdW1lMDAx/mountInfo.json
|
||||
{"volume_type":"directvol","device":"/tmp/stor/rawdisk01.20g","fs_type":"ext4","metadata":{},"options":[]}
|
||||
{"volume-type":"directvol","device":"/tmp/stor/rawdisk01.20g","fstype":"ext4","metadata":{},"options":[]}
|
||||
```
|
||||
|
||||
#### Run a Kata container with direct block device volume
|
||||
@@ -76,7 +76,7 @@ $ sudo ctr run -t --rm --runtime io.containerd.kata.v2 --mount type=directvol,sr
|
||||
> **Tip:** It only supports `vfio-pci` based PCI device passthrough mode.
|
||||
|
||||
In this scenario, the device's host kernel driver will be replaced by `vfio-pci`, and IOMMU group ID generated.
|
||||
And either device's BDF or its VFIO IOMMU group ID in `/dev/vfio/` is fine for "device" in `mountinfo.json`.
|
||||
And either device's BDF or its VFIO IOMMU group ID in `/dev/vfio/` is fine for "device" in `mountInfo.json`.
|
||||
|
||||
```bash
|
||||
$ lspci -nn -k -s 45:00.1
|
||||
@@ -92,15 +92,15 @@ $ ls /sys/kernel/iommu_groups/110/devices/
|
||||
|
||||
#### setup VFIO device for kata-containers
|
||||
|
||||
First, configure the `mountinfo.json`, as below:
|
||||
First, configure the `mountInfo.json`, as below:
|
||||
|
||||
- (1) device with `BB:DD:F`
|
||||
|
||||
```json
|
||||
{
|
||||
"device": "45:00.1",
|
||||
"volume_type": "vfiovol",
|
||||
"fs_type": "ext4",
|
||||
"volume-type": "vfiovol",
|
||||
"fstype": "ext4",
|
||||
"metadata":"{}",
|
||||
"options": []
|
||||
}
|
||||
@@ -111,8 +111,8 @@ First, configure the `mountinfo.json`, as below:
|
||||
```json
|
||||
{
|
||||
"device": "0000:45:00.1",
|
||||
"volume_type": "vfiovol",
|
||||
"fs_type": "ext4",
|
||||
"volume-type": "vfiovol",
|
||||
"fstype": "ext4",
|
||||
"metadata":"{}",
|
||||
"options": []
|
||||
}
|
||||
@@ -123,8 +123,8 @@ First, configure the `mountinfo.json`, as below:
|
||||
```json
|
||||
{
|
||||
"device": "/dev/vfio/110",
|
||||
"volume_type": "vfiovol",
|
||||
"fs_type": "ext4",
|
||||
"volume-type": "vfiovol",
|
||||
"fstype": "ext4",
|
||||
"metadata":"{}",
|
||||
"options": []
|
||||
}
|
||||
@@ -133,10 +133,10 @@ First, configure the `mountinfo.json`, as below:
|
||||
Second, run kata-containers with device(`/dev/vfio/110`) as an example:
|
||||
|
||||
```bash
|
||||
$ sudo kata-ctl direct-volume add /kubelet/kata-vfio-vol-003/vfiovol003 "{\"device\": \"/dev/vfio/110\", \"volume_type\": \"vfiovol\", \"fs_type\": \"ext4\", \"metadata\":"{}", \"options\": []}"
|
||||
$ sudo kata-ctl direct-volume add /kubelet/kata-vfio-vol-003/vfiovol003 "{\"device\": \"/dev/vfio/110\", \"volume-type\": \"vfiovol\", \"fstype\": \"ext4\", \"metadata\":"{}", \"options\": []}"
|
||||
$ # /kubelet/kata-vfio-vol-003/directvol003 <==> /run/kata-containers/shared/direct-volumes/F0va22F0ZvaS12F0YS10a2F0DAxvbC0F0ZXvdm9sdF0Z0YSx
|
||||
$ cat F0va22F0ZvaS12F0YS10a2F0DAxvbC0F0ZXvdm9sdF0Z0YSx/mountInfo.json
|
||||
{"volume_type":"vfiovol","device":"/dev/vfio/110","fs_type":"ext4","metadata":{},"options":[]}
|
||||
{"volume-type":"vfiovol","device":"/dev/vfio/110","fstype":"ext4","metadata":{},"options":[]}
|
||||
```
|
||||
|
||||
#### Run a Kata container with VFIO block device based volume
|
||||
@@ -190,25 +190,25 @@ be passed to Hypervisor, such as Dragonball, Cloud-Hypervisor, Firecracker or QE
|
||||
|
||||
First, `mkdir` a sub-path `kubelet/kata-test-vol-001/` under `/run/kata-containers/shared/direct-volumes/`.
|
||||
|
||||
Second, fill fields in `mountinfo.json`, it looks like as below:
|
||||
Second, fill in the fields of `mountInfo.json` as shown below:
|
||||
```json
|
||||
{
|
||||
"device": "/tmp/vhu-targets/vhost-blk-rawdisk01.sock",
|
||||
"volume_type": "spdkvol",
|
||||
"fs_type": "ext4",
|
||||
"volume-type": "spdkvol",
|
||||
"fstype": "ext4",
|
||||
"metadata":"{}",
|
||||
"options": []
|
||||
}
|
||||
```
|
||||
|
||||
Third, with the help of `kata-ctl direct-volume` to add block device to generate `mountinfo.json`, and run a kata container with `--mount`.
|
||||
Third, use `kata-ctl direct-volume add` to add the block device and generate `mountInfo.json`, then run a Kata container with `--mount`.
|
||||
|
||||
```bash
|
||||
$ # kata-ctl direct-volume add
|
||||
$ sudo kata-ctl direct-volume add /kubelet/kata-test-vol-001/volume001 "{\"device\": \"/tmp/vhu-targets/vhost-blk-rawdisk01.sock\", \"volume_type\":\"spdkvol\", \"fs_type\": \"ext4\", \"metadata\":"{}", \"options\": []}"
|
||||
$ sudo kata-ctl direct-volume add /kubelet/kata-test-vol-001/volume001 "{\"device\": \"/tmp/vhu-targets/vhost-blk-rawdisk01.sock\", \"volume-type\":\"spdkvol\", \"fstype\": \"ext4\", \"metadata\":"{}", \"options\": []}"
|
||||
$ # /kubelet/kata-test-vol-001/volume001 <==> /run/kata-containers/shared/direct-volumes/L2t1YmVsZXQva2F0YS10ZXN0LXZvbC0wMDEvdm9sdW1lMDAx
|
||||
$ cat L2t1YmVsZXQva2F0YS10ZXN0LXZvbC0wMDEvdm9sdW1lMDAx/mountInfo.json
|
||||
$ {"volume_type":"spdkvol","device":"/tmp/vhu-targets/vhost-blk-rawdisk01.sock","fs_type":"ext4","metadata":{},"options":[]}
|
||||
$ {"volume-type":"spdkvol","device":"/tmp/vhu-targets/vhost-blk-rawdisk01.sock","fstype":"ext4","metadata":{},"options":[]}
|
||||
```
|
||||
|
||||
As `/run/kata-containers/shared/direct-volumes/` is a fixed path, we will be able to run a Kata pod with `--mount` and set
|
||||
|
||||
@@ -17,7 +17,7 @@ You must have a running Kubernetes cluster first. If not, [install a Kubernetes
|
||||
Also you should ensure that `kubectl` working correctly.
|
||||
|
||||
> **Note**: More information about Kubernetes integrations:
|
||||
> - [Run Kata Containers with Kubernetes](run-kata-with-k8s.md)
|
||||
> - [How to use Kata Containers and CRI-O with Kubernetes](how-to-use-k8s-with-crio-and-kata.md)
|
||||
> - [How to use Kata Containers and Containerd](containerd-kata.md)
|
||||
> - [How to use Kata Containers and containerd with Kubernetes](how-to-use-k8s-with-containerd-and-kata.md)
|
||||
|
||||
|
||||
@@ -46,6 +46,8 @@ There are several kinds of Kata configurations and they are listed below.
|
||||
| `io.katacontainers.config.hypervisor.block_device_cache_noflush` | `boolean` | Denotes whether flush requests for the device are ignored |
|
||||
| `io.katacontainers.config.hypervisor.block_device_cache_set` | `boolean` | cache-related options will be set to block devices or not |
|
||||
| `io.katacontainers.config.hypervisor.block_device_driver` | string | the driver to be used for block device, valid values are `virtio-blk`, `virtio-scsi`, `nvdimm`|
|
||||
| `io.katacontainers.config.hypervisor.blk_logical_sector_size` | uint32 | logical sector size in bytes reported by block devices to the guest (0 = hypervisor default, must be a power of 2 between 512 and 65536) |
|
||||
| `io.katacontainers.config.hypervisor.blk_physical_sector_size` | uint32 | physical sector size in bytes reported by block devices to the guest (0 = hypervisor default, must be a power of 2 between 512 and 65536) |
|
||||
| `io.katacontainers.config.hypervisor.cpu_features` | `string` | Comma-separated list of CPU features to pass to the CPU (QEMU) |
|
||||
| `io.katacontainers.config.hypervisor.default_max_vcpus` | uint32| the maximum number of vCPUs allocated for the VM by the hypervisor |
|
||||
| `io.katacontainers.config.hypervisor.default_memory` | uint32| the memory assigned for a VM by the hypervisor in `MiB` |
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# Run Kata Containers with Kubernetes
|
||||
# How to use Kata Containers and CRI-O with Kubernetes
|
||||
|
||||
## Prerequisites
|
||||
|
||||
This guide requires Kata Containers available on your system, install-able by following [this guide](../install/README.md).
|
||||
|
||||
## Install a CRI implementation
|
||||
@@ -9,22 +10,16 @@ Kubernetes CRI (Container Runtime Interface) implementations allow using any
|
||||
OCI-compatible runtime with Kubernetes, such as the Kata Containers runtime.
|
||||
|
||||
Kata Containers support both the [CRI-O](https://github.com/kubernetes-incubator/cri-o) and
|
||||
[containerd](https://github.com/containerd/containerd) CRI implementations.
|
||||
|
||||
After choosing one CRI implementation, you must make the appropriate configuration
|
||||
to ensure it integrates with Kata Containers.
|
||||
|
||||
Kata Containers 1.5 introduced the `shimv2` for containerd 1.2.0, reducing the components
|
||||
required to spawn pods and containers, and this is the preferred way to run Kata Containers with Kubernetes ([as documented here](../how-to/how-to-use-k8s-with-containerd-and-kata.md#configure-containerd-to-use-kata-containers)).
|
||||
|
||||
An equivalent shim implementation for CRI-O is planned.
|
||||
[containerd](https://github.com/containerd/containerd) CRI implementations. We choose `CRI-O` for our examples in this guide.
|
||||
|
||||
### CRI-O
|
||||
|
||||
For CRI-O installation instructions, refer to the [CRI-O Tutorial](https://github.com/cri-o/cri-o/blob/main/tutorial.md) page.
|
||||
|
||||
The following sections show how to set up the CRI-O snippet configuration file (default path: `/etc/crio/crio.conf`) for Kata.
|
||||
|
||||
Unless otherwise stated, all the following settings are specific to the `crio.runtime` table:
|
||||
|
||||
```toml
|
||||
# The "crio.runtime" table contains settings pertaining to the OCI
|
||||
# runtime used and options for how to set up and manage the OCI runtime.
|
||||
@@ -33,16 +28,17 @@ Unless otherwise stated, all the following settings are specific to the `crio.ru
|
||||
A comprehensive documentation of the configuration file can be found [here](https://github.com/cri-o/cri-o/blob/main/docs/crio.conf.5.md).
|
||||
|
||||
> **Note**: After any change to this file, the CRI-O daemon has to be restarted with:
|
||||
|
||||
>````
|
||||
>$ sudo systemctl restart crio
|
||||
>````
|
||||
|
||||
#### Kubernetes Runtime Class (CRI-O v1.12+)
|
||||
|
||||
The [Kubernetes Runtime Class](https://kubernetes.io/docs/concepts/containers/runtime-class/)
|
||||
is the preferred way of specifying the container runtime configuration to run a Pod's containers.
|
||||
To use this feature, Kata must be added as a runtime handler. This can be done by
|
||||
dropping a `50-kata` snippet file into `/etc/crio/crio.conf.d`, with the
|
||||
content shown below:
|
||||
To use this feature, Kata must be added as a runtime handler. This can be done by dropping a `50-kata`
|
||||
snippet file into `/etc/crio/crio.conf.d`, with the content shown below:
|
||||
|
||||
```toml
|
||||
[crio.runtime.runtimes.kata]
|
||||
@@ -52,13 +48,6 @@ content shown below:
|
||||
privileged_without_host_devices = true
|
||||
```
|
||||
|
||||
|
||||
### containerd
|
||||
|
||||
To customize containerd to select Kata Containers runtime, follow our
|
||||
"Configure containerd to use Kata Containers" internal documentation
|
||||
[here](../how-to/how-to-use-k8s-with-containerd-and-kata.md#configure-containerd-to-use-kata-containers).
|
||||
|
||||
## Install Kubernetes
|
||||
|
||||
Depending on what your needs are and what you expect to do with Kubernetes,
|
||||
@@ -72,25 +61,16 @@ implementation you chose, and the Kubelet service has to be updated accordingly.
|
||||
### Configure for CRI-O
|
||||
|
||||
`/etc/systemd/system/kubelet.service.d/0-crio.conf`
|
||||
|
||||
```
|
||||
[Service]
|
||||
Environment="KUBELET_EXTRA_ARGS=--container-runtime=remote --runtime-request-timeout=15m --container-runtime-endpoint=unix:///var/run/crio/crio.sock"
|
||||
```
|
||||
|
||||
### Configure for containerd
|
||||
|
||||
`/etc/systemd/system/kubelet.service.d/0-cri-containerd.conf`
|
||||
```
|
||||
[Service]
|
||||
Environment="KUBELET_EXTRA_ARGS=--container-runtime=remote --runtime-request-timeout=15m --container-runtime-endpoint=unix:///run/containerd/containerd.sock"
|
||||
```
|
||||
For more information about containerd see the "Configure Kubelet to use containerd"
|
||||
documentation [here](../how-to/how-to-use-k8s-with-containerd-and-kata.md#configure-kubelet-to-use-containerd).
|
||||
|
||||
## Run a Kubernetes pod with Kata Containers
|
||||
|
||||
After you update your Kubelet service based on the CRI implementation you
|
||||
are using, reload and restart Kubelet. Then, start your cluster:
|
||||
After you update your Kubelet service based on the CRI implementation you are using, reload and restart Kubelet. Then, start your cluster:
|
||||
|
||||
```bash
|
||||
$ sudo systemctl daemon-reload
|
||||
$ sudo systemctl restart kubelet
|
||||
@@ -98,12 +78,6 @@ $ sudo systemctl restart kubelet
|
||||
# If using CRI-O
|
||||
$ sudo kubeadm init --ignore-preflight-errors=all --cri-socket /var/run/crio/crio.sock --pod-network-cidr=10.244.0.0/16
|
||||
|
||||
# If using containerd
|
||||
$ cat <<EOF | tee kubeadm-config.yaml
|
||||
apiVersion: kubeadm.k8s.io/v1beta3
|
||||
kind: InitConfiguration
|
||||
nodeRegistration:
|
||||
criSocket: "/run/containerd/containerd.sock"
|
||||
---
|
||||
kind: KubeletConfiguration
|
||||
apiVersion: kubelet.config.k8s.io/v1beta1
|
||||
@@ -118,6 +92,7 @@ $ export KUBECONFIG=/etc/kubernetes/admin.conf
|
||||
### Allow pods to run in the control-plane node
|
||||
|
||||
By default, the cluster will not schedule pods in the control-plane node. To enable control-plane node scheduling:
|
||||
|
||||
```bash
|
||||
$ sudo -E kubectl taint nodes --all node-role.kubernetes.io/control-plane-
|
||||
```
|
||||
@@ -161,6 +136,7 @@ If a pod has the `runtimeClassName` set to `kata`, the CRI plugin runs the pod w
|
||||
```
|
||||
|
||||
- Create the pod
|
||||
|
||||
```bash
|
||||
$ sudo -E kubectl apply -f nginx-kata.yaml
|
||||
```
|
||||
@@ -172,6 +148,7 @@ If a pod has the `runtimeClassName` set to `kata`, the CRI plugin runs the pod w
|
||||
```
|
||||
|
||||
- Check hypervisor is running
|
||||
|
||||
```bash
|
||||
$ ps aux | grep qemu
|
||||
```
|
||||
159
docs/how-to/how-to-use-passthroughfd-io-within-runtime-rs.md
Normal file
159
docs/how-to/how-to-use-passthroughfd-io-within-runtime-rs.md
Normal file
@@ -0,0 +1,159 @@
|
||||
# How to Use Passthrough-FD IO within Runtime-rs and Dragonball
|
||||
|
||||
This document describes the Passthrough-FD (pass-fd) technology implemented in Kata Containers to optimize IO performance. By bypassing the intermediate proxy layers, this technology significantly reduces latency and CPU overhead for container IO streams.
|
||||
|
||||
## Important Limitation
|
||||
|
||||
Before diving into the technical details, please note the following restriction:
|
||||
|
||||
- Exclusive Support for Dragonball VMM: This feature is currently implemented only for Kata Containers' built-in VMM, Dragonball.
|
||||
- Unsupported VMMs: Other VMMs such as QEMU, Cloud Hypervisor, and Firecracker do not support this feature at this time.
|
||||
|
||||
## Overview
|
||||
|
||||
The original IO implementation in Kata Containers suffered from an excessively long data path, leading to poor efficiency. For instance, copying a 10GB file could take as long as 10 minutes.
|
||||
|
||||
To address this, Kata AC member @lifupan and @frezcirno introduced a series of optimizations using passthrough-fd technology. This approach allows the VMM to directly handle file descriptors (FDs), dramatically improving IO throughput.
|
||||
|
||||
## Traditional IO Path
|
||||
|
||||
Before the introduction of Passthrough-FD, Kata's IO streams were implemented using `ttrpc + virtio-vsock`.
|
||||
|
||||
The data flow was as follows:
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
subgraph Host ["Host"]
|
||||
direction LR
|
||||
Containerd["Containerd"]
|
||||
|
||||
subgraph KS ["kata-shim"]
|
||||
buffer(("buffer"))
|
||||
end
|
||||
|
||||
Vsock["vsock"]
|
||||
|
||||
subgraph VM ["vm"]
|
||||
Agent["kata-agent"]
|
||||
Container["container"]
|
||||
end
|
||||
end
|
||||
|
||||
Containerd -->|stdin| buffer
|
||||
buffer --> Vsock
|
||||
Vsock --> Agent
|
||||
Agent -.-> Container
|
||||
|
||||
%% Style Rendering
|
||||
style Host fill:#f0f8ff,stroke:#333,stroke-dasharray: 5 5
|
||||
style VM fill:#fff9c4,stroke:#e0e0e0
|
||||
style buffer fill:#c8e6c9,stroke:#ff9800,stroke-dasharray: 5 5
|
||||
style Vsock fill:#bbdefb,stroke:#2196f3
|
||||
style Containerd fill:#f5f5f5,stroke:#333
|
||||
style Agent fill:#fff,stroke:#333
|
||||
style Container fill:#fff,stroke:#333
|
||||
|
||||
```
|
||||
|
||||
The kata-shim (containerd-shim-kata-v2) on the Host opens the FIFO pipes provided by containerd via the shimv2 interface.
|
||||
This results in three FDs (stdin, stdout, and stderr).
|
||||
The kata-shim manages three separate threads to handle these streams.
|
||||
The Bottleneck: kata-shim acts as a "middleman," maintaining three internal buffers. It must read data from the FDs into its own buffers before forwarding them via ttrpc over vsock to the destination.
|
||||
This multi-threaded proxying and buffering in the shim layer introduced significant overhead.
|
||||
|
||||
|
||||
## What is Passthrough-FD?
|
||||
|
||||
Passthrough-FD technology enhances the Dragonball VMM's hybrid-vsock implementation with support for recv-fd.
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
subgraph Host ["Host"]
|
||||
direction LR
|
||||
Containerd["Containerd"]
|
||||
|
||||
Vsock["vsock"]
|
||||
|
||||
subgraph VM ["vm"]
|
||||
Agent["kata-agent"]
|
||||
Container["container"]
|
||||
end
|
||||
end
|
||||
|
||||
Containerd -->|stdin| Vsock
|
||||
Vsock --> Agent
|
||||
Agent -.-> Container
|
||||
|
||||
%% Style Rendering
|
||||
style Host fill:#f0f8ff,stroke:#333,stroke-dasharray: 5 5
|
||||
style VM fill:#fff9c4,stroke:#e0e0e0
|
||||
style Vsock fill:#bbdefb,stroke:#2196f3
|
||||
style Containerd fill:#f5f5f5,stroke:#333
|
||||
style Agent fill:#fff,stroke:#333
|
||||
style Container fill:#fff,stroke:#333
|
||||
```
|
||||
|
||||
Instead of requiring an intermediate layer to read and forward data, the hybrid-vsock module can now directly receive file descriptors from the Host. This allows the system to "pass through" the host's FDs directly to the kata-agent. By eliminating the proxying logic in kata-shim, the IO stream is effectively connected directly to the guest environment.
|
||||
|
||||
## Technical Details
|
||||
|
||||
The end-to-end process follows these steps:
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
|
||||
box rgb(220,235,255) Guest (VM)
|
||||
participant Agent as kata-agent<br/>(Server)
|
||||
participant VSOCK as AF_VSOCK socket<br/>(Hybrid Vsock)
|
||||
end
|
||||
|
||||
box rgb(255,240,220) Host
|
||||
participant Shim as kata-shim<br/>(Client)
|
||||
participant FIFO as File or FIFO<br/>(stdin/stdout/stderr)
|
||||
end
|
||||
|
||||
Note over Agent: Agent Initialization:<br/>listen() on passfd_listener_port
|
||||
|
||||
Shim->>FIFO: open() to acquire Fd<br/>(for stdin / stdout / stderr)
|
||||
|
||||
Shim->>VSOCK: connect() + send("passfd\n")<br/>+ send_with_fd(Fd, PortA)
|
||||
|
||||
Note over VSOCK,Agent: FD Transfer via Hybrid Vsock<br/>(repeat for stdin-port, stdout-port, stderr-port)
|
||||
|
||||
VSOCK->>Agent: forward connection + Fd + PortA
|
||||
|
||||
Agent->>Agent: accept() → get conn_fd + host-port<br/>save: map[host-port] = conn_fd<br/>(3 entries: stdin-port, stdout-port, stderr-port)
|
||||
|
||||
Shim->>Agent: create_container RPC<br/>(includes stdin-port, stdout-port, stderr-port)
|
||||
|
||||
Agent->>Agent: lookup map[stdin-port] → bind to container stdin<br/>lookup map[stdout-port] → bind to container stdout<br/>lookup map[stderr-port] → bind to container stderr
|
||||
|
||||
Agent-->>Shim: create_container RPC response (OK)
|
||||
```
|
||||
|
||||
1. Agent Initialization: The kata-agent starts a server listening on the port specified by passfd_listener_port.
|
||||
2. FD Transfer: During the container creation phase, the kata-shim sends the FDs for stdin, stdout, and stderr to the Dragonball hybrid-vsock module using the sendfd mechanism.
|
||||
3. Connection Establishment: Through hybrid-vsock, these FDs connect to the server started by the agent in Step 1.
|
||||
4. Identification: The agent's server calls accept() to obtain the connection FD and a corresponding host-port. It saves the connection using the host-port as a unique identifier. At this stage, the agent has three established connections (identified by stdin-port, stdout-port, and stderr-port).
|
||||
5. RPC Mapping: When kata-shim invokes the create_container RPC, it includes these three port identifiers in the request.
|
||||
6. Final Binding: Upon receiving the RPC, the agent retrieves the saved connections using the provided ports and binds them directly to the container's standard IO streams.
|
||||
|
||||
|
||||
## How to enable PassthroughFD IO within Configuration?
|
||||
|
||||
The Passthrough-FD feature is controlled by two main parameters in the Kata configuration file:
|
||||
|
||||
- use_passfd_io: A boolean flag to enable or disable the Passthrough-FD IO feature.
|
||||
- passfd_listener_port: Specifies the port on which the kata-agent listens for FD connections. The default value is 1027.
|
||||
To enable Passthrough-FD IO, set use_passfd_io to true in the configuration file:
|
||||
|
||||
```toml
|
||||
...
|
||||
# If enabled, the runtime will attempt to use fd passthrough feature for process io.
|
||||
# Note: this feature is only supported by the Dragonball hypervisor.
|
||||
use_passfd_io = true
|
||||
|
||||
# If fd passthrough io is enabled, the runtime will attempt to use the specified port instead of the default port.
|
||||
passfd_listener_port = 1027
|
||||
```
|
||||
@@ -73,5 +73,5 @@ See below example config:
|
||||
privileged_without_host_devices = true
|
||||
```
|
||||
|
||||
- [Kata Containers with CRI-O](../how-to/run-kata-with-k8s.md#cri-o)
|
||||
- [Kata Containers with CRI-O](../how-to/how-to-use-k8s-with-crio-and-kata.md#cri-o)
|
||||
|
||||
|
||||
@@ -18,6 +18,3 @@ artifacts required to run Kata Containers on Kubernetes.
|
||||
* [upgrading document](../Upgrading.md)
|
||||
* [developer guide](../Developer-Guide.md)
|
||||
* [runtime documentation](../../src/runtime/README.md)
|
||||
|
||||
## Kata Containers 3.0 rust runtime installation
|
||||
* [installation guide](../install/kata-containers-3.0-rust-runtime-installation-guide.md)
|
||||
|
||||
@@ -1,116 +0,0 @@
|
||||
# Kata Containers 3.0 rust runtime installation
|
||||
The following is an overview of the different installation methods available.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Kata Containers 3.0 rust runtime requires nested virtualization or bare metal. Check
|
||||
[hardware requirements](/src/runtime/README.md#hardware-requirements) to see if your system is capable of running Kata
|
||||
Containers.
|
||||
|
||||
### Platform support
|
||||
|
||||
Kata Containers 3.0 rust runtime currently runs on 64-bit systems supporting the following
|
||||
architectures:
|
||||
|
||||
> **Notes:**
|
||||
> For other architectures, see https://github.com/kata-containers/kata-containers/issues/4320
|
||||
|
||||
| Architecture | Virtualization technology |
|
||||
|-|-|
|
||||
| `x86_64`| [Intel](https://www.intel.com) VT-x |
|
||||
| `aarch64` ("`arm64`")| [ARM](https://www.arm.com) Hyp |
|
||||
|
||||
## Packaged installation methods
|
||||
|
||||
| Installation method | Description | Automatic updates | Use case | Availability
|
||||
|------------------------------------------------------|----------------------------------------------------------------------------------------------|-------------------|-----------------------------------------------------------------------------------------------|----------- |
|
||||
| [Using kata-deploy](#kata-deploy-installation) | The preferred way to deploy the Kata Containers distributed binaries on a Kubernetes cluster | **No!** | Best way to give it a try on kata-containers on an already up and running Kubernetes cluster. | Yes |
|
||||
| [Using official distro packages](#official-packages) | Kata packages provided by Linux distributions official repositories | yes | Recommended for most users. | No |
|
||||
| [Automatic](#automatic-installation) | Run a single command to install a full system | **No!** | For those wanting the latest release quickly. | No |
|
||||
| [Manual](#manual-installation) | Follow a guide step-by-step to install a working system | **No!** | For those who want the latest release with more control. | No |
|
||||
| [Build from source](#build-from-source-installation) | Build the software components manually | **No!** | Power users and developers only. | Yes |
|
||||
|
||||
### Kata Deploy Installation
|
||||
|
||||
Follow the [`kata-deploy`](../../tools/packaging/kata-deploy/helm-chart/README.md).
|
||||
### Official packages
|
||||
`ToDo`
|
||||
### Automatic Installation
|
||||
`ToDo`
|
||||
### Manual Installation
|
||||
`ToDo`
|
||||
|
||||
## Build from source installation
|
||||
|
||||
### Rust Environment Set Up
|
||||
|
||||
* Download `Rustup` and install `Rust`
|
||||
> **Notes:**
|
||||
> For Rust version, please set `RUST_VERSION` to the value of `languages.rust.meta.newest-version key` in [`versions.yaml`](../../versions.yaml) or, if `yq` is available on your system, run `export RUST_VERSION=$(yq read versions.yaml languages.rust.meta.newest-version)`.
|
||||
|
||||
Example for `x86_64`
|
||||
```
|
||||
$ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
||||
$ source $HOME/.cargo/env
|
||||
$ rustup install ${RUST_VERSION}
|
||||
$ rustup default ${RUST_VERSION}-x86_64-unknown-linux-gnu
|
||||
```
|
||||
|
||||
* Musl support for fully static binary
|
||||
|
||||
Example for `x86_64`
|
||||
```
|
||||
$ rustup target add x86_64-unknown-linux-musl
|
||||
```
|
||||
* [Musl `libc`](http://musl.libc.org/) install
|
||||
|
||||
Example for musl 1.2.3
|
||||
```
|
||||
$ curl -O https://git.musl-libc.org/cgit/musl/snapshot/musl-1.2.3.tar.gz
|
||||
$ tar vxf musl-1.2.3.tar.gz
|
||||
$ cd musl-1.2.3/
|
||||
$ ./configure --prefix=/usr/local/
|
||||
$ make && sudo make install
|
||||
```
|
||||
|
||||
|
||||
### Install Kata 3.0 Rust Runtime Shim
|
||||
|
||||
```
|
||||
$ git clone https://github.com/kata-containers/kata-containers.git
|
||||
$ cd kata-containers/src/runtime-rs
|
||||
$ make && sudo make install
|
||||
```
|
||||
After running the command above, the default config file `configuration.toml` will be installed under `/usr/share/defaults/kata-containers/`, the binary file `containerd-shim-kata-v2` will be installed under `/usr/local/bin/` .
|
||||
|
||||
### Install Shim Without Builtin Dragonball VMM
|
||||
|
||||
By default, runtime-rs includes the `Dragonball` VMM. To build without the built-in `Dragonball` hypervisor, use `make USE_BUILDIN_DB=false`:
|
||||
```bash
|
||||
$ cd kata-containers/src/runtime-rs
|
||||
$ make USE_BUILDIN_DB=false
|
||||
```
|
||||
After building, specify the desired hypervisor during installation using `HYPERVISOR`. For example, to use `qemu` or `cloud-hypervisor`:
|
||||
|
||||
```
|
||||
sudo make install HYPERVISOR=qemu
|
||||
```
|
||||
or
|
||||
```
|
||||
sudo make install HYPERVISOR=cloud-hypervisor
|
||||
```
|
||||
|
||||
### Build Kata Containers Kernel
|
||||
Follow the [Kernel installation guide](/tools/packaging/kernel/README.md).
|
||||
|
||||
### Build Kata Rootfs
|
||||
Follow the [Rootfs installation guide](../../tools/osbuilder/rootfs-builder/README.md).
|
||||
|
||||
### Build Kata Image
|
||||
Follow the [Image installation guide](../../tools/osbuilder/image-builder/README.md).
|
||||
|
||||
### Install Containerd
|
||||
|
||||
Follow the [Containerd installation guide](container-manager/containerd/containerd-install.md).
|
||||
|
||||
|
||||
@@ -1,102 +1,3 @@
|
||||
[workspace]
|
||||
members = ["rustjail", "policy", "vsock-exporter"]
|
||||
|
||||
[workspace.package]
|
||||
authors = ["The Kata Containers community <kata-dev@lists.katacontainers.io>"]
|
||||
edition = "2018"
|
||||
license = "Apache-2.0"
|
||||
rust-version = "1.88.0"
|
||||
|
||||
[workspace.dependencies]
|
||||
oci-spec = { version = "0.8.1", features = ["runtime"] }
|
||||
lazy_static = "1.3.0"
|
||||
ttrpc = { version = "0.8.4", features = ["async"], default-features = false }
|
||||
protobuf = "3.7.2"
|
||||
libc = "0.2.94"
|
||||
|
||||
# Notes:
|
||||
# - Needs to stay in sync with libs
|
||||
# - Upgrading to 0.27+ will require code changes (see #11842)
|
||||
nix = "0.26.4"
|
||||
|
||||
capctl = "0.2.0"
|
||||
scan_fmt = "0.2.6"
|
||||
scopeguard = "1.0.0"
|
||||
thiserror = "1.0.26"
|
||||
regex = "1.10.5"
|
||||
serial_test = "0.10.0"
|
||||
url = "2.5.0"
|
||||
derivative = "2.2.0"
|
||||
const_format = "0.2.30"
|
||||
|
||||
# Async helpers
|
||||
async-trait = "0.1.50"
|
||||
async-recursion = "0.3.2"
|
||||
futures = "0.3.30"
|
||||
|
||||
# Async runtime
|
||||
tokio = { version = "1.46.1", features = ["full"] }
|
||||
tokio-vsock = "0.3.4"
|
||||
|
||||
netlink-sys = { version = "0.7.0", features = ["tokio_socket"] }
|
||||
rtnetlink = "0.14.0"
|
||||
netlink-packet-route = "0.19.0"
|
||||
netlink-packet-core = "0.7.0"
|
||||
ipnetwork = "0.17.0"
|
||||
|
||||
|
||||
slog = "2.5.2"
|
||||
slog-scope = "4.1.2"
|
||||
slog-term = "2.9.0"
|
||||
|
||||
# Redirect ttrpc log calls
|
||||
slog-stdlog = "4.0.0"
|
||||
log = "0.4.11"
|
||||
|
||||
cfg-if = "1.0.0"
|
||||
prometheus = { version = "0.14.0", features = ["process"] }
|
||||
procfs = "0.12.0"
|
||||
|
||||
anyhow = "1"
|
||||
|
||||
cgroups = { package = "cgroups-rs", git = "https://github.com/kata-containers/cgroups-rs", rev = "v0.3.5" }
|
||||
|
||||
# Tracing
|
||||
tracing = "0.1.41"
|
||||
tracing-subscriber = "0.3.20"
|
||||
tracing-opentelemetry = "0.17.0"
|
||||
opentelemetry = { version = "0.17.0", features = ["rt-tokio"] }
|
||||
|
||||
# Configuration
|
||||
serde = { version = "1.0.129", features = ["derive"] }
|
||||
serde_json = "1.0.39"
|
||||
toml = "0.5.8"
|
||||
clap = { version = "4.5.40", features = ["derive"] }
|
||||
strum = "0.26.2"
|
||||
strum_macros = "0.26.2"
|
||||
|
||||
tempfile = "3.19.1"
|
||||
which = "4.3.0"
|
||||
rstest = "0.18.0"
|
||||
|
||||
# Local dependencies
|
||||
kata-agent-policy = { path = "policy" }
|
||||
rustjail = { path = "rustjail" }
|
||||
vsock-exporter = { path = "vsock-exporter" }
|
||||
|
||||
mem-agent = { path = "../libs/mem-agent" }
|
||||
|
||||
kata-sys-util = { path = "../libs/kata-sys-util" }
|
||||
kata-types = { path = "../libs/kata-types", features = ["safe-path"] }
|
||||
# Note: this crate sets the slog 'max_*' features which allows the log level
|
||||
# to be modified at runtime.
|
||||
logging = { path = "../libs/logging" }
|
||||
protocols = { path = "../libs/protocols" }
|
||||
runtime-spec = { path = "../libs/runtime-spec" }
|
||||
safe-path = { path = "../libs/safe-path" }
|
||||
test-utils = { path = "../libs/test-utils" }
|
||||
|
||||
|
||||
[package]
|
||||
name = "kata-agent"
|
||||
version = "0.1.0"
|
||||
@@ -156,7 +57,8 @@ cgroups.workspace = true
|
||||
# Tracing
|
||||
tracing.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
tracing-opentelemetry.workspace = true
|
||||
# TODO: bump tracing-opentelemetry to sync with version in workspace
|
||||
tracing-opentelemetry = "0.17.0"
|
||||
opentelemetry.workspace = true
|
||||
|
||||
# Configuration
|
||||
@@ -205,7 +107,3 @@ seccomp = ["rustjail/seccomp"]
|
||||
standard-oci-runtime = ["rustjail/standard-oci-runtime"]
|
||||
agent-policy = ["kata-agent-policy"]
|
||||
init-data = []
|
||||
|
||||
[[bin]]
|
||||
name = "kata-agent"
|
||||
path = "src/main.rs"
|
||||
|
||||
@@ -63,7 +63,7 @@ ifneq ($(EXTRA_RUSTFEATURES),)
|
||||
override EXTRA_RUSTFEATURES := --features "$(EXTRA_RUSTFEATURES)"
|
||||
endif
|
||||
|
||||
TARGET_PATH = target/$(TRIPLE)/$(BUILD_TYPE)/$(TARGET)
|
||||
TARGET_PATH = ../../target/$(TRIPLE)/$(BUILD_TYPE)/$(TARGET)
|
||||
|
||||
##VAR DESTDIR=<path> is a directory prepended to each installed target file
|
||||
DESTDIR ?=
|
||||
@@ -153,7 +153,7 @@ vendor:
|
||||
|
||||
#TARGET test: run cargo tests
|
||||
test: $(GENERATED_FILES)
|
||||
@RUST_LIB_BACKTRACE=0 RUST_BACKTRACE=1 cargo test --all --target $(TRIPLE) $(EXTRA_RUSTFEATURES) -- --nocapture
|
||||
@RUST_LIB_BACKTRACE=0 RUST_BACKTRACE=1 cargo test -p kata-agent --target $(TRIPLE) $(EXTRA_RUSTFEATURES) -- --nocapture
|
||||
|
||||
##TARGET check: run test
|
||||
check: $(GENERATED_FILES) standard_rust_check
|
||||
|
||||
@@ -1,13 +1,114 @@
|
||||
The `src/libs` directory hosts library crates which may be shared by multiple Kata Containers components
|
||||
or published to [`crates.io`](https://crates.io/index.html).
|
||||
# Kata Containers Library Crates
|
||||
|
||||
### Library Crates
|
||||
Currently it provides following library crates:
|
||||
The `src/libs` directory hosts library crates shared by multiple Kata Containers components. These libraries provide common utilities, data types, and protocol definitions to facilitate development and maintain consistency across the project.
|
||||
|
||||
## Library Crates
|
||||
|
||||
| Library | Description |
|
||||
|-|-|
|
||||
| [logging](logging/) | Facilities to setup logging subsystem based on slog. |
|
||||
| [system utilities](kata-sys-util/) | Collection of facilities and helpers to access system services. |
|
||||
| [types](kata-types/) | Collection of constants and data types shared by multiple Kata Containers components. |
|
||||
| [safe-path](safe-path/) | Utilities to safely resolve filesystem paths. |
|
||||
| [test utilities](test-utils/) | Utilities to share test code. |
|
||||
|---------|-------------|
|
||||
| [kata-types](kata-types/) | Constants, data types, and configuration structures shared by Kata Containers components |
|
||||
| [kata-sys-util](kata-sys-util/) | System utilities: CPU, device, filesystem, hooks, K8s, mount, netns, NUMA, PCI, protection, spec validation |
|
||||
| [protocols](protocols/) | ttrpc protocol definitions for agent, health, remote, CSI, OCI, confidential data hub |
|
||||
| [runtime-spec](runtime-spec/) | OCI runtime spec data structures and constants |
|
||||
| [shim-interface](shim-interface/) | Shim management interface with RESTful API over Unix domain socket |
|
||||
| [logging](logging/) | Slog-based logging with JSON output and systemd journal support |
|
||||
| [safe-path](safe-path/) | Safe path resolution to prevent symlink and TOCTOU attacks |
|
||||
| [mem-agent](mem-agent/) | Memory management agent: memcg, compact, PSI monitoring |
|
||||
| [test-utils](test-utils/) | Test macros for root/non-root privileges and KVM accessibility |
|
||||
|
||||
## Details
|
||||
|
||||
### kata-types
|
||||
|
||||
Core types and configurations including:
|
||||
|
||||
- Annotations for CRI-containerd, CRI-O, dockershim
|
||||
- Hypervisor configurations (QEMU, Cloud Hypervisor, Firecracker, Dragonball)
|
||||
- Agent and runtime configurations
|
||||
- Kubernetes-specific utilities
|
||||
|
||||
### kata-sys-util
|
||||
|
||||
System-level utilities:
|
||||
|
||||
- `cpu`: CPU information and affinity
|
||||
- `device`: Device management
|
||||
- `fs`: Filesystem operations
|
||||
- `hooks`: Hook execution
|
||||
- `k8s`: Kubernetes utilities
|
||||
- `mount`: Mount operations
|
||||
- `netns`: Network namespace handling
|
||||
- `numa`: NUMA topology
|
||||
- `pcilibs`: PCI device access
|
||||
- `protection`: Hardware protection features
|
||||
- `spec`: OCI spec loading
|
||||
- `validate`: Input validation
|
||||
|
||||
### protocols
|
||||
|
||||
Generated ttrpc protocol bindings:
|
||||
|
||||
- `agent`: Kata agent API
|
||||
- `health`: Health check service
|
||||
- `remote`: Remote hypervisor API
|
||||
- `csi`: Container storage interface
|
||||
- `oci`: OCI specifications
|
||||
- `confidential_data_hub`: Confidential computing support
|
||||
|
||||
Features: `async` for async ttrpc, `with-serde` for serde support.
|
||||
|
||||
### runtime-spec
|
||||
|
||||
OCI runtime specification types:
|
||||
|
||||
- `ContainerState`: Creating, Created, Running, Stopped, Paused
|
||||
- `State`: Container state with version, id, status, pid, bundle, annotations
|
||||
- Namespace constants: pid, network, mount, ipc, user, uts, cgroup
|
||||
|
||||
### shim-interface
|
||||
|
||||
Shim management service interface:
|
||||
|
||||
- RESTful API over Unix domain socket (`/run/kata/<sid>/shim-monitor.sock`)
|
||||
- `MgmtClient` for HTTP requests to shim management server
|
||||
- Sandbox ID resolution with prefix matching
|
||||
|
||||
### logging
|
||||
|
||||
Slog-based logging framework:
|
||||
|
||||
- JSON output to file or stdout
|
||||
- systemd journal support
|
||||
- Runtime log level filtering per component/subsystem
|
||||
- Async drain for thread safety
|
||||
|
||||
### safe-path
|
||||
|
||||
Secure filesystem path handling:
|
||||
|
||||
- `scoped_join()`: Safely join paths under a root directory
|
||||
- `scoped_resolve()`: Resolve paths constrained by root
|
||||
- `PinnedPathBuf`: TOCTOU-safe path reference
|
||||
- `ScopedDirBuilder`: Safe directory creation
|
||||
|
||||
### mem-agent
|
||||
|
||||
Memory management for containers:
|
||||
|
||||
- `memcg`: Memory cgroup configuration and monitoring
|
||||
- `compact`: Memory compaction control
|
||||
- `psi`: Pressure stall information monitoring
|
||||
- Async runtime with configurable policies
|
||||
|
||||
### test-utils
|
||||
|
||||
Testing utilities:
|
||||
|
||||
- `skip_if_root!`: Skip test if running as root
|
||||
- `skip_if_not_root!`: Skip test if not running as root
|
||||
- `skip_if_kvm_unaccessable!`: Skip test if KVM is unavailable
|
||||
- `assert_result!`: Assert expected vs actual results
|
||||
|
||||
## License
|
||||
|
||||
All crates are licensed under Apache-2.0.
|
||||
|
||||
@@ -1,16 +1,100 @@
|
||||
# `kata-sys-util`
|
||||
|
||||
This crate is a collection of utilities and helpers for
|
||||
[Kata Containers](https://github.com/kata-containers/kata-containers/) components to access system services.
|
||||
System utilities and helpers for [Kata Containers](https://github.com/kata-containers/kata-containers/) components to access Linux system services.
|
||||
|
||||
It provides safe wrappers over system services, such as:
|
||||
- file systems
|
||||
- mount
|
||||
- NUMA
|
||||
## Overview
|
||||
|
||||
## Support
|
||||
This crate provides safe wrappers and utility functions for interacting with various Linux system services and kernel interfaces. It is designed specifically for the Kata Containers ecosystem.
|
||||
|
||||
## Features
|
||||
|
||||
### File System Operations (`fs`)
|
||||
|
||||
- Path canonicalization and basename extraction
|
||||
- Filesystem type detection (FUSE, OverlayFS)
|
||||
- Symlink detection
|
||||
- Reflink copy with fallback to regular copy
|
||||
|
||||
### Mount Operations (`mount`)
|
||||
|
||||
- Bind mount and remount operations
|
||||
- Mount propagation type management (SHARED, PRIVATE, SLAVE, UNBINDABLE)
|
||||
- Overlay filesystem mount option compression
|
||||
- Safe mount destination creation
|
||||
- Umount with timeout support
|
||||
- `/proc/mounts` parsing utilities
|
||||
|
||||
### CPU Utilities (`cpu`)
|
||||
|
||||
- CPU information parsing from `/proc/cpuinfo`
|
||||
- CPU flags detection and validation
|
||||
- Architecture-specific support (x86_64, s390x)
|
||||
|
||||
### NUMA Support (`numa`)
|
||||
|
||||
- CPU to NUMA node mapping
|
||||
- NUMA node information retrieval from sysfs
|
||||
- NUMA CPU validation
|
||||
|
||||
### Device Management (`device`)
|
||||
|
||||
- Block device major/minor number detection
|
||||
- Device ID resolution for cgroup operations
|
||||
|
||||
### Kubernetes Support (`k8s`)
|
||||
|
||||
- Ephemeral volume detection
|
||||
- EmptyDir volume handling
|
||||
- Kubernetes-specific mount type identification
|
||||
|
||||
### Network Namespace (`netns`)
|
||||
|
||||
- Network namespace switching with RAII guard pattern
|
||||
- Network namespace name generation
|
||||
|
||||
### OCI Specification Utilities (`spec`)
|
||||
|
||||
- Container type detection (PodSandbox, PodContainer)
|
||||
- Sandbox ID extraction from OCI annotations
|
||||
- OCI spec loading utilities
|
||||
|
||||
### Validation (`validate`)
|
||||
|
||||
- Container/exec ID validation
|
||||
- Environment variable validation
|
||||
|
||||
### Hooks (`hooks`)
|
||||
|
||||
- OCI hook execution and management
|
||||
- Hook state tracking
|
||||
- Timeout handling for hook execution
|
||||
|
||||
### Guest Protection (`protection`)
|
||||
|
||||
- Confidential computing detection (TDX, SEV, SNP, PEF, SE, ARM CCA , etc.)
|
||||
- Architecture-specific protection checking (x86_64, s390x, aarch64, powerpc64)
|
||||
|
||||
### Random Generation (`rand`)
|
||||
|
||||
- Secure random byte generation
|
||||
- UUID generation
|
||||
|
||||
### PCI Device Management (`pcilibs`)
|
||||
|
||||
- PCI device enumeration and management
|
||||
- PCI configuration space access
|
||||
- Memory resource allocation for PCI devices
|
||||
|
||||
## Supported Architectures
|
||||
|
||||
- x86_64
|
||||
- aarch64
|
||||
- s390x
|
||||
- powerpc64 (little-endian)
|
||||
- riscv64
|
||||
|
||||
## Supported Operating Systems
|
||||
|
||||
**Operating Systems**:
|
||||
- Linux
|
||||
|
||||
## License
|
||||
|
||||
@@ -177,7 +177,7 @@ pub fn get_linux_mount_info(mount_point: &str) -> Result<LinuxMountInfo> {
|
||||
///
|
||||
/// To ensure security, the `create_mount_destination()` function takes an extra parameter `root`,
|
||||
/// which is used to ensure that `dst` is within the specified directory. And a safe version of
|
||||
/// `PathBuf` is returned to avoid TOCTTOU type of flaws.
|
||||
/// `PathBuf` is returned to avoid TOCTOU type of flaws.
|
||||
pub fn create_mount_destination<S: AsRef<Path>, D: AsRef<Path>, R: AsRef<Path>>(
|
||||
src: S,
|
||||
dst: D,
|
||||
|
||||
@@ -1,18 +1,53 @@
|
||||
# kata-types
|
||||
|
||||
This crate is a collection of constants and data types shared by multiple
|
||||
[Kata Containers](https://github.com/kata-containers/kata-containers/) components.
|
||||
Constants and data types shared by Kata Containers components.
|
||||
|
||||
It defines constants and data types used by multiple Kata Containers components. Those constants
|
||||
and data types may be defined by Kata Containers or by other projects/specifications, such as:
|
||||
## Overview
|
||||
|
||||
This crate provides common constants, data types, and configuration structures used across multiple [Kata Containers](https://github.com/kata-containers/kata-containers/) components. It includes definitions from:
|
||||
|
||||
- Kata Containers project
|
||||
- [Containerd](https://github.com/containerd/containerd)
|
||||
- [Kubelet](https://github.com/kubernetes/kubelet)
|
||||
- [Kubelet](https://github.com/kubernetes/kubernetes)
|
||||
|
||||
## Support
|
||||
## Modules
|
||||
|
||||
**Operating Systems**:
|
||||
- Linux
|
||||
| Module | Description |
|
||||
|--------|-------------|
|
||||
| `annotations` | Annotation keys for CRI-containerd, CRI-O, dockershim, and third-party integrations |
|
||||
| `capabilities` | Hypervisor capability flags (block device, multi-queue, filesystem sharing, etc.) |
|
||||
| `config` | Configuration structures for agent, hypervisor (QEMU, Cloud Hypervisor, Firecracker, Dragonball), and runtime |
|
||||
| `container` | Container-related constants and types |
|
||||
| `cpu` | CPU resource management types |
|
||||
| `device` | Device-related definitions |
|
||||
| `fs` | Filesystem constants |
|
||||
| `handler` | Handler-related types |
|
||||
| `initdata` | Initdata specification for TEE data injection |
|
||||
| `k8s` | Kubernetes-specific paths and utilities (empty-dir, configmap, secret, projected volumes) |
|
||||
| `machine_type` | Machine type definitions |
|
||||
| `mount` | Mount point structures and validation |
|
||||
| `rootless` | Rootless VMM support utilities |
|
||||
|
||||
## Configuration
|
||||
|
||||
The `config` module supports:
|
||||
|
||||
- TOML-based configuration loading
|
||||
- Drop-in configuration files
|
||||
- Hypervisor-specific configurations (QEMU, Cloud Hypervisor, Firecracker, Dragonball, Remote)
|
||||
- Agent configuration
|
||||
- Runtime configuration
|
||||
- Shared mount definitions
|
||||
|
||||
## Features
|
||||
|
||||
- `enable-vendor`: Enable vendor-specific extensions
|
||||
- `safe-path`: Enable safe path resolution (platform-specific)
|
||||
|
||||
## Platform Support
|
||||
|
||||
- **Linux**: Fully supported
|
||||
|
||||
## License
|
||||
|
||||
This code is licensed under [Apache-2.0](../../../LICENSE).
|
||||
Apache-2.0 - See [LICENSE](../../../LICENSE)
|
||||
|
||||
@@ -26,7 +26,7 @@
|
||||
//! information and ensure data out of the container rootfs directory won't be affected
|
||||
//! by the container. There are several types of attacks related to container mount namespace:
|
||||
//! - symlink based attack
|
||||
//! - Time of check to time of use (TOCTTOU)
|
||||
//! - Time of check to time of use (TOCTOU)
|
||||
//!
|
||||
//! This crate provides several mechanisms for container runtimes to safely handle filesystem paths
|
||||
//! when preparing mount namespace for containers.
|
||||
@@ -35,13 +35,13 @@
|
||||
//! - [scoped_resolve()](crate::scoped_resolve()): resolve `unsafe_path` to a relative path,
|
||||
//! rooted at and constrained by `root`.
|
||||
//! - [struct PinnedPathBuf](crate::PinnedPathBuf): safe version of `PathBuf` to protect from
|
||||
//! TOCTTOU style of attacks, which ensures:
|
||||
//! TOCTOU style of attacks, which ensures:
|
||||
//! - the value of [`PinnedPathBuf::as_path()`] never changes.
|
||||
//! - the path returned by [`PinnedPathBuf::as_path()`] is always a symlink.
|
||||
//! - the filesystem object referenced by the symlink [`PinnedPathBuf::as_path()`] never changes.
|
||||
//! - the value of [`PinnedPathBuf::target()`] never changes.
|
||||
//! - [struct ScopedDirBuilder](crate::ScopedDirBuilder): safe version of `DirBuilder` to protect
|
||||
//! from symlink race and TOCTTOU style of attacks, which enhances security by:
|
||||
//! from symlink race and TOCTOU style of attacks, which enhances security by:
|
||||
//! - ensuring the new directories are created under a specified `root` directory.
|
||||
//! - avoiding symlink race attacks during making directories.
|
||||
//! - returning a [PinnedPathBuf] for the last level of directory, so it could be used for other
|
||||
|
||||
@@ -15,7 +15,7 @@ use std::path::{Component, Path, PathBuf};
|
||||
use crate::scoped_join;
|
||||
|
||||
/// A safe version of [`PathBuf`] pinned to an underlying filesystem object to protect from
|
||||
/// `TOCTTOU` style of attacks.
|
||||
/// `TOCTOU` style of attacks.
|
||||
///
|
||||
/// A [`PinnedPathBuf`] is a resolved path buffer pinned to an underlying filesystem object, which
|
||||
/// guarantees:
|
||||
|
||||
@@ -117,7 +117,7 @@ pub fn scoped_resolve<R: AsRef<Path>, U: AsRef<Path>>(root: R, unsafe_path: U) -
|
||||
/// Note that the guarantees provided by this function only apply if the path components in the
|
||||
/// returned string are not modified (in other words are not replaced with symlinks on the
|
||||
/// filesystem) after this function has returned. You may use [crate::PinnedPathBuf] to protect
|
||||
/// from such TOCTTOU attacks.
|
||||
/// from such TOCTOU attacks.
|
||||
pub fn scoped_join<R: AsRef<Path>, U: AsRef<Path>>(root: R, unsafe_path: U) -> Result<PathBuf> {
|
||||
do_scoped_resolve(root, unsafe_path).map(|(root, path)| root.join(path))
|
||||
}
|
||||
|
||||
@@ -99,11 +99,11 @@ HYPERVISOR_REMOTE = remote
|
||||
ARCH_SUPPORT_DB := x86_64 aarch64
|
||||
ifneq ($(filter $(ARCH),$(ARCH_SUPPORT_DB)),)
|
||||
# When set to true, builds the built-in Dragonball hypervisor
|
||||
USE_BUILDIN_DB := true
|
||||
USE_BUILTIN_DB := true
|
||||
else
|
||||
USE_BUILDIN_DB := false
|
||||
USE_BUILTIN_DB := false
|
||||
$(info Dragonball does not support ARCH $(ARCH), disabled. \
|
||||
Specify "USE_BUILDIN_DB=true" to force enable.)
|
||||
Specify "USE_BUILTIN_DB=true" to force enable.)
|
||||
endif
|
||||
|
||||
HYPERVISOR ?= $(HYPERVISOR_DB)
|
||||
@@ -483,7 +483,7 @@ USER_VARS += CONFIG_REMOTE_IN
|
||||
USER_VARS += CONFIG_QEMU_COCO_DEV_IN
|
||||
USER_VARS += DESTDIR
|
||||
USER_VARS += HYPERVISOR
|
||||
USER_VARS += USE_BUILDIN_DB
|
||||
USER_VARS += USE_BUILTIN_DB
|
||||
USER_VARS += DBCMD
|
||||
USER_VARS += DBCTLCMD
|
||||
USER_VARS += FCCTLCMD
|
||||
@@ -651,7 +651,7 @@ COMMIT_MSG = $(if $(COMMIT),$(COMMIT),unknown)
|
||||
EXTRA_RUSTFEATURES :=
|
||||
|
||||
# if use dragonball hypervisor, add the feature to build dragonball in runtime
|
||||
ifeq ($(USE_BUILDIN_DB),true)
|
||||
ifeq ($(USE_BUILTIN_DB),true)
|
||||
EXTRA_RUSTFEATURES += dragonball
|
||||
endif
|
||||
|
||||
|
||||
@@ -1,131 +1,179 @@
|
||||
# runtime-rs
|
||||
|
||||
## Wath's runtime-rs
|
||||
## What is runtime-rs
|
||||
|
||||
`runtime-rs` is a new component introduced in Kata Containers 3.0, it is a Rust version of runtime(shim). It like [runtime](../runtime), but they have many difference:
|
||||
`runtime-rs` is a core component of Kata Containers 4.0. It is a high-performance, Rust-based implementation of the containerd shim v2 runtime.
|
||||
|
||||
- `runtime-rs` is written in Rust, and `runtime` is written in Go.
|
||||
- `runtime` is the default shim in Kata Containers 3.0, `runtime-rs` is still under heavy development.
|
||||
- `runtime-rs` has a completed different architecture than `runtime`, you can check at the [architecture overview](../../docs/design/architecture_3.0).
|
||||
Key characteristics:
|
||||
|
||||
**Note**:
|
||||
- **Implementation Language**: Rust, leveraging memory safety and zero-cost abstractions
|
||||
- **Project Maturity**: Production-ready component of Kata Containers 4.0
|
||||
- **Architectural Design**: Modular framework optimized for Kata Containers 4.0
|
||||
|
||||
`runtime-rs` is still under heavy development, you should avoid using it in critical system.
|
||||
For architecture details, see [Architecture Overview](../../docs/design/architecture_4.0).
|
||||
|
||||
## Architecture overview
|
||||
## Architecture Overview
|
||||
|
||||
Also, `runtime-rs` provides the following features:
|
||||
Key features:
|
||||
|
||||
- Turn key solution with builtin `Dragonball` Sandbox, all components in one process
|
||||
- Async I/O to reduce resource consumption
|
||||
- Extensible framework for multiple services, runtimes and hypervisors
|
||||
- Lifecycle management for sandbox and container associated resources
|
||||
|
||||
See the [architecture overview](../../docs/design/architecture_3.0)
|
||||
for details on the `runtime-rs` design.
|
||||
|
||||
`runtime-rs` is a runtime written in Rust, it is composed of several crates.
|
||||
|
||||
This picture shows the overview about the crates under this directory and the relation between crates.
|
||||
- **Built-in VMM (Dragonball)**: Deeply integrated into shim lifecycle, eliminating IPC overhead for peak performance
|
||||
- **Asynchronous I/O**: Tokio-based async runtime for high-concurrency with reduced thread footprint
|
||||
- **Extensible Framework**: Pluggable hypervisors, network interfaces, and storage backends
|
||||
- **Resource Lifecycle Management**: Comprehensive sandbox and container resource management
|
||||
|
||||

|
||||
|
||||
Not all the features have been implemented yet, for details please check the [roadmap](../../docs/design/architecture_3.0/README.md#roadmap).
|
||||
|
||||
## Crates
|
||||
|
||||
The `runtime-rs` directory contains some crates in the crates directory that compose the `containerd-shim-kata-v2`.
|
||||
|
||||
| Crate | Description |
|
||||
|-|-|
|
||||
| [`shim`](crates/shim)| containerd shimv2 implementation |
|
||||
| [`service`](crates/service)| services for containers, includes task service |
|
||||
| [`runtimes`](crates/runtimes)| container runtimes |
|
||||
| [`resource`](crates/resource)| sandbox and container resources |
|
||||
| [`hypervisor`](crates/hypervisor)| hypervisor that act as a sandbox |
|
||||
| [`agent`](crates/agent)| library used to communicate with agent in the guest OS |
|
||||
| [`persist`](crates/persist)| persist container state to disk |
|
||||
|-------|-------------|
|
||||
| [`shim`](crates/shim) | Containerd shim v2 entry point (start, delete, run commands) |
|
||||
| [`service`](crates/service) | Services including TaskService for containerd shim protocol |
|
||||
| [`runtimes`](crates/runtimes) | Runtime handlers: VirtContainer (default), LinuxContainer(experimental), WasmContainer(experimental) |
|
||||
| [`resource`](crates/resource) | Resource management: network, share_fs, rootfs, volume, cgroups, cpu_mem |
|
||||
| [`hypervisor`](crates/hypervisor) | Hypervisor implementations |
|
||||
| [`agent`](crates/agent) | Guest agent communication (KataAgent) |
|
||||
| [`persist`](crates/persist) | State persistence to disk (JSON format) |
|
||||
| [`shim-ctl`](crates/shim-ctl) | Development tool for testing shim without containerd |
|
||||
|
||||
### shim
|
||||
|
||||
`shim` is the entry point of the containerd shim process, it implements containerd shim's [binary protocol](https://github.com/containerd/containerd/tree/v1.6.8/runtime/v2#commands):
|
||||
Entry point implementing [containerd shim v2 binary protocol](https://github.com/containerd/containerd/tree/main/runtime/v2#commands):
|
||||
|
||||
- start: start a new shim process
|
||||
- delete: delete exist a shim process
|
||||
- run: run ttRPC service in shim
|
||||
|
||||
containerd will launch a shim process and the shim process will serve as a ttRPC server to provide shim service through `TaskService` from `service` crate.
|
||||
- `start`: Start new shim process
|
||||
- `delete`: Delete existing shim process
|
||||
- `run`: Run ttRPC service
|
||||
|
||||
### service
|
||||
|
||||
The `runtime-rs` has an extensible framework, includes extension of services, runtimes, and hypervisors.
|
||||
|
||||
Currently, only containerd compatible `TaskService` is implemented.
|
||||
|
||||
`TaskService` has implemented the [containerd shim protocol](https://docs.rs/containerd-shim-protos/0.2.0/containerd_shim_protos/),
|
||||
and interacts with runtimes through messages.
|
||||
Extensible service framework. Currently implements `TaskService` conforming to [containerd shim protocol](https://docs.rs/containerd-shim-protos/).
|
||||
|
||||
### runtimes
|
||||
|
||||
Runtime is a container runtime, the runtime handler handles messages from task services to manage containers.
|
||||
Runtime handler and Runtime instance is used to deal with the operation for sandbox and container.
|
||||
Runtime handlers manage sandbox and container operations:
|
||||
|
||||
Currently, only `VirtContainer` has been implemented.
|
||||
| Handler | Feature Flag | Description |
|
||||
|---------|--------------|-------------|
|
||||
| `VirtContainer` | `virt` (default) | Virtual machine-based containers |
|
||||
| `LinuxContainer` | `linux` | Linux container runtime (experimental) |
|
||||
| `WasmContainer` | `wasm` | WebAssembly runtime (experimental) |
|
||||
|
||||
### resource
|
||||
|
||||
In `runtime-rs`, all networks/volumes/rootfs are abstracted as resources.
|
||||
All resources abstracted uniformly:
|
||||
|
||||
Resources are classified into two types:
|
||||
- **Sandbox resources**: network, share-fs
|
||||
- **Container resources**: rootfs, volume, cgroup
|
||||
|
||||
- sandbox resources: network, share-fs
|
||||
- container resources: rootfs, volume, cgroup
|
||||
|
||||
[Here](../../docs/design/architecture_3.0/README.md#resource-manager) is a detailed description of the resources.
|
||||
Sub-modules: `cpu_mem`, `cdi_devices`, `coco_data`, `network`, `share_fs`, `rootfs`, `volume`
|
||||
|
||||
### hypervisor
|
||||
|
||||
For `VirtContainer`, there will be more hypervisors to choose.
|
||||
Supported hypervisors:
|
||||
|
||||
Currently, built-in `Dragonball` has been implemented. We have also added initial support for `cloud-hypervisor` with CI being added next.
|
||||
| Hypervisor | Mode | Description |
|
||||
|------------|------|-------------|
|
||||
| Dragonball | Built-in | Integrated VMM for peak performance (default) |
|
||||
| QEMU | External | Full-featured emulator |
|
||||
| Cloud Hypervisor | External | Modern VMM (x86_64, aarch64) |
|
||||
| Firecracker | External | Lightweight microVM |
|
||||
| Remote | External | Remote hypervisor |
|
||||
|
||||
The built-in VMM mode (Dragonball) is recommended for production, offering superior performance by eliminating IPC overhead.
|
||||
|
||||
### agent
|
||||
|
||||
`agent` is used to communicate with agent in the guest OS from the shim side. The only supported agent is `KataAgent`.
|
||||
Communication with guest OS agent via ttRPC. Supports `KataAgent` for full container lifecycle management.
|
||||
|
||||
### persist
|
||||
|
||||
Persist defines traits and functions to help different components save state to disk and load state from disk.
|
||||
State serialization to disk for sandbox recovery after restart. Stores `state.json` under `/run/kata/<sandbox-id>/`.
|
||||
|
||||
### helper libraries
|
||||
## Build from Source and Install
|
||||
|
||||
Some helper libraries are maintained in [the library directory](../libs) so that they can be shared with other rust components.
|
||||
### Prerequisites
|
||||
|
||||
## Build and install
|
||||
Download `Rustup` and install Rust. For Rust version, see `languages.rust.meta.newest-version` in [`versions.yaml`](../../versions.yaml).
|
||||
|
||||
See the
|
||||
[build from the source section of the rust runtime installation guide](../../docs/install/kata-containers-3.0-rust-runtime-installation-guide.md#build-from-source-installation).
|
||||
Example for `x86_64`:
|
||||
|
||||
```bash
|
||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
||||
source $HOME/.cargo/env
|
||||
rustup install ${RUST_VERSION}
|
||||
rustup default ${RUST_VERSION}-x86_64-unknown-linux-gnu
|
||||
```
|
||||
|
||||
### Musl Support (Optional)
|
||||
|
||||
For fully static binary:
|
||||
|
||||
```bash
|
||||
# Add musl target
|
||||
rustup target add x86_64-unknown-linux-musl
|
||||
|
||||
# Install musl libc (example: musl 1.2.3)
|
||||
curl -O https://git.musl-libc.org/cgit/musl/snapshot/musl-1.2.3.tar.gz
|
||||
tar vxf musl-1.2.3.tar.gz
|
||||
cd musl-1.2.3/
|
||||
./configure --prefix=/usr/local/
|
||||
make && sudo make install
|
||||
```
|
||||
|
||||
### Install Kata 4.0 Rust Runtime Shim
|
||||
|
||||
```bash
|
||||
git clone https://github.com/kata-containers/kata-containers.git
|
||||
cd kata-containers/src/runtime-rs
|
||||
make && sudo make install
|
||||
```
|
||||
|
||||
After installation:
|
||||
- Config file: `/usr/share/defaults/kata-containers/configuration.toml`
|
||||
- Binary: `/usr/local/bin/containerd-shim-kata-v2`
|
||||
|
||||
### Install Without Built-in Dragonball VMM
|
||||
|
||||
To build without the built-in Dragonball hypervisor:
|
||||
|
||||
```bash
|
||||
make USE_BUILTIN_DB=false
|
||||
```
|
||||
|
||||
Specify hypervisor during installation:
|
||||
|
||||
```bash
|
||||
sudo make install HYPERVISOR=qemu
|
||||
# or
|
||||
sudo make install HYPERVISOR=cloud-hypervisor
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
`runtime-rs` has the same [configuration as `runtime`](../runtime/README.md#configuration) with some [limitations](#limitations).
|
||||
Configuration files in `config/`:
|
||||
|
||||
| Config File | Hypervisor | Notes |
|
||||
|-------------|------------|-------|
|
||||
| `configuration-dragonball.toml.in` | Dragonball | Built-in VMM |
|
||||
| `configuration-qemu-runtime-rs.toml.in` | QEMU | Default external |
|
||||
| `configuration-cloud-hypervisor.toml.in` | Cloud Hypervisor | Modern VMM |
|
||||
| `configuration-rs-fc.toml.in` | Firecracker | Lightweight microVM |
|
||||
| `configuration-remote.toml.in` | Remote | Remote hypervisor |
|
||||
| `configuration-qemu-tdx-runtime-rs.toml.in` | QEMU + TDX | Intel TDX confidential computing |
|
||||
| `configuration-qemu-snp-runtime-rs.toml.in` | QEMU + SEV-SNP | AMD SEV-SNP confidential computing |
|
||||
| `configuration-qemu-se-runtime-rs.toml.in` | QEMU + SEV | AMD SEV confidential computing |
|
||||
| `configuration-qemu-coco-dev-runtime-rs.toml.in` | QEMU + CoCo | CoCo development |
|
||||
|
||||
See [runtime configuration](../runtime/README.md#configuration) for configuration options.
|
||||
|
||||
## Logging
|
||||
|
||||
See the
|
||||
[debugging section of the developer guide](../../docs/Developer-Guide.md#troubleshoot-kata-containers).
|
||||
See [Developer Guide - Troubleshooting](../../docs/Developer-Guide.md#troubleshoot-kata-containers).
|
||||
|
||||
## Debugging
|
||||
|
||||
See the
|
||||
[debugging section of the developer guide](../../docs/Developer-Guide.md#troubleshoot-kata-containers).
|
||||
|
||||
An [experimental alternative binary](crates/shim-ctl/README.md) is available that removes containerd dependencies and makes it easier to run the shim proper outside of the runtime's usual deployment environment (i.e. on a developer machine).
|
||||
For development, use [`shim-ctl`](crates/shim-ctl/README.md) to test shim without containerd dependencies.
|
||||
|
||||
## Limitations
|
||||
|
||||
For Kata Containers limitations, see the
|
||||
[limitations file](../../docs/Limitations.md)
|
||||
for further details.
|
||||
|
||||
`runtime-rs` is under heavy developments, and doesn't support all features as the Golang version [`runtime`](../runtime), check the [roadmap](../../docs/design/architecture_3.0/README.md#roadmap) for details.
|
||||
See [Limitations](../../docs/Limitations.md) for details.
|
||||
|
||||
@@ -1,23 +1,21 @@
|
||||
# Multi-vmm support for runtime-rs
|
||||
# Multi-VMM Support for runtime-rs
|
||||
|
||||
## 0. Status
|
||||
|
||||
External hypervisor support is currently being developed.
|
||||
Multiple external hypervisors are supported in the Rust runtime, including QEMU, Firecracker, and Cloud Hypervisor. This document outlines the key implementation details for multi-VMM support in the Rust runtime.
|
||||
|
||||
See [the main tracking issue](https://github.com/kata-containers/kata-containers/issues/4634)
|
||||
for further details.
|
||||
## 1. Hypervisor Configuration
|
||||
|
||||
Some key points for supporting multi-vmm in rust runtime.
|
||||
## 1. Hypervisor Config
|
||||
|
||||
The diagram below gives an overview for the hypervisor config
|
||||
The diagram below provides an overview of the hypervisor configuration:
|
||||
|
||||

|
||||
|
||||
VMM's config info will be loaded when initialize the runtime instance, there are some important functions need to be focused on.
|
||||
VMM configuration information is loaded during runtime instance initialization. The following key functions are critical to this process:
|
||||
|
||||
### `VirtContainer::init()`
|
||||
|
||||
This function initialize the runtime handler. It will register the plugins into the HYPERVISOR_PLUGINS. Different plugins are needed for different hypervisors.
|
||||
This function initializes the runtime handler and registers plugins into the `HYPERVISOR_PLUGINS` registry. Different hypervisors require different plugins:
|
||||
|
||||
```rust
|
||||
#[async_trait]
|
||||
impl RuntimeHandler for VirtContainer {
|
||||
@@ -30,21 +28,24 @@ impl RuntimeHandler for VirtContainer {
|
||||
}
|
||||
```
|
||||
|
||||
[This is the plugin method for QEMU. Other VMM plugin methods haven't support currently.](../../../libs/kata-types/src/config/hypervisor/qemu.rs)
|
||||
QEMU plugin defines the methods to adjust and validate the hypervisor config file, those methods could be modified if it is needed.
|
||||
Currently, the QEMU plugin is fully implemented, we can take it as an example. The QEMU plugin defines methods to adjust and validate the hypervisor configuration file. These methods can be customized as needed.
|
||||
|
||||
Details of the QEMU plugin implementation can be found in [QEMU Plugin Implementation](../../../libs/kata-types/src/config/hypervisor/qemu.rs)
|
||||
|
||||
When loading the TOML configuration, the registered plugins are invoked to adjust and validate the configuration file:
|
||||
|
||||
After that, when loading the TOML config, the plugins will be called to adjust and validate the config file.
|
||||
```rust
|
||||
async fn try_init(&mut self, spec: &oci::Spec) -> Result<()> {、
|
||||
async fn try_init(&mut self, spec: &oci::Spec) -> Result<()> {
|
||||
...
|
||||
let config = load_config(spec).context("load config")?;
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
### new_instance
|
||||
### `new_instance`
|
||||
|
||||
This function creates a runtime instance that manages container and sandbox operations. During this process, a hypervisor instance is created. For QEMU, the hypervisor instance is instantiated and configured with the appropriate configuration file:
|
||||
|
||||
This function will create a runtime_instance which include the operations for container and sandbox. At the same time, a hypervisor instance will be created. QEMU instance will be created here as well, and set the hypervisor config file
|
||||
```rust
|
||||
async fn new_hypervisor(toml_config: &TomlConfig) -> Result<Arc<dyn Hypervisor>> {
|
||||
let hypervisor_name = &toml_config.runtime.hypervisor_name;
|
||||
@@ -70,7 +71,8 @@ async fn new_hypervisor(toml_config: &TomlConfig) -> Result<Arc<dyn Hypervisor>>
|
||||
|
||||
## 2. Hypervisor Trait
|
||||
|
||||
[To support multi-vmm, the hypervisor trait need to be implemented.](./src/lib.rs)
|
||||
[The hypervisor trait must be implemented to support multi-VMM architectures.](./src/lib.rs)
|
||||
|
||||
```rust
|
||||
pub trait Hypervisor: Send + Sync {
|
||||
// vm manager
|
||||
@@ -97,6 +99,7 @@ pub trait Hypervisor: Send + Sync {
|
||||
async fn save_state(&self) -> Result<HypervisorState>;
|
||||
}
|
||||
```
|
||||
In current design, VM will be started in the following steps.
|
||||
|
||||
In the current design, the VM startup process follows these steps:
|
||||
|
||||

|
||||
|
||||
@@ -229,6 +229,12 @@ disable_image_nvdimm = @DEFDISABLEIMAGENVDIMM_CLH@
|
||||
# The default setting is "no-port"
|
||||
hot_plug_vfio = "no-port"
|
||||
|
||||
# In a confidential compute environment hot-plugging can compromise
|
||||
# security.
|
||||
# Enable cold-plugging of VFIO devices to a root-port.
|
||||
# The default setting is "no-port", which means disabled.
|
||||
cold_plug_vfio = "no-port"
|
||||
|
||||
# Path to OCI hook binaries in the *guest rootfs*.
|
||||
# This does not affect host-side hooks which must instead be added to
|
||||
# the OCI spec passed to the runtime.
|
||||
|
||||
@@ -235,6 +235,16 @@ block_device_cache_direct = false
|
||||
# Default false
|
||||
block_device_cache_noflush = false
|
||||
|
||||
# Specifies the logical sector size, in bytes, reported by block devices to the guest.
|
||||
# Common values are 512 and 4096. Set to 0 to use the QEMU/hypervisor default.
|
||||
# Default 0
|
||||
block_device_logical_sector_size = 0
|
||||
|
||||
# Specifies the physical sector size, in bytes, reported by block devices to the guest.
|
||||
# Common values are 512 and 4096. Set to 0 to use the QEMU/hypervisor default.
|
||||
# Default 0
|
||||
block_device_physical_sector_size = 0
|
||||
|
||||
# Enable iothreads (data-plane) to be used. This causes IO to be
|
||||
# handled in a separate IO thread. This is currently only implemented
|
||||
# for SCSI.
|
||||
|
||||
@@ -247,6 +247,16 @@ block_device_cache_direct = false
|
||||
# Default false
|
||||
block_device_cache_noflush = false
|
||||
|
||||
# Specifies the logical sector size, in bytes, reported by block devices to the guest.
|
||||
# Common values are 512 and 4096. Set to 0 to use the QEMU/hypervisor default.
|
||||
# Default 0
|
||||
block_device_logical_sector_size = 0
|
||||
|
||||
# Specifies the physical sector size, in bytes, reported by block devices to the guest.
|
||||
# Common values are 512 and 4096. Set to 0 to use the QEMU/hypervisor default.
|
||||
# Default 0
|
||||
block_device_physical_sector_size = 0
|
||||
|
||||
# Enable iothreads (data-plane) to be used. This causes IO to be
|
||||
# handled in a separate IO thread. This is currently implemented
|
||||
# for virtio-scsi and virtio-blk.
|
||||
|
||||
@@ -287,6 +287,16 @@ block_device_cache_direct = false
|
||||
# Default false
|
||||
block_device_cache_noflush = false
|
||||
|
||||
# Specifies the logical sector size, in bytes, reported by block devices to the guest.
|
||||
# Common values are 512 and 4096. Set to 0 to use the QEMU/hypervisor default.
|
||||
# Default 0
|
||||
block_device_logical_sector_size = 0
|
||||
|
||||
# Specifies the physical sector size, in bytes, reported by block devices to the guest.
|
||||
# Common values are 512 and 4096. Set to 0 to use the QEMU/hypervisor default.
|
||||
# Default 0
|
||||
block_device_physical_sector_size = 0
|
||||
|
||||
# Enable iothreads (data-plane) to be used. This causes IO to be
|
||||
# handled in a separate IO thread. This is currently implemented
|
||||
# for virtio-scsi and virtio-blk.
|
||||
@@ -727,7 +737,7 @@ disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
|
||||
# - block-encrypted
|
||||
# Plugs a block device to be encrypted in the guest.
|
||||
#
|
||||
emptydir_mode = "@DEFEMPTYDIRMODE@"
|
||||
emptydir_mode = "@DEFEMPTYDIRMODE_COCO@"
|
||||
|
||||
# Enabled experimental feature list, format: ["a", "b"].
|
||||
# Experimental features are features not stable enough for production,
|
||||
|
||||
@@ -264,6 +264,16 @@ block_device_cache_direct = false
|
||||
# Default false
|
||||
block_device_cache_noflush = false
|
||||
|
||||
# Specifies the logical sector size, in bytes, reported by block devices to the guest.
|
||||
# Common values are 512 and 4096. Set to 0 to use the QEMU/hypervisor default.
|
||||
# Default 0
|
||||
block_device_logical_sector_size = 0
|
||||
|
||||
# Specifies the physical sector size, in bytes, reported by block devices to the guest.
|
||||
# Common values are 512 and 4096. Set to 0 to use the QEMU/hypervisor default.
|
||||
# Default 0
|
||||
block_device_physical_sector_size = 0
|
||||
|
||||
# Enable iothreads (data-plane) to be used. This causes IO to be
|
||||
# handled in a separate IO thread. This is currently implemented
|
||||
# for virtio-scsi and virtio-blk.
|
||||
@@ -704,7 +714,7 @@ disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
|
||||
# - block-encrypted
|
||||
# Plugs a block device to be encrypted in the guest.
|
||||
#
|
||||
emptydir_mode = "@DEFEMPTYDIRMODE@"
|
||||
emptydir_mode = "@DEFEMPTYDIRMODE_COCO@"
|
||||
|
||||
# Enabled experimental feature list, format: ["a", "b"].
|
||||
# Experimental features are features not stable enough for production,
|
||||
|
||||
@@ -246,6 +246,16 @@ block_device_cache_direct = false
|
||||
# Default false
|
||||
block_device_cache_noflush = false
|
||||
|
||||
# Specifies the logical sector size, in bytes, reported by block devices to the guest.
|
||||
# Common values are 512 and 4096. Set to 0 to use the QEMU/hypervisor default.
|
||||
# Default 0
|
||||
block_device_logical_sector_size = 0
|
||||
|
||||
# Specifies the physical sector size, in bytes, reported by block devices to the guest.
|
||||
# Common values are 512 and 4096. Set to 0 to use the QEMU/hypervisor default.
|
||||
# Default 0
|
||||
block_device_physical_sector_size = 0
|
||||
|
||||
# Enable iothreads (data-plane) to be used. This causes IO to be
|
||||
# handled in a separate IO thread. This is currently implemented
|
||||
# for virtio-scsi and virtio-blk.
|
||||
|
||||
@@ -249,6 +249,16 @@ block_device_cache_direct = false
|
||||
# Default false
|
||||
block_device_cache_noflush = false
|
||||
|
||||
# Specifies the logical sector size, in bytes, reported by block devices to the guest.
|
||||
# Common values are 512 and 4096. Set to 0 to use the QEMU/hypervisor default.
|
||||
# Default 0
|
||||
block_device_logical_sector_size = 0
|
||||
|
||||
# Specifies the physical sector size, in bytes, reported by block devices to the guest.
|
||||
# Common values are 512 and 4096. Set to 0 to use the QEMU/hypervisor default.
|
||||
# Default 0
|
||||
block_device_physical_sector_size = 0
|
||||
|
||||
# Enable iothreads (data-plane) to be used. This causes IO to be
|
||||
# handled in a separate IO thread. This is currently implemented
|
||||
# for virtio-scsi and virtio-blk.
|
||||
@@ -689,7 +699,7 @@ disable_guest_empty_dir = @DEFDISABLEGUESTEMPTYDIR@
|
||||
# - block-encrypted
|
||||
# Plugs a block device to be encrypted in the guest.
|
||||
#
|
||||
emptydir_mode = "@DEFEMPTYDIRMODE@"
|
||||
emptydir_mode = "@DEFEMPTYDIRMODE_COCO@"
|
||||
|
||||
# Enabled experimental feature list, format: ["a", "b"].
|
||||
# Experimental features are features not stable enough for production,
|
||||
|
||||
@@ -281,6 +281,16 @@ block_device_cache_direct = false
|
||||
# Default false
|
||||
block_device_cache_noflush = false
|
||||
|
||||
# Specifies the logical sector size, in bytes, reported by block devices to the guest.
|
||||
# Common values are 512 and 4096. Set to 0 to use the QEMU/hypervisor default.
|
||||
# Default 0
|
||||
block_device_logical_sector_size = 0
|
||||
|
||||
# Specifies the physical sector size, in bytes, reported by block devices to the guest.
|
||||
# Common values are 512 and 4096. Set to 0 to use the QEMU/hypervisor default.
|
||||
# Default 0
|
||||
block_device_physical_sector_size = 0
|
||||
|
||||
# Enable iothreads (data-plane) to be used. This causes IO to be
|
||||
# handled in a separate IO thread. This is currently implemented
|
||||
# for virtio-scsi and virtio-blk.
|
||||
|
||||
@@ -263,6 +263,16 @@ block_device_cache_direct = false
|
||||
# Default false
|
||||
block_device_cache_noflush = false
|
||||
|
||||
# Specifies the logical sector size, in bytes, reported by block devices to the guest.
|
||||
# Common values are 512 and 4096. Set to 0 to use the QEMU/hypervisor default.
|
||||
# Default 0
|
||||
block_device_logical_sector_size = 0
|
||||
|
||||
# Specifies the physical sector size, in bytes, reported by block devices to the guest.
|
||||
# Common values are 512 and 4096. Set to 0 to use the QEMU/hypervisor default.
|
||||
# Default 0
|
||||
block_device_physical_sector_size = 0
|
||||
|
||||
# Enable iothreads (data-plane) to be used. This causes IO to be
|
||||
# handled in a separate IO thread. This is currently implemented
|
||||
# for virtio-scsi and virtio-blk.
|
||||
|
||||
@@ -241,6 +241,16 @@ block_device_cache_direct = false
|
||||
# Default false
|
||||
block_device_cache_noflush = false
|
||||
|
||||
# Specifies the logical sector size, in bytes, reported by block devices to the guest.
|
||||
# Common values are 512 and 4096. Set to 0 to use the QEMU/hypervisor default.
|
||||
# Default 0
|
||||
block_device_logical_sector_size = 0
|
||||
|
||||
# Specifies the physical sector size, in bytes, reported by block devices to the guest.
|
||||
# Common values are 512 and 4096. Set to 0 to use the QEMU/hypervisor default.
|
||||
# Default 0
|
||||
block_device_physical_sector_size = 0
|
||||
|
||||
# Enable iothreads (data-plane) to be used. This causes IO to be
|
||||
# handled in a separate IO thread. This is currently implemented
|
||||
# for virtio-scsi and virtio-blk.
|
||||
|
||||
@@ -181,6 +181,16 @@ block_device_cache_direct = false
|
||||
# Default false
|
||||
block_device_cache_noflush = false
|
||||
|
||||
# Specifies the logical sector size, in bytes, reported by block devices to the guest.
|
||||
# Common values are 512 and 4096. Set to 0 to use the QEMU/hypervisor default.
|
||||
# Default 0
|
||||
block_device_logical_sector_size = 0
|
||||
|
||||
# Specifies the physical sector size, in bytes, reported by block devices to the guest.
|
||||
# Common values are 512 and 4096. Set to 0 to use the QEMU/hypervisor default.
|
||||
# Default 0
|
||||
block_device_physical_sector_size = 0
|
||||
|
||||
# Enable huge pages for VM RAM, default false
|
||||
# Enabling this will result in the VM memory
|
||||
# being allocated using huge pages.
|
||||
|
||||
@@ -859,8 +859,10 @@ func (q *QMP) ExecuteBlockdevAddWithDriverCache(ctx context.Context, driver stri
|
||||
// shared denotes if the drive can be shared allowing it to be passed more than once.
|
||||
// disableModern indicates if virtio version 1.0 should be replaced by the
|
||||
// former version 0.9, as there is a KVM bug that occurs when using virtio
|
||||
// 1.0 in nested environments.
|
||||
func (q *QMP) ExecuteDeviceAdd(ctx context.Context, blockdevID, devID, driver, bus, romfile string, shared, disableModern bool) error {
|
||||
// 1.0 in nested environments. logicalBlockSize and physicalBlockSize specify
|
||||
// the logical and physical block sizes for the device; if either is 0, the
|
||||
// hypervisor default is used for that size.
|
||||
func (q *QMP) ExecuteDeviceAdd(ctx context.Context, blockdevID, devID, driver, bus, romfile string, shared, disableModern bool, logicalBlockSize, physicalBlockSize uint32) error {
|
||||
args := map[string]interface{}{
|
||||
"id": devID,
|
||||
"driver": driver,
|
||||
@@ -886,6 +888,14 @@ func (q *QMP) ExecuteDeviceAdd(ctx context.Context, blockdevID, devID, driver, b
|
||||
}
|
||||
}
|
||||
|
||||
if logicalBlockSize > 0 {
|
||||
args["logical_block_size"] = logicalBlockSize
|
||||
}
|
||||
|
||||
if physicalBlockSize > 0 {
|
||||
args["physical_block_size"] = physicalBlockSize
|
||||
}
|
||||
|
||||
return q.executeCommand(ctx, "device_add", args, nil)
|
||||
}
|
||||
|
||||
@@ -1108,8 +1118,9 @@ func (q *QMP) ExecuteDeviceDel(ctx context.Context, devID string) error {
|
||||
// a block device. shared denotes if the drive can be shared allowing it to be passed more than once.
|
||||
// disableModern indicates if virtio version 1.0 should be replaced by the
|
||||
// former version 0.9, as there is a KVM bug that occurs when using virtio
|
||||
// 1.0 in nested environments.
|
||||
func (q *QMP) ExecutePCIDeviceAdd(ctx context.Context, blockdevID, devID, driver, addr, bus, romfile string, queues int, shared, disableModern bool, iothreadID string) error {
|
||||
// 1.0 in nested environments. logicalBlockSize and physicalBlockSize specify the logical and
|
||||
// physical sector sizes reported to the guest; set to 0 to use the hypervisor default.
|
||||
func (q *QMP) ExecutePCIDeviceAdd(ctx context.Context, blockdevID, devID, driver, addr, bus, romfile string, queues int, shared, disableModern bool, iothreadID string, logicalBlockSize, physicalBlockSize uint32) error {
|
||||
args := map[string]interface{}{
|
||||
"id": devID,
|
||||
"driver": driver,
|
||||
@@ -1140,6 +1151,14 @@ func (q *QMP) ExecutePCIDeviceAdd(ctx context.Context, blockdevID, devID, driver
|
||||
args["iothread"] = iothreadID
|
||||
}
|
||||
|
||||
if logicalBlockSize > 0 {
|
||||
args["logical_block_size"] = logicalBlockSize
|
||||
}
|
||||
|
||||
if physicalBlockSize > 0 {
|
||||
args["physical_block_size"] = physicalBlockSize
|
||||
}
|
||||
|
||||
return q.executeCommand(ctx, "device_add", args, nil)
|
||||
}
|
||||
|
||||
|
||||
@@ -208,6 +208,31 @@ func (b *qmpTestCommandBuffer) Write(p []byte) (int, error) {
|
||||
b.cmds[currentCmd].name, gotCmdName)
|
||||
result = "error"
|
||||
}
|
||||
|
||||
// When expected args are provided, verify that each expected key/value
|
||||
// is present in the actual QMP arguments. Existing tests pass nil args
|
||||
// and are unaffected by this check.
|
||||
if expectedArgs := b.cmds[currentCmd].args; expectedArgs != nil {
|
||||
gotArgs, _ := cmdJSON["arguments"].(map[string]interface{})
|
||||
for k, v := range expectedArgs {
|
||||
got, ok := gotArgs[k]
|
||||
if !ok {
|
||||
b.t.Errorf("Command %s: missing expected argument %q", gotCmdName, k)
|
||||
continue
|
||||
}
|
||||
// JSON numbers decode as float64
|
||||
expectedFloat, expectedIsFloat := toFloat64(v)
|
||||
gotFloat, gotIsFloat := toFloat64(got)
|
||||
if expectedIsFloat && gotIsFloat {
|
||||
if expectedFloat != gotFloat {
|
||||
b.t.Errorf("Command %s: argument %q = %v, want %v", gotCmdName, k, got, v)
|
||||
}
|
||||
} else if fmt.Sprintf("%v", got) != fmt.Sprintf("%v", v) {
|
||||
b.t.Errorf("Command %s: argument %q = %v, want %v", gotCmdName, k, got, v)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
resultMap := make(map[string]interface{})
|
||||
resultMap[result] = b.results[currentCmd].data
|
||||
encodedRes, err := json.Marshal(&resultMap)
|
||||
@@ -219,6 +244,26 @@ func (b *qmpTestCommandBuffer) Write(p []byte) (int, error) {
|
||||
return len(p), nil
|
||||
}
|
||||
|
||||
// toFloat64 attempts to convert a numeric value to float64 for comparison.
|
||||
// JSON unmarshalling decodes all numbers as float64, while Go code may pass
|
||||
// int, uint32, etc. This helper normalises both sides for comparison.
|
||||
func toFloat64(v interface{}) (float64, bool) {
|
||||
switch n := v.(type) {
|
||||
case float64:
|
||||
return n, true
|
||||
case int:
|
||||
return float64(n), true
|
||||
case int64:
|
||||
return float64(n), true
|
||||
case uint32:
|
||||
return float64(n), true
|
||||
case uint64:
|
||||
return float64(n), true
|
||||
default:
|
||||
return 0, false
|
||||
}
|
||||
}
|
||||
|
||||
func checkVersion(t *testing.T, connectedCh <-chan *QMPVersion) *QMPVersion {
|
||||
var version *QMPVersion
|
||||
select {
|
||||
@@ -605,7 +650,7 @@ func TestQMPDeviceAdd(t *testing.T) {
|
||||
blockdevID := fmt.Sprintf("drive_%s", volumeUUID)
|
||||
devID := fmt.Sprintf("device_%s", volumeUUID)
|
||||
err := q.ExecuteDeviceAdd(context.Background(), blockdevID, devID,
|
||||
"virtio-blk-pci", "", "", true, false)
|
||||
"virtio-blk-pci", "", "", true, false, 0, 0)
|
||||
if err != nil {
|
||||
t.Fatalf("Unexpected error %v", err)
|
||||
}
|
||||
@@ -1070,7 +1115,31 @@ func TestQMPPCIDeviceAdd(t *testing.T) {
|
||||
blockdevID := fmt.Sprintf("drive_%s", volumeUUID)
|
||||
devID := fmt.Sprintf("device_%s", volumeUUID)
|
||||
err := q.ExecutePCIDeviceAdd(context.Background(), blockdevID, devID,
|
||||
"virtio-blk-pci", "0x1", "", "", 1, true, false, "")
|
||||
"virtio-blk-pci", "0x1", "", "", 1, true, false, "", 0, 0)
|
||||
if err != nil {
|
||||
t.Fatalf("Unexpected error %v", err)
|
||||
}
|
||||
q.Shutdown()
|
||||
<-disconnectedCh
|
||||
}
|
||||
|
||||
// Checks that PCI block devices with explicit logical and physical block sizes are
|
||||
// correctly added using device_add, and that the sizes appear in the QMP arguments.
|
||||
func TestQMPPCIDeviceAddWithBlockSize(t *testing.T) {
|
||||
connectedCh := make(chan *QMPVersion)
|
||||
disconnectedCh := make(chan struct{})
|
||||
buf := newQMPTestCommandBuffer(t)
|
||||
buf.AddCommand("device_add", map[string]interface{}{
|
||||
"logical_block_size": uint32(512),
|
||||
"physical_block_size": uint32(4096),
|
||||
}, "return", nil)
|
||||
cfg := QMPConfig{Logger: qmpTestLogger{}}
|
||||
q := startQMPLoop(buf, cfg, connectedCh, disconnectedCh)
|
||||
q.version = checkVersion(t, connectedCh)
|
||||
blockdevID := fmt.Sprintf("drive_%s", volumeUUID)
|
||||
devID := fmt.Sprintf("device_%s", volumeUUID)
|
||||
err := q.ExecutePCIDeviceAdd(context.Background(), blockdevID, devID,
|
||||
"virtio-blk-pci", "0x1", "", "", 1, true, false, "", 512, 4096)
|
||||
if err != nil {
|
||||
t.Fatalf("Unexpected error %v", err)
|
||||
}
|
||||
|
||||
@@ -146,6 +146,8 @@ type hypervisor struct {
|
||||
BlockDeviceCacheSet bool `toml:"block_device_cache_set"`
|
||||
BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"`
|
||||
BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"`
|
||||
BlockDeviceLogicalSectorSize uint32 `toml:"block_device_logical_sector_size"`
|
||||
BlockDevicePhysicalSectorSize uint32 `toml:"block_device_physical_sector_size"`
|
||||
EnableVhostUserStore bool `toml:"enable_vhost_user_store"`
|
||||
VhostUserDeviceReconnect uint32 `toml:"vhost_user_reconnect_timeout_sec"`
|
||||
DisableBlockDeviceUse bool `toml:"disable_block_device_use"`
|
||||
@@ -593,6 +595,20 @@ func (h hypervisor) blockDeviceDriver() (string, error) {
|
||||
return "", fmt.Errorf("Invalid hypervisor block storage driver %v specified (supported drivers: %v)", h.BlockDeviceDriver, supportedBlockDrivers)
|
||||
}
|
||||
|
||||
func (h hypervisor) blockDeviceLogicalSectorSize() (uint32, error) {
|
||||
if err := validateBlockDeviceSectorSize(cfgBlockDeviceLogicalSectorSize, h.BlockDeviceLogicalSectorSize); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return h.BlockDeviceLogicalSectorSize, nil
|
||||
}
|
||||
|
||||
func (h hypervisor) blockDevicePhysicalSectorSize() (uint32, error) {
|
||||
if err := validateBlockDeviceSectorSize(cfgBlockDevicePhysicalSectorSize, h.BlockDevicePhysicalSectorSize); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return h.BlockDevicePhysicalSectorSize, nil
|
||||
}
|
||||
|
||||
func (h hypervisor) blockDeviceAIO() (string, error) {
|
||||
supportedBlockAIO := []string{config.AIOIOUring, config.AIONative, config.AIOThreads}
|
||||
|
||||
@@ -877,6 +893,28 @@ func newFirecrackerHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
|
||||
}, nil
|
||||
}
|
||||
|
||||
const (
|
||||
cfgBlockDeviceLogicalSectorSize = "block_device_logical_sector_size"
|
||||
cfgBlockDevicePhysicalSectorSize = "block_device_physical_sector_size"
|
||||
)
|
||||
|
||||
func validateBlockDeviceSectorSize(name string, size uint32) error {
|
||||
if size == 0 {
|
||||
return nil
|
||||
}
|
||||
if size < 512 || size > 65536 || (size&(size-1)) != 0 {
|
||||
return fmt.Errorf("invalid %s %d: must be 0 or a power of 2 between 512 and 65536", name, size)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func validateBlockDeviceSectorSizes(logical, physical uint32) error {
|
||||
if logical != 0 && physical != 0 && logical > physical {
|
||||
return fmt.Errorf("invalid sector sizes: logical (%d) must not be larger than physical (%d)", logical, physical)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
|
||||
hypervisor, err := h.path()
|
||||
if err != nil {
|
||||
@@ -973,88 +1011,104 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
|
||||
return vc.HypervisorConfig{}, err
|
||||
}
|
||||
|
||||
blockLogicalSectorSize, err := h.blockDeviceLogicalSectorSize()
|
||||
if err != nil {
|
||||
return vc.HypervisorConfig{}, err
|
||||
}
|
||||
|
||||
blockPhysicalSectorSize, err := h.blockDevicePhysicalSectorSize()
|
||||
if err != nil {
|
||||
return vc.HypervisorConfig{}, err
|
||||
}
|
||||
|
||||
if err := validateBlockDeviceSectorSizes(blockLogicalSectorSize, blockPhysicalSectorSize); err != nil {
|
||||
return vc.HypervisorConfig{}, err
|
||||
}
|
||||
|
||||
return vc.HypervisorConfig{
|
||||
HypervisorPath: hypervisor,
|
||||
HypervisorPathList: h.HypervisorPathList,
|
||||
KernelPath: kernel,
|
||||
InitrdPath: initrd,
|
||||
ImagePath: image,
|
||||
RootfsType: rootfsType,
|
||||
FirmwarePath: firmware,
|
||||
FirmwareVolumePath: firmwareVolume,
|
||||
PFlash: pflashes,
|
||||
MachineAccelerators: machineAccelerators,
|
||||
CPUFeatures: cpuFeatures,
|
||||
KernelParams: vc.DeserializeParams(vc.KernelParamFields(kernelParams)),
|
||||
KernelVerityParams: h.kernelVerityParams(),
|
||||
HypervisorMachineType: machineType,
|
||||
QgsPort: h.qgsPort(),
|
||||
NumVCPUsF: h.defaultVCPUs(),
|
||||
DefaultMaxVCPUs: h.defaultMaxVCPUs(),
|
||||
MemorySize: h.defaultMemSz(),
|
||||
MemSlots: h.defaultMemSlots(),
|
||||
MemOffset: h.defaultMemOffset(),
|
||||
DefaultMaxMemorySize: h.defaultMaxMemSz(),
|
||||
VirtioMem: h.VirtioMem,
|
||||
EntropySource: h.GetEntropySource(),
|
||||
EntropySourceList: h.EntropySourceList,
|
||||
DefaultBridges: h.defaultBridges(),
|
||||
DisableBlockDeviceUse: h.DisableBlockDeviceUse,
|
||||
SharedFS: sharedFS,
|
||||
VirtioFSDaemon: h.VirtioFSDaemon,
|
||||
VirtioFSDaemonList: h.VirtioFSDaemonList,
|
||||
HypervisorLoglevel: h.defaultHypervisorLoglevel(),
|
||||
VirtioFSCacheSize: h.VirtioFSCacheSize,
|
||||
VirtioFSCache: h.defaultVirtioFSCache(),
|
||||
VirtioFSQueueSize: h.VirtioFSQueueSize,
|
||||
VirtioFSExtraArgs: h.VirtioFSExtraArgs,
|
||||
MemPrealloc: h.MemPrealloc,
|
||||
ReclaimGuestFreedMemory: h.ReclaimGuestFreedMemory,
|
||||
HugePages: h.HugePages,
|
||||
IOMMU: h.IOMMU,
|
||||
IOMMUPlatform: h.getIOMMUPlatform(),
|
||||
GuestNUMANodes: h.defaultGuestNUMANodes(),
|
||||
FileBackedMemRootDir: h.FileBackedMemRootDir,
|
||||
FileBackedMemRootList: h.FileBackedMemRootList,
|
||||
Debug: h.Debug,
|
||||
DisableNestingChecks: h.DisableNestingChecks,
|
||||
BlockDeviceDriver: blockDriver,
|
||||
BlockDeviceAIO: blockAIO,
|
||||
BlockDeviceCacheSet: h.BlockDeviceCacheSet,
|
||||
BlockDeviceCacheDirect: h.BlockDeviceCacheDirect,
|
||||
BlockDeviceCacheNoflush: h.BlockDeviceCacheNoflush,
|
||||
EnableIOThreads: h.EnableIOThreads,
|
||||
IndepIOThreads: h.indepiothreads(),
|
||||
Msize9p: h.msize9p(),
|
||||
DisableImageNvdimm: h.DisableImageNvdimm,
|
||||
HotPlugVFIO: h.hotPlugVFIO(),
|
||||
ColdPlugVFIO: h.coldPlugVFIO(),
|
||||
PCIeRootPort: h.pcieRootPort(),
|
||||
PCIeSwitchPort: h.pcieSwitchPort(),
|
||||
DisableVhostNet: h.DisableVhostNet,
|
||||
EnableVhostUserStore: h.EnableVhostUserStore,
|
||||
VhostUserStorePath: h.vhostUserStorePath(),
|
||||
VhostUserStorePathList: h.VhostUserStorePathList,
|
||||
VhostUserDeviceReconnect: h.VhostUserDeviceReconnect,
|
||||
SeccompSandbox: h.SeccompSandbox,
|
||||
GuestHookPath: h.guestHookPath(),
|
||||
RxRateLimiterMaxRate: rxRateLimiterMaxRate,
|
||||
TxRateLimiterMaxRate: txRateLimiterMaxRate,
|
||||
EnableAnnotations: h.EnableAnnotations,
|
||||
GuestMemoryDumpPath: h.GuestMemoryDumpPath,
|
||||
GuestMemoryDumpPaging: h.GuestMemoryDumpPaging,
|
||||
ConfidentialGuest: h.ConfidentialGuest,
|
||||
SevSnpGuest: h.SevSnpGuest,
|
||||
GuestSwap: h.GuestSwap,
|
||||
Rootless: h.Rootless,
|
||||
LegacySerial: h.LegacySerial,
|
||||
DisableSeLinux: h.DisableSeLinux,
|
||||
DisableGuestSeLinux: h.DisableGuestSeLinux,
|
||||
ExtraMonitorSocket: extraMonitorSocket,
|
||||
SnpIdBlock: h.SnpIdBlock,
|
||||
SnpIdAuth: h.SnpIdAuth,
|
||||
SnpGuestPolicy: h.SnpGuestPolicy,
|
||||
MeasurementAlgo: h.GetMeasurementAlgo(),
|
||||
HypervisorPath: hypervisor,
|
||||
HypervisorPathList: h.HypervisorPathList,
|
||||
KernelPath: kernel,
|
||||
InitrdPath: initrd,
|
||||
ImagePath: image,
|
||||
RootfsType: rootfsType,
|
||||
FirmwarePath: firmware,
|
||||
FirmwareVolumePath: firmwareVolume,
|
||||
PFlash: pflashes,
|
||||
MachineAccelerators: machineAccelerators,
|
||||
CPUFeatures: cpuFeatures,
|
||||
KernelParams: vc.DeserializeParams(vc.KernelParamFields(kernelParams)),
|
||||
KernelVerityParams: h.kernelVerityParams(),
|
||||
HypervisorMachineType: machineType,
|
||||
QgsPort: h.qgsPort(),
|
||||
NumVCPUsF: h.defaultVCPUs(),
|
||||
DefaultMaxVCPUs: h.defaultMaxVCPUs(),
|
||||
MemorySize: h.defaultMemSz(),
|
||||
MemSlots: h.defaultMemSlots(),
|
||||
MemOffset: h.defaultMemOffset(),
|
||||
DefaultMaxMemorySize: h.defaultMaxMemSz(),
|
||||
VirtioMem: h.VirtioMem,
|
||||
EntropySource: h.GetEntropySource(),
|
||||
EntropySourceList: h.EntropySourceList,
|
||||
DefaultBridges: h.defaultBridges(),
|
||||
DisableBlockDeviceUse: h.DisableBlockDeviceUse,
|
||||
SharedFS: sharedFS,
|
||||
VirtioFSDaemon: h.VirtioFSDaemon,
|
||||
VirtioFSDaemonList: h.VirtioFSDaemonList,
|
||||
HypervisorLoglevel: h.defaultHypervisorLoglevel(),
|
||||
VirtioFSCacheSize: h.VirtioFSCacheSize,
|
||||
VirtioFSCache: h.defaultVirtioFSCache(),
|
||||
VirtioFSQueueSize: h.VirtioFSQueueSize,
|
||||
VirtioFSExtraArgs: h.VirtioFSExtraArgs,
|
||||
MemPrealloc: h.MemPrealloc,
|
||||
ReclaimGuestFreedMemory: h.ReclaimGuestFreedMemory,
|
||||
HugePages: h.HugePages,
|
||||
IOMMU: h.IOMMU,
|
||||
IOMMUPlatform: h.getIOMMUPlatform(),
|
||||
GuestNUMANodes: h.defaultGuestNUMANodes(),
|
||||
FileBackedMemRootDir: h.FileBackedMemRootDir,
|
||||
FileBackedMemRootList: h.FileBackedMemRootList,
|
||||
Debug: h.Debug,
|
||||
DisableNestingChecks: h.DisableNestingChecks,
|
||||
BlockDeviceDriver: blockDriver,
|
||||
BlockDeviceAIO: blockAIO,
|
||||
BlockDeviceCacheSet: h.BlockDeviceCacheSet,
|
||||
BlockDeviceCacheDirect: h.BlockDeviceCacheDirect,
|
||||
BlockDeviceCacheNoflush: h.BlockDeviceCacheNoflush,
|
||||
BlockDeviceLogicalSectorSize: blockLogicalSectorSize,
|
||||
BlockDevicePhysicalSectorSize: blockPhysicalSectorSize,
|
||||
EnableIOThreads: h.EnableIOThreads,
|
||||
IndepIOThreads: h.indepiothreads(),
|
||||
Msize9p: h.msize9p(),
|
||||
DisableImageNvdimm: h.DisableImageNvdimm,
|
||||
HotPlugVFIO: h.hotPlugVFIO(),
|
||||
ColdPlugVFIO: h.coldPlugVFIO(),
|
||||
PCIeRootPort: h.pcieRootPort(),
|
||||
PCIeSwitchPort: h.pcieSwitchPort(),
|
||||
DisableVhostNet: h.DisableVhostNet,
|
||||
EnableVhostUserStore: h.EnableVhostUserStore,
|
||||
VhostUserStorePath: h.vhostUserStorePath(),
|
||||
VhostUserStorePathList: h.VhostUserStorePathList,
|
||||
VhostUserDeviceReconnect: h.VhostUserDeviceReconnect,
|
||||
SeccompSandbox: h.SeccompSandbox,
|
||||
GuestHookPath: h.guestHookPath(),
|
||||
RxRateLimiterMaxRate: rxRateLimiterMaxRate,
|
||||
TxRateLimiterMaxRate: txRateLimiterMaxRate,
|
||||
EnableAnnotations: h.EnableAnnotations,
|
||||
GuestMemoryDumpPath: h.GuestMemoryDumpPath,
|
||||
GuestMemoryDumpPaging: h.GuestMemoryDumpPaging,
|
||||
ConfidentialGuest: h.ConfidentialGuest,
|
||||
SevSnpGuest: h.SevSnpGuest,
|
||||
GuestSwap: h.GuestSwap,
|
||||
Rootless: h.Rootless,
|
||||
LegacySerial: h.LegacySerial,
|
||||
DisableSeLinux: h.DisableSeLinux,
|
||||
DisableGuestSeLinux: h.DisableGuestSeLinux,
|
||||
ExtraMonitorSocket: extraMonitorSocket,
|
||||
SnpIdBlock: h.SnpIdBlock,
|
||||
SnpIdAuth: h.SnpIdAuth,
|
||||
SnpGuestPolicy: h.SnpGuestPolicy,
|
||||
MeasurementAlgo: h.GetMeasurementAlgo(),
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -1283,42 +1337,58 @@ func newStratovirtHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
|
||||
fmt.Errorf("cannot enable %s without daemon path in configuration file", sharedFS)
|
||||
}
|
||||
|
||||
blockLogicalSectorSize, err := h.blockDeviceLogicalSectorSize()
|
||||
if err != nil {
|
||||
return vc.HypervisorConfig{}, err
|
||||
}
|
||||
|
||||
blockPhysicalSectorSize, err := h.blockDevicePhysicalSectorSize()
|
||||
if err != nil {
|
||||
return vc.HypervisorConfig{}, err
|
||||
}
|
||||
|
||||
if err := validateBlockDeviceSectorSizes(blockLogicalSectorSize, blockPhysicalSectorSize); err != nil {
|
||||
return vc.HypervisorConfig{}, err
|
||||
}
|
||||
|
||||
return vc.HypervisorConfig{
|
||||
HypervisorPath: hypervisor,
|
||||
HypervisorPathList: h.HypervisorPathList,
|
||||
KernelPath: kernel,
|
||||
InitrdPath: initrd,
|
||||
ImagePath: image,
|
||||
RootfsType: rootfsType,
|
||||
KernelParams: vc.DeserializeParams(strings.Fields(kernelParams)),
|
||||
KernelVerityParams: h.kernelVerityParams(),
|
||||
HypervisorMachineType: machineType,
|
||||
NumVCPUsF: h.defaultVCPUs(),
|
||||
DefaultMaxVCPUs: h.defaultMaxVCPUs(),
|
||||
MemorySize: h.defaultMemSz(),
|
||||
MemSlots: h.defaultMemSlots(),
|
||||
MemOffset: h.defaultMemOffset(),
|
||||
DefaultMaxMemorySize: h.defaultMaxMemSz(),
|
||||
EntropySource: h.GetEntropySource(),
|
||||
DefaultBridges: h.defaultBridges(),
|
||||
DisableBlockDeviceUse: h.DisableBlockDeviceUse,
|
||||
SharedFS: sharedFS,
|
||||
VirtioFSDaemon: h.VirtioFSDaemon,
|
||||
VirtioFSDaemonList: h.VirtioFSDaemonList,
|
||||
HypervisorLoglevel: h.defaultHypervisorLoglevel(),
|
||||
VirtioFSCacheSize: h.VirtioFSCacheSize,
|
||||
VirtioFSCache: h.defaultVirtioFSCache(),
|
||||
VirtioFSExtraArgs: h.VirtioFSExtraArgs,
|
||||
HugePages: h.HugePages,
|
||||
Debug: h.Debug,
|
||||
DisableNestingChecks: h.DisableNestingChecks,
|
||||
BlockDeviceDriver: blockDriver,
|
||||
DisableVhostNet: true,
|
||||
GuestHookPath: h.guestHookPath(),
|
||||
EnableAnnotations: h.EnableAnnotations,
|
||||
DisableSeccomp: h.DisableSeccomp,
|
||||
DisableSeLinux: h.DisableSeLinux,
|
||||
DisableGuestSeLinux: h.DisableGuestSeLinux,
|
||||
HypervisorPath: hypervisor,
|
||||
HypervisorPathList: h.HypervisorPathList,
|
||||
KernelPath: kernel,
|
||||
InitrdPath: initrd,
|
||||
ImagePath: image,
|
||||
RootfsType: rootfsType,
|
||||
KernelParams: vc.DeserializeParams(strings.Fields(kernelParams)),
|
||||
KernelVerityParams: h.kernelVerityParams(),
|
||||
HypervisorMachineType: machineType,
|
||||
NumVCPUsF: h.defaultVCPUs(),
|
||||
DefaultMaxVCPUs: h.defaultMaxVCPUs(),
|
||||
MemorySize: h.defaultMemSz(),
|
||||
MemSlots: h.defaultMemSlots(),
|
||||
MemOffset: h.defaultMemOffset(),
|
||||
DefaultMaxMemorySize: h.defaultMaxMemSz(),
|
||||
EntropySource: h.GetEntropySource(),
|
||||
DefaultBridges: h.defaultBridges(),
|
||||
DisableBlockDeviceUse: h.DisableBlockDeviceUse,
|
||||
SharedFS: sharedFS,
|
||||
VirtioFSDaemon: h.VirtioFSDaemon,
|
||||
VirtioFSDaemonList: h.VirtioFSDaemonList,
|
||||
HypervisorLoglevel: h.defaultHypervisorLoglevel(),
|
||||
VirtioFSCacheSize: h.VirtioFSCacheSize,
|
||||
VirtioFSCache: h.defaultVirtioFSCache(),
|
||||
VirtioFSExtraArgs: h.VirtioFSExtraArgs,
|
||||
HugePages: h.HugePages,
|
||||
Debug: h.Debug,
|
||||
DisableNestingChecks: h.DisableNestingChecks,
|
||||
BlockDeviceDriver: blockDriver,
|
||||
BlockDeviceLogicalSectorSize: blockLogicalSectorSize,
|
||||
BlockDevicePhysicalSectorSize: blockPhysicalSectorSize,
|
||||
DisableVhostNet: true,
|
||||
GuestHookPath: h.guestHookPath(),
|
||||
EnableAnnotations: h.EnableAnnotations,
|
||||
DisableSeccomp: h.DisableSeccomp,
|
||||
DisableSeLinux: h.DisableSeLinux,
|
||||
DisableGuestSeLinux: h.DisableGuestSeLinux,
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -1950,11 +2020,11 @@ func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineT
|
||||
return nil
|
||||
}
|
||||
if hypervisorType == vc.ClhHypervisor {
|
||||
if coldPlug != config.NoPort {
|
||||
return fmt.Errorf("cold-plug not supported on CLH")
|
||||
if coldPlug != config.NoPort && coldPlug != config.RootPort {
|
||||
return fmt.Errorf("only cold-plug=%s or %s supported on CLH", config.NoPort, config.RootPort)
|
||||
}
|
||||
if hotPlug != config.RootPort {
|
||||
return fmt.Errorf("only hot-plug=%s supported on CLH", config.RootPort)
|
||||
if hotPlug != config.NoPort && hotPlug != config.RootPort {
|
||||
return fmt.Errorf("only hot-plug=%s or %s supported on CLH", config.NoPort, config.RootPort)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -708,6 +708,41 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
|
||||
|
||||
}
|
||||
|
||||
func TestValidateBlockDeviceSectorSize(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
for _, size := range []uint32{0, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536} {
|
||||
assert.NoError(validateBlockDeviceSectorSize("test_field", size), "expected size %d to be accepted", size)
|
||||
}
|
||||
|
||||
for _, size := range []uint32{3, 100, 1000, 3000, 5000} {
|
||||
assert.Error(validateBlockDeviceSectorSize("test_field", size), "expected non-power-of-2 size %d to be rejected", size)
|
||||
}
|
||||
|
||||
for _, size := range []uint32{1, 256} {
|
||||
assert.Error(validateBlockDeviceSectorSize("test_field", size), "expected below-minimum size %d to be rejected", size)
|
||||
}
|
||||
|
||||
for _, size := range []uint32{131072, 1048576} {
|
||||
assert.Error(validateBlockDeviceSectorSize("test_field", size), "expected above-maximum size %d to be rejected", size)
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateBlockDeviceSectorSizes(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
assert.NoError(validateBlockDeviceSectorSizes(0, 0))
|
||||
assert.NoError(validateBlockDeviceSectorSizes(512, 0))
|
||||
assert.NoError(validateBlockDeviceSectorSizes(0, 4096))
|
||||
assert.NoError(validateBlockDeviceSectorSizes(512, 4096))
|
||||
assert.NoError(validateBlockDeviceSectorSizes(4096, 4096))
|
||||
assert.NoError(validateBlockDeviceSectorSizes(512, 512))
|
||||
|
||||
assert.Error(validateBlockDeviceSectorSizes(4096, 512), "logical > physical should be rejected")
|
||||
assert.Error(validateBlockDeviceSectorSizes(4096, 1024), "logical > physical should be rejected")
|
||||
assert.Error(validateBlockDeviceSectorSizes(65536, 512), "logical > physical should be rejected")
|
||||
}
|
||||
|
||||
func TestNewFirecrackerHypervisorConfig(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
|
||||
@@ -430,9 +430,11 @@ func TestVfioChecksClh(t *testing.T) {
|
||||
}
|
||||
assert.NoError(f(config.NoPort, config.NoPort))
|
||||
assert.NoError(f(config.NoPort, config.RootPort))
|
||||
assert.NoError(f(config.RootPort, config.NoPort))
|
||||
assert.Error(f(config.RootPort, config.RootPort))
|
||||
assert.Error(f(config.RootPort, config.NoPort))
|
||||
assert.Error(f(config.NoPort, config.SwitchPort))
|
||||
assert.Error(f(config.SwitchPort, config.NoPort))
|
||||
assert.Error(f(config.BridgePort, config.NoPort))
|
||||
}
|
||||
|
||||
func TestVfioCheckQemu(t *testing.T) {
|
||||
|
||||
@@ -920,9 +920,39 @@ func addHypervisorBlockOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig)
|
||||
return err
|
||||
}
|
||||
|
||||
return newAnnotationConfiguration(ocispec, vcAnnotations.BlockDeviceCacheNoflush).setBool(func(blockDeviceCacheNoflush bool) {
|
||||
if err := newAnnotationConfiguration(ocispec, vcAnnotations.BlockDeviceCacheNoflush).setBool(func(blockDeviceCacheNoflush bool) {
|
||||
sbConfig.HypervisorConfig.BlockDeviceCacheNoflush = blockDeviceCacheNoflush
|
||||
})
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := newAnnotationConfiguration(ocispec, vcAnnotations.BlockDeviceLogicalSectorSize).setUintWithCheck(func(size uint64) error {
|
||||
if size != 0 && (size < 512 || size > 65536 || (size&(size-1)) != 0) {
|
||||
return fmt.Errorf("invalid %s %d: must be 0 or a power of 2 between 512 and 65536", vcAnnotations.BlockDeviceLogicalSectorSize, size)
|
||||
}
|
||||
sbConfig.HypervisorConfig.BlockDeviceLogicalSectorSize = uint32(size)
|
||||
return nil
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := newAnnotationConfiguration(ocispec, vcAnnotations.BlockDevicePhysicalSectorSize).setUintWithCheck(func(size uint64) error {
|
||||
if size != 0 && (size < 512 || size > 65536 || (size&(size-1)) != 0) {
|
||||
return fmt.Errorf("invalid %s %d: must be 0 or a power of 2 between 512 and 65536", vcAnnotations.BlockDevicePhysicalSectorSize, size)
|
||||
}
|
||||
sbConfig.HypervisorConfig.BlockDevicePhysicalSectorSize = uint32(size)
|
||||
return nil
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
logical := sbConfig.HypervisorConfig.BlockDeviceLogicalSectorSize
|
||||
physical := sbConfig.HypervisorConfig.BlockDevicePhysicalSectorSize
|
||||
if logical != 0 && physical != 0 && logical > physical {
|
||||
return fmt.Errorf("invalid sector sizes: logical (%d) must not be larger than physical (%d)", logical, physical)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func addHypervisorVirtioFsOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig, runtime RuntimeConfig) error {
|
||||
|
||||
@@ -665,6 +665,8 @@ func TestAddHypervisorAnnotations(t *testing.T) {
|
||||
// 10Mbit
|
||||
ocispec.Annotations[vcAnnotations.RxRateLimiterMaxRate] = "10000000"
|
||||
ocispec.Annotations[vcAnnotations.TxRateLimiterMaxRate] = "10000000"
|
||||
ocispec.Annotations[vcAnnotations.BlockDeviceLogicalSectorSize] = "512"
|
||||
ocispec.Annotations[vcAnnotations.BlockDevicePhysicalSectorSize] = "4096"
|
||||
|
||||
err := addAnnotations(ocispec, &sbConfig, runtimeConfig)
|
||||
assert.NoError(err)
|
||||
@@ -706,6 +708,8 @@ func TestAddHypervisorAnnotations(t *testing.T) {
|
||||
assert.Equal(sbConfig.HypervisorConfig.LegacySerial, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.RxRateLimiterMaxRate, uint64(10000000))
|
||||
assert.Equal(sbConfig.HypervisorConfig.TxRateLimiterMaxRate, uint64(10000000))
|
||||
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceLogicalSectorSize, uint32(512))
|
||||
assert.Equal(sbConfig.HypervisorConfig.BlockDevicePhysicalSectorSize, uint32(4096))
|
||||
|
||||
// In case an absurd large value is provided, the config value if not over-ridden
|
||||
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "655536"
|
||||
@@ -726,6 +730,80 @@ func TestAddHypervisorAnnotations(t *testing.T) {
|
||||
assert.Error(err)
|
||||
}
|
||||
|
||||
func TestBlockDeviceSectorSizeAnnotations(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
runtimeConfig := RuntimeConfig{
|
||||
HypervisorType: vc.QemuHypervisor,
|
||||
}
|
||||
runtimeConfig.HypervisorConfig.EnableAnnotations = []string{".*"}
|
||||
|
||||
newSpec := func() specs.Spec {
|
||||
return specs.Spec{Annotations: make(map[string]string)}
|
||||
}
|
||||
newConfig := func() vc.SandboxConfig {
|
||||
return vc.SandboxConfig{Annotations: make(map[string]string)}
|
||||
}
|
||||
|
||||
// Valid: 0 means "use hypervisor default", no override applied
|
||||
for _, v := range []string{"0", "512", "1024", "2048", "4096", "8192", "16384", "32768", "65536"} {
|
||||
spec := newSpec()
|
||||
cfg := newConfig()
|
||||
spec.Annotations[vcAnnotations.BlockDeviceLogicalSectorSize] = v
|
||||
spec.Annotations[vcAnnotations.BlockDevicePhysicalSectorSize] = v
|
||||
assert.NoError(addAnnotations(spec, &cfg, runtimeConfig), "expected valid size %s to be accepted", v)
|
||||
}
|
||||
|
||||
// Invalid: not a power of 2
|
||||
for _, v := range []string{"3", "100", "1000", "3000", "5000"} {
|
||||
spec := newSpec()
|
||||
cfg := newConfig()
|
||||
spec.Annotations[vcAnnotations.BlockDeviceLogicalSectorSize] = v
|
||||
assert.Error(addAnnotations(spec, &cfg, runtimeConfig), "expected non-power-of-2 size %s to be rejected", v)
|
||||
}
|
||||
|
||||
// Invalid: below minimum (512)
|
||||
for _, v := range []string{"1", "256"} {
|
||||
spec := newSpec()
|
||||
cfg := newConfig()
|
||||
spec.Annotations[vcAnnotations.BlockDeviceLogicalSectorSize] = v
|
||||
assert.Error(addAnnotations(spec, &cfg, runtimeConfig), "expected below-minimum size %s to be rejected", v)
|
||||
}
|
||||
|
||||
// Invalid: above maximum (65536)
|
||||
for _, v := range []string{"131072", "1048576"} {
|
||||
spec := newSpec()
|
||||
cfg := newConfig()
|
||||
spec.Annotations[vcAnnotations.BlockDevicePhysicalSectorSize] = v
|
||||
assert.Error(addAnnotations(spec, &cfg, runtimeConfig), "expected above-maximum size %s to be rejected", v)
|
||||
}
|
||||
|
||||
// Logical 4096 with physical 4096 — both valid
|
||||
spec := newSpec()
|
||||
cfg := newConfig()
|
||||
spec.Annotations[vcAnnotations.BlockDeviceLogicalSectorSize] = "4096"
|
||||
spec.Annotations[vcAnnotations.BlockDevicePhysicalSectorSize] = "4096"
|
||||
assert.NoError(addAnnotations(spec, &cfg, runtimeConfig))
|
||||
assert.Equal(cfg.HypervisorConfig.BlockDeviceLogicalSectorSize, uint32(4096))
|
||||
assert.Equal(cfg.HypervisorConfig.BlockDevicePhysicalSectorSize, uint32(4096))
|
||||
|
||||
// Logical 512 with physical 4096 — both valid
|
||||
spec = newSpec()
|
||||
cfg = newConfig()
|
||||
spec.Annotations[vcAnnotations.BlockDeviceLogicalSectorSize] = "512"
|
||||
spec.Annotations[vcAnnotations.BlockDevicePhysicalSectorSize] = "4096"
|
||||
assert.NoError(addAnnotations(spec, &cfg, runtimeConfig))
|
||||
assert.Equal(cfg.HypervisorConfig.BlockDeviceLogicalSectorSize, uint32(512))
|
||||
assert.Equal(cfg.HypervisorConfig.BlockDevicePhysicalSectorSize, uint32(4096))
|
||||
|
||||
// Invalid: logical > physical
|
||||
spec = newSpec()
|
||||
cfg = newConfig()
|
||||
spec.Annotations[vcAnnotations.BlockDeviceLogicalSectorSize] = "4096"
|
||||
spec.Annotations[vcAnnotations.BlockDevicePhysicalSectorSize] = "512"
|
||||
assert.Error(addAnnotations(spec, &cfg, runtimeConfig), "logical > physical should be rejected")
|
||||
}
|
||||
|
||||
func TestAddRemoteHypervisorAnnotations(t *testing.T) {
|
||||
// Remote hypervisor uses DefaultVCPUs, DefaultMemory etc as annotations to pick the size of the separate VM to create,
|
||||
// so doesn't need to be bound by the host's capacity limits.
|
||||
|
||||
@@ -976,6 +976,44 @@ func (clh *cloudHypervisor) hotplugAddBlockDevice(drive *config.BlockDrive) erro
|
||||
return err
|
||||
}
|
||||
|
||||
// coldPlugVFIODevice appends a VFIO device to the VM configuration so that it
|
||||
// is present when the VM is created (before boot). Cloud Hypervisor's CreateVM
|
||||
// API accepts a list of devices that are attached at VM creation time, which
|
||||
// effectively provides cold-plug semantics — the guest sees the device on its
|
||||
// PCI bus from the very first enumeration.
|
||||
func (clh *cloudHypervisor) coldPlugVFIODevice(device *config.VFIODev) error {
|
||||
switch device.Type {
|
||||
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
|
||||
// Supported PCI VFIO device types for Cloud Hypervisor.
|
||||
default:
|
||||
return fmt.Errorf("VFIO device %+v has unsupported type %v; only PCI VFIO devices are supported in Cloud Hypervisor", device, device.Type)
|
||||
}
|
||||
if strings.TrimSpace(device.SysfsDev) == "" {
|
||||
return fmt.Errorf("VFIO device %q has empty or invalid SysfsDev path", device.ID)
|
||||
}
|
||||
|
||||
clh.Logger().WithFields(log.Fields{
|
||||
"device": device.ID,
|
||||
"sysfs": device.SysfsDev,
|
||||
"bdf": device.BDF,
|
||||
}).Info("Cold-plugging VFIO device into VM config")
|
||||
|
||||
clhDevice := *chclient.NewDeviceConfig(device.SysfsDev)
|
||||
clhDevice.SetIommu(clh.config.IOMMU)
|
||||
clhDevice.SetId(device.ID)
|
||||
|
||||
if clh.vmconfig.Devices != nil {
|
||||
*clh.vmconfig.Devices = append(*clh.vmconfig.Devices, clhDevice)
|
||||
} else {
|
||||
clh.vmconfig.Devices = &[]chclient.DeviceConfig{clhDevice}
|
||||
}
|
||||
|
||||
// Track the device ID so that it can be referenced later (e.g. for removal).
|
||||
clh.devicesIds[device.ID] = device.ID
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error {
|
||||
cl := clh.client()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), clhHotPlugAPITimeout*time.Second)
|
||||
@@ -1342,6 +1380,8 @@ func (clh *cloudHypervisor) AddDevice(ctx context.Context, devInfo interface{},
|
||||
clh.addVSock(defaultGuestVSockCID, v.UdsPath)
|
||||
case types.Volume:
|
||||
err = clh.addVolume(v)
|
||||
case config.VFIODev:
|
||||
err = clh.coldPlugVFIODevice(&v)
|
||||
default:
|
||||
clh.Logger().WithField("function", "AddDevice").Warnf("Add device of type %v is not supported.", v)
|
||||
return fmt.Errorf("Not implemented support for %s", v)
|
||||
|
||||
@@ -682,6 +682,94 @@ func TestCloudHypervisorHotplugRemoveDevice(t *testing.T) {
|
||||
assert.Error(err, "Hotplug remove pmem block device expected error")
|
||||
}
|
||||
|
||||
func TestCloudHypervisorColdPlugVFIODevice(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
clhConfig, err := newClhConfig()
|
||||
assert.NoError(err)
|
||||
|
||||
clh := &cloudHypervisor{}
|
||||
clh.config = clhConfig
|
||||
clh.devicesIds = make(map[string]string)
|
||||
clh.vmconfig = *chclient.NewVmConfig(*chclient.NewPayloadConfig())
|
||||
|
||||
// Cold-plug a PCI VFIO device
|
||||
dev := &config.VFIODev{
|
||||
ID: "gpu0",
|
||||
SysfsDev: "/sys/bus/pci/devices/0000:41:00.0",
|
||||
BDF: "0000:41:00.0",
|
||||
Type: config.VFIOPCIDeviceNormalType,
|
||||
}
|
||||
err = clh.coldPlugVFIODevice(dev)
|
||||
assert.NoError(err, "Cold-plug PCI VFIO device expected no error")
|
||||
|
||||
// Verify the device was added to vmconfig.Devices
|
||||
assert.NotNil(clh.vmconfig.Devices)
|
||||
assert.Len(*clh.vmconfig.Devices, 1)
|
||||
assert.Equal("/sys/bus/pci/devices/0000:41:00.0", (*clh.vmconfig.Devices)[0].Path)
|
||||
assert.Equal("gpu0", clh.devicesIds["gpu0"])
|
||||
|
||||
// Cold-plug a second device
|
||||
dev2 := &config.VFIODev{
|
||||
ID: "gpu1",
|
||||
SysfsDev: "/sys/bus/pci/devices/0000:42:00.0",
|
||||
BDF: "0000:42:00.0",
|
||||
Type: config.VFIOPCIDeviceNormalType,
|
||||
}
|
||||
err = clh.coldPlugVFIODevice(dev2)
|
||||
assert.NoError(err, "Cold-plug second VFIO device expected no error")
|
||||
assert.Len(*clh.vmconfig.Devices, 2)
|
||||
|
||||
// AP mediated device should fail
|
||||
apDev := &config.VFIODev{
|
||||
ID: "ap0",
|
||||
Type: config.VFIOAPDeviceMediatedType,
|
||||
}
|
||||
err = clh.coldPlugVFIODevice(apDev)
|
||||
assert.Error(err, "Cold-plug AP mediated device expected error")
|
||||
|
||||
// Error type (0) should fail
|
||||
errDev := &config.VFIODev{
|
||||
ID: "bad0",
|
||||
SysfsDev: "/sys/bus/pci/devices/0000:43:00.0",
|
||||
Type: config.VFIODeviceErrorType,
|
||||
}
|
||||
err = clh.coldPlugVFIODevice(errDev)
|
||||
assert.Error(err, "Cold-plug error-type device expected error")
|
||||
|
||||
// Empty SysfsDev should fail
|
||||
emptySysfsDev := &config.VFIODev{
|
||||
ID: "bad1",
|
||||
Type: config.VFIOPCIDeviceNormalType,
|
||||
}
|
||||
err = clh.coldPlugVFIODevice(emptySysfsDev)
|
||||
assert.Error(err, "Cold-plug with empty SysfsDev expected error")
|
||||
}
|
||||
|
||||
func TestCloudHypervisorAddDeviceVFIO(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
clhConfig, err := newClhConfig()
|
||||
assert.NoError(err)
|
||||
|
||||
clh := &cloudHypervisor{}
|
||||
clh.config = clhConfig
|
||||
clh.devicesIds = make(map[string]string)
|
||||
clh.vmconfig = *chclient.NewVmConfig(*chclient.NewPayloadConfig())
|
||||
|
||||
// AddDevice with VFIODev type should cold-plug
|
||||
dev := config.VFIODev{
|
||||
ID: "nic0",
|
||||
SysfsDev: "/sys/bus/pci/devices/0000:05:00.0",
|
||||
BDF: "0000:05:00.0",
|
||||
Type: config.VFIOPCIDeviceNormalType,
|
||||
}
|
||||
err = clh.AddDevice(context.Background(), dev, VfioDev)
|
||||
assert.NoError(err, "AddDevice VFIO expected no error")
|
||||
assert.NotNil(clh.vmconfig.Devices)
|
||||
assert.Len(*clh.vmconfig.Devices, 1)
|
||||
}
|
||||
|
||||
func TestClhGenerateSocket(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
|
||||
@@ -754,6 +754,16 @@ type HypervisorConfig struct {
|
||||
// Denotes whether flush requests for the device are ignored.
|
||||
BlockDeviceCacheNoflush bool
|
||||
|
||||
// BlockDeviceLogicalSectorSize specifies the logical sector size reported
|
||||
// by block devices to the guest, in bytes. Common values are 512 and 4096.
|
||||
// Set to 0 to use the hypervisor default.
|
||||
BlockDeviceLogicalSectorSize uint32
|
||||
|
||||
// BlockDevicePhysicalSectorSize specifies the physical sector size reported
|
||||
// by block devices to the guest, in bytes. Common values are 512 and 4096.
|
||||
// Set to 0 to use the hypervisor default.
|
||||
BlockDevicePhysicalSectorSize uint32
|
||||
|
||||
// DisableBlockDeviceUse disallows a block device from being used.
|
||||
DisableBlockDeviceUse bool
|
||||
|
||||
|
||||
@@ -201,62 +201,64 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) {
|
||||
}
|
||||
|
||||
ss.Config.HypervisorConfig = persistapi.HypervisorConfig{
|
||||
NumVCPUsF: sconfig.HypervisorConfig.NumVCPUsF,
|
||||
DefaultMaxVCPUs: sconfig.HypervisorConfig.DefaultMaxVCPUs,
|
||||
MemorySize: sconfig.HypervisorConfig.MemorySize,
|
||||
DefaultBridges: sconfig.HypervisorConfig.DefaultBridges,
|
||||
Msize9p: sconfig.HypervisorConfig.Msize9p,
|
||||
MemSlots: sconfig.HypervisorConfig.MemSlots,
|
||||
MemOffset: sconfig.HypervisorConfig.MemOffset,
|
||||
VirtioMem: sconfig.HypervisorConfig.VirtioMem,
|
||||
VirtioFSCacheSize: sconfig.HypervisorConfig.VirtioFSCacheSize,
|
||||
KernelPath: sconfig.HypervisorConfig.KernelPath,
|
||||
ImagePath: sconfig.HypervisorConfig.ImagePath,
|
||||
InitrdPath: sconfig.HypervisorConfig.InitrdPath,
|
||||
FirmwarePath: sconfig.HypervisorConfig.FirmwarePath,
|
||||
MachineAccelerators: sconfig.HypervisorConfig.MachineAccelerators,
|
||||
CPUFeatures: sconfig.HypervisorConfig.CPUFeatures,
|
||||
HypervisorPath: sconfig.HypervisorConfig.HypervisorPath,
|
||||
HypervisorPathList: sconfig.HypervisorConfig.HypervisorPathList,
|
||||
JailerPath: sconfig.HypervisorConfig.JailerPath,
|
||||
JailerPathList: sconfig.HypervisorConfig.JailerPathList,
|
||||
BlockDeviceDriver: sconfig.HypervisorConfig.BlockDeviceDriver,
|
||||
HypervisorMachineType: sconfig.HypervisorConfig.HypervisorMachineType,
|
||||
MemoryPath: sconfig.HypervisorConfig.MemoryPath,
|
||||
DevicesStatePath: sconfig.HypervisorConfig.DevicesStatePath,
|
||||
EntropySource: sconfig.HypervisorConfig.EntropySource,
|
||||
EntropySourceList: sconfig.HypervisorConfig.EntropySourceList,
|
||||
SharedFS: sconfig.HypervisorConfig.SharedFS,
|
||||
VirtioFSDaemon: sconfig.HypervisorConfig.VirtioFSDaemon,
|
||||
VirtioFSDaemonList: sconfig.HypervisorConfig.VirtioFSDaemonList,
|
||||
VirtioFSCache: sconfig.HypervisorConfig.VirtioFSCache,
|
||||
VirtioFSExtraArgs: sconfig.HypervisorConfig.VirtioFSExtraArgs[:],
|
||||
BlockDeviceCacheSet: sconfig.HypervisorConfig.BlockDeviceCacheSet,
|
||||
BlockDeviceCacheDirect: sconfig.HypervisorConfig.BlockDeviceCacheDirect,
|
||||
BlockDeviceCacheNoflush: sconfig.HypervisorConfig.BlockDeviceCacheNoflush,
|
||||
DisableBlockDeviceUse: sconfig.HypervisorConfig.DisableBlockDeviceUse,
|
||||
EnableIOThreads: sconfig.HypervisorConfig.EnableIOThreads,
|
||||
IndepIOThreads: sconfig.HypervisorConfig.IndepIOThreads,
|
||||
Debug: sconfig.HypervisorConfig.Debug,
|
||||
MemPrealloc: sconfig.HypervisorConfig.MemPrealloc,
|
||||
HugePages: sconfig.HypervisorConfig.HugePages,
|
||||
FileBackedMemRootDir: sconfig.HypervisorConfig.FileBackedMemRootDir,
|
||||
FileBackedMemRootList: sconfig.HypervisorConfig.FileBackedMemRootList,
|
||||
DisableNestingChecks: sconfig.HypervisorConfig.DisableNestingChecks,
|
||||
DisableImageNvdimm: sconfig.HypervisorConfig.DisableImageNvdimm,
|
||||
BootToBeTemplate: sconfig.HypervisorConfig.BootToBeTemplate,
|
||||
BootFromTemplate: sconfig.HypervisorConfig.BootFromTemplate,
|
||||
DisableVhostNet: sconfig.HypervisorConfig.DisableVhostNet,
|
||||
EnableVhostUserStore: sconfig.HypervisorConfig.EnableVhostUserStore,
|
||||
SeccompSandbox: sconfig.HypervisorConfig.SeccompSandbox,
|
||||
VhostUserStorePath: sconfig.HypervisorConfig.VhostUserStorePath,
|
||||
VhostUserStorePathList: sconfig.HypervisorConfig.VhostUserStorePathList,
|
||||
GuestHookPath: sconfig.HypervisorConfig.GuestHookPath,
|
||||
VMid: sconfig.HypervisorConfig.VMid,
|
||||
RxRateLimiterMaxRate: sconfig.HypervisorConfig.RxRateLimiterMaxRate,
|
||||
TxRateLimiterMaxRate: sconfig.HypervisorConfig.TxRateLimiterMaxRate,
|
||||
SGXEPCSize: sconfig.HypervisorConfig.SGXEPCSize,
|
||||
EnableAnnotations: sconfig.HypervisorConfig.EnableAnnotations,
|
||||
NumVCPUsF: sconfig.HypervisorConfig.NumVCPUsF,
|
||||
DefaultMaxVCPUs: sconfig.HypervisorConfig.DefaultMaxVCPUs,
|
||||
MemorySize: sconfig.HypervisorConfig.MemorySize,
|
||||
DefaultBridges: sconfig.HypervisorConfig.DefaultBridges,
|
||||
Msize9p: sconfig.HypervisorConfig.Msize9p,
|
||||
MemSlots: sconfig.HypervisorConfig.MemSlots,
|
||||
MemOffset: sconfig.HypervisorConfig.MemOffset,
|
||||
VirtioMem: sconfig.HypervisorConfig.VirtioMem,
|
||||
VirtioFSCacheSize: sconfig.HypervisorConfig.VirtioFSCacheSize,
|
||||
KernelPath: sconfig.HypervisorConfig.KernelPath,
|
||||
ImagePath: sconfig.HypervisorConfig.ImagePath,
|
||||
InitrdPath: sconfig.HypervisorConfig.InitrdPath,
|
||||
FirmwarePath: sconfig.HypervisorConfig.FirmwarePath,
|
||||
MachineAccelerators: sconfig.HypervisorConfig.MachineAccelerators,
|
||||
CPUFeatures: sconfig.HypervisorConfig.CPUFeatures,
|
||||
HypervisorPath: sconfig.HypervisorConfig.HypervisorPath,
|
||||
HypervisorPathList: sconfig.HypervisorConfig.HypervisorPathList,
|
||||
JailerPath: sconfig.HypervisorConfig.JailerPath,
|
||||
JailerPathList: sconfig.HypervisorConfig.JailerPathList,
|
||||
BlockDeviceDriver: sconfig.HypervisorConfig.BlockDeviceDriver,
|
||||
HypervisorMachineType: sconfig.HypervisorConfig.HypervisorMachineType,
|
||||
MemoryPath: sconfig.HypervisorConfig.MemoryPath,
|
||||
DevicesStatePath: sconfig.HypervisorConfig.DevicesStatePath,
|
||||
EntropySource: sconfig.HypervisorConfig.EntropySource,
|
||||
EntropySourceList: sconfig.HypervisorConfig.EntropySourceList,
|
||||
SharedFS: sconfig.HypervisorConfig.SharedFS,
|
||||
VirtioFSDaemon: sconfig.HypervisorConfig.VirtioFSDaemon,
|
||||
VirtioFSDaemonList: sconfig.HypervisorConfig.VirtioFSDaemonList,
|
||||
VirtioFSCache: sconfig.HypervisorConfig.VirtioFSCache,
|
||||
VirtioFSExtraArgs: sconfig.HypervisorConfig.VirtioFSExtraArgs[:],
|
||||
BlockDeviceCacheSet: sconfig.HypervisorConfig.BlockDeviceCacheSet,
|
||||
BlockDeviceCacheDirect: sconfig.HypervisorConfig.BlockDeviceCacheDirect,
|
||||
BlockDeviceCacheNoflush: sconfig.HypervisorConfig.BlockDeviceCacheNoflush,
|
||||
BlockDeviceLogicalSectorSize: sconfig.HypervisorConfig.BlockDeviceLogicalSectorSize,
|
||||
BlockDevicePhysicalSectorSize: sconfig.HypervisorConfig.BlockDevicePhysicalSectorSize,
|
||||
DisableBlockDeviceUse: sconfig.HypervisorConfig.DisableBlockDeviceUse,
|
||||
EnableIOThreads: sconfig.HypervisorConfig.EnableIOThreads,
|
||||
IndepIOThreads: sconfig.HypervisorConfig.IndepIOThreads,
|
||||
Debug: sconfig.HypervisorConfig.Debug,
|
||||
MemPrealloc: sconfig.HypervisorConfig.MemPrealloc,
|
||||
HugePages: sconfig.HypervisorConfig.HugePages,
|
||||
FileBackedMemRootDir: sconfig.HypervisorConfig.FileBackedMemRootDir,
|
||||
FileBackedMemRootList: sconfig.HypervisorConfig.FileBackedMemRootList,
|
||||
DisableNestingChecks: sconfig.HypervisorConfig.DisableNestingChecks,
|
||||
DisableImageNvdimm: sconfig.HypervisorConfig.DisableImageNvdimm,
|
||||
BootToBeTemplate: sconfig.HypervisorConfig.BootToBeTemplate,
|
||||
BootFromTemplate: sconfig.HypervisorConfig.BootFromTemplate,
|
||||
DisableVhostNet: sconfig.HypervisorConfig.DisableVhostNet,
|
||||
EnableVhostUserStore: sconfig.HypervisorConfig.EnableVhostUserStore,
|
||||
SeccompSandbox: sconfig.HypervisorConfig.SeccompSandbox,
|
||||
VhostUserStorePath: sconfig.HypervisorConfig.VhostUserStorePath,
|
||||
VhostUserStorePathList: sconfig.HypervisorConfig.VhostUserStorePathList,
|
||||
GuestHookPath: sconfig.HypervisorConfig.GuestHookPath,
|
||||
VMid: sconfig.HypervisorConfig.VMid,
|
||||
RxRateLimiterMaxRate: sconfig.HypervisorConfig.RxRateLimiterMaxRate,
|
||||
TxRateLimiterMaxRate: sconfig.HypervisorConfig.TxRateLimiterMaxRate,
|
||||
SGXEPCSize: sconfig.HypervisorConfig.SGXEPCSize,
|
||||
EnableAnnotations: sconfig.HypervisorConfig.EnableAnnotations,
|
||||
}
|
||||
|
||||
ss.Config.KataAgentConfig = &persistapi.KataAgentConfig{
|
||||
@@ -441,65 +443,67 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) {
|
||||
|
||||
hconf := savedConf.HypervisorConfig
|
||||
sconfig.HypervisorConfig = HypervisorConfig{
|
||||
NumVCPUsF: hconf.NumVCPUsF,
|
||||
DefaultMaxVCPUs: hconf.DefaultMaxVCPUs,
|
||||
MemorySize: hconf.MemorySize,
|
||||
DefaultBridges: hconf.DefaultBridges,
|
||||
Msize9p: hconf.Msize9p,
|
||||
MemSlots: hconf.MemSlots,
|
||||
MemOffset: hconf.MemOffset,
|
||||
VirtioMem: hconf.VirtioMem,
|
||||
VirtioFSCacheSize: hconf.VirtioFSCacheSize,
|
||||
KernelPath: hconf.KernelPath,
|
||||
ImagePath: hconf.ImagePath,
|
||||
InitrdPath: hconf.InitrdPath,
|
||||
FirmwarePath: hconf.FirmwarePath,
|
||||
MachineAccelerators: hconf.MachineAccelerators,
|
||||
CPUFeatures: hconf.CPUFeatures,
|
||||
HypervisorPath: hconf.HypervisorPath,
|
||||
HypervisorPathList: hconf.HypervisorPathList,
|
||||
JailerPath: hconf.JailerPath,
|
||||
JailerPathList: hconf.JailerPathList,
|
||||
BlockDeviceDriver: hconf.BlockDeviceDriver,
|
||||
HypervisorMachineType: hconf.HypervisorMachineType,
|
||||
MemoryPath: hconf.MemoryPath,
|
||||
DevicesStatePath: hconf.DevicesStatePath,
|
||||
EntropySource: hconf.EntropySource,
|
||||
EntropySourceList: hconf.EntropySourceList,
|
||||
SharedFS: hconf.SharedFS,
|
||||
VirtioFSDaemon: hconf.VirtioFSDaemon,
|
||||
VirtioFSDaemonList: hconf.VirtioFSDaemonList,
|
||||
VirtioFSCache: hconf.VirtioFSCache,
|
||||
VirtioFSExtraArgs: hconf.VirtioFSExtraArgs[:],
|
||||
BlockDeviceCacheSet: hconf.BlockDeviceCacheSet,
|
||||
BlockDeviceCacheDirect: hconf.BlockDeviceCacheDirect,
|
||||
BlockDeviceCacheNoflush: hconf.BlockDeviceCacheNoflush,
|
||||
DisableBlockDeviceUse: hconf.DisableBlockDeviceUse,
|
||||
EnableIOThreads: hconf.EnableIOThreads,
|
||||
IndepIOThreads: hconf.IndepIOThreads,
|
||||
Debug: hconf.Debug,
|
||||
MemPrealloc: hconf.MemPrealloc,
|
||||
HugePages: hconf.HugePages,
|
||||
FileBackedMemRootDir: hconf.FileBackedMemRootDir,
|
||||
FileBackedMemRootList: hconf.FileBackedMemRootList,
|
||||
DisableNestingChecks: hconf.DisableNestingChecks,
|
||||
DisableImageNvdimm: hconf.DisableImageNvdimm,
|
||||
HotPlugVFIO: hconf.HotPlugVFIO,
|
||||
ColdPlugVFIO: hconf.ColdPlugVFIO,
|
||||
PCIeRootPort: hconf.PCIeRootPort,
|
||||
PCIeSwitchPort: hconf.PCIeSwitchPort,
|
||||
BootToBeTemplate: hconf.BootToBeTemplate,
|
||||
BootFromTemplate: hconf.BootFromTemplate,
|
||||
DisableVhostNet: hconf.DisableVhostNet,
|
||||
EnableVhostUserStore: hconf.EnableVhostUserStore,
|
||||
VhostUserStorePath: hconf.VhostUserStorePath,
|
||||
VhostUserStorePathList: hconf.VhostUserStorePathList,
|
||||
GuestHookPath: hconf.GuestHookPath,
|
||||
VMid: hconf.VMid,
|
||||
RxRateLimiterMaxRate: hconf.RxRateLimiterMaxRate,
|
||||
TxRateLimiterMaxRate: hconf.TxRateLimiterMaxRate,
|
||||
SGXEPCSize: hconf.SGXEPCSize,
|
||||
EnableAnnotations: hconf.EnableAnnotations,
|
||||
NumVCPUsF: hconf.NumVCPUsF,
|
||||
DefaultMaxVCPUs: hconf.DefaultMaxVCPUs,
|
||||
MemorySize: hconf.MemorySize,
|
||||
DefaultBridges: hconf.DefaultBridges,
|
||||
Msize9p: hconf.Msize9p,
|
||||
MemSlots: hconf.MemSlots,
|
||||
MemOffset: hconf.MemOffset,
|
||||
VirtioMem: hconf.VirtioMem,
|
||||
VirtioFSCacheSize: hconf.VirtioFSCacheSize,
|
||||
KernelPath: hconf.KernelPath,
|
||||
ImagePath: hconf.ImagePath,
|
||||
InitrdPath: hconf.InitrdPath,
|
||||
FirmwarePath: hconf.FirmwarePath,
|
||||
MachineAccelerators: hconf.MachineAccelerators,
|
||||
CPUFeatures: hconf.CPUFeatures,
|
||||
HypervisorPath: hconf.HypervisorPath,
|
||||
HypervisorPathList: hconf.HypervisorPathList,
|
||||
JailerPath: hconf.JailerPath,
|
||||
JailerPathList: hconf.JailerPathList,
|
||||
BlockDeviceDriver: hconf.BlockDeviceDriver,
|
||||
HypervisorMachineType: hconf.HypervisorMachineType,
|
||||
MemoryPath: hconf.MemoryPath,
|
||||
DevicesStatePath: hconf.DevicesStatePath,
|
||||
EntropySource: hconf.EntropySource,
|
||||
EntropySourceList: hconf.EntropySourceList,
|
||||
SharedFS: hconf.SharedFS,
|
||||
VirtioFSDaemon: hconf.VirtioFSDaemon,
|
||||
VirtioFSDaemonList: hconf.VirtioFSDaemonList,
|
||||
VirtioFSCache: hconf.VirtioFSCache,
|
||||
VirtioFSExtraArgs: hconf.VirtioFSExtraArgs[:],
|
||||
BlockDeviceCacheSet: hconf.BlockDeviceCacheSet,
|
||||
BlockDeviceCacheDirect: hconf.BlockDeviceCacheDirect,
|
||||
BlockDeviceCacheNoflush: hconf.BlockDeviceCacheNoflush,
|
||||
BlockDeviceLogicalSectorSize: hconf.BlockDeviceLogicalSectorSize,
|
||||
BlockDevicePhysicalSectorSize: hconf.BlockDevicePhysicalSectorSize,
|
||||
DisableBlockDeviceUse: hconf.DisableBlockDeviceUse,
|
||||
EnableIOThreads: hconf.EnableIOThreads,
|
||||
IndepIOThreads: hconf.IndepIOThreads,
|
||||
Debug: hconf.Debug,
|
||||
MemPrealloc: hconf.MemPrealloc,
|
||||
HugePages: hconf.HugePages,
|
||||
FileBackedMemRootDir: hconf.FileBackedMemRootDir,
|
||||
FileBackedMemRootList: hconf.FileBackedMemRootList,
|
||||
DisableNestingChecks: hconf.DisableNestingChecks,
|
||||
DisableImageNvdimm: hconf.DisableImageNvdimm,
|
||||
HotPlugVFIO: hconf.HotPlugVFIO,
|
||||
ColdPlugVFIO: hconf.ColdPlugVFIO,
|
||||
PCIeRootPort: hconf.PCIeRootPort,
|
||||
PCIeSwitchPort: hconf.PCIeSwitchPort,
|
||||
BootToBeTemplate: hconf.BootToBeTemplate,
|
||||
BootFromTemplate: hconf.BootFromTemplate,
|
||||
DisableVhostNet: hconf.DisableVhostNet,
|
||||
EnableVhostUserStore: hconf.EnableVhostUserStore,
|
||||
VhostUserStorePath: hconf.VhostUserStorePath,
|
||||
VhostUserStorePathList: hconf.VhostUserStorePathList,
|
||||
GuestHookPath: hconf.GuestHookPath,
|
||||
VMid: hconf.VMid,
|
||||
RxRateLimiterMaxRate: hconf.RxRateLimiterMaxRate,
|
||||
TxRateLimiterMaxRate: hconf.TxRateLimiterMaxRate,
|
||||
SGXEPCSize: hconf.SGXEPCSize,
|
||||
EnableAnnotations: hconf.EnableAnnotations,
|
||||
}
|
||||
|
||||
sconfig.AgentConfig = KataAgentConfig{
|
||||
|
||||
@@ -157,6 +157,14 @@ type HypervisorConfig struct {
|
||||
// Denotes whether flush requests for the device are ignored.
|
||||
BlockDeviceCacheNoflush bool
|
||||
|
||||
// BlockDeviceLogicalSectorSize specifies the logical sector size reported
|
||||
// by block devices to the guest, in bytes.
|
||||
BlockDeviceLogicalSectorSize uint32
|
||||
|
||||
// BlockDevicePhysicalSectorSize specifies the physical sector size reported
|
||||
// by block devices to the guest, in bytes.
|
||||
BlockDevicePhysicalSectorSize uint32
|
||||
|
||||
// DisableBlockDeviceUse disallows a block device from being used.
|
||||
DisableBlockDeviceUse bool
|
||||
|
||||
|
||||
@@ -242,6 +242,21 @@ const (
|
||||
// Denotes whether flush requests for the device are ignored.
|
||||
BlockDeviceCacheNoflush = kataAnnotHypervisorPrefix + "block_device_cache_noflush"
|
||||
|
||||
// BlockDeviceLogicalSectorSize is a sandbox annotation that specifies the logical sector size
|
||||
// reported by block devices to the guest, in bytes. Common values are 512 and 4096.
|
||||
// Set to 0 to use the hypervisor default.
|
||||
// NOTE: the annotation key uses the abbreviated "blk_logical_sector_size" rather than
|
||||
// "block_device_logical_sector_size" (as used in the config file) because Kubernetes
|
||||
// enforces a 63-character limit on annotation name segments, and the full name with the
|
||||
// "io.katacontainers.config.hypervisor." prefix would exceed that limit.
|
||||
BlockDeviceLogicalSectorSize = kataAnnotHypervisorPrefix + "blk_logical_sector_size"
|
||||
|
||||
// BlockDevicePhysicalSectorSize is a sandbox annotation that specifies the physical sector size
|
||||
// reported by block devices to the guest, in bytes. Common values are 512 and 4096.
|
||||
// Set to 0 to use the hypervisor default.
|
||||
// NOTE: see BlockDeviceLogicalSectorSize for the reason the annotation key is abbreviated.
|
||||
BlockDevicePhysicalSectorSize = kataAnnotHypervisorPrefix + "blk_physical_sector_size"
|
||||
|
||||
// RxRateLimiterMaxRate is a sandbox annotation that specifies max rate on network I/O inbound bandwidth.
|
||||
RxRateLimiterMaxRate = kataAnnotHypervisorPrefix + "rx_rate_limiter_max_rate"
|
||||
|
||||
|
||||
@@ -1686,7 +1686,7 @@ func (q *qemu) hotplugAddBlockDevice(ctx context.Context, drive *config.BlockDri
|
||||
iothreadID = fmt.Sprintf("%s_%d", indepIOThreadsPrefix, 0)
|
||||
}
|
||||
|
||||
if err = q.qmpMonitorCh.qmp.ExecutePCIDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, addr, bridge.ID, romFile, queues, true, defaultDisableModern, iothreadID); err != nil {
|
||||
if err = q.qmpMonitorCh.qmp.ExecutePCIDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, addr, bridge.ID, romFile, queues, true, defaultDisableModern, iothreadID, q.config.BlockDeviceLogicalSectorSize, q.config.BlockDevicePhysicalSectorSize); err != nil {
|
||||
return err
|
||||
}
|
||||
case q.config.BlockDeviceDriver == config.VirtioBlockCCW:
|
||||
@@ -1705,7 +1705,7 @@ func (q *qemu) hotplugAddBlockDevice(ctx context.Context, drive *config.BlockDri
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err = q.qmpMonitorCh.qmp.ExecuteDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, devNoHotplug, "", true, false); err != nil {
|
||||
if err = q.qmpMonitorCh.qmp.ExecuteDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, devNoHotplug, "", true, false, q.config.BlockDeviceLogicalSectorSize, q.config.BlockDevicePhysicalSectorSize); err != nil {
|
||||
return err
|
||||
}
|
||||
case q.config.BlockDeviceDriver == config.VirtioSCSI:
|
||||
|
||||
@@ -905,7 +905,7 @@ func (s *stratovirt) hotplugBlk(ctx context.Context, drive *config.BlockDrive, o
|
||||
}
|
||||
|
||||
devAddr := fmt.Sprintf("%d", slot)
|
||||
if err := s.qmpMonitorCh.qmp.ExecutePCIDeviceAdd(s.qmpMonitorCh.ctx, drive.ID, drive.ID, driver, devAddr, "", "", 0, false, false, ""); err != nil {
|
||||
if err := s.qmpMonitorCh.qmp.ExecutePCIDeviceAdd(s.qmpMonitorCh.ctx, drive.ID, drive.ID, driver, devAddr, "", "", 0, false, false, "", s.config.BlockDeviceLogicalSectorSize, s.config.BlockDevicePhysicalSectorSize); err != nil {
|
||||
return err
|
||||
}
|
||||
case RemoveDevice:
|
||||
|
||||
@@ -790,23 +790,6 @@ function helm_helper() {
|
||||
# Always unset first to clear any defaults from base file
|
||||
yq -i ".snapshotter.setup = []" "${values_yaml}"
|
||||
|
||||
# For TDX and SNP shims, snapshotter.setup must ALWAYS be disabled in CI
|
||||
# Check if any TDX/SNP shims are enabled
|
||||
disable_snapshotter_setup=false
|
||||
for shim in ${HELM_SHIMS}; do
|
||||
case "${shim}" in
|
||||
qemu-tdx|qemu-snp)
|
||||
disable_snapshotter_setup=true
|
||||
break
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Safety check: Fail if EXPERIMENTAL_SETUP_SNAPSHOTTER is set when using SNP/TDX shims
|
||||
if [[ "${disable_snapshotter_setup}" == "true" ]] && [[ -n "${HELM_EXPERIMENTAL_SETUP_SNAPSHOTTER}" ]]; then
|
||||
die "ERROR: HELM_EXPERIMENTAL_SETUP_SNAPSHOTTER cannot be set when using SNP/TDX shims (qemu-snp, qemu-tdx, qemu-nvidia-gpu-snp, qemu-nvidia-gpu-tdx). snapshotter.setup must always be disabled for these shims."
|
||||
fi
|
||||
|
||||
if [[ -n "${HELM_EXPERIMENTAL_SETUP_SNAPSHOTTER}" ]]; then
|
||||
# Convert space-separated or comma-separated list to YAML array
|
||||
IFS=', ' read -ra snapshotter_list <<< "${HELM_EXPERIMENTAL_SETUP_SNAPSHOTTER}"
|
||||
|
||||
@@ -272,12 +272,29 @@ kbs_uninstall_cli() {
|
||||
fi
|
||||
}
|
||||
|
||||
# Ensure the shared CI python virtualenv (~/.cicd/venv) exists, creating it on
# first use, and activate it in the caller's shell.
ensure_cicd_python_venv() {
	local venv_dir="${HOME}/.cicd/venv"

	if ! [[ -f "${venv_dir}/bin/activate" ]]; then
		# NIM tests need Python 3.10 via pyenv; attestation uses system python3. Both are fine.
		if command -v pyenv &>/dev/null; then
			export PYENV_ROOT="${HOME}/.pyenv"
			if [[ -d "${PYENV_ROOT}/bin" ]]; then
				export PATH="${PYENV_ROOT}/bin:${PATH}"
			fi
			eval "$(pyenv init - bash)"
		fi
		mkdir -p "${HOME}/.cicd"
		python3 -m venv "${venv_dir}"
	fi

	# shellcheck disable=SC1091
	source "${venv_dir}/bin/activate"
}
|
||||
|
||||
# Ensure the sev-snp-measure utility is installed.
#
# Installs it into the shared CI virtualenv when it is not already on PATH.
ensure_sev_snp_measure() {
	# Already available — nothing to do.
	command -v sev-snp-measure >/dev/null && return

	# ensure_cicd_python_venv creates ~/.cicd/venv on first use and activates
	# it in this shell, so sourcing the activate script directly here would be
	# both redundant and fragile (it fails before the venv exists).
	ensure_cicd_python_venv
	pip install sev-snp-measure
}
|
||||
|
||||
|
||||
@@ -176,7 +176,7 @@ function deploy_kata() {
|
||||
|
||||
# Workaround to avoid modifying the workflow yaml files
|
||||
case "${KATA_HYPERVISOR}" in
|
||||
qemu-nvidia-gpu-*)
|
||||
qemu-tdx|qemu-snp|qemu-nvidia-gpu-*)
|
||||
USE_EXPERIMENTAL_SETUP_SNAPSHOTTER=true
|
||||
SNAPSHOTTER="nydus"
|
||||
EXPERIMENTAL_FORCE_GUEST_PULL=false
|
||||
@@ -208,26 +208,14 @@ function deploy_kata() {
|
||||
HOST_OS="${KATA_HOST_OS}"
|
||||
fi
|
||||
|
||||
# nydus and erofs are always deployed by kata-deploy; set this unconditionally
|
||||
# based on the snapshotter so that all architectures and hypervisors work
|
||||
# without needing per-workflow USE_EXPERIMENTAL_SETUP_SNAPSHOTTER overrides.
|
||||
EXPERIMENTAL_SETUP_SNAPSHOTTER=""
|
||||
if [[ "${USE_EXPERIMENTAL_SETUP_SNAPSHOTTER:-false}" == "true" ]]; then
|
||||
case "${SNAPSHOTTER}" in
|
||||
nydus|erofs)
|
||||
ARCH="$(uname -m)"
|
||||
# We only want to tests this for the qemu-coco-dev and
|
||||
# qemu-coco-dev-runtime-rs runtime classes
|
||||
# as they are running on a GitHub runner (and not on a BM machine),
|
||||
# and there the snapshotter is deployed on every run (rather than
|
||||
# deployed when the machine is configured, as on the BM machines).
|
||||
if [[ ${ARCH} == "x86_64" ]]; then
|
||||
case "${KATA_HYPERVISOR}" in
|
||||
qemu-coco-dev*|qemu-nvidia-gpu-*) EXPERIMENTAL_SETUP_SNAPSHOTTER="${SNAPSHOTTER}" ;;
|
||||
*) ;;
|
||||
esac
|
||||
fi
|
||||
;;
|
||||
*) ;;
|
||||
esac
|
||||
fi
|
||||
case "${SNAPSHOTTER}" in
|
||||
nydus|erofs) EXPERIMENTAL_SETUP_SNAPSHOTTER="${SNAPSHOTTER}" ;;
|
||||
*) ;;
|
||||
esac
|
||||
|
||||
EXPERIMENTAL_FORCE_GUEST_PULL="${EXPERIMENTAL_FORCE_GUEST_PULL:-}"
|
||||
|
||||
@@ -476,92 +464,11 @@ function cleanup_snapshotter() {
|
||||
}
|
||||
|
||||
# Deploy the nydus snapshotter as a DaemonSet into the cluster, configured for
# guest-pull, then wait for its rollout and dump its logs/describe output into
# collapsible CI log groups.
function deploy_nydus_snapshotter() {
	echo "::group::deploy_nydus_snapshotter"
	ensure_yq

	# Clone the pinned nydus-snapshotter release into a throw-away directory,
	# removing any leftover checkout from a previous run first.
	local nydus_snapshotter_install_dir
	nydus_snapshotter_install_dir="/tmp/nydus-snapshotter"
	if [[ -d "${nydus_snapshotter_install_dir}" ]]; then
		rm -rf "${nydus_snapshotter_install_dir}"
	fi
	mkdir -p "${nydus_snapshotter_install_dir}"
	nydus_snapshotter_url=$(get_from_kata_deps ".externals.nydus-snapshotter.url")
	nydus_snapshotter_version=$(get_from_kata_deps ".externals.nydus-snapshotter.version")
	git clone -b "${nydus_snapshotter_version}" "${nydus_snapshotter_url}" "${nydus_snapshotter_install_dir}"

	pushd "${nydus_snapshotter_install_dir}"
	# On baremetal the snapshotter persists between runs, so tear down any
	# previous deployment first (best-effort).
	if [[ "${K8S_TEST_HOST_TYPE}" = "baremetal" ]]; then
		cleanup_nydus_snapshotter || true
	fi
	# Only the guest-pull configuration is supported by this helper.
	if [[ "${PULL_TYPE}" == "guest-pull" ]]; then
		# Enable guest pull feature in nydus snapshotter
		yq -i \
			'select(.kind == "ConfigMap").data.FS_DRIVER = "proxy"' \
			misc/snapshotter/base/nydus-snapshotter.yaml
	else
		>&2 echo "Invalid pull type"; exit 2
	fi

	# Disable to read snapshotter config from configmap
	yq -i \
		'select(.kind == "ConfigMap").data.ENABLE_CONFIG_FROM_VOLUME = "false"' \
		misc/snapshotter/base/nydus-snapshotter.yaml
	# Enable to run snapshotter as a systemd service
	yq -i \
		'select(.kind == "ConfigMap").data.ENABLE_SYSTEMD_SERVICE = "true"' \
		misc/snapshotter/base/nydus-snapshotter.yaml
	# Enable "runtime specific snapshotter" feature in containerd when configuring containerd for snapshotter
	yq -i \
		'select(.kind == "ConfigMap").data.ENABLE_RUNTIME_SPECIFIC_SNAPSHOTTER = "true"' \
		misc/snapshotter/base/nydus-snapshotter.yaml

	# Pin the version of nydus-snapshotter image.
	# TODO: replace with a definitive solution (see https://github.com/kata-containers/kata-containers/issues/9742)
	yq -i \
		"select(.kind == \"DaemonSet\").spec.template.spec.containers[0].image = \"ghcr.io/containerd/nydus-snapshotter:${nydus_snapshotter_version}\"" \
		misc/snapshotter/base/nydus-snapshotter.yaml

	# Deploy nydus snapshotter as a daemonset
	kubectl_retry create -f "misc/snapshotter/nydus-snapshotter-rbac.yaml"
	# k3s needs the kustomize overlay; other distributions use the base manifest.
	if [[ "${KUBERNETES}" = "k3s" ]]; then
		kubectl_retry apply -k "misc/snapshotter/overlays/k3s"
	else
		kubectl_retry apply -f "misc/snapshotter/base/nydus-snapshotter.yaml"
	fi
	popd

	# Block until every DaemonSet pod is ready (or the timeout elapses).
	kubectl rollout status daemonset nydus-snapshotter -n nydus-system --timeout "${SNAPSHOTTER_DEPLOY_WAIT_TIMEOUT}"

	echo "::endgroup::"
	echo "::group::nydus snapshotter logs"
	kubectl_retry logs --selector=app=nydus-snapshotter -n nydus-system
	echo "::endgroup::"
	echo "::group::nydus snapshotter describe"
	kubectl_retry describe pod --selector=app=nydus-snapshotter -n nydus-system
	echo "::endgroup::"
	# NOTE(review): the message below says there is "nothing to do here", which
	# contradicts the deployment work performed above — confirm whether this
	# function has been superseded by kata-deploy and should be emptied.
	echo "nydus-for-kata-tee is now deployed and managed by kata-deploy; nothing to do here."
}
|
||||
|
||||
# Remove the nydus snapshotter deployment created by deploy_nydus_snapshotter,
# using the manifests from the previously cloned checkout in /tmp.
function cleanup_nydus_snapshotter() {
	echo "cleanup_nydus_snapshotter"
	# The cleanup reuses the manifests from the deploy-time git checkout; it
	# cannot proceed without that directory.
	local nydus_snapshotter_install_dir
	nydus_snapshotter_install_dir="/tmp/nydus-snapshotter"
	if [[ ! -d "${nydus_snapshotter_install_dir}" ]]; then
		>&2 echo "nydus snapshotter dir not found"
		exit 1
	fi

	pushd "${nydus_snapshotter_install_dir}"

	# Delete with the same manifest flavour that was applied at deploy time.
	if [[ "${KUBERNETES}" = "k3s" ]]; then
		kubectl_retry delete --ignore-not-found -k "misc/snapshotter/overlays/k3s"
	else
		kubectl_retry delete --ignore-not-found -f "misc/snapshotter/base/nydus-snapshotter.yaml"
	fi
	# NOTE(review): fixed sleeps presumably give the DaemonSet pods and RBAC
	# teardown time to settle — confirm whether a kubectl wait would be more
	# reliable than these delays.
	sleep 180s
	kubectl_retry delete --ignore-not-found -f "misc/snapshotter/nydus-snapshotter-rbac.yaml"
	popd
	sleep 30s
	# NOTE(review): this "::endgroup::" has no matching "::group::" opened in
	# this function — confirm it is intentional.
	echo "::endgroup::"
	# NOTE(review): the message below says there is "nothing to do here", which
	# contradicts the cleanup work performed above — confirm whether this
	# function has been superseded by kata-deploy and should be emptied.
	echo "nydus-for-kata-tee is now deployed and managed by kata-deploy; nothing to do here."
}
|
||||
|
||||
function main() {
|
||||
|
||||
@@ -60,8 +60,6 @@ setup() {
|
||||
local pod_file
|
||||
local uid
|
||||
|
||||
[[ "${KATA_HYPERVISOR}" = qemu-se* ]] && \
|
||||
skip "See: https://github.com/kata-containers/kata-containers/issues/10002"
|
||||
# This is a reproducer of k8s e2e "[sig-storage] EmptyDir volumes when FSGroup is specified [LinuxOnly] [NodeFeature:FSGroup] new files should be created with FSGroup ownership when container is non-root" test
|
||||
pod_file="${pod_config_dir}/pod-empty-dir-fsgroup.yaml"
|
||||
agnhost_name="${container_images_agnhost_name}"
|
||||
|
||||
@@ -70,8 +70,7 @@ NGC_API_KEY_SEALED_SECRET_EMBEDQA_BASE64=$(echo -n "${NGC_API_KEY_SEALED_SECRET_
|
||||
export NGC_API_KEY_SEALED_SECRET_EMBEDQA_BASE64
|
||||
|
||||
setup_langchain_flow() {
|
||||
# shellcheck disable=SC1091 # Sourcing virtual environment activation script
|
||||
source "${HOME}"/.cicd/venv/bin/activate
|
||||
ensure_cicd_python_venv
|
||||
|
||||
pip install --upgrade pip
|
||||
[[ "$(pip show langchain 2>/dev/null | awk '/^Version:/{print $2}')" = "0.2.5" ]] || pip install langchain==0.2.5
|
||||
@@ -177,13 +176,6 @@ setup_file() {
|
||||
|
||||
dpkg -s jq >/dev/null 2>&1 || sudo apt -y install jq
|
||||
|
||||
export PYENV_ROOT="${HOME}/.pyenv"
|
||||
[[ -d ${PYENV_ROOT}/bin ]] && export PATH="${PYENV_ROOT}/bin:${PATH}"
|
||||
eval "$(pyenv init - bash)"
|
||||
|
||||
# shellcheck disable=SC1091 # Virtual environment will be created during test execution
|
||||
python3 -m venv "${HOME}"/.cicd/venv
|
||||
|
||||
setup_langchain_flow
|
||||
|
||||
policy_settings_dir="$(create_tmp_policy_settings_dir "${pod_config_dir}")"
|
||||
@@ -262,8 +254,6 @@ setup_file() {
|
||||
QUESTION="What is the capital of France?"
|
||||
ANSWER="The capital of France is Paris."
|
||||
|
||||
# shellcheck disable=SC1091 # Sourcing virtual environment activation script
|
||||
source "${HOME}"/.cicd/venv/bin/activate
|
||||
# shellcheck disable=SC2031 # Variables are used in heredoc, not subshell
|
||||
cat <<EOF >"${HOME}"/.cicd/venv/langchain_nim.py
|
||||
from langchain_nvidia_ai_endpoints import ChatNVIDIA
|
||||
@@ -295,8 +285,6 @@ EOF
|
||||
# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
|
||||
[[ -n "${MODEL_NAME}" ]]
|
||||
|
||||
# shellcheck disable=SC1091 # Sourcing virtual environment activation script
|
||||
source "${HOME}"/.cicd/venv/bin/activate
|
||||
cat <<EOF >"${HOME}"/.cicd/venv/langchain_nim_kata_rag.py
|
||||
import os
|
||||
from langchain.chains import ConversationalRetrievalChain, LLMChain
|
||||
|
||||
@@ -10,8 +10,6 @@ load "${BATS_TEST_DIRNAME}/../../common.bash"
|
||||
load "${BATS_TEST_DIRNAME}/tests_common.sh"
|
||||
|
||||
setup() {
|
||||
[[ "${KATA_HYPERVISOR}" == qemu-se* ]] && \
|
||||
skip "See: https://github.com/kata-containers/kata-containers/issues/10002"
|
||||
setup_common || die "setup_common failed"
|
||||
}
|
||||
|
||||
@@ -68,8 +66,6 @@ setup() {
|
||||
}
|
||||
|
||||
teardown() {
|
||||
[[ "${KATA_HYPERVISOR}" == qemu-se* ]] && \
|
||||
skip "See: https://github.com/kata-containers/kata-containers/issues/10002"
|
||||
delete_tmp_policy_settings_dir "${policy_settings_dir}"
|
||||
teardown_common "${node}" "${node_start_time:-}"
|
||||
}
|
||||
|
||||
@@ -9,7 +9,6 @@ load "${BATS_TEST_DIRNAME}/tests_common.sh"
|
||||
|
||||
setup() {
|
||||
is_confidential_runtime_class || skip "Only supported for CoCo"
|
||||
[[ "$(uname -m)" == "s390x" ]] && skip "Not supported on s390x"
|
||||
[[ "${KATA_HYPERVISOR}" == *-runtime-rs ]] && skip "Not supported with runtime-rs"
|
||||
|
||||
setup_common
|
||||
@@ -87,7 +86,6 @@ setup() {
|
||||
|
||||
teardown() {
|
||||
is_confidential_runtime_class || skip "Only supported for CoCo"
|
||||
[[ "$(uname -m)" == "s390x" ]] && skip "Not supported on s390x"
|
||||
[[ "${KATA_HYPERVISOR}" == *-runtime-rs ]] && skip "Not supported with runtime-rs"
|
||||
|
||||
confidential_teardown_common "${node}" "${node_start_time:-}"
|
||||
|
||||
@@ -69,7 +69,14 @@ spec:
|
||||
limits:
|
||||
nvidia.com/pgpu: "1"
|
||||
cpu: "16"
|
||||
memory: "128Gi"
|
||||
memory: "64Gi"
|
||||
volumeMounts:
|
||||
- name: nim-trusted-cache
|
||||
mountPath: /opt/nim/.cache
|
||||
volumes:
|
||||
- name: nim-trusted-cache
|
||||
emptyDir:
|
||||
sizeLimit: 64Gi
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
|
||||
@@ -79,7 +79,14 @@ spec:
|
||||
limits:
|
||||
nvidia.com/pgpu: "1"
|
||||
cpu: "16"
|
||||
memory: "48Gi"
|
||||
memory: "32Gi"
|
||||
volumeMounts:
|
||||
- name: nim-trusted-cache
|
||||
mountPath: /opt/nim/.cache
|
||||
volumes:
|
||||
- name: nim-trusted-cache
|
||||
emptyDir:
|
||||
sizeLimit: 40Gi
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
|
||||
@@ -150,7 +150,7 @@ install_genpolicy_drop_ins() {
|
||||
cp "${examples_dir}/20-oci-1.2.0-drop-in.json" "${settings_d}/"
|
||||
elif is_k3s_or_rke2; then
|
||||
cp "${examples_dir}/20-oci-1.2.1-drop-in.json" "${settings_d}/"
|
||||
elif is_nvidia_gpu_platform || [[ -n "${CONTAINER_ENGINE_VERSION:-}" ]]; then
|
||||
elif is_nvidia_gpu_platform || [[ "${KATA_HYPERVISOR}" == "qemu-snp" ]] || [[ "${KATA_HYPERVISOR}" == "qemu-tdx" ]] || [[ -n "${CONTAINER_ENGINE_VERSION:-}" ]]; then
|
||||
cp "${examples_dir}/20-oci-1.3.0-drop-in.json" "${settings_d}/"
|
||||
fi
|
||||
|
||||
|
||||
@@ -62,6 +62,8 @@ hugepage
|
||||
MACVTAP
|
||||
memdisk
|
||||
pmem
|
||||
Sharedfs
|
||||
Initdata
|
||||
|
||||
# Networking & Communication
|
||||
netns
|
||||
@@ -74,6 +76,8 @@ cgroupsv1
|
||||
coredump
|
||||
CPUSET
|
||||
crio
|
||||
nerdctl
|
||||
dockershim
|
||||
dentries
|
||||
hypercalls
|
||||
inodes
|
||||
@@ -89,3 +93,13 @@ unbootable
|
||||
userspace
|
||||
eBPF
|
||||
dwarves
|
||||
passthroughfd
|
||||
passfd
|
||||
sendfd
|
||||
RunD
|
||||
udevd
|
||||
Reflink
|
||||
UNBINDABLE
|
||||
RAII
|
||||
Virt
|
||||
sysfs
|
||||
|
||||
@@ -9,7 +9,7 @@ FROM golang:1.24-alpine AS nydus-binary-downloader
|
||||
|
||||
# Keep the version here aligned with "ndyus-snapshotter.version"
|
||||
# in versions.yaml
|
||||
ARG NYDUS_SNAPSHOTTER_VERSION=v0.15.10
|
||||
ARG NYDUS_SNAPSHOTTER_VERSION=v0.15.13
|
||||
ARG NYDUS_SNAPSHOTTER_REPO=https://github.com/containerd/nydus-snapshotter
|
||||
|
||||
RUN \
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
use crate::config::Config;
|
||||
use crate::config::{Config, NYDUS_FOR_KATA_TEE};
|
||||
use crate::runtime::containerd;
|
||||
use crate::utils;
|
||||
use crate::utils::toml as toml_utils;
|
||||
@@ -49,17 +49,14 @@ pub async fn configure_nydus_snapshotter(
|
||||
configuration_file: &Path,
|
||||
pluginid: &str,
|
||||
) -> Result<()> {
|
||||
info!("Configuring nydus-snapshotter");
|
||||
info!("Configuring {NYDUS_FOR_KATA_TEE}");
|
||||
|
||||
let nydus = match config.multi_install_suffix.as_ref() {
|
||||
Some(suffix) if !suffix.is_empty() => format!("nydus-{suffix}"),
|
||||
_ => "nydus".to_string(),
|
||||
Some(suffix) if !suffix.is_empty() => format!("{NYDUS_FOR_KATA_TEE}-{suffix}"),
|
||||
_ => NYDUS_FOR_KATA_TEE.to_string(),
|
||||
};
|
||||
|
||||
let containerd_nydus = match config.multi_install_suffix.as_ref() {
|
||||
Some(suffix) if !suffix.is_empty() => format!("nydus-snapshotter-{suffix}"),
|
||||
_ => "nydus-snapshotter".to_string(),
|
||||
};
|
||||
let containerd_nydus = nydus.clone();
|
||||
|
||||
toml_utils::set_toml_value(
|
||||
configuration_file,
|
||||
@@ -118,8 +115,8 @@ pub async fn configure_snapshotter(
|
||||
configure_nydus_snapshotter(config, &configuration_file, pluginid).await?;
|
||||
|
||||
let nydus_snapshotter = match config.multi_install_suffix.as_ref() {
|
||||
Some(suffix) if !suffix.is_empty() => format!("nydus-snapshotter-{suffix}"),
|
||||
_ => "nydus-snapshotter".to_string(),
|
||||
Some(suffix) if !suffix.is_empty() => format!("{NYDUS_FOR_KATA_TEE}-{suffix}"),
|
||||
_ => NYDUS_FOR_KATA_TEE.to_string(),
|
||||
};
|
||||
|
||||
utils::host_systemctl(&["restart", &nydus_snapshotter])?;
|
||||
@@ -136,28 +133,37 @@ pub async fn configure_snapshotter(
|
||||
}
|
||||
|
||||
pub async fn install_nydus_snapshotter(config: &Config) -> Result<()> {
|
||||
info!("Deploying nydus-snapshotter");
|
||||
info!("Deploying {NYDUS_FOR_KATA_TEE}");
|
||||
|
||||
let nydus_snapshotter = match config.multi_install_suffix.as_ref() {
|
||||
Some(suffix) if !suffix.is_empty() => format!("nydus-snapshotter-{suffix}"),
|
||||
_ => "nydus-snapshotter".to_string(),
|
||||
Some(suffix) if !suffix.is_empty() => format!("{NYDUS_FOR_KATA_TEE}-{suffix}"),
|
||||
_ => NYDUS_FOR_KATA_TEE.to_string(),
|
||||
};
|
||||
|
||||
// Clean up existing nydus-snapshotter state to ensure fresh start with new version.
|
||||
// This is safe across all K8s distributions (k3s, rke2, k0s, microk8s, etc.) because
|
||||
// we only touch the nydus data directory, not containerd's internals.
|
||||
// When containerd tries to use non-existent snapshots, it will re-pull/re-unpack.
|
||||
let nydus_data_dir = format!("/host/var/lib/{nydus_snapshotter}");
|
||||
info!("Cleaning up existing nydus-snapshotter state at {}", nydus_data_dir);
|
||||
|
||||
// Stop the service first if it exists (ignore errors if not running)
|
||||
// Stop the service if it is currently running so we can replace the binaries safely.
|
||||
let _ = utils::host_systemctl(&["stop", &format!("{nydus_snapshotter}.service")]);
|
||||
|
||||
// Remove the data directory to clean up old snapshots with potentially incorrect labels
|
||||
if Path::new(&nydus_data_dir).exists() {
|
||||
info!("Removing nydus data directory: {}", nydus_data_dir);
|
||||
fs::remove_dir_all(&nydus_data_dir).ok();
|
||||
}
|
||||
// The nydus data directory (/var/lib/nydus-for-kata-tee) is intentionally preserved
|
||||
// across reinstalls. Removing it would create a split-brain state: the nydus backend
|
||||
// would start empty while containerd's BoltDB (meta.db) still holds snapshot records
|
||||
// from the previous run. Any subsequent image pull then fails with:
|
||||
//
|
||||
// "unable to prepare extraction snapshot:
|
||||
// target snapshot \"sha256:...\": already exists"
|
||||
//
|
||||
// because the metadata layer finds the target chainID in BoltDB and returns AlreadyExists
|
||||
// before the backend is consulted, but when Stat() delegates to the (now empty) backend
|
||||
// it gets NotFound — tripping the unpacker's retry loop.
|
||||
//
|
||||
// Cleaning up containerd's meta.db before wiping the dir was attempted, but that cleanup
|
||||
// itself requires the nydus gRPC service to be reachable (ctr snapshots rm calls the
|
||||
// backend). If the service was stopped or crashed before the cleanup ran, the cleanup
|
||||
// silently fails and the split-brain state reappears.
|
||||
//
|
||||
// The correct invariant is simpler: meta.db and the nydus backend must always agree.
|
||||
// Preserving the data directory across reinstalls guarantees this at zero cost.
|
||||
// Any stale snapshots from previous workloads are naturally garbage-collected by
|
||||
// containerd once the images that reference them are removed.
|
||||
|
||||
let config_guest_pulling = "/opt/kata-artifacts/nydus-snapshotter/config-guest-pulling.toml";
|
||||
let nydus_snapshotter_service =
|
||||
@@ -175,7 +181,7 @@ pub async fn install_nydus_snapshotter(config: &Config) -> Result<()> {
|
||||
config_content = config_content.replace(
|
||||
"@NYDUS_OVERLAYFS_PATH@",
|
||||
&format!(
|
||||
"{}/nydus-snapshotter/nydus-overlayfs",
|
||||
"{}/{NYDUS_FOR_KATA_TEE}/nydus-overlayfs",
|
||||
&config
|
||||
.host_install_dir
|
||||
.strip_prefix("/host")
|
||||
@@ -187,7 +193,7 @@ pub async fn install_nydus_snapshotter(config: &Config) -> Result<()> {
|
||||
service_content = service_content.replace(
|
||||
"@CONTAINERD_NYDUS_GRPC_BINARY@",
|
||||
&format!(
|
||||
"{}/nydus-snapshotter/containerd-nydus-grpc",
|
||||
"{}/{NYDUS_FOR_KATA_TEE}/containerd-nydus-grpc",
|
||||
&config
|
||||
.host_install_dir
|
||||
.strip_prefix("/host")
|
||||
@@ -197,7 +203,7 @@ pub async fn install_nydus_snapshotter(config: &Config) -> Result<()> {
|
||||
service_content = service_content.replace(
|
||||
"@CONFIG_GUEST_PULLING@",
|
||||
&format!(
|
||||
"{}/nydus-snapshotter/config-guest-pulling.toml",
|
||||
"{}/{NYDUS_FOR_KATA_TEE}/config-guest-pulling.toml",
|
||||
&config
|
||||
.host_install_dir
|
||||
.strip_prefix("/host")
|
||||
@@ -205,7 +211,7 @@ pub async fn install_nydus_snapshotter(config: &Config) -> Result<()> {
|
||||
),
|
||||
);
|
||||
|
||||
fs::create_dir_all(format!("{}/nydus-snapshotter", config.host_install_dir))?;
|
||||
fs::create_dir_all(format!("{}/{NYDUS_FOR_KATA_TEE}", config.host_install_dir))?;
|
||||
|
||||
// Remove existing binaries before copying new ones.
|
||||
// This is crucial for atomic updates (same pattern as copy_artifacts in install.rs):
|
||||
@@ -214,13 +220,13 @@ pub async fn install_nydus_snapshotter(config: &Config) -> Result<()> {
|
||||
// - Running processes keep using the old inode until they exit
|
||||
// - New processes use the new file immediately
|
||||
// Without this, fs::copy would fail with ETXTBSY ("Text file busy") if the
|
||||
// nydus-snapshotter service is still running from a previous installation.
|
||||
// nydus-for-kata-tee service is still running from a previous installation.
|
||||
let grpc_binary = format!(
|
||||
"{}/nydus-snapshotter/containerd-nydus-grpc",
|
||||
"{}/{NYDUS_FOR_KATA_TEE}/containerd-nydus-grpc",
|
||||
config.host_install_dir
|
||||
);
|
||||
let overlayfs_binary = format!(
|
||||
"{}/nydus-snapshotter/nydus-overlayfs",
|
||||
"{}/{NYDUS_FOR_KATA_TEE}/nydus-overlayfs",
|
||||
config.host_install_dir
|
||||
);
|
||||
for binary in [&grpc_binary, &overlayfs_binary] {
|
||||
@@ -242,7 +248,7 @@ pub async fn install_nydus_snapshotter(config: &Config) -> Result<()> {
|
||||
|
||||
fs::write(
|
||||
format!(
|
||||
"{}/nydus-snapshotter/config-guest-pulling.toml",
|
||||
"{}/{NYDUS_FOR_KATA_TEE}/config-guest-pulling.toml",
|
||||
config.host_install_dir
|
||||
),
|
||||
config_content,
|
||||
@@ -260,11 +266,11 @@ pub async fn install_nydus_snapshotter(config: &Config) -> Result<()> {
|
||||
}
|
||||
|
||||
pub async fn uninstall_nydus_snapshotter(config: &Config) -> Result<()> {
|
||||
info!("Removing deployed nydus-snapshotter");
|
||||
info!("Removing deployed {NYDUS_FOR_KATA_TEE}");
|
||||
|
||||
let nydus_snapshotter = match config.multi_install_suffix.as_ref() {
|
||||
Some(suffix) if !suffix.is_empty() => format!("nydus-snapshotter-{suffix}"),
|
||||
_ => "nydus-snapshotter".to_string(),
|
||||
Some(suffix) if !suffix.is_empty() => format!("{NYDUS_FOR_KATA_TEE}-{suffix}"),
|
||||
_ => NYDUS_FOR_KATA_TEE.to_string(),
|
||||
};
|
||||
|
||||
utils::host_systemctl(&["disable", "--now", &format!("{nydus_snapshotter}.service")])?;
|
||||
@@ -273,7 +279,15 @@ pub async fn uninstall_nydus_snapshotter(config: &Config) -> Result<()> {
|
||||
"/host/etc/systemd/system/{nydus_snapshotter}.service"
|
||||
))
|
||||
.ok();
|
||||
fs::remove_dir_all(format!("{}/nydus-snapshotter", config.host_install_dir)).ok();
|
||||
fs::remove_dir_all(format!("{}/{NYDUS_FOR_KATA_TEE}", config.host_install_dir)).ok();
|
||||
|
||||
// The nydus data directory (/var/lib/nydus-for-kata-tee) is intentionally preserved.
|
||||
// See install_nydus_snapshotter for the full explanation: meta.db and the nydus backend
|
||||
// must always agree, and the only way to guarantee that without complex, fragile cleanup
|
||||
// logic is to never remove the data directory. After uninstall, containerd is
|
||||
// reconfigured without the nydus proxy_plugins entry and restarted, so the remaining
|
||||
// snapshot records in meta.db are completely dormant — nothing will use them. If nydus
|
||||
// is reinstalled later the data directory is still present and both sides remain in sync.
|
||||
|
||||
utils::host_systemctl(&["daemon-reload"])?;
|
||||
|
||||
|
||||
@@ -16,6 +16,11 @@ use crate::k8s;
|
||||
pub const K3S_RKE2_CONTAINERD_V3_TMPL: &str = "/etc/containerd/config-v3.toml.tmpl";
|
||||
pub const K3S_RKE2_CONTAINERD_V2_TMPL: &str = "/etc/containerd/config.toml.tmpl";
|
||||
|
||||
/// Name of the nydus-snapshotter instance deployed and managed by kata-deploy for TEE workloads.
|
||||
/// Used as the systemd service name, the containerd proxy plugin key, the runtime class
|
||||
/// snapshotter field, and the base name for the data directory and socket path on the host.
|
||||
pub const NYDUS_FOR_KATA_TEE: &str = "nydus-for-kata-tee";
|
||||
|
||||
/// Resolves whether to use containerd config v3 (true) or v2 (false) for K3s/RKE2.
|
||||
/// 1. Tries config.toml (containerd config file): if it exists and contains "version = 3" or "version = 2", use that.
|
||||
/// 2. Else falls back to the node's containerRuntimeVersion (e.g. "containerd://2.1.5-k3s1").
|
||||
|
||||
@@ -88,6 +88,8 @@ pub async fn update_existing_runtimeclasses_for_nfd(config: &Config) -> Result<(
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::config::NYDUS_FOR_KATA_TEE;
|
||||
|
||||
#[test]
|
||||
fn test_runtime_class_name_without_suffix() {
|
||||
// Test runtime class name without MULTI_INSTALL_SUFFIX
|
||||
@@ -144,24 +146,24 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_snapshotter_name_with_suffix() {
|
||||
// Test snapshotter name adjustment with MULTI_INSTALL_SUFFIX
|
||||
// Test that the nydus snapshotter produces the nydus-for-kata-tee containerd plugin
|
||||
// name, with the suffix appended when MULTI_INSTALL_SUFFIX is set.
|
||||
let suffix = Some("dev".to_string());
|
||||
let snapshotter = "nydus";
|
||||
|
||||
if let Some(s) = suffix {
|
||||
let adjusted = format!("{}-{}", snapshotter, s);
|
||||
assert_eq!(adjusted, "nydus-dev");
|
||||
let adjusted = format!("{NYDUS_FOR_KATA_TEE}-{}", s);
|
||||
assert_eq!(adjusted, format!("{NYDUS_FOR_KATA_TEE}-dev"));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_nydus_snapshotter_systemd_service_with_suffix() {
|
||||
// Test nydus-snapshotter systemd service name with suffix
|
||||
// Test nydus-for-kata-tee systemd service name with suffix
|
||||
let suffix = Some("test".to_string());
|
||||
|
||||
if let Some(s) = suffix {
|
||||
let service_name = format!("nydus-snapshotter-{}", s);
|
||||
assert_eq!(service_name, "nydus-snapshotter-test");
|
||||
let service_name = format!("{NYDUS_FOR_KATA_TEE}-{}", s);
|
||||
assert_eq!(service_name, format!("{NYDUS_FOR_KATA_TEE}-test"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
use crate::config::{Config, ContainerdPaths, CustomRuntime};
|
||||
use crate::config::{Config, ContainerdPaths, CustomRuntime, NYDUS_FOR_KATA_TEE};
|
||||
use crate::k8s;
|
||||
use crate::utils;
|
||||
use crate::utils::toml as toml_utils;
|
||||
@@ -195,8 +195,10 @@ pub async fn configure_containerd_runtime(
|
||||
let value = parts[1];
|
||||
let snapshotter_value = if value == "nydus" {
|
||||
match config.multi_install_suffix.as_ref() {
|
||||
Some(suffix) if !suffix.is_empty() => format!("\"{value}-{suffix}\""),
|
||||
_ => format!("\"{value}\""),
|
||||
Some(suffix) if !suffix.is_empty() => {
|
||||
format!("\"{NYDUS_FOR_KATA_TEE}-{suffix}\"")
|
||||
}
|
||||
_ => format!("\"{NYDUS_FOR_KATA_TEE}\""),
|
||||
}
|
||||
} else {
|
||||
format!("\"{value}\"")
|
||||
@@ -262,8 +264,8 @@ pub async fn configure_custom_containerd_runtime(
|
||||
let snapshotter = custom_runtime.containerd_snapshotter.as_ref().map(|s| {
|
||||
if s == "nydus" {
|
||||
match config.multi_install_suffix.as_ref() {
|
||||
Some(suffix) if !suffix.is_empty() => format!("\"{s}-{suffix}\""),
|
||||
_ => format!("\"{s}\""),
|
||||
Some(suffix) if !suffix.is_empty() => format!("\"{NYDUS_FOR_KATA_TEE}-{suffix}\""),
|
||||
_ => format!("\"{NYDUS_FOR_KATA_TEE}\""),
|
||||
}
|
||||
} else {
|
||||
format!("\"{s}\"")
|
||||
|
||||
@@ -708,8 +708,8 @@ main() {
|
||||
build_type="dragonball-experimental"
|
||||
if [ -n "$kernel_version" ]; then
|
||||
kernel_major_version=$(get_major_kernel_version "${kernel_version}")
|
||||
if [[ ${kernel_major_version} != "5.10" ]]; then
|
||||
info "dragonball-experimental kernel patches are only tested on 5.10.x kernel now, other kernel version may cause confliction"
|
||||
if [[ ${kernel_major_version} != "6.18" ]]; then
|
||||
info "dragonball-experimental kernel patches are only tested on 6.18.x kernel now, other kernel version may cause confliction"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -1 +1 @@
|
||||
187
|
||||
188
|
||||
|
||||
@@ -0,0 +1,525 @@
|
||||
From b6466a47048621a7d7dd72cad22d267d052d01a0 Mon Sep 17 00:00:00 2001
|
||||
From: Chao Wu <chaowu@linux.alibaba.com>
|
||||
Date: Wed, 9 Nov 2022 11:38:36 +0800
|
||||
Subject: [PATCH 1/8] upcall: establish upcall server
|
||||
|
||||
Upcall is a direct communication tool between hypervisor and guest. This
|
||||
patch introduces the server side in the upcall system.
|
||||
At the start of the upcall server, A kthread `db-vsock-srv` will be
|
||||
created. In this kthread, a vsock listener is established upon specific
|
||||
port(currently that port is 0xDB, DB refers to Dragonball). After socket
|
||||
is created, it will start accepting the connection from the client side.
|
||||
If the connection is established, upcall server will try to get cmd from
|
||||
the client and that cmd could determine which upcall service will handle
|
||||
the request from the client.
|
||||
|
||||
Besides, different service needs to be registered into upcall server so
|
||||
that it could handle the request from the client. There is a
|
||||
`register_db_vsock_service` in this commit provided for every service to
|
||||
register service into service_entry list during initialization and we will
|
||||
introduce device manager service in the following commits.
|
||||
|
||||
Signed-off-by: Liu Jiang <gerry@linux.alibaba.com>
|
||||
Signed-off-by: Zizheng Bian <zizheng.bian@linux.alibaba.com>
|
||||
Signed-off-by: Chao Wu <chaowu@linux.alibaba.com>
|
||||
Signed-off-by: WangYu <WangYu@linux.alibaba.com>
|
||||
Signed-off-by: Xingjun Liu <xingjun.liu@linux.alibaba.com>
|
||||
Signed-off-by: Fupan Li <fupan.lfp@antgroup.com>
|
||||
---
|
||||
drivers/misc/Kconfig | 1 +
|
||||
drivers/misc/Makefile | 1 +
|
||||
drivers/misc/dragonball/Kconfig | 21 ++
|
||||
drivers/misc/dragonball/Makefile | 6 +
|
||||
drivers/misc/dragonball/upcall_srv/Kconfig | 14 +
|
||||
drivers/misc/dragonball/upcall_srv/Makefile | 13 +
|
||||
.../upcall_srv/dragonball_upcall_srv.c | 327 ++++++++++++++++++
|
||||
include/dragonball/upcall_srv.h | 42 +++
|
||||
8 files changed, 425 insertions(+)
|
||||
create mode 100644 drivers/misc/dragonball/Kconfig
|
||||
create mode 100644 drivers/misc/dragonball/Makefile
|
||||
create mode 100644 drivers/misc/dragonball/upcall_srv/Kconfig
|
||||
create mode 100644 drivers/misc/dragonball/upcall_srv/Makefile
|
||||
create mode 100644 drivers/misc/dragonball/upcall_srv/dragonball_upcall_srv.c
|
||||
create mode 100644 include/dragonball/upcall_srv.h
|
||||
|
||||
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
|
||||
index b9c11f67315f0..7391c581339e1 100644
|
||||
--- a/drivers/misc/Kconfig
|
||||
+++ b/drivers/misc/Kconfig
|
||||
@@ -661,4 +661,5 @@ source "drivers/misc/mchp_pci1xxxx/Kconfig"
|
||||
source "drivers/misc/keba/Kconfig"
|
||||
source "drivers/misc/amd-sbi/Kconfig"
|
||||
source "drivers/misc/rp1/Kconfig"
|
||||
+source "drivers/misc/dragonball/Kconfig"
|
||||
endmenu
|
||||
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
|
||||
index b32a2597d2467..240de7f238fa2 100644
|
||||
--- a/drivers/misc/Makefile
|
||||
+++ b/drivers/misc/Makefile
|
||||
@@ -75,3 +75,4 @@ obj-$(CONFIG_MCHP_LAN966X_PCI) += lan966x-pci.o
|
||||
obj-y += keba/
|
||||
obj-y += amd-sbi/
|
||||
obj-$(CONFIG_MISC_RP1) += rp1/
|
||||
+obj-$(CONFIG_DRAGONBALL_DRIVERS) += dragonball/
|
||||
diff --git a/drivers/misc/dragonball/Kconfig b/drivers/misc/dragonball/Kconfig
|
||||
new file mode 100644
|
||||
index 0000000000000..f81be37219081
|
||||
--- /dev/null
|
||||
+++ b/drivers/misc/dragonball/Kconfig
|
||||
@@ -0,0 +1,21 @@
|
||||
+#
|
||||
+# Alibaba Dragonball Secure Container Runtime Drivers
|
||||
+#
|
||||
+
|
||||
+menuconfig DRAGONBALL_DRIVERS
|
||||
+ bool "Alibaba Dragonball Secure Container Runtime Drivers"
|
||||
+ depends on X86_64 || ARM64
|
||||
+ default n
|
||||
+ help
|
||||
+ Alibaba Dragonball is a secure container runtime with an embedded micro-vmm
|
||||
+ to securely isolate container workloads.
|
||||
+
|
||||
+ Say Y here to get to see options for various misc drivers to support the
|
||||
+ Alibaba Dragonball secure container runtime. This option alone does not
|
||||
+ add any kernel code.
|
||||
+
|
||||
+ If unsure, say N.
|
||||
+
|
||||
+if DRAGONBALL_DRIVERS
|
||||
+source "drivers/misc/dragonball/upcall_srv/Kconfig"
|
||||
+endif # DRAGONBALL_DRIVERS
|
||||
diff --git a/drivers/misc/dragonball/Makefile b/drivers/misc/dragonball/Makefile
|
||||
new file mode 100644
|
||||
index 0000000000000..b7bd86d73ade9
|
||||
--- /dev/null
|
||||
+++ b/drivers/misc/dragonball/Makefile
|
||||
@@ -0,0 +1,6 @@
|
||||
+# SPDX-License-Identifier: GPL-2.0
|
||||
+#
|
||||
+# Makefile for Dragonball misc drivers
|
||||
+#
|
||||
+
|
||||
+obj-$(CONFIG_DRAGONBALL_UPCALL_SRV) += upcall_srv/
|
||||
diff --git a/drivers/misc/dragonball/upcall_srv/Kconfig b/drivers/misc/dragonball/upcall_srv/Kconfig
|
||||
new file mode 100644
|
||||
index 0000000000000..b00bf1f8637db
|
||||
--- /dev/null
|
||||
+++ b/drivers/misc/dragonball/upcall_srv/Kconfig
|
||||
@@ -0,0 +1,14 @@
|
||||
+#
|
||||
+# Alibaba Dragonball Secure Container Runtime Drivers for vsock
|
||||
+#
|
||||
+
|
||||
+config DRAGONBALL_UPCALL_SRV
|
||||
+ bool "Dragonball in-kernel Virtual Sockets Server"
|
||||
+ depends on VIRTIO_VSOCKETS
|
||||
+ default y
|
||||
+ help
|
||||
+ This configure implements an in-kernel vsock server to dispatch Dragonball
|
||||
+ requests to registered service handlers, based on the reliable Virtual
|
||||
+ Sockets communication channels between guest and host/vmm.
|
||||
+
|
||||
+ If unsure, say N.
|
||||
diff --git a/drivers/misc/dragonball/upcall_srv/Makefile b/drivers/misc/dragonball/upcall_srv/Makefile
|
||||
new file mode 100644
|
||||
index 0000000000000..4102e6c7edefd
|
||||
--- /dev/null
|
||||
+++ b/drivers/misc/dragonball/upcall_srv/Makefile
|
||||
@@ -0,0 +1,13 @@
|
||||
+# SPDX-License-Identifier: GPL-2.0
|
||||
+#
|
||||
+# Makefile for the in-kernel vsock server.
|
||||
+#
|
||||
+# Copyright (C) 2022 Alibaba Cloud, Inc
|
||||
+#
|
||||
+# This program is free software; you can redistribute it and/or
|
||||
+# modify it under the terms of the GNU General Public License
|
||||
+# as published by the Free Software Foundation; either version
|
||||
+# 2 of the License, or (at your option) any later version.
|
||||
+#
|
||||
+
|
||||
+obj-$(CONFIG_DRAGONBALL_UPCALL_SRV) += dragonball_upcall_srv.o
|
||||
diff --git a/drivers/misc/dragonball/upcall_srv/dragonball_upcall_srv.c b/drivers/misc/dragonball/upcall_srv/dragonball_upcall_srv.c
|
||||
new file mode 100644
|
||||
index 0000000000000..bab35baa8c167
|
||||
--- /dev/null
|
||||
+++ b/drivers/misc/dragonball/upcall_srv/dragonball_upcall_srv.c
|
||||
@@ -0,0 +1,327 @@
|
||||
+// SPDX-License-Identifier: GPL-2.0
|
||||
+/*
|
||||
+ * drivers/misc/dragonball/upcall_srv/dragonball_upcall_srv.c
|
||||
+ * Dragonball upcall server
|
||||
+ *
|
||||
+ * Copyright (C) 2022 Alibaba Cloud, Inc
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or
|
||||
+ * modify it under the terms of the GNU General Public License
|
||||
+ * as published by the Free Software Foundation; either version
|
||||
+ * 2 of the License, or (at your option) any later version.
|
||||
+ *
|
||||
+ */
|
||||
+
|
||||
+#define pr_fmt(fmt) "db-upcall-srv: " fmt
|
||||
+
|
||||
+#include <linux/kthread.h>
|
||||
+#include <linux/list.h>
|
||||
+#include <linux/module.h>
|
||||
+#include <linux/net.h>
|
||||
+#include <linux/vm_sockets.h>
|
||||
+#include <net/net_namespace.h>
|
||||
+#include <net/sock.h>
|
||||
+#include <dragonball/upcall_srv.h>
|
||||
+
|
||||
+struct db_conn_info {
|
||||
+ struct work_struct work;
|
||||
+ struct socket *sock;
|
||||
+};
|
||||
+
|
||||
+struct db_service_entry {
|
||||
+ char cmd;
|
||||
+ db_vsock_svc_handler_t handler;
|
||||
+ struct list_head list;
|
||||
+};
|
||||
+
|
||||
+/* Protects registered command. */
|
||||
+static DEFINE_MUTEX(db_service_lock);
|
||||
+static LIST_HEAD(db_service_list);
|
||||
+
|
||||
+static struct task_struct *db_service_task;
|
||||
+static unsigned int db_server_port = DB_SERVER_PORT;
|
||||
+
|
||||
+struct socket *db_create_vsock_listener(unsigned int port)
|
||||
+{
|
||||
+ struct socket *sock;
|
||||
+ int ret = 0;
|
||||
+
|
||||
+ union {
|
||||
+ struct sockaddr sa;
|
||||
+ struct sockaddr_vm svm;
|
||||
+ } addr = {
|
||||
+ .svm = {
|
||||
+ .svm_family = AF_VSOCK,
|
||||
+ .svm_port = port,
|
||||
+ .svm_cid = VMADDR_CID_ANY,
|
||||
+ }
|
||||
+ };
|
||||
+
|
||||
+ ret = sock_create_kern(&init_net, AF_VSOCK, SOCK_STREAM, 0, &sock);
|
||||
+ if (ret) {
|
||||
+ pr_err("Server vsock create failed, err: %d\n", ret);
|
||||
+ return ERR_PTR(ret);
|
||||
+ }
|
||||
+
|
||||
+ ret = sock->ops->bind(sock, &addr.sa, sizeof(addr.svm));
|
||||
+ if (ret) {
|
||||
+ pr_err("Server vsock bind failed, err: %d\n", ret);
|
||||
+ goto err;
|
||||
+ }
|
||||
+ ret = sock->ops->listen(sock, 10);
|
||||
+ if (ret < 0) {
|
||||
+ pr_err("Server vsock listen error: %d\n", ret);
|
||||
+ goto err;
|
||||
+ }
|
||||
+
|
||||
+ return sock;
|
||||
+err:
|
||||
+ sock_release(sock);
|
||||
+ return ERR_PTR(ret);
|
||||
+}
|
||||
+EXPORT_SYMBOL_GPL(db_create_vsock_listener);
|
||||
+
|
||||
+int db_vsock_sendmsg(struct socket *sock, char *buf, size_t len)
|
||||
+{
|
||||
+ struct kvec vec;
|
||||
+ struct msghdr msgh;
|
||||
+
|
||||
+ vec.iov_base = buf;
|
||||
+ vec.iov_len = len;
|
||||
+ memset(&msgh, 0, sizeof(msgh));
|
||||
+
|
||||
+ return kernel_sendmsg(sock, &msgh, &vec, 1, len);
|
||||
+}
|
||||
+EXPORT_SYMBOL_GPL(db_vsock_sendmsg);
|
||||
+
|
||||
+int db_vsock_recvmsg(struct socket *sock, char *buf, size_t len, int flags)
|
||||
+{
|
||||
+ struct kvec vec;
|
||||
+ struct msghdr msgh;
|
||||
+
|
||||
+ memset(&vec, 0, sizeof(vec));
|
||||
+ memset(&msgh, 0, sizeof(msgh));
|
||||
+ vec.iov_base = buf;
|
||||
+ vec.iov_len = len;
|
||||
+
|
||||
+ return kernel_recvmsg(sock, &msgh, &vec, 1, len, flags);
|
||||
+}
|
||||
+EXPORT_SYMBOL_GPL(db_vsock_recvmsg);
|
||||
+
|
||||
+static int db_vsock_recvcmd(struct socket *cli_socket, char *cmd)
|
||||
+{
|
||||
+ int ret;
|
||||
+ char rcv;
|
||||
+ long timeout;
|
||||
+ struct kvec vec;
|
||||
+ struct msghdr msg;
|
||||
+
|
||||
+ memset(&vec, 0, sizeof(vec));
|
||||
+ memset(&msg, 0, sizeof(msg));
|
||||
+ vec.iov_base = &rcv;
|
||||
+ vec.iov_len = 1;
|
||||
+
|
||||
+ timeout = cli_socket->sk->sk_rcvtimeo;
|
||||
+ cli_socket->sk->sk_rcvtimeo = DB_INIT_TIMEOUT * HZ;
|
||||
+ ret = kernel_recvmsg(cli_socket, &msg, &vec, 1, 1, 0);
|
||||
+ cli_socket->sk->sk_rcvtimeo = timeout;
|
||||
+ *cmd = rcv;
|
||||
+
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * The workqueue handler for vsock work_struct.
|
||||
+ *
|
||||
+ * Each worker-pool bound to an actual CPU implements concurrency management
|
||||
+ * by hooking into the scheduler. The worker-pool is notified whenever an
|
||||
+ * active worker wakes up or sleeps and keeps track of the number of the
|
||||
+ * currently runnable workers. Generally, work items are not expected to hog
|
||||
+ * a CPU and consume many cycles. That means maintaining just enough concurrency
|
||||
+ * to prevent work processing from stalling should be optimal.
|
||||
+ *
|
||||
+ * So it's OK to sleep in a workqueue handler, it won't cause too many worker
|
||||
+ * threads.
|
||||
+ */
|
||||
+static void db_conn_service(struct work_struct *work)
|
||||
+{
|
||||
+ struct db_conn_info *conn_info =
|
||||
+ container_of(work, struct db_conn_info, work);
|
||||
+ struct db_service_entry *service_entry;
|
||||
+ int len, ret = -1;
|
||||
+ char cmd;
|
||||
+
|
||||
+ len = db_vsock_recvcmd(conn_info->sock, &cmd);
|
||||
+ if (len <= 0)
|
||||
+ goto recv_failed;
|
||||
+
|
||||
+ mutex_lock(&db_service_lock);
|
||||
+ list_for_each_entry(service_entry, &db_service_list, list) {
|
||||
+ if (cmd == service_entry->cmd) {
|
||||
+ ret = service_entry->handler(conn_info->sock);
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+ mutex_unlock(&db_service_lock);
|
||||
+
|
||||
+recv_failed:
|
||||
+ if (ret) {
|
||||
+ sock_release(conn_info->sock);
|
||||
+ pr_info("Client connection closed, error code: %d\n", ret);
|
||||
+ }
|
||||
+ kfree(conn_info);
|
||||
+}
|
||||
+
|
||||
+static int db_create_cli_conn(struct socket *sock)
|
||||
+{
|
||||
+ struct db_conn_info *conn;
|
||||
+
|
||||
+ conn = kmalloc(sizeof(*conn), GFP_KERNEL);
|
||||
+ if (!conn)
|
||||
+ return -ENOMEM;
|
||||
+
|
||||
+ conn->sock = sock;
|
||||
+ INIT_WORK(&conn->work, db_conn_service);
|
||||
+ schedule_work(&conn->work);
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static int db_vsock_server(void *data)
|
||||
+{
|
||||
+ struct socket *sock;
|
||||
+ int err;
|
||||
+
|
||||
+ sock = db_create_vsock_listener(db_server_port);
|
||||
+ if (IS_ERR(sock)) {
|
||||
+ err = PTR_ERR(sock);
|
||||
+ pr_err("Init server err: %d\n", err);
|
||||
+ return err;
|
||||
+ }
|
||||
+
|
||||
+ while (!kthread_should_stop()) {
|
||||
+ struct socket *conn;
|
||||
+ struct proto_accept_arg arg;
|
||||
+
|
||||
+ conn = sock_alloc();
|
||||
+ if (!conn)
|
||||
+ return -ENOMEM;
|
||||
+
|
||||
+ conn->type = sock->type;
|
||||
+ conn->ops = sock->ops;
|
||||
+
|
||||
+ /* 0:propotal 1:kernel */
|
||||
+ arg.flags = 0;
|
||||
+ arg.kern = true;
|
||||
+
|
||||
+ err = sock->ops->accept(sock, conn, &arg);
|
||||
+ if (err < 0) {
|
||||
+ pr_err("Server accept err: %d\n", err);
|
||||
+ sock_release(conn);
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
+ err = db_create_cli_conn(conn);
|
||||
+ if (err)
|
||||
+ pr_err("Create client connetion err: %d\n", err);
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static int db_create_service(void)
|
||||
+{
|
||||
+ struct task_struct *service;
|
||||
+ int rc = 0;
|
||||
+
|
||||
+ service = kthread_create(db_vsock_server, NULL, "db-vsock-srv");
|
||||
+ if (IS_ERR(service)) {
|
||||
+ rc = PTR_ERR(service);
|
||||
+ pr_err("Server task create failed, err: %d\n", rc);
|
||||
+ } else {
|
||||
+ db_service_task = service;
|
||||
+ wake_up_process(service);
|
||||
+ }
|
||||
+ return rc;
|
||||
+}
|
||||
+
|
||||
+static int db_vsock_srv_cmdline_set(const char *device,
|
||||
+ const struct kernel_param *kp)
|
||||
+{
|
||||
+ unsigned int port = 0;
|
||||
+ int processed, consumed = 0;
|
||||
+
|
||||
+ /* Get "@<port>" */
|
||||
+ processed = sscanf(device, "@%u%n", &port, &consumed);
|
||||
+ if (processed < 1 || device[consumed] || port == 0 || port > 1024) {
|
||||
+ pr_err("Using @<port> format and port range (0, 1024].\n");
|
||||
+ return -EINVAL;
|
||||
+ }
|
||||
+
|
||||
+ db_server_port = port;
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static const struct kernel_param_ops db_vsock_srv_cmdline_param_ops = {
|
||||
+ .set = db_vsock_srv_cmdline_set,
|
||||
+};
|
||||
+
|
||||
+device_param_cb(port, &db_vsock_srv_cmdline_param_ops, NULL, 0400);
|
||||
+
|
||||
+int register_db_vsock_service(const char cmd, db_vsock_svc_handler_t handler)
|
||||
+{
|
||||
+ int rc = -EEXIST;
|
||||
+ struct db_service_entry *service_entry;
|
||||
+
|
||||
+ mutex_lock(&db_service_lock);
|
||||
+ list_for_each_entry(service_entry, &db_service_list, list) {
|
||||
+ if (cmd == service_entry->cmd) {
|
||||
+ rc = -EEXIST;
|
||||
+ goto out;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ service_entry = kzalloc(sizeof(*service_entry), GFP_KERNEL);
|
||||
+ if (!service_entry) {
|
||||
+ rc = -ENOMEM;
|
||||
+ goto out;
|
||||
+ }
|
||||
+ service_entry->cmd = cmd;
|
||||
+ service_entry->handler = handler;
|
||||
+ list_add_tail(&service_entry->list, &db_service_list);
|
||||
+ rc = 0;
|
||||
+out:
|
||||
+ mutex_unlock(&db_service_lock);
|
||||
+ return rc;
|
||||
+}
|
||||
+EXPORT_SYMBOL_GPL(register_db_vsock_service);
|
||||
+
|
||||
+int unregister_db_vsock_service(const char cmd)
|
||||
+{
|
||||
+ int rc = -EEXIST;
|
||||
+ struct db_service_entry *service_entry, *n;
|
||||
+
|
||||
+ mutex_lock(&db_service_lock);
|
||||
+ list_for_each_entry_safe(service_entry, n, &db_service_list, list) {
|
||||
+ if (cmd == service_entry->cmd) {
|
||||
+ list_del(&service_entry->list);
|
||||
+ rc = 0;
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+ mutex_unlock(&db_service_lock);
|
||||
+
|
||||
+ return rc;
|
||||
+}
|
||||
+EXPORT_SYMBOL_GPL(unregister_db_vsock_service);
|
||||
+
|
||||
+static int __init db_vsock_srv_init(void)
|
||||
+{
|
||||
+ return db_create_service();
|
||||
+}
|
||||
+
|
||||
+late_initcall(db_vsock_srv_init);
|
||||
+
|
||||
+MODULE_AUTHOR("Alibaba, Inc.");
|
||||
+MODULE_DESCRIPTION("Dragonball vsock server");
|
||||
+MODULE_LICENSE("GPL v2");
|
||||
diff --git a/include/dragonball/upcall_srv.h b/include/dragonball/upcall_srv.h
|
||||
new file mode 100644
|
||||
index 0000000000000..1c733982cc30d
|
||||
--- /dev/null
|
||||
+++ b/include/dragonball/upcall_srv.h
|
||||
@@ -0,0 +1,42 @@
|
||||
+/* SPDX-License-Identifier: GPL-2.0 */
|
||||
+/*
|
||||
+ * db_upcall_srv.h Virtual Sockets Server for Dragonball
|
||||
+ *
|
||||
+ * Copyright (C) 2022 Alibaba Cloud, Inc
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify it
|
||||
+ * under the terms of the GNU General Public License as published by the Free
|
||||
+ * Software Foundation; either version 2 of the License, or (at your option)
|
||||
+ * any later version.
|
||||
+ */
|
||||
+
|
||||
+#ifndef _DB_UPCALL_SRV_H
|
||||
+#define _DB_UPCALL_SRV_H
|
||||
+
|
||||
+#include <linux/workqueue.h>
|
||||
+#include <linux/net.h>
|
||||
+
|
||||
+/* Vsock port to listen for incoming connections. */
|
||||
+#define DB_SERVER_PORT 0xDB
|
||||
+#define DB_RECVBUF_SIZE 0x400
|
||||
+#define DB_INIT_TIMEOUT 10
|
||||
+
|
||||
+/*
|
||||
+ * Vsock service handler to handle new incoming connections.
|
||||
+ *
|
||||
+ * Return:
|
||||
+ * 0: on success and the callback takes ownership of the sock.
|
||||
+ * !0: on failure and the callback should keep the sock as is.
|
||||
+ */
|
||||
+typedef int (*db_vsock_svc_handler_t) (struct socket *sock);
|
||||
+
|
||||
+extern int register_db_vsock_service(const char cmd,
|
||||
+ db_vsock_svc_handler_t handler);
|
||||
+extern int unregister_db_vsock_service(const char cmd);
|
||||
+
|
||||
+extern struct socket *db_create_vsock_listener(unsigned int port);
|
||||
+extern int db_vsock_sendmsg(struct socket *sock, char *buf, size_t len);
|
||||
+extern int db_vsock_recvmsg(struct socket *sock, char *buf, size_t len,
|
||||
+ int flags);
|
||||
+
|
||||
+#endif /* _DB_UPCALL_SRV_H */
|
||||
--
|
||||
2.34.1
|
||||
|
||||
@@ -0,0 +1,330 @@
|
||||
From 81b293100f31d303cd0f611063cf8b4f167cb4f7 Mon Sep 17 00:00:00 2001
|
||||
From: Chao Wu <chaowu@linux.alibaba.com>
|
||||
Date: Mon, 21 Nov 2022 19:19:26 +0800
|
||||
Subject: [PATCH 2/8] upcall: introduce device manager upcall service
|
||||
|
||||
Different services are registered into upcall server to handle the
|
||||
request from the client side. This commit introduces devic manager
|
||||
upcall service and when new message gets into upcall server, cmd `d` is
|
||||
used for identifying the device manager service.
|
||||
|
||||
After a request is sent to device manager service, db_devmgr_handler
|
||||
will start handle the request. A kthread `db_devmgr_server` will be
|
||||
created and it will send CONNECT message to the client side to notify
|
||||
the client start sending message for device management operations.
|
||||
`db_devmgr_process` will be used for determining which device operations
|
||||
will be triggered through msg_type. `get_action` will find out the
|
||||
action for dealing with the operation and `action` fn will execute the
|
||||
actual device management operation in the device manager service.
|
||||
|
||||
Signed-off-by: Liu Jiang <gerry@linux.alibaba.com>
|
||||
Signed-off-by: Zizheng Bian <zizheng.bian@linux.alibaba.com>
|
||||
Signed-off-by: Chao Wu <chaowu@linux.alibaba.com>
|
||||
Signed-off-by: WangYu <WangYu@linux.alibaba.com>
|
||||
Signed-off-by: Xingjun Liu <xingjun.liu@linux.alibaba.com>
|
||||
---
|
||||
drivers/misc/dragonball/upcall_srv/Kconfig | 12 +
|
||||
drivers/misc/dragonball/upcall_srv/Makefile | 1 +
|
||||
.../upcall_srv/dragonball_device_manager.c | 235 ++++++++++++++++++
|
||||
include/dragonball/device_manager.h | 18 ++
|
||||
4 files changed, 266 insertions(+)
|
||||
create mode 100644 drivers/misc/dragonball/upcall_srv/dragonball_device_manager.c
|
||||
create mode 100644 include/dragonball/device_manager.h
|
||||
|
||||
diff --git a/drivers/misc/dragonball/upcall_srv/Kconfig b/drivers/misc/dragonball/upcall_srv/Kconfig
|
||||
index b00bf1f8637db..6554a9741c00d 100644
|
||||
--- a/drivers/misc/dragonball/upcall_srv/Kconfig
|
||||
+++ b/drivers/misc/dragonball/upcall_srv/Kconfig
|
||||
@@ -12,3 +12,15 @@ config DRAGONBALL_UPCALL_SRV
|
||||
Sockets communication channels between guest and host/vmm.
|
||||
|
||||
If unsure, say N.
|
||||
+
|
||||
+config DRAGONBALL_DEVICE_MANAGER
|
||||
+ bool "Vsock Service to Handle Dragonball Device Management Requests"
|
||||
+ depends on DRAGONBALL_UPCALL_SRV
|
||||
+ depends on VIRTIO_VSOCKETS
|
||||
+ default y
|
||||
+ help
|
||||
+ This configure implements a vsock service to handle Dragonball device
|
||||
+ management requests, such as getting device information, hot-plugging
|
||||
+ devices etc.
|
||||
+
|
||||
+ If unsure, say N.
|
||||
diff --git a/drivers/misc/dragonball/upcall_srv/Makefile b/drivers/misc/dragonball/upcall_srv/Makefile
|
||||
index 4102e6c7edefd..409c0c11e2e66 100644
|
||||
--- a/drivers/misc/dragonball/upcall_srv/Makefile
|
||||
+++ b/drivers/misc/dragonball/upcall_srv/Makefile
|
||||
@@ -11,3 +11,4 @@
|
||||
#
|
||||
|
||||
obj-$(CONFIG_DRAGONBALL_UPCALL_SRV) += dragonball_upcall_srv.o
|
||||
+obj-$(CONFIG_DRAGONBALL_DEVICE_MANAGER) += dragonball_device_manager.o
|
||||
diff --git a/drivers/misc/dragonball/upcall_srv/dragonball_device_manager.c b/drivers/misc/dragonball/upcall_srv/dragonball_device_manager.c
|
||||
new file mode 100644
|
||||
index 0000000000000..ebcb6ef742855
|
||||
--- /dev/null
|
||||
+++ b/drivers/misc/dragonball/upcall_srv/dragonball_device_manager.c
|
||||
@@ -0,0 +1,235 @@
|
||||
+// SPDX-License-Identifier: GPL-2.0
|
||||
+/*
|
||||
+ * drivers/misc/dragonball/vsock_srv/dragonball_device_manager.c
|
||||
+ * vsock service for device management.
|
||||
+ *
|
||||
+ * Copyright (C) 2022 Alibaba Cloud, Inc
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or
|
||||
+ * modify it under the terms of the GNU General Public License
|
||||
+ * as published by the Free Software Foundation; either version
|
||||
+ * 2 of the License, or (at your option) any later version.
|
||||
+ *
|
||||
+ */
|
||||
+
|
||||
+#define pr_fmt(fmt) "db-dev-mgr: " fmt
|
||||
+
|
||||
+#include <linux/kthread.h>
|
||||
+#include <linux/module.h>
|
||||
+#include <linux/platform_device.h>
|
||||
+#include <linux/slab.h>
|
||||
+#include <linux/virtio_mmio.h>
|
||||
+#include <linux/cpu.h>
|
||||
+#include <linux/cpumask.h>
|
||||
+#include <linux/cpuhotplug.h>
|
||||
+#include <asm/cpu.h>
|
||||
+#include <dragonball/upcall_srv.h>
|
||||
+#include <dragonball/device_manager.h>
|
||||
+#ifdef CONFIG_ARM64
|
||||
+#include <linux/irqdomain.h>
|
||||
+#include <linux/irq.h>
|
||||
+#endif
|
||||
+#include <linux/percpu.h>
|
||||
+#include <linux/device.h>
|
||||
+#include <asm/numa.h>
|
||||
+
|
||||
+/*
|
||||
+ * Following designs are adopted to simplify implementation:
|
||||
+ * 1) fix size messages with padding to ease receiving logic.
|
||||
+ * 2) binary encoding instead of string encoding because it's on the same host.
|
||||
+ * 3) synchronous communication in ping-pong mode, one in-fly request at most.
|
||||
+ * 4) do not support module unloading
|
||||
+ */
|
||||
+
|
||||
+/* These definitions are synchronized with dragonball */
|
||||
+#define DEV_MGR_MSG_SIZE 0x400
|
||||
+#define DEVMGR_CMD_BYTE 'd'
|
||||
+#define DEVMGR_MAGIC_VERSION 0x444D0100 /* 'DM' + Version 1.0 */
|
||||
+#define SHARED_IRQ_NO 5
|
||||
+
|
||||
+/* Type of request and reply messages. */
|
||||
+enum devmgr_msg_type {
|
||||
+ CONNECT = 0x00000000,
|
||||
+ ADD_CPU = 0x00000001,
|
||||
+ DEL_CPU = 0x00000002,
|
||||
+ ADD_MEM = 0x00000003,
|
||||
+ DEL_MEM = 0x00000004,
|
||||
+ ADD_MMIO = 0x00000005,
|
||||
+ DEL_MMIO = 0x00000006,
|
||||
+ ADD_PCI = 0x00000007,
|
||||
+ DEL_PCI = 0x00000008,
|
||||
+};
|
||||
+
|
||||
+struct devmgr_msg_header {
|
||||
+ /* magic version for identifying upcall */
|
||||
+ uint32_t magic_version;
|
||||
+ /* size of the upcall message */
|
||||
+ uint32_t msg_size;
|
||||
+ /* type for the message to identify its usage */
|
||||
+ uint32_t msg_type;
|
||||
+ /* flag for extra information */
|
||||
+ uint32_t msg_flags;
|
||||
+};
|
||||
+
|
||||
+struct devmgr_req {
|
||||
+ struct devmgr_msg_header msg_header;
|
||||
+ union {
|
||||
+ char pad[DEV_MGR_MSG_SIZE - sizeof(struct devmgr_msg_header)];
|
||||
+ } msg_load;
|
||||
+};
|
||||
+
|
||||
+struct devmgr_reply {
|
||||
+ struct devmgr_msg_header msg_header;
|
||||
+ /*
|
||||
+ * if ret is 0, it means the operation is successful.
|
||||
+ * if ret is not 0, return value will be error code.
|
||||
+ */
|
||||
+ int32_t ret;
|
||||
+ union {
|
||||
+ char pad[DEV_MGR_MSG_SIZE - sizeof(struct devmgr_msg_header) - sizeof(int32_t)];
|
||||
+ } msg_load;
|
||||
+};
|
||||
+
|
||||
+struct task_res {
|
||||
+ struct task_struct *task;
|
||||
+ struct socket *sock;
|
||||
+ struct devmgr_req req;
|
||||
+ struct devmgr_reply reply;
|
||||
+};
|
||||
+
|
||||
+typedef int (*action_route_t) (struct devmgr_req *req,
|
||||
+ struct devmgr_reply *rep);
|
||||
+
|
||||
+static void _fill_msg_header(struct devmgr_msg_header *msg, uint32_t msg_size,
|
||||
+ uint32_t msg_type, uint32_t msg_flags)
|
||||
+{
|
||||
+ msg->magic_version = DEVMGR_MAGIC_VERSION;
|
||||
+ msg->msg_size = msg_size;
|
||||
+ msg->msg_type = msg_type;
|
||||
+ msg->msg_flags = msg_flags;
|
||||
+}
|
||||
+
|
||||
+static struct {
|
||||
+ enum devmgr_msg_type cmd;
|
||||
+ action_route_t fn;
|
||||
+} opt_map[] = {
|
||||
+};
|
||||
+
|
||||
+static action_route_t get_action(struct devmgr_req *req)
|
||||
+{
|
||||
+ int i;
|
||||
+ action_route_t action = NULL;
|
||||
+ int size_opt = ARRAY_SIZE(opt_map);
|
||||
+
|
||||
+ for (i = 0; i < size_opt; i++) {
|
||||
+ if (opt_map[i].cmd == req->msg_header.msg_type) {
|
||||
+ action = opt_map[i].fn;
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+ return action;
|
||||
+}
|
||||
+
|
||||
+static void db_devmgr_process(struct devmgr_req *req,
|
||||
+ struct devmgr_reply *rep)
|
||||
+{
|
||||
+ int err;
|
||||
+ action_route_t action;
|
||||
+ struct devmgr_msg_header *req_mh = &req->msg_header;
|
||||
+ struct devmgr_msg_header *rep_mh = &rep->msg_header;
|
||||
+
|
||||
+ if (req_mh->magic_version != DEVMGR_MAGIC_VERSION) {
|
||||
+ _fill_msg_header(rep_mh, 0, req->msg_header.msg_type, 0);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ action = get_action(req);
|
||||
+ if (action == NULL) {
|
||||
+ pr_err("db_devmgr_process : Not found valid command");
|
||||
+ rep->ret = -1;
|
||||
+ _fill_msg_header(rep_mh, 0, req->msg_header.msg_type, 0);
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ err = action(req, rep);
|
||||
+ if (err) {
|
||||
+ pr_err("db_devmgr_process : Command run failed, err: %d", err);
|
||||
+ rep->ret = err;
|
||||
+ _fill_msg_header(rep_mh, 0, req->msg_header.msg_type, 0);
|
||||
+ return;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static int db_devmgr_server(void *data)
|
||||
+{
|
||||
+ struct task_res *res = (struct task_res *)data;
|
||||
+ struct devmgr_msg_header *rep_mh = &res->reply.msg_header;
|
||||
+ int len;
|
||||
+
|
||||
+ _fill_msg_header(rep_mh, 0, CONNECT, 0);
|
||||
+ len = db_vsock_sendmsg(res->sock, (char *)&res->reply, DEV_MGR_MSG_SIZE);
|
||||
+ if (len <= 0) {
|
||||
+ pr_err("db_devmgr_server : Server send message failed, err: %d", len);
|
||||
+ sock_release(res->sock);
|
||||
+ kfree(res);
|
||||
+ return len;
|
||||
+ }
|
||||
+
|
||||
+ while (!kthread_should_stop()) {
|
||||
+ len = db_vsock_recvmsg(res->sock, (char *)&res->req,
|
||||
+ DEV_MGR_MSG_SIZE, 0);
|
||||
+ if (len <= 0)
|
||||
+ break;
|
||||
+
|
||||
+ /* The result(OK or Error) will fill into res->reply field */
|
||||
+ db_devmgr_process(&res->req, &res->reply);
|
||||
+
|
||||
+ len = db_vsock_sendmsg(res->sock, (char *)&res->reply,
|
||||
+ DEV_MGR_MSG_SIZE);
|
||||
+ if (len <= 0)
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ /* TODO: check who shutdown the socket, receiving or sending. */
|
||||
+ sock_release(res->sock);
|
||||
+ kfree(res);
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static int db_devmgr_handler(struct socket *sock)
|
||||
+{
|
||||
+ struct task_res *res;
|
||||
+ struct task_struct *conn_task;
|
||||
+
|
||||
+ /* TODO: ensure singleton, only one server exists */
|
||||
+ res = kzalloc(sizeof(*res), GFP_KERNEL);
|
||||
+ if (!res)
|
||||
+ return -ENOMEM;
|
||||
+
|
||||
+ res->sock = sock;
|
||||
+ conn_task = kthread_create(db_devmgr_server, res, "db_dev_mgr");
|
||||
+ if (IS_ERR(conn_task)) {
|
||||
+ pr_err("db_devmgr_handler : Client process thread create failed, err: %d",
|
||||
+ (int)PTR_ERR(conn_task));
|
||||
+ goto failed;
|
||||
+ } else {
|
||||
+ res->task = conn_task;
|
||||
+ wake_up_process(conn_task);
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+failed:
|
||||
+ kfree(res);
|
||||
+ return PTR_ERR(conn_task);
|
||||
+}
|
||||
+
|
||||
+static int __init db_device_manager_init(void)
|
||||
+{
|
||||
+ return register_db_vsock_service(DEVMGR_CMD_BYTE, db_devmgr_handler);
|
||||
+}
|
||||
+
|
||||
+late_initcall(db_device_manager_init);
|
||||
+
|
||||
+MODULE_AUTHOR("Alibaba, Inc.");
|
||||
+MODULE_DESCRIPTION("Dragonball Device Manager");
|
||||
+MODULE_LICENSE("GPL v2");
|
||||
diff --git a/include/dragonball/device_manager.h b/include/dragonball/device_manager.h
|
||||
new file mode 100644
|
||||
index 0000000000000..a1713e9f026d1
|
||||
--- /dev/null
|
||||
+++ b/include/dragonball/device_manager.h
|
||||
@@ -0,0 +1,18 @@
|
||||
+/* SPDX-License-Identifier: GPL-2.0 */
|
||||
+/*
|
||||
+ * device_manager.h Device Manager for Dragonball
|
||||
+ *
|
||||
+ * Copyright (C) 2022 Alibaba Cloud, Inc
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify it
|
||||
+ * under the terms of the GNU General Public License as published by the Free
|
||||
+ * Software Foundation; either version 2 of the License, or (at your option)
|
||||
+ * any later version.
|
||||
+ */
|
||||
+
|
||||
+#ifndef _DB_DEVICE_MANAGER_H
|
||||
+#define _DB_DEVICE_MANAGER_H
|
||||
+
|
||||
+#include <linux/device.h>
|
||||
+
|
||||
+#endif /* _DB_DEVICE_MANAGER_H */
|
||||
--
|
||||
2.34.1
|
||||
|
||||
@@ -0,0 +1,326 @@
|
||||
From f2afaacf8d2d5dbf597fc4ffed70e8804a26b130 Mon Sep 17 00:00:00 2001
|
||||
From: Chao Wu <chaowu@linux.alibaba.com>
|
||||
Date: Mon, 21 Nov 2022 19:44:50 +0800
|
||||
Subject: [PATCH 3/8] upcall: add cpu hotplug/hot-unplug into device manager
|
||||
service
|
||||
|
||||
Add cpu hotplug and hot-unplug support into device manager. In the
|
||||
`devmgr_req` message, `msg_type` ADD_CPU in `msg_header` will trigger
|
||||
`add_cpu_dev` action and DEL_CPU will trigger `del_cpu_dev` action, and
|
||||
we use `apic_ids` and `count` delivered in `cpu_dev_info` to notify
|
||||
which and how many cpus will be hotplugged / hot-unplugged.
|
||||
|
||||
`add_cpu_dev` and `del_cpu_dev` will eventually trigger `add_cpu_upcall`
|
||||
and `del_cpu_upcall` to trigger the cpu hotplug / hot-unplug process in
|
||||
the kernel. After the cpu hotplug / hot-unplug process,
|
||||
`cpu_event_notification` will generate device manager reply to the
|
||||
client side.
|
||||
|
||||
Signed-off-by: Liu Jiang <gerry@linux.alibaba.com>
|
||||
Signed-off-by: Zizheng Bian <zizheng.bian@linux.alibaba.com>
|
||||
Signed-off-by: Chao Wu <chaowu@linux.alibaba.com>
|
||||
Signed-off-by: WangYu <WangYu@linux.alibaba.com>
|
||||
Signed-off-by: Xingjun Liu <xingjun.liu@linux.alibaba.com>
|
||||
Signed-off-by: Fupan Li <fupan.lfp@antgroup.com>
|
||||
---
|
||||
drivers/misc/dragonball/upcall_srv/Kconfig | 11 +
|
||||
.../upcall_srv/dragonball_device_manager.c | 236 ++++++++++++++++++
|
||||
2 files changed, 247 insertions(+)
|
||||
|
||||
diff --git a/drivers/misc/dragonball/upcall_srv/Kconfig b/drivers/misc/dragonball/upcall_srv/Kconfig
|
||||
index 6554a9741c00d..b237882a29288 100644
|
||||
--- a/drivers/misc/dragonball/upcall_srv/Kconfig
|
||||
+++ b/drivers/misc/dragonball/upcall_srv/Kconfig
|
||||
@@ -24,3 +24,14 @@ config DRAGONBALL_DEVICE_MANAGER
|
||||
devices etc.
|
||||
|
||||
If unsure, say N.
|
||||
+
|
||||
+config DRAGONBALL_HOTPLUG_CPU
|
||||
+ bool "CPU hotplug/hotunplug support"
|
||||
+ depends on DRAGONBALL_DEVICE_MANAGER
|
||||
+ default y
|
||||
+ help
|
||||
+ This configure implements a vCPU hotplug/hotunplug support, vmm
|
||||
+ should send hotplug request by vsock which follow special data
|
||||
+ structure with command and parameter to hot-pluging an vCPU.
|
||||
+
|
||||
+ If unsure, say N.
|
||||
diff --git a/drivers/misc/dragonball/upcall_srv/dragonball_device_manager.c b/drivers/misc/dragonball/upcall_srv/dragonball_device_manager.c
|
||||
index ebcb6ef742855..16c6b937c5536 100644
|
||||
--- a/drivers/misc/dragonball/upcall_srv/dragonball_device_manager.c
|
||||
+++ b/drivers/misc/dragonball/upcall_srv/dragonball_device_manager.c
|
||||
@@ -23,6 +23,10 @@
|
||||
#include <linux/cpumask.h>
|
||||
#include <linux/cpuhotplug.h>
|
||||
#include <asm/cpu.h>
|
||||
+#ifdef CONFIG_X86_64
|
||||
+#include <asm/mpspec.h>
|
||||
+#include <asm/apic.h>
|
||||
+#endif
|
||||
#include <dragonball/upcall_srv.h>
|
||||
#include <dragonball/device_manager.h>
|
||||
#ifdef CONFIG_ARM64
|
||||
@@ -75,9 +79,20 @@ struct devmgr_req {
|
||||
struct devmgr_msg_header msg_header;
|
||||
union {
|
||||
char pad[DEV_MGR_MSG_SIZE - sizeof(struct devmgr_msg_header)];
|
||||
+#if defined(CONFIG_DRAGONBALL_HOTPLUG_CPU)
|
||||
+ struct {
|
||||
+ uint8_t count;
|
||||
+ uint8_t apic_ver;
|
||||
+ uint8_t apic_ids[256];
|
||||
+ } cpu_dev_info;
|
||||
+#endif
|
||||
} msg_load;
|
||||
};
|
||||
|
||||
+struct cpu_dev_reply_info {
|
||||
+ uint32_t apic_index;
|
||||
+};
|
||||
+
|
||||
struct devmgr_reply {
|
||||
struct devmgr_msg_header msg_header;
|
||||
/*
|
||||
@@ -87,6 +102,9 @@ struct devmgr_reply {
|
||||
int32_t ret;
|
||||
union {
|
||||
char pad[DEV_MGR_MSG_SIZE - sizeof(struct devmgr_msg_header) - sizeof(int32_t)];
|
||||
+#if defined(CONFIG_DRAGONBALL_HOTPLUG_CPU)
|
||||
+ struct cpu_dev_reply_info cpu_dev_info;
|
||||
+#endif
|
||||
} msg_load;
|
||||
};
|
||||
|
||||
@@ -109,10 +127,228 @@ static void _fill_msg_header(struct devmgr_msg_header *msg, uint32_t msg_size,
|
||||
msg->msg_flags = msg_flags;
|
||||
}
|
||||
|
||||
+#if defined(CONFIG_DRAGONBALL_HOTPLUG_CPU) && defined(CONFIG_X86_64)
|
||||
+static int get_cpu_id(int apic_id)
|
||||
+{
|
||||
+ int i;
|
||||
+
|
||||
+ for_each_possible_cpu(i) {
|
||||
+ if (cpu_physical_id(i) == apic_id)
|
||||
+ return i;
|
||||
+ }
|
||||
+ return -1;
|
||||
+}
|
||||
+
|
||||
+static int lookup_cpuid(int apic_id)
|
||||
+{
|
||||
+ int i;
|
||||
+
|
||||
+ /* CPU# to APICID mapping is persistent once it is established */
|
||||
+ for_each_possible_cpu(i) {
|
||||
+ if (cpuid_to_apicid[i] == apic_id)
|
||||
+ return i;
|
||||
+ }
|
||||
+ return -ENODEV;
|
||||
+}
|
||||
+
|
||||
+/**
|
||||
+ * Return the first failed hotplug index of the apic_ids to dragonball.
|
||||
+ * If it is not equal to the count of all hotplug needed vcpus,
|
||||
+ * we will rollback the vcpus from apics_ids[0] to apic_ids[i-1] in dragonball.
|
||||
+ */
|
||||
+static void cpu_event_notification(
|
||||
+ uint8_t apic_ids_index,
|
||||
+ int ret,
|
||||
+ uint32_t action_type,
|
||||
+ struct devmgr_reply *rep)
|
||||
+{
|
||||
+ pr_info("cpu event notification: apic ids index %d", apic_ids_index);
|
||||
+ rep->msg_load.cpu_dev_info.apic_index = apic_ids_index;
|
||||
+ rep->ret = ret;
|
||||
+ _fill_msg_header(&rep->msg_header,
|
||||
+ sizeof(struct cpu_dev_reply_info), action_type, 0);
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+#if defined(CONFIG_DRAGONBALL_HOTPLUG_CPU) && defined(CONFIG_X86_64)
|
||||
+static int add_cpu_upcall(int apic_id, uint8_t apic_ver)
|
||||
+{
|
||||
+ int cpu_id, node_id;
|
||||
+ int ret;
|
||||
+
|
||||
+ pr_info("adding vcpu apic_id %d", apic_id);
|
||||
+
|
||||
+ /**
|
||||
+ * Get the mutex lock for hotplug and cpu update and cpu write lock.
|
||||
+ * So that other threads won't influence the hotplug process.
|
||||
+ */
|
||||
+ lock_device_hotplug();
|
||||
+ cpu_maps_update_begin();
|
||||
+ cpus_write_lock();
|
||||
+
|
||||
+ cpu_id = lookup_cpuid(apic_id);
|
||||
+ if (cpu_id < 0) {
|
||||
+ pr_err("cpu (apic id %d) cannot be added, generic processor info failed", apic_id);
|
||||
+ ret = -EINVAL;
|
||||
+ goto rollback_generic_cpu;
|
||||
+ }
|
||||
+
|
||||
+ set_bit(apic_id, phys_cpu_present_map);
|
||||
+ early_per_cpu(x86_cpu_to_apicid, cpu_id) = apic_id;
|
||||
+ set_cpu_present(cpu_id, true);
|
||||
+
|
||||
+ /* update numa mapping for hot-plugged cpus. */
|
||||
+ node_id = numa_cpu_node(cpu_id);
|
||||
+ if (node_id != NUMA_NO_NODE)
|
||||
+ numa_set_node(cpu_id, node_id);
|
||||
+
|
||||
+ ret = arch_register_cpu(cpu_id);
|
||||
+ if (ret) {
|
||||
+ pr_err("cpu %d cannot be added, register cpu failed %d", cpu_id, ret);
|
||||
+ goto rollback_register_cpu;
|
||||
+ }
|
||||
+
|
||||
+ cpus_write_unlock();
|
||||
+ cpu_maps_update_done();
|
||||
+ unlock_device_hotplug();
|
||||
+
|
||||
+ ret = add_cpu(cpu_id);
|
||||
+ if (ret) {
|
||||
+ pr_err("cpu %d cannot be added, cpu up failed: %d", cpu_id, ret);
|
||||
+ goto rollback_cpu_up;
|
||||
+ }
|
||||
+ return ret;
|
||||
+
|
||||
+rollback_cpu_up:
|
||||
+ arch_unregister_cpu(cpu_id);
|
||||
+ set_cpu_present(cpu_id, false);
|
||||
+ per_cpu(x86_cpu_to_apicid, cpu_id) = -1;
|
||||
+ return ret;
|
||||
+
|
||||
+rollback_register_cpu:
|
||||
+ set_cpu_present(cpu_id, false);
|
||||
+ per_cpu(x86_cpu_to_apicid, cpu_id) = -1;
|
||||
+rollback_generic_cpu:
|
||||
+ cpus_write_unlock();
|
||||
+ cpu_maps_update_done();
|
||||
+ unlock_device_hotplug();
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+static int del_cpu_upcall(int apic_id)
|
||||
+{
|
||||
+ int cpu_id = get_cpu_id(apic_id);
|
||||
+ int ret;
|
||||
+
|
||||
+ if (cpu_id == 0) {
|
||||
+ pr_err("cannot del bootstrap processor.");
|
||||
+ return -EINVAL;
|
||||
+ }
|
||||
+ pr_info("deleting vcpu %d", cpu_id);
|
||||
+ ret = remove_cpu(cpu_id);
|
||||
+ if (ret) {
|
||||
+ pr_err("del vcpu failed, err: %d", ret);
|
||||
+ return ret;
|
||||
+ }
|
||||
+
|
||||
+ lock_device_hotplug();
|
||||
+ cpu_maps_update_begin();
|
||||
+ cpus_write_lock();
|
||||
+
|
||||
+ arch_unregister_cpu(cpu_id);
|
||||
+ set_cpu_present(cpu_id, false);
|
||||
+ per_cpu(x86_cpu_to_apicid, cpu_id) = -1;
|
||||
+
|
||||
+ cpus_write_unlock();
|
||||
+ cpu_maps_update_done();
|
||||
+ unlock_device_hotplug();
|
||||
+
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+static int add_cpu_dev(struct devmgr_req *req,
|
||||
+ struct devmgr_reply *rep)
|
||||
+{
|
||||
+ int ret;
|
||||
+ uint8_t i;
|
||||
+ int apic_id;
|
||||
+
|
||||
+ uint8_t count = req->msg_load.cpu_dev_info.count;
|
||||
+ uint8_t apic_ver = req->msg_load.cpu_dev_info.apic_ver;
|
||||
+ uint8_t *apic_ids = req->msg_load.cpu_dev_info.apic_ids;
|
||||
+
|
||||
+ pr_info("add vcpu number: %d", count);
|
||||
+
|
||||
+ for (i = 0; i < count; ++i) {
|
||||
+ apic_id = apic_ids[i];
|
||||
+ if (get_cpu_id(apic_id) != -1) {
|
||||
+ pr_err("cpu cannot be added: apci_id %d is already been used.", apic_id);
|
||||
+ ret = -EINVAL;
|
||||
+ return ret;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ for (i = 0; i < count; ++i) {
|
||||
+ apic_id = apic_ids[i];
|
||||
+ ret = add_cpu_upcall(apic_id, apic_ver);
|
||||
+ if (ret != 0)
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ if (!ret)
|
||||
+ cpu_event_notification(i, ret, ADD_CPU, rep);
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+static int del_cpu_dev(struct devmgr_req *req,
|
||||
+ struct devmgr_reply *rep)
|
||||
+{
|
||||
+ int ret;
|
||||
+ uint8_t i;
|
||||
+ int cpu_id;
|
||||
+
|
||||
+ uint8_t count = req->msg_load.cpu_dev_info.count;
|
||||
+ uint8_t *apic_ids = req->msg_load.cpu_dev_info.apic_ids;
|
||||
+
|
||||
+ pr_info("del vcpu number : %d", count);
|
||||
+
|
||||
+ if (count >= num_online_cpus()) {
|
||||
+ pr_err("cpu del parameter check error: cannot remove all vcpus");
|
||||
+ ret = -EINVAL;
|
||||
+ cpu_event_notification(0, ret, DEL_CPU, rep);
|
||||
+ return ret;
|
||||
+ }
|
||||
+
|
||||
+ for (i = 0; i < count; ++i) {
|
||||
+ cpu_id = get_cpu_id(apic_ids[i]);
|
||||
+ if (!cpu_possible(cpu_id)) {
|
||||
+ pr_err("cpu %d cannot be deleted: cpu not possible", cpu_id);
|
||||
+ ret = -EINVAL;
|
||||
+ cpu_event_notification(0, ret, DEL_CPU, rep);
|
||||
+ return ret;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ for (i = 0; i < count; ++i) {
|
||||
+ ret = del_cpu_upcall(apic_ids[i]);
|
||||
+ if (ret != 0)
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ if (!ret)
|
||||
+ cpu_event_notification(i, ret, DEL_CPU, rep);
|
||||
+ return ret;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
static struct {
|
||||
enum devmgr_msg_type cmd;
|
||||
action_route_t fn;
|
||||
} opt_map[] = {
|
||||
+#if defined(CONFIG_DRAGONBALL_HOTPLUG_CPU) && defined(CONFIG_X86_64)
|
||||
+ {ADD_CPU, add_cpu_dev},
|
||||
+ {DEL_CPU, del_cpu_dev},
|
||||
+#endif
|
||||
};
|
||||
|
||||
static action_route_t get_action(struct devmgr_req *req)
|
||||
--
|
||||
2.34.1
|
||||
|
||||
@@ -0,0 +1,419 @@
|
||||
From 5109240e29df451e1d6f4474620ef6a2f24349b7 Mon Sep 17 00:00:00 2001
|
||||
From: Chao Wu <chaowu@linux.alibaba.com>
|
||||
Date: Wed, 23 Nov 2022 19:23:47 +0800
|
||||
Subject: [PATCH 4/8] upcall: add virtio-mmio hotplug/hot-unplug into device
|
||||
manager service
|
||||
|
||||
Add virtio-mmio hotplug/hot-unplug support into device manager. In the
|
||||
`devmgr_req` message, `msg_type` ADD_MMIO in `msg_header` will trigger
|
||||
`add_mmio_dev` action and DEL_MMIO will trigger `del_mmio_dev` action,
|
||||
and we use `mmio_base`, `mmio_size` and `mmio_irq` delivered in
|
||||
`add_mmio_dev` to notify how to hotplug the virtio-mmio device
|
||||
|
||||
Also `virtio_mmio_add_device` and `virtio_mmio_del_device` are
|
||||
introduced under /drivers/virtio/virtio_mmio.c, and we extract
|
||||
`vm_add_device` from `vm_cmdline_set` to help hotplug virtio-mmio
|
||||
device.
|
||||
|
||||
Signed-off-by: Liu Jiang <gerry@linux.alibaba.com>
|
||||
Signed-off-by: Zizheng Bian <zizheng.bian@linux.alibaba.com>
|
||||
Signed-off-by: Chao Wu <chaowu@linux.alibaba.com>
|
||||
Signed-off-by: WangYu <WangYu@linux.alibaba.com>
|
||||
Signed-off-by: Xingjun Liu <xingjun.liu@linux.alibaba.com>
|
||||
---
|
||||
drivers/misc/dragonball/upcall_srv/Kconfig | 12 ++
|
||||
.../upcall_srv/dragonball_device_manager.c | 112 ++++++++++++++
|
||||
drivers/virtio/Kconfig | 14 ++
|
||||
drivers/virtio/virtio_mmio.c | 140 +++++++++++++++---
|
||||
include/dragonball/device_manager.h | 5 +
|
||||
5 files changed, 260 insertions(+), 23 deletions(-)
|
||||
|
||||
diff --git a/drivers/misc/dragonball/upcall_srv/Kconfig b/drivers/misc/dragonball/upcall_srv/Kconfig
|
||||
index b237882a29288..fc83f03c2edd2 100644
|
||||
--- a/drivers/misc/dragonball/upcall_srv/Kconfig
|
||||
+++ b/drivers/misc/dragonball/upcall_srv/Kconfig
|
||||
@@ -25,6 +25,18 @@ config DRAGONBALL_DEVICE_MANAGER
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
+config DRAGONBALL_HOTPLUG_VIRTIO_MMIO
|
||||
+ bool "Virtio-MMIO device hotplug/hotunplug support"
|
||||
+ depends on DRAGONBALL_DEVICE_MANAGER
|
||||
+ default y
|
||||
+ help
|
||||
+ This configure implements a Virtio-MMIO device hotplug/hotunplug
|
||||
+ support, vmm should send hotplug request by vsock which follow
|
||||
+ special data structure with command and parameter to hot-pluging
|
||||
+ an MMIO device.
|
||||
+
|
||||
+ If unsure, say N.
|
||||
+
|
||||
config DRAGONBALL_HOTPLUG_CPU
|
||||
bool "CPU hotplug/hotunplug support"
|
||||
depends on DRAGONBALL_DEVICE_MANAGER
|
||||
diff --git a/drivers/misc/dragonball/upcall_srv/dragonball_device_manager.c b/drivers/misc/dragonball/upcall_srv/dragonball_device_manager.c
|
||||
index 16c6b937c5536..f591841715c3b 100644
|
||||
--- a/drivers/misc/dragonball/upcall_srv/dragonball_device_manager.c
|
||||
+++ b/drivers/misc/dragonball/upcall_srv/dragonball_device_manager.c
|
||||
@@ -79,6 +79,13 @@ struct devmgr_req {
|
||||
struct devmgr_msg_header msg_header;
|
||||
union {
|
||||
char pad[DEV_MGR_MSG_SIZE - sizeof(struct devmgr_msg_header)];
|
||||
+#if defined(CONFIG_DRAGONBALL_HOTPLUG_VIRTIO_MMIO)
|
||||
+ struct {
|
||||
+ uint64_t mmio_base;
|
||||
+ uint64_t mmio_size;
|
||||
+ uint32_t mmio_irq;
|
||||
+ } add_mmio_dev;
|
||||
+#endif
|
||||
#if defined(CONFIG_DRAGONBALL_HOTPLUG_CPU)
|
||||
struct {
|
||||
uint8_t count;
|
||||
@@ -102,6 +109,10 @@ struct devmgr_reply {
|
||||
int32_t ret;
|
||||
union {
|
||||
char pad[DEV_MGR_MSG_SIZE - sizeof(struct devmgr_msg_header) - sizeof(int32_t)];
|
||||
+#if defined(CONFIG_DRAGONBALL_HOTPLUG_VIRTIO_MMIO)
|
||||
+ struct {
|
||||
+ } add_mmio_dev;
|
||||
+#endif
|
||||
#if defined(CONFIG_DRAGONBALL_HOTPLUG_CPU)
|
||||
struct cpu_dev_reply_info cpu_dev_info;
|
||||
#endif
|
||||
@@ -118,6 +129,62 @@ struct task_res {
|
||||
typedef int (*action_route_t) (struct devmgr_req *req,
|
||||
struct devmgr_reply *rep);
|
||||
|
||||
+#if defined(CONFIG_DRAGONBALL_HOTPLUG_VIRTIO_MMIO)
|
||||
+#ifdef CONFIG_ARM64
|
||||
+static uint32_t get_device_virq(uint32_t pin)
|
||||
+{
|
||||
+ uint32_t virq;
|
||||
+ struct device_node *node;
|
||||
+ struct irq_fwspec dummy_fwspec = {
|
||||
+ .param_count = 3,
|
||||
+ .param = {0, 0, IRQ_TYPE_EDGE_RISING}
|
||||
+ };
|
||||
+
|
||||
+ node = of_find_node_by_name(NULL, "intc");
|
||||
+ if (!node) {
|
||||
+ pr_err("interrupt controller device node not found.");
|
||||
+ return 0;
|
||||
+ }
|
||||
+ dummy_fwspec.param[1] = pin;
|
||||
+ dummy_fwspec.fwnode = of_node_to_fwnode(node);
|
||||
+ virq = irq_create_fwspec_mapping(&dummy_fwspec);
|
||||
+ of_node_put(node);
|
||||
+ return virq;
|
||||
+}
|
||||
+#elif defined(CONFIG_X86_64)
|
||||
+static inline uint32_t get_device_virq(uint32_t irq)
|
||||
+{
|
||||
+ return irq;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+static int get_dev_resource(struct devmgr_req *req, struct resource *res)
|
||||
+{
|
||||
+ uint64_t base = req->msg_load.add_mmio_dev.mmio_base;
|
||||
+ uint64_t size = req->msg_load.add_mmio_dev.mmio_size;
|
||||
+ uint32_t irq = req->msg_load.add_mmio_dev.mmio_irq;
|
||||
+ uint32_t virq;
|
||||
+
|
||||
+ if (req->msg_header.msg_size != sizeof(req->msg_load.add_mmio_dev))
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ res[0].flags = IORESOURCE_MEM;
|
||||
+ res[0].start = base;
|
||||
+ res[0].end = base + size - 1;
|
||||
+ res[1].flags = IORESOURCE_IRQ;
|
||||
+ virq = get_device_virq(irq);
|
||||
+ if (!virq)
|
||||
+ return -EINVAL;
|
||||
+ res[1].start = res[1].end = virq;
|
||||
+
|
||||
+ /* detect the irq sharing mode */
|
||||
+ if (irq == SHARED_IRQ_NO)
|
||||
+ res[1].flags |= IORESOURCE_IRQ_SHAREABLE;
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
static void _fill_msg_header(struct devmgr_msg_header *msg, uint32_t msg_size,
|
||||
uint32_t msg_type, uint32_t msg_flags)
|
||||
{
|
||||
@@ -170,6 +237,47 @@ static void cpu_event_notification(
|
||||
}
|
||||
#endif
|
||||
|
||||
+#if defined(CONFIG_DRAGONBALL_HOTPLUG_VIRTIO_MMIO)
|
||||
+static int add_mmio_dev(struct devmgr_req *req,
|
||||
+ struct devmgr_reply *rep)
|
||||
+{
|
||||
+ int ret;
|
||||
+ struct resource res[2] = {};
|
||||
+ struct devmgr_msg_header *rep_mh = &rep->msg_header;
|
||||
+
|
||||
+ ret = get_dev_resource(req, res);
|
||||
+ if (ret)
|
||||
+ return ret;
|
||||
+
|
||||
+ ret = virtio_mmio_add_device(res, ARRAY_SIZE(res));
|
||||
+ if (!ret) {
|
||||
+ rep->ret = ret;
|
||||
+ _fill_msg_header(rep_mh, 0, ADD_MMIO, 0);
|
||||
+ }
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+static int del_mmio_dev(struct devmgr_req *req,
|
||||
+ struct devmgr_reply *rep)
|
||||
+{
|
||||
+ int ret;
|
||||
+ struct resource res[2] = {};
|
||||
+ struct devmgr_msg_header *rep_mh = &rep->msg_header;
|
||||
+
|
||||
+ ret = get_dev_resource(req, res);
|
||||
+ if (ret)
|
||||
+ return ret;
|
||||
+
|
||||
+ ret = virtio_mmio_del_device(res, ARRAY_SIZE(res));
|
||||
+ if (!ret) {
|
||||
+ rep->ret = ret;
|
||||
+ _fill_msg_header(rep_mh, 0, DEL_MMIO, 0);
|
||||
+ }
|
||||
+ return ret;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+
|
||||
#if defined(CONFIG_DRAGONBALL_HOTPLUG_CPU) && defined(CONFIG_X86_64)
|
||||
static int add_cpu_upcall(int apic_id, uint8_t apic_ver)
|
||||
{
|
||||
@@ -345,6 +453,10 @@ static struct {
|
||||
enum devmgr_msg_type cmd;
|
||||
action_route_t fn;
|
||||
} opt_map[] = {
|
||||
+#if defined(CONFIG_DRAGONBALL_HOTPLUG_VIRTIO_MMIO)
|
||||
+ {ADD_MMIO, add_mmio_dev},
|
||||
+ {DEL_MMIO, del_mmio_dev},
|
||||
+#endif
|
||||
#if defined(CONFIG_DRAGONBALL_HOTPLUG_CPU) && defined(CONFIG_X86_64)
|
||||
{ADD_CPU, add_cpu_dev},
|
||||
{DEL_CPU, del_cpu_dev},
|
||||
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
|
||||
index 6db5235a7693d..eaf9d1bea38fc 100644
|
||||
--- a/drivers/virtio/Kconfig
|
||||
+++ b/drivers/virtio/Kconfig
|
||||
@@ -31,6 +31,20 @@ menuconfig VIRTIO_MENU
|
||||
bool "Virtio drivers"
|
||||
default y
|
||||
|
||||
+config VIRTIO_MMIO_DRAGONBALL
|
||||
+ bool "Enable features for Dragonball virtio MMIO devices"
|
||||
+ default n
|
||||
+ depends on VIRTIO_MMIO
|
||||
+ depends on X86_64 || ARM64
|
||||
+ select X86_PLATFORM_MSI
|
||||
+ select VIRTIO_MMIO_MSI
|
||||
+ help
|
||||
+ The Dragonball VMM implements several optimizations for MMIO virtio
|
||||
+ devices. This option enables support of those optimization features:
|
||||
+ - virtio-mmio hotplug through upcall
|
||||
+
|
||||
+ If unsure, say N
|
||||
+
|
||||
if VIRTIO_MENU
|
||||
|
||||
config VIRTIO_HARDEN_NOTIFICATION
|
||||
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
|
||||
index b152a1eca05ad..a8a7251e8c030 100644
|
||||
--- a/drivers/virtio/virtio_mmio.c
|
||||
+++ b/drivers/virtio/virtio_mmio.c
|
||||
@@ -669,16 +669,42 @@ static struct device vm_cmdline_parent = {
|
||||
static int vm_cmdline_parent_registered;
|
||||
static int vm_cmdline_id;
|
||||
|
||||
+static int vm_add_device(struct resource *resources, size_t res_size)
|
||||
+{
|
||||
+ int err;
|
||||
+ struct platform_device *pdev;
|
||||
+
|
||||
+ if (!vm_cmdline_parent_registered) {
|
||||
+ err = device_register(&vm_cmdline_parent);
|
||||
+ if (err) {
|
||||
+ pr_err("Failed to register parent device!\n");
|
||||
+ put_device(&vm_cmdline_parent);
|
||||
+ return err;
|
||||
+ }
|
||||
+ vm_cmdline_parent_registered = 1;
|
||||
+ }
|
||||
+
|
||||
+ pr_info("Registering device virtio-mmio.%d at 0x%llx-0x%llx, IRQ %d.\n",
|
||||
+ vm_cmdline_id,
|
||||
+ (unsigned long long)resources[0].start,
|
||||
+ (unsigned long long)resources[0].end,
|
||||
+ (int)resources[1].start);
|
||||
+
|
||||
+ pdev = platform_device_register_resndata(&vm_cmdline_parent,
|
||||
+ "virtio-mmio", vm_cmdline_id++,
|
||||
+ resources, res_size, NULL, 0);
|
||||
+
|
||||
+ return PTR_ERR_OR_ZERO(pdev);
|
||||
+}
|
||||
+
|
||||
static int vm_cmdline_set(const char *device,
|
||||
const struct kernel_param *kp)
|
||||
{
|
||||
- int err;
|
||||
struct resource resources[2] = {};
|
||||
char *str;
|
||||
long long base, size;
|
||||
unsigned int irq;
|
||||
int processed, consumed = 0;
|
||||
- struct platform_device *pdev;
|
||||
|
||||
/* Consume "size" part of the command line parameter */
|
||||
size = memparse(device, &str);
|
||||
@@ -703,27 +729,7 @@ static int vm_cmdline_set(const char *device,
|
||||
resources[1].flags = IORESOURCE_IRQ;
|
||||
resources[1].start = resources[1].end = irq;
|
||||
|
||||
- if (!vm_cmdline_parent_registered) {
|
||||
- err = device_register(&vm_cmdline_parent);
|
||||
- if (err) {
|
||||
- put_device(&vm_cmdline_parent);
|
||||
- pr_err("Failed to register parent device!\n");
|
||||
- return err;
|
||||
- }
|
||||
- vm_cmdline_parent_registered = 1;
|
||||
- }
|
||||
-
|
||||
- pr_info("Registering device virtio-mmio.%d at 0x%llx-0x%llx, IRQ %d.\n",
|
||||
- vm_cmdline_id,
|
||||
- (unsigned long long)resources[0].start,
|
||||
- (unsigned long long)resources[0].end,
|
||||
- (int)resources[1].start);
|
||||
-
|
||||
- pdev = platform_device_register_resndata(&vm_cmdline_parent,
|
||||
- "virtio-mmio", vm_cmdline_id++,
|
||||
- resources, ARRAY_SIZE(resources), NULL, 0);
|
||||
-
|
||||
- return PTR_ERR_OR_ZERO(pdev);
|
||||
+ return vm_add_device(resources, ARRAY_SIZE(resources));
|
||||
}
|
||||
|
||||
static int vm_cmdline_get_device(struct device *dev, void *data)
|
||||
@@ -773,6 +779,94 @@ static void vm_unregister_cmdline_devices(void)
|
||||
}
|
||||
}
|
||||
|
||||
+#ifdef CONFIG_DRAGONBALL_DEVICE_MANAGER
|
||||
+static int vm_match_device(struct device *dev, const void *data)
|
||||
+{
|
||||
+ struct resource *resource = (struct resource *)data;
|
||||
+ struct platform_device *pdev = to_platform_device(dev);
|
||||
+
|
||||
+ if ((pdev->resource[0].start == resource[0].start) &&
|
||||
+ (pdev->resource[0].end == resource[0].end) &&
|
||||
+ (pdev->resource[1].start == resource[1].start))
|
||||
+ return 1;
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static struct device *vm_find_device(struct resource *res)
|
||||
+{
|
||||
+ return device_find_child(&vm_cmdline_parent, res, vm_match_device);
|
||||
+}
|
||||
+
|
||||
+static int vm_device_overlap(struct device *dev, const void *data)
|
||||
+{
|
||||
+ struct resource *res = (struct resource *)data;
|
||||
+ struct platform_device *pdev = to_platform_device(dev);
|
||||
+
|
||||
+ /* Detect IRQ number conflicts except shared IRQs. */
|
||||
+ if (!(res[1].flags & IORESOURCE_IRQ_SHAREABLE) &&
|
||||
+ (pdev->resource[1].start == res[1].start)) {
|
||||
+ return 1;
|
||||
+ }
|
||||
+
|
||||
+ /* Detect device MMIO addresses overlapping */
|
||||
+ if ((pdev->resource[0].start < res[0].end) &&
|
||||
+ (pdev->resource[0].end > res[0].start)) {
|
||||
+ return 1;
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static struct device *vm_detect_resource(struct resource *res)
|
||||
+{
|
||||
+ /* return NULL if no resource overlapped */
|
||||
+ return device_find_child(&vm_cmdline_parent, res, vm_device_overlap);
|
||||
+}
|
||||
+
|
||||
+int virtio_mmio_add_device(struct resource *resources, size_t res_size)
|
||||
+{
|
||||
+ int err;
|
||||
+ struct device *dev;
|
||||
+
|
||||
+ if (res_size < 2 || !resources)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ dev = vm_detect_resource(resources);
|
||||
+ if (dev) {
|
||||
+ put_device(dev);
|
||||
+ return -EEXIST;
|
||||
+ }
|
||||
+
|
||||
+ lock_device_hotplug();
|
||||
+ err = vm_add_device(resources, res_size);
|
||||
+ unlock_device_hotplug();
|
||||
+
|
||||
+ return err;
|
||||
+}
|
||||
+EXPORT_SYMBOL_GPL(virtio_mmio_add_device);
|
||||
+
|
||||
+int virtio_mmio_del_device(struct resource *resources, size_t res_size)
|
||||
+{
|
||||
+ int ret;
|
||||
+ struct device *dev;
|
||||
+
|
||||
+ if (res_size < 2 || !resources)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ dev = vm_find_device(resources);
|
||||
+ if (!dev)
|
||||
+ return -ENODEV;
|
||||
+
|
||||
+ put_device(dev);
|
||||
+ lock_device_hotplug();
|
||||
+ ret = vm_unregister_cmdline_device(dev, NULL);
|
||||
+ unlock_device_hotplug();
|
||||
+
|
||||
+ return ret;
|
||||
+}
|
||||
+EXPORT_SYMBOL_GPL(virtio_mmio_del_device);
|
||||
+#endif /* CONFIG_DRAGONBALL_DEVICE_MANAGER */
|
||||
+
|
||||
#else
|
||||
|
||||
static void vm_unregister_cmdline_devices(void)
|
||||
diff --git a/include/dragonball/device_manager.h b/include/dragonball/device_manager.h
|
||||
index a1713e9f026d1..785761c47f973 100644
|
||||
--- a/include/dragonball/device_manager.h
|
||||
+++ b/include/dragonball/device_manager.h
|
||||
@@ -15,4 +15,9 @@
|
||||
|
||||
#include <linux/device.h>
|
||||
|
||||
+#if defined(CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES)
|
||||
+int virtio_mmio_add_device(struct resource *resources, size_t res_size);
|
||||
+int virtio_mmio_del_device(struct resource *resources, size_t res_size);
|
||||
+#endif
|
||||
+
|
||||
#endif /* _DB_DEVICE_MANAGER_H */
|
||||
--
|
||||
2.34.1
|
||||
|
||||
@@ -0,0 +1,162 @@
|
||||
From 9fecfb42b7da22604467053e88418c72745e0560 Mon Sep 17 00:00:00 2001
|
||||
From: xuejun-xj <jiyunxue@linux.alibaba.com>
|
||||
Date: Wed, 10 May 2023 13:55:43 +0800
|
||||
Subject: [PATCH 5/8] upcall: dragonball-devmgr supports cpu hotplug on arm64
|
||||
|
||||
Enable vcpuhotplug feature on aarch64 in guest kernel. It communicates
|
||||
with dragonball by using upcall. This commit does these changes:
|
||||
|
||||
1. Wraps x86 related fields with CONFIG_X86_64.
|
||||
2. Add "cpu_event_notification" for arm64.
|
||||
3. Add "add_cpu_dev" and "del_cpu_dev" for arm64.
|
||||
|
||||
Signed-off-by: xuejun-xj <jiyunxue@linux.alibaba.com>
|
||||
Reviewed-by : Chao Wu <chaowu@linux.alibaba.com>
|
||||
Reviewed-by: Zizheng Bian <zizheng.bian@linux.alibaba.com>
|
||||
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
|
||||
---
|
||||
.../upcall_srv/dragonball_device_manager.c | 84 ++++++++++++++++++-
|
||||
1 file changed, 81 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/drivers/misc/dragonball/upcall_srv/dragonball_device_manager.c b/drivers/misc/dragonball/upcall_srv/dragonball_device_manager.c
|
||||
index f591841715c3b..e2d1b7d56eafe 100644
|
||||
--- a/drivers/misc/dragonball/upcall_srv/dragonball_device_manager.c
|
||||
+++ b/drivers/misc/dragonball/upcall_srv/dragonball_device_manager.c
|
||||
@@ -89,15 +89,21 @@ struct devmgr_req {
|
||||
#if defined(CONFIG_DRAGONBALL_HOTPLUG_CPU)
|
||||
struct {
|
||||
uint8_t count;
|
||||
+#ifdef CONFIG_X86_64
|
||||
uint8_t apic_ver;
|
||||
uint8_t apic_ids[256];
|
||||
+#endif
|
||||
} cpu_dev_info;
|
||||
#endif
|
||||
} msg_load;
|
||||
};
|
||||
|
||||
struct cpu_dev_reply_info {
|
||||
+#if defined(CONFIG_X86_64)
|
||||
uint32_t apic_index;
|
||||
+#elif defined(CONFIG_ARM64)
|
||||
+ uint32_t cpu_id;
|
||||
+#endif
|
||||
};
|
||||
|
||||
struct devmgr_reply {
|
||||
@@ -194,7 +200,8 @@ static void _fill_msg_header(struct devmgr_msg_header *msg, uint32_t msg_size,
|
||||
msg->msg_flags = msg_flags;
|
||||
}
|
||||
|
||||
-#if defined(CONFIG_DRAGONBALL_HOTPLUG_CPU) && defined(CONFIG_X86_64)
|
||||
+#if defined(CONFIG_DRAGONBALL_HOTPLUG_CPU)
|
||||
+#if defined(CONFIG_X86_64)
|
||||
static int get_cpu_id(int apic_id)
|
||||
{
|
||||
int i;
|
||||
@@ -235,6 +242,24 @@ static void cpu_event_notification(
|
||||
_fill_msg_header(&rep->msg_header,
|
||||
sizeof(struct cpu_dev_reply_info), action_type, 0);
|
||||
}
|
||||
+#elif defined(CONFIG_ARM64)
|
||||
+/**
|
||||
+ * Return the first failed hotplug index of the cpu_id to dragonball.
|
||||
+ * If hotplug/hotunplug succeeds, it will equals to the expected cpu count.
|
||||
+ */
|
||||
+static void cpu_event_notification(
|
||||
+ uint8_t cpu_id,
|
||||
+ int ret,
|
||||
+ uint32_t action_type,
|
||||
+ struct devmgr_reply *rep)
|
||||
+{
|
||||
+ pr_info("cpu event notification: cpu_id %d\n", cpu_id);
|
||||
+ rep->msg_load.cpu_dev_info.cpu_id = cpu_id;
|
||||
+ rep->ret = ret;
|
||||
+ _fill_msg_header(&rep->msg_header,
|
||||
+ sizeof(struct cpu_dev_reply_info), action_type, 0);
|
||||
+}
|
||||
+#endif
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_DRAGONBALL_HOTPLUG_VIRTIO_MMIO)
|
||||
@@ -278,7 +303,8 @@ static int del_mmio_dev(struct devmgr_req *req,
|
||||
#endif
|
||||
|
||||
|
||||
-#if defined(CONFIG_DRAGONBALL_HOTPLUG_CPU) && defined(CONFIG_X86_64)
|
||||
+#if defined(CONFIG_DRAGONBALL_HOTPLUG_CPU)
|
||||
+#if defined(CONFIG_X86_64)
|
||||
static int add_cpu_upcall(int apic_id, uint8_t apic_ver)
|
||||
{
|
||||
int cpu_id, node_id;
|
||||
@@ -447,6 +473,58 @@ static int del_cpu_dev(struct devmgr_req *req,
|
||||
cpu_event_notification(i, ret, DEL_CPU, rep);
|
||||
return ret;
|
||||
}
|
||||
+#elif defined(CONFIG_ARM64)
|
||||
+static int add_cpu_dev(struct devmgr_req *req, struct devmgr_reply *rep)
|
||||
+{
|
||||
+ int i, ret = 0;
|
||||
+ unsigned int cpu_id, nr_online_cpus;
|
||||
+ uint8_t count = req->msg_load.cpu_dev_info.count;
|
||||
+
|
||||
+ nr_online_cpus = num_online_cpus();
|
||||
+
|
||||
+ pr_info("Current vcpu number: %d, Add vcpu number: %d\n",
|
||||
+ nr_online_cpus, count);
|
||||
+
|
||||
+ for (i = 0; i < count; ++i) {
|
||||
+ cpu_id = nr_online_cpus + i;
|
||||
+ ret = add_cpu(cpu_id);
|
||||
+ if (ret != 0)
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ cpu_event_notification(nr_online_cpus + i, ret, ADD_CPU, rep);
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+static int del_cpu_dev(struct devmgr_req *req, struct devmgr_reply *rep)
|
||||
+{
|
||||
+ int i, ret = 0;
|
||||
+ unsigned int cpu_id, nr_online_cpus;
|
||||
+ uint8_t count = req->msg_load.cpu_dev_info.count;
|
||||
+
|
||||
+ nr_online_cpus = num_online_cpus();
|
||||
+
|
||||
+ pr_info("Current vcpu number: %d, Delete vcpu number: %d\n",
|
||||
+ nr_online_cpus, count);
|
||||
+
|
||||
+ if (count >= nr_online_cpus) {
|
||||
+ pr_err("cpu del parameter check error: cannot remove all vcpus\n");
|
||||
+ ret = -EINVAL;
|
||||
+ cpu_event_notification(0, ret, DEL_CPU, rep);
|
||||
+ return ret;
|
||||
+ }
|
||||
+
|
||||
+ for (i = 0; i < count; ++i) {
|
||||
+ cpu_id = nr_online_cpus - i - 1;
|
||||
+ ret = remove_cpu(cpu_id);
|
||||
+ if (ret != 0)
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ cpu_event_notification(nr_online_cpus - i, ret, DEL_CPU, rep);
|
||||
+ return ret;
|
||||
+}
|
||||
+#endif
|
||||
#endif
|
||||
|
||||
static struct {
|
||||
@@ -457,7 +535,7 @@ static struct {
|
||||
{ADD_MMIO, add_mmio_dev},
|
||||
{DEL_MMIO, del_mmio_dev},
|
||||
#endif
|
||||
-#if defined(CONFIG_DRAGONBALL_HOTPLUG_CPU) && defined(CONFIG_X86_64)
|
||||
+#if defined(CONFIG_DRAGONBALL_HOTPLUG_CPU)
|
||||
{ADD_CPU, add_cpu_dev},
|
||||
{DEL_CPU, del_cpu_dev},
|
||||
#endif
|
||||
--
|
||||
2.34.1
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
From 8f10003034a64ea5a562f912fee5037637084a9b Mon Sep 17 00:00:00 2001
|
||||
From: xuejun-xj <jiyunxue@linux.alibaba.com>
|
||||
Date: Wed, 10 May 2023 14:51:40 +0800
|
||||
Subject: [PATCH 6/8] msi: control msi irq number activated
|
||||
|
||||
When passthroughing pci device, kernel will initialize and activate
|
||||
(max_cpu_count+1) msi irq. However, in vcpu hotplugging situation,
|
||||
because of vgic, max_cpu_count may be greater than online_cpu_count.
|
||||
Those offline cpus will also be activated by kernel, which cause failure
|
||||
of passthroughing pci device.
|
||||
|
||||
To solve this problem, this patch add a function
|
||||
"check_affinity_mask_online" to check if msi_desc->affinity contains
|
||||
online cpus. If current cpu is offline, it will continue the for loop to
|
||||
skip activating related irq.
|
||||
|
||||
Signed-off-by: xuejun-xj <jiyunxue@linux.alibaba.com>
|
||||
Reviewed-by: Shuo Tan <shuo.tan@linux.alibaba.com>
|
||||
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
|
||||
---
|
||||
kernel/irq/msi.c | 20 ++++++++++++++++++++
|
||||
1 file changed, 20 insertions(+)
|
||||
|
||||
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
|
||||
index e7ad992548416..1ecfbad4d2c8c 100644
|
||||
--- a/kernel/irq/msi.c
|
||||
+++ b/kernel/irq/msi.c
|
||||
@@ -1289,6 +1289,23 @@ static int populate_alloc_info(struct irq_domain *domain, struct device *dev,
|
||||
return 0;
|
||||
}
|
||||
|
||||
+/* This function is used for check whether the cpu affinity belongs to the
|
||||
+ * online cpus. When we passthrough the nvme devices, the kernel will allocate
|
||||
+ * maxcpus+1 MSI irqs and then activate them. In vcpu hotplug situations, it
|
||||
+ * may happen that kernel activates the offline cpus when bootcpus < maxcpus.
|
||||
+ * To avoid this conflict, this function check the affinities.
|
||||
+ */
|
||||
+static inline bool check_affinity_mask_online(struct irq_affinity_desc *affinity)
|
||||
+{
|
||||
+ int cpu;
|
||||
+
|
||||
+ for_each_cpu(cpu, &affinity->mask)
|
||||
+ if (cpu_online(cpu))
|
||||
+ return true;
|
||||
+
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain,
|
||||
struct msi_ctrl *ctrl)
|
||||
{
|
||||
@@ -1340,6 +1357,9 @@ static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain
|
||||
return msi_handle_pci_fail(domain, desc, allocated);
|
||||
|
||||
for (i = 0; i < desc->nvec_used; i++) {
|
||||
+ if (desc->affinity
|
||||
+ && !check_affinity_mask_online(desc->affinity))
|
||||
+ continue;
|
||||
irq_set_msi_desc_off(virq, i, desc);
|
||||
irq_debugfs_copy_devname(virq + i, dev);
|
||||
ret = msi_init_virq(domain, virq + i, vflags);
|
||||
--
|
||||
2.34.1
|
||||
|
||||
@@ -0,0 +1,139 @@
|
||||
From e82de56fe9659355b208a283d56d5924875e5290 Mon Sep 17 00:00:00 2001
|
||||
From: xuejun-xj <jiyunxue@linux.alibaba.com>
|
||||
Date: Tue, 23 May 2023 09:43:02 +0800
|
||||
Subject: [PATCH 7/8] smp: update bringup_nonboot_cpus parameters
|
||||
|
||||
On aarch64, kvm doesn't allow vmm to call KVM_CREATE_VCPU ioctls after
|
||||
vm has already started, which is caused by vgic_initialized check in
|
||||
kvm_arch_vcpu_precreate() function. Therefore, to support vcpu hotplug
|
||||
feature on aarch64, all the vcpus should be created and configured ready
|
||||
for start at booting procedure.
|
||||
|
||||
To solve the problem, dragonball will add a property in each cpu node,
|
||||
called "boot-onlined". This property indicates whether this cpu should
|
||||
be onlined at first boot. It has two values: 0 and 1. 0 means offline,
|
||||
while 1 means online.
|
||||
|
||||
This commit also add a helper function called "of_get_cpu_boot_onlined",
|
||||
which parse the cpu node and get the value of boot-onlined property.
|
||||
Then update the global variable "boot_onlined_cpu".
|
||||
|
||||
When kernel calling smp_init(), bringup_nonboot_cpus will start all the
|
||||
other cpus except cpu0. The activated cpu number equals setup_max_cpus.
|
||||
In vcpu hotplug scenario, vmm will create all the vcpufd before vm is
|
||||
initialized, while activating only a few vcpus at first boot. The
|
||||
setup_max_cpus variable will be initialized as all vcpu count. This
|
||||
cause that the other cpus cannot find enough cpu threads, and they will
|
||||
wait for 5 seconds each cpu.
|
||||
|
||||
Therefore, we use boot_onlined_cpu instead of setup_max_cpus to give
|
||||
"bringup_nonboot_cpus" correct cpu number it needs.
|
||||
|
||||
Signed-off-by: xuejun-xj <jiyunxue@linux.alibaba.com>
|
||||
---
|
||||
.../devicetree/bindings/arm/cpus.yaml | 12 +++++++++
|
||||
arch/arm64/kernel/smp.c | 25 +++++++++++++++++++
|
||||
kernel/smp.c | 11 +++++++-
|
||||
3 files changed, 47 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/Documentation/devicetree/bindings/arm/cpus.yaml b/Documentation/devicetree/bindings/arm/cpus.yaml
|
||||
index 736b7ab1bd0a0..fb26561f8e82f 100644
|
||||
--- a/Documentation/devicetree/bindings/arm/cpus.yaml
|
||||
+++ b/Documentation/devicetree/bindings/arm/cpus.yaml
|
||||
@@ -425,6 +425,18 @@ properties:
|
||||
formed by encoding the target CPU id into the low bits of the
|
||||
physical start address it should jump to.
|
||||
|
||||
+ boot-onlined:
|
||||
+ $ref: '/schemas/types.yaml#/definitions/uint32'
|
||||
+ description: |
|
||||
+ The boot-onlined property is an optional u32 value that indicates
|
||||
+ whether the cpu device should be activated at first boot. This is
|
||||
+ useful in vcpu hotplug scenario to pass correct value of activated
|
||||
+ cpu number.
|
||||
+
|
||||
+ This property has two values: 0 and 1. 1 means the cpu should be
|
||||
+ activated while 0 means it shouldn't.
|
||||
+
|
||||
+
|
||||
thermal-idle:
|
||||
type: object
|
||||
|
||||
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
|
||||
index 68cea3a4a35ca..7938569e400c6 100644
|
||||
--- a/arch/arm64/kernel/smp.c
|
||||
+++ b/arch/arm64/kernel/smp.c
|
||||
@@ -673,6 +673,28 @@ static void __init acpi_parse_and_init_cpus(void)
|
||||
#define acpi_parse_and_init_cpus(...) do { } while (0)
|
||||
#endif
|
||||
|
||||
+
|
||||
+#if defined(CONFIG_DRAGONBALL_HOTPLUG_CPU) && defined(CONFIG_ARM64)
|
||||
+extern unsigned int boot_onlined_cpu;
|
||||
+static void __init of_get_cpu_boot_onlined(struct device_node *dn)
|
||||
+{
|
||||
+ unsigned int boot_onlined;
|
||||
+ int r;
|
||||
+
|
||||
+ r = of_property_read_u32(dn, "boot-onlined", &boot_onlined);
|
||||
+ if (r) {
|
||||
+ pr_err("%pOF: missing boot-onlined property\n", dn);
|
||||
+ return;
|
||||
+ }
|
||||
+ /*
|
||||
+ * Property boot-onlined has two values: 0 and 1.
|
||||
+ * 0 means offline, and 1 means online.
|
||||
+ * Here just count the number of boot_onlined_cpu.
|
||||
+ */
|
||||
+ boot_onlined_cpu += boot_onlined;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
/*
|
||||
* Enumerate the possible CPU set from the device tree and build the
|
||||
* cpu logical map array containing MPIDR values related to logical
|
||||
@@ -683,6 +705,9 @@ static void __init of_parse_and_init_cpus(void)
|
||||
struct device_node *dn;
|
||||
|
||||
for_each_of_cpu_node(dn) {
|
||||
+#if defined(CONFIG_DRAGONBALL_HOTPLUG_CPU) && defined(CONFIG_ARM64)
|
||||
+ of_get_cpu_boot_onlined(dn);
|
||||
+#endif
|
||||
u64 hwid = of_get_cpu_hwid(dn, 0);
|
||||
|
||||
if (hwid & ~MPIDR_HWID_BITMASK)
|
||||
diff --git a/kernel/smp.c b/kernel/smp.c
|
||||
index 02f52291fae42..95dce332c18f1 100644
|
||||
--- a/kernel/smp.c
|
||||
+++ b/kernel/smp.c
|
||||
@@ -988,17 +988,26 @@ void __init setup_nr_cpu_ids(void)
|
||||
set_nr_cpu_ids(find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS) + 1);
|
||||
}
|
||||
|
||||
+/* Setup number of CPUs to activate */
|
||||
+unsigned int boot_onlined_cpu = 0;
|
||||
+
|
||||
/* Called by boot processor to activate the rest. */
|
||||
void __init smp_init(void)
|
||||
{
|
||||
int num_nodes, num_cpus;
|
||||
|
||||
+ int num_onlined_cpu = setup_max_cpus;
|
||||
+
|
||||
idle_threads_init();
|
||||
cpuhp_threads_init();
|
||||
|
||||
pr_info("Bringing up secondary CPUs ...\n");
|
||||
|
||||
- bringup_nonboot_cpus(setup_max_cpus);
|
||||
+#if defined(CONFIG_DRAGONBALL_HOTPLUG_CPU) && defined(CONFIG_ARM64)
|
||||
+ if (boot_onlined_cpu != 0)
|
||||
+ num_onlined_cpu = boot_onlined_cpu;
|
||||
+#endif
|
||||
+ bringup_nonboot_cpus(num_onlined_cpu);
|
||||
|
||||
num_nodes = num_online_nodes();
|
||||
num_cpus = num_online_cpus();
|
||||
--
|
||||
2.34.1
|
||||
|
||||
@@ -0,0 +1,173 @@
|
||||
From 3ffce0dd35013a3e1f15b2595b21875546a5f4c8 Mon Sep 17 00:00:00 2001
|
||||
From: Chao Wu <chaowu@linux.alibaba.com>
|
||||
Date: Wed, 27 Dec 2023 14:43:47 +0800
|
||||
Subject: [PATCH 8/8] upcall: add pci hotplug / hot-unplug support
|
||||
|
||||
add two new upcall functions add_pci_dev and del_pci_dev, mainly for hotplugging
|
||||
and hot-unplugging pci device in the guest kernel through the upcall server.
|
||||
|
||||
Users could implement upcall client side with add_pci or del_pci command and trigger
|
||||
those commands in the hypervisor side.
|
||||
|
||||
As always, Dragonball hypervisor will implement the client side to do pci hotplug and
|
||||
hot-unplug as an example
|
||||
|
||||
Signed-off-by: Gerry Liu <gerry@linux.alibaba.com>
|
||||
Signed-off-by: Helin Guo <helinguo@linux.alibaba.com>
|
||||
Signed-off-by: Chao Wu <chaowu@linux.alibaba.com>
|
||||
---
|
||||
drivers/misc/dragonball/upcall_srv/Kconfig | 11 +++
|
||||
.../upcall_srv/dragonball_device_manager.c | 90 +++++++++++++++++++
|
||||
2 files changed, 101 insertions(+)
|
||||
|
||||
diff --git a/drivers/misc/dragonball/upcall_srv/Kconfig b/drivers/misc/dragonball/upcall_srv/Kconfig
|
||||
index fc83f03c2edd2..735928316eda2 100644
|
||||
--- a/drivers/misc/dragonball/upcall_srv/Kconfig
|
||||
+++ b/drivers/misc/dragonball/upcall_srv/Kconfig
|
||||
@@ -47,3 +47,14 @@ config DRAGONBALL_HOTPLUG_CPU
|
||||
structure with command and parameter to hot-pluging an vCPU.
|
||||
|
||||
If unsure, say N.
|
||||
+
|
||||
+config DRAGONBALL_HOTPLUG_PCI
|
||||
+ bool "PCI hotplug/hotunplug support"
|
||||
+ depends on DRAGONBALL_DEVICE_MANAGER
|
||||
+ default y
|
||||
+ help
|
||||
+ This configure implements a PCI hotplug/hotunplug support, vmm
|
||||
+ should send hotplug request by vsock which follow special data
|
||||
+ structure with command and parameter to hot-pluging a PCI device.
|
||||
+
|
||||
+ If unsure, say N.
|
||||
diff --git a/drivers/misc/dragonball/upcall_srv/dragonball_device_manager.c b/drivers/misc/dragonball/upcall_srv/dragonball_device_manager.c
|
||||
index e2d1b7d56eafe..6f81b8cca19df 100644
|
||||
--- a/drivers/misc/dragonball/upcall_srv/dragonball_device_manager.c
|
||||
+++ b/drivers/misc/dragonball/upcall_srv/dragonball_device_manager.c
|
||||
@@ -22,6 +22,7 @@
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/cpumask.h>
|
||||
#include <linux/cpuhotplug.h>
|
||||
+#include <linux/pci.h>
|
||||
#include <asm/cpu.h>
|
||||
#ifdef CONFIG_X86_64
|
||||
#include <asm/mpspec.h>
|
||||
@@ -94,6 +95,12 @@ struct devmgr_req {
|
||||
uint8_t apic_ids[256];
|
||||
#endif
|
||||
} cpu_dev_info;
|
||||
+#endif
|
||||
+#if defined(CONFIG_DRAGONBALL_HOTPLUG_PCI)
|
||||
+ struct {
|
||||
+ uint8_t busno;
|
||||
+ uint8_t devfn;
|
||||
+ } pci_dev_info;
|
||||
#endif
|
||||
} msg_load;
|
||||
};
|
||||
@@ -121,6 +128,9 @@ struct devmgr_reply {
|
||||
#endif
|
||||
#if defined(CONFIG_DRAGONBALL_HOTPLUG_CPU)
|
||||
struct cpu_dev_reply_info cpu_dev_info;
|
||||
+#endif
|
||||
+#if defined(CONFIG_DRAGONBALL_HOTPLUG_PCI)
|
||||
+ struct {} pci_dev_info;
|
||||
#endif
|
||||
} msg_load;
|
||||
};
|
||||
@@ -302,6 +312,82 @@ static int del_mmio_dev(struct devmgr_req *req,
|
||||
}
|
||||
#endif
|
||||
|
||||
+#if defined(CONFIG_DRAGONBALL_HOTPLUG_PCI)
|
||||
+static int add_pci_dev(struct devmgr_req *req,
|
||||
+ struct devmgr_reply *rep)
|
||||
+{
|
||||
+ int ret = 0;
|
||||
+ struct devmgr_msg_header *rep_mh = &rep->msg_header;
|
||||
+ uint8_t busno = req->msg_load.pci_dev_info.busno;
|
||||
+ uint8_t devfn = req->msg_load.pci_dev_info.devfn;
|
||||
+ struct pci_bus *bus;
|
||||
+ struct pci_dev *dev;
|
||||
+
|
||||
+ pr_info("add pci device of busno: %02x, devfn: %02x\n", busno, devfn);
|
||||
+
|
||||
+ pci_lock_rescan_remove();
|
||||
+
|
||||
+ /* It is similar to pci_rescan_bus */
|
||||
+
|
||||
+ bus = pci_find_bus(0, busno);
|
||||
+ if (!bus) {
|
||||
+ pr_err("Could not find PCI bus for busno %02x\n", busno);
|
||||
+ ret = -ENODEV;
|
||||
+ goto out;
|
||||
+ }
|
||||
+
|
||||
+ pci_scan_slot(bus, devfn);
|
||||
+ dev = pci_get_slot(bus, devfn);
|
||||
+ if (!dev) {
|
||||
+ pr_err("Could not find PCI device for slot %02x\n", devfn);
|
||||
+ ret = -ENODEV;
|
||||
+ goto out;
|
||||
+ }
|
||||
+
|
||||
+ pci_bus_claim_resources(bus);
|
||||
+
|
||||
+ pci_bus_add_devices(bus);
|
||||
+
|
||||
+ pci_dev_put(dev);
|
||||
+
|
||||
+out:
|
||||
+ pci_unlock_rescan_remove();
|
||||
+ if (!ret)
|
||||
+ _fill_msg_header(rep_mh, 0, ADD_PCI, 0);
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+static int del_pci_dev(struct devmgr_req *req,
|
||||
+ struct devmgr_reply *rep)
|
||||
+{
|
||||
+ int ret = 0;
|
||||
+ struct devmgr_msg_header *rep_mh = &rep->msg_header;
|
||||
+ uint8_t busno = req->msg_load.pci_dev_info.busno;
|
||||
+ uint8_t devfn = req->msg_load.pci_dev_info.devfn;
|
||||
+ struct pci_dev *dev;
|
||||
+
|
||||
+ pr_info("remove pci device of busno: %02x, devfn: %02x\n", busno, devfn);
|
||||
+
|
||||
+ pci_lock_rescan_remove();
|
||||
+
|
||||
+ dev = pci_get_domain_bus_and_slot(0, busno, devfn);
|
||||
+
|
||||
+ if (!dev) {
|
||||
+ pr_err("Could not find PCI device for slot %02x\n", devfn);
|
||||
+ ret = -ENODEV;
|
||||
+ goto out;
|
||||
+ }
|
||||
+
|
||||
+ pci_stop_and_remove_bus_device(dev);
|
||||
+
|
||||
+ pci_dev_put(dev);
|
||||
+out:
|
||||
+ pci_unlock_rescan_remove();
|
||||
+ if (!ret)
|
||||
+ _fill_msg_header(rep_mh, 0, DEL_PCI, 0);
|
||||
+ return ret;
|
||||
+}
|
||||
+#endif
|
||||
|
||||
#if defined(CONFIG_DRAGONBALL_HOTPLUG_CPU)
|
||||
#if defined(CONFIG_X86_64)
|
||||
@@ -539,6 +625,10 @@ static struct {
|
||||
{ADD_CPU, add_cpu_dev},
|
||||
{DEL_CPU, del_cpu_dev},
|
||||
#endif
|
||||
+#if defined(CONFIG_DRAGONBALL_HOTPLUG_PCI)
|
||||
+ {ADD_PCI, add_pci_dev},
|
||||
+ {DEL_PCI, del_pci_dev},
|
||||
+#endif
|
||||
};
|
||||
|
||||
static action_route_t get_action(struct devmgr_req *req)
|
||||
--
|
||||
2.34.1
|
||||
|
||||
@@ -38,6 +38,11 @@ if os.environ.get("GITHUB_TOKEN"):
|
||||
_GH_HEADERS["Authorization"] = f"Bearer {os.environ['GITHUB_TOKEN']}"
|
||||
_GH_API_URL = f"https://api.github.com/repos/{os.environ['GITHUB_REPOSITORY']}"
|
||||
_GH_RUNS_URL = f"{_GH_API_URL}/actions/runs"
|
||||
_GH_SUMMARY_URL = (
|
||||
f"{os.environ.get('GITHUB_SERVER_URL')}/"
|
||||
f"{os.environ.get('GITHUB_REPOSITORY')}/actions/runs/"
|
||||
f"{os.environ.get('GITHUB_RUN_ID')}"
|
||||
)
|
||||
if os.environ.get("DEBUG", "false") == "true":
|
||||
DEBUG_DIR = os.path.join(os.path.abspath('.'), str(int(time.time())))
|
||||
os.makedirs(DEBUG_DIR)
|
||||
@@ -135,12 +140,13 @@ class Checker:
|
||||
warn = []
|
||||
for name, job in self.results.items():
|
||||
status = self._job_status(job)
|
||||
url = job.get("html_url", "")
|
||||
if status == RUNNING:
|
||||
warn.append(f"WARN: {name} - Still running")
|
||||
warn.append(f"WARN: {name} - Still running {url}")
|
||||
elif status == PASS:
|
||||
good.append(f"PASS: {name} - success")
|
||||
good.append(f"PASS: {name} - success {url}")
|
||||
else:
|
||||
bad.append(f"FAIL: {name} - Not passed - {status}")
|
||||
bad.append(f"FAIL: {name} - Not passed - {status} {url}")
|
||||
out = '\n'.join(sorted(good) + sorted(warn) + sorted(bad))
|
||||
stat = self.status()
|
||||
if stat == RUNNING:
|
||||
@@ -154,6 +160,51 @@ class Checker:
|
||||
status = "Not all required jobs passed!"
|
||||
return f"{out}\n\n{status}"
|
||||
|
||||
def write_step_summary(self):
|
||||
"""Write WARN/FAIL results to GitHub Step Summary if available"""
|
||||
def _section(name, items, icon='*'):
|
||||
"""Format a MD section"""
|
||||
lines = []
|
||||
lines.append(f"<details open>\n<summary><h2>{name}</h2></summary>\n")
|
||||
if not items:
|
||||
lines.append("None")
|
||||
else:
|
||||
for item in items:
|
||||
lines.append(f"{icon} {item}")
|
||||
lines.append("</details>\n")
|
||||
return lines
|
||||
|
||||
summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
|
||||
if not summary_path:
|
||||
return
|
||||
|
||||
lines = []
|
||||
passing = []
|
||||
failing = []
|
||||
running = []
|
||||
|
||||
for name, job in self.results.items():
|
||||
status = self._job_status(job)
|
||||
url = job.get("html_url", "")
|
||||
if status == RUNNING:
|
||||
running.append(f"[{name}]({url})" if url else name)
|
||||
elif status == PASS:
|
||||
passing.append(f"[{name}]({url})" if url else name)
|
||||
else:
|
||||
link = f"[{name}]({url})" if url else name
|
||||
failing.append(f"{link} ({status})")
|
||||
lines.extend(_section("Failing checks", failing, "❌"))
|
||||
lines.extend(_section("In progress checks", running, "🔶"))
|
||||
lines.extend(_section("Successful checks", passing, "🟢"))
|
||||
summary = [f"Total: {len(self.results)}, "
|
||||
f"Passed: {len(passing)}, "
|
||||
f"Failed: {len(failing)}, Running: {len(running)}"]
|
||||
lines.extend(_section("Summary", summary))
|
||||
|
||||
with open(summary_path, "w", encoding="utf8") as summary:
|
||||
summary.write("\n".join(lines) + "\n")
|
||||
print(f"Human-readable summary: {_GH_SUMMARY_URL}")
|
||||
|
||||
def fetch_json_from_url(self, url, task, params=None):
|
||||
"""Fetches URL and reports json output"""
|
||||
print(url, file=sys.stderr)
|
||||
@@ -220,6 +271,7 @@ class Checker:
|
||||
for job in jobs:
|
||||
self.record(run["name"], job)
|
||||
print(self)
|
||||
self.write_step_summary()
|
||||
return self.status()
|
||||
|
||||
def wait_for_required_tests(self):
|
||||
|
||||
@@ -102,7 +102,7 @@ mapping:
|
||||
- Kata Containers CI / kata-containers-ci-on-push / run-kata-deploy-tests / run-kata-deploy-tests (qemu, rke2)
|
||||
- Kata Containers CI / kata-containers-ci-on-push / run-kata-monitor-tests / run-monitor (qemu, crio)
|
||||
- Kata Containers CI / kata-containers-ci-on-push / run-k8s-tests-on-nvidia-gpu / run-nvidia-gpu-tests-on-amd64
|
||||
- Kata Containers CI / kata-containers-ci-on-push / run-k8s-tests-on-nvidia-gpu / run-nvidia-gpu-snp-tests-on-amd64
|
||||
# - Kata Containers CI / kata-containers-ci-on-push / run-k8s-tests-on-nvidia-gpu / run-nvidia-gpu-snp-tests-on-amd64
|
||||
required-labels:
|
||||
- ok-to-test
|
||||
build:
|
||||
|
||||
9
utils.mk
9
utils.mk
@@ -181,16 +181,9 @@ CWD := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
|
||||
standard_rust_check:
|
||||
@echo "standard rust check..."
|
||||
cargo fmt -- --check
|
||||
cargo clippy --all-targets --all-features --release \
|
||||
cargo clippy --all-targets --all-features --release --locked \
|
||||
-- \
|
||||
-D warnings
|
||||
cargo check
|
||||
@DIFF=$$(git diff HEAD); \
|
||||
if [ -n "$$DIFF" ]; then \
|
||||
echo "ERROR: cargo check resulted in uncommited changes"; \
|
||||
echo "$$DIFF"; \
|
||||
exit 1; \
|
||||
fi
|
||||
|
||||
# Install a file (full version).
|
||||
#
|
||||
|
||||
@@ -226,7 +226,7 @@ assets:
|
||||
kernel-dragonball-experimental:
|
||||
description: "Linux kernel with Dragonball VMM optimizations like upcall"
|
||||
url: "https://cdn.kernel.org/pub/linux/kernel/v6.x/"
|
||||
version: "v6.12.47"
|
||||
version: "v6.18.15"
|
||||
|
||||
externals:
|
||||
description: "Third-party projects used by the system"
|
||||
@@ -388,7 +388,7 @@ externals:
|
||||
nydus-snapshotter:
|
||||
description: "Snapshotter for Nydus image acceleration service"
|
||||
url: "https://github.com/containerd/nydus-snapshotter"
|
||||
version: "v0.15.10"
|
||||
version: "v0.15.13"
|
||||
|
||||
opa:
|
||||
description: "Open Policy Agent"
|
||||
|
||||
Reference in New Issue
Block a user