diff --git a/.github/workflows/commit-message-check.yaml b/.github/workflows/commit-message-check.yaml
index fbdb02b6df..191e94b0da 100644
--- a/.github/workflows/commit-message-check.yaml
+++ b/.github/workflows/commit-message-check.yaml
@@ -47,7 +47,7 @@ jobs:
uses: tim-actions/commit-message-checker-with-regex@v0.3.1
with:
commits: ${{ steps.get-pr-commits.outputs.commits }}
- pattern: '^.{0,75}(\n.*)*$'
+ pattern: '^.{0,75}(\n.*)*$|^Merge pull request (?:kata-containers)?#[\d]+ from.*'
error: 'Subject too long (max 75)'
post_error: ${{ env.error_msg }}
@@ -95,6 +95,6 @@ jobs:
uses: tim-actions/commit-message-checker-with-regex@v0.3.1
with:
commits: ${{ steps.get-pr-commits.outputs.commits }}
- pattern: '^[\s\t]*[^:\s\t]+[\s\t]*:'
+ pattern: '^[\s\t]*[^:\s\t]+[\s\t]*:|^Merge pull request (?:kata-containers)?#[\d]+ from.*'
error: 'Failed to find subsystem in subject'
post_error: ${{ env.error_msg }}
diff --git a/.github/workflows/move-issues-to-in-progress.yaml b/.github/workflows/move-issues-to-in-progress.yaml
index 0e15abaea3..5ab9beb98d 100644
--- a/.github/workflows/move-issues-to-in-progress.yaml
+++ b/.github/workflows/move-issues-to-in-progress.yaml
@@ -59,7 +59,7 @@ jobs:
exit 1
}
- project_name="Issue backlog"
+ project_name="runtime-rs"
project_type="org"
project_column="In progress"
diff --git a/Makefile b/Makefile
index 2b6f6a748f..4d2be6b4d8 100644
--- a/Makefile
+++ b/Makefile
@@ -6,8 +6,10 @@
# List of available components
COMPONENTS =
+COMPONENTS += libs
COMPONENTS += agent
COMPONENTS += runtime
+COMPONENTS += runtime-rs
# List of available tools
TOOLS =
@@ -21,11 +23,6 @@ STANDARD_TARGETS = build check clean install test vendor
default: all
-all: logging-crate-tests build
-
-logging-crate-tests:
- make -C src/libs/logging
-
include utils.mk
include ./tools/packaging/kata-deploy/local-build/Makefile
@@ -49,7 +46,6 @@ docs-url-alive-check:
binary-tarball \
default \
install-binary-tarball \
- logging-crate-tests \
static-checks \
docs-url-alive-check
diff --git a/README.md b/README.md
index 90a5c9209a..4a7a6ea9a7 100644
--- a/README.md
+++ b/README.md
@@ -71,6 +71,7 @@ See the [official documentation](docs) including:
- [Developer guide](docs/Developer-Guide.md)
- [Design documents](docs/design)
- [Architecture overview](docs/design/architecture)
+ - [Architecture 3.0 overview](docs/design/architecture_3.0/)
## Configuration
@@ -117,6 +118,7 @@ The table below lists the core parts of the project:
|-|-|-|
| [runtime](src/runtime) | core | Main component run by a container manager and providing a containerd shimv2 runtime implementation. |
| [agent](src/agent) | core | Management process running inside the virtual machine / POD that sets up the container environment. |
+| [`dragonball`](src/dragonball) | core | An optional built-in VMM that provides an out-of-the-box Kata Containers experience, with optimizations for container workloads. |
| [documentation](docs) | documentation | Documentation common to all components (such as design and install documentation). |
| [libraries](src/libs) | core | Library crates shared by multiple Kata Container components or published to [`crates.io`](https://crates.io/index.html) |
| [tests](https://github.com/kata-containers/tests) | tests | Excludes unit tests which live with the main code. |
diff --git a/docs/design/architecture_3.0/README.md b/docs/design/architecture_3.0/README.md
new file mode 100644
index 0000000000..562404bd51
--- /dev/null
+++ b/docs/design/architecture_3.0/README.md
@@ -0,0 +1,170 @@
+# Kata 3.0 Architecture
+## Overview
+In cloud-native scenarios, there is an increased demand for container startup speed, resource consumption, stability, and security, areas where the present Kata Containers runtime is challenged relative to other runtimes. To address these challenges, we propose a solid, field-tested and secure Rust version of the kata-runtime.
+
+Also, we provide the following designs:
+
+- Turnkey solution with built-in `Dragonball` Sandbox
+- Async I/O to reduce resource consumption
+- Extensible framework for multiple services, runtimes and hypervisors
+- Lifecycle management for sandbox and container associated resources
+
+### Rationale for choosing Rust
+
+We chose Rust because it is designed as a system language with a focus on efficiency.
+In contrast to Go, Rust makes a variety of design trade-offs in order to obtain
+good execution performance, with innovative techniques that, in contrast to C or
+C++, provide reasonable protection against common memory errors (buffer
+overflow, invalid pointers, range errors), error checking (ensuring errors are
+dealt with), thread safety, ownership of resources, and more.
+
+These benefits were verified in our project when the Kata Containers guest agent
+was rewritten in Rust. We notably saw a significant reduction in memory usage
+with the Rust-based implementation.
+
+
+## Design
+### Architecture
+
+![Architecture](images/architecture.png)
+
+### Built-in VMM
+#### Current Kata 2.x architecture
+
+![Kata 2.x architecture](images/not_built_in_vmm.png)
+
+As shown in the figure, the runtime and the VMM are separate processes: the runtime process forks the VMM process, and the two interact through inter-process RPC. Interaction between processes typically consumes more resources than interaction within a single process, resulting in relatively low efficiency. The cost of resource operation and maintenance must also be considered: for example, when recovering resources after an abnormal exit, the failure of either process must be detected by the other, which must then trigger the appropriate recovery steps. The more processes involved, the more difficult such recovery becomes.
+#### How To Support Built-in VMM
+We provide the `Dragonball` Sandbox to enable a built-in VMM by integrating the VMM's functionality into a Rust library, so VMM-related operations become ordinary library calls. Because the runtime and the VMM are in the same process, message processing is faster and the APIs stay synchronized. It also guarantees that the runtime and the VMM share one life cycle, reducing the maintenance burden of resource recovery and exception handling, as shown in the figure:
+
+![Built-in VMM](images/built_in_vmm.png)
+
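+To illustrate what "built-in" means in practice, the following minimal Rust sketch (hypothetical trait and type names, not the actual `runtime-rs` or `Dragonball` API) shows the runtime driving the VMM through plain in-process function calls instead of inter-process RPC:
+
+```rust
+// Hypothetical illustration only; names do not match the real runtime-rs API.
+
+/// A VMM abstraction the runtime can drive through plain function calls.
+trait Vmm {
+    fn start(&mut self) -> Result<(), String>;
+    fn stop(&mut self) -> Result<(), String>;
+}
+
+/// A built-in VMM lives in the same process as the runtime.
+struct BuiltInVmm {
+    running: bool,
+}
+
+impl Vmm for BuiltInVmm {
+    fn start(&mut self) -> Result<(), String> {
+        // No fork/exec and no RPC serialization, just an in-process call.
+        self.running = true;
+        Ok(())
+    }
+
+    fn stop(&mut self) -> Result<(), String> {
+        if !self.running {
+            return Err("VMM is not running".to_string());
+        }
+        self.running = false;
+        Ok(())
+    }
+}
+
+fn main() -> Result<(), String> {
+    // The runtime owns the VMM object, so the two share one life cycle:
+    // dropping the runtime drops the VMM, with no orphan process to recover.
+    let mut vmm = BuiltInVmm { running: false };
+    vmm.start()?;
+    vmm.stop()
+}
+```
+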
+### Async Support
+#### Why Async Is Needed
+**Async is already available in stable Rust and lets us write asynchronous code**
+
+- Async significantly reduces CPU and memory overhead, especially for workloads with a large number of I/O-bound tasks
+- Async is zero-cost in Rust, which means that you only pay for what you use. Specifically, you can use async without heap allocations and dynamic dispatch, which greatly improves efficiency
+- For more details, see [Why Async?](https://rust-lang.github.io/async-book/01_getting_started/02_why_async.html) and [The State of Asynchronous Rust](https://rust-lang.github.io/async-book/01_getting_started/03_state_of_async_rust.html)
+
+**Implementing the kata-runtime in sync Rust would raise several problems**
+
+- Each new TTRPC connection spawns too many threads
+  - TTRPC threads: reaper thread (1) + listener thread (1) + client handler (2)
+- Each new container adds 3 I/O threads
+- In sync mode, implementing a timeout mechanism is challenging; for example, in TTRPC API interactions it is difficult to match the timeout behavior of the Golang implementation
+#### How To Support Async
+The number of OS threads used by the kata-runtime is controlled by `TOKIO_RUNTIME_WORKER_THREADS`, which defaults to 2. TTRPC and container-related work runs uniformly as `tokio` tasks on those worker threads, and the related dependencies, such as Timer, File, and Netlink, need to be switched to async implementations. With async, we can easily support non-blocking I/O and timers. Currently, we only use async in the kata-runtime; the built-in VMM keeps plain OS threads because that keeps its threads controllable.
+
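+As an example of the timeout point above, the sketch below (a standalone `tokio` program in which the hypothetical `fake_ttrpc_call` stands in for a real TTRPC request) shows how async turns a timeout into a simple combinator wrapped around a future:
+
+```rust
+// Sketch only; requires the tokio crate with the "rt-multi-thread",
+// "macros" and "time" features. `fake_ttrpc_call` is a placeholder.
+use std::time::Duration;
+use tokio::time::{sleep, timeout};
+
+async fn fake_ttrpc_call() -> &'static str {
+    sleep(Duration::from_millis(50)).await; // simulate I/O latency
+    "response"
+}
+
+#[tokio::main(worker_threads = 2)] // mirrors the default of 2 worker threads
+async fn main() {
+    // With async, a timeout is just a combinator around the future;
+    // no dedicated timer thread per request is needed.
+    match timeout(Duration::from_millis(100), fake_ttrpc_call()).await {
+        Ok(resp) => println!("got: {}", resp),
+        Err(_) => println!("request timed out"),
+    }
+}
+```
+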
+**For N tokio worker threads and M containers**
+
+- Sync runtime (in the tree below, both the `OS thread` entries and the `tokio task` entries would be OS threads, and there would be no `tokio` worker threads), OS thread count: 4 + 12 * M
+- Async runtime (only the `OS thread` entries are OS threads), OS thread count: 2 + N
+```shell
+├─ main (OS thread)
+├─ async-logger (OS thread)
+└─ tokio worker (N * OS thread)
+   ├─ agent log forwarder (1 * tokio task)
+   ├─ health check thread (1 * tokio task)
+   ├─ TTRPC reaper thread (M * tokio task)
+   ├─ TTRPC listener thread (M * tokio task)
+   ├─ TTRPC client handler thread (7 * M * tokio task)
+   ├─ container stdin io thread (M * tokio task)
+   ├─ container stdout io thread (M * tokio task)
+   └─ container stderr io thread (M * tokio task)
+```
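+
+For example, with M = 10 containers the sync model needs 4 + 12 * 10 = 124 OS threads, while the async model with the default N = 2 worker threads needs only 2 + 2 = 4.
+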
+### Extensible Framework
+The Kata 3.x runtime is designed to be extensible in its services, runtimes, and hypervisors, combined with configuration to meet the needs of different scenarios. At present, the service layer provides a register mechanism to support multiple services. Services interact with the runtime through messages, and the runtime handler processes the messages coming from services. To let a single binary support multiple runtimes and hypervisors, the runtime handler type and the hypervisor type are obtained from the configuration at startup, as sketched below.
+
+![Extensible Framework](images/framework.png)
+
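+The following minimal sketch (invented names, not the actual `runtime-rs` types) illustrates the register mechanism: runtime handlers register themselves by name, and startup selects one based on the configuration:
+
+```rust
+// Hypothetical illustration of the register mechanism; names are invented.
+use std::collections::HashMap;
+
+/// Common interface every runtime handler implements.
+trait RuntimeHandler {
+    fn handle(&self, message: &str);
+}
+
+struct VirtContainer;
+
+impl RuntimeHandler for VirtContainer {
+    fn handle(&self, message: &str) {
+        println!("VirtContainer handling: {}", message);
+    }
+}
+
+fn main() {
+    // The registry maps a configured handler name to an implementation.
+    let mut registry: HashMap<&str, Box<dyn RuntimeHandler>> = HashMap::new();
+    registry.insert("virt_container", Box::new(VirtContainer));
+
+    // At startup, the handler type comes from the configuration.
+    let configured = "virt_container";
+    if let Some(handler) = registry.get(configured) {
+        handler.handle("create sandbox");
+    }
+}
+```
+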
+### Resource Manager
+In our case, there is a variety of resources, and every resource has several subtypes. Especially for `Virt-Container`, every subtype of resource has different operations, and there may be dependencies between them; for example, the share-fs rootfs and the share-fs volume both use the share-fs resource to share files with the VM. Currently, network and share-fs are regarded as sandbox resources, while rootfs, volume, and cgroup are regarded as container resources. We abstract a common interface for each resource and implement the subtype-specific differences in subclass operations, as sketched below.
+
+![Resource Manager](images/resourceManager.png)
+
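+A minimal sketch of this pattern (invented names, not actual `runtime-rs` code): a common `Resource` trait, with per-subtype behavior in each implementation:
+
+```rust
+// Hypothetical sketch of the common-interface-plus-subtypes pattern.
+
+/// Common operations shared by all resource subtypes.
+trait Resource {
+    fn setup(&self) -> Result<(), String>;
+    fn cleanup(&self) -> Result<(), String>;
+}
+
+/// A sandbox-level resource, used by container-level resources.
+struct ShareFs;
+
+impl Resource for ShareFs {
+    fn setup(&self) -> Result<(), String> {
+        // Mount the shared filesystem into the VM (details elided).
+        Ok(())
+    }
+
+    fn cleanup(&self) -> Result<(), String> {
+        Ok(())
+    }
+}
+
+/// A container-level rootfs that shares files through the share-fs resource.
+struct ShareFsRootfs;
+
+impl Resource for ShareFsRootfs {
+    fn setup(&self) -> Result<(), String> {
+        // Expose the container rootfs to the VM via share-fs (details elided).
+        Ok(())
+    }
+
+    fn cleanup(&self) -> Result<(), String> {
+        Ok(())
+    }
+}
+
+fn main() -> Result<(), String> {
+    // The resource manager only sees the common interface; set up in
+    // dependency order and clean up in reverse order.
+    let resources: Vec<Box<dyn Resource>> = vec![Box::new(ShareFs), Box::new(ShareFsRootfs)];
+    for r in &resources {
+        r.setup()?;
+    }
+    for r in resources.iter().rev() {
+        r.cleanup()?;
+    }
+    Ok(())
+}
+```
+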
+## Roadmap
+
+- Stage 1 (June): provide basic features (currently delivered)
+- Stage 2 (September): support common features
+- Stage 3: support full features
+
+| **Class** | **Sub-Class** | **Development Stage** |
+| -------------------------- | ------------------- | --------------------- |
+| Service | task service | Stage 1 |
+| | extend service | Stage 3 |
+| | image service | Stage 3 |
+| Runtime handler | `Virt-Container` | Stage 1 |
+| | `Wasm-Container` | Stage 3 |
+| | `Linux-Container` | Stage 3 |
+| Endpoint | VETH Endpoint | Stage 1 |
+| | Physical Endpoint | Stage 2 |
+| | Tap Endpoint | Stage 2 |
+| | `Tuntap` Endpoint | Stage 2 |
+| | `IPVlan` Endpoint | Stage 3 |
+| | `MacVlan` Endpoint | Stage 3 |
+| | MACVTAP Endpoint | Stage 3 |
+| | `VhostUserEndpoint` | Stage 3 |
+| Network Interworking Model | Tc filter | Stage 1 |
+| | Route | Stage 1 |
+| | `MacVtap` | Stage 3 |
+| Storage | Virtio-fs | Stage 1 |
+| | `nydus` | Stage 2 |
+| Hypervisor | `Dragonball` | Stage 1 |
+| | QEMU | Stage 2 |
+| | ACRN | Stage 3 |
+| | Cloud Hypervisor | Stage 3 |
+| | Firecracker | Stage 3 |
+
+## FAQ
+
+- Are the "service", "message dispatcher" and "runtime handler" all part of the single Kata 3.x runtime binary?
+
+ Yes. They are components of the Kata 3.x runtime, and they will be packed into one binary.
+ 1. The service is an interface responsible for handling multiple services, such as the task service and the image service.
+ 2. The message dispatcher matches and routes requests coming from the service module.
+ 3. The runtime handler deals with the operations for sandboxes and containers.
+- What is the name of the Kata 3.x runtime binary?
+
+ Apparently we can't use `containerd-shim-v2-kata` because it's already used. We are facing the hardest issue of "naming" again. Any suggestions are welcome.
+ Internally we use `containerd-shim-v2-rund`.
+
+- Is the Kata 3.x design compatible with the containerd shimv2 architecture?
+
+ Yes. It is designed to follow the functionality of the Go version of Kata, and it implements the `containerd shim v2` interface/protocol.
+
+- How will users migrate to the Kata 3.x architecture?
+
+ The migration plan will be provided before Kata 3.x is merged into the main branch.
+
+- Is `Dragonball` limited to its own built-in VMM? Can the `Dragonball` system be configured to work using an external `Dragonball` VMM/hypervisor?
+
+ `Dragonball` can work as an external hypervisor, but stability and performance are more challenging in that case. The built-in VMM optimizes the container overhead, and it makes stability easier to maintain.
+
+ `runD` is the `containerd-shim-v2` counterpart of `runC` and can run a pod/containers. `Dragonball` is a `microvm`/VMM that is designed to run container workloads. Instead of `microvm`/VMM, we sometimes refer to it as a secure sandbox.
+
+- QEMU, Cloud Hypervisor and Firecracker support are planned, but how would that work? Would they run in separate processes?
+
+ Yes. They would run as separate processes, since they cannot work as a built-in VMM.
+
+- What is `upcall`?
+
+ The `upcall` is used to hotplug CPU/memory/MMIO devices, and it solves two issues:
+ 1. It avoids the dependency on PCI/ACPI.
+ 2. It avoids the dependency on `udevd` within the guest and gets deterministic results for hotplug operations. So `upcall` is an alternative to ACPI-based CPU/memory/device hotplug, and we may cooperate with the community to add support for ACPI-based hotplug if needed.
+
+ `Dbs-upcall` is a VSOCK-based direct communication tool between the VMM and the guest. The server side of the `upcall` is a driver in the guest kernel (kernel patches are needed for this feature), and it starts to serve requests once the kernel has started. The client side is in the VMM; it is a thread that communicates over VSOCK through a Unix domain socket (`uds`). We have implemented device hotplug and hot-unplug directly through `upcall` in order to avoid virtualizing ACPI and so minimize the virtual machine's overhead, and many other usages are possible through this direct communication channel. It's already open source:
+ https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-upcall
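+
+ To make the flow concrete, here is a heavily simplified sketch (an invented message type and toy framing; the real protocol lives in the `dbs-upcall` crate linked above) of how a VMM-side client might serialize a hotplug request before writing it to the VSOCK connection:
+
+ ```rust
+ // Hypothetical framing only; see the dbs-upcall crate for the real protocol.
+
+ /// Requests the VMM-side client could send to the in-guest driver.
+ enum UpcallRequest {
+     AddCpu { count: u8 },
+     AddMmioDevice { base: u64, size: u64 },
+ }
+
+ /// Encode a request into bytes for the VSOCK stream (toy encoding).
+ fn encode(req: &UpcallRequest) -> Vec<u8> {
+     match req {
+         UpcallRequest::AddCpu { count } => vec![0x01, *count],
+         UpcallRequest::AddMmioDevice { base, size } => {
+             let mut buf = vec![0x02];
+             buf.extend_from_slice(&base.to_le_bytes());
+             buf.extend_from_slice(&size.to_le_bytes());
+             buf
+         }
+     }
+ }
+
+ fn main() {
+     // In the real implementation, these bytes would be written to a VSOCK
+     // connection served by the guest kernel driver.
+     let cpu = encode(&UpcallRequest::AddCpu { count: 2 });
+     let mmio = encode(&UpcallRequest::AddMmioDevice { base: 0xd000_0000, size: 0x1000 });
+     println!("{:02x?} {:02x?}", cpu, mmio);
+ }
+ ```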
+
+- The URL above says the kernel patches work with 4.19, but do they also work with 5.15+?
+
+ Forward compatibility should be achievable; we have already ported it to a 5.10-based kernel.
+
+- Are these patches platform-specific or would they work for any architecture that supports VSOCK?
+
+ It's almost platform independent, but some messages related to CPU hotplug are platform dependent.
+
+- Could the kernel driver be replaced with a userland daemon in the guest using loopback VSOCK?
+
+ We need to create device nodes for hot-added CPUs/memory/devices, so it is not easy for a userspace daemon to perform these tasks.
+
+- The fact that `upcall` allows communication between the VMM and the guest suggests that this architecture might be incompatible with https://github.com/confidential-containers where the VMM should have no knowledge of what happens inside the VM.
+
+ 1. `TDX` doesn't support CPU/memory hotplug yet.
+ 2. ACPI-based device hotplug depends on the ACPI `DSDT` table, and the guest kernel executes `ASL` code while handling those hotplug events. It should be easier to audit VSOCK-based communication than ACPI `ASL` methods.
+
+- What is the security boundary for the monolithic / "Built-in VMM" case?
+
+ It has the security boundary of virtualization. More details will be provided in the next stage.
\ No newline at end of file
diff --git a/docs/design/architecture_3.0/images/architecture.png b/docs/design/architecture_3.0/images/architecture.png
new file mode 100644
index 0000000000..5825f1eb4e
Binary files /dev/null and b/docs/design/architecture_3.0/images/architecture.png differ
diff --git a/docs/design/architecture_3.0/images/built_in_vmm.png b/docs/design/architecture_3.0/images/built_in_vmm.png
new file mode 100644
index 0000000000..91386c5eb6
Binary files /dev/null and b/docs/design/architecture_3.0/images/built_in_vmm.png differ
diff --git a/docs/design/architecture_3.0/images/framework.png b/docs/design/architecture_3.0/images/framework.png
new file mode 100644
index 0000000000..992afdfff5
Binary files /dev/null and b/docs/design/architecture_3.0/images/framework.png differ
diff --git a/docs/design/architecture_3.0/images/not_built_in_vmm.png b/docs/design/architecture_3.0/images/not_built_in_vmm.png
new file mode 100644
index 0000000000..ad1d5b8e34
Binary files /dev/null and b/docs/design/architecture_3.0/images/not_built_in_vmm.png differ
diff --git a/docs/design/architecture_3.0/images/resourceManager.png b/docs/design/architecture_3.0/images/resourceManager.png
new file mode 100644
index 0000000000..7a8fcae0c7
Binary files /dev/null and b/docs/design/architecture_3.0/images/resourceManager.png differ
diff --git a/docs/design/architecture_3.0/images/source_code/kata_3.0_images.drawio b/docs/design/architecture_3.0/images/source_code/kata_3.0_images.drawio
new file mode 100644
index 0000000000..61882a8892
--- /dev/null
+++ b/docs/design/architecture_3.0/images/source_code/kata_3.0_images.drawio
@@ -0,0 +1 @@
+5Vxbk6MoFP41qdp9yFQUNd2Pnb7tw8zWbHXVTvcjicSwjZJBcptfvxBRI5jWXIxm+ilyRMTvfH4c8JAeuA/XzwzOZ9+oj0jPHvjrHnjo2bY1vPXEj7RsEos3cBJDwLCvKuWGF/wLKeNAWRfYR3GhIqeUcDwvGic0itCEF2yQMboqVptSUrzrHAbIMLxMIDGtP7DPZ4n1xh3k9r8QDmbpna2BOhPCtLIyxDPo09WOCTz2wD2jlCdH4foeEQleikty3dOes1nHGIp4nQs8PnwlT39PfeufH084Dp5fFkHfcZNmlpAs1BOr3vJNCgGji8hHspVBD4xWM8zRyxxO5NmVcLqwzXhIRMkSh1NMyD0llIlyRCNRaWR2VPV9iRhH6x2T6vgzoiHibCOqqLNOiqpikT1U5VXuk6zObMcf9o0yQsWDIGs7h0ocKLQOQM66CuAsrwicU4KbU4JbY7DZBmwcxu8SN8SWWGCzH0SrNRBviiB6JoiWXQKi0xSI7rCafMgXOqaKlPEZDWgEyWNuHRXpmdf5Sulc4fkf4nyjRBkuOC2i7cN4lnkm5oy+ZxoJPkKeQxYg/sHzqTFDPsKH/mGIQI6XRakuQ1td+p1i0ZXMr0Dza+bEtImYLtgEqas0n2XdON6NwPAiDuVw1OmXAdgdexkcA0XKZ4h1G0XH6xiK7sGBwPkFo1qy9iKfvKnVklIpPTeNSM8dY3CzU2EuJSWur0yp35/21NeHea2+OEh6cFb58gzGhCiOtwLmwVC+RtE4nm+95hEuPT6HUYFR3s+FjH5Hk+T9upO4BuM/PAGn6L/o1aBw+Kc8ls2JMD7i/SkMMdkkl2W37NkAOOIXMgyJYc0ai2EU94U+4GneC3EUyF8fC2ri8YKjtOMCnqTvSQVzrCVEzEdQW2KSqcJGY8OOmmQR3UXUZNhGOILWmL/Ky7+4qvS2c+ZhrVreFjaq0LygnD2WOckxN4eqfCuE7muErj3dy2bfZwfu1gBOSIg/putORhdF+IDVdnSRdmAHviVmvD8ROg5xtI3WZgJPIo80PMUz8yJoSeSgwVaCJCQ4iERxIpAUDYORRBBPILlTJ0Ls+2SfbBffgzN4xbKqWV3qFn1acj63mIsYux4JYSRGc9MjHWC4PiXvAMXNlY0uSqvjVgMHLiutljkP7rK2DjUA3daZZ86ACY4W68+lri6ooQhlS53Nqas5pb4SdfWsznHcnGxeg7o6oH11NdeKr0hdnboxUnPMM6dMKxiHn0xcnWpeX1hczQnZlYpr+xRPb9ZxcdVXBZyyEd4rI2FzXzPNCVSE+Iqy9y4yr699unFKvgZfmHnmpIkhtfS2/yX+bWTVcEjdWWxjsmqbc4l4Jh6xP42vgdHuoHVGm0G/TPG5Evg8t234zGh1Scki7ORHXB0+rywquix8ZlTUxZHcHhw5kltuU8gBMwb6icJFF2lnd24YB2YYtAw/wwBuuKL1ARyYEZXPYECjMZQXd5/N7Q/h4DpyYnUR9cqAu7CImqt075BDeasAqRy6jtOv/TEcmCHQv98+j4SW0viyEmoGUe+IRegq5NO7bZu/jikCaULmp14zLf3cf9E1U6dG9g+K/Du5bUbiSGAc40nRGUWYjkm5Eoiyzetu4S1vQRbzy7al9Lpjs80rM7kUKpWZXKBuKteOg90S/6a2E7PXbe1TiVsze91MNtWD2KHWUIKM0dChSa16kqrnXiBJ1THFXMUigyxbVX8FhH+/wrHQ+wLz68sLQzH+Bcfb9iRzFTCicXfUcx9KyfrxC6uLTrbfTd2lt7ulrHSpYvBFONYu4J9uYTqWh2kVOp3GqJENEumb0oRg2bUFqyprXdT9jhgWDytpoWlcLmtvBVUr17jmRSxFsCMiZunZNPqnmroiZoNycTlexM5OZnNmfiKZq4h5bhJWj5BWp8hl60mC+sBWl1x6Voc+0rbPLTOFrmFunRT4XUoU0+XwauIOO01c91jiGg3ddo65Byd2NbDv7EhinrBfrZKRTkt7YLM/XjiUavpIbp88ANfscMXWOD3d5sStcaKY/2VDUj3/4wvw+D8=7Zxde5pKEIB/jZf68K1eRk2T9DQnaUxi25s+K6xAg6yFRWN+/VniouBsDTZAICe96MMOuIR3Zmdn9qulDuePZwFaOJfEwl5LkazHljpqKUpfM9j/sWC9ERgSF9iBa21E8k4wdp8wF0pcGrkWDjMPUkI86i6yQpP4PjZpRoaCgKyyj82Il33rAtkYCMYm8qB04lrU2Uh7urSTn2PXdpI3yxK/M0fJw1wQOsgiq5RIPW2pw4AQurmaPw6xF7NLuOABUabU+O63L0YreuZrshO0N5V9OuYn208IsE//umo6Gc4847cbkadfN79Hq98Xs59tZVP1EnkR58W/la4TgAGJfAvHlUgtdbByXIrHC2TGd1fMYpjMoXOPlWR2OXM9b0g8ErCyT3z20CDnH88/cokDih9TquMfc4bJHNNgzR7hdzWuFm6XcpeXVzstdxPVOSkN68mDiFuWva16R49dcIBH6On9wNRzsjTKQqkClBSFDzFLHCxdxuvPYOWXwYY0IA84QdtS1L7RVZGxh5zJsWzpuFsid1nJghcYsSwCr5YFXgPg3XnsYQsiv8d3NsOGaTK5hUJnWwdQj9XtTyWpRDWoWs3UYAA1EOrgoAw1cM+S0UBZnHXj7Th/icLPeLK6o9HyZnL1ebycRcO20gegXwZ8pOtO+5E/e56KPLuW07MrZRFPVJkifu8GlEmGxKfI9WMrLxA9A9+zNBH6njJVjVLR9zt6Bv42KE7RV0T0tdLow451gsI5p180/ipdiyHloS1rJdF2aIS/Rku8uB6YT6P19f2Pn1dtaOpzHIab7tRywwWiplO0uSPcm5kiczfMHp7OKgzIlQpdjTC5kQH+IPKpO4/xO8i3vHfkavbY5w3gS2MPAxhnvYh71pAU72BK8ynApfQgV7VSrl3Atbk0VSknze2YzKv6Q88LrZF93r6/uyBXET0Z4ytRfwh4YsvGY14kAXWITXzkne6kgyzx3TNfCFlwzr8wpWs+OoYiSvYTUhRQflfWmQD7yVCarBytkpBEgYkPGBEPetlLbUwP0FF4NhgTOKjhAHuIusvskFvhtg9j9VGAGOgpin9bcD5UVivogVZgwFbQKykHEneTMEz5iufR64hWEvNBloKQr1qWMOYYeiSymOg8T/9XF7JyDdEqAO0nN8BmgMyHg4FcXZjW0VyhSw0w7z6aElZocq4oTa4ySlOgS20wT2GcJuJZSJwmBgr96r+Yrkjw0JiOX1dr1/wTq0xRvSGEzsLGQNUgVEHqWy1UmFLcEy+aFz9kX6GlvjlUOC81tBm/RWOgGm8K9ay3Gt2uL6eDrvXln9n6K5mGrmB4cuyw7/7U6OZfoU8VQoUdlef60WOr+SPsWq75DFmTOn0VAi9kSkM4SAGAF463LKIyCLU0FRLdTppmQtd+WTyhp724YuVL5CO7BNMtja0hdTQ1C9fod9SuyGLL8Q9CvDrAe2JZLOliTlfagGOs7UZz1uUacIYTEDfbzPY9GL
MeG7PR3/17Y95wYuJuYR4emK09Y0PuJv75Ra5yp9ctCW0PoL3F84WHaPGDNNUMLgpI18ya4bjY/eVlY0xZVXL3fbqAahGrOoVUBTMN98Pru8Zw1eXcXEtbUyXkCpOODVfJIXThRXaTCetSHQg3OMsQeANde8ndisd39dL4iubhDY+9dzBlF3Z8McKbdZm76G3zAHvh9hmgFMaI7s+1pxfncPYCdSDPtX1WNJk62MvUQUzcZSHNCb8xdy3refpfpOqsMdRKtbKkV9lyYD65dBlI0l6GxDw0fF+zVqRC1N2XUPcr9VEwteSkfUybzNmQ68UZppZW4pqa1t+KaOv1og0TS27VB6ep6oVZq7/zgEkmxzz1muSm9fqThrklJz2fu6QxoAUmXTM/LViE4WEbmfFHbBx2cxyIwKzrRhvmoEuHhMUHHtXMddW+X0wS0BTv6+FFM2mLrLtmtGEWA/PT1GLk/31mmm9qUzQzX54Oc+ydwL51Eh86sYOd0k+WXGbNfixIrdnfFNkf/Y0//Fz4zut5Lowe07dG67/SR2YF/6Hdli+u4E/PN4sOTeCy3Av9+RuuievTlFvdN4pusgE3qWMzRcd/tlM4qKm/X5MhjvKSejekQL3PhrTF8ArbguFzmbZ1nJXUVvn70zd5ld99uaqy9Q2D+A9972lJsH1u/+iVvAoXVLU/FF2ywgXb1j8ULtBST9nXeCeJnV7fyoH9lK10mNN8KP14X1xLhQ9ul+sT+8yNHsjYG03a7rD3TbCWcEj8mWsxhbgo/vX2jAo4dFCPeH57SFlcsD0Uhvw6e8LOA6amw+/M2Del954eZ4bHbCzpdfRu1lQES6Bl4XLHApICocbhnAlQ7N/vJKuKrN7LRVa0gqY0sHCSZOEhP85ZSIw3WixIAAeHPhrR0dsIhG1I6nekbjnaNtrfroPJ/Q9f/hV8vnw6Pb3QJoKtBPBQH2kWb32VhsMDLez4QzfqcoSY0gdjWJUeQCNUS474pZDj2/6Mu5LxQxWkB6JGoZbXJs7dH9/P+rfawD25e3ryNam3vha0iRD51pTEe0HmRS9AfuOjf/b5y6ItuJrUKWnliZA/NH4z5YzemwYSX3NIAeouTHilBlhxd/DtJvrenR6snv4H7VtZe5s4FP01fkw+FhvbjzF2lpk2ScfTpM3LfDIIowYQFcLL/PqRQKzCcdIYnHSSl8DVgjnn6ujqSvR0099cEBC6n7ENvZ6m2JuePu1pmjocG+wft2xTi6EIw5IgW1QqDHP0LxRGRVhjZMOoUpFi7FEUVo0WDgJo0YoNEILX1WoO9qpPDcESSoa5BTzZeo9s6qbW0UAp7JcQLd3syaoiSnyQVRaGyAU2XpdM+qynmwRjml75GxN6HLwMF4apeXtnX9yB7/6Gajd35s33k7Sz85c0yV+BwID+ctcmcLW/f5Dg29Wn+defV6rpuYZooqyAFwu8kM8BZW8LyQpZULw63WZ4EhwHNuR9qj19snYRhfMQWLx0zTyI2Vzqe6LYBpGb13WQ55nYwyTpSHccaFgWs0eU4EdYKrGH4wVjQZ88870FPitIKNyUWBc4XEDsQ0q2rIoo1frivbe1+3XhIWpGu1vyjqweEE65zLsugGcXAvtmHq4f/Nn1l8Xi0p2YN+vZ5OvlxbyBh7Pbq9dhL8HtaDvgNhbGwGgRbuOtoa1KaEtQs8Ee8ksrJt52QoD1COl+zKsEOTigQhDVUY2RAAewRcjVcRVz1ZAxb4JcU9rCXJMwpyB6PJTQ1JwdqvYADpucfWwMddCms9e1pQn5br1dl5DH1IWkFY3vClVdOzaq/Y40xEPh5fH1RH9zejKQ8PdhFCWxS40G1hkLNuEzROQJfI87g/a1tzaFGm9mCo3SYuV00CIBg8Fb8/+hRMAdItRkeAEUcHU3PI72gl8t+ZULAtvjBQecZ9ksO7L7TUNipC10o92gssbI4NhDYiQxcg8iv8wI8DmcwSIKE2CUg1DSyfxbR1sbHhvtsYT2JxTEmxLc7xheXT82vFn6poTvrmiRvSOtYgY8tAy48jOYGP76hCOBLOCdiQIf2ba3a0Yu6FEaZoC2KJDi9qNHmKq8TCVxQJEPn/BtmYyqNItJs2EefY+kSZOA2iFps0909HBhBpPbS+THfwZabE0bEjm/dVw0rMVFow7Dokb85TEzJWCJgwXgbQ8Y+lgWHDhOU+ijG/pYt1sEfVTz+i4zDI2gy7mdL9CP38G8W0fy+PohJ2vOLBK8RyiP7pVNGZp0UWSjFSuwPBClzzR+xnwTZ+KBR3hiMWnl2OTWbAGVtQ6LB2dVYsMZQejYoNyq4QEBPAkrVQpKM6MPyBKx6fasl4hkug3C+LJtFCzrZh8FJxmWvETrh5vdPzwKQbDrZyVxg9TS9HBsX25DHvlFOF9UMjrSvqr9M3PYYEvQzqzvzo+1o0uCnOlqz4/tvgUGcKH9Zn58jgi0kkjr/+nDR9diOTfC3NNBy5iAhZesZ0oiU8P3RYExIJaIiwfKixY1aRiXHRFIuoopFkG0mnqdlbjulCe8KIHwHHupMyfl3PIZrwpDANczewnn2ZsMJ4Vjwryg8FXu+Xk5W+fR7V/QAxThYNZQWzOLm4gCQs/4YYkd3SVLgua2MLB/sWUEl35VYPQzLpalKmyJs2LUVGr0htPnLWDaGh392l5GQy5d07scHHIqSxoDsORIu3yj4gc5ijm/xcJQ8MaQVwqO6oNJqYvV69b0EY6JBZ8AQYDL3mAJ6X4l4YA8STZJ8FmVKzVTJ5reYsR9OU/41w4tjGrkp+8jGtX4z3/FK5axch6hJph5KuhDLT/UskW11Gtbj3pjiqdLuWzIi74HvYQbRL/xiqequPvey9Jz/G66Eb0kN9vSzS0kbKQnI3L68mNxZeV9ait3r/JmmtS29OabH9n8rLemvZM1fPjDu/b968mPlW/OnZXrNqQQa9K7ayPkQ3o/pPeQ0nvS3x+otqW8jSNDzvN+CO8rhDc7Dl8S3qcEqW3dVUc7sgaH1139bvYQXW3v/1n8cG6+nG8j0xw36K7JddfmbgR463xvP5Lc7m1se+ZfCPCbpUhPSf4ZPUJquT15p7TVA0z1RW/TCQOjQUy0A4hJI93POED5im8NOoJ1aOyHtd8lqnKyOPSSbCnFHNs4DDGhH8PnpTyPVOV0rCv5n7p/MCnjU2XYIfPPOJH5uu9H9n6u00lue9B/KRN6x0TIue7nnsxUHJ7+Vkzzt2ROOkPY4s4auy0+zEsDlOLzRn32Hw==7Zpdc5s4FIZ/jS+bAQQYLhvno9OZ7GTqbbbdmx0FFFsTjByQY7u/fiVb4kNSCLbBcZskM4l1EAL0nPfo6OABGM1W1xmcT29IjJKBY8WrAbgYOI5tWz77xy3rrSXgLW6YZDgWnUrDGP9CwmgJ6wLHKK91pIQkFM/rxoikKYpozQazjCzr3R5IUr/qHE6QZhhHMNGt/+CYTsVTeFZp/4LwZEqLBxZHZlB2FoZ8CmOyrJjA5QCMMkLo9tNsNUIJnzw5L9HTl6evV9ercxz+m9w/X8Lnp+Gn7WBXu5xSP
EKGUtrt0ILlM0wWYr7Es9K1nMCMLNIY8UGsAThfTjFF4zmM+NElcxlme8BJMiIJyZgpJSniJpLSKzjDCfeav/GMeYBj/YWW7O83MoMp6xLDfLoZ1maNKZ0l4iM/VbiRHbB2y4cXk/SMMopWFfRiMq4RmSGarVkXcdQVZwi/dgXlZekkXhhubdOKgwDZEQrHnBQjl5PPPoj534EF6JrFwAGxh4LYbQekgUGNFW/cQkpRlm4sjsWtOc3II6pcOnDuge/3CJBptUbQ1wm6UrtVgoWgOyfoni7B4+Px6nQCnY7tGuj4fcHxNDjZIqVsHhsY2S8x6pbFfeC5nqWzfwgiFEU9MvKdGqPQwMikoN5CoK8xuru5ecd8HFAXke29NaHh2yUM+69PTo+IwLCOaKgTckxxzg77QhRoiFiOyx76/coIhHVGbx7nQg3RZIFyhsh6ZF7Ld2CHpg3nm9+DGR6LiAMMsrFMyVtfSOR+tcKEbUkpxCnKDsfh+6ORZR2MY5LAPBdXzR8RjaayIfemVp87prDFamSCBnqDZv8G0I6GwxTWjoujxRYWpfFnXtZhrYh7M+aBH60w/cHBnHmi9VNMJv98sRLMNo21aBwGhbHI1j/kuLzxc3P9oSfb5VU3LXnZ3WjmZJFFqKGfyHgpzCaoaTyRd6G4Vu3SfaNavjCgl7YMJZDi53qNzOQP4gq3BPMMoshLLfvMqzuf7dUH2T65OK9allKGUvMnbdHdTo02EHMiuK50m/MOecMtqyuOcNbS37cjlt5fzOoBgmhRETAL4gX3fM07jy+kjvUQtNSDXK5PRBC+FSpLozJEWzm4vpKqKuN0pAZXqc7YvncENeglmA81dKMG+7TUYKuJ4m+mhqHVfFtq/3C3/o51DLXpxbRbEmuCY8km1Tbrsiij12lggie8rBIxB2c5Njjn+SqOYPJZHJjhOOZjGxPsegp+mml1i4KOSTr9bUxblNyMUVOZu1dmWkuQRey0K5GzjKPm2LlvnO44anoto2ZwUkHTCdScuniHvnNODbT0XH2t8kLk7Cz46EXIMWKAIUXMairqf8QhdXtv1Qga3k8ba5ZdxKFvc/T1KY5/XYDl+P72P//O/v7dhFSD+H7ebqovn4Ft4NPX+00zH72ofIeZJshD3sDpTy/9O9JLJSenpY66qP0bOcmL7byeH1jw2mMvdLrreWOEOpEF3VZcz1brrm2X8yIPkD7cU4lMu+GgeVuj9pf31dW2xiwfveT/IZ9O0mFwWuoBijOqTr+veuTmvnf1vFJEsF3zffWrHudN1bPfZrIUTKG5XtTTuKi8Kp/w1OVzZlV+9izIFXciQ77Xj5jUd5sHLi2sWX5Pe9u9/LY7uPwf7Zpdb6M4FIZ/TS5b+QMIXHbSmVlp1aqadnZm9mblgpuwJTgCp0n216+dmAR/pCEJpKmmGqmDD8YEP+e8Pj7Qw4Px/GtBJqMbltCsh0Ay7+HrHkIQgkD8Jy2LlSWULWkYFmmiOm0M9+l/VBmBsk7ThJZaR85YxtOJboxZntOYazZSFGymd3timX7XCRlSy3Afk8y2/kgTPlJP4YON/Q+aDkd8/cDqzJhUnZWhHJGEzWom/LmHBwVjfHU0ng9oJievmpeb8jv4l9G+3/8zGz3+83D3cH97sRrsyz6XrB+hoDlvd2jF8oVkUzVf6ln5oprAgk3zhMpBQA9/mo1STu8nJJZnZ8JlhO0pzbIBy1ghTDnLqTSxnH8h4zSTXvOQjoUHIHBLZ+LvNzYmueiSkHK0HBaKxoiPM3UoL1VuBEPRbvjwapJeaMHpvIZeTcZXysaUFwvRRZ311BXKrz1FebZxEj+KVrZRzUFQoDoS5ZjD9cibyRcHav73YIHbZtFDOPFpmHjNgLzCQGMlG3eEc1rkSwsC0lrygj3T2q1D9IiDoEOAIlY1goFN0KtiVyOIuiLonS/B0+PxdTqhTQd6DjqdhZdvwSmmORfz+AojuI1RuyweQ9/zgc3+KYxpHHfIKEAao8jByBVBXleMAovRXzc3vzEfhPUggv5bE+q/XcJw+PqEOkSE+zqivk0IuXQORl0hCi1EIscVD/37hhGOdEZvrnORhWg4paVABJ6F18od2LFpw6flv6MZnooIwo6wAa7krSsk1X61xkRsSTlJc1ocjyMIBgMAjsYxzEhZqruWz5THo6pR7U1BlzumqMFq5IKGO4MG3wG0k+FwydppcTQoJ9A8uZJlHdGKpTenUvjpPOU/JZhLX7V+qcmUx9dzxWzZWKjGcVAEi2LxsxpXNn4t79/3q/bmrstWddv9aJZsWsR0d8bLSTGkr42n8i6aaNUu2zfq5QsH+spW0Izw9EWvkbn8Qd3hjqUyg1jnpQBe+rrzQV8fZPXk6rp6WcoYysyfrEV3NTXWQMKJyKLWbSI7lK/8ZHPFgdjw99WIG+9fz+oRAdGgpuMOiC3uucs7Tx9ILcdD2DAequX6TAIiAJGxNBpDNA0HLzBSVWOclqLBM6ozMPBPEA0N6mMf0XBQNMDzigZoJorvLBr64PWfZfaP9uuPwCmizS543rHECjiRbHJrs14VZew6DcnSoSyrxMLBRY6NP8l8NY1JdqVOjNMkkWM7E2w9BT/PtLpBQccVOt1tTO2iaDPVNOZux0xbCbLSTlhTzo2OurXzUJ1uWTX9hqoZnpVootDMqdfv0PfOqbGVnpuvVbYoZ2viY9eJ76kATDgVVldR/0OHzO090Ag63k87a5ad6VDl0Xvr0JEb9QNyuHemQ/isdAhipEuHuSFvqkJr+TKyqrazN4iQ8z7bfhf03L+rrWyMT39cXwSLK/z3t5vvJbydZk+LVlZxXbV2vPPyWo+lS8+P6vF0AS7hrg2YbN3RQgT9UqW7iDJsR5mbwHmV0DygeyEOD13sI2OghgW01jzbXuqX6zuYFCymZbltpa859lst+qf5zGwLnvpCjn3bz9qo0ruB2S+I31KK0EFa5NeVCOyQoZYVx2uqONF5KY7xMQkyv0BoqjgQ6QNh01W7Vhz79fn6U64P1VmeDU8nOqK5+QZ6hXjzJTn+/D8=7Vpdc5s6EP01fmwHiU8/Nm7a5s5NJrfONM1TRwHFKAHkEXJs99dXsoUBidrEBsd3ksmMwy5CmD17VquDB/YoXXxlaBpf0ggnA2hFi4H9eQAhAJYn/knPcu0JpCUdE0YiNah0jMlvrJyW8s5IhPPaQE5pwsm07gxpluGQ13yIMTqvD3ugSf2uUzTBhmMcosT03pKIx+opXKv0f8NkEvPNA6szKSoGK0ceo4jOKy77fGCPGKV8fZQuRjiRwSvicnET336f3k3PfesxeAK31xN+82E92ZeXXLJ5BIYzvvfUd/dX83/8X/8Nb6eP367AxdgbAnWJ9YySmYqXela+LALI6CyLsJzEGthn85hwPJ6iUJ6di5QRvgeSJCOaUCZcGc2wdNGMf0EpSWTW3JBUZAC0rvBcfH6nKcrEkAjl8WpaIIyYp4k6lJeqNAKBsFs+vArSM2YcLyrQq2B8xTTFnC3FEHXWUVeovHZUKOZlkrjD4doXVxIEFpmDVGJO
NjOXwRcHKv4vwAJ0gsXuwG+JdQ0TaVwjzjHLVh5oSW/OGX3CBdwDaAfw3vY8LQ2EP3JxEDk9AujWAfQbAPSbAAR9AQgNANks4wKDLTiCXnA0QLoPXMe1TJAeghCHYY8gebAG0qa6VlACRYGtouT0BZJtgPTj8vINAwTtOo2A99oIOSdfB2HrOqiWw77As/06eIGJ3QanKnZg2Bd4rgGeaLDEQ79dgtlDjWCvXgI9A6PJDOcCI+tJ5Ljs/w9qAUVQz1Z/B4N4LEiamj9oNUAC+4LENyAR+yGOSIbZ4Wh43mhkWSeLhqMTpGkFakLD7guN4B2NkhtN5eqoaAx39wM4iz5JrUBYYYLynMiCjheE/5S4fHSVdadiKY8/LxRkK2OpjMMwEVCw5c9iXmncre7vu4Vd3nVlFbd9GZg5nbEQ7+5xOWITzHd3WjiqSShmalS3VA3IFz6GE8TJc114aUoHdYdrSmRnsOlELfDRrVcC6NYnWT+5uq6qdWhT6X2RsWlfh8aYSCQRWlaGTeWAfMtX1skCbS3d1zOWyb+J6gE6QQvRppkQf0nPXdl5fCJ1zAe3JR/8k+KDZw21dVGboi0bHE/vQLWJOmKDoykywHePwIYWstk7G/ZhQ3BabAB6l/h/Y0Ngbf9ebnPj1Xo8OAbbTI3zmkYG4USvyY1NePFqwJRHUEImUlsJRYKLFts+k+0qCVHySZ1ISRTJuRv763oHfppddQuhpok7ve03gSmDtquaWux2RNpokFXtBJXKWdbR5tq5b53uuGrCllXTPamqCQO9pzZyqnVPbRvtud+uq+6s+JjK8NmMJJxkwtmk4r+Xoe2vzBpkLwCOWoZMudgsQ4JJY2VSxmM6oRlKzkuvFvlyzL+UTlWAHzHnSxVhNOO0Hv6OSwXwWtYKr2WtaF0EDgPD1IUvsoRk+MMPInhAH3IDm7ej4uuvyTZy16up+MDUjNst4fsuqAdqY3tsm15pCQdteXmcNRzob2h1MavtCq7/5MV4C9jRzgcUmaipb/3uTEzR/p0OndDBPjE21JOrMzZA7zhs0HSAl7JBmOVPDtfDyx9u2ud/AA==7Vpdc+I2FP01PLJjSf7iMSGEbadps8smDX1TbAW0ayzGFgH66ythGduSlziAgc5mmGGsa/ka33PP1dE1HdSfrYYJnk/vWEiiDrTCVQfddCDs2a74loZ1ZnAtOzNMEhpmJlAYRvRfooyWsi5oSNLKRM5YxOm8agxYHJOAV2w4SdiyOu2FRdW7zvGEGIZRgCPT+jcN+TSz+o5V2D8TOpnmdwaWOjPD+WRlSKc4ZMuSCQ06qJ8wxrOj2apPIhm7PC645z6MfyNfxwysx8G0+3R/1e1mzm7fc8n2ERIS8+O6hpnrVxwtVLzUs/J1HsCELeKQSCdWB10vp5ST0RwH8uxSZIywvdAo6rOIJcIUs5hIE4v5LZ7RSGbNNzoTGQCtP8lSfH9lMxyLKSFOpxu3QAymfBapQ3mpSiPgi3HDh1dBeiUJJ6sS9CoYQ8JmhCdrMUWdtdUVKq9thfKySBKn18ts01KCwDxzsErMydZzEXxxoOL/DizQUbB4O/A7Yl3BRA7uMeckiTcWaElryhP2g+RwdyDy4TNyXS0NhD10iB/aLQLoVAH0agD06gAEbQFoGwAmi5gLDHbgCFrB0QDp2XdsxzJBevEDEgQtguTCCkjb6lpCCeQFtoyS3RZIjgHS493dLwwQRFUaAffcCLkXXwdh4zqolsO2wENeFTzfxG6LUxk70GsLPM8ATwgs8dC/LsFQTyPY2Uugb2A0WZBUYGT9EDku5f9BElAE9XrzORjEU0FSJ/6gVQMJbAuSngGJ2A9xTGOSHI6G6/b7lnWxaNg6QepWoDo0UFto5ArxA44yEc4HBzDgMFAgcXgluwViFEQ4Taks6WRF+ZME5pOjRmMVTHl8s1KYbQZrNTgMFIFFsn7K/crBeHN/z8nHxV03o/y270MzZYskIDvmKZXLcTIhu/wprUXCShPFzI3ypqoG+tyWkAhz+lptvdTlg7rDPaNSG2y1qAU+OdVaAJ2qk+zJ1XXlbofmSldGxrY9C43hSCQRXpemzeWEdMdP1tkCkZbvmcci+7dRPYAQDdo29YT4SXq+lZ2nJ9KR+eA15EPvovjgWj1tZdRcNGWD7eoaVHN0JDbYWk8GeM4J2NCgcfbBhn3YkOugS6ED0IXi/40OvrX7dzn10qvxfHAKupltznsWGowTapMb+/D87YDZIcERncj2SiAyXIhsdC0FKw1wdKVOzGgYSt+1CruqwS9TVzfo1dRxp7UtJzA7oc3Kpha7NyJtKGRVPEGpdBaFtL547luoj1w27YZl07uoqgl9XVQbOdVYVCNDn3vNZPXRio/ZHL5e0Ih3aSysdZ38jzq0+7VZTesLgFPWIWg2W9qVbwfu8PcQf2eqQ85F1SGAYLV06DvyplVIf3MP3Xa29gBqP7iq3g5WV+7t8+Cv73j45dtoMRx+/wetbmh3XzKUMrZahN54lWUfKcULMhb8G1fod5IlGpnU+LyeXw+uBt5D9Dhyfv/yx+NDl3cva2Pju9VE09/8NF6f81q61sbHX513RbXuDxDWPGEBSdOfrc+l/D3XUn2KxRc6bj1C5eUXOWam7dFcF8Piz2kZxMU//NDgPw==7Z3Rcps4FIafJpdhkAQCLpM0bWfb7qZNO9vuTQeDbNgAYjC24z59hS0MWAQ7MUZsVrnImAMWRv/5dMSRBBfoJn58l7lp8In6JLqAuv94gd5cQGgAwP4XhvXWYOvm1jDLQn9rApXhPvxFuFHn1kXok3njwJzSKA/TptGjSUK8vGFzs4yumodNadQ8a+rOiGC499xItP4d+nnAr8LUK/t7Es6C8sxA53titzyYG+aB69NVzYRuL9BNRmm+/RQ/3pCoqLuyXuIr78fnN95tguKrf4P53XQB08ttYW+f85XdJWQkyV9cdPrwKZtOYvNj+mX94evbL+Yf8c9LiLZlL91owSssI3O6yDzCrBmZEnbOzefJuqgCN/En9JHXR74uK3m+CuPITdjW9Tx3s5y7AasmdB3QLPxFk9yNmKUweEEY+R/dNV0UF5NnhJQbtWO/MjPbC5iR/aDwlzvZnGvz/eKAMCHZ13VKeAnFYXSR+MTnX9qJVXxjFrnzOf88fyC5F/CNKSuK/1Zgs+0jK5oLsiRZTh5rbsYr/h2hMcmzosL4XtvmPsQhQpbGKVpVPolLZIKaP1q6wVngHMx2hVdasw9c7udIb3RIv6fvKghzcp+6XrG9Yu1EoVUeR6VCPVT9NIyiGxrRbHNGNJ1Ooedt/CmjD6S2x8cTbOJWsbpd/Gi1jKZYhu5oumXquz9Rul2jUZeuLKZ/5UxBuYTkK5o9MCOrjDBfj0BBAnyTWG0KOthC7lkVBE5TQggYcNJlw4JsgkzEZ8GLb5JoQle3leF6Y2A76i1qu4ht8mwa5qsiojJLQqvG+m0YlU0zSfzyCK+QPfS2Rn4IeL5mvEE53AyxXzIjnSVy/Yoq6vSBjERuHi6b8b9/NS1BTYZNVoTK6Vzh18A
P61ADsumzFX2n0GeNij4khkCl5q6JPKgmwn2ryb96R0N2KVVDoINmS3DpOEjXHGfXm8LNIreXyEvZ85HdzzrBbcQQXPWcwmRK/+cNN9rr+pa3vtJabSRGWcX5Mzi3B+Ic4D3OmZs4MkEXoz2TJN2dTiFeKgWwpWHZlDuK8qd9+CDlpS7DU+7YMiEvr7vmNkvCPLhIgoyGdUa67RttrNtwgvB5E1l792G2IR11AwiaKdSfgTocCHUkdNxtUybqUHCbNFgzadxI4f4U7mMI7YY4wKR4fwbvhjTei6Qdksa7ODhF84BkkmH33XmwK2IwjiFreaVzrBJuHVnRwxzLS7hBYNaHL/eGk8/NsZhwC5MoTIphkmWY5SEdxWCJzKi9f0Nu4hHQrtJuJ9EuLe1mO8DRbGm0i1k3JlviuxFVxD9NPDaQdOKxiu+nJNrxUPEdWHvEW8iy68TDQYnHXQNq8XYuqXTWx5RuN8AIWFfRvcOVD7M+WHTfZx1b2NasinVgDgu7GN6TbWAfC+ljyr4hJH/SE1YDa6eQbg02sCaSbiENyYrqljiwlnsMtXyTgFOwi7CPIaxbamjtJNihNNgBNDUMq7A+LOxQcBsmdK7C+hOkO+aQYR3pywB9T9589q1Z9jNZXv1pR5di89y1aGu3XkpQ8rUs26q7C295RH/oZyUXMPdSbrYtOIOBW5wBA+dM3iA2+//1dVydTv/SZVwI6Bp0qj/bkEzxK10O1CVePVp3+XI9WLce1/tqhGOH00ykQ+ZGu1htnu0WvOu6G7Ga5qPIqL8oy9YP6cJ6I6AZNdIdSzLpr3Tp0SCkO9JIh1Am6WLmZkmjRTyGmD4i0pFhNVGXTDoQu+YK9WNRLx81IoF1IDWqA7EL782Y0qmCvS6TaTqaXu/BS4b9tS7oPwn2ljky7ZUnrQuPTBtoJpQFe8uDAyYR9YoBc3l9+QMTW/vhWebq0nYpVMf8BICH6pkLk9yQga36lNaB+RV75psHfxTgSiS4j5T5eTBnLfC4bseR6qQ/eb95EHskrZOOgNOY/WKcbUlK54U3uI8Lw1juy8fEvA21OvKO7GQ7gor5lzOPZIV66GCo2VBSqEfi4sUq1CvsW3r02Gh5sOXApItLBxXpR5Pe+0P2jibdRMx3Bli50nndLTfl8jiXclNeZNSlI6wSaycgPFRiTURYx3Y9WKNhEW55IOdDmKpI3Yq5Ae1GB112Lr3lgV0K+aORl5aKAyY2NatCHg4ctsVcnE+m7iLKFfat2JtAlzVe/v6bHX5wvwd3i3u69n7E/3z7633LTFZBrya4B9Trbf5oo+9VbNy5eU6yZGOButGqWU9TTA29KZpVvj2g/rIA2CKTeS6ZjlhZ0OTmgEx9jDH3Vdl7aUbLaanslrpG56prMVH15HRedo35nv83KpD7cot7u1E4K3zZY9VJmP26qLHiaVpXfEcc+v4mrrYp2UTySeYaDEl57wbYn73fpq41pLpHPA7rFJJeFGr6mhlvjYykrhec6PPFJF+nRByrU1AdEhobh4UG5pBKH/HwimdRdSDz0lM92vtxXjYwYtoljdykuAOhG2LSlGbic9cUMAeFto8ABvWjNNusXkm2vc+q3uuGbn8D
\ No newline at end of file
diff --git a/docs/hypervisors.md b/docs/hypervisors.md
index 02dd49aa12..e380450b20 100644
--- a/docs/hypervisors.md
+++ b/docs/hypervisors.md
@@ -33,6 +33,7 @@ are available, their default values and how each setting can be used.
[Cloud Hypervisor] | rust | `aarch64`, `x86_64` | Type 2 ([KVM]) | `configuration-clh.toml` |
[Firecracker] | rust | `aarch64`, `x86_64` | Type 2 ([KVM]) | `configuration-fc.toml` |
[QEMU] | C | all | Type 2 ([KVM]) | `configuration-qemu.toml` |
+[`Dragonball`] | rust | `aarch64`, `x86_64` | Type 2 ([KVM]) | `configuration-dragonball.toml` |
## Determine currently configured hypervisor
@@ -52,6 +53,7 @@ the hypervisors:
[Cloud Hypervisor] | Low latency, small memory footprint, small attack surface | Minimal | | excellent | excellent | High performance modern cloud workloads | |
[Firecracker] | Very slimline | Extremely minimal | Doesn't support all device types | excellent | excellent | Serverless / FaaS | |
[QEMU] | Lots of features | Lots | | good | good | Good option for most users | | All users |
+[`Dragonball`] | Built-in VMM, low CPU and memory overhead | Minimal | | excellent | excellent | Optimized for most container workloads | `out-of-the-box` Kata Containers experience |
For further details, see the [Virtualization in Kata Containers](design/virtualization.md) document and the official documentation for each hypervisor.
@@ -60,3 +62,4 @@ For further details, see the [Virtualization in Kata Containers](design/virtuali
[Firecracker]: https://github.com/firecracker-microvm/firecracker
[KVM]: https://en.wikipedia.org/wiki/Kernel-based_Virtual_Machine
[QEMU]: http://www.qemu-project.org
+[`Dragonball`]: https://github.com/openanolis/dragonball-sandbox
diff --git a/docs/install/README.md b/docs/install/README.md
index 9ad55f0f21..0ed42d87f5 100644
--- a/docs/install/README.md
+++ b/docs/install/README.md
@@ -79,3 +79,6 @@ versions. This is not recommended for normal users.
* [upgrading document](../Upgrading.md)
* [developer guide](../Developer-Guide.md)
* [runtime documentation](../../src/runtime/README.md)
+
+## Kata Containers 3.0 rust runtime installation
+* [installation guide](../install/kata-containers-3.0-rust-runtime-installation-guide.md)
diff --git a/docs/install/kata-containers-3.0-rust-runtime-installation-guide.md b/docs/install/kata-containers-3.0-rust-runtime-installation-guide.md
new file mode 100644
index 0000000000..122e43b512
--- /dev/null
+++ b/docs/install/kata-containers-3.0-rust-runtime-installation-guide.md
@@ -0,0 +1,101 @@
+# Kata Containers 3.0 rust runtime installation
+The following is an overview of the different installation methods available.
+
+## Prerequisites
+
+Kata Containers 3.0 rust runtime requires nested virtualization or bare metal. Check
+[hardware requirements](/src/runtime/README.md#hardware-requirements) to see if your system is capable of running Kata
+Containers.
+
+### Platform support
+
+Kata Containers 3.0 rust runtime currently runs on 64-bit systems supporting the following
+architectures:
+
+> **Notes:**
+> For other architectures, see https://github.com/kata-containers/kata-containers/issues/4320
+
+| Architecture | Virtualization technology |
+|-|-|
+| `x86_64` | [Intel](https://www.intel.com) VT-x |
+| `aarch64` ("`arm64`") | [ARM](https://www.arm.com) Hyp |
+
+## Packaged installation methods
+
+| Installation method | Description | Automatic updates | Use case | Availability
+|------------------------------------------------------|----------------------------------------------------------------------------------------------|-------------------|-----------------------------------------------------------------------------------------------|----------- |
+| [Using kata-deploy](#kata-deploy-installation) | The preferred way to deploy the Kata Containers distributed binaries on a Kubernetes cluster | **No!** | The best way to try Kata Containers on an already up and running Kubernetes cluster. | No |
+| [Using official distro packages](#official-packages) | Kata packages provided by Linux distributions' official repositories | yes | Recommended for most users. | No |
+| [Using snap](#snap-installation) | Easy to install | yes | Good alternative to official distro packages. | No |
+| [Automatic](#automatic-installation) | Run a single command to install a full system | **No!** | For those wanting the latest release quickly. | No |
+| [Manual](#manual-installation) | Follow a guide step-by-step to install a working system | **No!** | For those who want the latest release with more control. | No |
+| [Build from source](#build-from-source-installation) | Build the software components manually | **No!** | Power users and developers only. | Yes |
+
+### Kata Deploy Installation
+`ToDo`
+### Official packages
+`ToDo`
+### Snap Installation
+`ToDo`
+### Automatic Installation
+`ToDo`
+### Manual Installation
+`ToDo`
+
+## Build from source installation
+
+### Rust Environment Set Up
+
+* Download `Rustup` and install `Rust`
+ > **Notes:**
+ > Rust version 1.58 is needed
+
+ Example for `x86_64`
+ ```
+ $ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
+ $ source $HOME/.cargo/env
+ $ rustup install 1.58
+ $ rustup default 1.58-x86_64-unknown-linux-gnu
+ ```
+
+* Musl support for fully static binary
+
+ Example for `x86_64`
+ ```
+ $ rustup target add x86_64-unknown-linux-musl
+ ```
+* [Musl `libc`](http://musl.libc.org/) install
+
+ Example for musl 1.2.3
+ ```
+ $ wget https://git.musl-libc.org/cgit/musl/snapshot/musl-1.2.3.tar.gz
+ $ tar vxf musl-1.2.3.tar.gz
+ $ cd musl-1.2.3/
+ $ ./configure --prefix=/usr/local/
+ $ make && sudo make install
+ ```
+
+
+### Install Kata 3.0 Rust Runtime Shim
+
+```
+$ git clone https://github.com/kata-containers/kata-containers.git
+$ cd kata-containers/src/runtime-rs
+$ make && make install
+```
+After running the commands above, the default config file `configuration.toml` will be installed under `/usr/share/defaults/kata-containers/`, and the binary file `containerd-shim-kata-v2` will be installed under `/usr/local/bin`.
+
+### Build Kata Containers Kernel
+Follow the [Kernel installation guide](/tools/packaging/kernel/README.md).
+
+### Build Kata Rootfs
+Follow the [Rootfs installation guide](../../tools/osbuilder/rootfs-builder/README.md).
+
+### Build Kata Image
+Follow the [Image installation guide](../../tools/osbuilder/image-builder/README.md).
+
+### Install Containerd
+
+Follow the [Containerd installation guide](container-manager/containerd/containerd-install.md).
+
+
diff --git a/src/agent/Cargo.lock b/src/agent/Cargo.lock
index abfca3a780..e26f7bd032 100644
--- a/src/agent/Cargo.lock
+++ b/src/agent/Cargo.lock
@@ -98,6 +98,12 @@ version = "3.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37ccbd214614c6783386c1af30caf03192f17891059cecc394b4fb119e363de3"
+[[package]]
+name = "byte-unit"
+version = "3.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "415301c9de11005d4b92193c0eb7ac7adc37e5a49e0ac9bed0a42343512744b8"
+
[[package]]
name = "byteorder"
version = "1.4.3"
@@ -224,6 +230,12 @@ dependencies = [
"os_str_bytes",
]
+[[package]]
+name = "common-path"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2382f75942f4b3be3690fe4f86365e9c853c1587d6ee58212cebf6e2a9ccd101"
+
[[package]]
name = "core-foundation-sys"
version = "0.8.3"
@@ -322,6 +334,17 @@ dependencies = [
"libc",
]
+[[package]]
+name = "fail"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec3245a0ca564e7f3c797d20d833a6870f57a728ac967d5225b3ffdef4465011"
+dependencies = [
+ "lazy_static",
+ "log",
+ "rand 0.8.5",
+]
+
[[package]]
name = "fastrand"
version = "1.7.0"
@@ -442,6 +465,17 @@ dependencies = [
"slab",
]
+[[package]]
+name = "getrandom"
+version = "0.1.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce"
+dependencies = [
+ "cfg-if 1.0.0",
+ "libc",
+ "wasi 0.9.0+wasi-snapshot-preview1",
+]
+
[[package]]
name = "getrandom"
version = "0.2.7"
@@ -453,6 +487,12 @@ dependencies = [
"wasi 0.11.0+wasi-snapshot-preview1",
]
+[[package]]
+name = "glob"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574"
+
[[package]]
name = "hashbrown"
version = "0.12.1"
@@ -584,13 +624,14 @@ dependencies = [
"clap",
"futures",
"ipnetwork",
+ "kata-sys-util",
"lazy_static",
"libc",
"log",
"logging",
"netlink-packet-utils 0.4.1",
"netlink-sys 0.7.0",
- "nix 0.23.1",
+ "nix 0.24.2",
"oci",
"opentelemetry",
"procfs",
@@ -621,6 +662,47 @@ dependencies = [
"vsock-exporter",
]
+[[package]]
+name = "kata-sys-util"
+version = "0.1.0"
+dependencies = [
+ "byteorder",
+ "cgroups-rs",
+ "chrono",
+ "common-path",
+ "fail",
+ "kata-types",
+ "lazy_static",
+ "libc",
+ "nix 0.24.2",
+ "oci",
+ "once_cell",
+ "rand 0.7.3",
+ "serde_json",
+ "slog",
+ "slog-scope",
+ "subprocess",
+ "thiserror",
+]
+
+[[package]]
+name = "kata-types"
+version = "0.1.0"
+dependencies = [
+ "byte-unit",
+ "glob",
+ "lazy_static",
+ "num_cpus",
+ "oci",
+ "regex",
+ "serde",
+ "serde_json",
+ "slog",
+ "slog-scope",
+ "thiserror",
+ "toml",
+]
+
[[package]]
name = "lazy_static"
version = "1.4.0"
@@ -857,6 +939,7 @@ dependencies = [
"bitflags",
"cfg-if 1.0.0",
"libc",
+ "memoffset",
]
[[package]]
@@ -935,7 +1018,7 @@ dependencies = [
"lazy_static",
"percent-encoding",
"pin-project",
- "rand",
+ "rand 0.8.5",
"serde",
"thiserror",
"tokio",
@@ -1199,9 +1282,9 @@ dependencies = [
[[package]]
name = "protobuf"
-version = "2.14.0"
+version = "2.27.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e86d370532557ae7573551a1ec8235a0f8d6cb276c7c9e6aa490b511c447485"
+checksum = "cf7e6d18738ecd0902d30d1ad232c9125985a3422929b16c65517b38adc14f96"
dependencies = [
"serde",
"serde_derive",
@@ -1209,18 +1292,18 @@ dependencies = [
[[package]]
name = "protobuf-codegen"
-version = "2.14.0"
+version = "2.27.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de113bba758ccf2c1ef816b127c958001b7831136c9bc3f8e9ec695ac4e82b0c"
+checksum = "aec1632b7c8f2e620343439a7dfd1f3c47b18906c4be58982079911482b5d707"
dependencies = [
"protobuf",
]
[[package]]
name = "protobuf-codegen-pure"
-version = "2.14.0"
+version = "2.27.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2d1a4febc73bf0cada1d77c459a0c8e5973179f1cfd5b0f1ab789d45b17b6440"
+checksum = "9f8122fdb18e55190c796b088a16bdb70cd7acdcd48f7a8b796b58c62e532cc6"
dependencies = [
"protobuf",
"protobuf-codegen",
@@ -1231,6 +1314,7 @@ name = "protocols"
version = "0.1.0"
dependencies = [
"async-trait",
+ "oci",
"protobuf",
"ttrpc",
"ttrpc-codegen",
@@ -1245,6 +1329,19 @@ dependencies = [
"proc-macro2",
]
+[[package]]
+name = "rand"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
+dependencies = [
+ "getrandom 0.1.16",
+ "libc",
+ "rand_chacha 0.2.2",
+ "rand_core 0.5.1",
+ "rand_hc",
+]
+
[[package]]
name = "rand"
version = "0.8.5"
@@ -1252,8 +1349,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
- "rand_chacha",
- "rand_core",
+ "rand_chacha 0.3.1",
+ "rand_core 0.6.3",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.5.1",
]
[[package]]
@@ -1263,7 +1370,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
- "rand_core",
+ "rand_core 0.6.3",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
+dependencies = [
+ "getrandom 0.1.16",
]
[[package]]
@@ -1272,7 +1388,16 @@ version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7"
dependencies = [
- "getrandom",
+ "getrandom 0.2.7",
+]
+
+[[package]]
+name = "rand_hc"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
+dependencies = [
+ "rand_core 0.5.1",
]
[[package]]
@@ -1579,6 +1704,16 @@ version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
+[[package]]
+name = "subprocess"
+version = "0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c2e86926081dda636c546d8c5e641661049d7562a68f5488be4a1f7f66f6086"
+dependencies = [
+ "libc",
+ "winapi",
+]
+
[[package]]
name = "syn"
version = "1.0.98"
@@ -1846,9 +1981,9 @@ dependencies = [
[[package]]
name = "ttrpc"
-version = "0.5.3"
+version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c46d73bc2a74f2440921b6539afbed68064b48b2c4f194c637430d1c83d052ad"
+checksum = "2ecfff459a859c6ba6668ff72b34c2f1d94d9d58f7088414c2674ad0f31cc7d8"
dependencies = [
"async-trait",
"byteorder",
@@ -1947,6 +2082,12 @@ dependencies = [
"tokio-vsock",
]
+[[package]]
+name = "wasi"
+version = "0.9.0+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
+
[[package]]
name = "wasi"
version = "0.10.0+wasi-snapshot-preview1"
diff --git a/src/agent/Cargo.toml b/src/agent/Cargo.toml
index a25120b4ff..166d080e9b 100644
--- a/src/agent/Cargo.toml
+++ b/src/agent/Cargo.toml
@@ -7,12 +7,12 @@ edition = "2018"
[dependencies]
oci = { path = "../libs/oci" }
rustjail = { path = "rustjail" }
-protocols = { path = "../libs/protocols" }
+protocols = { path = "../libs/protocols", features = ["async"] }
lazy_static = "1.3.0"
-ttrpc = { version = "0.5.0", features = ["async", "protobuf-codec"], default-features = false }
-protobuf = "=2.14.0"
+ttrpc = { version = "0.6.0", features = ["async"], default-features = false }
+protobuf = "2.27.0"
libc = "0.2.58"
-nix = "0.23.0"
+nix = "0.24.1"
capctl = "0.2.0"
serde_json = "1.0.39"
scan_fmt = "0.2.3"
@@ -20,6 +20,7 @@ scopeguard = "1.0.0"
thiserror = "1.0.26"
regex = "1.5.5"
serial_test = "0.5.1"
+kata-sys-util = { path = "../libs/kata-sys-util" }
sysinfo = "0.23.0"
# Async helpers
diff --git a/src/agent/Makefile b/src/agent/Makefile
index 533411bee6..f0e86fd6bc 100644
--- a/src/agent/Makefile
+++ b/src/agent/Makefile
@@ -107,10 +107,7 @@ endef
##TARGET default: build code
default: $(TARGET) show-header
-$(TARGET): $(GENERATED_CODE) logging-crate-tests $(TARGET_PATH)
-
-logging-crate-tests:
- make -C $(CWD)/../libs/logging
+$(TARGET): $(GENERATED_CODE) $(TARGET_PATH)
$(TARGET_PATH): show-summary
@RUSTFLAGS="$(EXTRA_RUSTFLAGS) --deny warnings" cargo build --target $(TRIPLE) $(if $(findstring release,$(BUILD_TYPE)),--release) $(EXTRA_RUSTFEATURES)
@@ -203,7 +200,6 @@ codecov-html: check_tarpaulin
.PHONY: \
help \
- logging-crate-tests \
optimize \
show-header \
show-summary \
diff --git a/src/agent/rustjail/Cargo.toml b/src/agent/rustjail/Cargo.toml
index 78c0f962eb..375591c9f7 100644
--- a/src/agent/rustjail/Cargo.toml
+++ b/src/agent/rustjail/Cargo.toml
@@ -16,7 +16,7 @@ scopeguard = "1.0.0"
capctl = "0.2.0"
lazy_static = "1.3.0"
libc = "0.2.58"
-protobuf = "=2.14.0"
+protobuf = "2.27.0"
slog = "2.5.2"
slog-scope = "4.1.2"
scan_fmt = "0.2.6"
@@ -27,7 +27,7 @@ cgroups = { package = "cgroups-rs", version = "0.2.8" }
rlimit = "0.5.3"
cfg-if = "0.1.0"
-tokio = { version = "1.2.0", features = ["sync", "io-util", "process", "time", "macros"] }
+tokio = { version = "1.2.0", features = ["sync", "io-util", "process", "time", "macros", "rt"] }
futures = "0.3.17"
async-trait = "0.1.31"
inotify = "0.9.2"
diff --git a/src/agent/src/console.rs b/src/agent/src/console.rs
index c705af1b71..8f1ae5ff32 100644
--- a/src/agent/src/console.rs
+++ b/src/agent/src/console.rs
@@ -9,7 +9,7 @@ use anyhow::{anyhow, Result};
use nix::fcntl::{self, FcntlArg, FdFlag, OFlag};
use nix::libc::{STDERR_FILENO, STDIN_FILENO, STDOUT_FILENO};
use nix::pty::{openpty, OpenptyResult};
-use nix::sys::socket::{self, AddressFamily, SockAddr, SockFlag, SockType};
+use nix::sys::socket::{self, AddressFamily, SockFlag, SockType, VsockAddr};
use nix::sys::stat::Mode;
use nix::sys::wait;
use nix::unistd::{self, close, dup2, fork, setsid, ForkResult, Pid};
@@ -67,7 +67,7 @@ pub async fn debug_console_handler(
SockFlag::SOCK_CLOEXEC,
None,
)?;
- let addr = SockAddr::new_vsock(libc::VMADDR_CID_ANY, port);
+ let addr = VsockAddr::new(libc::VMADDR_CID_ANY, port);
socket::bind(listenfd, &addr)?;
socket::listen(listenfd, 1)?;
diff --git a/src/agent/src/main.rs b/src/agent/src/main.rs
index eaf208601b..1a1bce4364 100644
--- a/src/agent/src/main.rs
+++ b/src/agent/src/main.rs
@@ -22,7 +22,7 @@ extern crate slog;
use anyhow::{anyhow, Context, Result};
use clap::{AppSettings, Parser};
use nix::fcntl::OFlag;
-use nix::sys::socket::{self, AddressFamily, SockAddr, SockFlag, SockType};
+use nix::sys::socket::{self, AddressFamily, SockFlag, SockType, VsockAddr};
use nix::unistd::{self, dup, Pid};
use std::env;
use std::ffi::OsStr;
@@ -128,7 +128,7 @@ async fn create_logger_task(rfd: RawFd, vsock_port: u32, shutdown: Receiver<bool>,
}
diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs
--- a/src/agent/src/rpc.rs
+++ b/src/agent/src/rpc.rs
-// A container ID must match this regex:
-//
-// ^[a-zA-Z0-9][a-zA-Z0-9_.-]+$
-//
-fn verify_cid(id: &str) -> Result<()> {
- let mut chars = id.chars();
-
- let valid = match chars.next() {
- Some(first)
- if first.is_alphanumeric()
- && id.len() > 1
- && chars.all(|c| c.is_alphanumeric() || ['.', '-', '_'].contains(&c)) =>
- {
- true
- }
- _ => false,
- };
-
- match valid {
- true => Ok(()),
- false => Err(anyhow!("invalid container ID: {:?}", id)),
- }
-}
-
impl AgentService {
#[instrument]
async fn do_create_container(
@@ -165,7 +142,7 @@ impl AgentService {
) -> Result<()> {
let cid = req.container_id.clone();
- verify_cid(&cid)?;
+ kata_sys_util::validate::verify_id(&cid)?;
let mut oci_spec = req.OCI.clone();
let use_sandbox_pidns = req.get_sandbox_pidns();
@@ -650,7 +627,7 @@ impl AgentService {
}
#[async_trait]
-impl protocols::agent_ttrpc::AgentService for AgentService {
+impl agent_ttrpc::AgentService for AgentService {
async fn create_container(
&self,
ctx: &TtrpcContext,
@@ -1536,7 +1513,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
struct HealthService;
#[async_trait]
-impl protocols::health_ttrpc::Health for HealthService {
+impl health_ttrpc::Health for HealthService {
async fn check(
&self,
_ctx: &TtrpcContext,
@@ -1675,18 +1652,17 @@ async fn read_stream(reader: Arc>>, l: usize) -> Resu
}
pub fn start(s: Arc<Mutex<Sandbox>>, server_address: &str) -> Result<TtrpcServer> {
- let agent_service = Box::new(AgentService { sandbox: s })
- as Box<dyn protocols::agent_ttrpc::AgentService + Send + Sync>;
+ let agent_service =
+ Box::new(AgentService { sandbox: s }) as Box<dyn agent_ttrpc::AgentService + Send + Sync>;
let agent_worker = Arc::new(agent_service);
- let health_service =
- Box::new(HealthService {}) as Box<dyn protocols::health_ttrpc::Health + Send + Sync>;
+ let health_service = Box::new(HealthService {}) as Box<dyn health_ttrpc::Health + Send + Sync>;
let health_worker = Arc::new(health_service);
- let aservice = protocols::agent_ttrpc::create_agent_service(agent_worker);
+ let aservice = agent_ttrpc::create_agent_service(agent_worker);
- let hservice = protocols::health_ttrpc::create_health(health_worker);
+ let hservice = health_ttrpc::create_health(health_worker);
let server = TtrpcServer::new()
.bind(server_address)?
@@ -2012,7 +1988,7 @@ fn load_kernel_module(module: &protocols::agent::KernelModule) -> Result<()> {
mod tests {
use super::*;
use crate::{
- assert_result, namespace::Namespace, protocols::agent_ttrpc::AgentService as _,
+ assert_result, namespace::Namespace, protocols::agent_ttrpc_async::AgentService as _,
skip_if_not_root,
};
use nix::mount;
@@ -2672,233 +2648,6 @@ OtherField:other
}
}
- #[tokio::test]
- async fn test_verify_cid() {
- #[derive(Debug)]
- struct TestData<'a> {
- id: &'a str,
- expect_error: bool,
- }
-
- let tests = &[
- TestData {
- // Cannot be blank
- id: "",
- expect_error: true,
- },
- TestData {
- // Cannot be a space
- id: " ",
- expect_error: true,
- },
- TestData {
- // Must start with an alphanumeric
- id: ".",
- expect_error: true,
- },
- TestData {
- // Must start with an alphanumeric
- id: "-",
- expect_error: true,
- },
- TestData {
- // Must start with an alphanumeric
- id: "_",
- expect_error: true,
- },
- TestData {
- // Must start with an alphanumeric
- id: " a",
- expect_error: true,
- },
- TestData {
- // Must start with an alphanumeric
- id: ".a",
- expect_error: true,
- },
- TestData {
- // Must start with an alphanumeric
- id: "-a",
- expect_error: true,
- },
- TestData {
- // Must start with an alphanumeric
- id: "_a",
- expect_error: true,
- },
- TestData {
- // Must start with an alphanumeric
- id: "..",
- expect_error: true,
- },
- TestData {
- // Too short
- id: "a",
- expect_error: true,
- },
- TestData {
- // Too short
- id: "z",
- expect_error: true,
- },
- TestData {
- // Too short
- id: "A",
- expect_error: true,
- },
- TestData {
- // Too short
- id: "Z",
- expect_error: true,
- },
- TestData {
- // Too short
- id: "0",
- expect_error: true,
- },
- TestData {
- // Too short
- id: "9",
- expect_error: true,
- },
- TestData {
- // Must start with an alphanumeric
- id: "-1",
- expect_error: true,
- },
- TestData {
- id: "/",
- expect_error: true,
- },
- TestData {
- id: "a/",
- expect_error: true,
- },
- TestData {
- id: "a/../",
- expect_error: true,
- },
- TestData {
- id: "../a",
- expect_error: true,
- },
- TestData {
- id: "../../a",
- expect_error: true,
- },
- TestData {
- id: "../../../a",
- expect_error: true,
- },
- TestData {
- id: "foo/../bar",
- expect_error: true,
- },
- TestData {
- id: "foo bar",
- expect_error: true,
- },
- TestData {
- id: "a.",
- expect_error: false,
- },
- TestData {
- id: "a..",
- expect_error: false,
- },
- TestData {
- id: "aa",
- expect_error: false,
- },
- TestData {
- id: "aa.",
- expect_error: false,
- },
- TestData {
- id: "hello..world",
- expect_error: false,
- },
- TestData {
- id: "hello/../world",
- expect_error: true,
- },
- TestData {
- id: "aa1245124sadfasdfgasdga.",
- expect_error: false,
- },
- TestData {
- id: "aAzZ0123456789_.-",
- expect_error: false,
- },
- TestData {
- id: "abcdefghijklmnopqrstuvwxyz0123456789.-_",
- expect_error: false,
- },
- TestData {
- id: "0123456789abcdefghijklmnopqrstuvwxyz.-_",
- expect_error: false,
- },
- TestData {
- id: " abcdefghijklmnopqrstuvwxyz0123456789.-_",
- expect_error: true,
- },
- TestData {
- id: ".abcdefghijklmnopqrstuvwxyz0123456789.-_",
- expect_error: true,
- },
- TestData {
- id: "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.-_",
- expect_error: false,
- },
- TestData {
- id: "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ.-_",
- expect_error: false,
- },
- TestData {
- id: " ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.-_",
- expect_error: true,
- },
- TestData {
- id: ".ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.-_",
- expect_error: true,
- },
- TestData {
- id: "/a/b/c",
- expect_error: true,
- },
- TestData {
- id: "a/b/c",
- expect_error: true,
- },
- TestData {
- id: "foo/../../../etc/passwd",
- expect_error: true,
- },
- TestData {
- id: "../../../../../../etc/motd",
- expect_error: true,
- },
- TestData {
- id: "/etc/passwd",
- expect_error: true,
- },
- ];
-
- for (i, d) in tests.iter().enumerate() {
- let msg = format!("test[{}]: {:?}", i, d);
-
- let result = verify_cid(d.id);
-
- let msg = format!("{}, result: {:?}", msg, result);
-
- if result.is_ok() {
- assert!(!d.expect_error, "{}", msg);
- } else {
- assert!(d.expect_error, "{}", msg);
- }
- }
- }
-
#[tokio::test]
async fn test_volume_capacity_stats() {
skip_if_not_root!();
diff --git a/src/dragonball/.gitignore b/src/dragonball/.gitignore
new file mode 100644
index 0000000000..64f40ab296
--- /dev/null
+++ b/src/dragonball/.gitignore
@@ -0,0 +1,3 @@
+target
+Cargo.lock
+.idea
diff --git a/src/dragonball/Cargo.toml b/src/dragonball/Cargo.toml
new file mode 100644
index 0000000000..0f4aa582f9
--- /dev/null
+++ b/src/dragonball/Cargo.toml
@@ -0,0 +1,65 @@
+[package]
+name = "dragonball"
+version = "0.1.0"
+authors = ["The Kata Containers community "]
+description = "A secure sandbox for Kata Containers"
+keywords = ["kata-containers", "sandbox", "vmm", "dragonball"]
+homepage = "https://katacontainers.io/"
+repository = "https://github.com/kata-containers/kata-containers.git"
+license = "Apache-2.0"
+edition = "2018"
+
+[dependencies]
+arc-swap = "1.5.0"
+bytes = "1.1.0"
+dbs-address-space = "0.1.0"
+dbs-allocator = "0.1.0"
+dbs-arch = "0.1.0"
+dbs-boot = "0.2.0"
+dbs-device = "0.1.0"
+dbs-interrupt = { version = "0.1.0", features = ["kvm-irq"] }
+dbs-legacy-devices = "0.1.0"
+dbs-upcall = { version = "0.1.0", optional = true }
+dbs-utils = "0.1.0"
+dbs-virtio-devices = { version = "0.1.0", optional = true, features = ["virtio-mmio"] }
+kvm-bindings = "0.5.0"
+kvm-ioctls = "0.11.0"
+lazy_static = "1.2"
+libc = "0.2.39"
+linux-loader = "0.4.0"
+log = "0.4.14"
+nix = "0.23.1"
+seccompiler = "0.2.0"
+serde = "1.0.27"
+serde_derive = "1.0.27"
+serde_json = "1.0.9"
+slog = "2.5.2"
+slog-scope = "4.4.0"
+thiserror = "1"
+vmm-sys-util = "0.9.0"
+virtio-queue = { version = "0.1.0", optional = true }
+vm-memory = { version = "0.7.0", features = ["backend-mmap"] }
+
+[dev-dependencies]
+slog-term = "2.9.0"
+slog-async = "2.7.0"
+
+[features]
+acpi = []
+atomic-guest-memory = []
+hotplug = ["virtio-vsock"]
+virtio-vsock = ["dbs-virtio-devices/virtio-vsock", "virtio-queue"]
+virtio-blk = ["dbs-virtio-devices/virtio-blk", "virtio-queue"]
+virtio-net = ["dbs-virtio-devices/virtio-net", "virtio-queue"]
+# virtio-fs only works with atomic-guest-memory
+virtio-fs = ["dbs-virtio-devices/virtio-fs", "virtio-queue", "atomic-guest-memory"]
+
+[patch.'crates-io']
+dbs-device = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" }
+dbs-interrupt = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" }
+dbs-legacy-devices = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" }
+dbs-upcall = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" }
+dbs-utils = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" }
+dbs-virtio-devices = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" }
+dbs-boot = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" }
+dbs-arch = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" }
diff --git a/src/dragonball/LICENSE b/src/dragonball/LICENSE
new file mode 120000
index 0000000000..30cff7403d
--- /dev/null
+++ b/src/dragonball/LICENSE
@@ -0,0 +1 @@
+../../LICENSE
\ No newline at end of file
diff --git a/src/dragonball/Makefile b/src/dragonball/Makefile
new file mode 100644
index 0000000000..8acd29de57
--- /dev/null
+++ b/src/dragonball/Makefile
@@ -0,0 +1,29 @@
+# Copyright (c) 2019-2022 Alibaba Cloud. All rights reserved.
+# Copyright (c) 2019-2022 Ant Group. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+default: build
+
+build:
+ # FIXME: This line will be removed when we solve the vm-memory dependency problem in Dragonball Sandbox
+ cargo update -p vm-memory:0.8.0 --precise 0.7.0
+ cargo build --all-features
+
+check: clippy format
+
+clippy:
+ @echo "INFO: cargo clippy..."
+ cargo clippy --all-targets --all-features \
+ -- \
+ -D warnings
+
+format:
+ @echo "INFO: cargo fmt..."
+ cargo fmt -- --check
+
+clean:
+ cargo clean
+
+test:
+ @echo "INFO: testing dragonball for development build"
+ cargo test --all-features -- --nocapture
diff --git a/src/dragonball/README.md b/src/dragonball/README.md
new file mode 100644
index 0000000000..c9d7e5119c
--- /dev/null
+++ b/src/dragonball/README.md
@@ -0,0 +1,40 @@
+# Introduction
+`Dragonball Sandbox` is a lightweight virtual machine manager (VMM) based on the Linux Kernel-based Virtual Machine (KVM),
+which is optimized for container workloads with:
+- container image management and acceleration service
+- flexible and high-performance virtual device drivers
+- low CPU and memory overhead
+- minimal startup time
+- optimized concurrent startup speed
+
+`Dragonball Sandbox` aims to provide a simple solution for the Kata Containers community. It is integrated into the Kata 3.0
+runtime as a built-in VMM and gives users an out-of-the-box Kata Containers experience without a complex environment setup
+and configuration process.
+
+# Getting Started
+[TODO](https://github.com/kata-containers/kata-containers/issues/4302)
+
+# Documentation
+
+- Device: [Device Document](docs/device.md)
+- vCPU: [vCPU Document](docs/vcpu.md)
+- API: [API Document](docs/api.md)
+
+The documentation is still being actively updated.
+See the [official documentation](docs/) page for more details.
+
+# Supported Architectures
+- x86-64
+- aarch64
+
+# Supported Kernel
+[TODO](https://github.com/kata-containers/kata-containers/issues/4303)
+
+# Acknowledgement
+Part of the code is based on the [Cloud Hypervisor](https://github.com/cloud-hypervisor/cloud-hypervisor) project, the [`crosvm`](https://github.com/google/crosvm) project and the [Firecracker](https://github.com/firecracker-microvm/firecracker) project. They are all virtual machine managers written in Rust with advantages in safety and security.
+
+`Dragonball sandbox` is designed to be a VMM that is customized for Kata Containers, and we will focus on optimizing container workloads for the Kata ecosystem. This focus on the Kata community is what differentiates us from other virtual machine managers written in Rust.
+
+# License
+
+`Dragonball` is licensed under [Apache License](http://www.apache.org/licenses/LICENSE-2.0), Version 2.0.
\ No newline at end of file
diff --git a/src/dragonball/THIRD-PARTY b/src/dragonball/THIRD-PARTY
new file mode 100644
index 0000000000..c3069125a3
--- /dev/null
+++ b/src/dragonball/THIRD-PARTY
@@ -0,0 +1,27 @@
+// Copyright 2017 The Chromium OS Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/src/dragonball/docs/api.md b/src/dragonball/docs/api.md
new file mode 100644
index 0000000000..dab49835ae
--- /dev/null
+++ b/src/dragonball/docs/api.md
@@ -0,0 +1,27 @@
+# API
+
+We provide a rich set of APIs for the Kata runtime to interact with the `Dragonball` virtual machine manager.
+This document introduces each of them.
+
+## `ConfigureBootSource`
+Configure the boot source of the VM using `BootSourceConfig`. This action can only be called before the VM has booted.
+
+### Boot Source Config
+1. `kernel_path`: Path of the kernel image. `Dragonball` only supports uncompressed kernel images for now.
+2. `initrd_path`: Path of the initrd (could be None).
+3. `boot_args`: Boot arguments passed to the kernel (could be None).
+
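+As a minimal sketch (the kernel path is hypothetical; `BootSourceConfig` and
+`DEFAULT_KERNEL_CMDLINE` are defined in `src/api/v1/boot_source.rs`):
+
+```rust
+let boot_cfg = BootSourceConfig {
+    kernel_path: "/path/to/vmlinux".to_string(),
+    initrd_path: None,
+    boot_args: Some(DEFAULT_KERNEL_CMDLINE.to_string()),
+};
+```
+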
+## `SetVmConfiguration`
+Set the virtual machine configuration using `VmConfigInfo` to initialize the VM.
+
+### VM Config Info
+1. `vcpu_count`: Number of vCPUs to start. Currently we only support up to 254 vCPUs (see `MAX_SUPPORTED_VCPUS`).
+2. `max_vcpu_count`: Maximum number of vCPUs that can be added through CPU hotplug.
+3. `cpu_pm`: CPU power management.
+4. `cpu_topology`: CPU topology information (including `threads_per_core`, `cores_per_die`, `dies_per_socket` and `sockets`).
+5. `vpmu_feature`: `vPMU` feature level.
+6. `mem_type`: Memory type that can be either `hugetlbfs` or `shmem`, default is `shmem`.
+7. `mem_file_path` : Memory file path.
+8. `mem_size_mib`: The memory size in MiB. The maximum memory size is 1TB.
+9. `serial_path`: Optional path of the serial socket.
+
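+As a rough illustration of these fields (hypothetical values; the exact field
+types live in `crate::vm::VmConfigInfo`, and `Default` support is assumed):
+
+```rust
+let mut cfg = VmConfigInfo::default();
+cfg.vcpu_count = 1;
+cfg.max_vcpu_count = 4;
+cfg.mem_type = "shmem".to_string();
+cfg.mem_size_mib = 2048;
+```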
diff --git a/src/dragonball/docs/device.md b/src/dragonball/docs/device.md
new file mode 100644
index 0000000000..ab2e078e7b
--- /dev/null
+++ b/src/dragonball/docs/device.md
@@ -0,0 +1,20 @@
+# Device
+
+## Device Manager
+
+Currently we have the following device managers:
+
+| Name | Description |
+| --- | --- |
+| [address space manager](../src/address_space_manager.rs) | abstracts the virtual machine's physical address space management and provides mappings for guest virtual memory and the MMIO ranges of emulated virtual devices, pass-through devices and vCPUs |
+| [config manager](../src/config_manager.rs) | provides abstractions for configuration information |
+| [console manager](../src/device_manager/console_manager.rs) | provides management for all console devices |
+| [resource manager](../src/resource_manager.rs) | provides resource management for `legacy_irq_pool`, `msi_irq_pool`, `pio_pool`, `mmio_pool`, `mem_pool`, `kvm_mem_slot_pool` with the builder `ResourceManagerBuilder` |
+| [VSOCK device manager](../src/device_manager/vsock_dev_mgr.rs) | provides configuration info for `VIRTIO-VSOCK` and management for all VSOCK devices |
+
+
+## Supported devices
+- `VIRTIO-VSOCK`
+- `i8042`
+- `COM1`
+- `COM2`
+
diff --git a/src/dragonball/docs/vcpu.md b/src/dragonball/docs/vcpu.md
new file mode 100644
index 0000000000..e2be8037b6
--- /dev/null
+++ b/src/dragonball/docs/vcpu.md
@@ -0,0 +1,42 @@
+# vCPU
+
+## vCPU Manager
+The vCPU manager manages all vCPU related actions; this document dives into some of its important structure members.
+
+For now, aarch64 vCPU support is still under development; we'll introduce it when we merge `runtime-rs` into the master branch (issue: #4445).
+
+### vCPU config
+`VcpuConfig` is used to configure guest overall CPU info.
+
+`boot_vcpu_count` is used to define the initial vCPU number.
+
+`max_vcpu_count` is used to define the maximum vCPU number, and it is the upper boundary for the CPU hotplug feature.
+
+`threads_per_core`, `cores_per_die`, `dies_per_socket` and `sockets` are used to define the CPU topology.
+
+`vpmu_feature` is used to define `vPMU` feature level.
+If `vPMU` feature is `Disabled`, it means `vPMU` feature is off (by default).
+If `vPMU` feature is `LimitedlyEnabled`, it means minimal `vPMU` counters are supported (cycles and instructions).
+If `vPMU` feature is `FullyEnabled`, it means all `vPMU` counters are supported.
+
+## vCPU State
+
+There are four states in the vCPU state machine: `running`, `paused`, `waiting_exit` and `exited`. A state machine maintains the task flow.
+
+When the vCPU is created, it enters the `paused` state. After the vCPU resources are ready in the VMM, a `Resume` event is sent to the vCPU thread, and the vCPU state changes to `running`.
+
+During the `running` state, the VMM will catch vCPU exits and execute different logic according to the exit reason.
+
+If the VMM catches an exit reason that it cannot handle, the state will change to `waiting_exit` and the VMM will stop the virtual machine.
+When the state switches to `waiting_exit`, an exit event is sent to the vCPU `exit_evt`; the event manager detects the change in `exit_evt` and sets the VMM `exit_evt_flag` to 1. A thread serving the VMM event loop checks `exit_evt_flag`, and if the flag is 1, it stops the VMM.
+
+When the VMM is stopped / destroyed, the state will change to `exited`.
+
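+The transitions described above can be sketched as follows (illustrative only;
+the names do not match the actual implementation):
+
+```rust
+#[derive(Debug, PartialEq)]
+enum VcpuState {
+    Paused,
+    Running,
+    WaitingExit,
+    Exited,
+}
+
+fn on_event(state: VcpuState, event: &str) -> VcpuState {
+    match (state, event) {
+        // The VMM sends `Resume` once the vCPU resources are ready.
+        (VcpuState::Paused, "resume") => VcpuState::Running,
+        // An unhandled vCPU exit reason stops the virtual machine.
+        (VcpuState::Running, "unhandled_exit") => VcpuState::WaitingExit,
+        // The VMM event loop observes `exit_evt_flag` and stops the VMM.
+        (VcpuState::WaitingExit, "vmm_stopped") => VcpuState::Exited,
+        (s, _) => s,
+    }
+}
+```
+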
+## vCPU Hot plug
+Since `Dragonball Sandbox` doesn't support ACPI virtualization, we use [`upcall`](https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-upcall) to establish a direct communication channel between `Dragonball` and the guest in order to trigger vCPU hotplug.
+
+To use `upcall`, kernel patches are needed; you can get the patches from the [`upcall`](https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-upcall) page. We'll also provide a ready-to-use guest kernel binary for you to try.
+
+vCPU hot plug / hot unplug range is [1, `max_vcpu_count`]. Operations not in this range will be invalid.
+
+
diff --git a/src/dragonball/src/address_space_manager.rs b/src/dragonball/src/address_space_manager.rs
new file mode 100644
index 0000000000..9992833e0c
--- /dev/null
+++ b/src/dragonball/src/address_space_manager.rs
@@ -0,0 +1,892 @@
+// Copyright (C) 2019-2022 Alibaba Cloud. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Address space abstraction to manage virtual machine's physical address space.
+//!
+//! The AddressSpace abstraction is introduced to manage virtual machine's physical address space.
+//! The regions in virtual machine's physical address space may be used to:
+//! 1) map guest virtual memory
+//! 2) map MMIO ranges for emulated virtual devices, such as virtio-fs DAX window.
+//! 3) map MMIO ranges for pass-through devices, such as PCI device BARs.
+//! 4) map MMIO ranges for vCPUs, such as the local APIC.
+//! 5) not used/available
+//!
+//! A related abstraction, vm_memory::GuestMemory, is used to access guest virtual memory only.
+//! In other words, AddressSpace is the resource owner, and GuestMemory is an accessor for guest
+//! virtual memory.
+
+use std::collections::{BTreeMap, HashMap};
+use std::fs::File;
+use std::os::unix::io::{AsRawFd, FromRawFd};
+use std::sync::atomic::{AtomicBool, AtomicU8, Ordering};
+use std::sync::{Arc, Mutex};
+use std::thread;
+
+use dbs_address_space::{
+ AddressSpace, AddressSpaceError, AddressSpaceLayout, AddressSpaceRegion,
+ AddressSpaceRegionType, NumaNode, NumaNodeInfo, MPOL_MF_MOVE, MPOL_PREFERRED,
+};
+use dbs_allocator::Constraint;
+use kvm_bindings::kvm_userspace_memory_region;
+use kvm_ioctls::VmFd;
+use log::{debug, error, info, warn};
+use nix::sys::mman;
+use nix::unistd::dup;
+#[cfg(feature = "atomic-guest-memory")]
+use vm_memory::atomic::GuestMemoryAtomic;
+use vm_memory::{
+ Address, FileOffset, GuestAddress, GuestAddressSpace, GuestMemoryMmap, GuestMemoryRegion,
+ GuestRegionMmap, GuestUsize, MemoryRegionAddress, MmapRegion,
+};
+
+use crate::resource_manager::ResourceManager;
+use crate::vm::NumaRegionInfo;
+
+#[cfg(not(feature = "atomic-guest-memory"))]
+/// Concrete GuestAddressSpace type used by the VMM.
+pub type GuestAddressSpaceImpl = Arc<GuestMemoryMmap>;
+
+#[cfg(feature = "atomic-guest-memory")]
+/// Concrete GuestAddressSpace type used by the VMM.
+pub type GuestAddressSpaceImpl = GuestMemoryAtomic<GuestMemoryMmap>;
+
+/// Concrete GuestMemory type used by the VMM.
+pub type GuestMemoryImpl = <GuestAddressSpaceImpl as GuestAddressSpace>::M;
+/// Concrete GuestRegion type used by the VMM.
+pub type GuestRegionImpl = GuestRegionMmap;
+
+// Maximum number of working threads for memory pre-allocation.
+const MAX_PRE_ALLOC_THREAD: u64 = 16;
+
+// Control the actual number of pre-allocation threads. After several performance tests, we decided
+// to use one thread for every 4 GiB of memory: the thread count is computed as (size >> PRE_ALLOC_GRANULARITY) + 1.
+const PRE_ALLOC_GRANULARITY: u64 = 32;
+
+// We don't plan to support mainframe computers and only focus on PC servers.
+// 64 as the max number of nodes should be enough for now.
+const MAX_NODE: u32 = 64;
+
+// We will split the memory region if it conflicts with the MMIO hole.
+// But if the space below the MMIO hole is smaller than the MINIMAL_SPLIT_SPACE, we won't split the memory region in order to enhance performance.
+const MINIMAL_SPLIT_SPACE: u64 = 128 << 20;
+
+/// Errors associated with virtual machine address space management.
+#[derive(Debug, thiserror::Error)]
+pub enum AddressManagerError {
+ /// Invalid address space operation.
+ #[error("invalid address space operation")]
+ InvalidOperation,
+
+ /// Invalid address range.
+ #[error("invalid address space region (0x{0:x}, 0x{1:x})")]
+ InvalidAddressRange(u64, GuestUsize),
+
+ /// No available mem address.
+ #[error("no available mem address")]
+ NoAvailableMemAddress,
+
+    /// No available kvm slots.
+ #[error("no available kvm slots")]
+ NoAvailableKvmSlot,
+
+ /// Address manager failed to create memfd to map anonymous memory.
+ #[error("address manager failed to create memfd to map anonymous memory")]
+ CreateMemFd(#[source] nix::Error),
+
+ /// Address manager failed to open memory file.
+ #[error("address manager failed to open memory file")]
+ OpenFile(#[source] std::io::Error),
+
+ /// Memory file provided is invalid due to empty file path, non-existent file path and other possible mistakes.
+ #[error("memory file provided to address manager {0} is invalid")]
+ FileInvalid(String),
+
+    /// Memory type provided is invalid, e.g. an empty memory type.
+ #[error("memory type provided to address manager {0} is invalid")]
+ TypeInvalid(String),
+
+ /// Failed to set size for memory file.
+ #[error("address manager failed to set size for memory file")]
+ SetFileSize(#[source] std::io::Error),
+
+ /// Failed to unlink memory file.
+ #[error("address manager failed to unlink memory file")]
+ UnlinkFile(#[source] nix::Error),
+
+ /// Failed to duplicate fd of memory file.
+ #[error("address manager failed to duplicate memory file descriptor")]
+ DupFd(#[source] nix::Error),
+
+ /// Failure in accessing the memory located at some address.
+ #[error("address manager failed to access guest memory located at 0x{0:x}")]
+ AccessGuestMemory(u64, #[source] vm_memory::mmap::Error),
+
+ /// Failed to create GuestMemory
+ #[error("address manager failed to create guest memory object")]
+ CreateGuestMemory(#[source] vm_memory::Error),
+
+ /// Failure in initializing guest memory.
+ #[error("address manager failed to initialize guest memory")]
+ GuestMemoryNotInitialized,
+
+ /// Failed to mmap() guest memory
+ #[error("address manager failed to mmap() guest memory into current process")]
+ MmapGuestMemory(#[source] vm_memory::mmap::MmapRegionError),
+
+ /// Failed to set KVM memory slot.
+ #[error("address manager failed to configure KVM memory slot")]
+ KvmSetMemorySlot(#[source] kvm_ioctls::Error),
+
+ /// Failed to set madvise on AddressSpaceRegion
+ #[error("address manager failed to set madvice() on guest memory region")]
+ Madvise(#[source] nix::Error),
+
+    /// Failed to join worker threads.
+    #[error("address manager failed to join threads")]
+ JoinFail,
+
+ /// Failed to create Address Space Region
+ #[error("address manager failed to create Address Space Region {0}")]
+ CreateAddressSpaceRegion(#[source] AddressSpaceError),
+}
+
+type Result<T> = std::result::Result<T, AddressManagerError>;
+
+/// Parameters to configure address space creation operations.
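+///
+/// A minimal usage sketch, mirroring the unit tests at the bottom of this file
+/// (`res_mgr` and `numa_region_infos` are assumed to be prepared by the caller):
+///
+/// ```ignore
+/// let mut builder = AddressSpaceMgrBuilder::new("shmem", "/tmp/shmem")?;
+/// builder.toggle_prealloc(true);
+/// let as_mgr = builder.build(&res_mgr, &numa_region_infos)?;
+/// ```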
+pub struct AddressSpaceMgrBuilder<'a> {
+ mem_type: &'a str,
+ mem_file: &'a str,
+ mem_index: u32,
+ mem_suffix: bool,
+ mem_prealloc: bool,
+ dirty_page_logging: bool,
+    vmfd: Option<Arc<VmFd>>,
+}
+
+impl<'a> AddressSpaceMgrBuilder<'a> {
+ /// Create a new [`AddressSpaceMgrBuilder`] object.
+    pub fn new(mem_type: &'a str, mem_file: &'a str) -> Result<Self> {
+ if mem_type.is_empty() {
+ return Err(AddressManagerError::TypeInvalid(mem_type.to_string()));
+ }
+ Ok(AddressSpaceMgrBuilder {
+ mem_type,
+ mem_file,
+ mem_index: 0,
+ mem_suffix: true,
+ mem_prealloc: false,
+ dirty_page_logging: false,
+ vmfd: None,
+ })
+ }
+
+    /// Enable/disable appending a numbered suffix to the memory file path.
+    /// This feature can be useful to generate hugetlbfs files with numbered suffixes (e.g. shmem0, shmem1).
+ pub fn toggle_file_suffix(&mut self, enabled: bool) {
+ self.mem_suffix = enabled;
+ }
+
+    /// Enable/disable memory pre-allocation.
+    /// Enabling this feature improves performance stability at the start of a workload by avoiding page faults.
+    /// Disabling it may hurt performance stability, but reduces CPU resource consumption and start-up time.
+ pub fn toggle_prealloc(&mut self, prealloc: bool) {
+ self.mem_prealloc = prealloc;
+ }
+
+ /// Enable/disable KVM dirty page logging.
+ pub fn toggle_dirty_page_logging(&mut self, logging: bool) {
+ self.dirty_page_logging = logging;
+ }
+
+ /// Set KVM [`VmFd`] handle to configure memory slots.
+    pub fn set_kvm_vm_fd(&mut self, vmfd: Arc<VmFd>) -> Option<Arc<VmFd>> {
+ let mut existing_vmfd = None;
+ if self.vmfd.is_some() {
+ existing_vmfd = self.vmfd.clone();
+ }
+ self.vmfd = Some(vmfd);
+ existing_vmfd
+ }
+
+    /// Build an [`AddressSpaceMgr`] using the configured parameters.
+ pub fn build(
+ self,
+ res_mgr: &ResourceManager,
+ numa_region_infos: &[NumaRegionInfo],
+    ) -> Result<AddressSpaceMgr> {
+ let mut mgr = AddressSpaceMgr::default();
+ mgr.create_address_space(res_mgr, numa_region_infos, self)?;
+ Ok(mgr)
+ }
+
+ fn get_next_mem_file(&mut self) -> String {
+ if self.mem_suffix {
+ let path = format!("{}{}", self.mem_file, self.mem_index);
+ self.mem_index += 1;
+ path
+ } else {
+ self.mem_file.to_string()
+ }
+ }
+}
+
+/// Struct to manage virtual machine's physical address space.
+pub struct AddressSpaceMgr {
+    address_space: Option<AddressSpace>,
+    vm_as: Option<GuestAddressSpaceImpl>,
+    base_to_slot: Arc<Mutex<HashMap<u64, u32>>>,
+    prealloc_handlers: Vec<thread::JoinHandle<()>>,
+    prealloc_exit: Arc<AtomicBool>,
+    numa_nodes: BTreeMap<u32, NumaNode>,
+}
+
+impl AddressSpaceMgr {
+    /// Query whether the address space manager is initialized.
+ pub fn is_initialized(&self) -> bool {
+ self.address_space.is_some()
+ }
+
+ /// Gets address space.
+ pub fn address_space(&self) -> Option<&AddressSpace> {
+ self.address_space.as_ref()
+ }
+
+ /// Create the address space for a virtual machine.
+ ///
+    /// This method is designed to be called when starting up a virtual machine instead of at
+    /// runtime, so on failure the virtual machine is expected to be torn down; no strict error
+    /// recovery is attempted.
+ pub fn create_address_space(
+ &mut self,
+ res_mgr: &ResourceManager,
+ numa_region_infos: &[NumaRegionInfo],
+ mut param: AddressSpaceMgrBuilder,
+ ) -> Result<()> {
+ let mut regions = Vec::new();
+ let mut start_addr = dbs_boot::layout::GUEST_MEM_START;
+
+ // Create address space regions.
+ for info in numa_region_infos.iter() {
+ info!("numa_region_info {:?}", info);
+ // convert size_in_mib to bytes
+ let size = info
+ .size
+ .checked_shl(20)
+ .ok_or_else(|| AddressManagerError::InvalidOperation)?;
+
+ // Guest memory does not intersect with the MMIO hole.
+ // TODO: make it work for ARM (issue #4307)
+ if start_addr > dbs_boot::layout::MMIO_LOW_END
+ || start_addr + size <= dbs_boot::layout::MMIO_LOW_START
+ {
+ let region = self.create_region(start_addr, size, info, &mut param)?;
+ regions.push(region);
+ start_addr = start_addr
+ .checked_add(size)
+ .ok_or_else(|| AddressManagerError::InvalidOperation)?;
+ } else {
+                // Add guest memory below the MMIO hole, avoiding a split of the memory region
+                // if the available address range below the hole is smaller than MINIMAL_SPLIT_SPACE.
+ let mut below_size = dbs_boot::layout::MMIO_LOW_START
+ .checked_sub(start_addr)
+ .ok_or_else(|| AddressManagerError::InvalidOperation)?;
+ if below_size < (MINIMAL_SPLIT_SPACE) {
+ below_size = 0;
+ } else {
+ let region = self.create_region(start_addr, below_size, info, &mut param)?;
+ regions.push(region);
+ }
+
+ // Add guest memory above the MMIO hole
+ let above_start = dbs_boot::layout::MMIO_LOW_END + 1;
+ let above_size = size
+ .checked_sub(below_size)
+ .ok_or_else(|| AddressManagerError::InvalidOperation)?;
+ let region = self.create_region(above_start, above_size, info, &mut param)?;
+ regions.push(region);
+ start_addr = above_start
+ .checked_add(above_size)
+ .ok_or_else(|| AddressManagerError::InvalidOperation)?;
+ }
+ }
+
+ // Create GuestMemory object
+ let mut vm_memory = GuestMemoryMmap::new();
+ for reg in regions.iter() {
+ // Allocate used guest memory addresses.
+ // These addresses are statically allocated, resource allocation/update should not fail.
+ let constraint = Constraint::new(reg.len())
+ .min(reg.start_addr().raw_value())
+ .max(reg.last_addr().raw_value());
+ let _key = res_mgr
+ .allocate_mem_address(&constraint)
+ .ok_or(AddressManagerError::NoAvailableMemAddress)?;
+ let mmap_reg = self.create_mmap_region(reg.clone())?;
+
+ vm_memory = vm_memory
+ .insert_region(mmap_reg.clone())
+ .map_err(AddressManagerError::CreateGuestMemory)?;
+ self.map_to_kvm(res_mgr, ¶m, reg, mmap_reg)?;
+ }
+
+ #[cfg(feature = "atomic-guest-memory")]
+ {
+ self.vm_as = Some(AddressSpace::convert_into_vm_as(vm_memory));
+ }
+ #[cfg(not(feature = "atomic-guest-memory"))]
+ {
+ self.vm_as = Some(Arc::new(vm_memory));
+ }
+
+ let layout = AddressSpaceLayout::new(
+ *dbs_boot::layout::GUEST_PHYS_END,
+ dbs_boot::layout::GUEST_MEM_START,
+ *dbs_boot::layout::GUEST_MEM_END,
+ );
+ self.address_space = Some(AddressSpace::from_regions(regions, layout));
+
+ Ok(())
+ }
+
+ // size unit: Byte
+ fn create_region(
+ &mut self,
+ start_addr: u64,
+ size_bytes: u64,
+ info: &NumaRegionInfo,
+ param: &mut AddressSpaceMgrBuilder,
+    ) -> Result<Arc<AddressSpaceRegion>> {
+ let mem_file_path = param.get_next_mem_file();
+ let region = AddressSpaceRegion::create_default_memory_region(
+ GuestAddress(start_addr),
+ size_bytes,
+ info.host_numa_node_id,
+ param.mem_type,
+ &mem_file_path,
+ param.mem_prealloc,
+ false,
+ )
+ .map_err(AddressManagerError::CreateAddressSpaceRegion)?;
+ let region = Arc::new(region);
+
+ self.insert_into_numa_nodes(
+ ®ion,
+ info.guest_numa_node_id.unwrap_or(0),
+ &info.vcpu_ids,
+ );
+ info!(
+ "create new region: guest addr 0x{:x}-0x{:x} size {}",
+ start_addr,
+ start_addr + size_bytes,
+ size_bytes
+ );
+
+ Ok(region)
+ }
+
+ fn map_to_kvm(
+ &mut self,
+ res_mgr: &ResourceManager,
+ param: &AddressSpaceMgrBuilder,
+        reg: &Arc<AddressSpaceRegion>,
+        mmap_reg: Arc<GuestRegionImpl>,
+ ) -> Result<()> {
+ // Build mapping between GPA <-> HVA, by adding kvm memory slot.
+ let slot = res_mgr
+ .allocate_kvm_mem_slot(1, None)
+ .ok_or(AddressManagerError::NoAvailableKvmSlot)?;
+
+ if let Some(vmfd) = param.vmfd.as_ref() {
+ let host_addr = mmap_reg
+ .get_host_address(MemoryRegionAddress(0))
+ .map_err(|_e| AddressManagerError::InvalidOperation)?;
+ let flags = 0u32;
+
+ let mem_region = kvm_userspace_memory_region {
+ slot: slot as u32,
+ guest_phys_addr: reg.start_addr().raw_value(),
+ memory_size: reg.len() as u64,
+ userspace_addr: host_addr as u64,
+ flags,
+ };
+
+ info!(
+ "VM: guest memory region {:x} starts at {:x?}",
+ reg.start_addr().raw_value(),
+ host_addr
+ );
+ // Safe because the guest regions are guaranteed not to overlap.
+ unsafe { vmfd.set_user_memory_region(mem_region) }
+ .map_err(AddressManagerError::KvmSetMemorySlot)?;
+ }
+
+ self.base_to_slot
+ .lock()
+ .unwrap()
+ .insert(reg.start_addr().raw_value(), slot as u32);
+
+ Ok(())
+ }
+
+ /// Mmap the address space region into current process.
+ pub fn create_mmap_region(
+ &mut self,
+        region: Arc<AddressSpaceRegion>,
+    ) -> Result<Arc<GuestRegionImpl>> {
+ // Special check for 32bit host with 64bit virtual machines.
+ if region.len() > usize::MAX as u64 {
+ return Err(AddressManagerError::InvalidAddressRange(
+ region.start_addr().raw_value(),
+ region.len(),
+ ));
+ }
+ // The device MMIO regions may not be backed by memory files, so refuse to mmap them.
+ if region.region_type() == AddressSpaceRegionType::DeviceMemory {
+ return Err(AddressManagerError::InvalidOperation);
+ }
+
+ // The GuestRegionMmap/MmapRegion will take ownership of the FileOffset object,
+ // so we have to duplicate the fd here. It's really a dirty design.
+ let file_offset = match region.file_offset().as_ref() {
+ Some(fo) => {
+ let fd = dup(fo.file().as_raw_fd()).map_err(AddressManagerError::DupFd)?;
+ // Safe because we have just duplicated the raw fd.
+ let file = unsafe { File::from_raw_fd(fd) };
+ let file_offset = FileOffset::new(file, fo.start());
+ Some(file_offset)
+ }
+ None => None,
+ };
+ let perm_flags = if (region.perm_flags() & libc::MAP_POPULATE) != 0 && region.is_hugepage()
+ {
+            // mmap(MAP_POPULATE) conflicts with madvise(MADV_HUGEPAGE) because mmap(MAP_POPULATE)
+            // will pre-fault in all memory with normal pages before madvise(MADV_HUGEPAGE) gets
+            // called. So remove the MAP_POPULATE flag and let memory be faulted in by the working
+ // threads.
+ region.perm_flags() & (!libc::MAP_POPULATE)
+ } else {
+ region.perm_flags()
+ };
+ let mmap_reg = MmapRegion::build(
+ file_offset,
+ region.len() as usize,
+ libc::PROT_READ | libc::PROT_WRITE,
+ perm_flags,
+ )
+ .map_err(AddressManagerError::MmapGuestMemory)?;
+
+ if region.is_anonpage() {
+ self.configure_anon_mem(&mmap_reg)?;
+ }
+ if let Some(node_id) = region.host_numa_node_id() {
+ self.configure_numa(&mmap_reg, node_id)?;
+ }
+ if region.is_hugepage() {
+ self.configure_thp_and_prealloc(®ion, &mmap_reg)?;
+ }
+
+ let reg = GuestRegionImpl::new(mmap_reg, region.start_addr())
+ .map_err(AddressManagerError::CreateGuestMemory)?;
+ Ok(Arc::new(reg))
+ }
+
+ fn configure_anon_mem(&self, mmap_reg: &MmapRegion) -> Result<()> {
+ unsafe {
+ mman::madvise(
+ mmap_reg.as_ptr() as *mut libc::c_void,
+ mmap_reg.size(),
+ mman::MmapAdvise::MADV_DONTFORK,
+ )
+ }
+ .map_err(AddressManagerError::Madvise)
+ }
+
+ fn configure_numa(&self, mmap_reg: &MmapRegion, node_id: u32) -> Result<()> {
+ let nodemask = 1_u64
+ .checked_shl(node_id)
+ .ok_or_else(|| AddressManagerError::InvalidOperation)?;
+ let res = unsafe {
+ libc::syscall(
+ libc::SYS_mbind,
+ mmap_reg.as_ptr() as *mut libc::c_void,
+ mmap_reg.size(),
+ MPOL_PREFERRED,
+ &nodemask as *const u64,
+ MAX_NODE,
+ MPOL_MF_MOVE,
+ )
+ };
+ if res < 0 {
+ warn!(
+ "failed to mbind memory to host_numa_node_id {}: this may affect performance",
+ node_id
+ );
+ }
+ Ok(())
+ }
+
+    // We enable Transparent Huge Pages (THP) through madvise(MADV_HUGEPAGE) to increase performance.
+    // In order to reduce the impact of page faults on performance, we start several threads (up to MAX_PRE_ALLOC_THREAD) to touch every 4k page of the memory region to manually do memory pre-allocation.
+    // The reason why we don't use mmap to enable THP and pre-allocation is that the THP setting won't take effect in that operation (tested on kernel 4.9).
+ fn configure_thp_and_prealloc(
+ &mut self,
+        region: &Arc<AddressSpaceRegion>,
+ mmap_reg: &MmapRegion,
+ ) -> Result<()> {
+ debug!(
+ "Setting MADV_HUGEPAGE on AddressSpaceRegion addr {:x?} len {:x?}",
+ mmap_reg.as_ptr(),
+ mmap_reg.size()
+ );
+
+ // Safe because we just create the MmapRegion
+ unsafe {
+ mman::madvise(
+ mmap_reg.as_ptr() as *mut libc::c_void,
+ mmap_reg.size(),
+ mman::MmapAdvise::MADV_HUGEPAGE,
+ )
+ }
+ .map_err(AddressManagerError::Madvise)?;
+
+ if region.perm_flags() & libc::MAP_POPULATE > 0 {
+ // Touch every 4k page to trigger allocation. The step is 4K instead of 2M to ensure
+ // pre-allocation when running out of huge pages.
+ const PAGE_SIZE: u64 = 4096;
+ const PAGE_SHIFT: u32 = 12;
+ let addr = mmap_reg.as_ptr() as u64;
+ // Here we use >> PAGE_SHIFT to calculate how many 4K pages in the memory region.
+ let npage = (mmap_reg.size() as u64) >> PAGE_SHIFT;
+
+ let mut touch_thread = ((mmap_reg.size() as u64) >> PRE_ALLOC_GRANULARITY) + 1;
+ if touch_thread > MAX_PRE_ALLOC_THREAD {
+ touch_thread = MAX_PRE_ALLOC_THREAD;
+ }
+
+ let per_npage = npage / touch_thread;
+ for n in 0..touch_thread {
+ let start_npage = per_npage * n;
+ let end_npage = if n == (touch_thread - 1) {
+ npage
+ } else {
+ per_npage * (n + 1)
+ };
+ let mut per_addr = addr + (start_npage * PAGE_SIZE);
+ let should_stop = self.prealloc_exit.clone();
+
+ let handler = thread::Builder::new()
+ .name("PreallocThread".to_string())
+ .spawn(move || {
+ info!("PreallocThread start start_npage: {:?}, end_npage: {:?}, per_addr: {:?}, thread_number: {:?}",
+ start_npage, end_npage, per_addr, touch_thread );
+ for _ in start_npage..end_npage {
+ if should_stop.load(Ordering::Acquire) {
+ info!("PreallocThread stop start_npage: {:?}, end_npage: {:?}, per_addr: {:?}, thread_number: {:?}",
+ start_npage, end_npage, per_addr, touch_thread);
+ break;
+ }
+
+ // Reading from a THP page may be served by the zero page, so only
+ // write operation could ensure THP memory allocation. So use
+ // the compare_exchange(old_val, old_val) trick to trigger allocation.
+ let addr_ptr = per_addr as *mut u8;
+ let read_byte = unsafe { std::ptr::read_volatile(addr_ptr) };
+ let atomic_u8 : &AtomicU8 = unsafe {&*(addr_ptr as *mut AtomicU8)};
+ let _ = atomic_u8.compare_exchange(read_byte, read_byte, Ordering::SeqCst, Ordering::SeqCst);
+ per_addr += PAGE_SIZE;
+ }
+
+ info!("PreallocThread done start_npage: {:?}, end_npage: {:?}, per_addr: {:?}, thread_number: {:?}",
+ start_npage, end_npage, per_addr, touch_thread );
+ });
+
+ match handler {
+ Err(e) => error!(
+ "Failed to create working thread for async pre-allocation, {:?}. This may affect performance stability at the start of the workload.",
+ e
+ ),
+ Ok(hdl) => self.prealloc_handlers.push(hdl),
+ }
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Get the address space object
+ pub fn get_address_space(&self) -> Option<&AddressSpace> {
+ self.address_space.as_ref()
+ }
+
+ /// Get the default guest memory object, which will be used to access virtual machine's default
+ /// guest memory.
+ pub fn get_vm_as(&self) -> Option<&GuestAddressSpaceImpl> {
+ self.vm_as.as_ref()
+ }
+
+ /// Get the base to slot map
+    pub fn get_base_to_slot_map(&self) -> Arc<Mutex<HashMap<u64, u32>>> {
+ self.base_to_slot.clone()
+ }
+
+    /// Get NUMA node information from the address space manager.
+    pub fn get_numa_nodes(&self) -> &BTreeMap<u32, NumaNode> {
+ &self.numa_nodes
+ }
+
+    /// Add CPU and memory NUMA information to the BTreeMap.
+ fn insert_into_numa_nodes(
+ &mut self,
+        region: &Arc<AddressSpaceRegion>,
+ guest_numa_node_id: u32,
+ vcpu_ids: &[u32],
+ ) {
+ let node = self
+ .numa_nodes
+ .entry(guest_numa_node_id)
+ .or_insert_with(NumaNode::new);
+ node.add_info(&NumaNodeInfo {
+ base: region.start_addr(),
+ size: region.len(),
+ });
+ node.add_vcpu_ids(vcpu_ids);
+ }
+
+    /// Get the address space layout from the address space manager.
+    pub fn get_layout(&self) -> Result<AddressSpaceLayout> {
+ self.address_space
+ .as_ref()
+ .map(|v| v.layout())
+ .ok_or(AddressManagerError::GuestMemoryNotInitialized)
+ }
+
+ /// Wait for the pre-allocation working threads to finish work.
+ ///
+ /// Force all working threads to exit if `stop` is true.
+ pub fn wait_prealloc(&mut self, stop: bool) -> Result<()> {
+ if stop {
+ self.prealloc_exit.store(true, Ordering::Release);
+ }
+        while let Some(handler) = self.prealloc_handlers.pop() {
+            if let Err(e) = handler.join() {
+ error!("wait_prealloc join fail {:?}", e);
+ return Err(AddressManagerError::JoinFail);
+ }
+ }
+ Ok(())
+ }
+}
+
+impl Default for AddressSpaceMgr {
+ /// Create a new empty AddressSpaceMgr
+ fn default() -> Self {
+ AddressSpaceMgr {
+ address_space: None,
+ vm_as: None,
+ base_to_slot: Arc::new(Mutex::new(HashMap::new())),
+ prealloc_handlers: Vec::new(),
+ prealloc_exit: Arc::new(AtomicBool::new(false)),
+ numa_nodes: BTreeMap::new(),
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use dbs_boot::layout::GUEST_MEM_START;
+ use std::ops::Deref;
+
+ use vm_memory::{Bytes, GuestAddressSpace, GuestMemory, GuestMemoryRegion};
+ use vmm_sys_util::tempfile::TempFile;
+
+ use super::*;
+
+ #[test]
+ fn test_create_address_space() {
+ let res_mgr = ResourceManager::new(None);
+ let mem_size = 128 << 20;
+ let numa_region_infos = vec![NumaRegionInfo {
+ size: mem_size >> 20,
+ host_numa_node_id: None,
+ guest_numa_node_id: Some(0),
+ vcpu_ids: vec![1, 2],
+ }];
+ let builder = AddressSpaceMgrBuilder::new("shmem", "").unwrap();
+ let as_mgr = builder.build(&res_mgr, &numa_region_infos).unwrap();
+ let vm_as = as_mgr.get_vm_as().unwrap();
+ let guard = vm_as.memory();
+ let gmem = guard.deref();
+ assert_eq!(gmem.num_regions(), 1);
+
+ let reg = gmem
+ .find_region(GuestAddress(GUEST_MEM_START + mem_size - 1))
+ .unwrap();
+ assert_eq!(reg.start_addr(), GuestAddress(GUEST_MEM_START));
+ assert_eq!(reg.len(), mem_size);
+ assert!(gmem
+ .find_region(GuestAddress(GUEST_MEM_START + mem_size))
+ .is_none());
+ assert!(reg.file_offset().is_some());
+
+ let buf = [0x1u8, 0x2u8, 0x3u8, 0x4u8, 0x5u8];
+ gmem.write_slice(&buf, GuestAddress(GUEST_MEM_START))
+ .unwrap();
+
+ // Update middle of mapped memory region
+ let mut val = 0xa5u8;
+ gmem.write_obj(val, GuestAddress(GUEST_MEM_START + 0x1))
+ .unwrap();
+ val = gmem.read_obj(GuestAddress(GUEST_MEM_START + 0x1)).unwrap();
+ assert_eq!(val, 0xa5);
+ val = gmem.read_obj(GuestAddress(GUEST_MEM_START)).unwrap();
+ assert_eq!(val, 1);
+ val = gmem.read_obj(GuestAddress(GUEST_MEM_START + 0x2)).unwrap();
+ assert_eq!(val, 3);
+ val = gmem.read_obj(GuestAddress(GUEST_MEM_START + 0x5)).unwrap();
+ assert_eq!(val, 0);
+
+ // Read ahead of mapped memory region
+ assert!(gmem
+            .read_obj::<u8>(GuestAddress(GUEST_MEM_START + mem_size))
+ .is_err());
+
+ let res_mgr = ResourceManager::new(None);
+ let mem_size = dbs_boot::layout::MMIO_LOW_START + (1 << 30);
+ let numa_region_infos = vec![NumaRegionInfo {
+ size: mem_size >> 20,
+ host_numa_node_id: None,
+ guest_numa_node_id: Some(0),
+ vcpu_ids: vec![1, 2],
+ }];
+ let builder = AddressSpaceMgrBuilder::new("shmem", "").unwrap();
+ let as_mgr = builder.build(&res_mgr, &numa_region_infos).unwrap();
+ let vm_as = as_mgr.get_vm_as().unwrap();
+ let guard = vm_as.memory();
+ let gmem = guard.deref();
+ #[cfg(target_arch = "x86_64")]
+ assert_eq!(gmem.num_regions(), 2);
+ #[cfg(target_arch = "aarch64")]
+ assert_eq!(gmem.num_regions(), 1);
+
+ // Test dropping GuestMemoryMmap object releases all resources.
+ for _ in 0..10000 {
+ let res_mgr = ResourceManager::new(None);
+ let mem_size = 1 << 20;
+ let numa_region_infos = vec![NumaRegionInfo {
+ size: mem_size >> 20,
+ host_numa_node_id: None,
+ guest_numa_node_id: Some(0),
+ vcpu_ids: vec![1, 2],
+ }];
+ let builder = AddressSpaceMgrBuilder::new("shmem", "").unwrap();
+ let _as_mgr = builder.build(&res_mgr, &numa_region_infos).unwrap();
+ }
+ let file = TempFile::new().unwrap().into_file();
+ let fd = file.as_raw_fd();
+ // fd should be small enough if there's no leaking of fds.
+ assert!(fd < 1000);
+ }
+
+ #[test]
+ fn test_address_space_mgr_get_boundary() {
+ let layout = AddressSpaceLayout::new(
+ *dbs_boot::layout::GUEST_PHYS_END,
+ dbs_boot::layout::GUEST_MEM_START,
+ *dbs_boot::layout::GUEST_MEM_END,
+ );
+ let res_mgr = ResourceManager::new(None);
+ let mem_size = 128 << 20;
+ let numa_region_infos = vec![NumaRegionInfo {
+ size: mem_size >> 20,
+ host_numa_node_id: None,
+ guest_numa_node_id: Some(0),
+ vcpu_ids: vec![1, 2],
+ }];
+ let builder = AddressSpaceMgrBuilder::new("shmem", "").unwrap();
+ let as_mgr = builder.build(&res_mgr, &numa_region_infos).unwrap();
+ assert_eq!(as_mgr.get_layout().unwrap(), layout);
+ }
+
+ #[test]
+ fn test_address_space_mgr_get_numa_nodes() {
+ let res_mgr = ResourceManager::new(None);
+ let mem_size = 128 << 20;
+ let cpu_vec = vec![1, 2];
+ let numa_region_infos = vec![NumaRegionInfo {
+ size: mem_size >> 20,
+ host_numa_node_id: None,
+ guest_numa_node_id: Some(0),
+ vcpu_ids: cpu_vec.clone(),
+ }];
+ let builder = AddressSpaceMgrBuilder::new("shmem", "").unwrap();
+ let as_mgr = builder.build(&res_mgr, &numa_region_infos).unwrap();
+ let mut numa_node = NumaNode::new();
+ numa_node.add_info(&NumaNodeInfo {
+ base: GuestAddress(GUEST_MEM_START),
+ size: mem_size,
+ });
+ numa_node.add_vcpu_ids(&cpu_vec);
+
+ assert_eq!(*as_mgr.get_numa_nodes().get(&0).unwrap(), numa_node);
+ }
+
+ #[test]
+ fn test_address_space_mgr_async_prealloc() {
+ let res_mgr = ResourceManager::new(None);
+ let mem_size = 2 << 20;
+ let cpu_vec = vec![1, 2];
+ let numa_region_infos = vec![NumaRegionInfo {
+ size: mem_size >> 20,
+ host_numa_node_id: None,
+ guest_numa_node_id: Some(0),
+ vcpu_ids: cpu_vec,
+ }];
+ let mut builder = AddressSpaceMgrBuilder::new("hugeshmem", "").unwrap();
+ builder.toggle_prealloc(true);
+ let mut as_mgr = builder.build(&res_mgr, &numa_region_infos).unwrap();
+ as_mgr.wait_prealloc(false).unwrap();
+ }
+
+ #[test]
+ fn test_address_space_mgr_builder() {
+ let mut builder = AddressSpaceMgrBuilder::new("shmem", "/tmp/shmem").unwrap();
+
+ assert_eq!(builder.mem_type, "shmem");
+ assert_eq!(builder.mem_file, "/tmp/shmem");
+ assert_eq!(builder.mem_index, 0);
+ assert!(builder.mem_suffix);
+ assert!(!builder.mem_prealloc);
+ assert!(!builder.dirty_page_logging);
+ assert!(builder.vmfd.is_none());
+
+ assert_eq!(&builder.get_next_mem_file(), "/tmp/shmem0");
+ assert_eq!(&builder.get_next_mem_file(), "/tmp/shmem1");
+ assert_eq!(&builder.get_next_mem_file(), "/tmp/shmem2");
+ assert_eq!(builder.mem_index, 3);
+
+ builder.toggle_file_suffix(false);
+ assert_eq!(&builder.get_next_mem_file(), "/tmp/shmem");
+ assert_eq!(&builder.get_next_mem_file(), "/tmp/shmem");
+ assert_eq!(builder.mem_index, 3);
+
+ builder.toggle_prealloc(true);
+ builder.toggle_dirty_page_logging(true);
+ assert!(builder.mem_prealloc);
+ assert!(builder.dirty_page_logging);
+ }
+
+ #[test]
+ fn test_configure_invalid_numa() {
+ let res_mgr = ResourceManager::new(None);
+ let mem_size = 128 << 20;
+ let numa_region_infos = vec![NumaRegionInfo {
+ size: mem_size >> 20,
+ host_numa_node_id: None,
+ guest_numa_node_id: Some(0),
+ vcpu_ids: vec![1, 2],
+ }];
+ let builder = AddressSpaceMgrBuilder::new("shmem", "").unwrap();
+ let as_mgr = builder.build(&res_mgr, &numa_region_infos).unwrap();
+ let mmap_reg = MmapRegion::new(8).unwrap();
+
+ assert!(as_mgr.configure_numa(&mmap_reg, u32::MAX).is_err());
+ }
+}
diff --git a/src/dragonball/src/api/mod.rs b/src/dragonball/src/api/mod.rs
new file mode 100644
index 0000000000..75ca6af690
--- /dev/null
+++ b/src/dragonball/src/api/mod.rs
@@ -0,0 +1,6 @@
+// Copyright (C) 2019-2022 Alibaba Cloud. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! API related data structures to configure the vmm.
+
+pub mod v1;
diff --git a/src/dragonball/src/api/v1/boot_source.rs b/src/dragonball/src/api/v1/boot_source.rs
new file mode 100644
index 0000000000..8ff7e030dc
--- /dev/null
+++ b/src/dragonball/src/api/v1/boot_source.rs
@@ -0,0 +1,55 @@
+// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
+// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use serde_derive::{Deserialize, Serialize};
+
+/// Default guest kernel command line:
+/// - `reboot=k` shut down the guest on reboot, instead of well... rebooting;
+/// - `panic=1` on panic, reboot after 1 second;
+/// - `pci=off` do not scan for PCI devices (save boot time);
+/// - `nomodules` disable loadable kernel module support;
+/// - `8250.nr_uarts=0` disable 8250 serial interface;
+/// - `i8042.noaux` do not probe the i8042 controller for an attached mouse (save boot time);
+/// - `i8042.nomux` do not probe i8042 for a multiplexing controller (save boot time);
+/// - `i8042.nopnp` do not use ACPIPnP to discover KBD/AUX controllers (save boot time);
+/// - `i8042.dumbkbd` do not attempt to control kbd state via the i8042 (save boot time).
+pub const DEFAULT_KERNEL_CMDLINE: &str = "reboot=k panic=1 pci=off nomodules 8250.nr_uarts=0 \
+ i8042.noaux i8042.nomux i8042.nopnp i8042.dumbkbd";
+
+/// Strongly typed data structure used to configure the boot source of the microvm.
+#[derive(Clone, Debug, Deserialize, PartialEq, Serialize, Default)]
+#[serde(deny_unknown_fields)]
+pub struct BootSourceConfig {
+ /// Path of the kernel image.
+ /// We only support uncompressed kernel for Dragonball.
+ pub kernel_path: String,
+ /// Path of the initrd, if there is one.
+    /// Note: the rootfs is set in `BlockDeviceConfigInfo`.
+    pub initrd_path: Option<String>,
+ /// The boot arguments to pass to the kernel.
+ #[serde(skip_serializing_if = "Option::is_none")]
+    pub boot_args: Option<String>,
+}
+
+/// Errors associated with actions on `BootSourceConfig`.
+#[derive(Debug, thiserror::Error)]
+pub enum BootSourceConfigError {
+ /// The kernel file cannot be opened.
+ #[error(
+ "the kernel file cannot be opened due to invalid kernel path or invalid permissions: {0}"
+ )]
+ InvalidKernelPath(#[source] std::io::Error),
+
+ /// The initrd file cannot be opened.
+ #[error("the initrd file cannot be opened due to invalid path or invalid permissions: {0}")]
+ InvalidInitrdPath(#[source] std::io::Error),
+
+ /// The kernel command line is invalid.
+ #[error("the kernel command line is invalid: {0}")]
+ InvalidKernelCommandLine(#[source] linux_loader::cmdline::Error),
+
+    /// The boot source cannot be updated post boot.
+ #[error("the update operation is not allowed after boot")]
+ UpdateNotAllowedPostBoot,
+}
diff --git a/src/dragonball/src/api/v1/instance_info.rs b/src/dragonball/src/api/v1/instance_info.rs
new file mode 100644
index 0000000000..ae159aa614
--- /dev/null
+++ b/src/dragonball/src/api/v1/instance_info.rs
@@ -0,0 +1,88 @@
+// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
+// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+use serde_derive::{Deserialize, Serialize};
+
+/// The microvm state.
+///
+/// When Dragonball starts, the instance state is Uninitialized. Once the start_microvm method is
+/// called, the state goes from Uninitialized to Starting. The state changes to Running once the
+/// start_microvm method completes. Halting and Halted are currently unsupported.
+#[derive(Copy, Clone, Debug, Deserialize, PartialEq, Serialize)]
+pub enum InstanceState {
+ /// Microvm is not initialized.
+ Uninitialized,
+ /// Microvm is starting.
+ Starting,
+ /// Microvm is running.
+ Running,
+ /// Microvm is Paused.
+ Paused,
+ /// Microvm received a halt instruction.
+ Halting,
+ /// Microvm is halted.
+ Halted,
+    /// Microvm exited (with the given exit code), rather than the whole VMM process exiting.
+ Exited(i32),
+}
+
+/// The state of async actions
+#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
+pub enum AsyncState {
+ /// Uninitialized
+ Uninitialized,
+ /// Success
+ Success,
+ /// Failure
+ Failure,
+}
+
+/// The strongly typed structure that contains general information about the microVM.
+#[derive(Debug, Deserialize, Serialize)]
+pub struct InstanceInfo {
+ /// The ID of the microVM.
+ pub id: String,
+ /// The state of the microVM.
+ pub state: InstanceState,
+ /// The version of the VMM that runs the microVM.
+ pub vmm_version: String,
+ /// The pid of the current VMM process.
+ pub pid: u32,
+ /// The state of async actions.
+ pub async_state: AsyncState,
+ /// List of tids of vcpu threads (vcpu index, tid)
+ pub tids: Vec<(u8, u32)>,
+ /// Last instance downtime
+ pub last_instance_downtime: u64,
+}
+
+impl InstanceInfo {
+    /// Create an instance info object with the given id and VMM version.
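+    ///
+    /// A trivial usage sketch (the id and version strings are illustrative):
+    ///
+    /// ```ignore
+    /// let info = InstanceInfo::new("vm-0".to_string(), "0.1.0".to_string());
+    /// assert_eq!(info.state, InstanceState::Uninitialized);
+    /// ```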
+ pub fn new(id: String, vmm_version: String) -> Self {
+ InstanceInfo {
+ id,
+ state: InstanceState::Uninitialized,
+ vmm_version,
+ pid: std::process::id(),
+ async_state: AsyncState::Uninitialized,
+ tids: Vec::new(),
+ last_instance_downtime: 0,
+ }
+ }
+}
+
+impl Default for InstanceInfo {
+ fn default() -> Self {
+ InstanceInfo {
+ id: String::from(""),
+ state: InstanceState::Uninitialized,
+ vmm_version: env!("CARGO_PKG_VERSION").to_string(),
+ pid: std::process::id(),
+ async_state: AsyncState::Uninitialized,
+ tids: Vec::new(),
+ last_instance_downtime: 0,
+ }
+ }
+}
diff --git a/src/dragonball/src/api/v1/machine_config.rs b/src/dragonball/src/api/v1/machine_config.rs
new file mode 100644
index 0000000000..e4ae228679
--- /dev/null
+++ b/src/dragonball/src/api/v1/machine_config.rs
@@ -0,0 +1,86 @@
+// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
+// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+/// We only support this number of vCPUs for now. Mostly because we have set all vCPU related
+/// metrics as `u8`, and going beyond `u8` would take extra effort.
+pub const MAX_SUPPORTED_VCPUS: u8 = 254;
+
+/// Memory hotplug value should be aligned to this size (unit: MiB)
+pub const MEMORY_HOTPLUG_ALIGNMENT: u8 = 64;
+
+/// Errors associated with configuring the microVM.
+#[derive(Debug, PartialEq, thiserror::Error)]
+pub enum VmConfigError {
+ /// Cannot update the configuration of the microvm post boot.
+ #[error("update operation is not allowed after boot")]
+ UpdateNotAllowedPostBoot,
+
+ /// The max vcpu count is invalid.
+ #[error("the vCPU number shouldn't large than {}", MAX_SUPPORTED_VCPUS)]
+ VcpuCountExceedsMaximum,
+
+ /// The vcpu count is invalid. When hyperthreading is enabled, the `cpu_count` must be either
+ /// 1 or an even number.
+ #[error(
+ "the vCPU number '{0}' can only be 1 or an even number when hyperthreading is enabled"
+ )]
+ InvalidVcpuCount(u8),
+
+ /// The threads_per_core is invalid. It should be either 1 or 2.
+ #[error("the threads_per_core number '{0}' can only be 1 or 2")]
+ InvalidThreadsPerCore(u8),
+
+ /// The cores_per_die is invalid. It should be larger than 0.
+ #[error("the cores_per_die number '{0}' can only be larger than 0")]
+ InvalidCoresPerDie(u8),
+
+ /// The dies_per_socket is invalid. It should be larger than 0.
+ #[error("the dies_per_socket number '{0}' can only be larger than 0")]
+ InvalidDiesPerSocket(u8),
+
+ /// The socket number is invalid. It should be either 1 or 2.
+ #[error("the socket number '{0}' can only be 1 or 2")]
+ InvalidSocket(u8),
+
+    /// The max vcpu count inferred from the CPU topology (threads_per_core * cores_per_die * dies_per_socket * sockets) should be larger than or equal to vcpu_count.
+    #[error("the max vcpu count inferred from cpu topology '{0}' (threads_per_core * cores_per_die * dies_per_socket * sockets) should be larger than or equal to vcpu_count")]
+ InvalidCpuTopology(u8),
+
+ /// The max vcpu count is invalid.
+ #[error(
+ "the max vCPU number '{0}' shouldn't less than vCPU count and can only be 1 or an even number when hyperthreading is enabled"
+ )]
+ InvalidMaxVcpuCount(u8),
+
+ /// The memory size is invalid. The memory can only be an unsigned integer.
+ #[error("the memory size 0x{0:x}MiB is invalid")]
+ InvalidMemorySize(usize),
+
+ /// The hotplug memory size is invalid. The memory can only be an unsigned integer.
+ #[error(
+ "the hotplug memory size '{0}' (MiB) is invalid, must be multiple of {}",
+ MEMORY_HOTPLUG_ALIGHMENT
+ )]
+ InvalidHotplugMemorySize(usize),
+
+ /// The memory type is invalid.
+ #[error("the memory type '{0}' is invalid")]
+ InvalidMemType(String),
+
+ /// The memory file path is invalid.
+ #[error("the memory file path is invalid")]
+ InvalidMemFilePath(String),
+
+    /// NUMA region memory size is invalid
+    #[error("total size of memory in NUMA regions: {0}, should match the memory size in config")]
+    InvalidNumaRegionMemorySize(usize),
+
+    /// NUMA region vCPU count is invalid
+    #[error("total count of vCPUs in NUMA regions: {0}, should match the max vcpu count in config")]
+    InvalidNumaRegionCpuCount(u16),
+
+    /// NUMA region vCPU max id is invalid
+    #[error("max id of vCPUs in NUMA regions: {0}, should match the max vcpu count in config")]
+    InvalidNumaRegionCpuMaxId(u16),
+}
diff --git a/src/dragonball/src/api/v1/mod.rs b/src/dragonball/src/api/v1/mod.rs
new file mode 100644
index 0000000000..99e3075ebb
--- /dev/null
+++ b/src/dragonball/src/api/v1/mod.rs
@@ -0,0 +1,19 @@
+// Copyright (C) 2019-2022 Alibaba Cloud. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! API Version 1 related data structures to configure the vmm.
+
+mod vmm_action;
+pub use self::vmm_action::*;
+
+/// Wrapper for configuring the microVM boot source.
+mod boot_source;
+pub use self::boot_source::{BootSourceConfig, BootSourceConfigError, DEFAULT_KERNEL_CMDLINE};
+
+/// Wrapper over the microVM general information.
+mod instance_info;
+pub use self::instance_info::{InstanceInfo, InstanceState};
+
+/// Wrapper for configuring the memory and CPU of the microVM.
+mod machine_config;
+pub use self::machine_config::{VmConfigError, MAX_SUPPORTED_VCPUS};
diff --git a/src/dragonball/src/api/v1/vmm_action.rs b/src/dragonball/src/api/v1/vmm_action.rs
new file mode 100644
index 0000000000..06004f3f0c
--- /dev/null
+++ b/src/dragonball/src/api/v1/vmm_action.rs
@@ -0,0 +1,636 @@
+// Copyright (C) 2020-2022 Alibaba Cloud. All rights reserved.
+// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the THIRD-PARTY file.
+
+use std::fs::File;
+use std::sync::mpsc::{Receiver, Sender, TryRecvError};
+
+use log::{debug, error, info, warn};
+
+use crate::error::{Result, StartMicroVmError, StopMicrovmError};
+use crate::event_manager::EventManager;
+use crate::vm::{CpuTopology, KernelConfigInfo, VmConfigInfo};
+use crate::vmm::Vmm;
+
+use self::VmConfigError::*;
+use self::VmmActionError::MachineConfig;
+
+#[cfg(feature = "virtio-blk")]
+pub use crate::device_manager::blk_dev_mgr::{
+ BlockDeviceConfigInfo, BlockDeviceConfigUpdateInfo, BlockDeviceError, BlockDeviceMgr,
+};
+#[cfg(feature = "virtio-fs")]
+pub use crate::device_manager::fs_dev_mgr::{
+ FsDeviceConfigInfo, FsDeviceConfigUpdateInfo, FsDeviceError, FsDeviceMgr, FsMountConfigInfo,
+};
+#[cfg(feature = "virtio-net")]
+pub use crate::device_manager::virtio_net_dev_mgr::{
+ VirtioNetDeviceConfigInfo, VirtioNetDeviceConfigUpdateInfo, VirtioNetDeviceError,
+ VirtioNetDeviceMgr,
+};
+#[cfg(feature = "virtio-vsock")]
+pub use crate::device_manager::vsock_dev_mgr::{VsockDeviceConfigInfo, VsockDeviceError};
+
+use super::*;
+
+/// Wrapper for all errors associated with VMM actions.
+#[derive(Debug, thiserror::Error)]
+pub enum VmmActionError {
+ /// Invalid virtual machine instance ID.
+ #[error("the virtual machine instance ID is invalid")]
+ InvalidVMID,
+
+    /// Failed to hotplug because the upcall channel is not ready.
+    #[error("upcall not ready, can't hotplug device")]
+ UpcallNotReady,
+
+ /// The action `ConfigureBootSource` failed either because of bad user input or an internal
+ /// error.
+ #[error("failed to configure boot source for VM: {0}")]
+ BootSource(#[source] BootSourceConfigError),
+
+ /// The action `StartMicroVm` failed either because of bad user input or an internal error.
+ #[error("failed to boot the VM: {0}")]
+ StartMicroVm(#[source] StartMicroVmError),
+
+ /// The action `StopMicroVm` failed either because of bad user input or an internal error.
+ #[error("failed to shutdown the VM: {0}")]
+ StopMicrovm(#[source] StopMicrovmError),
+
+ /// One of the actions `GetVmConfiguration` or `SetVmConfiguration` failed either because of bad
+ /// input or an internal error.
+ #[error("failed to set configuration for the VM: {0}")]
+ MachineConfig(#[source] VmConfigError),
+
+ #[cfg(feature = "virtio-vsock")]
+ /// The action `InsertVsockDevice` failed either because of bad user input or an internal error.
+ #[error("failed to add virtio-vsock device: {0}")]
+ Vsock(#[source] VsockDeviceError),
+
+ #[cfg(feature = "virtio-blk")]
+ /// Block device related errors.
+ #[error("virtio-blk device error: {0}")]
+ Block(#[source] BlockDeviceError),
+
+ #[cfg(feature = "virtio-net")]
+ /// Net device related errors.
+ #[error("virtio-net device error: {0}")]
+ VirtioNet(#[source] VirtioNetDeviceError),
+
+ #[cfg(feature = "virtio-fs")]
+ /// The action `InsertFsDevice` failed either because of bad user input or an internal error.
+ #[error("virtio-fs device: {0}")]
+ FsDevice(#[source] FsDeviceError),
+}
+
+/// This enum represents the public interface of the VMM. Each action contains various
+/// bits of information (ids, paths, etc.).
+#[derive(Clone, Debug, PartialEq)]
+pub enum VmmAction {
+ /// Configure the boot source of the microVM using `BootSourceConfig`.
+ /// This action can only be called before the microVM has booted.
+ ConfigureBootSource(BootSourceConfig),
+
+ /// Launch the microVM. This action can only be called before the microVM has booted.
+ StartMicroVm,
+
+    /// Shut down the microVM. This action can only be called after the microVM has booted.
+    /// When the VMM is used as a crate by another process, this action shuts down the vCPU
+    /// threads and destroys all of the objects.
+ ShutdownMicroVm,
+
+ /// Get the configuration of the microVM.
+ GetVmConfiguration,
+
+ /// Set the microVM configuration (memory & vcpu) using `VmConfig` as input. This
+ /// action can only be called before the microVM has booted.
+ SetVmConfiguration(VmConfigInfo),
+
+ #[cfg(feature = "virtio-vsock")]
+ /// Add a new vsock device or update one that already exists using the
+ /// `VsockDeviceConfig` as input. This action can only be called before the microVM has
+ /// booted. The response is sent using the `OutcomeSender`.
+ InsertVsockDevice(VsockDeviceConfigInfo),
+
+ #[cfg(feature = "virtio-blk")]
+ /// Add a new block device or update one that already exists using the `BlockDeviceConfig` as
+ /// input. This action can only be called before the microVM has booted.
+ InsertBlockDevice(BlockDeviceConfigInfo),
+
+ #[cfg(feature = "virtio-blk")]
+    /// Remove an existing block device according to the given drive_id.
+ RemoveBlockDevice(String),
+
+ #[cfg(feature = "virtio-blk")]
+ /// Update a block device, after microVM start. Currently, the only updatable properties
+ /// are the RX and TX rate limiters.
+ UpdateBlockDevice(BlockDeviceConfigUpdateInfo),
+
+ #[cfg(feature = "virtio-net")]
+ /// Add a new network interface config or update one that already exists using the
+ /// `NetworkInterfaceConfig` as input. This action can only be called before the microVM has
+ /// booted. The response is sent using the `OutcomeSender`.
+ InsertNetworkDevice(VirtioNetDeviceConfigInfo),
+
+ #[cfg(feature = "virtio-net")]
+ /// Update a network interface, after microVM start. Currently, the only updatable properties
+ /// are the RX and TX rate limiters.
+ UpdateNetworkInterface(VirtioNetDeviceConfigUpdateInfo),
+
+ #[cfg(feature = "virtio-fs")]
+ /// Add a new shared fs device or update one that already exists using the
+ /// `FsDeviceConfig` as input. This action can only be called before the microVM has
+ /// booted.
+ InsertFsDevice(FsDeviceConfigInfo),
+
+ #[cfg(feature = "virtio-fs")]
+ /// Attach a new virtiofs Backend fs or detach an existing virtiofs Backend fs using the
+ /// `FsMountConfig` as input. This action can only be called _after_ the microVM has
+ /// booted.
+ ManipulateFsBackendFs(FsMountConfigInfo),
+
+ #[cfg(feature = "virtio-fs")]
+ /// Update fs rate limiter, after microVM start.
+ UpdateFsDevice(FsDeviceConfigUpdateInfo),
+}
+
+/// The enum represents the response sent by the VMM in case of success. The response is either
+/// empty, when no data needs to be sent, or an internal VMM structure.
+#[derive(Debug)]
+pub enum VmmData {
+ /// No data is sent on the channel.
+ Empty,
+ /// The microVM configuration represented by `VmConfigInfo`.
+    MachineConfiguration(Box<VmConfigInfo>),
+}
+
+/// Request data type used to communicate between the API and the VMM.
+pub type VmmRequest = Box<VmmAction>;
+
+/// Data type used to communicate between the API and the VMM.
+pub type VmmRequestResult = std::result::Result<VmmData, VmmActionError>;
+
+/// Response data type used to communicate between the API and the VMM.
+pub type VmmResponse = Box<VmmRequestResult>;
+
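These boxed aliases are what travels over the mpsc channel pair between the API thread and the VMM core. A minimal wiring sketch, assuming the embedding process owns both ends (the `Vmm`/`EventManager` construction and the event-loop wake-up mechanism are elided):

```rust
use std::sync::mpsc::{channel, Receiver, Sender};

// Hypothetical helper in the embedding process.
fn wire_vmm_service() -> (Sender<VmmRequest>, Receiver<VmmResponse>, VmmService) {
    let (to_vmm, from_api) = channel::<VmmRequest>();
    let (to_api, from_vmm) = channel::<VmmResponse>();
    let service = VmmService::new(from_api, to_api);
    (to_vmm, from_vmm, service)
}

// API side: to_vmm.send(Box::new(VmmAction::GetVmConfiguration)).unwrap();
// VMM side: inside its event loop, `service.run_vmm_action(&mut vmm, &mut event_mgr)`
// handles the request, and the reply arrives on `from_vmm`.
```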
+/// VMM Service to handle requests from the API server.
+///
+/// There are two levels of API servers as below:
+/// API client <--> VMM API Server <--> VMM Core
+pub struct VmmService {
+    from_api: Receiver<VmmRequest>,
+    to_api: Sender<VmmResponse>,
+ machine_config: VmConfigInfo,
+}
+
+impl VmmService {
+ /// Create a new VMM API server instance.
+    pub fn new(from_api: Receiver<VmmRequest>, to_api: Sender<VmmResponse>) -> Self {
+ VmmService {
+ from_api,
+ to_api,
+ machine_config: VmConfigInfo::default(),
+ }
+ }
+
+ /// Handle requests from the HTTP API Server and send back replies.
+ pub fn run_vmm_action(&mut self, vmm: &mut Vmm, event_mgr: &mut EventManager) -> Result<()> {
+ let request = match self.from_api.try_recv() {
+ Ok(t) => *t,
+ Err(TryRecvError::Empty) => {
+ warn!("Got a spurious notification from api thread");
+ return Ok(());
+ }
+ Err(TryRecvError::Disconnected) => {
+ panic!("The channel's sending half was disconnected. Cannot receive data.");
+ }
+ };
+ debug!("receive vmm action: {:?}", request);
+
+ let response = match request {
+ VmmAction::ConfigureBootSource(boot_source_body) => {
+ self.configure_boot_source(vmm, boot_source_body)
+ }
+ VmmAction::StartMicroVm => self.start_microvm(vmm, event_mgr),
+ VmmAction::ShutdownMicroVm => self.shutdown_microvm(vmm),
+ VmmAction::GetVmConfiguration => Ok(VmmData::MachineConfiguration(Box::new(
+ self.machine_config.clone(),
+ ))),
+ VmmAction::SetVmConfiguration(machine_config) => {
+ self.set_vm_configuration(vmm, machine_config)
+ }
+ #[cfg(feature = "virtio-vsock")]
+ VmmAction::InsertVsockDevice(vsock_cfg) => self.add_vsock_device(vmm, vsock_cfg),
+ #[cfg(feature = "virtio-blk")]
+ VmmAction::InsertBlockDevice(block_device_config) => {
+ self.add_block_device(vmm, event_mgr, block_device_config)
+ }
+ #[cfg(feature = "virtio-blk")]
+ VmmAction::UpdateBlockDevice(blk_update) => {
+ self.update_blk_rate_limiters(vmm, blk_update)
+ }
+ #[cfg(feature = "virtio-blk")]
+ VmmAction::RemoveBlockDevice(drive_id) => {
+ self.remove_block_device(vmm, event_mgr, &drive_id)
+ }
+ #[cfg(feature = "virtio-net")]
+ VmmAction::InsertNetworkDevice(virtio_net_cfg) => {
+ self.add_virtio_net_device(vmm, event_mgr, virtio_net_cfg)
+ }
+ #[cfg(feature = "virtio-net")]
+ VmmAction::UpdateNetworkInterface(netif_update) => {
+ self.update_net_rate_limiters(vmm, netif_update)
+ }
+ #[cfg(feature = "virtio-fs")]
+ VmmAction::InsertFsDevice(fs_cfg) => self.add_fs_device(vmm, fs_cfg),
+
+ #[cfg(feature = "virtio-fs")]
+ VmmAction::ManipulateFsBackendFs(fs_mount_cfg) => {
+ self.manipulate_fs_backend_fs(vmm, fs_mount_cfg)
+ }
+ #[cfg(feature = "virtio-fs")]
+ VmmAction::UpdateFsDevice(fs_update_cfg) => {
+ self.update_fs_rate_limiters(vmm, fs_update_cfg)
+ }
+ };
+
+ debug!("send vmm response: {:?}", response);
+ self.send_response(response)
+ }
+
+ fn send_response(&self, result: VmmRequestResult) -> Result<()> {
+ self.to_api
+ .send(Box::new(result))
+ .map_err(|_| ())
+ .expect("vmm: one-shot API result channel has been closed");
+
+ Ok(())
+ }
+
+ fn configure_boot_source(
+ &self,
+ vmm: &mut Vmm,
+ boot_source_config: BootSourceConfig,
+ ) -> VmmRequestResult {
+ use super::BootSourceConfigError::{
+ InvalidInitrdPath, InvalidKernelCommandLine, InvalidKernelPath,
+ UpdateNotAllowedPostBoot,
+ };
+ use super::VmmActionError::BootSource;
+
+ let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
+ if vm.is_vm_initialized() {
+ return Err(BootSource(UpdateNotAllowedPostBoot));
+ }
+
+ let kernel_file = File::open(&boot_source_config.kernel_path)
+ .map_err(|e| BootSource(InvalidKernelPath(e)))?;
+
+ let initrd_file = match boot_source_config.initrd_path {
+ None => None,
+ Some(ref path) => Some(File::open(path).map_err(|e| BootSource(InvalidInitrdPath(e)))?),
+ };
+
+ let mut cmdline = linux_loader::cmdline::Cmdline::new(dbs_boot::layout::CMDLINE_MAX_SIZE);
+ let boot_args = boot_source_config
+ .boot_args
+ .clone()
+ .unwrap_or_else(|| String::from(DEFAULT_KERNEL_CMDLINE));
+ cmdline
+ .insert_str(boot_args)
+ .map_err(|e| BootSource(InvalidKernelCommandLine(e)))?;
+
+ let kernel_config = KernelConfigInfo::new(kernel_file, initrd_file, cmdline);
+ vm.set_kernel_config(kernel_config);
+
+ Ok(VmmData::Empty)
+ }
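
Based on the fields read above, a boot-source request from the API side might look like the sketch below, reusing the `to_vmm` sender from the wiring sketch earlier. The field types, and any `BootSourceConfig` fields beyond these three, are assumptions; the paths are illustrative only.

```rust
// Hypothetical request; assumes BootSourceConfig can be built from these fields.
let boot_cfg = BootSourceConfig {
    kernel_path: "/path/to/vmlinux".to_owned(),
    initrd_path: None, // boot without an initrd
    boot_args: Some("console=ttyS0 reboot=k panic=1".to_owned()),
};
to_vmm
    .send(Box::new(VmmAction::ConfigureBootSource(boot_cfg)))
    .expect("vmm service channel closed");
```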
+
+ fn start_microvm(&mut self, vmm: &mut Vmm, event_mgr: &mut EventManager) -> VmmRequestResult {
+ use self::StartMicroVmError::MicroVMAlreadyRunning;
+ use self::VmmActionError::StartMicroVm;
+
+ let vmm_seccomp_filter = vmm.vmm_seccomp_filter();
+ let vcpu_seccomp_filter = vmm.vcpu_seccomp_filter();
+ let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
+ if vm.is_vm_initialized() {
+ return Err(StartMicroVm(MicroVMAlreadyRunning));
+ }
+
+ vm.start_microvm(event_mgr, vmm_seccomp_filter, vcpu_seccomp_filter)
+ .map(|_| VmmData::Empty)
+ .map_err(StartMicroVm)
+ }
+
+ fn shutdown_microvm(&mut self, vmm: &mut Vmm) -> VmmRequestResult {
+ vmm.event_ctx.exit_evt_triggered = true;
+
+ Ok(VmmData::Empty)
+ }
+
+ /// Set virtual machine configuration.
+ pub fn set_vm_configuration(
+ &mut self,
+ vmm: &mut Vmm,
+ machine_config: VmConfigInfo,
+ ) -> VmmRequestResult {
+ let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
+ if vm.is_vm_initialized() {
+ return Err(MachineConfig(UpdateNotAllowedPostBoot));
+ }
+
+        // Validate each field first, then apply the whole configuration at once.
+ let mut config = vm.vm_config().clone();
+ if config.vcpu_count != machine_config.vcpu_count {
+ let vcpu_count = machine_config.vcpu_count;
+ // Check that the vcpu_count value is >=1.
+ if vcpu_count == 0 {
+ return Err(MachineConfig(InvalidVcpuCount(vcpu_count)));
+ }
+ config.vcpu_count = vcpu_count;
+ }
+
+ if config.cpu_topology != machine_config.cpu_topology {
+ let cpu_topology = &machine_config.cpu_topology;
+ config.cpu_topology = handle_cpu_topology(cpu_topology, config.vcpu_count)?.clone();
+ } else {
+            // Topology unchanged: build the default single-socket topology.
+ let mut default_cpu_topology = CpuTopology {
+ threads_per_core: 1,
+ cores_per_die: config.vcpu_count,
+ dies_per_socket: 1,
+ sockets: 1,
+ };
+ if machine_config.max_vcpu_count > config.vcpu_count {
+ default_cpu_topology.cores_per_die = machine_config.max_vcpu_count;
+ }
+ config.cpu_topology = default_cpu_topology;
+ }
+ let cpu_topology = &config.cpu_topology;
+ let max_vcpu_from_topo = cpu_topology.threads_per_core
+ * cpu_topology.cores_per_die
+ * cpu_topology.dies_per_socket
+ * cpu_topology.sockets;
+        // If the max_vcpu_count inferred from cpu_topology differs from the requested
+        // max_vcpu_count, the inferred value wins. Currently, max_vcpu_count is only
+        // used when cpu_topology is not defined, where it helps determine cores_per_die
+        // for the default CPU topology.
+ let mut max_vcpu_count = machine_config.max_vcpu_count;
+ if max_vcpu_count < config.vcpu_count {
+ return Err(MachineConfig(InvalidMaxVcpuCount(max_vcpu_count)));
+ }
+ if max_vcpu_from_topo != max_vcpu_count {
+ max_vcpu_count = max_vcpu_from_topo;
+ info!("Since max_vcpu_count is not equal to cpu topo information, we have changed the max vcpu count to {}", max_vcpu_from_topo);
+ }
+ config.max_vcpu_count = max_vcpu_count;
+
+ config.cpu_pm = machine_config.cpu_pm;
+ config.mem_type = machine_config.mem_type;
+
+ let mem_size_mib_value = machine_config.mem_size_mib;
+        // Support at most 1 TiB of memory, 2 MiB aligned for huge pages.
+ if mem_size_mib_value == 0 || mem_size_mib_value > 0x10_0000 || mem_size_mib_value % 2 != 0
+ {
+ return Err(MachineConfig(InvalidMemorySize(mem_size_mib_value)));
+ }
+ config.mem_size_mib = mem_size_mib_value;
+
+ config.mem_file_path = machine_config.mem_file_path.clone();
+
+ if config.mem_type == "hugetlbfs" && config.mem_file_path.is_empty() {
+ return Err(MachineConfig(InvalidMemFilePath("".to_owned())));
+ }
+ config.vpmu_feature = machine_config.vpmu_feature;
+
+ let vm_id = vm.shared_info().read().unwrap().id.clone();
+ let serial_path = match machine_config.serial_path {
+ Some(value) => value,
+ None => {
+ if config.serial_path.is_none() {
+ String::from("/run/dragonball/") + &vm_id + "_com1"
+ } else {
+ // Safe to unwrap() because we have checked it has a value.
+ config.serial_path.as_ref().unwrap().clone()
+ }
+ }
+ };
+ config.serial_path = Some(serial_path);
+
+ vm.set_vm_config(config.clone());
+ self.machine_config = config;
+
+ Ok(VmmData::Empty)
+ }
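
The memory check above packs three rules into one condition: the size must be non-zero, at most 1 TiB (`0x10_0000` MiB), and even, so it stays 2 MiB aligned. A self-contained restatement for sanity-checking the boundaries:

```rust
/// Sketch: mirrors the mem_size_mib validation in set_vm_configuration.
fn mem_size_is_valid(mem_size_mib: u64) -> bool {
    mem_size_mib != 0 && mem_size_mib <= 0x10_0000 && mem_size_mib % 2 == 0
}

fn main() {
    assert!(!mem_size_is_valid(0)); // zero is rejected
    assert!(mem_size_is_valid(2048)); // 2 GiB, 2 MiB aligned
    assert!(!mem_size_is_valid(2049)); // odd MiB counts are rejected
    assert!(mem_size_is_valid(0x10_0000)); // exactly 1 TiB is allowed
    assert!(!mem_size_is_valid(0x10_0002)); // beyond 1 TiB is rejected
}
```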
+
+ #[cfg(feature = "virtio-vsock")]
+ fn add_vsock_device(&self, vmm: &mut Vmm, config: VsockDeviceConfigInfo) -> VmmRequestResult {
+ let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
+ if vm.is_vm_initialized() {
+ return Err(VmmActionError::Vsock(
+ VsockDeviceError::UpdateNotAllowedPostBoot,
+ ));
+ }
+
+ // VMADDR_CID_ANY (-1U) means any address for binding;
+ // VMADDR_CID_HYPERVISOR (0) is reserved for services built into the hypervisor;
+ // VMADDR_CID_RESERVED (1) must not be used;
+ // VMADDR_CID_HOST (2) is the well-known address of the host.
+ if config.guest_cid <= 2 {
+ return Err(VmmActionError::Vsock(VsockDeviceError::GuestCIDInvalid(
+ config.guest_cid,
+ )));
+ }
+
+ info!("add_vsock_device: {:?}", config);
+ let ctx = vm.create_device_op_context(None).map_err(|e| {
+ info!("create device op context error: {:?}", e);
+ VmmActionError::Vsock(VsockDeviceError::UpdateNotAllowedPostBoot)
+ })?;
+
+ vm.device_manager_mut()
+ .vsock_manager
+ .insert_device(ctx, config)
+ .map(|_| VmmData::Empty)
+ .map_err(VmmActionError::Vsock)
+ }
+
+ #[cfg(feature = "virtio-blk")]
+ // Only call this function as part of the API.
+ // If the drive_id does not exist, a new Block Device Config is added to the list.
+ fn add_block_device(
+ &mut self,
+ vmm: &mut Vmm,
+ event_mgr: &mut EventManager,
+ config: BlockDeviceConfigInfo,
+ ) -> VmmRequestResult {
+ let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
+ let ctx = vm
+ .create_device_op_context(Some(event_mgr.epoll_manager()))
+ .map_err(|e| {
+ if let StartMicroVmError::UpcallNotReady = e {
+ return VmmActionError::UpcallNotReady;
+ }
+ VmmActionError::Block(BlockDeviceError::UpdateNotAllowedPostBoot)
+ })?;
+
+ BlockDeviceMgr::insert_device(vm.device_manager_mut(), ctx, config)
+ .map(|_| VmmData::Empty)
+ .map_err(VmmActionError::Block)
+ }
+
+ #[cfg(feature = "virtio-blk")]
+    /// Updates the rate limiter configuration of an emulated block device as described in `config`.
+ fn update_blk_rate_limiters(
+ &mut self,
+ vmm: &mut Vmm,
+ config: BlockDeviceConfigUpdateInfo,
+ ) -> VmmRequestResult {
+ let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
+
+ BlockDeviceMgr::update_device_ratelimiters(vm.device_manager_mut(), config)
+ .map(|_| VmmData::Empty)
+ .map_err(VmmActionError::Block)
+ }
+
+ #[cfg(feature = "virtio-blk")]
+ // Remove the device
+ fn remove_block_device(
+ &mut self,
+ vmm: &mut Vmm,
+ event_mgr: &mut EventManager,
+ drive_id: &str,
+ ) -> VmmRequestResult {
+ let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
+ let ctx = vm
+ .create_device_op_context(Some(event_mgr.epoll_manager()))
+ .map_err(|_| VmmActionError::Block(BlockDeviceError::UpdateNotAllowedPostBoot))?;
+
+ BlockDeviceMgr::remove_device(vm.device_manager_mut(), ctx, drive_id)
+ .map(|_| VmmData::Empty)
+ .map_err(VmmActionError::Block)
+ }
+
+ #[cfg(feature = "virtio-net")]
+ fn add_virtio_net_device(
+ &mut self,
+ vmm: &mut Vmm,
+ event_mgr: &mut EventManager,
+ config: VirtioNetDeviceConfigInfo,
+ ) -> VmmRequestResult {
+ let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
+ let ctx = vm
+ .create_device_op_context(Some(event_mgr.epoll_manager()))
+ .map_err(|e| {
+ if let StartMicroVmError::MicroVMAlreadyRunning = e {
+ VmmActionError::VirtioNet(VirtioNetDeviceError::UpdateNotAllowedPostBoot)
+ } else if let StartMicroVmError::UpcallNotReady = e {
+ VmmActionError::UpcallNotReady
+ } else {
+ VmmActionError::StartMicroVm(e)
+ }
+ })?;
+
+ VirtioNetDeviceMgr::insert_device(vm.device_manager_mut(), ctx, config)
+ .map(|_| VmmData::Empty)
+ .map_err(VmmActionError::VirtioNet)
+ }
+
+ #[cfg(feature = "virtio-net")]
+ fn update_net_rate_limiters(
+ &mut self,
+ vmm: &mut Vmm,
+ config: VirtioNetDeviceConfigUpdateInfo,
+ ) -> VmmRequestResult {
+ let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
+
+ VirtioNetDeviceMgr::update_device_ratelimiters(vm.device_manager_mut(), config)
+ .map(|_| VmmData::Empty)
+ .map_err(VmmActionError::VirtioNet)
+ }
+
+ #[cfg(feature = "virtio-fs")]
+ fn add_fs_device(&mut self, vmm: &mut Vmm, config: FsDeviceConfigInfo) -> VmmRequestResult {
+ let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
+ let hotplug = vm.is_vm_initialized();
+ if !cfg!(feature = "hotplug") && hotplug {
+ return Err(VmmActionError::FsDevice(
+ FsDeviceError::UpdateNotAllowedPostBoot,
+ ));
+ }
+
+ let ctx = vm.create_device_op_context(None).map_err(|e| {
+ info!("create device op context error: {:?}", e);
+ VmmActionError::FsDevice(FsDeviceError::UpdateNotAllowedPostBoot)
+ })?;
+ FsDeviceMgr::insert_device(vm.device_manager_mut(), ctx, config)
+ .map(|_| VmmData::Empty)
+ .map_err(VmmActionError::FsDevice)
+ }
+
+ #[cfg(feature = "virtio-fs")]
+ fn manipulate_fs_backend_fs(
+ &self,
+ vmm: &mut Vmm,
+ config: FsMountConfigInfo,
+ ) -> VmmRequestResult {
+ let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
+
+ if !vm.is_vm_initialized() {
+ return Err(VmmActionError::FsDevice(FsDeviceError::MicroVMNotRunning));
+ }
+
+ FsDeviceMgr::manipulate_backend_fs(vm.device_manager_mut(), config)
+ .map(|_| VmmData::Empty)
+ .map_err(VmmActionError::FsDevice)
+ }
+
+ #[cfg(feature = "virtio-fs")]
+ fn update_fs_rate_limiters(
+ &self,
+ vmm: &mut Vmm,
+ config: FsDeviceConfigUpdateInfo,
+ ) -> VmmRequestResult {
+ let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
+
+ if !vm.is_vm_initialized() {
+ return Err(VmmActionError::FsDevice(FsDeviceError::MicroVMNotRunning));
+ }
+
+ FsDeviceMgr::update_device_ratelimiters(vm.device_manager_mut(), config)
+ .map(|_| VmmData::Empty)
+ .map_err(VmmActionError::FsDevice)
+ }
+}
+
+fn handle_cpu_topology(
+ cpu_topology: &CpuTopology,
+ vcpu_count: u8,
+) -> std::result::Result<&CpuTopology, VmmActionError> {
+ // Check if dies_per_socket, cores_per_die, threads_per_core and socket number is valid
+ if cpu_topology.threads_per_core < 1 || cpu_topology.threads_per_core > 2 {
+ return Err(MachineConfig(InvalidThreadsPerCore(
+ cpu_topology.threads_per_core,
+ )));
+ }
+ let vcpu_count_from_topo = cpu_topology
+ .sockets
+ .checked_mul(cpu_topology.dies_per_socket)
+ .ok_or(MachineConfig(VcpuCountExceedsMaximum))?
+ .checked_mul(cpu_topology.cores_per_die)
+ .ok_or(MachineConfig(VcpuCountExceedsMaximum))?
+ .checked_mul(cpu_topology.threads_per_core)
+ .ok_or(MachineConfig(VcpuCountExceedsMaximum))?;
+ if vcpu_count_from_topo > MAX_SUPPORTED_VCPUS {
+ return Err(MachineConfig(VcpuCountExceedsMaximum));
+ }
+ if vcpu_count_from_topo < vcpu_count {
+ return Err(MachineConfig(InvalidCpuTopology(vcpu_count_from_topo)));
+ }
+
+ Ok(cpu_topology)
+}
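
The vCPU count implied by a topology is the product of its four dimensions, computed with `checked_mul` so an overflow maps to `VcpuCountExceedsMaximum` rather than wrapping. A worked example with `u8` arithmetic, matching the `vcpu_count: u8` parameter above:

```rust
// Hypothetical topology: 1 socket x 1 die x 4 cores x 2 threads = 8 vCPUs.
let (sockets, dies, cores, threads) = (1u8, 1u8, 4u8, 2u8);
let vcpus = sockets
    .checked_mul(dies)
    .and_then(|v| v.checked_mul(cores))
    .and_then(|v| v.checked_mul(threads))
    .expect("topology product overflowed u8");
assert_eq!(vcpus, 8);
// A requested vcpu_count above this product fails with InvalidCpuTopology;
// a product above MAX_SUPPORTED_VCPUS fails with VcpuCountExceedsMaximum.
```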
diff --git a/src/dragonball/src/config_manager.rs b/src/dragonball/src/config_manager.rs
new file mode 100644
index 0000000000..f855be1266
--- /dev/null
+++ b/src/dragonball/src/config_manager.rs
@@ -0,0 +1,760 @@
+// Copyright (C) 2020-2022 Alibaba Cloud. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use std::convert::TryInto;
+use std::io;
+use std::ops::{Index, IndexMut};
+use std::sync::Arc;
+
+use dbs_device::DeviceIo;
+use dbs_utils::rate_limiter::{RateLimiter, TokenBucket};
+use serde_derive::{Deserialize, Serialize};
+
+/// Get bucket update for rate limiter.
+#[macro_export]
+macro_rules! get_bucket_update {
+ ($self:ident, $rate_limiter: ident, $metric: ident) => {{
+ match &$self.$rate_limiter {
+ Some(rl_cfg) => {
+ let tb_cfg = &rl_cfg.$metric;
+ dbs_utils::rate_limiter::RateLimiter::make_bucket(
+ tb_cfg.size,
+ tb_cfg.one_time_burst,
+ tb_cfg.refill_time,
+ )
+ // Updated active rate-limiter.
+ .map(dbs_utils::rate_limiter::BucketUpdate::Update)
+ // Updated/deactivated rate-limiter
+ .unwrap_or(dbs_utils::rate_limiter::BucketUpdate::Disabled)
+ }
+ // No update to the rate-limiter.
+ None => dbs_utils::rate_limiter::BucketUpdate::None,
+ }
+ }};
+}
+
+/// Trait for generic configuration information.
+pub trait ConfigItem {
+ /// Related errors.
+ type Err;
+
+ /// Get the unique identifier of the configuration item.
+ fn id(&self) -> &str;
+
+ /// Check whether current configuration item conflicts with another one.
+ fn check_conflicts(&self, other: &Self) -> std::result::Result<(), Self::Err>;
+}
+
+/// Struct to manage a group of configuration items.
+#[derive(Debug, Default, Deserialize, PartialEq, Serialize)]
+pub struct ConfigInfos<T>
+where
+    T: ConfigItem + Clone,
+{
+    configs: Vec<T>,
+}
+
+impl<T> ConfigInfos<T>
+where
+ T: ConfigItem + Clone + Default,
+{
+ /// Constructor
+ pub fn new() -> Self {
+ ConfigInfos::default()
+ }
+
+ /// Insert a configuration item in the group.
+ pub fn insert(&mut self, config: T) -> std::result::Result<(), T::Err> {
+ for item in self.configs.iter() {
+ config.check_conflicts(item)?;
+ }
+ self.configs.push(config);
+
+ Ok(())
+ }
+
+ /// Update a configuration item in the group.
+ pub fn update(&mut self, config: T, err: T::Err) -> std::result::Result<(), T::Err> {
+ match self.get_index_by_id(&config) {
+ None => Err(err),
+ Some(index) => {
+ for (idx, item) in self.configs.iter().enumerate() {
+ if idx != index {
+ config.check_conflicts(item)?;
+ }
+ }
+ self.configs[index] = config;
+ Ok(())
+ }
+ }
+ }
+
+ /// Insert or update a configuration item in the group.
+ pub fn insert_or_update(&mut self, config: T) -> std::result::Result<(), T::Err> {
+ match self.get_index_by_id(&config) {
+ None => {
+ for item in self.configs.iter() {
+ config.check_conflicts(item)?;
+ }
+
+ self.configs.push(config)
+ }
+ Some(index) => {
+ for (idx, item) in self.configs.iter().enumerate() {
+ if idx != index {
+ config.check_conflicts(item)?;
+ }
+ }
+ self.configs[index] = config;
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Remove the matching configuration entry.
+    pub fn remove(&mut self, config: &T) -> Option<T> {
+ if let Some(index) = self.get_index_by_id(config) {
+ Some(self.configs.remove(index))
+ } else {
+ None
+ }
+ }
+
+ /// Returns an immutable iterator over the config items
+    pub fn iter(&self) -> ::std::slice::Iter<T> {
+ self.configs.iter()
+ }
+
+ /// Get the configuration entry with matching ID.
+ pub fn get_by_id(&self, item: &T) -> Option<&T> {
+ let id = item.id();
+
+ self.configs.iter().rfind(|cfg| cfg.id() == id)
+ }
+
+    fn get_index_by_id(&self, item: &T) -> Option<usize> {
+ let id = item.id();
+ self.configs.iter().position(|cfg| cfg.id() == id)
+ }
+}
+
+impl<T> Clone for ConfigInfos<T>
+where
+ T: ConfigItem + Clone,
+{
+ fn clone(&self) -> Self {
+ ConfigInfos {
+ configs: self.configs.clone(),
+ }
+ }
+}
+
+/// Struct to maintain configuration information for a device.
+pub struct DeviceConfigInfo<T>
+where
+ T: ConfigItem + Clone,
+{
+ /// Configuration information for the device object.
+ pub config: T,
+ /// The associated device object.
+    pub device: Option<Arc<dyn DeviceIo>>,
+}
+
+impl<T> DeviceConfigInfo<T>
+where
+ T: ConfigItem + Clone,
+{
+    /// Create a new instance of [`DeviceConfigInfo`].
+ pub fn new(config: T) -> Self {
+ DeviceConfigInfo {
+ config,
+ device: None,
+ }
+ }
+
+    /// Create a new instance of [`DeviceConfigInfo`] with an optional device.
+    pub fn new_with_device(config: T, device: Option<Arc<dyn DeviceIo>>) -> Self {
+ DeviceConfigInfo { config, device }
+ }
+
+ /// Set the device object associated with the configuration.
+    pub fn set_device(&mut self, device: Arc<dyn DeviceIo>) {
+ self.device = Some(device);
+ }
+}
+
+impl<T> Clone for DeviceConfigInfo<T>
+where
+ T: ConfigItem + Clone,
+{
+ fn clone(&self) -> Self {
+ DeviceConfigInfo::new_with_device(self.config.clone(), self.device.clone())
+ }
+}
+
+/// Struct to maintain configuration information for a group of devices.
+pub struct DeviceConfigInfos<T>
+where
+    T: ConfigItem + Clone,
+{
+    info_list: Vec<DeviceConfigInfo<T>>,
+}
+
+impl<T> Default for DeviceConfigInfos<T>
+where
+ T: ConfigItem + Clone,
+{
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl<T> DeviceConfigInfos<T>
+where
+ T: ConfigItem + Clone,
+{
+    /// Create a new instance of [`DeviceConfigInfos`].
+ pub fn new() -> Self {
+ DeviceConfigInfos {
+ info_list: Vec::new(),
+ }
+ }
+
+ /// Insert or update configuration information for a device.
+    pub fn insert_or_update(&mut self, config: &T) -> std::result::Result<usize, T::Err> {
+ let device_info = DeviceConfigInfo::new(config.clone());
+ Ok(match self.get_index_by_id(config) {
+ Some(index) => {
+ for (idx, info) in self.info_list.iter().enumerate() {
+ if idx != index {
+ info.config.check_conflicts(config)?;
+ }
+ }
+ self.info_list[index] = device_info;
+ index
+ }
+ None => {
+ for info in self.info_list.iter() {
+ info.config.check_conflicts(config)?;
+ }
+ self.info_list.push(device_info);
+ self.info_list.len() - 1
+ }
+ })
+ }
+
+ /// Remove a device configuration information object.
+    pub fn remove(&mut self, index: usize) -> Option<DeviceConfigInfo<T>> {
+ if self.info_list.len() > index {
+ Some(self.info_list.remove(index))
+ } else {
+ None
+ }
+ }
+
+ /// Get number of device configuration information objects.
+ pub fn len(&self) -> usize {
+ self.info_list.len()
+ }
+
+    /// Returns true if there are no device configuration information objects.
+ pub fn is_empty(&self) -> bool {
+ self.info_list.len() == 0
+ }
+
+ /// Add a device configuration information object at the tail.
+    pub fn push(&mut self, info: DeviceConfigInfo<T>) {
+ self.info_list.push(info);
+ }
+
+ /// Iterator for configuration information objects.
+    pub fn iter(&self) -> std::slice::Iter<DeviceConfigInfo<T>> {
+ self.info_list.iter()
+ }
+
+ /// Mutable iterator for configuration information objects.
+    pub fn iter_mut(&mut self) -> std::slice::IterMut<DeviceConfigInfo<T>> {
+ self.info_list.iter_mut()
+ }
+
+    fn get_index_by_id(&self, config: &T) -> Option<usize> {
+ self.info_list
+ .iter()
+ .position(|info| info.config.id().eq(config.id()))
+ }
+}
+
+impl<T> Index<usize> for DeviceConfigInfos<T>
+where
+ T: ConfigItem + Clone,
+{
+    type Output = DeviceConfigInfo<T>;
+ fn index(&self, idx: usize) -> &Self::Output {
+ &self.info_list[idx]
+ }
+}
+
+impl<T> IndexMut<usize> for DeviceConfigInfos<T>
+where
+ T: ConfigItem + Clone,
+{
+ fn index_mut(&mut self, idx: usize) -> &mut Self::Output {
+ &mut self.info_list[idx]
+ }
+}
+
+impl<T> Clone for DeviceConfigInfos<T>
+where
+ T: ConfigItem + Clone,
+{
+ fn clone(&self) -> Self {
+ DeviceConfigInfos {
+ info_list: self.info_list.clone(),
+ }
+ }
+}
+
+/// Configuration information for RateLimiter token bucket.
+#[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)]
+pub struct TokenBucketConfigInfo {
+ /// The size for the token bucket. A TokenBucket of `size` total capacity will take `refill_time`
+ /// milliseconds to go from zero tokens to total capacity.
+ pub size: u64,
+ /// Number of free initial tokens, that can be consumed at no cost.
+ pub one_time_burst: u64,
+ /// Complete refill time in milliseconds.
+ pub refill_time: u64,
+}
+
+impl TokenBucketConfigInfo {
+ fn resize(&mut self, n: u64) {
+ if n != 0 {
+ self.size /= n;
+ self.one_time_burst /= n;
+ }
+ }
+}
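
`resize(n)` splits the bucket budget between `n` consumers by dividing both the capacity and the one-time burst, leaving `refill_time` untouched. A quick worked example (note the method is private to this module, so this only compiles from within it):

```rust
let mut tb = TokenBucketConfigInfo {
    size: 1000,
    one_time_burst: 100,
    refill_time: 10, // milliseconds; unaffected by resize
};
tb.resize(4); // e.g. split across 4 virtio queues, as create_blk_device does
assert_eq!((tb.size, tb.one_time_burst, tb.refill_time), (250, 25, 10));
```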
+
+impl From<TokenBucketConfigInfo> for TokenBucket {
+ fn from(t: TokenBucketConfigInfo) -> TokenBucket {
+ (&t).into()
+ }
+}
+
+impl From<&TokenBucketConfigInfo> for TokenBucket {
+ fn from(t: &TokenBucketConfigInfo) -> TokenBucket {
+ TokenBucket::new(t.size, t.one_time_burst, t.refill_time)
+ }
+}
+
+/// Configuration information for RateLimiter objects.
+#[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)]
+pub struct RateLimiterConfigInfo {
+ /// Data used to initialize the RateLimiter::bandwidth bucket.
+ pub bandwidth: TokenBucketConfigInfo,
+ /// Data used to initialize the RateLimiter::ops bucket.
+ pub ops: TokenBucketConfigInfo,
+}
+
+impl RateLimiterConfigInfo {
+ /// Update the bandwidth budget configuration.
+ pub fn update_bandwidth(&mut self, new_config: TokenBucketConfigInfo) {
+ self.bandwidth = new_config;
+ }
+
+ /// Update the ops budget configuration.
+ pub fn update_ops(&mut self, new_config: TokenBucketConfigInfo) {
+ self.ops = new_config;
+ }
+
+    /// Resize the limiter to 1/n of its original budget.
+ pub fn resize(&mut self, n: u64) {
+ self.bandwidth.resize(n);
+ self.ops.resize(n);
+ }
+}
+
+impl TryInto<RateLimiter> for &RateLimiterConfigInfo {
+ type Error = io::Error;
+
+    fn try_into(self) -> Result<RateLimiter, Self::Error> {
+ RateLimiter::new(
+ self.bandwidth.size,
+ self.bandwidth.one_time_burst,
+ self.bandwidth.refill_time,
+ self.ops.size,
+ self.ops.one_time_burst,
+ self.ops.refill_time,
+ )
+ }
+}
+
+impl TryInto<RateLimiter> for RateLimiterConfigInfo {
+ type Error = io::Error;
+
+    fn try_into(self) -> Result<RateLimiter, Self::Error> {
+ RateLimiter::new(
+ self.bandwidth.size,
+ self.bandwidth.one_time_burst,
+ self.bandwidth.refill_time,
+ self.ops.size,
+ self.ops.one_time_burst,
+ self.ops.refill_time,
+ )
+ }
+}
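
With these two impls a configuration converts directly into a live `RateLimiter`; the conversion is fallible because `RateLimiter::new` returns an `io::Error` on failure. A usage sketch with illustrative bucket values:

```rust
use std::convert::TryInto;
use dbs_utils::rate_limiter::RateLimiter;

let cfg = RateLimiterConfigInfo {
    bandwidth: TokenBucketConfigInfo { size: 1 << 20, one_time_burst: 0, refill_time: 100 },
    ops: TokenBucketConfigInfo { size: 1000, one_time_burst: 0, refill_time: 100 },
};
// The borrowing impl lets the config be reused after conversion.
let limiter: RateLimiter = (&cfg).try_into().expect("failed to build rate limiter");
```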
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[derive(Debug, thiserror::Error)]
+ pub enum DummyError {
+ #[error("configuration entry exists")]
+ Exist,
+ }
+
+ #[derive(Clone, Debug, Default)]
+ pub struct DummyConfigInfo {
+ id: String,
+ content: String,
+ }
+
+ impl ConfigItem for DummyConfigInfo {
+ type Err = DummyError;
+
+ fn id(&self) -> &str {
+ &self.id
+ }
+
+ fn check_conflicts(&self, other: &Self) -> Result<(), DummyError> {
+ if self.id == other.id || self.content == other.content {
+ Err(DummyError::Exist)
+ } else {
+ Ok(())
+ }
+ }
+ }
+
+    type DummyConfigInfos = ConfigInfos<DummyConfigInfo>;
+
+ #[test]
+ fn test_insert_config_info() {
+ let mut configs = DummyConfigInfos::new();
+
+ let config1 = DummyConfigInfo {
+ id: "1".to_owned(),
+ content: "a".to_owned(),
+ };
+ configs.insert(config1).unwrap();
+ assert_eq!(configs.configs.len(), 1);
+ assert_eq!(configs.configs[0].id, "1");
+ assert_eq!(configs.configs[0].content, "a");
+
+ // Test case: cannot insert new item with the same id.
+ let config2 = DummyConfigInfo {
+ id: "1".to_owned(),
+ content: "b".to_owned(),
+ };
+ configs.insert(config2).unwrap_err();
+ assert_eq!(configs.configs.len(), 1);
+ assert_eq!(configs.configs[0].id, "1");
+ assert_eq!(configs.configs[0].content, "a");
+
+ let config3 = DummyConfigInfo {
+ id: "2".to_owned(),
+ content: "c".to_owned(),
+ };
+ configs.insert(config3).unwrap();
+ assert_eq!(configs.configs.len(), 2);
+ assert_eq!(configs.configs[0].id, "1");
+ assert_eq!(configs.configs[0].content, "a");
+ assert_eq!(configs.configs[1].id, "2");
+ assert_eq!(configs.configs[1].content, "c");
+
+ // Test case: cannot insert new item with the same content.
+ let config4 = DummyConfigInfo {
+ id: "3".to_owned(),
+ content: "c".to_owned(),
+ };
+ configs.insert(config4).unwrap_err();
+ assert_eq!(configs.configs.len(), 2);
+ assert_eq!(configs.configs[0].id, "1");
+ assert_eq!(configs.configs[0].content, "a");
+ assert_eq!(configs.configs[1].id, "2");
+ assert_eq!(configs.configs[1].content, "c");
+ }
+
+ #[test]
+ fn test_update_config_info() {
+ let mut configs = DummyConfigInfos::new();
+
+ let config1 = DummyConfigInfo {
+ id: "1".to_owned(),
+ content: "a".to_owned(),
+ };
+ configs.insert(config1).unwrap();
+ assert_eq!(configs.configs.len(), 1);
+ assert_eq!(configs.configs[0].id, "1");
+ assert_eq!(configs.configs[0].content, "a");
+
+ // Test case: succeed to update an existing entry
+ let config2 = DummyConfigInfo {
+ id: "1".to_owned(),
+ content: "b".to_owned(),
+ };
+ configs.update(config2, DummyError::Exist).unwrap();
+ assert_eq!(configs.configs.len(), 1);
+ assert_eq!(configs.configs[0].id, "1");
+ assert_eq!(configs.configs[0].content, "b");
+
+ // Test case: cannot update a non-existing entry
+ let config3 = DummyConfigInfo {
+ id: "2".to_owned(),
+ content: "c".to_owned(),
+ };
+ configs.update(config3, DummyError::Exist).unwrap_err();
+ assert_eq!(configs.configs.len(), 1);
+ assert_eq!(configs.configs[0].id, "1");
+ assert_eq!(configs.configs[0].content, "b");
+
+ // Test case: cannot update an entry with conflicting content
+ let config4 = DummyConfigInfo {
+ id: "2".to_owned(),
+ content: "c".to_owned(),
+ };
+ configs.insert(config4).unwrap();
+ let config5 = DummyConfigInfo {
+ id: "1".to_owned(),
+ content: "c".to_owned(),
+ };
+ configs.update(config5, DummyError::Exist).unwrap_err();
+ }
+
+ #[test]
+ fn test_insert_or_update_config_info() {
+ let mut configs = DummyConfigInfos::new();
+
+ let config1 = DummyConfigInfo {
+ id: "1".to_owned(),
+ content: "a".to_owned(),
+ };
+ configs.insert_or_update(config1).unwrap();
+ assert_eq!(configs.configs.len(), 1);
+ assert_eq!(configs.configs[0].id, "1");
+ assert_eq!(configs.configs[0].content, "a");
+
+ // Test case: succeed to update an existing entry
+ let config2 = DummyConfigInfo {
+ id: "1".to_owned(),
+ content: "b".to_owned(),
+ };
+ configs.insert_or_update(config2.clone()).unwrap();
+ assert_eq!(configs.configs.len(), 1);
+ assert_eq!(configs.configs[0].id, "1");
+ assert_eq!(configs.configs[0].content, "b");
+
+ // Add a second entry
+ let config3 = DummyConfigInfo {
+ id: "2".to_owned(),
+ content: "c".to_owned(),
+ };
+ configs.insert_or_update(config3.clone()).unwrap();
+ assert_eq!(configs.configs.len(), 2);
+ assert_eq!(configs.configs[0].id, "1");
+ assert_eq!(configs.configs[0].content, "b");
+ assert_eq!(configs.configs[1].id, "2");
+ assert_eq!(configs.configs[1].content, "c");
+
+ // Lookup the first entry
+ let config4 = configs
+ .get_by_id(&DummyConfigInfo {
+ id: "1".to_owned(),
+ content: "b".to_owned(),
+ })
+ .unwrap();
+ assert_eq!(config4.id, config2.id);
+ assert_eq!(config4.content, config2.content);
+
+ // Lookup the second entry
+ let config5 = configs
+ .get_by_id(&DummyConfigInfo {
+ id: "2".to_owned(),
+ content: "c".to_owned(),
+ })
+ .unwrap();
+ assert_eq!(config5.id, config3.id);
+ assert_eq!(config5.content, config3.content);
+
+ // Test case: can't insert an entry with conflicting content
+ let config6 = DummyConfigInfo {
+ id: "3".to_owned(),
+ content: "c".to_owned(),
+ };
+ configs.insert_or_update(config6).unwrap_err();
+ assert_eq!(configs.configs.len(), 2);
+ assert_eq!(configs.configs[0].id, "1");
+ assert_eq!(configs.configs[0].content, "b");
+ assert_eq!(configs.configs[1].id, "2");
+ assert_eq!(configs.configs[1].content, "c");
+ }
+
+ #[test]
+ fn test_remove_config_info() {
+ let mut configs = DummyConfigInfos::new();
+
+ let config1 = DummyConfigInfo {
+ id: "1".to_owned(),
+ content: "a".to_owned(),
+ };
+ configs.insert_or_update(config1).unwrap();
+ let config2 = DummyConfigInfo {
+ id: "1".to_owned(),
+ content: "b".to_owned(),
+ };
+ configs.insert_or_update(config2.clone()).unwrap();
+ let config3 = DummyConfigInfo {
+ id: "2".to_owned(),
+ content: "c".to_owned(),
+ };
+ configs.insert_or_update(config3.clone()).unwrap();
+ assert_eq!(configs.configs.len(), 2);
+ assert_eq!(configs.configs[0].id, "1");
+ assert_eq!(configs.configs[0].content, "b");
+ assert_eq!(configs.configs[1].id, "2");
+ assert_eq!(configs.configs[1].content, "c");
+
+ let config4 = configs
+ .remove(&DummyConfigInfo {
+ id: "1".to_owned(),
+ content: "no value".to_owned(),
+ })
+ .unwrap();
+ assert_eq!(config4.id, config2.id);
+ assert_eq!(config4.content, config2.content);
+ assert_eq!(configs.configs.len(), 1);
+ assert_eq!(configs.configs[0].id, "2");
+ assert_eq!(configs.configs[0].content, "c");
+
+ let config5 = configs
+ .remove(&DummyConfigInfo {
+ id: "2".to_owned(),
+ content: "no value".to_owned(),
+ })
+ .unwrap();
+ assert_eq!(config5.id, config3.id);
+ assert_eq!(config5.content, config3.content);
+ assert_eq!(configs.configs.len(), 0);
+ }
+
+    type DummyDeviceInfoList = DeviceConfigInfos<DummyConfigInfo>;
+
+ #[test]
+ fn test_insert_or_update_device_info() {
+ let mut configs = DummyDeviceInfoList::new();
+
+ let config1 = DummyConfigInfo {
+ id: "1".to_owned(),
+ content: "a".to_owned(),
+ };
+ configs.insert_or_update(&config1).unwrap();
+ assert_eq!(configs.len(), 1);
+ assert_eq!(configs[0].config.id, "1");
+ assert_eq!(configs[0].config.content, "a");
+
+ // Test case: succeed to update an existing entry
+ let config2 = DummyConfigInfo {
+ id: "1".to_owned(),
+ content: "b".to_owned(),
+ };
+        configs.insert_or_update(&config2).unwrap();
+ assert_eq!(configs.len(), 1);
+ assert_eq!(configs[0].config.id, "1");
+ assert_eq!(configs[0].config.content, "b");
+
+ // Add a second entry
+ let config3 = DummyConfigInfo {
+ id: "2".to_owned(),
+ content: "c".to_owned(),
+ };
+ configs.insert_or_update(&config3).unwrap();
+ assert_eq!(configs.len(), 2);
+ assert_eq!(configs[0].config.id, "1");
+ assert_eq!(configs[0].config.content, "b");
+ assert_eq!(configs[1].config.id, "2");
+ assert_eq!(configs[1].config.content, "c");
+
+ // Lookup the first entry
+ let config4_id = configs
+ .get_index_by_id(&DummyConfigInfo {
+ id: "1".to_owned(),
+ content: "b".to_owned(),
+ })
+ .unwrap();
+ let config4 = &configs[config4_id].config;
+ assert_eq!(config4.id, config2.id);
+ assert_eq!(config4.content, config2.content);
+
+ // Lookup the second entry
+ let config5_id = configs
+ .get_index_by_id(&DummyConfigInfo {
+ id: "2".to_owned(),
+ content: "c".to_owned(),
+ })
+ .unwrap();
+ let config5 = &configs[config5_id].config;
+ assert_eq!(config5.id, config3.id);
+ assert_eq!(config5.content, config3.content);
+
+ // Test case: can't insert an entry with conflicting content
+ let config6 = DummyConfigInfo {
+ id: "3".to_owned(),
+ content: "c".to_owned(),
+ };
+ configs.insert_or_update(&config6).unwrap_err();
+ assert_eq!(configs.len(), 2);
+ assert_eq!(configs[0].config.id, "1");
+ assert_eq!(configs[0].config.content, "b");
+ assert_eq!(configs[1].config.id, "2");
+ assert_eq!(configs[1].config.content, "c");
+ }
+
+ #[test]
+ fn test_remove_device_info() {
+ let mut configs = DummyDeviceInfoList::new();
+
+ let config1 = DummyConfigInfo {
+ id: "1".to_owned(),
+ content: "a".to_owned(),
+ };
+ configs.insert_or_update(&config1).unwrap();
+ let config2 = DummyConfigInfo {
+ id: "1".to_owned(),
+ content: "b".to_owned(),
+ };
+ configs.insert_or_update(&config2).unwrap();
+ let config3 = DummyConfigInfo {
+ id: "2".to_owned(),
+ content: "c".to_owned(),
+ };
+ configs.insert_or_update(&config3).unwrap();
+ assert_eq!(configs.len(), 2);
+ assert_eq!(configs[0].config.id, "1");
+ assert_eq!(configs[0].config.content, "b");
+ assert_eq!(configs[1].config.id, "2");
+ assert_eq!(configs[1].config.content, "c");
+
+ let config4 = configs.remove(0).unwrap().config;
+ assert_eq!(config4.id, config2.id);
+ assert_eq!(config4.content, config2.content);
+ assert_eq!(configs.len(), 1);
+ assert_eq!(configs[0].config.id, "2");
+ assert_eq!(configs[0].config.content, "c");
+
+ let config5 = configs.remove(0).unwrap().config;
+ assert_eq!(config5.id, config3.id);
+ assert_eq!(config5.content, config3.content);
+ assert_eq!(configs.len(), 0);
+ }
+}
diff --git a/src/dragonball/src/device_manager/blk_dev_mgr.rs b/src/dragonball/src/device_manager/blk_dev_mgr.rs
new file mode 100644
index 0000000000..e4688b4f6f
--- /dev/null
+++ b/src/dragonball/src/device_manager/blk_dev_mgr.rs
@@ -0,0 +1,773 @@
+// Copyright 2020-2022 Alibaba, Inc. or its affiliates. All Rights Reserved.
+// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the THIRD-PARTY file.
+
+//! Device manager for virtio-blk and vhost-user-blk devices.
+use std::collections::{vec_deque, VecDeque};
+use std::convert::TryInto;
+use std::fs::OpenOptions;
+use std::os::unix::fs::OpenOptionsExt;
+use std::os::unix::io::AsRawFd;
+use std::path::{Path, PathBuf};
+use std::sync::Arc;
+
+use dbs_virtio_devices as virtio;
+use dbs_virtio_devices::block::{aio::Aio, io_uring::IoUring, Block, LocalFile, Ufile};
+use serde_derive::{Deserialize, Serialize};
+
+use crate::address_space_manager::GuestAddressSpaceImpl;
+use crate::config_manager::{ConfigItem, DeviceConfigInfo, RateLimiterConfigInfo};
+use crate::device_manager::blk_dev_mgr::BlockDeviceError::InvalidDeviceId;
+use crate::device_manager::{DeviceManager, DeviceMgrError, DeviceOpContext};
+use crate::get_bucket_update;
+use crate::vm::KernelConfigInfo;
+
+use super::DbsMmioV2Device;
+
+// The flag of whether to use the shared irq.
+const USE_SHARED_IRQ: bool = true;
+// The flag of whether to use the generic irq.
+const USE_GENERIC_IRQ: bool = true;
+
+macro_rules! info(
+ ($l:expr, $($args:tt)+) => {
+ slog::info!($l, $($args)+; slog::o!("subsystem" => "block_manager"))
+ };
+);
+
+macro_rules! error(
+ ($l:expr, $($args:tt)+) => {
+ slog::error!($l, $($args)+; slog::o!("subsystem" => "block_manager"))
+ };
+);
+
+/// Default queue size for virtio block devices.
+pub const QUEUE_SIZE: u16 = 128;
+
+/// Errors associated with the operations allowed on a drive.
+#[derive(Debug, thiserror::Error)]
+pub enum BlockDeviceError {
+ /// Invalid VM instance ID.
+ #[error("invalid VM instance id")]
+ InvalidVMID,
+
+ /// The block device path is invalid.
+ #[error("invalid block device path '{0}'")]
+ InvalidBlockDevicePath(PathBuf),
+
+ /// The block device type is invalid.
+ #[error("invalid block device type")]
+ InvalidBlockDeviceType,
+
+ /// The block device path was already used for a different drive.
+ #[error("block device path '{0}' already exists")]
+ BlockDevicePathAlreadyExists(PathBuf),
+
+ /// The device id doesn't exist.
+ #[error("invalid block device id '{0}'")]
+ InvalidDeviceId(String),
+
+ /// Cannot perform the requested operation after booting the microVM.
+ #[error("block device does not support runtime update")]
+ UpdateNotAllowedPostBoot,
+
+ /// A root block device was already added.
+ #[error("could not add multiple virtual machine root devices")]
+ RootBlockDeviceAlreadyAdded,
+
+ /// Failed to send patch message to block epoll handler.
+ #[error("could not send patch message to the block epoll handler")]
+ BlockEpollHanderSendFail,
+
+ /// Failure from device manager,
+ #[error("device manager errors: {0}")]
+ DeviceManager(#[from] DeviceMgrError),
+
+ /// Failure from virtio subsystem.
+ #[error(transparent)]
+ Virtio(virtio::Error),
+
+ /// Unable to seek the block device backing file due to invalid permissions or
+ /// the file was deleted/corrupted.
+ #[error("cannot create block device: {0}")]
+ CreateBlockDevice(#[source] virtio::Error),
+
+ /// Cannot open the block device backing file.
+ #[error("cannot open the block device backing file: {0}")]
+ OpenBlockDevice(#[source] std::io::Error),
+
+ /// Cannot initialize a MMIO Block Device or add a device to the MMIO Bus.
+ #[error("failure while registering block device: {0}")]
+ RegisterBlockDevice(#[source] DeviceMgrError),
+}
+
+/// Type of low level storage device/protocol for virtio-blk devices.
+#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
+pub enum BlockDeviceType {
+ /// Unknown low level device type.
+ Unknown,
+ /// Vhost-user-blk based low level device.
+ /// SPOOL is a reliable NVMe virtualization system for the cloud environment.
+ /// You could learn more SPOOL here: https://www.usenix.org/conference/atc20/presentation/xue
+ Spool,
+ /// Local disk/file based low level device.
+ RawBlock,
+}
+
+impl BlockDeviceType {
+ /// Get type of low level storage device/protocol by parsing `path`.
+ pub fn get_type(path: &str) -> BlockDeviceType {
+        // A SPOOL path starts with "spool:", e.g. "spool:/device1".
+ if path.starts_with("spool:/") {
+ BlockDeviceType::Spool
+ } else {
+ BlockDeviceType::RawBlock
+ }
+ }
+}
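
`get_type` is a pure prefix check, so its behavior pins down easily (note that a bare `spool` without the `:/` separator falls through to `RawBlock`):

```rust
assert_eq!(BlockDeviceType::get_type("spool:/device1"), BlockDeviceType::Spool);
assert_eq!(BlockDeviceType::get_type("/dev/vdb"), BlockDeviceType::RawBlock);
assert_eq!(BlockDeviceType::get_type("spool0"), BlockDeviceType::RawBlock);
```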
+
+/// Configuration information for a block device.
+#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
+pub struct BlockDeviceConfigUpdateInfo {
+ /// Unique identifier of the drive.
+ pub drive_id: String,
+ /// Rate Limiter for I/O operations.
+    pub rate_limiter: Option<RateLimiterConfigInfo>,
+}
+
+impl BlockDeviceConfigUpdateInfo {
+ /// Provides a `BucketUpdate` description for the bandwidth rate limiter.
+ pub fn bytes(&self) -> dbs_utils::rate_limiter::BucketUpdate {
+ get_bucket_update!(self, rate_limiter, bandwidth)
+ }
+ /// Provides a `BucketUpdate` description for the ops rate limiter.
+ pub fn ops(&self) -> dbs_utils::rate_limiter::BucketUpdate {
+ get_bucket_update!(self, rate_limiter, ops)
+ }
+}
+
+/// Configuration information for a block device.
+#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
+pub struct BlockDeviceConfigInfo {
+ /// Unique identifier of the drive.
+ pub drive_id: String,
+ /// Type of low level storage/protocol.
+ pub device_type: BlockDeviceType,
+ /// Path of the drive.
+ pub path_on_host: PathBuf,
+ /// If set to true, it makes the current device the root block device.
+ /// Setting this flag to true will mount the block device in the
+ /// guest under /dev/vda unless the part_uuid is present.
+ pub is_root_device: bool,
+ /// Part-UUID. Represents the unique id of the boot partition of this device.
+ /// It is optional and it will be used only if the `is_root_device` field is true.
+    pub part_uuid: Option<String>,
+ /// If set to true, the drive is opened in read-only mode. Otherwise, the
+ /// drive is opened as read-write.
+ pub is_read_only: bool,
+ /// If set to false, the drive is opened with buffered I/O mode. Otherwise, the
+ /// drive is opened with direct I/O mode.
+ pub is_direct: bool,
+ /// Don't close `path_on_host` file when dropping the device.
+ pub no_drop: bool,
+ /// Block device multi-queue
+ pub num_queues: usize,
+    /// Virtio queue size, in number of descriptors.
+ pub queue_size: u16,
+ /// Rate Limiter for I/O operations.
+    pub rate_limiter: Option<RateLimiterConfigInfo>,
+    /// Use shared irq
+    pub use_shared_irq: Option<bool>,
+    /// Use generic irq
+    pub use_generic_irq: Option<bool>,
+}
+
+impl std::default::Default for BlockDeviceConfigInfo {
+ fn default() -> Self {
+ Self {
+ drive_id: String::default(),
+ device_type: BlockDeviceType::RawBlock,
+ path_on_host: PathBuf::default(),
+ is_root_device: false,
+ part_uuid: None,
+ is_read_only: false,
+ is_direct: Self::default_direct(),
+ no_drop: Self::default_no_drop(),
+ num_queues: Self::default_num_queues(),
+ queue_size: 256,
+ rate_limiter: None,
+ use_shared_irq: None,
+ use_generic_irq: None,
+ }
+ }
+}
+
+impl BlockDeviceConfigInfo {
+ /// Get default queue numbers
+ pub fn default_num_queues() -> usize {
+ 1
+ }
+
+ /// Get default value of is_direct switch
+ pub fn default_direct() -> bool {
+ true
+ }
+
+ /// Get default value of no_drop switch
+ pub fn default_no_drop() -> bool {
+ false
+ }
+
+ /// Get type of low level storage/protocol.
+ pub fn device_type(&self) -> BlockDeviceType {
+ self.device_type
+ }
+
+ /// Returns a reference to `path_on_host`.
+ pub fn path_on_host(&self) -> &PathBuf {
+ &self.path_on_host
+ }
+
+ /// Returns a reference to the part_uuid.
+ pub fn get_part_uuid(&self) -> Option<&String> {
+ self.part_uuid.as_ref()
+ }
+
+ /// Checks whether the drive had read only permissions.
+ pub fn is_read_only(&self) -> bool {
+ self.is_read_only
+ }
+
+ /// Checks whether the drive uses direct I/O
+ pub fn is_direct(&self) -> bool {
+ self.is_direct
+ }
+
+ /// Get number and size of queues supported.
+    pub fn queue_sizes(&self) -> Vec<u16> {
+        (0..self.num_queues)
+            .map(|_| self.queue_size)
+            .collect::<Vec<u16>>()
+ }
+}
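
Since the struct implements `Default`, a typical configuration only overrides a few fields. A sketch of a read-only root drive (the drive id and path are illustrative):

```rust
use std::path::PathBuf;

let config = BlockDeviceConfigInfo {
    drive_id: "rootfs".to_owned(),
    path_on_host: PathBuf::from("/path/to/rootfs.img"),
    is_root_device: true,
    is_read_only: true,
    // Defaults: RawBlock, direct I/O, 1 queue of size 256, no rate limiting.
    ..Default::default()
};
assert_eq!(config.queue_sizes(), vec![256u16]);
```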
+
+impl ConfigItem for BlockDeviceConfigInfo {
+ type Err = BlockDeviceError;
+
+ fn id(&self) -> &str {
+ &self.drive_id
+ }
+
+ fn check_conflicts(&self, other: &Self) -> Result<(), BlockDeviceError> {
+ if self.drive_id == other.drive_id {
+ Ok(())
+ } else if self.path_on_host == other.path_on_host {
+ Err(BlockDeviceError::BlockDevicePathAlreadyExists(
+ self.path_on_host.clone(),
+ ))
+ } else {
+ Ok(())
+ }
+ }
+}
+
+impl std::fmt::Debug for BlockDeviceInfo {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ write!(f, "{:?}", self.config)
+ }
+}
+
+/// Block Device Info
+pub type BlockDeviceInfo = DeviceConfigInfo<BlockDeviceConfigInfo>;
+
+/// Wrapper for the collection that holds all the Block Devices Configs
+//#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
+#[derive(Clone)]
+pub struct BlockDeviceMgr {
+ /// A list of `BlockDeviceInfo` objects.
+    info_list: VecDeque<BlockDeviceInfo>,
+ has_root_block: bool,
+ has_part_uuid_root: bool,
+ read_only_root: bool,
+    part_uuid: Option<String>,
+ use_shared_irq: bool,
+}
+
+impl BlockDeviceMgr {
+    /// Returns a front-to-back iterator.
+    pub fn iter(&self) -> vec_deque::Iter<BlockDeviceInfo> {
+ self.info_list.iter()
+ }
+
+ /// Checks whether any of the added BlockDevice is the root.
+ pub fn has_root_block_device(&self) -> bool {
+ self.has_root_block
+ }
+
+ /// Checks whether the root device is configured using a part UUID.
+ pub fn has_part_uuid_root(&self) -> bool {
+ self.has_part_uuid_root
+ }
+
+    /// Checks whether the root device has read-only permissions.
+ pub fn is_read_only_root(&self) -> bool {
+ self.read_only_root
+ }
+
+ /// Gets the index of the device with the specified `drive_id` if it exists in the list.
+    pub fn get_index_of_drive_id(&self, id: &str) -> Option<usize> {
+ self.info_list
+ .iter()
+ .position(|info| info.config.id().eq(id))
+ }
+
+    /// Gets the `BlockDeviceConfigInfo` of the device with the specified `drive_id` if it exists in the list.
+    pub fn get_config_of_drive_id(&self, drive_id: &str) -> Option<BlockDeviceConfigInfo> {
+ match self.get_index_of_drive_id(drive_id) {
+ Some(index) => {
+ let config = self.info_list.get(index).unwrap().config.clone();
+ Some(config)
+ }
+ None => None,
+ }
+ }
+
+ /// Inserts `block_device_config` in the block device configuration list.
+ /// If an entry with the same id already exists, it will attempt to update
+ /// the existing entry.
+ /// Inserting a secondary root block device will fail.
+ pub fn insert_device(
+ device_mgr: &mut DeviceManager,
+ mut ctx: DeviceOpContext,
+ config: BlockDeviceConfigInfo,
+ ) -> std::result::Result<(), BlockDeviceError> {
+ if !cfg!(feature = "hotplug") && ctx.is_hotplug {
+ return Err(BlockDeviceError::UpdateNotAllowedPostBoot);
+ }
+
+ let mgr = &mut device_mgr.block_manager;
+
+        // If the id of the drive already exists in the list, the operation is an update.
+ match mgr.get_index_of_drive_id(config.id()) {
+ Some(index) => {
+ // No support for runtime update yet.
+ if ctx.is_hotplug {
+ Err(BlockDeviceError::BlockDevicePathAlreadyExists(
+ config.path_on_host.clone(),
+ ))
+ } else {
+ for (idx, info) in mgr.info_list.iter().enumerate() {
+ if idx != index {
+ info.config.check_conflicts(&config)?;
+ }
+ }
+ mgr.update(index, config)
+ }
+ }
+ None => {
+ for info in mgr.info_list.iter() {
+ info.config.check_conflicts(&config)?;
+ }
+ let index = mgr.create(config.clone())?;
+ if !ctx.is_hotplug {
+ return Ok(());
+ }
+
+ match config.device_type {
+ BlockDeviceType::RawBlock => {
+ let device = Self::create_blk_device(&config, &mut ctx)
+ .map_err(BlockDeviceError::Virtio)?;
+ let dev = DeviceManager::create_mmio_virtio_device(
+ device,
+ &mut ctx,
+ config.use_shared_irq.unwrap_or(mgr.use_shared_irq),
+ config.use_generic_irq.unwrap_or(USE_GENERIC_IRQ),
+ )
+ .map_err(BlockDeviceError::DeviceManager)?;
+ mgr.update_device_by_index(index, Arc::clone(&dev))?;
+                        // Live upgrade needs to save/restore the device from info.device.
+ mgr.info_list[index].set_device(dev.clone());
+ ctx.insert_hotplug_mmio_device(&dev, None).map_err(|e| {
+ let logger = ctx.logger().new(slog::o!());
+ BlockDeviceMgr::remove_device(device_mgr, ctx, &config.drive_id)
+ .unwrap();
+ error!(
+ logger,
+ "failed to hot-add virtio block device {}, {:?}",
+ &config.drive_id,
+ e
+ );
+ BlockDeviceError::DeviceManager(e)
+ })
+ }
+ _ => Err(BlockDeviceError::InvalidBlockDeviceType),
+ }
+ }
+ }
+ }
+
+ /// Attaches all block devices from the BlockDevicesConfig.
+ pub fn attach_devices(
+ &mut self,
+ ctx: &mut DeviceOpContext,
+ ) -> std::result::Result<(), BlockDeviceError> {
+ for info in self.info_list.iter_mut() {
+ match info.config.device_type {
+ BlockDeviceType::RawBlock => {
+ info!(
+ ctx.logger(),
+ "attach virtio-blk device, drive_id {}, path {}",
+ info.config.drive_id,
+ info.config.path_on_host.to_str().unwrap_or("")
+ );
+ let device = Self::create_blk_device(&info.config, ctx)
+ .map_err(BlockDeviceError::Virtio)?;
+ let device = DeviceManager::create_mmio_virtio_device(
+ device,
+ ctx,
+ info.config.use_shared_irq.unwrap_or(self.use_shared_irq),
+ info.config.use_generic_irq.unwrap_or(USE_GENERIC_IRQ),
+ )
+ .map_err(BlockDeviceError::RegisterBlockDevice)?;
+ info.device = Some(device);
+ }
+ _ => {
+ return Err(BlockDeviceError::OpenBlockDevice(
+ std::io::Error::from_raw_os_error(libc::EINVAL),
+ ));
+ }
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Removes all virtio-blk devices
+ pub fn remove_devices(&mut self, ctx: &mut DeviceOpContext) -> Result<(), DeviceMgrError> {
+ while let Some(mut info) = self.info_list.pop_back() {
+ info!(ctx.logger(), "remove drive {}", info.config.drive_id);
+ if let Some(device) = info.device.take() {
+ DeviceManager::destroy_mmio_virtio_device(device, ctx)?;
+ }
+ }
+
+ Ok(())
+ }
+
+    fn remove(&mut self, drive_id: &str) -> Option<BlockDeviceInfo> {
+ match self.get_index_of_drive_id(drive_id) {
+ Some(index) => self.info_list.remove(index),
+ None => None,
+ }
+ }
+
+    /// Remove a block device; this is essentially the inverse operation of `insert_device`.
+ pub fn remove_device(
+ dev_mgr: &mut DeviceManager,
+ mut ctx: DeviceOpContext,
+ drive_id: &str,
+ ) -> std::result::Result<(), BlockDeviceError> {
+ if !cfg!(feature = "hotplug") {
+ return Err(BlockDeviceError::UpdateNotAllowedPostBoot);
+ }
+
+ let mgr = &mut dev_mgr.block_manager;
+ match mgr.remove(drive_id) {
+ Some(mut info) => {
+ info!(ctx.logger(), "remove drive {}", info.config.drive_id);
+ if let Some(device) = info.device.take() {
+ DeviceManager::destroy_mmio_virtio_device(device, &mut ctx)
+ .map_err(BlockDeviceError::DeviceManager)?;
+ }
+ }
+ None => return Err(BlockDeviceError::InvalidDeviceId(drive_id.to_owned())),
+ }
+
+ Ok(())
+ }
+
+ fn create_blk_device(
+ cfg: &BlockDeviceConfigInfo,
+ ctx: &mut DeviceOpContext,
+    ) -> std::result::Result<Box<Block<GuestAddressSpaceImpl>>, virtio::Error> {
+ let epoll_mgr = ctx.epoll_mgr.clone().ok_or(virtio::Error::InvalidInput)?;
+
+        let mut block_files: Vec<Box<dyn Ufile>> = vec![];
+
+ match cfg.device_type {
+ BlockDeviceType::RawBlock => {
+ let custom_flags = if cfg.is_direct() {
+ info!(
+ ctx.logger(),
+ "Open block device \"{}\" in direct mode.",
+ cfg.path_on_host().display()
+ );
+ libc::O_DIRECT
+ } else {
+ info!(
+ ctx.logger(),
+ "Open block device \"{}\" in buffer mode.",
+ cfg.path_on_host().display(),
+ );
+ 0
+ };
+ let io_uring_supported = IoUring::is_supported();
+ for i in 0..cfg.num_queues {
+ let queue_size = cfg.queue_sizes()[i] as u32;
+ let file = OpenOptions::new()
+ .read(true)
+ .custom_flags(custom_flags)
+ .write(!cfg.is_read_only())
+ .open(cfg.path_on_host())?;
+ info!(ctx.logger(), "Queue {}: block file opened", i);
+
+ if io_uring_supported {
+ info!(
+ ctx.logger(),
+ "Queue {}: Using io_uring Raw disk file, queue size {}.", i, queue_size
+ );
+ let io_engine = IoUring::new(file.as_raw_fd(), queue_size)?;
+ block_files.push(Box::new(LocalFile::new(file, cfg.no_drop, io_engine)?));
+ } else {
+ info!(
+ ctx.logger(),
+ "Queue {}: Since io_uring_supported is not enabled, change to default support of Aio Raw disk file, queue size {}", i, queue_size
+ );
+ let io_engine = Aio::new(file.as_raw_fd(), queue_size)?;
+ block_files.push(Box::new(LocalFile::new(file, cfg.no_drop, io_engine)?));
+ }
+ }
+ }
+ _ => {
+ error!(
+ ctx.logger(),
+ "invalid block device type: {:?}", cfg.device_type
+ );
+ return Err(virtio::Error::InvalidInput);
+ }
+ };
+
+ let mut limiters = vec![];
+ for _i in 0..cfg.num_queues {
+ if let Some(limiter) = cfg.rate_limiter.clone().map(|mut v| {
+ v.resize(cfg.num_queues as u64);
+ v.try_into().unwrap()
+ }) {
+ limiters.push(limiter);
+ }
+ }
+
+ Ok(Box::new(Block::new(
+ block_files,
+ cfg.is_read_only,
+ Arc::new(cfg.queue_sizes()),
+ epoll_mgr,
+ limiters,
+ )?))
+ }
+
+    /// Generate guest kernel command line arguments related to the root block device.
+ pub fn generate_kernel_boot_args(
+ &self,
+ kernel_config: &mut KernelConfigInfo,
+ ) -> std::result::Result<(), DeviceMgrError> {
+        // Respect the user configuration if kernel_cmdline already contains "root=",
+        // paying special attention to the case where the command line starts with "root=xxx".
+ let old_kernel_cmdline = format!(" {}", kernel_config.kernel_cmdline().as_str());
+ if !old_kernel_cmdline.contains(" root=") && self.has_root_block {
+ let cmdline = kernel_config.kernel_cmdline_mut();
+ if let Some(ref uuid) = self.part_uuid {
+ cmdline
+ .insert("root", &format!("PART_UUID={}", uuid))
+ .map_err(DeviceMgrError::Cmdline)?;
+ } else {
+ cmdline
+ .insert("root", "/dev/vda")
+ .map_err(DeviceMgrError::Cmdline)?;
+ }
+ if self.read_only_root {
+ if old_kernel_cmdline.contains(" rw") {
+ return Err(DeviceMgrError::InvalidOperation);
+ }
+ cmdline.insert_str("ro").map_err(DeviceMgrError::Cmdline)?;
+ }
+ }
+
+ Ok(())
+ }
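
For a read-only root without a partition UUID, the net effect of this method is appending `root=/dev/vda ro` when the user supplied no `root=` of their own. A minimal sketch driving `linux_loader`'s `Cmdline` directly, assuming the `new`/`insert`/`insert_str`/`as_str` API used elsewhere in this patch:

```rust
let mut cmdline = linux_loader::cmdline::Cmdline::new(4096);
cmdline.insert_str("console=ttyS0").unwrap();
// What generate_kernel_boot_args contributes for a read-only /dev/vda root:
cmdline.insert("root", "/dev/vda").unwrap();
cmdline.insert_str("ro").unwrap();
assert_eq!(cmdline.as_str(), "console=ttyS0 root=/dev/vda ro");
```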
+
+ /// insert a block device's config. return index on success.
+ fn create(
+ &mut self,
+ block_device_config: BlockDeviceConfigInfo,
+ ) -> std::result::Result {
+ self.check_data_file_present(&block_device_config)?;
+ if self
+ .get_index_of_drive_path(&block_device_config.path_on_host)
+ .is_some()
+ {
+ return Err(BlockDeviceError::BlockDevicePathAlreadyExists(
+ block_device_config.path_on_host,
+ ));
+ }
+
+        // Check whether the device config belongs to a root device;
+        // a VMM can only have one root device.
+ if block_device_config.is_root_device {
+ if self.has_root_block {
+ return Err(BlockDeviceError::RootBlockDeviceAlreadyAdded);
+ } else {
+ self.has_root_block = true;
+ self.read_only_root = block_device_config.is_read_only;
+ self.has_part_uuid_root = block_device_config.part_uuid.is_some();
+ self.part_uuid = block_device_config.part_uuid.clone();
+ // Root Device should be the first in the list whether or not PART_UUID is specified
+ // in order to avoid bugs in case of switching from part_uuid boot scenarios to
+ // /dev/vda boot type.
+ self.info_list
+ .push_front(BlockDeviceInfo::new(block_device_config));
+ Ok(0)
+ }
+ } else {
+ self.info_list
+ .push_back(BlockDeviceInfo::new(block_device_config));
+ Ok(self.info_list.len() - 1)
+ }
+ }
+
+ /// Updates a Block Device Config. The update fails if it would result in two
+ /// root block devices.
+ fn update(
+ &mut self,
+ mut index: usize,
+ new_config: BlockDeviceConfigInfo,
+ ) -> std::result::Result<(), BlockDeviceError> {
+ // Check if the path exists
+ self.check_data_file_present(&new_config)?;
+ if let Some(idx) = self.get_index_of_drive_path(&new_config.path_on_host) {
+ if idx != index {
+ return Err(BlockDeviceError::BlockDevicePathAlreadyExists(
+ new_config.path_on_host.clone(),
+ ));
+ }
+ }
+
+ if self.info_list.get(index).is_none() {
+ return Err(InvalidDeviceId(index.to_string()));
+ }
+ // Check if the root block device is being updated.
+ if self.info_list[index].config.is_root_device {
+ self.has_root_block = new_config.is_root_device;
+ self.read_only_root = new_config.is_root_device && new_config.is_read_only;
+ self.has_part_uuid_root = new_config.part_uuid.is_some();
+ self.part_uuid = new_config.part_uuid.clone();
+ } else if new_config.is_root_device {
+ // Check if a second root block device is being added.
+ if self.has_root_block {
+ return Err(BlockDeviceError::RootBlockDeviceAlreadyAdded);
+ } else {
+ // One of the non-root blocks is becoming root.
+ self.has_root_block = true;
+ self.read_only_root = new_config.is_read_only;
+ self.has_part_uuid_root = new_config.part_uuid.is_some();
+ self.part_uuid = new_config.part_uuid.clone();
+
+ // Make sure the root device is on the first position.
+ self.info_list.swap(0, index);
+ // Block config to be updated has moved to first position.
+ index = 0;
+ }
+ }
+ // Update the config.
+ self.info_list[index].config = new_config;
+
+ Ok(())
+ }
+
+ fn check_data_file_present(
+ &self,
+ block_device_config: &BlockDeviceConfigInfo,
+ ) -> std::result::Result<(), BlockDeviceError> {
+ if block_device_config.device_type == BlockDeviceType::RawBlock
+ && !block_device_config.path_on_host.exists()
+ {
+ Err(BlockDeviceError::InvalidBlockDevicePath(
+ block_device_config.path_on_host.clone(),
+ ))
+ } else {
+ Ok(())
+ }
+ }
+
+    fn get_index_of_drive_path(&self, drive_path: &Path) -> Option<usize> {
+ self.info_list
+ .iter()
+ .position(|info| info.config.path_on_host.eq(drive_path))
+ }
+
+    /// Update device information in `info_list`. The caller of this method is
+    /// `insert_device` when hotplug is true.
+    pub fn update_device_by_index(
+        &mut self,
+        index: usize,
+        device: Arc<dyn DeviceIo>,
+ ) -> Result<(), BlockDeviceError> {
+ if let Some(info) = self.info_list.get_mut(index) {
+ info.device = Some(device);
+ return Ok(());
+ }
+
+ Err(BlockDeviceError::InvalidDeviceId("".to_owned()))
+ }
+
+ /// Update the ratelimiter settings of a virtio blk device.
+ pub fn update_device_ratelimiters(
+ device_mgr: &mut DeviceManager,
+ new_cfg: BlockDeviceConfigUpdateInfo,
+ ) -> std::result::Result<(), BlockDeviceError> {
+ let mgr = &mut device_mgr.block_manager;
+ match mgr.get_index_of_drive_id(&new_cfg.drive_id) {
+ Some(index) => {
+ let config = &mut mgr.info_list[index].config;
+ config.rate_limiter = new_cfg.rate_limiter.clone();
+ let device = mgr.info_list[index]
+ .device
+ .as_mut()
+ .ok_or_else(|| BlockDeviceError::InvalidDeviceId("".to_owned()))?;
+                if let Some(mmio_dev) = device.as_any().downcast_ref::<DbsMmioV2Device>() {
+ let guard = mmio_dev.state();
+ let inner_dev = guard.get_inner_device();
+ if let Some(blk_dev) = inner_dev
+ .as_any()
+                        .downcast_ref::<Block<GuestAddressSpaceImpl>>()
+ {
+ return blk_dev
+ .set_patch_rate_limiters(new_cfg.bytes(), new_cfg.ops())
+ .map(|_p| ())
+ .map_err(|_e| BlockDeviceError::BlockEpollHanderSendFail);
+ }
+ }
+ Ok(())
+ }
+ None => Err(BlockDeviceError::InvalidDeviceId(new_cfg.drive_id)),
+ }
+ }
+}
+
+impl Default for BlockDeviceMgr {
+    /// Constructor for the BlockDeviceMgr. It initializes an empty VecDeque.
+ fn default() -> BlockDeviceMgr {
+ BlockDeviceMgr {
+            info_list: VecDeque::<BlockDeviceInfo>::new(),
+ has_root_block: false,
+ has_part_uuid_root: false,
+ read_only_root: false,
+ part_uuid: None,
+ use_shared_irq: USE_SHARED_IRQ,
+ }
+ }
+}
diff --git a/src/dragonball/src/device_manager/console_manager.rs b/src/dragonball/src/device_manager/console_manager.rs
new file mode 100644
index 0000000000..1e3b2a2f22
--- /dev/null
+++ b/src/dragonball/src/device_manager/console_manager.rs
@@ -0,0 +1,440 @@
+// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
+// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the THIRD-PARTY file.
+
+//! Virtual machine console device manager.
+//!
+//! A virtual console is composed of two parts: a frontend in the virtual machine and a backend in
+//! the host OS. A frontend may be a serial port, virtio-console etc.; a backend may be stdio or a
+//! Unix domain socket. The manager connects the frontend with the backend.
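+//!
+//! A minimal usage sketch (hypothetical setup, for illustration only):
+//!
+//! ```ignore
+//! // Assumes an initialized EpollManager, a logger and a serial device handle.
+//! let mut con_mgr = ConsoleManager::new(epoll_mgr, &logger);
+//! // Back the guest console with the current terminal ...
+//! con_mgr.create_stdio_console(serial.clone())?;
+//! // ... or with a Unix domain socket that host tools can attach to.
+//! con_mgr.create_socket_console(serial, "/run/vm/console.sock".to_string())?;
+//! ```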
+use std::io::{self, Read};
+use std::os::unix::net::{UnixListener, UnixStream};
+use std::path::Path;
+use std::sync::{Arc, Mutex};
+
+use bytes::{BufMut, BytesMut};
+use dbs_legacy_devices::{ConsoleHandler, SerialDevice};
+use dbs_utils::epoll_manager::{
+ EpollManager, EventOps, EventSet, Events, MutEventSubscriber, SubscriberId,
+};
+use vmm_sys_util::terminal::Terminal;
+
+use super::{DeviceMgrError, Result};
+
+const EPOLL_EVENT_SERIAL: u32 = 0;
+const EPOLL_EVENT_SERIAL_DATA: u32 = 1;
+const EPOLL_EVENT_STDIN: u32 = 2;
+// Maximal backend throughput for every data transaction.
+const MAX_BACKEND_THROUGHPUT: usize = 64;
+
+/// Errors related to Console manager operations.
+#[derive(Debug, thiserror::Error)]
+pub enum ConsoleManagerError {
+ /// Cannot create unix domain socket for serial port
+ #[error("cannot create socket for serial console")]
+ CreateSerialSock(#[source] std::io::Error),
+
+ /// An operation on the epoll instance failed due to resource exhaustion or bad configuration.
+ #[error("failure while managing epoll event for console fd")]
+ EpollMgr(#[source] dbs_utils::epoll_manager::Error),
+
+ /// Cannot set mode for terminal.
+ #[error("failure while setting attribute for terminal")]
+ StdinHandle(#[source] vmm_sys_util::errno::Error),
+}
+
+enum Backend {
+ StdinHandle(std::io::Stdin),
+ SockPath(String),
+}
+
+/// Console manager to manage frontend and backend console devices.
+pub struct ConsoleManager {
+ epoll_mgr: EpollManager,
+ logger: slog::Logger,
+    subscriber_id: Option<SubscriberId>,
+    backend: Option<Backend>,
+}
+
+impl ConsoleManager {
+ /// Create a console manager instance.
+ pub fn new(epoll_mgr: EpollManager, logger: &slog::Logger) -> Self {
+ let logger = logger.new(slog::o!("subsystem" => "console_manager"));
+ ConsoleManager {
+ epoll_mgr,
+ logger,
+ subscriber_id: Default::default(),
+ backend: None,
+ }
+ }
+
+ /// Create a console backend device by using stdio streams.
+    pub fn create_stdio_console(&mut self, device: Arc<Mutex<SerialDevice>>) -> Result<()> {
+ let stdin_handle = std::io::stdin();
+ stdin_handle
+ .lock()
+ .set_raw_mode()
+ .map_err(|e| DeviceMgrError::ConsoleManager(ConsoleManagerError::StdinHandle(e)))?;
+
+ let handler = ConsoleEpollHandler::new(device, Some(stdin_handle), None, &self.logger);
+ self.subscriber_id = Some(self.epoll_mgr.add_subscriber(Box::new(handler)));
+ self.backend = Some(Backend::StdinHandle(std::io::stdin()));
+
+ Ok(())
+ }
+
+    /// Create a console backend device by using a Unix domain socket.
+    pub fn create_socket_console(
+        &mut self,
+        device: Arc<Mutex<SerialDevice>>,
+ sock_path: String,
+ ) -> Result<()> {
+ let sock_listener = Self::bind_domain_socket(&sock_path).map_err(|e| {
+ DeviceMgrError::ConsoleManager(ConsoleManagerError::CreateSerialSock(e))
+ })?;
+ let handler = ConsoleEpollHandler::new(device, None, Some(sock_listener), &self.logger);
+
+ self.subscriber_id = Some(self.epoll_mgr.add_subscriber(Box::new(handler)));
+ self.backend = Some(Backend::SockPath(sock_path));
+
+ Ok(())
+ }
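+
+    // Note: any Unix-socket capable tool on the host can attach to this console,
+    // e.g. (illustrative): socat -,raw,echo=0 UNIX-CONNECT:<sock_path>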
+
+ /// Reset the host side terminal to canonical mode.
+ pub fn reset_console(&self) -> Result<()> {
+ if let Some(Backend::StdinHandle(stdin_handle)) = self.backend.as_ref() {
+ stdin_handle
+ .lock()
+ .set_canon_mode()
+ .map_err(|e| DeviceMgrError::ConsoleManager(ConsoleManagerError::StdinHandle(e)))?;
+ }
+
+ Ok(())
+ }
+
+    fn bind_domain_socket(serial_path: &str) -> std::result::Result<UnixListener, io::Error> {
+ let path = Path::new(serial_path);
+ if path.is_file() {
+ let _ = std::fs::remove_file(serial_path);
+ }
+
+ UnixListener::bind(path)
+ }
+}
+
+struct ConsoleEpollHandler {
+    device: Arc<Mutex<SerialDevice>>,
+    stdin_handle: Option<std::io::Stdin>,
+    sock_listener: Option<UnixListener>,
+    sock_conn: Option<UnixStream>,
+ logger: slog::Logger,
+}
+
+impl ConsoleEpollHandler {
+ fn new(
+        device: Arc<Mutex<SerialDevice>>,
+        stdin_handle: Option<std::io::Stdin>,
+        sock_listener: Option<UnixListener>,
+ logger: &slog::Logger,
+ ) -> Self {
+ ConsoleEpollHandler {
+ device,
+ stdin_handle,
+ sock_listener,
+ sock_conn: None,
+ logger: logger.new(slog::o!("subsystem" => "console_manager")),
+ }
+ }
+
+ fn uds_listener_accept(&mut self, ops: &mut EventOps) -> std::io::Result<()> {
+ if self.sock_conn.is_some() {
+ slog::warn!(self.logger,
+ "UDS for serial port 1 already exists, reject the new connection";
+ "subsystem" => "console_mgr",
+ );
+            // Safe to unwrap() because self.sock_listener is Some().
+            let _ = self.sock_listener.as_mut().unwrap().accept();
+        } else {
+            // Safe to unwrap() because self.sock_listener is Some().
+ let (conn_sock, _) = self.sock_listener.as_ref().unwrap().accept()?;
+ let events = Events::with_data(&conn_sock, EPOLL_EVENT_SERIAL_DATA, EventSet::IN);
+ if let Err(e) = ops.add(events) {
+ slog::error!(self.logger,
+ "failed to register epoll event for serial, {:?}", e;
+ "subsystem" => "console_mgr",
+ );
+ return Err(std::io::Error::last_os_error());
+ }
+
+ let conn_sock_copy = conn_sock.try_clone()?;
+        // Do not expect a poisoned lock here, so it is safe to unwrap().
+ self.device
+ .lock()
+ .unwrap()
+ .set_output_stream(Some(Box::new(conn_sock_copy)));
+
+ self.sock_conn = Some(conn_sock);
+ }
+
+ Ok(())
+ }
+
+ fn uds_read_in(&mut self, ops: &mut EventOps) -> std::io::Result<()> {
+ let mut should_drop = true;
+
+ if let Some(conn_sock) = self.sock_conn.as_mut() {
+ let mut out = [0u8; MAX_BACKEND_THROUGHPUT];
+ match conn_sock.read(&mut out[..]) {
+ Ok(0) => {
+ // Zero-length read means EOF. Remove this conn sock.
+ self.device
+ .lock()
+ .expect("console: poisoned console lock")
+ .set_output_stream(None);
+ }
+ Ok(count) => {
+ self.device
+ .lock()
+ .expect("console: poisoned console lock")
+ .raw_input(&out[..count])?;
+ should_drop = false;
+ }
+ Err(e) => {
+ slog::warn!(self.logger,
+ "error while reading serial conn sock: {:?}", e;
+ "subsystem" => "console_mgr"
+ );
+ self.device
+ .lock()
+ .expect("console: poisoned console lock")
+ .set_output_stream(None);
+ }
+ }
+ }
+
+ if should_drop {
+ assert!(self.sock_conn.is_some());
+ // Safe to unwrap() because self.sock_conn is Some().
+ let sock_conn = self.sock_conn.take().unwrap();
+ let events = Events::with_data(&sock_conn, EPOLL_EVENT_SERIAL_DATA, EventSet::IN);
+ if let Err(e) = ops.remove(events) {
+ slog::error!(self.logger,
+ "failed deregister epoll event for UDS, {:?}", e;
+ "subsystem" => "console_mgr"
+ );
+ }
+ }
+
+ Ok(())
+ }
+
+ fn stdio_read_in(&mut self, ops: &mut EventOps) -> std::io::Result<()> {
+ let mut should_drop = true;
+
+ if let Some(handle) = self.stdin_handle.as_ref() {
+ let mut out = [0u8; MAX_BACKEND_THROUGHPUT];
+            // Hold the stdin lock while reading raw input.
+ let stdin_lock = handle.lock();
+ match stdin_lock.read_raw(&mut out[..]) {
+ Ok(0) => {
+ // Zero-length read indicates EOF. Remove from pollables.
+ self.device
+ .lock()
+ .expect("console: poisoned console lock")
+ .set_output_stream(None);
+ }
+ Ok(count) => {
+ self.device
+ .lock()
+ .expect("console: poisoned console lock")
+ .raw_input(&out[..count])?;
+ should_drop = false;
+ }
+ Err(e) => {
+ slog::warn!(self.logger,
+ "error while reading stdin: {:?}", e;
+ "subsystem" => "console_mgr"
+ );
+ self.device
+ .lock()
+ .expect("console: poisoned console lock")
+ .set_output_stream(None);
+ }
+ }
+ }
+
+ if should_drop {
+ let events = Events::with_data_raw(libc::STDIN_FILENO, EPOLL_EVENT_STDIN, EventSet::IN);
+ if let Err(e) = ops.remove(events) {
+ slog::error!(self.logger,
+ "failed to deregister epoll event for stdin, {:?}", e;
+ "subsystem" => "console_mgr"
+ );
+ }
+ }
+
+ Ok(())
+ }
+}
+
+impl MutEventSubscriber for ConsoleEpollHandler {
+ fn process(&mut self, events: Events, ops: &mut EventOps) {
+ slog::trace!(self.logger, "ConsoleEpollHandler::process()");
+ let slot = events.data();
+ match slot {
+ EPOLL_EVENT_SERIAL => {
+ if let Err(e) = self.uds_listener_accept(ops) {
+ slog::warn!(self.logger, "failed to accept incoming connection, {:?}", e);
+ }
+ }
+ EPOLL_EVENT_SERIAL_DATA => {
+ if let Err(e) = self.uds_read_in(ops) {
+ slog::warn!(self.logger, "failed to read data from UDS, {:?}", e);
+ }
+ }
+ EPOLL_EVENT_STDIN => {
+ if let Err(e) = self.stdio_read_in(ops) {
+ slog::warn!(self.logger, "failed to read data from stdin, {:?}", e);
+ }
+ }
+ _ => slog::error!(self.logger, "unknown epoll slot number {}", slot),
+ }
+ }
+
+ fn init(&mut self, ops: &mut EventOps) {
+ slog::trace!(self.logger, "ConsoleEpollHandler::init()");
+
+ if self.stdin_handle.is_some() {
+ slog::info!(self.logger, "ConsoleEpollHandler: stdin handler");
+ let events = Events::with_data_raw(libc::STDIN_FILENO, EPOLL_EVENT_STDIN, EventSet::IN);
+ if let Err(e) = ops.add(events) {
+ slog::error!(
+ self.logger,
+ "failed to register epoll event for stdin, {:?}",
+ e
+ );
+ }
+ }
+ if let Some(sock) = self.sock_listener.as_ref() {
+ slog::info!(self.logger, "ConsoleEpollHandler: sock listener");
+ let events = Events::with_data(sock, EPOLL_EVENT_SERIAL, EventSet::IN);
+ if let Err(e) = ops.add(events) {
+ slog::error!(
+ self.logger,
+ "failed to register epoll event for UDS listener, {:?}",
+ e
+ );
+ }
+ }
+
+ if let Some(conn) = self.sock_conn.as_ref() {
+ slog::info!(self.logger, "ConsoleEpollHandler: sock connection");
+ let events = Events::with_data(conn, EPOLL_EVENT_SERIAL_DATA, EventSet::IN);
+ if let Err(e) = ops.add(events) {
+ slog::error!(
+ self.logger,
+ "failed to register epoll event for UDS connection, {:?}",
+ e
+ );
+ }
+ }
+ }
+}
+
+/// Writer to process guest kernel dmesg.
+pub struct DmesgWriter {
+ buf: BytesMut,
+ logger: slog::Logger,
+}
+
+impl DmesgWriter {
+ /// Creates a new instance.
+ pub fn new(logger: &slog::Logger) -> Self {
+ Self {
+ buf: BytesMut::with_capacity(1024),
+ logger: logger.new(slog::o!("subsystem" => "dmesg")),
+ }
+ }
+}
+
+impl io::Write for DmesgWriter {
+ /// 0000000 [ 0 . 0 3 4 9 1 6 ] R
+ /// 5b 20 20 20 20 30 2e 30 33 34 39 31 36 5d 20 52
+ /// 0000020 u n / s b i n / i n i t a s
+ /// 75 6e 20 2f 73 62 69 6e 2f 69 6e 69 74 20 61 73
+ /// 0000040 i n i t p r o c e s s \r \n [
+ ///
+    /// dmesg messages end a line with \r\n. When redirecting messages to the logger, we
+    /// should strip the trailing \r\n.
+    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
+ let arr: Vec<&[u8]> = buf.split(|c| *c == b'\n').collect();
+ let count = arr.len();
+
+ for (i, sub) in arr.iter().enumerate() {
+ if sub.is_empty() {
+ if !self.buf.is_empty() {
+ slog::info!(
+ self.logger,
+ "{}",
+ String::from_utf8_lossy(self.buf.as_ref()).trim_end()
+ );
+ self.buf.clear();
+ }
+ } else if sub.len() < buf.len() && i < count - 1 {
+ slog::info!(
+ self.logger,
+ "{}{}",
+ String::from_utf8_lossy(self.buf.as_ref()).trim_end(),
+ String::from_utf8_lossy(sub).trim_end(),
+ );
+ self.buf.clear();
+ } else {
+ self.buf.put_slice(sub);
+ }
+ }
+
+ Ok(buf.len())
+ }
+
+ fn flush(&mut self) -> io::Result<()> {
+ Ok(())
+ }
+}
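+
+// Behavior sketch: writing b"foo\nbar" logs "foo" immediately and keeps "bar"
+// buffered until a later write supplies its terminating newline (see the unit
+// test below).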
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use slog::Drain;
+ use std::io::Write;
+
+ fn create_logger() -> slog::Logger {
+ let decorator = slog_term::TermDecorator::new().build();
+ let drain = slog_term::FullFormat::new(decorator).build().fuse();
+ let drain = slog_async::Async::new(drain).build().fuse();
+ slog::Logger::root(drain, slog::o!())
+ }
+
+ #[test]
+ fn test_dmesg_writer() {
+ let mut writer = DmesgWriter {
+ buf: Default::default(),
+ logger: create_logger(),
+ };
+
+ writer.flush().unwrap();
+ writer.write_all("".as_bytes()).unwrap();
+ writer.write_all("\n".as_bytes()).unwrap();
+ writer.write_all("\n\n".as_bytes()).unwrap();
+ writer.write_all("\n\n\n".as_bytes()).unwrap();
+ writer.write_all("12\n23\n34\n56".as_bytes()).unwrap();
+ writer.write_all("78".as_bytes()).unwrap();
+ writer.write_all("90\n".as_bytes()).unwrap();
+ writer.flush().unwrap();
+ }
+
+ // TODO: add unit tests for console manager
+}
diff --git a/src/dragonball/src/device_manager/fs_dev_mgr.rs b/src/dragonball/src/device_manager/fs_dev_mgr.rs
new file mode 100644
index 0000000000..088dc980f1
--- /dev/null
+++ b/src/dragonball/src/device_manager/fs_dev_mgr.rs
@@ -0,0 +1,528 @@
+// Copyright 2020-2022 Alibaba Cloud. All Rights Reserved.
+// Copyright 2019 Intel Corporation. All Rights Reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+use std::convert::TryInto;
+
+use dbs_utils::epoll_manager::EpollManager;
+use dbs_virtio_devices::{self as virtio, Error as VirtIoError};
+use serde_derive::{Deserialize, Serialize};
+use slog::{error, info};
+
+use crate::address_space_manager::GuestAddressSpaceImpl;
+use crate::config_manager::{
+ ConfigItem, DeviceConfigInfo, DeviceConfigInfos, RateLimiterConfigInfo,
+};
+use crate::device_manager::{
+ DbsMmioV2Device, DeviceManager, DeviceMgrError, DeviceOpContext, DeviceVirtioRegionHandler,
+};
+use crate::get_bucket_update;
+
+use super::DbsVirtioDevice;
+
+// The flag of whether to use the shared irq.
+const USE_SHARED_IRQ: bool = true;
+// The flag of whether to use the generic irq.
+const USE_GENERIC_IRQ: bool = true;
+// Default cache size is 2 Gi since this is a typical VM memory size.
+const DEFAULT_CACHE_SIZE: u64 = 2 * 1024 * 1024 * 1024;
+// We support two fs device modes: vhostuser and virtio
+const VHOSTUSER_FS_MODE: &str = "vhostuser";
+const VIRTIO_FS_MODE: &str = "virtio";
+
+/// Errors associated with `FsDeviceConfig`.
+#[derive(Debug, thiserror::Error)]
+pub enum FsDeviceError {
+ /// Invalid fs, "virtio" or "vhostuser" is allowed.
+ #[error("the fs type is invalid, virtio or vhostuser is allowed")]
+ InvalidFs,
+
+ /// Cannot access address space.
+ #[error("Cannot access address space.")]
+ AddressSpaceNotInitialized,
+
+    /// Cannot convert RateLimiterConfigInfo into RateLimiter.
+    #[error("failure while converting RateLimiterConfigInfo into RateLimiter: {0}")]
+ RateLimterConfigInfoTryInto(#[source] std::io::Error),
+
+ /// The fs device tag was already used for a different fs.
+ #[error("VirtioFs device tag {0} already exists")]
+ FsDeviceTagAlreadyExists(String),
+
+ /// The fs device path was already used for a different fs.
+ #[error("VirtioFs device tag {0} already exists")]
+ FsDevicePathAlreadyExists(String),
+
+ /// The update is not allowed after booting the microvm.
+ #[error("update operation is not allowed after boot")]
+ UpdateNotAllowedPostBoot,
+
+    /// The attach backend fs operation fails.
+    #[error("fs device failed to attach a backend fs")]
+ AttachBackendFailed(String),
+
+    /// Attaching a backend fs must be done while the vm is running.
+ #[error("vm is not running when attaching a backend fs")]
+ MicroVMNotRunning,
+
+ /// The mount tag doesn't exist.
+ #[error("fs tag'{0}' doesn't exist")]
+ TagNotExists(String),
+
+ /// Failed to send patch message to VirtioFs epoll handler.
+ #[error("could not send patch message to the VirtioFs epoll handler")]
+ VirtioFsEpollHanderSendFail,
+
+    /// Creating a shared-fs device fails (e.g. if the vhost-user socket cannot be opened).
+ #[error("cannot create shared-fs device: {0}")]
+ CreateFsDevice(#[source] VirtIoError),
+
+ /// Cannot initialize a shared-fs device or add a device to the MMIO Bus.
+ #[error("failure while registering shared-fs device: {0}")]
+ RegisterFsDevice(#[source] DeviceMgrError),
+
+ /// The device manager errors.
+ #[error("DeviceManager error: {0}")]
+ DeviceManager(#[source] DeviceMgrError),
+}
+
+/// Configuration information for a vhost-user-fs device.
+#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
+pub struct FsDeviceConfigInfo {
+ /// vhost-user socket path.
+ pub sock_path: String,
+ /// virtiofs mount tag name used inside the guest.
+ /// used as the device name during mount.
+ pub tag: String,
+ /// Number of virtqueues to use.
+ pub num_queues: usize,
+    /// Size of each virtqueue (number of descriptor entries).
+ pub queue_size: u16,
+ /// DAX cache window size
+ pub cache_size: u64,
+ /// Number of thread pool workers.
+ pub thread_pool_size: u16,
+ /// The caching policy the file system should use (auto, always or never).
+    /// This cache policy is set for virtio-fs; see https://gitlab.com/virtio-fs/virtiofsd for further information.
+ pub cache_policy: String,
+ /// Writeback cache
+ pub writeback_cache: bool,
+ /// Enable no_open or not
+ pub no_open: bool,
+ /// Enable xattr or not
+ pub xattr: bool,
+ /// Drop CAP_SYS_RESOURCE or not
+ pub drop_sys_resource: bool,
+ /// virtio fs or vhostuser fs.
+ pub mode: String,
+ /// Enable kill_priv_v2 or not
+ pub fuse_killpriv_v2: bool,
+ /// Enable no_readdir or not
+ pub no_readdir: bool,
+    /// Rate Limiter for I/O operations.
+    pub rate_limiter: Option<RateLimiterConfigInfo>,
+    /// Use shared irq
+    pub use_shared_irq: Option<bool>,
+    /// Use generic irq
+    pub use_generic_irq: Option<bool>,
+}
+
+impl std::default::Default for FsDeviceConfigInfo {
+ fn default() -> Self {
+ Self {
+ sock_path: String::default(),
+ tag: String::default(),
+ num_queues: 1,
+ queue_size: 1024,
+ cache_size: DEFAULT_CACHE_SIZE,
+ thread_pool_size: 0,
+ cache_policy: Self::default_cache_policy(),
+ writeback_cache: Self::default_writeback_cache(),
+ no_open: Self::default_no_open(),
+ fuse_killpriv_v2: Self::default_fuse_killpriv_v2(),
+ no_readdir: Self::default_no_readdir(),
+ xattr: Self::default_xattr(),
+ drop_sys_resource: Self::default_drop_sys_resource(),
+ mode: Self::default_fs_mode(),
+ rate_limiter: Some(RateLimiterConfigInfo::default()),
+ use_shared_irq: None,
+ use_generic_irq: None,
+ }
+ }
+}
+
+impl FsDeviceConfigInfo {
+ /// The default mode is set to 'virtio' for 'virtio-fs' device.
+ pub fn default_fs_mode() -> String {
+ String::from(VIRTIO_FS_MODE)
+ }
+
+ /// The default cache policy
+ pub fn default_cache_policy() -> String {
+ "always".to_string()
+ }
+
+ /// The default setting of writeback cache
+ pub fn default_writeback_cache() -> bool {
+ true
+ }
+
+ /// The default setting of no_open
+ pub fn default_no_open() -> bool {
+ true
+ }
+
+ /// The default setting of killpriv_v2
+ pub fn default_fuse_killpriv_v2() -> bool {
+ false
+ }
+
+ /// The default setting of xattr
+ pub fn default_xattr() -> bool {
+ false
+ }
+
+ /// The default setting of drop_sys_resource
+ pub fn default_drop_sys_resource() -> bool {
+ false
+ }
+
+ /// The default setting of no_readdir
+ pub fn default_no_readdir() -> bool {
+ false
+ }
+
+ /// The default setting of rate limiter
+    pub fn default_fs_rate_limiter() -> Option<RateLimiterConfigInfo> {
+ None
+ }
+}
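+
+// Construction sketch (illustrative values): given the defaults above, a typical
+// virtio-fs device only needs a mount tag, e.g.
+//
+//     let cfg = FsDeviceConfigInfo {
+//         tag: "shared".to_string(),
+//         ..Default::default()
+//     };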
+
+/// Configuration information for virtio-fs.
+#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
+pub struct FsDeviceConfigUpdateInfo {
+ /// virtiofs mount tag name used inside the guest.
+ /// used as the device name during mount.
+ pub tag: String,
+ /// Rate Limiter for I/O operations.
+    pub rate_limiter: Option<RateLimiterConfigInfo>,
+}
+
+impl FsDeviceConfigUpdateInfo {
+ /// Provides a `BucketUpdate` description for the bandwidth rate limiter.
+ pub fn bytes(&self) -> dbs_utils::rate_limiter::BucketUpdate {
+ get_bucket_update!(self, rate_limiter, bandwidth)
+ }
+ /// Provides a `BucketUpdate` description for the ops rate limiter.
+ pub fn ops(&self) -> dbs_utils::rate_limiter::BucketUpdate {
+ get_bucket_update!(self, rate_limiter, ops)
+ }
+}
+
+impl ConfigItem for FsDeviceConfigInfo {
+ type Err = FsDeviceError;
+
+ fn id(&self) -> &str {
+ &self.tag
+ }
+
+ fn check_conflicts(&self, other: &Self) -> Result<(), FsDeviceError> {
+ if self.tag == other.tag {
+ Err(FsDeviceError::FsDeviceTagAlreadyExists(self.tag.clone()))
+ } else if self.mode.as_str() == VHOSTUSER_FS_MODE && self.sock_path == other.sock_path {
+ Err(FsDeviceError::FsDevicePathAlreadyExists(
+ self.sock_path.clone(),
+ ))
+ } else {
+ Ok(())
+ }
+ }
+}
+
+/// Configuration information for manipulating the backend fs of a virtiofs device.
+#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
+pub struct FsMountConfigInfo {
+    /// Mount operations: mount, update or umount
+    pub ops: String,
+    /// The backend fs type to mount.
+    pub fstype: Option<String>,
+    /// the source file/directory the backend fs points to
+    pub source: Option<String>,
+    /// where the backend fs gets mounted
+    pub mountpoint: String,
+    /// backend fs config content in json format
+    pub config: Option<String>,
+    /// virtiofs mount tag name used inside the guest.
+    /// used as the device name during mount.
+    pub tag: String,
+    /// Path to a file that contains the list of files to be prefetched by rafs
+    pub prefetch_list_path: Option<String>,
+    /// Threshold size (in KiB) that decides whether a file is DAX-mapped
+    pub dax_threshold_size_kb: Option<u64>,
+}
+
+pub(crate) type FsDeviceInfo = DeviceConfigInfo<FsDeviceConfigInfo>;
+
+impl ConfigItem for FsDeviceInfo {
+ type Err = FsDeviceError;
+ fn id(&self) -> &str {
+ &self.config.tag
+ }
+
+ fn check_conflicts(&self, other: &Self) -> Result<(), FsDeviceError> {
+ if self.config.tag == other.config.tag {
+ Err(FsDeviceError::FsDeviceTagAlreadyExists(
+ self.config.tag.clone(),
+ ))
+ } else if self.config.sock_path == other.config.sock_path {
+ Err(FsDeviceError::FsDevicePathAlreadyExists(
+ self.config.sock_path.clone(),
+ ))
+ } else {
+ Ok(())
+ }
+ }
+}
+
+/// Wrapper for the collection that holds all the Fs Devices Configs
+pub struct FsDeviceMgr {
+    /// A list of `FsDeviceConfigInfo` objects.
+    pub(crate) info_list: DeviceConfigInfos<FsDeviceConfigInfo>,
+ pub(crate) use_shared_irq: bool,
+}
+
+impl FsDeviceMgr {
+ /// Inserts `fs_cfg` in the shared-fs device configuration list.
+ pub fn insert_device(
+ device_mgr: &mut DeviceManager,
+ ctx: DeviceOpContext,
+ fs_cfg: FsDeviceConfigInfo,
+ ) -> std::result::Result<(), FsDeviceError> {
+        // It's too complicated to manage the life cycle of the shared-fs service process for hotplug.
+ if ctx.is_hotplug {
+ error!(
+ ctx.logger(),
+ "no support of shared-fs device hotplug";
+ "subsystem" => "shared-fs",
+ "tag" => &fs_cfg.tag,
+ );
+ return Err(FsDeviceError::UpdateNotAllowedPostBoot);
+ }
+
+ info!(
+ ctx.logger(),
+ "add shared-fs device configuration";
+ "subsystem" => "shared-fs",
+ "tag" => &fs_cfg.tag,
+ );
+ device_mgr
+ .fs_manager
+ .lock()
+ .unwrap()
+ .info_list
+ .insert_or_update(&fs_cfg)?;
+
+ Ok(())
+ }
+
+    /// Attaches all shared-fs devices from the configuration list.
+ pub fn attach_devices(
+ &mut self,
+ ctx: &mut DeviceOpContext,
+ ) -> std::result::Result<(), FsDeviceError> {
+ let epoll_mgr = ctx
+ .epoll_mgr
+ .clone()
+ .ok_or(FsDeviceError::CreateFsDevice(virtio::Error::InvalidInput))?;
+
+ for info in self.info_list.iter_mut() {
+ let device = Self::create_fs_device(&info.config, ctx, epoll_mgr.clone())?;
+ let mmio_device = DeviceManager::create_mmio_virtio_device(
+ device,
+ ctx,
+ info.config.use_shared_irq.unwrap_or(self.use_shared_irq),
+ info.config.use_generic_irq.unwrap_or(USE_GENERIC_IRQ),
+ )
+ .map_err(FsDeviceError::RegisterFsDevice)?;
+
+ info.set_device(mmio_device);
+ }
+
+ Ok(())
+ }
+
+ fn create_fs_device(
+ config: &FsDeviceConfigInfo,
+ ctx: &mut DeviceOpContext,
+ epoll_mgr: EpollManager,
+    ) -> std::result::Result<DbsVirtioDevice, FsDeviceError> {
+ match &config.mode as &str {
+ VIRTIO_FS_MODE => Self::attach_virtio_fs_devices(config, ctx, epoll_mgr),
+ _ => Err(FsDeviceError::CreateFsDevice(virtio::Error::InvalidInput)),
+ }
+ }
+
+ fn attach_virtio_fs_devices(
+ config: &FsDeviceConfigInfo,
+ ctx: &mut DeviceOpContext,
+ epoll_mgr: EpollManager,
+    ) -> std::result::Result<DbsVirtioDevice, FsDeviceError> {
+ info!(
+ ctx.logger(),
+ "add virtio-fs device configuration";
+ "subsystem" => "virito-fs",
+ "tag" => &config.tag,
+ "dax_window_size" => &config.cache_size,
+ );
+
+ let limiter = if let Some(rlc) = config.rate_limiter.clone() {
+ Some(
+ rlc.try_into()
+ .map_err(FsDeviceError::RateLimterConfigInfoTryInto)?,
+ )
+ } else {
+ None
+ };
+
+ let vm_as = ctx.get_vm_as().map_err(|e| {
+ error!(ctx.logger(), "virtio-fs get vm_as error: {:?}", e;
+ "subsystem" => "virito-fs");
+ FsDeviceError::DeviceManager(e)
+ })?;
+ let address_space = match ctx.address_space.as_ref() {
+ Some(address_space) => address_space.clone(),
+ None => {
+ error!(ctx.logger(), "virtio-fs get address_space error"; "subsystem" => "virito-fs");
+ return Err(FsDeviceError::AddressSpaceNotInitialized);
+ }
+ };
+ let handler = DeviceVirtioRegionHandler {
+ vm_as,
+ address_space,
+ };
+
+ let device = Box::new(
+ virtio::fs::VirtioFs::new(
+ &config.tag,
+ config.num_queues,
+ config.queue_size,
+ config.cache_size,
+ &config.cache_policy,
+ config.thread_pool_size,
+ config.writeback_cache,
+ config.no_open,
+ config.fuse_killpriv_v2,
+ config.xattr,
+ config.drop_sys_resource,
+ config.no_readdir,
+ Box::new(handler),
+ epoll_mgr,
+ limiter,
+ )
+ .map_err(FsDeviceError::CreateFsDevice)?,
+ );
+
+ Ok(device)
+ }
+
+ /// Attach a backend fs to a VirtioFs device or detach a backend
+    /// fs from a VirtioFs device.
+ pub fn manipulate_backend_fs(
+ device_mgr: &mut DeviceManager,
+ config: FsMountConfigInfo,
+ ) -> std::result::Result<(), FsDeviceError> {
+ let mut found = false;
+
+ let mgr = &mut device_mgr.fs_manager.lock().unwrap();
+ for info in mgr
+ .info_list
+ .iter()
+ .filter(|info| info.config.tag.as_str() == config.tag.as_str())
+ {
+ found = true;
+ if let Some(device) = info.device.as_ref() {
+                if let Some(mmio_dev) = device.as_any().downcast_ref::<DbsMmioV2Device>() {
+ let mut guard = mmio_dev.state();
+ let inner_dev = guard.get_inner_device_mut();
+ if let Some(virtio_fs_dev) = inner_dev
+ .as_any_mut()
+                        .downcast_mut::<virtio::fs::VirtioFs<GuestAddressSpaceImpl>>()
+ {
+ return virtio_fs_dev
+ .manipulate_backend_fs(
+ config.source,
+ config.fstype,
+ &config.mountpoint,
+ config.config,
+ &config.ops,
+ config.prefetch_list_path,
+ config.dax_threshold_size_kb,
+ )
+ .map(|_p| ())
+ .map_err(|e| FsDeviceError::AttachBackendFailed(e.to_string()));
+ }
+ }
+ }
+ }
+ if !found {
+ Err(FsDeviceError::AttachBackendFailed(
+ "fs tag not found".to_string(),
+ ))
+ } else {
+ Ok(())
+ }
+ }
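+
+    // Request sketch (illustrative values): `ops` selects the operation ("mount",
+    // "update" or "umount"), `tag` must match an existing virtio-fs device, and
+    // `mountpoint` names the attach point; the remaining fields are optional
+    // backend parameters.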
+
+ /// Gets the index of the device with the specified `tag` if it exists in the list.
+    pub fn get_index_of_tag(&self, tag: &str) -> Option<usize> {
+ self.info_list
+ .iter()
+ .position(|info| info.config.id().eq(tag))
+ }
+
+ /// Update the ratelimiter settings of a virtio fs device.
+ pub fn update_device_ratelimiters(
+ device_mgr: &mut DeviceManager,
+ new_cfg: FsDeviceConfigUpdateInfo,
+ ) -> std::result::Result<(), FsDeviceError> {
+ let mgr = &mut device_mgr.fs_manager.lock().unwrap();
+ match mgr.get_index_of_tag(&new_cfg.tag) {
+ Some(index) => {
+ let config = &mut mgr.info_list[index].config;
+ config.rate_limiter = new_cfg.rate_limiter.clone();
+ let device = mgr.info_list[index]
+ .device
+ .as_mut()
+ .ok_or_else(|| FsDeviceError::TagNotExists("".to_owned()))?;
+
+                if let Some(mmio_dev) = device.as_any().downcast_ref::<DbsMmioV2Device>() {
+ let guard = mmio_dev.state();
+ let inner_dev = guard.get_inner_device();
+ if let Some(fs_dev) = inner_dev
+ .as_any()
+                        .downcast_ref::<virtio::fs::VirtioFs<GuestAddressSpaceImpl>>()
+ {
+ return fs_dev
+ .set_patch_rate_limiters(new_cfg.bytes(), new_cfg.ops())
+ .map(|_p| ())
+ .map_err(|_e| FsDeviceError::VirtioFsEpollHanderSendFail);
+ }
+ }
+ Ok(())
+ }
+ None => Err(FsDeviceError::TagNotExists(new_cfg.tag)),
+ }
+ }
+}
+
+impl Default for FsDeviceMgr {
+    /// Create a new `FsDeviceMgr` object.
+ fn default() -> Self {
+ FsDeviceMgr {
+ info_list: DeviceConfigInfos::new(),
+ use_shared_irq: USE_SHARED_IRQ,
+ }
+ }
+}
diff --git a/src/dragonball/src/device_manager/legacy.rs b/src/dragonball/src/device_manager/legacy.rs
new file mode 100644
index 0000000000..50a47cab73
--- /dev/null
+++ b/src/dragonball/src/device_manager/legacy.rs
@@ -0,0 +1,246 @@
+// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
+// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the THIRD-PARTY file.
+
+//! Device Manager for Legacy Devices.
+
+use std::io;
+use std::sync::{Arc, Mutex};
+
+use dbs_device::device_manager::Error as IoManagerError;
+#[cfg(target_arch = "aarch64")]
+use dbs_legacy_devices::RTCDevice;
+use dbs_legacy_devices::SerialDevice;
+use vmm_sys_util::eventfd::EventFd;
+
+// The I8042 Data Port (IO Port 0x60) is used for reading data that was received from an
+// I8042 device or from the I8042 controller itself, and for writing data to an I8042
+// device or to the I8042 controller itself.
+const I8042_DATA_PORT: u16 = 0x60;
+
+/// Errors generated by legacy device manager.
+#[derive(Debug, thiserror::Error)]
+pub enum Error {
+ /// Cannot add legacy device to Bus.
+ #[error("bus failure while managing legacy device")]
+ BusError(#[source] IoManagerError),
+
+ /// Cannot create EventFd.
+ #[error("failure while reading EventFd file descriptor")]
+ EventFd(#[source] io::Error),
+
+ /// Failed to register/deregister interrupt.
+ #[error("failure while managing interrupt for legacy device")]
+ IrqManager(#[source] vmm_sys_util::errno::Error),
+}
+
+/// The `LegacyDeviceManager` is a wrapper that is used for registering legacy devices
+/// on an I/O Bus.
+///
+/// It currently manages the uart and i8042 devices. The `LegacyDeviceManager` should be initialized
+/// only by using the constructor.
+pub struct LegacyDeviceManager {
+ #[cfg(target_arch = "x86_64")]
+ i8042_reset_eventfd: EventFd,
+ #[cfg(target_arch = "aarch64")]
+    pub(crate) _rtc_device: Arc<Mutex<RTCDevice>>,
+    #[cfg(target_arch = "aarch64")]
+    _rtc_eventfd: EventFd,
+    pub(crate) com1_device: Arc<Mutex<SerialDevice>>,
+    _com1_eventfd: EventFd,
+    pub(crate) com2_device: Arc<Mutex<SerialDevice>>,
+ _com2_eventfd: EventFd,
+}
+
+impl LegacyDeviceManager {
+ /// Get the serial device for com1.
+    pub fn get_com1_serial(&self) -> Arc<Mutex<SerialDevice>> {
+ self.com1_device.clone()
+ }
+
+    /// Get the serial device for com2.
+    pub fn get_com2_serial(&self) -> Arc<Mutex<SerialDevice>> {
+ self.com2_device.clone()
+ }
+}
+
+#[cfg(target_arch = "x86_64")]
+pub(crate) mod x86_64 {
+ use super::*;
+ use dbs_device::device_manager::IoManager;
+ use dbs_device::resources::Resource;
+ use dbs_legacy_devices::{EventFdTrigger, I8042Device, I8042DeviceMetrics};
+ use kvm_ioctls::VmFd;
+
+ pub(crate) const COM1_IRQ: u32 = 4;
+ pub(crate) const COM1_PORT1: u16 = 0x3f8;
+ pub(crate) const COM2_IRQ: u32 = 3;
+ pub(crate) const COM2_PORT1: u16 = 0x2f8;
+
+    type Result<T> = ::std::result::Result<T, Error>;
+
+ impl LegacyDeviceManager {
+ /// Create a LegacyDeviceManager instance handling legacy devices (uart, i8042).
+        pub fn create_manager(bus: &mut IoManager, vm_fd: Option<Arc<VmFd>>) -> Result<Self> {
+ let (com1_device, com1_eventfd) =
+ Self::create_com_device(bus, vm_fd.as_ref(), COM1_IRQ, COM1_PORT1)?;
+ let (com2_device, com2_eventfd) =
+ Self::create_com_device(bus, vm_fd.as_ref(), COM2_IRQ, COM2_PORT1)?;
+
+ let exit_evt = EventFd::new(libc::EFD_NONBLOCK).map_err(Error::EventFd)?;
+ let i8042_device = Arc::new(Mutex::new(I8042Device::new(
+ EventFdTrigger::new(exit_evt.try_clone().map_err(Error::EventFd)?),
+ Arc::new(I8042DeviceMetrics::default()),
+ )));
+ let resources = [Resource::PioAddressRange {
+ // 0x60 and 0x64 are the io ports that i8042 devices used.
+ // We register pio address range from 0x60 - 0x64 with base I8042_DATA_PORT for i8042 to use.
+ base: I8042_DATA_PORT,
+ size: 0x5,
+ }];
+ bus.register_device_io(i8042_device, &resources)
+ .map_err(Error::BusError)?;
+
+ Ok(LegacyDeviceManager {
+ i8042_reset_eventfd: exit_evt,
+ com1_device,
+ _com1_eventfd: com1_eventfd,
+ com2_device,
+ _com2_eventfd: com2_eventfd,
+ })
+ }
+
+ /// Get the eventfd for exit notification.
+        pub fn get_reset_eventfd(&self) -> Result<EventFd> {
+ self.i8042_reset_eventfd.try_clone().map_err(Error::EventFd)
+ }
+
+ fn create_com_device(
+ bus: &mut IoManager,
+            vm_fd: Option<&Arc<VmFd>>,
+ irq: u32,
+ port_base: u16,
+        ) -> Result<(Arc<Mutex<SerialDevice>>, EventFd)> {
+ let eventfd = EventFd::new(libc::EFD_NONBLOCK).map_err(Error::EventFd)?;
+ let device = Arc::new(Mutex::new(SerialDevice::new(
+ eventfd.try_clone().map_err(Error::EventFd)?,
+ )));
+            // port_base defines the base port address for the COM devices.
+            // Since every COM device has 8 data registers, we register the pio address range with size 0x8.
+ let resources = [Resource::PioAddressRange {
+ base: port_base,
+ size: 0x8,
+ }];
+ bus.register_device_io(device.clone(), &resources)
+ .map_err(Error::BusError)?;
+
+ if let Some(fd) = vm_fd {
+ fd.register_irqfd(&eventfd, irq)
+ .map_err(Error::IrqManager)?;
+ }
+
+ Ok((device, eventfd))
+ }
+ }
+}
+
+#[cfg(target_arch = "aarch64")]
+pub(crate) mod aarch64 {
+ use super::*;
+ use dbs_device::device_manager::IoManager;
+ use dbs_device::resources::DeviceResources;
+ use kvm_ioctls::VmFd;
+ use std::collections::HashMap;
+
+    type Result<T> = ::std::result::Result<T, Error>;
+
+ /// LegacyDeviceType: com1
+ pub const COM1: &str = "com1";
+ /// LegacyDeviceType: com2
+ pub const COM2: &str = "com2";
+ /// LegacyDeviceType: rtc
+ pub const RTC: &str = "rtc";
+
+ impl LegacyDeviceManager {
+ /// Create a LegacyDeviceManager instance handling legacy devices.
+ pub fn create_manager(
+ bus: &mut IoManager,
+            vm_fd: Option<Arc<VmFd>>,
+            resources: &HashMap<String, DeviceResources>,
+        ) -> Result<Self> {
+ let (com1_device, com1_eventfd) =
+ Self::create_com_device(bus, vm_fd.as_ref(), resources.get(COM1).unwrap())?;
+ let (com2_device, com2_eventfd) =
+ Self::create_com_device(bus, vm_fd.as_ref(), resources.get(COM2).unwrap())?;
+ let (rtc_device, rtc_eventfd) =
+ Self::create_rtc_device(bus, vm_fd.as_ref(), resources.get(RTC).unwrap())?;
+
+ Ok(LegacyDeviceManager {
+ _rtc_device: rtc_device,
+ _rtc_eventfd: rtc_eventfd,
+ com1_device,
+ _com1_eventfd: com1_eventfd,
+ com2_device,
+ _com2_eventfd: com2_eventfd,
+ })
+ }
+
+ fn create_com_device(
+ bus: &mut IoManager,
+            vm_fd: Option<&Arc<VmFd>>,
+            resources: &DeviceResources,
+        ) -> Result<(Arc<Mutex<SerialDevice>>, EventFd)> {
+ let eventfd = EventFd::new(libc::EFD_NONBLOCK).map_err(Error::EventFd)?;
+ let device = Arc::new(Mutex::new(SerialDevice::new(
+ eventfd.try_clone().map_err(Error::EventFd)?,
+ )));
+
+ bus.register_device_io(device.clone(), resources.get_all_resources())
+ .map_err(Error::BusError)?;
+
+ if let Some(fd) = vm_fd {
+ let irq = resources.get_legacy_irq().unwrap();
+ fd.register_irqfd(&eventfd, irq)
+ .map_err(Error::IrqManager)?;
+ }
+
+ Ok((device, eventfd))
+ }
+
+ fn create_rtc_device(
+ bus: &mut IoManager,
+            vm_fd: Option<&Arc<VmFd>>,
+            resources: &DeviceResources,
+        ) -> Result<(Arc<Mutex<RTCDevice>>, EventFd)> {
+ let eventfd = EventFd::new(libc::EFD_NONBLOCK).map_err(Error::EventFd)?;
+ let device = Arc::new(Mutex::new(RTCDevice::new()));
+
+ bus.register_device_io(device.clone(), resources.get_all_resources())
+ .map_err(Error::BusError)?;
+
+ if let Some(fd) = vm_fd {
+ let irq = resources.get_legacy_irq().unwrap();
+ fd.register_irqfd(&eventfd, irq)
+ .map_err(Error::IrqManager)?;
+ }
+
+ Ok((device, eventfd))
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ #[cfg(target_arch = "x86_64")]
+ use super::*;
+
+ #[test]
+ #[cfg(target_arch = "x86_64")]
+ fn test_create_legacy_device_manager() {
+ let mut bus = dbs_device::device_manager::IoManager::new();
+ let mgr = LegacyDeviceManager::create_manager(&mut bus, None).unwrap();
+ let _exit_fd = mgr.get_reset_eventfd().unwrap();
+ }
+}
diff --git a/src/dragonball/src/device_manager/memory_region_handler.rs b/src/dragonball/src/device_manager/memory_region_handler.rs
new file mode 100644
index 0000000000..2be149ef97
--- /dev/null
+++ b/src/dragonball/src/device_manager/memory_region_handler.rs
@@ -0,0 +1,110 @@
+// Copyright 2022 Alibaba, Inc. or its affiliates. All Rights Reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+use std::io;
+use std::sync::Arc;
+
+use dbs_address_space::{AddressSpace, AddressSpaceRegion, AddressSpaceRegionType};
+use dbs_virtio_devices::{Error as VirtIoError, VirtioRegionHandler};
+use log::{debug, error};
+use vm_memory::{FileOffset, GuestAddressSpace, GuestMemoryRegion, GuestRegionMmap};
+
+use crate::address_space_manager::GuestAddressSpaceImpl;
+
+/// This struct implements the VirtioRegionHandler trait, which inserts the memory
+/// region of the virtio device into vm_as and address_space.
+///
+/// * After a region is inserted into the vm_as, the virtio device can read guest memory
+/// data using vm_as.get_slice with a GuestAddress.
+///
+/// * Insert virtio memory into address_space so that the correct guest last address can
+/// be found when initializing the e820 table. The e820 table describes the guest memory
+/// layout and is prepared before guest startup, so we need to configure the correct
+/// guest memory address and length in it. The virtio device memory belongs to the MMIO
+/// space and does not belong to the Guest Memory space, therefore it cannot be
+/// configured into the e820 table. When creating the AddressSpaceRegion we use the
+/// AddressSpaceRegionType::ReservedMemory type; this way address_space knows that this
+/// region is special memory and will not put it into the e820 table.
+///
+/// This handler relies on the atomic-guest-memory feature. Without that feature enabled,
+/// memory regions cannot be inserted into vm_as, because the insert_region interface of
+/// vm_as does not insert regions in place but returns an array of the inserted regions.
+/// We need to manually replace that array of regions in vm_as, and that is what the
+/// atomic-guest-memory feature provides. So we rely on it here.
+pub struct DeviceVirtioRegionHandler {
+ pub(crate) vm_as: GuestAddressSpaceImpl,
+ pub(crate) address_space: AddressSpace,
+}
+
+impl DeviceVirtioRegionHandler {
+ fn insert_address_space(
+ &mut self,
+ region: Arc,
+ ) -> std::result::Result<(), VirtIoError> {
+ let file_offset = match region.file_offset() {
+ // TODO: use from_arc
+ Some(f) => Some(FileOffset::new(f.file().try_clone()?, 0)),
+ None => None,
+ };
+
+ let as_region = Arc::new(AddressSpaceRegion::build(
+ AddressSpaceRegionType::DAXMemory,
+ region.start_addr(),
+ region.size() as u64,
+ None,
+ file_offset,
+ region.flags(),
+ false,
+ ));
+
+ self.address_space.insert_region(as_region).map_err(|e| {
+ error!("inserting address apace error: {}", e);
+ // dbs-virtio-devices should not depend on dbs-address-space.
+ // So here io::Error is used instead of AddressSpaceError directly.
+ VirtIoError::IOError(io::Error::new(
+ io::ErrorKind::Other,
+ format!(
+ "invalid address space region ({0:#x}, {1:#x})",
+ region.start_addr().0,
+ region.len()
+ ),
+ ))
+ })?;
+ Ok(())
+ }
+
+ fn insert_vm_as(
+ &mut self,
+ region: Arc,
+ ) -> std::result::Result<(), VirtIoError> {
+ let vm_as_new = self.vm_as.memory().insert_region(region).map_err(|e| {
+ error!(
+ "DeviceVirtioRegionHandler failed to insert guest memory region: {:?}.",
+ e
+ );
+ VirtIoError::InsertMmap(e)
+ })?;
+ // Do not expect poisoned lock here, so safe to unwrap().
+ self.vm_as.lock().unwrap().replace(vm_as_new);
+
+ Ok(())
+ }
+}
+
+impl VirtioRegionHandler for DeviceVirtioRegionHandler {
+ fn insert_region(
+ &mut self,
+ region: Arc,
+ ) -> std::result::Result<(), VirtIoError> {
+ debug!(
+ "add geust memory region to address_space/vm_as, new region: {:?}",
+ region
+ );
+
+ self.insert_address_space(region.clone())?;
+ self.insert_vm_as(region)?;
+
+ Ok(())
+ }
+}
diff --git a/src/dragonball/src/device_manager/mod.rs b/src/dragonball/src/device_manager/mod.rs
new file mode 100644
index 0000000000..43c237d4ce
--- /dev/null
+++ b/src/dragonball/src/device_manager/mod.rs
@@ -0,0 +1,1003 @@
+// Copyright (C) 2022 Alibaba Cloud. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Device manager to manage IO devices for a virtual machine.
+
+#[cfg(target_arch = "aarch64")]
+use std::collections::HashMap;
+
+use std::io;
+use std::sync::{Arc, Mutex, MutexGuard};
+
+use arc_swap::ArcSwap;
+use dbs_address_space::AddressSpace;
+#[cfg(target_arch = "aarch64")]
+use dbs_arch::{DeviceType, MMIODeviceInfo};
+use dbs_device::device_manager::{Error as IoManagerError, IoManager, IoManagerContext};
+#[cfg(target_arch = "aarch64")]
+use dbs_device::resources::DeviceResources;
+use dbs_device::resources::Resource;
+use dbs_device::DeviceIo;
+use dbs_interrupt::KvmIrqManager;
+use dbs_legacy_devices::ConsoleHandler;
+use dbs_utils::epoll_manager::EpollManager;
+use kvm_ioctls::VmFd;
+
+#[cfg(feature = "dbs-virtio-devices")]
+use dbs_device::resources::ResourceConstraint;
+#[cfg(feature = "dbs-virtio-devices")]
+use dbs_virtio_devices as virtio;
+#[cfg(feature = "dbs-virtio-devices")]
+use dbs_virtio_devices::{
+ mmio::{
+ MmioV2Device, DRAGONBALL_FEATURE_INTR_USED, DRAGONBALL_FEATURE_PER_QUEUE_NOTIFY,
+ DRAGONBALL_MMIO_DOORBELL_SIZE, MMIO_DEFAULT_CFG_SIZE,
+ },
+ VirtioDevice,
+};
+
+#[cfg(all(feature = "hotplug", feature = "dbs-upcall"))]
+use dbs_upcall::{
+ DevMgrRequest, DevMgrService, MmioDevRequest, UpcallClient, UpcallClientError,
+ UpcallClientRequest, UpcallClientResponse,
+};
+#[cfg(feature = "hotplug")]
+use dbs_virtio_devices::vsock::backend::VsockInnerConnector;
+
+use crate::address_space_manager::GuestAddressSpaceImpl;
+use crate::error::StartMicroVmError;
+use crate::resource_manager::ResourceManager;
+use crate::vm::{KernelConfigInfo, Vm};
+use crate::IoManagerCached;
+
+/// Virtual machine console device manager.
+pub mod console_manager;
+/// Console manager for the virtual machine's console devices.
+pub use self::console_manager::ConsoleManager;
+
+mod legacy;
+pub use self::legacy::{Error as LegacyDeviceError, LegacyDeviceManager};
+
+#[cfg(target_arch = "aarch64")]
+pub use self::legacy::aarch64::{COM1, COM2, RTC};
+
+#[cfg(feature = "virtio-vsock")]
+/// Device manager for user-space vsock devices.
+pub mod vsock_dev_mgr;
+#[cfg(feature = "virtio-vsock")]
+use self::vsock_dev_mgr::VsockDeviceMgr;
+
+#[cfg(feature = "virtio-blk")]
+/// virtio-block device manager
+pub mod blk_dev_mgr;
+#[cfg(feature = "virtio-blk")]
+use self::blk_dev_mgr::BlockDeviceMgr;
+
+#[cfg(feature = "virtio-net")]
+/// Device manager for virtio-net devices.
+pub mod virtio_net_dev_mgr;
+#[cfg(feature = "virtio-net")]
+use self::virtio_net_dev_mgr::VirtioNetDeviceMgr;
+
+#[cfg(feature = "virtio-fs")]
+/// virtio-fs device manager
+pub mod fs_dev_mgr;
+#[cfg(feature = "virtio-fs")]
+use self::fs_dev_mgr::FsDeviceMgr;
+#[cfg(feature = "virtio-fs")]
+mod memory_region_handler;
+#[cfg(feature = "virtio-fs")]
+pub use self::memory_region_handler::*;
+
+macro_rules! info(
+ ($l:expr, $($args:tt)+) => {
+ slog::info!($l, $($args)+; slog::o!("subsystem" => "device_manager"))
+ };
+);
+
+/// Errors related to device manager operations.
+#[derive(Debug, thiserror::Error)]
+pub enum DeviceMgrError {
+ /// Invalid operation.
+ #[error("invalid device manager operation")]
+ InvalidOperation,
+
+ /// Failed to get device resource.
+ #[error("failed to get device assigned resources")]
+ GetDeviceResource,
+
+ /// Appending to kernel command line failed.
+ #[error("failed to add kernel command line parameter for device: {0}")]
+ Cmdline(#[source] linux_loader::cmdline::Error),
+
+ /// Failed to manage console devices.
+ #[error(transparent)]
+ ConsoleManager(console_manager::ConsoleManagerError),
+
+ /// Failed to create the device.
+ #[error("failed to create virtual device: {0}")]
+ CreateDevice(#[source] io::Error),
+
+ /// Failed to perform an operation on the bus.
+ #[error(transparent)]
+ IoManager(IoManagerError),
+
+ /// Failure from legacy device manager.
+ #[error(transparent)]
+ LegacyManager(legacy::Error),
+
+ #[cfg(feature = "dbs-virtio-devices")]
+ /// Error from Virtio subsystem.
+ #[error(transparent)]
+ Virtio(virtio::Error),
+
+ #[cfg(all(feature = "hotplug", feature = "dbs-upcall"))]
+ /// Failed to hotplug the device.
+ #[error("failed to hotplug virtual device")]
+ HotplugDevice(#[source] UpcallClientError),
+
+ /// Failed to free device resource.
+ #[error("failed to free device resources: {0}")]
+ ResourceError(#[source] crate::resource_manager::ResourceError),
+}
+
+/// Specialized version of `std::result::Result` for device manager operations.
+pub type Result = ::std::result::Result;
+
+/// Type of the dragonball virtio devices.
+#[cfg(feature = "dbs-virtio-devices")]
+pub type DbsVirtioDevice = Box<
+    dyn VirtioDevice<GuestAddressSpaceImpl, virtio_queue::QueueStateSync, vm_memory::GuestRegionMmap>,
+>;
+
+/// Type of the dragonball virtio mmio devices.
+#[cfg(feature = "dbs-virtio-devices")]
+pub type DbsMmioV2Device =
+    MmioV2Device<GuestAddressSpaceImpl, virtio_queue::QueueStateSync, vm_memory::GuestRegionMmap>;
+
+/// Struct to support transactional operations for device management.
+pub struct DeviceManagerTx {
+ io_manager: IoManager,
+    _io_lock: Arc<Mutex<()>>,
+ _guard: MutexGuard<'static, ()>,
+}
+
+impl DeviceManagerTx {
+ fn new(mgr_ctx: &DeviceManagerContext) -> Self {
+ // Do not expect poisoned lock.
+ let guard = mgr_ctx.io_lock.lock().unwrap();
+
+        // It's really a heavy burden to carry a lifetime parameter on MutexGuard.
+        // So we play a trick here: we hold a reference to the Arc<Mutex<()>> and transmute
+        // the MutexGuard<'a, ()> to MutexGuard<'static, ()>.
+        // It's safe because we hold a reference to the Mutex lock.
+        let guard =
+            unsafe { std::mem::transmute::<MutexGuard<'_, ()>, MutexGuard<'static, ()>>(guard) };
+
+ DeviceManagerTx {
+ io_manager: mgr_ctx.io_manager.load().as_ref().clone(),
+ _io_lock: mgr_ctx.io_lock.clone(),
+ _guard: guard,
+ }
+ }
+}
+
+/// Operation context for device management.
+#[derive(Clone)]
+pub struct DeviceManagerContext {
+    io_manager: Arc<ArcSwap<IoManager>>,
+    io_lock: Arc<Mutex<()>>,
+}
+
+impl DeviceManagerContext {
+ /// Create a DeviceManagerContext object.
+    pub fn new(io_manager: Arc<ArcSwap<IoManager>>, io_lock: Arc<Mutex<()>>) -> Self {
+ DeviceManagerContext {
+ io_manager,
+ io_lock,
+ }
+ }
+}
+
+impl IoManagerContext for DeviceManagerContext {
+ type Context = DeviceManagerTx;
+
+ fn begin_tx(&self) -> Self::Context {
+ DeviceManagerTx::new(self)
+ }
+
+ fn commit_tx(&self, context: Self::Context) {
+ self.io_manager.store(Arc::new(context.io_manager));
+ }
+
+ fn cancel_tx(&self, context: Self::Context) {
+ drop(context);
+ }
+
+ fn register_device_io(
+ &self,
+ ctx: &mut Self::Context,
+        device: Arc<dyn DeviceIo>,
+ resources: &[Resource],
+ ) -> std::result::Result<(), dbs_device::device_manager::Error> {
+ ctx.io_manager.register_device_io(device, resources)
+ }
+
+ fn unregister_device_io(
+ &self,
+ ctx: &mut Self::Context,
+ resources: &[Resource],
+ ) -> std::result::Result<(), dbs_device::device_manager::Error> {
+ ctx.io_manager.unregister_device_io(resources)
+ }
+}
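+
+// Transactional usage sketch: begin_tx() clones the live IoManager while holding
+// io_lock; the caller mutates that clone, then either commit_tx() publishes it
+// atomically through the ArcSwap or cancel_tx() drops it. create_legacy_devices()
+// below follows this pattern.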
+
+/// Context for device addition/removal operations.
+pub struct DeviceOpContext {
+    epoll_mgr: Option<EpollManager>,
+    io_context: DeviceManagerContext,
+    irq_manager: Arc<KvmIrqManager>,
+    res_manager: Arc<ResourceManager>,
+    vm_fd: Arc<VmFd>,
+    vm_as: Option<GuestAddressSpaceImpl>,
+    address_space: Option<AddressSpace>,
+ logger: slog::Logger,
+ is_hotplug: bool,
+
+ #[cfg(all(feature = "hotplug", feature = "dbs-upcall"))]
+    upcall_client: Option<Arc<UpcallClient<DevMgrService>>>,
+ #[cfg(feature = "dbs-virtio-devices")]
+    virtio_devices: Vec<Arc<DbsMmioV2Device>>,
+}
+
+impl DeviceOpContext {
+ pub(crate) fn new(
+        epoll_mgr: Option<EpollManager>,
+        device_mgr: &DeviceManager,
+        vm_as: Option<GuestAddressSpaceImpl>,
+        address_space: Option<AddressSpace>,
+ is_hotplug: bool,
+ ) -> Self {
+ let irq_manager = device_mgr.irq_manager.clone();
+ let res_manager = device_mgr.res_manager.clone();
+
+ let vm_fd = device_mgr.vm_fd.clone();
+ let io_context = DeviceManagerContext {
+ io_manager: device_mgr.io_manager.clone(),
+ io_lock: device_mgr.io_lock.clone(),
+ };
+ let logger = device_mgr.logger.new(slog::o!());
+
+ DeviceOpContext {
+ epoll_mgr,
+ io_context,
+ irq_manager,
+ res_manager,
+ vm_fd,
+ vm_as,
+ address_space,
+ logger,
+ is_hotplug,
+ #[cfg(all(feature = "hotplug", feature = "dbs-upcall"))]
+ upcall_client: None,
+ #[cfg(feature = "dbs-virtio-devices")]
+ virtio_devices: Vec::new(),
+ }
+ }
+
+    pub(crate) fn create_boot_ctx(vm: &Vm, epoll_mgr: Option<EpollManager>) -> Self {
+ Self::new(epoll_mgr, vm.device_manager(), None, None, false)
+ }
+
+    pub(crate) fn get_vm_as(&self) -> Result<GuestAddressSpaceImpl> {
+ match self.vm_as.as_ref() {
+ Some(v) => Ok(v.clone()),
+ None => Err(DeviceMgrError::InvalidOperation),
+ }
+ }
+
+ pub(crate) fn logger(&self) -> &slog::Logger {
+ &self.logger
+ }
+
+ #[allow(unused_variables)]
+ fn generate_kernel_boot_args(&mut self, kernel_config: &mut KernelConfigInfo) -> Result<()> {
+ if self.is_hotplug {
+ return Err(DeviceMgrError::InvalidOperation);
+ }
+
+ #[cfg(feature = "dbs-virtio-devices")]
+ {
+ let cmdline = kernel_config.kernel_cmdline_mut();
+
+ for device in self.virtio_devices.iter() {
+ let (mmio_base, mmio_size, irq) = DeviceManager::get_virtio_device_info(device)?;
+
+                // As per the kernel documentation, "[virtio_mmio.]device=<size>@<baseaddr>:<irq>"
+                // needs to be appended to the kernel command line for virtio mmio devices to get
+                // recognized. The size parameter has to be transformed to KiB, so we divide the
+                // hexadecimal value in bytes by 1024; the '{}' Rust formatting construct will
+                // automatically render it in decimal.
+ cmdline
+ .insert(
+ "virtio_mmio.device",
+ &format!("{}K@0x{:08x}:{}", mmio_size / 1024, mmio_base, irq),
+ )
+ .map_err(DeviceMgrError::Cmdline)?;
+ }
+ }
+
+ Ok(())
+ }
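+
+    // For example (illustrative values): a device with a 4 KiB MMIO window at
+    // 0xd0000000 on IRQ 5 yields "virtio_mmio.device=4K@0xd0000000:5".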
+
+ #[cfg(target_arch = "aarch64")]
+    fn generate_virtio_device_info(&self) -> Result<HashMap<(DeviceType, String), MMIODeviceInfo>> {
+ let mut dev_info = HashMap::new();
+ #[cfg(feature = "dbs-virtio-devices")]
+ for (_index, device) in self.virtio_devices.iter().enumerate() {
+ let (mmio_base, mmio_size, irq) = DeviceManager::get_virtio_mmio_device_info(device)?;
+ let dev_type;
+ let device_id;
+            if let Some(mmiov2_device) = device.as_any().downcast_ref::<DbsMmioV2Device>() {
+ dev_type = mmiov2_device.get_device_type();
+ device_id = None;
+ } else {
+ return Err(DeviceMgrError::InvalidOperation);
+ }
+ dev_info.insert(
+ (
+ DeviceType::Virtio(dev_type),
+ format!("virtio-{}@0x{:08x?}", dev_type, mmio_base),
+ ),
+ MMIODeviceInfo::new(mmio_base, mmio_size, vec![irq], device_id),
+ );
+ }
+ Ok(dev_info)
+ }
+}
+
+#[cfg(all(feature = "hotplug", not(feature = "dbs-upcall")))]
+impl DeviceOpContext {
+ pub(crate) fn insert_hotplug_mmio_device(
+ &self,
+        _dev: &Arc<DbsMmioV2Device>,
+ _callback: Option<()>,
+ ) -> Result<()> {
+ Err(DeviceMgrError::InvalidOperation)
+ }
+
+ pub(crate) fn remove_hotplug_mmio_device(
+ &self,
+        _dev: &Arc<DbsMmioV2Device>,
+ _callback: Option<()>,
+ ) -> Result<()> {
+ Err(DeviceMgrError::InvalidOperation)
+ }
+}
+
+#[cfg(all(feature = "hotplug", feature = "dbs-upcall"))]
+impl DeviceOpContext {
+    pub(crate) fn create_hotplug_ctx(vm: &Vm, epoll_mgr: Option<EpollManager>) -> Self {
+ let vm_as = vm.vm_as().expect("VM should have memory ready").clone();
+
+ let mut ctx = Self::new(
+ epoll_mgr,
+ vm.device_manager(),
+ Some(vm_as),
+ vm.vm_address_space().cloned(),
+ true,
+ );
+ ctx.upcall_client = vm.upcall_client().clone();
+ ctx
+ }
+
+ fn call_hotplug_device(
+ &self,
+ req: DevMgrRequest,
+        callback: Option<Box<dyn Fn(UpcallClientResponse) + Send>>,
+ ) -> Result<()> {
+ if let Some(upcall_client) = self.upcall_client.as_ref() {
+ if let Some(cb) = callback {
+ upcall_client
+ .send_request(UpcallClientRequest::DevMgr(req), cb)
+ .map_err(DeviceMgrError::HotplugDevice)?;
+ } else {
+ upcall_client
+ .send_request_without_result(UpcallClientRequest::DevMgr(req))
+ .map_err(DeviceMgrError::HotplugDevice)?;
+ }
+ Ok(())
+ } else {
+ Err(DeviceMgrError::InvalidOperation)
+ }
+ }
+
+ pub(crate) fn insert_hotplug_mmio_device(
+ &self,
+        dev: &Arc<DbsMmioV2Device>,
+        callback: Option<Box<dyn Fn(UpcallClientResponse) + Send>>,
+ ) -> Result<()> {
+ if !self.is_hotplug {
+ return Err(DeviceMgrError::InvalidOperation);
+ }
+
+ let (mmio_base, mmio_size, mmio_irq) = DeviceManager::get_virtio_device_info(dev)?;
+ let req = DevMgrRequest::AddMmioDev(MmioDevRequest {
+ mmio_base,
+ mmio_size,
+ mmio_irq,
+ });
+
+ self.call_hotplug_device(req, callback)
+ }
+
+ pub(crate) fn remove_hotplug_mmio_device(
+ &self,
+        dev: &Arc<DbsMmioV2Device>,
+        callback: Option<Box<dyn Fn(UpcallClientResponse) + Send>>,
+ ) -> Result<()> {
+ if !self.is_hotplug {
+ return Err(DeviceMgrError::InvalidOperation);
+ }
+ let (mmio_base, mmio_size, mmio_irq) = DeviceManager::get_virtio_device_info(dev)?;
+ let req = DevMgrRequest::DelMmioDev(MmioDevRequest {
+ mmio_base,
+ mmio_size,
+ mmio_irq,
+ });
+
+ self.call_hotplug_device(req, callback)
+ }
+}
+
+#[cfg(all(feature = "hotplug", feature = "acpi"))]
+impl DeviceOpContext {
+ // TODO: We will implement this when we develop ACPI virtualization
+}
+
+/// Device manager for virtual machines, which manages all device for a virtual machine.
+pub struct DeviceManager {
+    io_manager: Arc<ArcSwap<IoManager>>,
+    io_lock: Arc<Mutex<()>>,
+    irq_manager: Arc<KvmIrqManager>,
+    res_manager: Arc<ResourceManager>,
+    vm_fd: Arc<VmFd>,
+ pub(crate) logger: slog::Logger,
+
+ pub(crate) con_manager: ConsoleManager,
+ pub(crate) legacy_manager: Option,
+ #[cfg(target_arch = "aarch64")]
+ pub(crate) mmio_device_info: HashMap<(DeviceType, String), MMIODeviceInfo>,
+ #[cfg(feature = "virtio-vsock")]
+ pub(crate) vsock_manager: VsockDeviceMgr,
+
+ #[cfg(feature = "virtio-blk")]
+ // If there is a Root Block Device, this should be added as the first element of the list.
+ // This is necessary because we want the root to always be mounted on /dev/vda.
+ pub(crate) block_manager: BlockDeviceMgr,
+
+ #[cfg(feature = "virtio-net")]
+ pub(crate) virtio_net_manager: VirtioNetDeviceMgr,
+
+ #[cfg(feature = "virtio-fs")]
+ fs_manager: Arc<Mutex<FsDeviceMgr>>,
+}
+
+impl DeviceManager {
+ /// Create a new device manager instance.
+ pub fn new(
+ vm_fd: Arc<VmFd>,
+ res_manager: Arc<ResourceManager>,
+ epoll_manager: EpollManager,
+ logger: &slog::Logger,
+ ) -> Self {
+ DeviceManager {
+ io_manager: Arc::new(ArcSwap::new(Arc::new(IoManager::new()))),
+ io_lock: Arc::new(Mutex::new(())),
+ irq_manager: Arc::new(KvmIrqManager::new(vm_fd.clone())),
+ res_manager,
+ vm_fd,
+ logger: logger.new(slog::o!()),
+
+ con_manager: ConsoleManager::new(epoll_manager, logger),
+ legacy_manager: None,
+ #[cfg(target_arch = "aarch64")]
+ mmio_device_info: HashMap::new(),
+ #[cfg(feature = "virtio-vsock")]
+ vsock_manager: VsockDeviceMgr::default(),
+ #[cfg(feature = "virtio-blk")]
+ block_manager: BlockDeviceMgr::default(),
+ #[cfg(feature = "virtio-net")]
+ virtio_net_manager: VirtioNetDeviceMgr::default(),
+ #[cfg(feature = "virtio-fs")]
+ fs_manager: Arc::new(Mutex::new(FsDeviceMgr::default())),
+ }
+ }
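+
+ // Construction sketch (editorial; `vm_fd`, `res_mgr`, `epoll_mgr` and `logger`
+ // are assumed to come from VM setup):
+ //
+ //     let mut dm = DeviceManager::new(vm_fd, res_mgr, epoll_mgr, &logger);
+ //     dm.create_interrupt_manager()?; // required before creating devices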
+
+ /// Get the underlying IoManager to dispatch IO read/write requests.
+ pub fn io_manager(&self) -> IoManagerCached {
+ IoManagerCached::new(self.io_manager.clone())
+ }
+
+ /// Create the underlying interrupt manager for the device manager.
+ pub fn create_interrupt_manager(&mut self) -> Result<()> {
+ self.irq_manager
+ .initialize()
+ .map_err(DeviceMgrError::CreateDevice)
+ }
+
+ /// Get the underlying logger.
+ pub fn logger(&self) -> &slog::Logger {
+ &self.logger
+ }
+
+ /// Create legacy devices associated with the virtual machine.
+ #[allow(unused_variables)]
+ pub fn create_legacy_devices(
+ &mut self,
+ ctx: &mut DeviceOpContext,
+ ) -> std::result::Result<(), StartMicroVmError> {
+ #[cfg(any(
+ target_arch = "x86_64",
+ all(target_arch = "aarch64", feature = "dbs-virtio-devices")
+ ))]
+ {
+ let mut tx = ctx.io_context.begin_tx();
+ let legacy_manager;
+
+ #[cfg(target_arch = "x86_64")]
+ {
+ legacy_manager = LegacyDeviceManager::create_manager(
+ &mut tx.io_manager,
+ Some(self.vm_fd.clone()),
+ );
+ }
+
+ #[cfg(target_arch = "aarch64")]
+ #[cfg(feature = "dbs-virtio-devices")]
+ {
+ let resources = self.get_legacy_resources()?;
+ legacy_manager = LegacyDeviceManager::create_manager(
+ &mut tx.io_manager,
+ Some(self.vm_fd.clone()),
+ &resources,
+ );
+ }
+
+ match legacy_manager {
+ Ok(v) => {
+ self.legacy_manager = Some(v);
+ ctx.io_context.commit_tx(tx);
+ }
+ Err(e) => {
+ ctx.io_context.cancel_tx(tx);
+ return Err(StartMicroVmError::LegacyDevice(e));
+ }
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Init legacy devices with a logger stream in the associated virtual machine.
+ pub fn init_legacy_devices(
+ &mut self,
+ dmesg_fifo: Option<Box<dyn io::Write + Send>>,
+ com1_sock_path: Option<String>,
+ _ctx: &mut DeviceOpContext,
+ ) -> std::result::Result<(), StartMicroVmError> {
+ // Connect serial ports to the console and dmesg_fifo.
+ self.set_guest_kernel_log_stream(dmesg_fifo)
+ .map_err(|_| StartMicroVmError::EventFd)?;
+
+ info!(self.logger, "init console path: {:?}", com1_sock_path);
+ if let Some(path) = com1_sock_path {
+ if let Some(legacy_manager) = self.legacy_manager.as_ref() {
+ let com1 = legacy_manager.get_com1_serial();
+ self.con_manager
+ .create_socket_console(com1, path)
+ .map_err(StartMicroVmError::DeviceManager)?;
+ }
+ } else if let Some(legacy_manager) = self.legacy_manager.as_ref() {
+ let com1 = legacy_manager.get_com1_serial();
+ self.con_manager
+ .create_stdio_console(com1)
+ .map_err(StartMicroVmError::DeviceManager)?;
+ }
+
+ Ok(())
+ }
+
+ /// Set the stream for guest kernel log.
+ ///
+ /// Note: com2 is used for guest kernel logging.
+ /// TODO: check whether it works with aarch64.
+ pub fn set_guest_kernel_log_stream(
+ &self,
+ stream: Option<Box<dyn io::Write + Send>>,
+ ) -> std::result::Result<(), io::Error> {
+ if let Some(legacy) = self.legacy_manager.as_ref() {
+ legacy
+ .get_com2_serial()
+ .lock()
+ .unwrap()
+ .set_output_stream(stream);
+ }
+ Ok(())
+ }
+
+ /// Reset the console into canonical mode.
+ pub fn reset_console(&self) -> Result<()> {
+ self.con_manager.reset_console()
+ }
+
+ /// Create all registered devices when booting the associated virtual machine.
+ pub fn create_devices(
+ &mut self,
+ vm_as: GuestAddressSpaceImpl,
+ epoll_mgr: EpollManager,
+ kernel_config: &mut KernelConfigInfo,
+ com1_sock_path: Option<String>,
+ dmesg_fifo: Option<Box<dyn io::Write + Send>>,
+ address_space: Option<&AddressSpace>,
+ ) -> std::result::Result<(), StartMicroVmError> {
+ let mut ctx = DeviceOpContext::new(
+ Some(epoll_mgr),
+ self,
+ Some(vm_as),
+ address_space.cloned(),
+ false,
+ );
+
+ self.create_legacy_devices(&mut ctx)?;
+ self.init_legacy_devices(dmesg_fifo, com1_sock_path, &mut ctx)?;
+
+ #[cfg(feature = "virtio-blk")]
+ self.block_manager
+ .attach_devices(&mut ctx)
+ .map_err(StartMicroVmError::BlockDeviceError)?;
+
+ #[cfg(feature = "virtio-fs")]
+ {
+ let mut fs_manager = self.fs_manager.lock().unwrap();
+ fs_manager
+ .attach_devices(&mut ctx)
+ .map_err(StartMicroVmError::FsDeviceError)?;
+ }
+
+ #[cfg(feature = "virtio-net")]
+ self.virtio_net_manager
+ .attach_devices(&mut ctx)
+ .map_err(StartMicroVmError::VirtioNetDeviceError)?;
+
+ #[cfg(feature = "virtio-vsock")]
+ self.vsock_manager.attach_devices(&mut ctx)?;
+
+ #[cfg(feature = "virtio-blk")]
+ self.block_manager
+ .generate_kernel_boot_args(kernel_config)
+ .map_err(StartMicroVmError::DeviceManager)?;
+ ctx.generate_kernel_boot_args(kernel_config)
+ .map_err(StartMicroVmError::DeviceManager)?;
+
+ #[cfg(target_arch = "aarch64")]
+ {
+ let dev_info = ctx
+ .generate_virtio_device_info()
+ .map_err(StartMicroVmError::DeviceManager)?;
+ self.mmio_device_info.extend(dev_info);
+ }
+
+ Ok(())
+ }
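+
+ // Ordering note (editorial): legacy devices come first so an early console is
+ // available, virtio devices are attached next, and kernel boot args are
+ // generated last because virtio-blk may contribute root-device parameters to
+ // the command line.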
+
+ /// Start all registered devices when booting the associated virtual machine.
+ pub fn start_devices(&mut self) -> std::result::Result<(), StartMicroVmError> {
+ // TODO: add vfio support here. issue #4589.
+ Ok(())
+ }
+
+ /// Remove all devices when shutting down the associated virtual machine.
+ pub fn remove_devices(
+ &mut self,
+ vm_as: GuestAddressSpaceImpl,
+ epoll_mgr: EpollManager,
+ address_space: Option<&AddressSpace>,
+ ) -> Result<()> {
+ // create context for removing devices
+ let mut ctx = DeviceOpContext::new(
+ Some(epoll_mgr),
+ self,
+ Some(vm_as),
+ address_space.cloned(),
+ true,
+ );
+
+ #[cfg(feature = "virtio-blk")]
+ self.block_manager.remove_devices(&mut ctx)?;
+ Ok(())
+ }
+}
+
+#[cfg(target_arch = "x86_64")]
+impl DeviceManager {
+ /// Get the underlying eventfd for vm exit notification.
+ pub fn get_reset_eventfd(&self) -> Result<EventFd> {
+ if let Some(legacy) = self.legacy_manager.as_ref() {
+ legacy
+ .get_reset_eventfd()
+ .map_err(DeviceMgrError::LegacyManager)
+ } else {
+ Err(DeviceMgrError::LegacyManager(legacy::Error::EventFd(
+ io::Error::from_raw_os_error(libc::ENOENT),
+ )))
+ }
+ }
+}
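+
+// Editorial note: the reset eventfd comes from the legacy device manager and is
+// signalled when the guest requests a reset; the VMM event loop is expected to
+// poll it and initiate teardown of the microVM (inferred from its use for "vm
+// exit notification" above, hedged accordingly).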
+
+#[cfg(target_arch = "aarch64")]
+impl DeviceManager {
+ /// Return mmio device info for FDT build.
+ pub fn get_mmio_device_info(&self) -> Option<&HashMap<(DeviceType, String), MMIODeviceInfo>> {
+ Some(&self.mmio_device_info)
+ }
+
+ #[cfg(feature = "dbs-virtio-devices")]
+ fn get_legacy_resources(
+ &mut self,
+ ) -> std::result::Result<HashMap<String, DeviceResources>, StartMicroVmError> {
+ let mut resources = HashMap::new();
+ let legacy_devices = vec![
+ (DeviceType::Serial, String::from(COM1)),
+ (DeviceType::Serial, String::from(COM2)),
+ (DeviceType::RTC, String::from(RTC)),
+ ];
+
+ for (device_type, device_id) in legacy_devices {
+ let res = self.allocate_mmio_device_resource()?;
+ self.add_mmio_device_info(&res, device_type, device_id.clone(), None);
+ resources.insert(device_id.clone(), res);
+ }
+
+ Ok(resources)
+ }
+
+ fn mmio_device_info_to_resources(
+ &self,
+ key: &(DeviceType, String),
+ ) -> std::result::Result<DeviceResources, StartMicroVmError> {
+ self.mmio_device_info
+ .get(key)
+ .map(|info| {
+ let mut resources = DeviceResources::new();
+ resources.append(Resource::LegacyIrq(info.irqs[0]));
+ resources.append(Resource::MmioAddressRange {
+ base: info.base,
+ size: info.size,
+ });
+ resources
+ })
+ .ok_or(StartMicroVmError::DeviceManager(
+ DeviceMgrError::GetDeviceResource,
+ ))
+ }
+
+ #[cfg(feature = "dbs-virtio-devices")]
+ fn allocate_mmio_device_resource(
+ &self,
+ ) -> std::result::Result<DeviceResources, StartMicroVmError> {
+ let mut requests = Vec::new();
+ requests.push(ResourceConstraint::MmioAddress {
+ range: None,
+ align: MMIO_DEFAULT_CFG_SIZE,
+ size: MMIO_DEFAULT_CFG_SIZE,
+ });
+ requests.push(ResourceConstraint::LegacyIrq { irq: None });
+
+ self.res_manager
+ .allocate_device_resources(&requests, false)
+ .map_err(StartMicroVmError::AllocateResource)
+ }
+
+ fn add_mmio_device_info(
+ &mut self,
+ resource: &DeviceResources,
+ device_type: DeviceType,
+ device_id: String,
+ msi_device_id: Option<u32>,
+ ) {
+ let (base, size) = resource.get_mmio_address_ranges()[0];
+ let irq = resource.get_legacy_irq().unwrap();
+ self.mmio_device_info.insert(
+ (device_type, device_id),
+ MMIODeviceInfo::new(base, size, vec![irq], msi_device_id),
+ );
+ }
+
+ #[cfg(feature = "dbs-virtio-devices")]
+ fn get_virtio_mmio_device_info(device: &Arc<dyn DeviceIo>) -> Result<(u64, u64, u32)> {
+ let resources = device.get_assigned_resources();
+ let irq = resources
+ .get_legacy_irq()
+ .ok_or(DeviceMgrError::GetDeviceResource)?;
+
+ if let Some(mmio_dev) = device.as_any().downcast_ref::<DbsMmioV2Device>() {
+ if let Resource::MmioAddressRange { base, size } = mmio_dev.get_mmio_cfg_res() {
+ return Ok((base, size, irq));
+ }
+ }
+
+ Err(DeviceMgrError::GetDeviceResource)
+ }
+}
+
+#[cfg(feature = "dbs-virtio-devices")]
+impl DeviceManager {
+ fn get_virtio_device_info(device: &Arc<DbsMmioV2Device>) -> Result<(u64, u64, u32)> {
+ let resources = device.get_assigned_resources();
+ let irq = resources
+ .get_legacy_irq()
+ .ok_or(DeviceMgrError::GetDeviceResource)?;
+ let mmio_address_range = device.get_trapped_io_resources().get_mmio_address_ranges();
+
+ // Assume the first MMIO region is the virtio configuration region.
+ // Virtio-fs needs to pay attention to this assumption.
+ if let Some(range) = mmio_address_range.into_iter().next() {
+ Ok((range.0, range.1, irq))
+ } else {
+ Err(DeviceMgrError::GetDeviceResource)
+ }
+ }
+
+ /// Create a Virtio MMIO transport layer device for the virtio backend device.
+ pub fn create_mmio_virtio_device(
+ device: DbsVirtioDevice,
+ ctx: &mut DeviceOpContext,
+ use_shared_irq: bool,
+ use_generic_irq: bool,
+ ) -> std::result::Result<Arc<DbsMmioV2Device>, DeviceMgrError> {
+ let features = DRAGONBALL_FEATURE_INTR_USED | DRAGONBALL_FEATURE_PER_QUEUE_NOTIFY;
+ DeviceManager::create_mmio_virtio_device_with_features(
+ device,
+ ctx,
+ Some(features),
+ use_shared_irq,
+ use_generic_irq,
+ )
+ }
+
+ /// Create a Virtio MMIO transport layer device for the virtio backend device with the
+ /// specified features.
+ pub fn create_mmio_virtio_device_with_features(
+ device: DbsVirtioDevice,
+ ctx: &mut DeviceOpContext,
+ features: Option<u32>,
+ use_shared_irq: bool,
+ use_generic_irq: bool,
+ ) -> std::result::Result<Arc<DbsMmioV2Device>, DeviceMgrError> {
+ // Every emulated Virtio MMIO device needs a 4K configuration space,
+ // and another 4K space for per queue notification.
+ const MMIO_ADDRESS_DEFAULT: ResourceConstraint = ResourceConstraint::MmioAddress {
+ range: None,
+ align: 0,
+ size: MMIO_DEFAULT_CFG_SIZE + DRAGONBALL_MMIO_DOORBELL_SIZE,
+ };
+ let mut requests = vec![MMIO_ADDRESS_DEFAULT];
+ device.get_resource_requirements(&mut requests, use_generic_irq);
+ let resources = ctx
+ .res_manager
+ .allocate_device_resources(&requests, use_shared_irq)
+ .map_err(|_| DeviceMgrError::GetDeviceResource)?;
+
+ let virtio_dev = match MmioV2Device::new(
+ ctx.vm_fd.clone(),
+ ctx.get_vm_as()?,
+ ctx.irq_manager.clone(),
+ device,
+ resources,
+ features,
+ ) {
+ Ok(d) => d,
+ Err(e) => return Err(DeviceMgrError::Virtio(e)),
+ };
+
+ Self::register_mmio_virtio_device(Arc::new(virtio_dev), ctx)
+ }
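+
+ // Worked layout example (editorial): with the 4K configuration space and 4K
+ // doorbell region described above, a device allocated at base B traps
+ // [B, B + 0x1000) for virtio-mmio configuration and uses
+ // [B + 0x1000, B + 0x2000) for per-queue notification.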
+
+ /// Tear down the Virtio MMIO transport layer device associated with the virtio backend device.
+ pub fn destroy_mmio_virtio_device(
+ device: Arc<dyn DeviceIo>,
+ ctx: &mut DeviceOpContext,
+ ) -> std::result::Result<(), DeviceMgrError> {
+ Self::destroy_mmio_device(device.clone(), ctx)?;
+
+ let mmio_dev = device
+ .as_any()
+ .downcast_ref::<DbsMmioV2Device>()
+ .ok_or(DeviceMgrError::InvalidOperation)?;
+
+ mmio_dev.remove();
+
+ Ok(())
+ }
+
+ fn destroy_mmio_device(
+ device: Arc<dyn DeviceIo>,
+ ctx: &mut DeviceOpContext,
+ ) -> std::result::Result<(), DeviceMgrError> {
+ // unregister IoManager
+ Self::deregister_mmio_virtio_device(&device, ctx)?;
+
+ // unregister Resource manager
+ let resources = device.get_assigned_resources();
+ ctx.res_manager
+ .free_device_resources(&resources)
+ .map_err(DeviceMgrError::ResourceError)?;
+
+ Ok(())
+ }
+
+ /// Register a Virtio MMIO transport layer device to the IO manager of the virtual machine.
+ pub fn register_mmio_virtio_device(
+ device: Arc<DbsMmioV2Device>,
+ ctx: &mut DeviceOpContext,
+ ) -> std::result::Result<Arc<DbsMmioV2Device>, DeviceMgrError> {
+ let (mmio_base, mmio_size, irq) = Self::get_virtio_device_info(&device)?;
+ info!(
+ ctx.logger(),
+ "create virtio mmio device 0x{:x}@0x{:x}, irq: 0x{:x}", mmio_size, mmio_base, irq
+ );
+ let resources = device.get_trapped_io_resources();
+
+ let mut tx = ctx.io_context.begin_tx();
+ if let Err(e) = ctx
+ .io_context
+ .register_device_io(&mut tx, device.clone(), &resources)
+ {
+ ctx.io_context.cancel_tx(tx);
+ Err(DeviceMgrError::IoManager(e))
+ } else {
+ ctx.virtio_devices.push(device.clone());
+ ctx.io_context.commit_tx(tx);
+ Ok(device)
+ }
+ }
+
+ /// Deregister a Virtio MMIO device from IoManager
+ pub fn deregister_mmio_virtio_device(
+ device: &Arc<dyn DeviceIo>,
+ ctx: &mut DeviceOpContext,
+ ) -> std::result::Result<(), DeviceMgrError> {
+ let resources = device.get_trapped_io_resources();
+ info!(
+ ctx.logger(),
+ "unregister mmio virtio device: {:?}", resources
+ );
+ let mut tx = ctx.io_context.begin_tx();
+ if let Err(e) = ctx.io_context.unregister_device_io(&mut tx, &resources) {
+ ctx.io_context.cancel_tx(tx);
+ Err(DeviceMgrError::IoManager(e))
+ } else {
+ ctx.io_context.commit_tx(tx);
+ Ok(())
+ }
+ }
+}
+
+#[cfg(feature = "hotplug")]
+impl DeviceManager {
+ /// Get the default inner connector for communicating with the vsock device.
+ pub fn get_vsock_inner_connector(&mut self) -> Option<VsockInnerConnector> {
+ #[cfg(feature = "virtio-vsock")]
+ {
+ self.vsock_manager
+ .get_default_connector()
+ .map(|d| Some(d))
+ .unwrap_or(None)
+ }
+ #[cfg(not(feature = "virtio-vsock"))]
+ {
+ return None;
+ }
+ }
+}
diff --git a/src/dragonball/src/device_manager/virtio_net_dev_mgr.rs b/src/dragonball/src/device_manager/virtio_net_dev_mgr.rs
new file mode 100644
index 0000000000..3e81f29487
--- /dev/null
+++ b/src/dragonball/src/device_manager/virtio_net_dev_mgr.rs
@@ -0,0 +1,387 @@
+// Copyright 2020-2022 Alibaba, Inc. or its affiliates. All Rights Reserved.
+// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the THIRD-PARTY file.
+
+use std::convert::TryInto;
+use std::sync::Arc;
+
+use dbs_utils::net::{MacAddr, Tap, TapError};
+use dbs_utils::rate_limiter::BucketUpdate;
+use dbs_virtio_devices as virtio;
+use dbs_virtio_devices::net::Net;
+use dbs_virtio_devices::Error as VirtioError;
+use serde_derive::{Deserialize, Serialize};
+
+use crate::address_space_manager::GuestAddressSpaceImpl;
+use crate::config_manager::{
+ ConfigItem, DeviceConfigInfo, DeviceConfigInfos, RateLimiterConfigInfo,
+};
+use crate::device_manager::{DeviceManager, DeviceMgrError, DeviceOpContext};
+use crate::get_bucket_update;
+
+use super::DbsMmioV2Device;
+
+/// Default number of virtio queues, one rx/tx pair.
+pub const NUM_QUEUES: usize = 2;
+/// Default size of virtio queues.
+pub const QUEUE_SIZE: u16 = 256;
+// The flag of whether to use the shared irq.
+const USE_SHARED_IRQ: bool = true;
+// The flag of whether to use the generic irq.
+const USE_GENERIC_IRQ: bool = true;
+
+/// Errors associated with virtio net device operations.
+#[derive(Debug, thiserror::Error)]
+pub enum VirtioNetDeviceError {
+ /// The virtual machine instance ID is invalid.
+ #[error("the virtual machine instance ID is invalid")]
+ InvalidVMID,
+
+ /// The iface ID is invalid.
+ #[error("invalid virtio-net iface id '{0}'")]
+ InvalidIfaceId(String),
+
+ /// Invalid queue number configuration for virtio_net device.
+ #[error("invalid queue number {0} for virtio-net device")]
+ InvalidQueueNum(usize),
+
+ /// Failure from device manager operations.
+ #[error("failure in device manager operations, {0}")]
+ DeviceManager(#[source] DeviceMgrError),
+
+ /// The device ID is already in use.
+ #[error("the device ID {0} already exists")]
+ DeviceIDAlreadyExist(String),
+
+ /// The MAC address is already in use.
+ #[error("the guest MAC address {0} is already in use")]
+ GuestMacAddressInUse(String),
+
+ /// The host device name is already in use.
+ #[error("the host device name {0} is already in use")]
+ HostDeviceNameInUse(String),
+
+ /// Cannot open/create tap device.
+ #[error("cannot open TAP device")]
+ OpenTap(#[source] TapError),
+
+ /// Failure from virtio subsystem.
+ #[error(transparent)]
+ Virtio(VirtioError),
+
+ /// Failed to send patch message to net epoll handler.
+ #[error("could not send patch message to the net epoll handler")]
+ NetEpollHanderSendFail,
+
+ /// The update is not allowed after booting the microvm.
+ #[error("update operation is not allowed after boot")]
+ UpdateNotAllowedPostBoot,
+
+ /// TODO: split this error at some point: internal errors are due to resource
+ /// exhaustion, user errors are due to invalid permissions.
+ #[error("cannot create network device: {0}")]
+ CreateNetDevice(#[source] VirtioError),
+
+ /// Cannot initialize an MMIO Network Device or add a device to the MMIO Bus.
+ #[error("failure while registering network device: {0}")]
+ RegisterNetDevice(#[source] DeviceMgrError),
+}
+
+/// Configuration information for virtio net devices.
+#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
+pub struct VirtioNetDeviceConfigUpdateInfo {
+ /// ID of the guest network interface.
+ pub iface_id: String,
+ /// Rate limiter for received packets.
+ pub rx_rate_limiter: Option<RateLimiterConfigInfo>,
+ /// Rate limiter for transmitted packets.
+ pub tx_rate_limiter: Option<RateLimiterConfigInfo>,
+}
+
+impl VirtioNetDeviceConfigUpdateInfo {
+ /// Provides a `BucketUpdate` description for the RX bandwidth rate limiter.
+ pub fn rx_bytes(&self) -> BucketUpdate {
+ get_bucket_update!(self, rx_rate_limiter, bandwidth)
+ }
+ /// Provides a `BucketUpdate` description for the RX ops rate limiter.
+ pub fn rx_ops(&self) -> BucketUpdate {
+ get_bucket_update!(self, rx_rate_limiter, ops)
+ }
+ /// Provides a `BucketUpdate` description for the TX bandwidth rate limiter.
+ pub fn tx_bytes(&self) -> BucketUpdate {
+ get_bucket_update!(self, tx_rate_limiter, bandwidth)
+ }
+ /// Provides a `BucketUpdate` description for the TX ops rate limiter.
+ pub fn tx_ops(&self) -> BucketUpdate {
+ get_bucket_update!(self, tx_rate_limiter, ops)
+ }
+}
+
+/// Configuration information for virtio net devices.
+#[derive(Clone, Debug, Deserialize, PartialEq, Serialize, Default)]
+pub struct VirtioNetDeviceConfigInfo {
+ /// ID of the guest network interface.
+ pub iface_id: String,
+ /// Host level path for the guest network interface.
+ pub host_dev_name: String,
+ /// Number of virtqueues to use.
+ pub num_queues: usize,
+ /// Size of each virtqueue, in number of descriptor entries.
+ pub queue_size: u16,
+ /// Guest MAC address.
+ pub guest_mac: Option<MacAddr>,
+ /// Rate limiter for received packets.
+ pub rx_rate_limiter: Option<RateLimiterConfigInfo>,
+ /// Rate limiter for transmitted packets.
+ pub tx_rate_limiter: Option<RateLimiterConfigInfo>,
+ /// Allow duplicate MAC addresses.
+ pub allow_duplicate_mac: bool,
+ /// Use a shared irq.
+ pub use_shared_irq: Option<bool>,
+ /// Use a generic irq.
+ pub use_generic_irq: Option<bool>,
+}
+
+impl VirtioNetDeviceConfigInfo {
+ /// Returns the tap device that `host_dev_name` refers to.
+ pub fn open_tap(&self) -> std::result::Result<Tap, VirtioNetDeviceError> {
+ Tap::open_named(self.host_dev_name.as_str(), false).map_err(VirtioNetDeviceError::OpenTap)
+ }
+
+ /// Returns a reference to the guest MAC address. If the MAC address is not
+ /// configured, it returns None.
+ pub fn guest_mac(&self) -> Option<&MacAddr> {
+ self.guest_mac.as_ref()
+ }
+
+ /// Returns the rx and tx queue sizes, one entry per virtqueue.
+ pub fn queue_sizes(&self) -> Vec<u16> {
+ let mut queue_size = self.queue_size;
+ if queue_size == 0 {
+ queue_size = QUEUE_SIZE;
+ }
+ let num_queues = if self.num_queues > 0 {
+ self.num_queues
+ } else {
+ NUM_QUEUES
+ };
+
+ (0..num_queues).map(|_| queue_size).collect::<Vec<u16>>()
+ }
+}
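+
+// Example (editorial): with an all-default config (num_queues == 0 and
+// queue_size == 0), queue_sizes() falls back to NUM_QUEUES and QUEUE_SIZE and
+// returns vec![256, 256], i.e. one rx/tx virtqueue pair of 256 descriptors each.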
+
+impl ConfigItem for VirtioNetDeviceConfigInfo {
+ type Err = VirtioNetDeviceError;
+
+ fn id(&self) -> &str {
+ &self.iface_id
+ }
+
+ fn check_conflicts(&self, other: &Self) -> Result<(), VirtioNetDeviceError> {
+ if self.iface_id == other.iface_id {
+ Err(VirtioNetDeviceError::DeviceIDAlreadyExist(
+ self.iface_id.clone(),
+ ))
+ } else if !other.allow_duplicate_mac
+ && self.guest_mac.is_some()
+ && self.guest_mac == other.guest_mac
+ {
+ Err(VirtioNetDeviceError::GuestMacAddressInUse(
+ self.guest_mac.as_ref().unwrap().to_string(),
+ ))
+ } else if self.host_dev_name == other.host_dev_name {
+ Err(VirtioNetDeviceError::HostDeviceNameInUse(
+ self.host_dev_name.clone(),
+ ))
+ } else {
+ Ok(())
+ }
+ }
+}
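+
+// Conflict-check sketch (editorial; DeviceConfigInfos is assumed to run
+// check_conflicts() pairwise when a config is inserted or updated):
+//
+//     let a = VirtioNetDeviceConfigInfo {
+//         iface_id: "eth0".into(), host_dev_name: "tap0".into(), ..Default::default() };
+//     let b = VirtioNetDeviceConfigInfo {
+//         iface_id: "eth1".into(), host_dev_name: "tap0".into(), ..Default::default() };
+//     assert!(a.check_conflicts(&b).is_err()); // HostDeviceNameInUse("tap0")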
+
+/// Virtio Net Device Info
+pub type VirtioNetDeviceInfo = DeviceConfigInfo<VirtioNetDeviceConfigInfo>;
+
+/// Device manager to manage all virtio net devices.
+pub struct VirtioNetDeviceMgr {
+ pub(crate) info_list: DeviceConfigInfos<VirtioNetDeviceConfigInfo>,
+ pub(crate) use_shared_irq: bool,
+}
+
+impl VirtioNetDeviceMgr {
+ /// Gets the index of the device with the specified `iface_id` if it exists in the list.
+ pub fn get_index_of_iface_id(&self, if_id: &str) -> Option<usize> {
+ self.info_list
+ .iter()
+ .position(|info| info.config.iface_id.eq(if_id))
+ }
+
+ /// Insert or update a virtio net device into the manager.
+ pub fn insert_device(
+ device_mgr: &mut DeviceManager,
+ mut ctx: DeviceOpContext,
+ config: VirtioNetDeviceConfigInfo,
+ ) -> std::result::Result<(), VirtioNetDeviceError> {
+ if config.num_queues % 2 != 0 {
+ return Err(VirtioNetDeviceError::InvalidQueueNum(config.num_queues));
+ }
+ if !cfg!(feature = "hotplug") && ctx.is_hotplug {
+ return Err(VirtioNetDeviceError::UpdateNotAllowedPostBoot);
+ }
+
+ let mgr = &mut device_mgr.virtio_net_manager;
+
+ slog::info!(
+ ctx.logger(),
+ "add virtio-net device configuration";
+ "subsystem" => "net_dev_mgr",
+ "id" => &config.iface_id,
+ "host_dev_name" => &config.host_dev_name,
+ );
+
+ let device_index = mgr.info_list.insert_or_update(&config)?;
+
+ if ctx.is_hotplug {
+ slog::info!(
+ ctx.logger(),
+ "attach virtio-net device";
+ "subsystem" => "net_dev_mgr",
+ "id" => &config.iface_id,
+ "host_dev_name" => &config.host_dev_name,
+ );
+
+ match Self::create_device(&config, &mut ctx) {
+ Ok(device) => {
+ let dev = DeviceManager::create_mmio_virtio_device(
+ device,
+ &mut ctx,
+ config.use_shared_irq.unwrap_or(mgr.use_shared_irq),
+ config.use_generic_irq.unwrap_or(USE_GENERIC_IRQ),
+ )
+ .map_err(VirtioNetDeviceError::DeviceManager)?;
+ ctx.insert_hotplug_mmio_device(&dev.clone(), None)
+ .map_err(VirtioNetDeviceError::DeviceManager)?;
+ // live-upgrade needs to save/restore the device from info.device.
+ mgr.info_list[device_index].set_device(dev);
+ }
+ Err(e) => {
+ mgr.info_list.remove(device_index);
+ return Err(VirtioNetDeviceError::Virtio(e));
+ }
+ }
+ }
+
+ Ok(())
+ }
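+
+ // Hot-add usage sketch (editorial; `vm` is assumed booted with hotplug enabled,
+ // and `device_manager_mut()` is a hypothetical accessor for illustration):
+ //
+ //     let cfg = VirtioNetDeviceConfigInfo {
+ //         iface_id: "eth0".into(),
+ //         host_dev_name: "tap0".into(),
+ //         ..Default::default()
+ //     };
+ //     let ctx = DeviceOpContext::create_hotplug_ctx(&vm, Some(epoll_mgr));
+ //     VirtioNetDeviceMgr::insert_device(vm.device_manager_mut(), ctx, cfg)?;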
+
+ /// Update the ratelimiter settings of a virtio net device.
+ pub fn update_device_ratelimiters(
+ device_mgr: &mut DeviceManager,
+ new_cfg: VirtioNetDeviceConfigUpdateInfo,
+ ) -> std::result::Result<(), VirtioNetDeviceError> {
+ let mgr = &mut device_mgr.virtio_net_manager;
+ match mgr.get_index_of_iface_id(&new_cfg.iface_id) {
+ Some(index) => {
+ let config = &mut mgr.info_list[index].config;
+ config.rx_rate_limiter = new_cfg.rx_rate_limiter.clone();
+ config.tx_rate_limiter = new_cfg.tx_rate_limiter.clone();
+ let device = mgr.info_list[index].device.as_mut().ok_or_else(|| {
+ VirtioNetDeviceError::InvalidIfaceId(new_cfg.iface_id.clone())
+ })?;
+
+ if let Some(mmio_dev) = device.as_any().downcast_ref::<DbsMmioV2Device>() {
+ let guard = mmio_dev.state();
+ let inner_dev = guard.get_inner_device();
+ if let Some(net_dev) = inner_dev
+ .as_any()
+ .downcast_ref::<Net<GuestAddressSpaceImpl>>()