diff --git a/.github/workflows/commit-message-check.yaml b/.github/workflows/commit-message-check.yaml index fbdb02b6df..191e94b0da 100644 --- a/.github/workflows/commit-message-check.yaml +++ b/.github/workflows/commit-message-check.yaml @@ -47,7 +47,7 @@ jobs: uses: tim-actions/commit-message-checker-with-regex@v0.3.1 with: commits: ${{ steps.get-pr-commits.outputs.commits }} - pattern: '^.{0,75}(\n.*)*$' + pattern: '^.{0,75}(\n.*)*$|^Merge pull request (?:kata-containers)?#[\d]+ from.*' error: 'Subject too long (max 75)' post_error: ${{ env.error_msg }} @@ -95,6 +95,6 @@ jobs: uses: tim-actions/commit-message-checker-with-regex@v0.3.1 with: commits: ${{ steps.get-pr-commits.outputs.commits }} - pattern: '^[\s\t]*[^:\s\t]+[\s\t]*:' + pattern: '^[\s\t]*[^:\s\t]+[\s\t]*:|^Merge pull request (?:kata-containers)?#[\d]+ from.*' error: 'Failed to find subsystem in subject' post_error: ${{ env.error_msg }} diff --git a/.github/workflows/move-issues-to-in-progress.yaml b/.github/workflows/move-issues-to-in-progress.yaml index 0e15abaea3..5ab9beb98d 100644 --- a/.github/workflows/move-issues-to-in-progress.yaml +++ b/.github/workflows/move-issues-to-in-progress.yaml @@ -59,7 +59,7 @@ jobs: exit 1 } - project_name="Issue backlog" + project_name="runtime-rs" project_type="org" project_column="In progress" diff --git a/Makefile b/Makefile index 2b6f6a748f..4d2be6b4d8 100644 --- a/Makefile +++ b/Makefile @@ -6,8 +6,10 @@ # List of available components COMPONENTS = +COMPONENTS += libs COMPONENTS += agent COMPONENTS += runtime +COMPONENTS += runtime-rs # List of available tools TOOLS = @@ -21,11 +23,6 @@ STANDARD_TARGETS = build check clean install test vendor default: all -all: logging-crate-tests build - -logging-crate-tests: - make -C src/libs/logging - include utils.mk include ./tools/packaging/kata-deploy/local-build/Makefile @@ -49,7 +46,6 @@ docs-url-alive-check: binary-tarball \ default \ install-binary-tarball \ - logging-crate-tests \ static-checks \ 
docs-url-alive-check diff --git a/README.md b/README.md index 90a5c9209a..4a7a6ea9a7 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,7 @@ See the [official documentation](docs) including: - [Developer guide](docs/Developer-Guide.md) - [Design documents](docs/design) - [Architecture overview](docs/design/architecture) + - [Architecture 3.0 overview](docs/design/architecture_3.0/) ## Configuration @@ -117,6 +118,8 @@ The table below lists the core parts of the project: |-|-|-| | [runtime](src/runtime) | core | Main component run by a container manager and providing a containerd shimv2 runtime implementation. | | [agent](src/agent) | core | Management process running inside the virtual machine / POD that sets up the container environment. | +| [libraries](src/libs) | core | Library crates shared by multiple Kata Container components or published to [`crates.io`](https://crates.io/index.html) | +| [`dragonball`](src/dragonball) | core | An optional built-in VMM brings out-of-the-box Kata Containers experience with optimizations on container workloads | | [documentation](docs) | documentation | Documentation common to all components (such as design and install documentation). | | [libraries](src/libs) | core | Library crates shared by multiple Kata Container components or published to [`crates.io`](https://crates.io/index.html) | | [tests](https://github.com/kata-containers/tests) | tests | Excludes unit tests which live with the main code. | diff --git a/docs/design/architecture_3.0/README.md b/docs/design/architecture_3.0/README.md new file mode 100644 index 0000000000..562404bd51 --- /dev/null +++ b/docs/design/architecture_3.0/README.md @@ -0,0 +1,170 @@ +# Kata 3.0 Architecture +## Overview +In cloud-native scenarios, there is an increased demand for container startup speed, resource consumption, stability, and security, areas where the present Kata Containers runtime is challenged relative to other runtimes. 
To achieve this, we propose a solid, field-tested and secure Rust version of the kata-runtime. + +Also, we provide the following designs: + +- Turn key solution with builtin `Dragonball` Sandbox +- Async I/O to reduce resource consumption +- Extensible framework for multiple services, runtimes and hypervisors +- Lifecycle management for sandbox and container associated resources + +### Rationale for choosing Rust + +We chose Rust because it is designed as a system language with a focus on efficiency. +In contrast to Go, Rust makes a variety of design trade-offs in order to obtain +good execution performance, with innovative techniques that, in contrast to C or +C++, provide reasonable protection against common memory errors (buffer +overflow, invalid pointers, range errors), error checking (ensuring errors are +dealt with), thread safety, ownership of resources, and more. + +These benefits were verified in our project when the Kata Containers guest agent +was rewritten in Rust. We notably saw a significant reduction in memory usage +with the Rust-based implementation. + + +## Design +### Architecture +![architecture](./images/architecture.png) +### Built-in VMM +#### Current Kata 2.x architecture +![not_builtin_vmm](./images/not_built_in_vmm.png) +As shown in the figure, runtime and VMM are separate processes. The runtime process forks the VMM process and interacts through the inter-process RPC. Typically, process interaction consumes more resources than peers within the process, and it will result in relatively low efficiency. At the same time, the cost of resource operation and maintenance should be considered. For example, when performing resource recovery under abnormal conditions, the exception of any process must be detected by others and activate the appropriate resource recovery process. If there are additional processes, the recovery becomes even more difficult. 
+#### How To Support Built-in VMM +We provide `Dragonball` Sandbox to enable built-in VMM by integrating VMM's function into the Rust library. We could perform VMM-related functionalities by using the library. Because runtime and VMM are in the same process, there is a benefit in terms of message processing speed and API synchronization. It can also guarantee the consistency of the runtime and the VMM life cycle, reducing resource recovery and exception handling maintenance, as shown in the figure: +![builtin_vmm](./images/built_in_vmm.png) +### Async Support +#### Why Need Async +**Async is already in stable Rust and allows us to write async code** + +- Async provides significantly reduced CPU and memory overhead, especially for workloads with a large amount of IO-bound tasks +- Async is zero-cost in Rust, which means that you only pay for what you use. Specifically, you can use async without heap allocations and dynamic dispatch, which greatly improves efficiency +- For more (see [Why Async?](https://rust-lang.github.io/async-book/01_getting_started/02_why_async.html) and [The State of Asynchronous Rust](https://rust-lang.github.io/async-book/01_getting_started/03_state_of_async_rust.html)). + +**There may be several problems if implementing kata-runtime with Sync Rust** + +- Too many threads with a new TTRPC connection + - TTRPC threads: reaper thread(1) + listener thread(1) + client handler(2) +- Add 3 I/O threads with a new container +- In Sync mode, implementing a timeout mechanism is challenging. For example, in TTRPC API interaction, the timeout mechanism is difficult to align with Golang +#### How To Support Async +The kata-runtime is controlled by TOKIO_RUNTIME_WORKER_THREADS to run the OS thread, which is 2 threads by default. For TTRPC and container-related threads run in the `tokio` thread in a unified manner, and related dependencies need to be switched to Async, such as Timer, File, Netlink, etc. 
With the help of Async, we can easily support non-blocking I/O and timers. Currently, we only utilize Async for kata-runtime. The built-in VMM keeps the OS thread because it can ensure that the threads are controllable. + +**For N tokio worker threads and M containers** + +- Sync runtime(both OS thread and `tokio` task are OS thread but without `tokio` worker thread) OS thread number: 4 + 12*M +- Async runtime(only OS thread is OS thread) OS thread number: 2 + N +```shell +├─ main(OS thread) +├─ async-logger(OS thread) +└─ tokio worker(N * OS thread) + ├─ agent log forwarder(1 * tokio task) + ├─ health check thread(1 * tokio task) + ├─ TTRPC reaper thread(M * tokio task) + ├─ TTRPC listener thread(M * tokio task) + ├─ TTRPC client handler thread(7 * M * tokio task) + ├─ container stdin io thread(M * tokio task) + ├─ container stdout io thread(M * tokio task) + └─ container stderr io thread(M * tokio task) +```
Also, we abstract a common interface for each resource and use subclass operations to evaluate the differences between different subtypes. +![resource manager](./images/resourceManager.png) + +## Roadmap + +- Stage 1 (June): provide basic features (current delivered) +- Stage 2 (September): support common features +- Stage 3: support full features + +| **Class** | **Sub-Class** | **Development Stage** | +| -------------------------- | ------------------- | --------------------- | +| Service | task service | Stage 1 | +| | extend service | Stage 3 | +| | image service | Stage 3 | +| Runtime handler | `Virt-Container` | Stage 1 | +| | `Wasm-Container` | Stage 3 | +| | `Linux-Container` | Stage 3 | +| Endpoint | VETH Endpoint | Stage 1 | +| | Physical Endpoint | Stage 2 | +| | Tap Endpoint | Stage 2 | +| | `Tuntap` Endpoint | Stage 2 | +| | `IPVlan` Endpoint | Stage 3 | +| | `MacVlan` Endpoint | Stage 3 | +| | MACVTAP Endpoint | Stage 3 | +| | `VhostUserEndpoint` | Stage 3 | +| Network Interworking Model | Tc filter | Stage 1 | +| | Route | Stage 1 | +| | `MacVtap` | Stage 3 | +| Storage | Virtio-fs | Stage 1 | +| | `nydus` | Stage 2 | +| Hypervisor | `Dragonball` | Stage 1 | +| | QEMU | Stage 2 | +| | ACRN | Stage 3 | +| | Cloud Hypervisor | Stage 3 | +| | Firecracker | Stage 3 | + +## FAQ + +- Are the "service", "message dispatcher" and "runtime handler" all part of the single Kata 3.x runtime binary? + + Yes. They are components in Kata 3.x runtime. And they will be packed into one binary. + 1. Service is an interface, which is responsible for handling multiple services like task service, image service and etc. + 2. Message dispatcher, it is used to match multiple requests from the service module. + 3. Runtime handler is used to deal with the operation for sandbox and container. +- What is the name of the Kata 3.x runtime binary? + + Apparently we can't use `containerd-shim-v2-kata` because it's already used. We are facing the hardest issue of "naming" again. 
Any suggestions are welcome. + Internally we use `containerd-shim-v2-rund`. + +- Is the Kata 3.x design compatible with the containerd shimv2 architecture? + + Yes. It is designed to follow the functionality of the Go version of Kata. And it implements the `containerd shim v2` interface/protocol. + +- How will users migrate to the Kata 3.x architecture? + + The migration plan will be provided before Kata 3.x is merged into the main branch. + +- Is `Dragonball` limited to its own built-in VMM? Can the `Dragonball` system be configured to work using an external `Dragonball` VMM/hypervisor? + + The `Dragonball` could work as an external hypervisor. However, stability and performance are challenging in this case. Built-in VMM could optimise the container overhead, and it's easy to maintain stability. + + `runD` is the `containerd-shim-v2` counterpart of `runC` and can run a pod/containers. `Dragonball` is a `microvm`/VMM that is designed to run container workloads. Instead of `microvm`/VMM, we sometimes refer to it as a secure sandbox. + +- QEMU, Cloud Hypervisor and Firecracker support are planned, but how would that work? Are they going to work in separate processes? + + Yes. They are unable to work as built-in VMM. + +- What is `upcall`? + + The `upcall` is used to hotplug CPU/memory/MMIO devices, and it solves two issues. + 1. avoid dependency on PCI/ACPI + 2. avoid dependency on `udevd` within guest and get deterministic results for hotplug operations. So `upcall` is an alternative to ACPI based CPU/memory/device hotplug. And we may cooperate with the community to add support for ACPI based CPU/memory/device hotplug if needed. + + `Dbs-upcall` is a `vsock-based` direct communication tool between VMM and guests. The server side of the `upcall` is a driver in the guest kernel (kernel patches are needed for this feature) and it'll start to serve the requests once the kernel has started. And the client side is in the VMM, it'll be a thread that communicates with VSOCK through `uds`. 
We have accomplished device hotplug / hot-unplug directly through `upcall` in order to avoid virtualization of ACPI to minimize the virtual machine's overhead. And there could be many other usages of this direct communication channel. It's already open source. + https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-upcall + +- The URL above says the kernel patches work with 4.19, but do they also work with 5.15+ ? + + Forward compatibility should be achievable; we have ported it to a 5.10 based kernel. + +- Are these patches platform-specific or would they work for any architecture that supports VSOCK? + + It's almost platform independent, but some messages related to CPU hotplug are platform dependent. + +- Could the kernel driver be replaced with a userland daemon in the guest using loopback VSOCK? + + We need to create device nodes for hot-added CPU/memory/devices, so it's not easy for a userspace daemon to do these tasks. + +- The fact that `upcall` allows communication between the VMM and the guest suggests that this architecture might be incompatible with https://github.com/confidential-containers where the VMM should have no knowledge of what happens inside the VM. + + 1. `TDX` doesn't support CPU/memory hotplug yet. + 2. For ACPI based device hotplug, it depends on the ACPI `DSDT` table, and the guest kernel will execute `ASL` code while handling those hotplug events. And it should be easier to audit VSOCK based communication than ACPI `ASL` methods. + +- What is the security boundary for the monolithic / "Built-in VMM" case? + + It has the security boundary of virtualization. More details will be provided in the next stage. 
\ No newline at end of file diff --git a/docs/design/architecture_3.0/images/architecture.png b/docs/design/architecture_3.0/images/architecture.png new file mode 100644 index 0000000000..5825f1eb4e Binary files /dev/null and b/docs/design/architecture_3.0/images/architecture.png differ diff --git a/docs/design/architecture_3.0/images/built_in_vmm.png b/docs/design/architecture_3.0/images/built_in_vmm.png new file mode 100644 index 0000000000..91386c5eb6 Binary files /dev/null and b/docs/design/architecture_3.0/images/built_in_vmm.png differ diff --git a/docs/design/architecture_3.0/images/framework.png b/docs/design/architecture_3.0/images/framework.png new file mode 100644 index 0000000000..992afdfff5 Binary files /dev/null and b/docs/design/architecture_3.0/images/framework.png differ diff --git a/docs/design/architecture_3.0/images/not_built_in_vmm.png b/docs/design/architecture_3.0/images/not_built_in_vmm.png new file mode 100644 index 0000000000..ad1d5b8e34 Binary files /dev/null and b/docs/design/architecture_3.0/images/not_built_in_vmm.png differ diff --git a/docs/design/architecture_3.0/images/resourceManager.png b/docs/design/architecture_3.0/images/resourceManager.png new file mode 100644 index 0000000000..7a8fcae0c7 Binary files /dev/null and b/docs/design/architecture_3.0/images/resourceManager.png differ diff --git a/docs/design/architecture_3.0/images/source_code/kata_3.0_images.drawio b/docs/design/architecture_3.0/images/source_code/kata_3.0_images.drawio new file mode 100644 index 0000000000..61882a8892 --- /dev/null +++ b/docs/design/architecture_3.0/images/source_code/kata_3.0_images.drawio @@ -0,0 +1 @@ 
+5Vxbk6MoFP41qdp9yFQUNd2Pnb7tw8zWbHXVTvcjicSwjZJBcptfvxBRI5jWXIxm+ilyRMTvfH4c8JAeuA/XzwzOZ9+oj0jPHvjrHnjo2bY1vPXEj7RsEos3cBJDwLCvKuWGF/wLKeNAWRfYR3GhIqeUcDwvGic0itCEF2yQMboqVptSUrzrHAbIMLxMIDGtP7DPZ4n1xh3k9r8QDmbpna2BOhPCtLIyxDPo09WOCTz2wD2jlCdH4foeEQleikty3dOes1nHGIp4nQs8PnwlT39PfeufH084Dp5fFkHfcZNmlpAs1BOr3vJNCgGji8hHspVBD4xWM8zRyxxO5NmVcLqwzXhIRMkSh1NMyD0llIlyRCNRaWR2VPV9iRhH6x2T6vgzoiHibCOqqLNOiqpikT1U5VXuk6zObMcf9o0yQsWDIGs7h0ocKLQOQM66CuAsrwicU4KbU4JbY7DZBmwcxu8SN8SWWGCzH0SrNRBviiB6JoiWXQKi0xSI7rCafMgXOqaKlPEZDWgEyWNuHRXpmdf5Sulc4fkf4nyjRBkuOC2i7cN4lnkm5oy+ZxoJPkKeQxYg/sHzqTFDPsKH/mGIQI6XRakuQ1td+p1i0ZXMr0Dza+bEtImYLtgEqas0n2XdON6NwPAiDuVw1OmXAdgdexkcA0XKZ4h1G0XH6xiK7sGBwPkFo1qy9iKfvKnVklIpPTeNSM8dY3CzU2EuJSWur0yp35/21NeHea2+OEh6cFb58gzGhCiOtwLmwVC+RtE4nm+95hEuPT6HUYFR3s+FjH5Hk+T9upO4BuM/PAGn6L/o1aBw+Kc8ls2JMD7i/SkMMdkkl2W37NkAOOIXMgyJYc0ai2EU94U+4GneC3EUyF8fC2ri8YKjtOMCnqTvSQVzrCVEzEdQW2KSqcJGY8OOmmQR3UXUZNhGOILWmL/Ky7+4qvS2c+ZhrVreFjaq0LygnD2WOckxN4eqfCuE7muErj3dy2bfZwfu1gBOSIg/putORhdF+IDVdnSRdmAHviVmvD8ROg5xtI3WZgJPIo80PMUz8yJoSeSgwVaCJCQ4iERxIpAUDYORRBBPILlTJ0Ls+2SfbBffgzN4xbKqWV3qFn1acj63mIsYux4JYSRGc9MjHWC4PiXvAMXNlY0uSqvjVgMHLiutljkP7rK2DjUA3daZZ86ACY4W68+lri6ooQhlS53Nqas5pb4SdfWsznHcnGxeg7o6oH11NdeKr0hdnboxUnPMM6dMKxiHn0xcnWpeX1hczQnZlYpr+xRPb9ZxcdVXBZyyEd4rI2FzXzPNCVSE+Iqy9y4yr699unFKvgZfmHnmpIkhtfS2/yX+bWTVcEjdWWxjsmqbc4l4Jh6xP42vgdHuoHVGm0G/TPG5Evg8t234zGh1Scki7ORHXB0+rywquix8ZlTUxZHcHhw5kltuU8gBMwb6icJFF2lnd24YB2YYtAw/wwBuuKL1ARyYEZXPYECjMZQXd5/N7Q/h4DpyYnUR9cqAu7CImqt075BDeasAqRy6jtOv/TEcmCHQv98+j4SW0viyEmoGUe+IRegq5NO7bZu/jikCaULmp14zLf3cf9E1U6dG9g+K/Du5bUbiSGAc40nRGUWYjkm5Eoiyzetu4S1vQRbzy7al9Lpjs80rM7kUKpWZXKBuKteOg90S/6a2E7PXbe1TiVsze91MNtWD2KHWUIKM0dChSa16kqrnXiBJ1THFXMUigyxbVX8FhH+/wrHQ+wLz68sLQzH+Bcfb9iRzFTCicXfUcx9KyfrxC6uLTrbfTd2lt7ulrHSpYvBFONYu4J9uYTqWh2kVOp3GqJENEumb0oRg2bUFqyprXdT9jhgWDytpoWlcLmtvBVUr17jmRSxFsCMiZunZNPqnmroiZoNycTlexM5OZnNmfiKZq4h5bhJWj5BWp8hl60mC+sBWl1x6Voc+0rbPLTOFrmFunRT4XUoU0+XwauIOO01c91jiGg3ddo65Byd2NbDv7EhinrB
frZKRTkt7YLM/XjiUavpIbp88ANfscMXWOD3d5sStcaKY/2VDUj3/4wvw+D8=7Zxde5pKEIB/jZf68K1eRk2T9DQnaUxi25s+K6xAg6yFRWN+/VniouBsDTZAICe96MMOuIR3Zmdn9qulDuePZwFaOJfEwl5LkazHljpqKUpfM9j/sWC9ERgSF9iBa21E8k4wdp8wF0pcGrkWDjMPUkI86i6yQpP4PjZpRoaCgKyyj82Il33rAtkYCMYm8qB04lrU2Uh7urSTn2PXdpI3yxK/M0fJw1wQOsgiq5RIPW2pw4AQurmaPw6xF7NLuOABUabU+O63L0YreuZrshO0N5V9OuYn208IsE//umo6Gc4847cbkadfN79Hq98Xs59tZVP1EnkR58W/la4TgAGJfAvHlUgtdbByXIrHC2TGd1fMYpjMoXOPlWR2OXM9b0g8ErCyT3z20CDnH88/cokDih9TquMfc4bJHNNgzR7hdzWuFm6XcpeXVzstdxPVOSkN68mDiFuWva16R49dcIBH6On9wNRzsjTKQqkClBSFDzFLHCxdxuvPYOWXwYY0IA84QdtS1L7RVZGxh5zJsWzpuFsid1nJghcYsSwCr5YFXgPg3XnsYQsiv8d3NsOGaTK5hUJnWwdQj9XtTyWpRDWoWs3UYAA1EOrgoAw1cM+S0UBZnHXj7Th/icLPeLK6o9HyZnL1ebycRcO20gegXwZ8pOtO+5E/e56KPLuW07MrZRFPVJkifu8GlEmGxKfI9WMrLxA9A9+zNBH6njJVjVLR9zt6Bv42KE7RV0T0tdLow451gsI5p180/ipdiyHloS1rJdF2aIS/Rku8uB6YT6P19f2Pn1dtaOpzHIab7tRywwWiplO0uSPcm5kiczfMHp7OKgzIlQpdjTC5kQH+IPKpO4/xO8i3vHfkavbY5w3gS2MPAxhnvYh71pAU72BK8ynApfQgV7VSrl3Atbk0VSknze2YzKv6Q88LrZF93r6/uyBXET0Z4ytRfwh4YsvGY14kAXWITXzkne6kgyzx3TNfCFlwzr8wpWs+OoYiSvYTUhRQflfWmQD7yVCarBytkpBEgYkPGBEPetlLbUwP0FF4NhgTOKjhAHuIusvskFvhtg9j9VGAGOgpin9bcD5UVivogVZgwFbQKykHEneTMEz5iufR64hWEvNBloKQr1qWMOYYeiSymOg8T/9XF7JyDdEqAO0nN8BmgMyHg4FcXZjW0VyhSw0w7z6aElZocq4oTa4ySlOgS20wT2GcJuJZSJwmBgr96r+Yrkjw0JiOX1dr1/wTq0xRvSGEzsLGQNUgVEHqWy1UmFLcEy+aFz9kX6GlvjlUOC81tBm/RWOgGm8K9ay3Gt2uL6eDrvXln9n6K5mGrmB4cuyw7/7U6OZfoU8VQoUdlef60WOr+SPsWq75DFmTOn0VAi9kSkM4SAGAF463LKIyCLU0FRLdTppmQtd+WTyhp724YuVL5CO7BNMtja0hdTQ1C9fod9SuyGLL8Q9CvDrAe2JZLOliTlfagGOs7UZz1uUacIYTEDfbzPY9GLMeG7PR3/17Y95wYuJuYR4emK09Y0PuJv75Ra5yp9ctCW0PoL3F84WHaPGDNNUMLgpI18ya4bjY/eVlY0xZVXL3fbqAahGrOoVUBTMN98Pru8Zw1eXcXEtbUyXkCpOODVfJIXThRXaTCetSHQg3OMsQeANde8ndisd39dL4iubhDY+9dzBlF3Z8McKbdZm76G3zAHvh9hmgFMaI7s+1pxfncPYCdSDPtX1WNJk62MvUQUzcZSHNCb8xdy3refpfpOqsMdRKtbKkV9lyYD65dBlI0l6GxDw0fF+zVqRC1N2XUPcr9VEwteSkfUybzNmQ68UZppZW4pqa1t+KaOv1og0TS27VB6ep6oVZq7/zgEkmxzz1muSm9fqThrklJz2fu6QxoAUmXTM/LViE4WEbmfFHbBx2cxyIwKzrRhvmoEuHhMUHHtXMddW+X0wS0BT
v6+FFM2mLrLtmtGEWA/PT1GLk/31mmm9qUzQzX54Oc+ydwL51Eh86sYOd0k+WXGbNfixIrdnfFNkf/Y0//Fz4zut5Lowe07dG67/SR2YF/6Hdli+u4E/PN4sOTeCy3Av9+RuuievTlFvdN4pusgE3qWMzRcd/tlM4qKm/X5MhjvKSejekQL3PhrTF8ArbguFzmbZ1nJXUVvn70zd5ld99uaqy9Q2D+A9972lJsH1u/+iVvAoXVLU/FF2ywgXb1j8ULtBST9nXeCeJnV7fyoH9lK10mNN8KP14X1xLhQ9ul+sT+8yNHsjYG03a7rD3TbCWcEj8mWsxhbgo/vX2jAo4dFCPeH57SFlcsD0Uhvw6e8LOA6amw+/M2Del954eZ4bHbCzpdfRu1lQES6Bl4XLHApICocbhnAlQ7N/vJKuKrN7LRVa0gqY0sHCSZOEhP85ZSIw3WixIAAeHPhrR0dsIhG1I6nekbjnaNtrfroPJ/Q9f/hV8vnw6Pb3QJoKtBPBQH2kWb32VhsMDLez4QzfqcoSY0gdjWJUeQCNUS474pZDj2/6Mu5LxQxWkB6JGoZbXJs7dH9/P+rfawD25e3ryNam3vha0iRD51pTEe0HmRS9AfuOjf/b5y6ItuJrUKWnliZA/NH4z5YzemwYSX3NIAeouTHilBlhxd/DtJvrenR6snv4H7VtZe5s4FP01fkw+FhvbjzF2lpk2ScfTpM3LfDIIowYQFcLL/PqRQKzCcdIYnHSSl8DVgjnn6ujqSvR0099cEBC6n7ENvZ6m2JuePu1pmjocG+wft2xTi6EIw5IgW1QqDHP0LxRGRVhjZMOoUpFi7FEUVo0WDgJo0YoNEILX1WoO9qpPDcESSoa5BTzZeo9s6qbW0UAp7JcQLd3syaoiSnyQVRaGyAU2XpdM+qynmwRjml75GxN6HLwMF4apeXtnX9yB7/6Gajd35s33k7Sz85c0yV+BwID+ctcmcLW/f5Dg29Wn+defV6rpuYZooqyAFwu8kM8BZW8LyQpZULw63WZ4EhwHNuR9qj19snYRhfMQWLx0zTyI2Vzqe6LYBpGb13WQ55nYwyTpSHccaFgWs0eU4EdYKrGH4wVjQZ88870FPitIKNyUWBc4XEDsQ0q2rIoo1frivbe1+3XhIWpGu1vyjqweEE65zLsugGcXAvtmHq4f/Nn1l8Xi0p2YN+vZ5OvlxbyBh7Pbq9dhL8HtaDvgNhbGwGgRbuOtoa1KaEtQs8Ee8ksrJt52QoD1COl+zKsEOTigQhDVUY2RAAewRcjVcRVz1ZAxb4JcU9rCXJMwpyB6PJTQ1JwdqvYADpucfWwMddCms9e1pQn5br1dl5DH1IWkFY3vClVdOzaq/Y40xEPh5fH1RH9zejKQ8PdhFCWxS40G1hkLNuEzROQJfI87g/a1tzaFGm9mCo3SYuV00CIBg8Fb8/+hRMAdItRkeAEUcHU3PI72gl8t+ZULAtvjBQecZ9ksO7L7TUNipC10o92gssbI4NhDYiQxcg8iv8wI8DmcwSIKE2CUg1DSyfxbR1sbHhvtsYT2JxTEmxLc7xheXT82vFn6poTvrmiRvSOtYgY8tAy48jOYGP76hCOBLOCdiQIf2ba3a0Yu6FEaZoC2KJDi9qNHmKq8TCVxQJEPn/BtmYyqNItJs2EefY+kSZOA2iFps0909HBhBpPbS+THfwZabE0bEjm/dVw0rMVFow7Dokb85TEzJWCJgwXgbQ8Y+lgWHDhOU+ijG/pYt1sEfVTz+i4zDI2gy7mdL9CP38G8W0fy+PohJ2vOLBK8RyiP7pVNGZp0UWSjFSuwPBClzzR+xnwTZ+KBR3hiMWnl2OTWbAGVtQ6LB2dVYsMZQejYoNyq4QEBPAkrVQpKM6MPyBKx6fasl4hkug3C+LJtFCzrZh8FJxmWvETrh5vdPzwKQbDrZyVxg9TS9HBsX25DHvlFOF9UMjrSvqr9M3PYYEvQzqzvzo+1o0uCnOlqz4/tvgU
GcKH9Zn58jgi0kkjr/+nDR9diOTfC3NNBy5iAhZesZ0oiU8P3RYExIJaIiwfKixY1aRiXHRFIuoopFkG0mnqdlbjulCe8KIHwHHupMyfl3PIZrwpDANczewnn2ZsMJ4Vjwryg8FXu+Xk5W+fR7V/QAxThYNZQWzOLm4gCQs/4YYkd3SVLgua2MLB/sWUEl35VYPQzLpalKmyJs2LUVGr0htPnLWDaGh392l5GQy5d07scHHIqSxoDsORIu3yj4gc5ijm/xcJQ8MaQVwqO6oNJqYvV69b0EY6JBZ8AQYDL3mAJ6X4l4YA8STZJ8FmVKzVTJ5reYsR9OU/41w4tjGrkp+8jGtX4z3/FK5axch6hJph5KuhDLT/UskW11Gtbj3pjiqdLuWzIi74HvYQbRL/xiqequPvey9Jz/G66Eb0kN9vSzS0kbKQnI3L68mNxZeV9ait3r/JmmtS29OabH9n8rLemvZM1fPjDu/b968mPlW/OnZXrNqQQa9K7ayPkQ3o/pPeQ0nvS3x+otqW8jSNDzvN+CO8rhDc7Dl8S3qcEqW3dVUc7sgaH1139bvYQXW3v/1n8cG6+nG8j0xw36K7JddfmbgR463xvP5Lc7m1se+ZfCPCbpUhPSf4ZPUJquT15p7TVA0z1RW/TCQOjQUy0A4hJI93POED5im8NOoJ1aOyHtd8lqnKyOPSSbCnFHNs4DDGhH8PnpTyPVOV0rCv5n7p/MCnjU2XYIfPPOJH5uu9H9n6u00lue9B/KRN6x0TIue7nnsxUHJ7+Vkzzt2ROOkPY4s4auy0+zEsDlOLzRn32Hw==7Zpdc5s4FIZ/jS+bAQQYLhvno9OZ7GTqbbbdmx0FFFsTjByQY7u/fiVb4kNSCLbBcZskM4l1EAL0nPfo6OABGM1W1xmcT29IjJKBY8WrAbgYOI5tWz77xy3rrSXgLW6YZDgWnUrDGP9CwmgJ6wLHKK91pIQkFM/rxoikKYpozQazjCzr3R5IUr/qHE6QZhhHMNGt/+CYTsVTeFZp/4LwZEqLBxZHZlB2FoZ8CmOyrJjA5QCMMkLo9tNsNUIJnzw5L9HTl6evV9ercxz+m9w/X8Lnp+Gn7WBXu5xSPEKGUtrt0ILlM0wWYr7Es9K1nMCMLNIY8UGsAThfTjFF4zmM+NElcxlme8BJMiIJyZgpJSniJpLSKzjDCfeav/GMeYBj/YWW7O83MoMp6xLDfLoZ1maNKZ0l4iM/VbiRHbB2y4cXk/SMMopWFfRiMq4RmSGarVkXcdQVZwi/dgXlZekkXhhubdOKgwDZEQrHnBQjl5PPPoj534EF6JrFwAGxh4LYbQekgUGNFW/cQkpRlm4sjsWtOc3II6pcOnDuge/3CJBptUbQ1wm6UrtVgoWgOyfoni7B4+Px6nQCnY7tGuj4fcHxNDjZIqVsHhsY2S8x6pbFfeC5nqWzfwgiFEU9MvKdGqPQwMikoN5CoK8xuru5ecd8HFAXke29NaHh2yUM+69PTo+IwLCOaKgTckxxzg77QhRoiFiOyx76/coIhHVGbx7nQg3RZIFyhsh6ZF7Ld2CHpg3nm9+DGR6LiAMMsrFMyVtfSOR+tcKEbUkpxCnKDsfh+6ORZR2MY5LAPBdXzR8RjaayIfemVp87prDFamSCBnqDZv8G0I6GwxTWjoujxRYWpfFnXtZhrYh7M+aBH60w/cHBnHmi9VNMJv98sRLMNo21aBwGhbHI1j/kuLzxc3P9oSfb5VU3LXnZ3WjmZJFFqKGfyHgpzCaoaTyRd6G4Vu3SfaNavjCgl7YMJZDi53qNzOQP4gq3BPMMoshLLfvMqzuf7dUH2T65OK9allKGUvMnbdHdTo02EHMiuK50m/MOecMtqyuOcNbS37cjlt5fzOoBgmhRETAL4gX3fM07jy+kjvUQtNSDXK5PRBC+FSpLozJEWzm4vpKqKuN0pAZXqc7YvncENeglmA81dKMG+7TUYKuJ4m+mhqHVfFtq/3C3/o51DLX
pxbRbEmuCY8km1Tbrsiij12lggie8rBIxB2c5Njjn+SqOYPJZHJjhOOZjGxPsegp+mml1i4KOSTr9bUxblNyMUVOZu1dmWkuQRey0K5GzjKPm2LlvnO44anoto2ZwUkHTCdScuniHvnNODbT0XH2t8kLk7Cz46EXIMWKAIUXMairqf8QhdXtv1Qga3k8ba5ZdxKFvc/T1KY5/XYDl+P72P//O/v7dhFSD+H7ebqovn4Ft4NPX+00zH72ofIeZJshD3sDpTy/9O9JLJSenpY66qP0bOcmL7byeH1jw2mMvdLrreWOEOpEF3VZcz1brrm2X8yIPkD7cU4lMu+GgeVuj9pf31dW2xiwfveT/IZ9O0mFwWuoBijOqTr+veuTmvnf1vFJEsF3zffWrHudN1bPfZrIUTKG5XtTTuKi8Kp/w1OVzZlV+9izIFXciQ77Xj5jUd5sHLi2sWX5Pe9u9/LY7uPwf7Zpdb6M4FIZ/TS5b+QMIXHbSmVlp1aqadnZm9mblgpuwJTgCp0n216+dmAR/pCEJpKmmGqmDD8YEP+e8Pj7Qw4Px/GtBJqMbltCsh0Ay7+HrHkIQgkD8Jy2LlSWULWkYFmmiOm0M9+l/VBmBsk7ThJZaR85YxtOJboxZntOYazZSFGymd3timX7XCRlSy3Afk8y2/kgTPlJP4YON/Q+aDkd8/cDqzJhUnZWhHJGEzWom/LmHBwVjfHU0ng9oJievmpeb8jv4l9G+3/8zGz3+83D3cH97sRrsyz6XrB+hoDlvd2jF8oVkUzVf6ln5oprAgk3zhMpBQA9/mo1STu8nJJZnZ8JlhO0pzbIBy1ghTDnLqTSxnH8h4zSTXvOQjoUHIHBLZ+LvNzYmueiSkHK0HBaKxoiPM3UoL1VuBEPRbvjwapJeaMHpvIZeTcZXysaUFwvRRZ311BXKrz1FebZxEj+KVrZRzUFQoDoS5ZjD9cibyRcHav73YIHbZtFDOPFpmHjNgLzCQGMlG3eEc1rkSwsC0lrygj3T2q1D9IiDoEOAIlY1goFN0KtiVyOIuiLonS/B0+PxdTqhTQd6DjqdhZdvwSmmORfz+AojuI1RuyweQ9/zgc3+KYxpHHfIKEAao8jByBVBXleMAovRXzc3vzEfhPUggv5bE+q/XcJw+PqEOkSE+zqivk0IuXQORl0hCi1EIscVD/37hhGOdEZvrnORhWg4paVABJ6F18od2LFpw6flv6MZnooIwo6wAa7krSsk1X61xkRsSTlJc1ocjyMIBgMAjsYxzEhZqruWz5THo6pR7U1BlzumqMFq5IKGO4MG3wG0k+FwydppcTQoJ9A8uZJlHdGKpTenUvjpPOU/JZhLX7V+qcmUx9dzxWzZWKjGcVAEi2LxsxpXNn4t79/3q/bmrstWddv9aJZsWsR0d8bLSTGkr42n8i6aaNUu2zfq5QsH+spW0Izw9EWvkbn8Qd3hjqUyg1jnpQBe+rrzQV8fZPXk6rp6WcoYysyfrEV3NTXWQMKJyKLWbSI7lK/8ZHPFgdjw99WIG+9fz+oRAdGgpuMOiC3uucs7Tx9ILcdD2DAequX6TAIiAJGxNBpDNA0HLzBSVWOclqLBM6ozMPBPEA0N6mMf0XBQNMDzigZoJorvLBr64PWfZfaP9uuPwCmizS543rHECjiRbHJrs14VZew6DcnSoSyrxMLBRY6NP8l8NY1JdqVOjNMkkWM7E2w9BT/PtLpBQccVOt1tTO2iaDPVNOZux0xbCbLSTlhTzo2OurXzUJ1uWTX9hqoZnpVootDMqdfv0PfOqbGVnpuvVbYoZ2viY9eJ76kATDgVVldR/0OHzO090Ag63k87a5ad6VDl0Xvr0JEb9QNyuHemQ/isdAhipEuHuSFvqkJr+TKyqrazN4iQ8z7bfhf03L+rrWyMT39cXwSLK/z3t5vvJbydZk+LVlZxXbV2vPPyWo+lS8+P6vF0AS7hrg2YbN3RQgT9UqW7iDJsR5mbwHmV0DygeyE
OD13sI2OghgW01jzbXuqX6zuYFCymZbltpa859lst+qf5zGwLnvpCjn3bz9qo0ruB2S+I31KK0EFa5NeVCOyQoZYVx2uqONF5KY7xMQkyv0BoqjgQ6QNh01W7Vhz79fn6U64P1VmeDU8nOqK5+QZ6hXjzJTn+/D8=7Vpdc5s6EP01fmwHiU8/Nm7a5s5NJrfONM1TRwHFKAHkEXJs99dXsoUBidrEBsd3ksmMwy5CmD17VquDB/YoXXxlaBpf0ggnA2hFi4H9eQAhAJYn/knPcu0JpCUdE0YiNah0jMlvrJyW8s5IhPPaQE5pwsm07gxpluGQ13yIMTqvD3ugSf2uUzTBhmMcosT03pKIx+opXKv0f8NkEvPNA6szKSoGK0ceo4jOKy77fGCPGKV8fZQuRjiRwSvicnET336f3k3PfesxeAK31xN+82E92ZeXXLJ5BIYzvvfUd/dX83/8X/8Nb6eP367AxdgbAnWJ9YySmYqXela+LALI6CyLsJzEGthn85hwPJ6iUJ6di5QRvgeSJCOaUCZcGc2wdNGMf0EpSWTW3JBUZAC0rvBcfH6nKcrEkAjl8WpaIIyYp4k6lJeqNAKBsFs+vArSM2YcLyrQq2B8xTTFnC3FEHXWUVeovHZUKOZlkrjD4doXVxIEFpmDVGJONjOXwRcHKv4vwAJ0gsXuwG+JdQ0TaVwjzjHLVh5oSW/OGX3CBdwDaAfw3vY8LQ2EP3JxEDk9AujWAfQbAPSbAAR9AQgNANks4wKDLTiCXnA0QLoPXMe1TJAeghCHYY8gebAG0qa6VlACRYGtouT0BZJtgPTj8vINAwTtOo2A99oIOSdfB2HrOqiWw77As/06eIGJ3QanKnZg2Bd4rgGeaLDEQ79dgtlDjWCvXgI9A6PJDOcCI+tJ5Ljs/w9qAUVQz1Z/B4N4LEiamj9oNUAC+4LENyAR+yGOSIbZ4Wh43mhkWSeLhqMTpGkFakLD7guN4B2NkhtN5eqoaAx39wM4iz5JrUBYYYLynMiCjheE/5S4fHSVdadiKY8/LxRkK2OpjMMwEVCw5c9iXmncre7vu4Vd3nVlFbd9GZg5nbEQ7+5xOWITzHd3WjiqSShmalS3VA3IFz6GE8TJc114aUoHdYdrSmRnsOlELfDRrVcC6NYnWT+5uq6qdWhT6X2RsWlfh8aYSCQRWlaGTeWAfMtX1skCbS3d1zOWyb+J6gE6QQvRppkQf0nPXdl5fCJ1zAe3JR/8k+KDZw21dVGboi0bHE/vQLWJOmKDoykywHePwIYWstk7G/ZhQ3BabAB6l/h/Y0Ngbf9ebnPj1Xo8OAbbTI3zmkYG4USvyY1NePFqwJRHUEImUlsJRYKLFts+k+0qCVHySZ1ISRTJuRv763oHfppddQuhpok7ve03gSmDtquaWux2RNpokFXtBJXKWdbR5tq5b53uuGrCllXTPamqCQO9pzZyqnVPbRvtud+uq+6s+JjK8NmMJJxkwtmk4r+Xoe2vzBpkLwCOWoZMudgsQ4JJY2VSxmM6oRlKzkuvFvlyzL+UTlWAHzHnSxVhNOO0Hv6OSwXwWtYKr2WtaF0EDgPD1IUvsoRk+MMPInhAH3IDm7ej4uuvyTZy16up+MDUjNst4fsuqAdqY3tsm15pCQdteXmcNRzob2h1MavtCq7/5MV4C9jRzgcUmaipb/3uTEzR/p0OndDBPjE21JOrMzZA7zhs0HSAl7JBmOVPDtfDyx9u2ud/AA==7Vpdc+I2FP01PLJjSf7iMSGEbadps8smDX1TbAW0ayzGFgH66ythGduSlziAgc5mmGGsa/ka33PP1dE1HdSfrYYJnk/vWEiiDrTCVQfddCDs2a74loZ1ZnAtOzNMEhpmJlAYRvRfooyWsi5oSNLKRM5YxOm8agxYHJOAV2w4SdiyOu2FRdW7zvGEGIZRgCPT+jcN+TSz+o5V2D8TOpnmdwaWOjPD+WRlSKc4ZMuSCQ06qJ8wxrOj2apPIhm7PC6
45z6MfyNfxwysx8G0+3R/1e1mzm7fc8n2ERIS8+O6hpnrVxwtVLzUs/J1HsCELeKQSCdWB10vp5ST0RwH8uxSZIywvdAo6rOIJcIUs5hIE4v5LZ7RSGbNNzoTGQCtP8lSfH9lMxyLKSFOpxu3QAymfBapQ3mpSiPgi3HDh1dBeiUJJ6sS9CoYQ8JmhCdrMUWdtdUVKq9thfKySBKn18ts01KCwDxzsErMydZzEXxxoOL/DizQUbB4O/A7Yl3BRA7uMeckiTcWaElryhP2g+RwdyDy4TNyXS0NhD10iB/aLQLoVAH0agD06gAEbQFoGwAmi5gLDHbgCFrB0QDp2XdsxzJBevEDEgQtguTCCkjb6lpCCeQFtoyS3RZIjgHS493dLwwQRFUaAffcCLkXXwdh4zqolsO2wENeFTzfxG6LUxk70GsLPM8ATwgs8dC/LsFQTyPY2Uugb2A0WZBUYGT9EDku5f9BElAE9XrzORjEU0FSJ/6gVQMJbAuSngGJ2A9xTGOSHI6G6/b7lnWxaNg6QepWoDo0UFto5ArxA44yEc4HBzDgMFAgcXgluwViFEQ4Taks6WRF+ZME5pOjRmMVTHl8s1KYbQZrNTgMFIFFsn7K/crBeHN/z8nHxV03o/y270MzZYskIDvmKZXLcTIhu/wprUXCShPFzI3ypqoG+tyWkAhz+lptvdTlg7rDPaNSG2y1qAU+OdVaAJ2qk+zJ1XXlbofmSldGxrY9C43hSCQRXpemzeWEdMdP1tkCkZbvmcci+7dRPYAQDdo29YT4SXq+lZ2nJ9KR+eA15EPvovjgWj1tZdRcNGWD7eoaVHN0JDbYWk8GeM4J2NCgcfbBhn3YkOugS6ED0IXi/40OvrX7dzn10qvxfHAKupltznsWGowTapMb+/D87YDZIcERncj2SiAyXIhsdC0FKw1wdKVOzGgYSt+1CruqwS9TVzfo1dRxp7UtJzA7oc3Kpha7NyJtKGRVPEGpdBaFtL547luoj1w27YZl07uoqgl9XVQbOdVYVCNDn3vNZPXRio/ZHL5e0Ih3aSysdZ38jzq0+7VZTesLgFPWIWg2W9qVbwfu8PcQf2eqQ85F1SGAYLV06DvyplVIf3MP3Xa29gBqP7iq3g5WV+7t8+Cv73j45dtoMRx+/wetbmh3XzKUMrZahN54lWUfKcULMhb8G1fod5IlGpnU+LyeXw+uBt5D9Dhyfv/yx+NDl3cva2Pju9VE09/8NF6f81q61sbHX513RbXuDxDWPGEBSdOfrc+l/D3XUn2KxRc6bj1C5eUXOWam7dFcF8Piz2kZxMU//NDgPw==7Z3Rcps4FIafJpdhkAQCLpM0bWfb7qZNO9vuTQeDbNgAYjC24z59hS0MWAQ7MUZsVrnImAMWRv/5dMSRBBfoJn58l7lp8In6JLqAuv94gd5cQGgAwP4XhvXWYOvm1jDLQn9rApXhPvxFuFHn1kXok3njwJzSKA/TptGjSUK8vGFzs4yumodNadQ8a+rOiGC499xItP4d+nnAr8LUK/t7Es6C8sxA53titzyYG+aB69NVzYRuL9BNRmm+/RQ/3pCoqLuyXuIr78fnN95tguKrf4P53XQB08ttYW+f85XdJWQkyV9cdPrwKZtOYvNj+mX94evbL+Yf8c9LiLZlL91owSssI3O6yDzCrBmZEnbOzefJuqgCN/En9JHXR74uK3m+CuPITdjW9Tx3s5y7AasmdB3QLPxFk9yNmKUweEEY+R/dNV0UF5NnhJQbtWO/MjPbC5iR/aDwlzvZnGvz/eKAMCHZ13VKeAnFYXSR+MTnX9qJVXxjFrnzOf88fyC5F/CNKSuK/1Zgs+0jK5oLsiRZTh5rbsYr/h2hMcmzosL4XtvmPsQhQpbGKVpVPolLZIKaP1q6wVngHMx2hVdasw9c7udIb3RIv6fvKghzcp+6XrG9Yu1EoVUeR6VCPVT9NIyiGxrRbHNGNJ1Ooedt/CmjD6S2x8cTbOJWsbp
d/Gi1jKZYhu5oumXquz9Rul2jUZeuLKZ/5UxBuYTkK5o9MCOrjDBfj0BBAnyTWG0KOthC7lkVBE5TQggYcNJlw4JsgkzEZ8GLb5JoQle3leF6Y2A76i1qu4ht8mwa5qsiojJLQqvG+m0YlU0zSfzyCK+QPfS2Rn4IeL5mvEE53AyxXzIjnSVy/Yoq6vSBjERuHi6b8b9/NS1BTYZNVoTK6Vzh18AP61ADsumzFX2n0GeNij4khkCl5q6JPKgmwn2ryb96R0N2KVVDoINmS3DpOEjXHGfXm8LNIreXyEvZ85HdzzrBbcQQXPWcwmRK/+cNN9rr+pa3vtJabSRGWcX5Mzi3B+Ic4D3OmZs4MkEXoz2TJN2dTiFeKgWwpWHZlDuK8qd9+CDlpS7DU+7YMiEvr7vmNkvCPLhIgoyGdUa67RttrNtwgvB5E1l792G2IR11AwiaKdSfgTocCHUkdNxtUybqUHCbNFgzadxI4f4U7mMI7YY4wKR4fwbvhjTei6Qdksa7ODhF84BkkmH33XmwK2IwjiFreaVzrBJuHVnRwxzLS7hBYNaHL/eGk8/NsZhwC5MoTIphkmWY5SEdxWCJzKi9f0Nu4hHQrtJuJ9EuLe1mO8DRbGm0i1k3JlviuxFVxD9NPDaQdOKxiu+nJNrxUPEdWHvEW8iy68TDQYnHXQNq8XYuqXTWx5RuN8AIWFfRvcOVD7M+WHTfZx1b2NasinVgDgu7GN6TbWAfC+ljyr4hJH/SE1YDa6eQbg02sCaSbiENyYrqljiwlnsMtXyTgFOwi7CPIaxbamjtJNihNNgBNDUMq7A+LOxQcBsmdK7C+hOkO+aQYR3pywB9T9589q1Z9jNZXv1pR5di89y1aGu3XkpQ8rUs26q7C295RH/oZyUXMPdSbrYtOIOBW5wBA+dM3iA2+//1dVydTv/SZVwI6Bp0qj/bkEzxK10O1CVePVp3+XI9WLce1/tqhGOH00ykQ+ZGu1htnu0WvOu6G7Ga5qPIqL8oy9YP6cJ6I6AZNdIdSzLpr3Tp0SCkO9JIh1Am6WLmZkmjRTyGmD4i0pFhNVGXTDoQu+YK9WNRLx81IoF1IDWqA7EL782Y0qmCvS6TaTqaXu/BS4b9tS7oPwn2ljky7ZUnrQuPTBtoJpQFe8uDAyYR9YoBc3l9+QMTW/vhWebq0nYpVMf8BICH6pkLk9yQga36lNaB+RV75psHfxTgSiS4j5T5eTBnLfC4bseR6qQ/eb95EHskrZOOgNOY/WKcbUlK54U3uI8Lw1juy8fEvA21OvKO7GQ7gor5lzOPZIV66GCo2VBSqEfi4sUq1CvsW3r02Gh5sOXApItLBxXpR5Pe+0P2jibdRMx3Bli50nndLTfl8jiXclNeZNSlI6wSaycgPFRiTURYx3Y9WKNhEW55IOdDmKpI3Yq5Ae1GB112Lr3lgV0K+aORl5aKAyY2NatCHg4ctsVcnE+m7iLKFfat2JtAlzVe/v6bHX5wvwd3i3u69n7E/3z7633LTFZBrya4B9Trbf5oo+9VbNy5eU6yZGOButGqWU9TTA29KZpVvj2g/rIA2CKTeS6ZjlhZ0OTmgEx9jDH3Vdl7aUbLaanslrpG56prMVH15HRedo35nv83KpD7cot7u1E4K3zZY9VJmP26qLHiaVpXfEcc+v4mrrYp2UTySeYaDEl57wbYn73fpq41pLpHPA7rFJJeFGr6mhlvjYykrhec6PPFJF+nRByrU1AdEhobh4UG5pBKH/HwimdRdSDz0lM92vtxXjYwYtoljdykuAOhG2LSlGbic9cUMAeFto8ABvWjNNusXkm2vc+q3uuGbn8D \ No newline at end of file diff --git a/docs/hypervisors.md b/docs/hypervisors.md index 02dd49aa12..e380450b20 100644 --- a/docs/hypervisors.md +++ 
b/docs/hypervisors.md @@ -33,6 +33,7 @@ are available, their default values and how each setting can be used. [Cloud Hypervisor] | rust | `aarch64`, `x86_64` | Type 2 ([KVM]) | `configuration-clh.toml` | [Firecracker] | rust | `aarch64`, `x86_64` | Type 2 ([KVM]) | `configuration-fc.toml` | [QEMU] | C | all | Type 2 ([KVM]) | `configuration-qemu.toml` | +[`Dragonball`] | rust | `aarch64`, `x86_64` | Type 2 ([KVM]) | `configuration-dragonball.toml` | ## Determine currently configured hypervisor @@ -52,6 +53,7 @@ the hypervisors: [Cloud Hypervisor] | Low latency, small memory footprint, small attack surface | Minimal | | excellent | excellent | High performance modern cloud workloads | | [Firecracker] | Very slimline | Extremely minimal | Doesn't support all device types | excellent | excellent | Serverless / FaaS | | [QEMU] | Lots of features | Lots | | good | good | Good option for most users | | All users | +[`Dragonball`] | Built-in VMM, low CPU and memory overhead| Minimal | | excellent | excellent | Optimized for most container workloads | `out-of-the-box` Kata Containers experience | For further details, see the [Virtualization in Kata Containers](design/virtualization.md) document and the official documentation for each hypervisor. @@ -60,3 +62,4 @@ For further details, see the [Virtualization in Kata Containers](design/virtuali [Firecracker]: https://github.com/firecracker-microvm/firecracker [KVM]: https://en.wikipedia.org/wiki/Kernel-based_Virtual_Machine [QEMU]: http://www.qemu-project.org +[`Dragonball`]: https://github.com/openanolis/dragonball-sandbox diff --git a/docs/install/README.md b/docs/install/README.md index 9ad55f0f21..0ed42d87f5 100644 --- a/docs/install/README.md +++ b/docs/install/README.md @@ -79,3 +79,6 @@ versions. This is not recommended for normal users. 
* [upgrading document](../Upgrading.md) * [developer guide](../Developer-Guide.md) * [runtime documentation](../../src/runtime/README.md) + +## Kata Containers 3.0 rust runtime installation +* [installation guide](../install/kata-containers-3.0-rust-runtime-installation-guide.md) diff --git a/docs/install/kata-containers-3.0-rust-runtime-installation-guide.md b/docs/install/kata-containers-3.0-rust-runtime-installation-guide.md new file mode 100644 index 0000000000..122e43b512 --- /dev/null +++ b/docs/install/kata-containers-3.0-rust-runtime-installation-guide.md @@ -0,0 +1,101 @@ +# Kata Containers 3.0 rust runtime installation +The following is an overview of the different installation methods available. + +## Prerequisites + +Kata Containers 3.0 rust runtime requires nested virtualization or bare metal. Check +[hardware requirements](/src/runtime/README.md#hardware-requirements) to see if your system is capable of running Kata +Containers. + +### Platform support + +Kata Containers 3.0 rust runtime currently runs on 64-bit systems supporting the following +architectures: + +> **Notes:** +> For other architectures, see https://github.com/kata-containers/kata-containers/issues/4320 + +| Architecture | Virtualization technology | +|-|-| +| `x86_64`| [Intel](https://www.intel.com) VT-x | +| `aarch64` ("`arm64`")| [ARM](https://www.arm.com) Hyp | + +## Packaged installation methods + +| Installation method | Description | Automatic updates | Use case | Availability +|------------------------------------------------------|----------------------------------------------------------------------------------------------|-------------------|-----------------------------------------------------------------------------------------------|----------- | +| [Using kata-deploy](#kata-deploy-installation) | The preferred way to deploy the Kata Containers distributed binaries on a Kubernetes cluster | **No!** | Best way to give it a try on kata-containers on an already up and 
running Kubernetes cluster. | No | +| [Using official distro packages](#official-packages) | Kata packages provided by Linux distributions official repositories | yes | Recommended for most users. | No | +| [Using snap](#snap-installation) | Easy to install | yes | Good alternative to official distro packages. | No | +| [Automatic](#automatic-installation) | Run a single command to install a full system | **No!** | For those wanting the latest release quickly. | No | +| [Manual](#manual-installation) | Follow a guide step-by-step to install a working system | **No!** | For those who want the latest release with more control. | No | +| [Build from source](#build-from-source-installation) | Build the software components manually | **No!** | Power users and developers only. | Yes | + +### Kata Deploy Installation +`ToDo` +### Official packages +`ToDo` +### Snap Installation +`ToDo` +### Automatic Installation +`ToDo` +### Manual Installation +`ToDo` + +## Build from source installation + +### Rust Environment Set Up + +* Download `Rustup` and install `Rust` + > **Notes:** + > Rust version 1.58 is needed + + Example for `x86_64` + ``` + $ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh + $ source $HOME/.cargo/env + $ rustup install 1.58 + $ rustup default 1.58-x86_64-unknown-linux-gnu + ``` + +* Musl support for fully static binary + + Example for `x86_64` + ``` + $ rustup target add x86_64-unknown-linux-musl + ``` +* [Musl `libc`](http://musl.libc.org/) install + + Example for musl 1.2.3 + ``` + $ wget https://git.musl-libc.org/cgit/musl/snapshot/musl-1.2.3.tar.gz + $ tar vxf musl-1.2.3.tar.gz + $ cd musl-1.2.3/ + $ ./configure --prefix=/usr/local/ + $ make && sudo make install + ``` + + +### Install Kata 3.0 Rust Runtime Shim + +``` +$ git clone https://github.com/kata-containers/kata-containers.git +$ cd kata-containers/src/runtime-rs +$ make && make install +``` +After running the command above, the default config file `configuration.toml` will be 
installed under `/usr/share/defaults/kata-containers/`, the binary file `containerd-shim-kata-v2` will be installed under `/usr/local/bin`. + +### Build Kata Containers Kernel +Follow the [Kernel installation guide](/tools/packaging/kernel/README.md). + +### Build Kata Rootfs +Follow the [Rootfs installation guide](../../tools/osbuilder/rootfs-builder/README.md). + +### Build Kata Image +Follow the [Image installation guide](../../tools/osbuilder/image-builder/README.md). + +### Install Containerd + +Follow the [Containerd installation guide](container-manager/containerd/containerd-install.md). + + diff --git a/src/agent/Cargo.lock b/src/agent/Cargo.lock index abfca3a780..e26f7bd032 100644 --- a/src/agent/Cargo.lock +++ b/src/agent/Cargo.lock @@ -98,6 +98,12 @@ version = "3.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37ccbd214614c6783386c1af30caf03192f17891059cecc394b4fb119e363de3" +[[package]] +name = "byte-unit" +version = "3.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "415301c9de11005d4b92193c0eb7ac7adc37e5a49e0ac9bed0a42343512744b8" + [[package]] name = "byteorder" version = "1.4.3" @@ -224,6 +230,12 @@ dependencies = [ "os_str_bytes", ] +[[package]] +name = "common-path" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2382f75942f4b3be3690fe4f86365e9c853c1587d6ee58212cebf6e2a9ccd101" + [[package]] name = "core-foundation-sys" version = "0.8.3" @@ -322,6 +334,17 @@ dependencies = [ "libc", ] +[[package]] +name = "fail" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec3245a0ca564e7f3c797d20d833a6870f57a728ac967d5225b3ffdef4465011" +dependencies = [ + "lazy_static", + "log", + "rand 0.8.5", +] + [[package]] name = "fastrand" version = "1.7.0" @@ -442,6 +465,17 @@ dependencies = [ "slab", ] +[[package]] +name = "getrandom" +version = "0.1.16" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + [[package]] name = "getrandom" version = "0.2.7" @@ -453,6 +487,12 @@ dependencies = [ "wasi 0.11.0+wasi-snapshot-preview1", ] +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + [[package]] name = "hashbrown" version = "0.12.1" @@ -584,13 +624,14 @@ dependencies = [ "clap", "futures", "ipnetwork", + "kata-sys-util", "lazy_static", "libc", "log", "logging", "netlink-packet-utils 0.4.1", "netlink-sys 0.7.0", - "nix 0.23.1", + "nix 0.24.2", "oci", "opentelemetry", "procfs", @@ -621,6 +662,47 @@ dependencies = [ "vsock-exporter", ] +[[package]] +name = "kata-sys-util" +version = "0.1.0" +dependencies = [ + "byteorder", + "cgroups-rs", + "chrono", + "common-path", + "fail", + "kata-types", + "lazy_static", + "libc", + "nix 0.24.2", + "oci", + "once_cell", + "rand 0.7.3", + "serde_json", + "slog", + "slog-scope", + "subprocess", + "thiserror", +] + +[[package]] +name = "kata-types" +version = "0.1.0" +dependencies = [ + "byte-unit", + "glob", + "lazy_static", + "num_cpus", + "oci", + "regex", + "serde", + "serde_json", + "slog", + "slog-scope", + "thiserror", + "toml", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -857,6 +939,7 @@ dependencies = [ "bitflags", "cfg-if 1.0.0", "libc", + "memoffset", ] [[package]] @@ -935,7 +1018,7 @@ dependencies = [ "lazy_static", "percent-encoding", "pin-project", - "rand", + "rand 0.8.5", "serde", "thiserror", "tokio", @@ -1199,9 +1282,9 @@ dependencies = [ [[package]] name = "protobuf" -version = "2.14.0" +version = "2.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8e86d370532557ae7573551a1ec8235a0f8d6cb276c7c9e6aa490b511c447485" +checksum = "cf7e6d18738ecd0902d30d1ad232c9125985a3422929b16c65517b38adc14f96" dependencies = [ "serde", "serde_derive", @@ -1209,18 +1292,18 @@ dependencies = [ [[package]] name = "protobuf-codegen" -version = "2.14.0" +version = "2.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de113bba758ccf2c1ef816b127c958001b7831136c9bc3f8e9ec695ac4e82b0c" +checksum = "aec1632b7c8f2e620343439a7dfd1f3c47b18906c4be58982079911482b5d707" dependencies = [ "protobuf", ] [[package]] name = "protobuf-codegen-pure" -version = "2.14.0" +version = "2.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d1a4febc73bf0cada1d77c459a0c8e5973179f1cfd5b0f1ab789d45b17b6440" +checksum = "9f8122fdb18e55190c796b088a16bdb70cd7acdcd48f7a8b796b58c62e532cc6" dependencies = [ "protobuf", "protobuf-codegen", @@ -1231,6 +1314,7 @@ name = "protocols" version = "0.1.0" dependencies = [ "async-trait", + "oci", "protobuf", "ttrpc", "ttrpc-codegen", @@ -1245,6 +1329,19 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc", +] + [[package]] name = "rand" version = "0.8.5" @@ -1252,8 +1349,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha", - "rand_core", + "rand_chacha 0.3.1", + "rand_core 0.6.3", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", ] [[package]] 
@@ -1263,7 +1370,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.6.3", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", ] [[package]] @@ -1272,7 +1388,16 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" dependencies = [ - "getrandom", + "getrandom 0.2.7", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", ] [[package]] @@ -1579,6 +1704,16 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "subprocess" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c2e86926081dda636c546d8c5e641661049d7562a68f5488be4a1f7f66f6086" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "syn" version = "1.0.98" @@ -1846,9 +1981,9 @@ dependencies = [ [[package]] name = "ttrpc" -version = "0.5.3" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c46d73bc2a74f2440921b6539afbed68064b48b2c4f194c637430d1c83d052ad" +checksum = "2ecfff459a859c6ba6668ff72b34c2f1d94d9d58f7088414c2674ad0f31cc7d8" dependencies = [ "async-trait", "byteorder", @@ -1947,6 +2082,12 @@ dependencies = [ "tokio-vsock", ] +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + [[package]] name = "wasi" version = "0.10.0+wasi-snapshot-preview1" diff --git a/src/agent/Cargo.toml b/src/agent/Cargo.toml index a25120b4ff..166d080e9b 100644 --- a/src/agent/Cargo.toml +++ b/src/agent/Cargo.toml @@ -7,12 +7,12 @@ edition = "2018" [dependencies] oci = { path = "../libs/oci" } rustjail = { path = "rustjail" } -protocols = { path = "../libs/protocols" } +protocols = { path = "../libs/protocols", features = ["async"] } lazy_static = "1.3.0" -ttrpc = { version = "0.5.0", features = ["async", "protobuf-codec"], default-features = false } -protobuf = "=2.14.0" +ttrpc = { version = "0.6.0", features = ["async"], default-features = false } +protobuf = "2.27.0" libc = "0.2.58" -nix = "0.23.0" +nix = "0.24.1" capctl = "0.2.0" serde_json = "1.0.39" scan_fmt = "0.2.3" @@ -20,6 +20,7 @@ scopeguard = "1.0.0" thiserror = "1.0.26" regex = "1.5.5" serial_test = "0.5.1" +kata-sys-util = { path = "../libs/kata-sys-util" } sysinfo = "0.23.0" # Async helpers diff --git a/src/agent/Makefile b/src/agent/Makefile index 533411bee6..f0e86fd6bc 100644 --- a/src/agent/Makefile +++ b/src/agent/Makefile @@ -107,10 +107,7 @@ endef ##TARGET default: build code default: $(TARGET) show-header -$(TARGET): $(GENERATED_CODE) logging-crate-tests $(TARGET_PATH) - -logging-crate-tests: - make -C $(CWD)/../libs/logging +$(TARGET): $(GENERATED_CODE) $(TARGET_PATH) $(TARGET_PATH): show-summary @RUSTFLAGS="$(EXTRA_RUSTFLAGS) --deny warnings" cargo build --target $(TRIPLE) $(if $(findstring release,$(BUILD_TYPE)),--release) $(EXTRA_RUSTFEATURES) @@ -203,7 +200,6 @@ codecov-html: check_tarpaulin .PHONY: \ help \ - logging-crate-tests \ optimize \ show-header \ show-summary \ diff --git a/src/agent/rustjail/Cargo.toml b/src/agent/rustjail/Cargo.toml index 78c0f962eb..375591c9f7 100644 --- a/src/agent/rustjail/Cargo.toml +++ b/src/agent/rustjail/Cargo.toml @@ 
-16,7 +16,7 @@ scopeguard = "1.0.0" capctl = "0.2.0" lazy_static = "1.3.0" libc = "0.2.58" -protobuf = "=2.14.0" +protobuf = "2.27.0" slog = "2.5.2" slog-scope = "4.1.2" scan_fmt = "0.2.6" @@ -27,7 +27,7 @@ cgroups = { package = "cgroups-rs", version = "0.2.8" } rlimit = "0.5.3" cfg-if = "0.1.0" -tokio = { version = "1.2.0", features = ["sync", "io-util", "process", "time", "macros"] } +tokio = { version = "1.2.0", features = ["sync", "io-util", "process", "time", "macros", "rt"] } futures = "0.3.17" async-trait = "0.1.31" inotify = "0.9.2" diff --git a/src/agent/src/console.rs b/src/agent/src/console.rs index c705af1b71..8f1ae5ff32 100644 --- a/src/agent/src/console.rs +++ b/src/agent/src/console.rs @@ -9,7 +9,7 @@ use anyhow::{anyhow, Result}; use nix::fcntl::{self, FcntlArg, FdFlag, OFlag}; use nix::libc::{STDERR_FILENO, STDIN_FILENO, STDOUT_FILENO}; use nix::pty::{openpty, OpenptyResult}; -use nix::sys::socket::{self, AddressFamily, SockAddr, SockFlag, SockType}; +use nix::sys::socket::{self, AddressFamily, SockFlag, SockType, VsockAddr}; use nix::sys::stat::Mode; use nix::sys::wait; use nix::unistd::{self, close, dup2, fork, setsid, ForkResult, Pid}; @@ -67,7 +67,7 @@ pub async fn debug_console_handler( SockFlag::SOCK_CLOEXEC, None, )?; - let addr = SockAddr::new_vsock(libc::VMADDR_CID_ANY, port); + let addr = VsockAddr::new(libc::VMADDR_CID_ANY, port); socket::bind(listenfd, &addr)?; socket::listen(listenfd, 1)?; diff --git a/src/agent/src/main.rs b/src/agent/src/main.rs index eaf208601b..1a1bce4364 100644 --- a/src/agent/src/main.rs +++ b/src/agent/src/main.rs @@ -22,7 +22,7 @@ extern crate slog; use anyhow::{anyhow, Context, Result}; use clap::{AppSettings, Parser}; use nix::fcntl::OFlag; -use nix::sys::socket::{self, AddressFamily, SockAddr, SockFlag, SockType}; +use nix::sys::socket::{self, AddressFamily, SockFlag, SockType, VsockAddr}; use nix::unistd::{self, dup, Pid}; use std::env; use std::ffi::OsStr; @@ -128,7 +128,7 @@ async fn 
create_logger_task(rfd: RawFd, vsock_port: u32, shutdown: Receiver>, } -// A container ID must match this regex: -// -// ^[a-zA-Z0-9][a-zA-Z0-9_.-]+$ -// -fn verify_cid(id: &str) -> Result<()> { - let mut chars = id.chars(); - - let valid = match chars.next() { - Some(first) - if first.is_alphanumeric() - && id.len() > 1 - && chars.all(|c| c.is_alphanumeric() || ['.', '-', '_'].contains(&c)) => - { - true - } - _ => false, - }; - - match valid { - true => Ok(()), - false => Err(anyhow!("invalid container ID: {:?}", id)), - } -} - impl AgentService { #[instrument] async fn do_create_container( @@ -165,7 +142,7 @@ impl AgentService { ) -> Result<()> { let cid = req.container_id.clone(); - verify_cid(&cid)?; + kata_sys_util::validate::verify_id(&cid)?; let mut oci_spec = req.OCI.clone(); let use_sandbox_pidns = req.get_sandbox_pidns(); @@ -650,7 +627,7 @@ impl AgentService { } #[async_trait] -impl protocols::agent_ttrpc::AgentService for AgentService { +impl agent_ttrpc::AgentService for AgentService { async fn create_container( &self, ctx: &TtrpcContext, @@ -1536,7 +1513,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService { struct HealthService; #[async_trait] -impl protocols::health_ttrpc::Health for HealthService { +impl health_ttrpc::Health for HealthService { async fn check( &self, _ctx: &TtrpcContext, @@ -1675,18 +1652,17 @@ async fn read_stream(reader: Arc>>, l: usize) -> Resu } pub fn start(s: Arc>, server_address: &str) -> Result { - let agent_service = Box::new(AgentService { sandbox: s }) - as Box; + let agent_service = + Box::new(AgentService { sandbox: s }) as Box; let agent_worker = Arc::new(agent_service); - let health_service = - Box::new(HealthService {}) as Box; + let health_service = Box::new(HealthService {}) as Box; let health_worker = Arc::new(health_service); - let aservice = protocols::agent_ttrpc::create_agent_service(agent_worker); + let aservice = agent_ttrpc::create_agent_service(agent_worker); - let hservice = 
protocols::health_ttrpc::create_health(health_worker); + let hservice = health_ttrpc::create_health(health_worker); let server = TtrpcServer::new() .bind(server_address)? @@ -2012,7 +1988,7 @@ fn load_kernel_module(module: &protocols::agent::KernelModule) -> Result<()> { mod tests { use super::*; use crate::{ - assert_result, namespace::Namespace, protocols::agent_ttrpc::AgentService as _, + assert_result, namespace::Namespace, protocols::agent_ttrpc_async::AgentService as _, skip_if_not_root, }; use nix::mount; @@ -2672,233 +2648,6 @@ OtherField:other } } - #[tokio::test] - async fn test_verify_cid() { - #[derive(Debug)] - struct TestData<'a> { - id: &'a str, - expect_error: bool, - } - - let tests = &[ - TestData { - // Cannot be blank - id: "", - expect_error: true, - }, - TestData { - // Cannot be a space - id: " ", - expect_error: true, - }, - TestData { - // Must start with an alphanumeric - id: ".", - expect_error: true, - }, - TestData { - // Must start with an alphanumeric - id: "-", - expect_error: true, - }, - TestData { - // Must start with an alphanumeric - id: "_", - expect_error: true, - }, - TestData { - // Must start with an alphanumeric - id: " a", - expect_error: true, - }, - TestData { - // Must start with an alphanumeric - id: ".a", - expect_error: true, - }, - TestData { - // Must start with an alphanumeric - id: "-a", - expect_error: true, - }, - TestData { - // Must start with an alphanumeric - id: "_a", - expect_error: true, - }, - TestData { - // Must start with an alphanumeric - id: "..", - expect_error: true, - }, - TestData { - // Too short - id: "a", - expect_error: true, - }, - TestData { - // Too short - id: "z", - expect_error: true, - }, - TestData { - // Too short - id: "A", - expect_error: true, - }, - TestData { - // Too short - id: "Z", - expect_error: true, - }, - TestData { - // Too short - id: "0", - expect_error: true, - }, - TestData { - // Too short - id: "9", - expect_error: true, - }, - TestData { - // Must start with 
an alphanumeric - id: "-1", - expect_error: true, - }, - TestData { - id: "/", - expect_error: true, - }, - TestData { - id: "a/", - expect_error: true, - }, - TestData { - id: "a/../", - expect_error: true, - }, - TestData { - id: "../a", - expect_error: true, - }, - TestData { - id: "../../a", - expect_error: true, - }, - TestData { - id: "../../../a", - expect_error: true, - }, - TestData { - id: "foo/../bar", - expect_error: true, - }, - TestData { - id: "foo bar", - expect_error: true, - }, - TestData { - id: "a.", - expect_error: false, - }, - TestData { - id: "a..", - expect_error: false, - }, - TestData { - id: "aa", - expect_error: false, - }, - TestData { - id: "aa.", - expect_error: false, - }, - TestData { - id: "hello..world", - expect_error: false, - }, - TestData { - id: "hello/../world", - expect_error: true, - }, - TestData { - id: "aa1245124sadfasdfgasdga.", - expect_error: false, - }, - TestData { - id: "aAzZ0123456789_.-", - expect_error: false, - }, - TestData { - id: "abcdefghijklmnopqrstuvwxyz0123456789.-_", - expect_error: false, - }, - TestData { - id: "0123456789abcdefghijklmnopqrstuvwxyz.-_", - expect_error: false, - }, - TestData { - id: " abcdefghijklmnopqrstuvwxyz0123456789.-_", - expect_error: true, - }, - TestData { - id: ".abcdefghijklmnopqrstuvwxyz0123456789.-_", - expect_error: true, - }, - TestData { - id: "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.-_", - expect_error: false, - }, - TestData { - id: "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ.-_", - expect_error: false, - }, - TestData { - id: " ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.-_", - expect_error: true, - }, - TestData { - id: ".ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.-_", - expect_error: true, - }, - TestData { - id: "/a/b/c", - expect_error: true, - }, - TestData { - id: "a/b/c", - expect_error: true, - }, - TestData { - id: "foo/../../../etc/passwd", - expect_error: true, - }, - TestData { - id: "../../../../../../etc/motd", - expect_error: true, - }, - TestData { - id: "/etc/passwd", 
- expect_error: true, - }, - ]; - - for (i, d) in tests.iter().enumerate() { - let msg = format!("test[{}]: {:?}", i, d); - - let result = verify_cid(d.id); - - let msg = format!("{}, result: {:?}", msg, result); - - if result.is_ok() { - assert!(!d.expect_error, "{}", msg); - } else { - assert!(d.expect_error, "{}", msg); - } - } - } - #[tokio::test] async fn test_volume_capacity_stats() { skip_if_not_root!(); diff --git a/src/dragonball/.gitignore b/src/dragonball/.gitignore new file mode 100644 index 0000000000..64f40ab296 --- /dev/null +++ b/src/dragonball/.gitignore @@ -0,0 +1,3 @@ +target +Cargo.lock +.idea diff --git a/src/dragonball/Cargo.toml b/src/dragonball/Cargo.toml new file mode 100644 index 0000000000..0f4aa582f9 --- /dev/null +++ b/src/dragonball/Cargo.toml @@ -0,0 +1,65 @@ +[package] +name = "dragonball" +version = "0.1.0" +authors = ["The Kata Containers community "] +description = "A secure sandbox for Kata Containers" +keywords = ["kata-containers", "sandbox", "vmm", "dragonball"] +homepage = "https://katacontainers.io/" +repository = "https://github.com/kata-containers/kata-containers.git" +license = "Apache-2.0" +edition = "2018" + +[dependencies] +arc-swap = "1.5.0" +bytes = "1.1.0" +dbs-address-space = "0.1.0" +dbs-allocator = "0.1.0" +dbs-arch = "0.1.0" +dbs-boot = "0.2.0" +dbs-device = "0.1.0" +dbs-interrupt = { version = "0.1.0", features = ["kvm-irq"] } +dbs-legacy-devices = "0.1.0" +dbs-upcall = { version = "0.1.0", optional = true } +dbs-utils = "0.1.0" +dbs-virtio-devices = { version = "0.1.0", optional = true, features = ["virtio-mmio"] } +kvm-bindings = "0.5.0" +kvm-ioctls = "0.11.0" +lazy_static = "1.2" +libc = "0.2.39" +linux-loader = "0.4.0" +log = "0.4.14" +nix = "0.23.1" +seccompiler = "0.2.0" +serde = "1.0.27" +serde_derive = "1.0.27" +serde_json = "1.0.9" +slog = "2.5.2" +slog-scope = "4.4.0" +thiserror = "1" +vmm-sys-util = "0.9.0" +virtio-queue = { version = "0.1.0", optional = true } +vm-memory = { version = "0.7.0", 
features = ["backend-mmap"] } + +[dev-dependencies] +slog-term = "2.9.0" +slog-async = "2.7.0" + +[features] +acpi = [] +atomic-guest-memory = [] +hotplug = ["virtio-vsock"] +virtio-vsock = ["dbs-virtio-devices/virtio-vsock", "virtio-queue"] +virtio-blk = ["dbs-virtio-devices/virtio-blk", "virtio-queue"] +virtio-net = ["dbs-virtio-devices/virtio-net", "virtio-queue"] +# virtio-fs only work on atomic-guest-memory +virtio-fs = ["dbs-virtio-devices/virtio-fs", "virtio-queue", "atomic-guest-memory"] + +[patch.'crates-io'] +dbs-device = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" } +dbs-interrupt = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" } +dbs-legacy-devices = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" } +dbs-upcall = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" } +dbs-utils = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" } +dbs-virtio-devices = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" } +dbs-boot = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" } +dbs-arch = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" } diff --git a/src/dragonball/LICENSE b/src/dragonball/LICENSE new file mode 120000 index 0000000000..30cff7403d --- /dev/null +++ b/src/dragonball/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/src/dragonball/Makefile b/src/dragonball/Makefile new file mode 100644 index 0000000000..8acd29de57 --- /dev/null +++ b/src/dragonball/Makefile @@ -0,0 +1,29 @@ +# Copyright (c) 2019-2022 Alibaba Cloud. 
All rights reserved. +# Copyright (c) 2019-2022 Ant Group. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +default: build + +build: + # FIXME: This line will be removed when we solve the vm-memory dependency problem in Dragonball Sandbox + cargo update -p vm-memory:0.8.0 --precise 0.7.0 + cargo build --all-features + +check: clippy format + +clippy: + @echo "INFO: cargo clippy..." + cargo clippy --all-targets --all-features \ + -- \ + -D warnings + +format: + @echo "INFO: cargo fmt..." + cargo fmt -- --check + +clean: + cargo clean + +test: + @echo "INFO: testing dragonball for development build" + cargo test --all-features -- --nocapture diff --git a/src/dragonball/README.md b/src/dragonball/README.md new file mode 100644 index 0000000000..c9d7e5119c --- /dev/null +++ b/src/dragonball/README.md @@ -0,0 +1,40 @@ +# Introduction +`Dragonball Sandbox` is a light-weight virtual machine manager (VMM) based on Linux Kernel-based Virtual Machine (KVM), +which is optimized for container workloads with: +- container image management and acceleration service +- flexible and high-performance virtual device drivers +- low CPU and memory overhead +- minimal startup time +- optimized concurrent startup speed + +`Dragonball Sandbox` aims to provide a simple solution for the Kata Containers community. It is integrated into Kata 3.0 +runtime as a built-in VMM and gives users an out-of-the-box Kata Containers experience without complex environment setup +and configuration process. + +# Getting Started +[TODO](https://github.com/kata-containers/kata-containers/issues/4302) + +# Documentation + +Device: [Device Document](docs/device.md) +vCPU: [vCPU Document](docs/vcpu.md) +API: [API Document](docs/api.md) + +Currently, the documents are still actively adding. +You could see the [official documentation](docs/) page for more details. 
+ +# Supported Architectures +- x86-64 +- aarch64 + +# Supported Kernel +[TODO](https://github.com/kata-containers/kata-containers/issues/4303) + +# Acknowledgement +Part of the code is based on the [Cloud Hypervisor](https://github.com/cloud-hypervisor/cloud-hypervisor) project, [`crosvm`](https://github.com/google/crosvm) project and [Firecracker](https://github.com/firecracker-microvm/firecracker) project. They are all rust written virtual machine managers with advantages on safety and security. + +`Dragonball sandbox` is designed to be a VMM that is customized for Kata Containers and we will focus on optimizing container workloads for Kata ecosystem. The focus on the Kata community is what differentiates us from other rust written virtual machines. + +# License + +`Dragonball` is licensed under [Apache License](http://www.apache.org/licenses/LICENSE-2.0), Version 2.0. \ No newline at end of file diff --git a/src/dragonball/THIRD-PARTY b/src/dragonball/THIRD-PARTY new file mode 100644 index 0000000000..c3069125a3 --- /dev/null +++ b/src/dragonball/THIRD-PARTY @@ -0,0 +1,27 @@ +// Copyright 2017 The Chromium OS Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/src/dragonball/docs/api.md b/src/dragonball/docs/api.md new file mode 100644 index 0000000000..dab49835ae --- /dev/null +++ b/src/dragonball/docs/api.md @@ -0,0 +1,27 @@ +# API + +We provide plenty API for Kata runtime to interact with `Dragonball` virtual machine manager. +This document provides the introduction for each of them. + +## `ConfigureBootSource` +Configure the boot source of the VM using `BootSourceConfig`. This action can only be called before the VM has booted. + +### Boot Source Config +1. `kernel_path`: Path of the kernel image. `Dragonball` only supports compressed kernel image for now. +2. `initrd_path`: Path of the initrd (could be None) +3. `boot_args`: Boot arguments passed to the kernel (could be None) + +## `SetVmConfiguration` +Set virtual machine configuration using `VmConfigInfo` to initialize VM. + +### VM Config Info +1. `vcpu_count`: Number of vCPU to start. Currently we only support up to 255 vCPUs. +2. `max_vcpu_count`: Max number of vCPU can be added through CPU hotplug. +3. `cpu_pm`: CPU power management. +4. 
`cpu_topology`: CPU topology information (including `threads_per_core`, `cores_per_die`, `dies_per_socket` and `sockets`). +5. `vpmu_feature`: `vPMU` feature level. +6. `mem_type`: Memory type that can be either `hugetlbfs` or `shmem`, default is `shmem`. +7. `mem_file_path` : Memory file path. +8. `mem_size_mib`: The memory size in MiB. The maximum memory size is 1TB. +9. `serial_path`: Optional sock path. + diff --git a/src/dragonball/docs/device.md b/src/dragonball/docs/device.md new file mode 100644 index 0000000000..ab2e078e7b --- /dev/null +++ b/src/dragonball/docs/device.md @@ -0,0 +1,20 @@ +# Device + +## Device Manager + +Currently we have following device manager: +| Name | Description | +| --- | --- | +| [address space manager](../src/address_space_manager.rs) | abstracts virtual machine's physical management and provide mapping for guest virtual memory and MMIO ranges of emulated virtual devices, pass-through devices and vCPU | +| [config manager](../src/config_manager.rs) | provides abstractions for configuration information | +| [console manager](../src/device_manager/console_manager.rs) | provides management for all console devices | +| [resource manager](../src/resource_manager.rs) |provides resource management for `legacy_irq_pool`, `msi_irq_pool`, `pio_pool`, `mmio_pool`, `mem_pool`, `kvm_mem_slot_pool` with builder `ResourceManagerBuilder` | +| [VSOCK device manager](../src/device_manager/vsock_dev_mgr.rs) | provides configuration info for `VIRTIO-VSOCK` and management for all VSOCK devices | + + +## Device supported +`VIRTIO-VSOCK` +`i8042` +`COM1` +`COM2` + diff --git a/src/dragonball/docs/vcpu.md b/src/dragonball/docs/vcpu.md new file mode 100644 index 0000000000..e2be8037b6 --- /dev/null +++ b/src/dragonball/docs/vcpu.md @@ -0,0 +1,42 @@ +# vCPU + +## vCPU Manager +The vCPU manager is to manage all vCPU related actions, we will dive into some of the important structure members in this doc. 
+
+For now, aarch64 vCPU support is still under development; we'll introduce it when we merge `runtime-rs` to the master branch. (issue: #4445)
+
+### vCPU config
+`VcpuConfig` is used to configure guest overall CPU info.
+
+`boot_vcpu_count` is used to define the initial vCPU number.
+
+`max_vcpu_count` is used to define the maximum vCPU number and it's used as the upper boundary for the CPU hotplug feature.
+
+`threads_per_core`, `cores_per_die`, `dies_per_socket` and `sockets` are used to define CPU topology.
+
+`vpmu_feature` is used to define `vPMU` feature level.
+If `vPMU` feature is `Disabled`, it means `vPMU` feature is off (by default).
+If `vPMU` feature is `LimitedlyEnabled`, it means minimal `vPMU` counters are supported (cycles and instructions).
+If `vPMU` feature is `FullyEnabled`, it means all `vPMU` counters are supported.
+
+## vCPU State
+
+There are four states for vCPU state machine: `running`, `paused`, `waiting_exit`, `exited`. There is a state machine to maintain the task flow.
+
+When the vCPU is created, it'll turn to `paused` state. After vCPU resource is ready at VMM, it'll send a `Resume` event to the vCPU thread, and then vCPU state will change to `running`.
+
+During the `running` state, VMM will catch vCPU exit and execute different logic according to the exit reason.
+
+If the VMM catches some exit reasons that it cannot handle, the state will change to `waiting_exit` and VMM will stop the virtual machine.
+When the state switches to `waiting_exit`, an exit event will be sent to vCPU `exit_evt`, event manager will detect the change in `exit_evt` and set VMM `exit_evt_flag` as 1. A thread serving for VMM event loop will check `exit_evt_flag` and if the flag is 1, it'll stop the VMM.
+
+When the VMM is stopped / destroyed, the state will change to `exited`.
+
+## vCPU Hot plug
+Since `Dragonball Sandbox` doesn't support virtualization of the ACPI system, we use [`upcall`](https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-upcall) to establish a direct communication channel between `Dragonball` and Guest in order to trigger vCPU hotplug.
+
+To use `upcall`, kernel patches are needed; you can get the patches from the [`upcall`](https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-upcall) page, and we'll provide a ready-to-use guest kernel binary for you to try.
+
+vCPU hot plug / hot unplug range is [1, `max_vcpu_count`]. Operations not in this range will be invalid.
+
+
diff --git a/src/dragonball/src/address_space_manager.rs b/src/dragonball/src/address_space_manager.rs
new file mode 100644
index 0000000000..9992833e0c
--- /dev/null
+++ b/src/dragonball/src/address_space_manager.rs
@@ -0,0 +1,892 @@
+// Copyright (C) 2019-2022 Alibaba Cloud. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Address space abstraction to manage virtual machine's physical address space.
+//!
+//! The AddressSpace abstraction is introduced to manage virtual machine's physical address space.
+//! The regions in virtual machine's physical address space may be used to:
+//! 1) map guest virtual memory
+//! 2) map MMIO ranges for emulated virtual devices, such as virtio-fs DAX window.
+//! 3) map MMIO ranges for pass-through devices, such as PCI device BARs.
+//! 4) map MMIO ranges for vCPU, such as local APIC.
+//! 5) not used/available
+//!
+//! A related abstraction, vm_memory::GuestMemory, is used to access guest virtual memory only.
+//! In other words, AddressSpace is the resource owner, and GuestMemory is an accessor for guest
+//! virtual memory.
+ +use std::collections::{BTreeMap, HashMap}; +use std::fs::File; +use std::os::unix::io::{AsRawFd, FromRawFd}; +use std::sync::atomic::{AtomicBool, AtomicU8, Ordering}; +use std::sync::{Arc, Mutex}; +use std::thread; + +use dbs_address_space::{ + AddressSpace, AddressSpaceError, AddressSpaceLayout, AddressSpaceRegion, + AddressSpaceRegionType, NumaNode, NumaNodeInfo, MPOL_MF_MOVE, MPOL_PREFERRED, +}; +use dbs_allocator::Constraint; +use kvm_bindings::kvm_userspace_memory_region; +use kvm_ioctls::VmFd; +use log::{debug, error, info, warn}; +use nix::sys::mman; +use nix::unistd::dup; +#[cfg(feature = "atomic-guest-memory")] +use vm_memory::atomic::GuestMemoryAtomic; +use vm_memory::{ + Address, FileOffset, GuestAddress, GuestAddressSpace, GuestMemoryMmap, GuestMemoryRegion, + GuestRegionMmap, GuestUsize, MemoryRegionAddress, MmapRegion, +}; + +use crate::resource_manager::ResourceManager; +use crate::vm::NumaRegionInfo; + +#[cfg(not(feature = "atomic-guest-memory"))] +/// Concrete GuestAddressSpace type used by the VMM. +pub type GuestAddressSpaceImpl = Arc; + +#[cfg(feature = "atomic-guest-memory")] +/// Concrete GuestAddressSpace type used by the VMM. +pub type GuestAddressSpaceImpl = GuestMemoryAtomic; + +/// Concrete GuestMemory type used by the VMM. +pub type GuestMemoryImpl = as GuestAddressSpace>::M; +/// Concrete GuestRegion type used by the VMM. +pub type GuestRegionImpl = GuestRegionMmap; + +// Maximum number of working threads for memory pre-allocation. +const MAX_PRE_ALLOC_THREAD: u64 = 16; + +// Control the actual number of pre-allocating threads. After several performance tests, we decide to use one thread to do pre-allocating for every 4G memory. +const PRE_ALLOC_GRANULARITY: u64 = 32; + +// We don't have plan to support mainframe computer and only focus on PC servers. +// 64 as max nodes should be enough for now. +const MAX_NODE: u32 = 64; + +// We will split the memory region if it conflicts with the MMIO hole. 
+// But if the space below the MMIO hole is smaller than the MINIMAL_SPLIT_SPACE, we won't split the memory region in order to enhance performance. +const MINIMAL_SPLIT_SPACE: u64 = 128 << 20; + +/// Errors associated with virtual machine address space management. +#[derive(Debug, thiserror::Error)] +pub enum AddressManagerError { + /// Invalid address space operation. + #[error("invalid address space operation")] + InvalidOperation, + + /// Invalid address range. + #[error("invalid address space region (0x{0:x}, 0x{1:x})")] + InvalidAddressRange(u64, GuestUsize), + + /// No available mem address. + #[error("no available mem address")] + NoAvailableMemAddress, + + /// No available kvm slotse. + #[error("no available kvm slots")] + NoAvailableKvmSlot, + + /// Address manager failed to create memfd to map anonymous memory. + #[error("address manager failed to create memfd to map anonymous memory")] + CreateMemFd(#[source] nix::Error), + + /// Address manager failed to open memory file. + #[error("address manager failed to open memory file")] + OpenFile(#[source] std::io::Error), + + /// Memory file provided is invalid due to empty file path, non-existent file path and other possible mistakes. + #[error("memory file provided to address manager {0} is invalid")] + FileInvalid(String), + + /// Memory file provided is invalid due to empty memory type + #[error("memory type provided to address manager {0} is invalid")] + TypeInvalid(String), + + /// Failed to set size for memory file. + #[error("address manager failed to set size for memory file")] + SetFileSize(#[source] std::io::Error), + + /// Failed to unlink memory file. + #[error("address manager failed to unlink memory file")] + UnlinkFile(#[source] nix::Error), + + /// Failed to duplicate fd of memory file. + #[error("address manager failed to duplicate memory file descriptor")] + DupFd(#[source] nix::Error), + + /// Failure in accessing the memory located at some address. 
+ #[error("address manager failed to access guest memory located at 0x{0:x}")] + AccessGuestMemory(u64, #[source] vm_memory::mmap::Error), + + /// Failed to create GuestMemory + #[error("address manager failed to create guest memory object")] + CreateGuestMemory(#[source] vm_memory::Error), + + /// Failure in initializing guest memory. + #[error("address manager failed to initialize guest memory")] + GuestMemoryNotInitialized, + + /// Failed to mmap() guest memory + #[error("address manager failed to mmap() guest memory into current process")] + MmapGuestMemory(#[source] vm_memory::mmap::MmapRegionError), + + /// Failed to set KVM memory slot. + #[error("address manager failed to configure KVM memory slot")] + KvmSetMemorySlot(#[source] kvm_ioctls::Error), + + /// Failed to set madvise on AddressSpaceRegion + #[error("address manager failed to set madvice() on guest memory region")] + Madvise(#[source] nix::Error), + + /// join threads fail + #[error("address manager failed to join threads")] + JoinFail, + + /// Failed to create Address Space Region + #[error("address manager failed to create Address Space Region {0}")] + CreateAddressSpaceRegion(#[source] AddressSpaceError), +} + +type Result = std::result::Result; + +/// Parameters to configure address space creation operations. +pub struct AddressSpaceMgrBuilder<'a> { + mem_type: &'a str, + mem_file: &'a str, + mem_index: u32, + mem_suffix: bool, + mem_prealloc: bool, + dirty_page_logging: bool, + vmfd: Option>, +} + +impl<'a> AddressSpaceMgrBuilder<'a> { + /// Create a new [`AddressSpaceMgrBuilder`] object. + pub fn new(mem_type: &'a str, mem_file: &'a str) -> Result { + if mem_type.is_empty() { + return Err(AddressManagerError::TypeInvalid(mem_type.to_string())); + } + Ok(AddressSpaceMgrBuilder { + mem_type, + mem_file, + mem_index: 0, + mem_suffix: true, + mem_prealloc: false, + dirty_page_logging: false, + vmfd: None, + }) + } + + /// Enable/disable adding numbered suffix to memory file path. 
+ /// This feature could be useful to generate hugetlbfs files with number suffix. (e.g. shmem0, shmem1) + pub fn toggle_file_suffix(&mut self, enabled: bool) { + self.mem_suffix = enabled; + } + + /// Enable/disable memory pre-allocation. + /// Enable this feature could improve performance stability at the start of workload by avoiding page fault. + /// Disable this feature may influence performance stability but the cpu resource consumption and start-up time will decrease. + pub fn toggle_prealloc(&mut self, prealloc: bool) { + self.mem_prealloc = prealloc; + } + + /// Enable/disable KVM dirty page logging. + pub fn toggle_dirty_page_logging(&mut self, logging: bool) { + self.dirty_page_logging = logging; + } + + /// Set KVM [`VmFd`] handle to configure memory slots. + pub fn set_kvm_vm_fd(&mut self, vmfd: Arc) -> Option> { + let mut existing_vmfd = None; + if self.vmfd.is_some() { + existing_vmfd = self.vmfd.clone(); + } + self.vmfd = Some(vmfd); + existing_vmfd + } + + /// Build a ['AddressSpaceMgr'] using the configured parameters. + pub fn build( + self, + res_mgr: &ResourceManager, + numa_region_infos: &[NumaRegionInfo], + ) -> Result { + let mut mgr = AddressSpaceMgr::default(); + mgr.create_address_space(res_mgr, numa_region_infos, self)?; + Ok(mgr) + } + + fn get_next_mem_file(&mut self) -> String { + if self.mem_suffix { + let path = format!("{}{}", self.mem_file, self.mem_index); + self.mem_index += 1; + path + } else { + self.mem_file.to_string() + } + } +} + +/// Struct to manage virtual machine's physical address space. +pub struct AddressSpaceMgr { + address_space: Option, + vm_as: Option, + base_to_slot: Arc>>, + prealloc_handlers: Vec>, + prealloc_exit: Arc, + numa_nodes: BTreeMap, +} + +impl AddressSpaceMgr { + /// Query address space manager is initialized or not + pub fn is_initialized(&self) -> bool { + self.address_space.is_some() + } + + /// Gets address space. 
+ pub fn address_space(&self) -> Option<&AddressSpace> { + self.address_space.as_ref() + } + + /// Create the address space for a virtual machine. + /// + /// This method is designed to be called when starting up a virtual machine instead of at + /// runtime, so it's expected the virtual machine will be tore down and no strict error recover. + pub fn create_address_space( + &mut self, + res_mgr: &ResourceManager, + numa_region_infos: &[NumaRegionInfo], + mut param: AddressSpaceMgrBuilder, + ) -> Result<()> { + let mut regions = Vec::new(); + let mut start_addr = dbs_boot::layout::GUEST_MEM_START; + + // Create address space regions. + for info in numa_region_infos.iter() { + info!("numa_region_info {:?}", info); + // convert size_in_mib to bytes + let size = info + .size + .checked_shl(20) + .ok_or_else(|| AddressManagerError::InvalidOperation)?; + + // Guest memory does not intersect with the MMIO hole. + // TODO: make it work for ARM (issue #4307) + if start_addr > dbs_boot::layout::MMIO_LOW_END + || start_addr + size <= dbs_boot::layout::MMIO_LOW_START + { + let region = self.create_region(start_addr, size, info, &mut param)?; + regions.push(region); + start_addr = start_addr + .checked_add(size) + .ok_or_else(|| AddressManagerError::InvalidOperation)?; + } else { + // Add guest memory below the MMIO hole, avoid splitting the memory region + // if the available address region is small than MINIMAL_SPLIT_SPACE MiB. 
+ let mut below_size = dbs_boot::layout::MMIO_LOW_START + .checked_sub(start_addr) + .ok_or_else(|| AddressManagerError::InvalidOperation)?; + if below_size < (MINIMAL_SPLIT_SPACE) { + below_size = 0; + } else { + let region = self.create_region(start_addr, below_size, info, &mut param)?; + regions.push(region); + } + + // Add guest memory above the MMIO hole + let above_start = dbs_boot::layout::MMIO_LOW_END + 1; + let above_size = size + .checked_sub(below_size) + .ok_or_else(|| AddressManagerError::InvalidOperation)?; + let region = self.create_region(above_start, above_size, info, &mut param)?; + regions.push(region); + start_addr = above_start + .checked_add(above_size) + .ok_or_else(|| AddressManagerError::InvalidOperation)?; + } + } + + // Create GuestMemory object + let mut vm_memory = GuestMemoryMmap::new(); + for reg in regions.iter() { + // Allocate used guest memory addresses. + // These addresses are statically allocated, resource allocation/update should not fail. + let constraint = Constraint::new(reg.len()) + .min(reg.start_addr().raw_value()) + .max(reg.last_addr().raw_value()); + let _key = res_mgr + .allocate_mem_address(&constraint) + .ok_or(AddressManagerError::NoAvailableMemAddress)?; + let mmap_reg = self.create_mmap_region(reg.clone())?; + + vm_memory = vm_memory + .insert_region(mmap_reg.clone()) + .map_err(AddressManagerError::CreateGuestMemory)?; + self.map_to_kvm(res_mgr, ¶m, reg, mmap_reg)?; + } + + #[cfg(feature = "atomic-guest-memory")] + { + self.vm_as = Some(AddressSpace::convert_into_vm_as(vm_memory)); + } + #[cfg(not(feature = "atomic-guest-memory"))] + { + self.vm_as = Some(Arc::new(vm_memory)); + } + + let layout = AddressSpaceLayout::new( + *dbs_boot::layout::GUEST_PHYS_END, + dbs_boot::layout::GUEST_MEM_START, + *dbs_boot::layout::GUEST_MEM_END, + ); + self.address_space = Some(AddressSpace::from_regions(regions, layout)); + + Ok(()) + } + + // size unit: Byte + fn create_region( + &mut self, + start_addr: u64, + size_bytes: 
u64, + info: &NumaRegionInfo, + param: &mut AddressSpaceMgrBuilder, + ) -> Result> { + let mem_file_path = param.get_next_mem_file(); + let region = AddressSpaceRegion::create_default_memory_region( + GuestAddress(start_addr), + size_bytes, + info.host_numa_node_id, + param.mem_type, + &mem_file_path, + param.mem_prealloc, + false, + ) + .map_err(AddressManagerError::CreateAddressSpaceRegion)?; + let region = Arc::new(region); + + self.insert_into_numa_nodes( + ®ion, + info.guest_numa_node_id.unwrap_or(0), + &info.vcpu_ids, + ); + info!( + "create new region: guest addr 0x{:x}-0x{:x} size {}", + start_addr, + start_addr + size_bytes, + size_bytes + ); + + Ok(region) + } + + fn map_to_kvm( + &mut self, + res_mgr: &ResourceManager, + param: &AddressSpaceMgrBuilder, + reg: &Arc, + mmap_reg: Arc, + ) -> Result<()> { + // Build mapping between GPA <-> HVA, by adding kvm memory slot. + let slot = res_mgr + .allocate_kvm_mem_slot(1, None) + .ok_or(AddressManagerError::NoAvailableKvmSlot)?; + + if let Some(vmfd) = param.vmfd.as_ref() { + let host_addr = mmap_reg + .get_host_address(MemoryRegionAddress(0)) + .map_err(|_e| AddressManagerError::InvalidOperation)?; + let flags = 0u32; + + let mem_region = kvm_userspace_memory_region { + slot: slot as u32, + guest_phys_addr: reg.start_addr().raw_value(), + memory_size: reg.len() as u64, + userspace_addr: host_addr as u64, + flags, + }; + + info!( + "VM: guest memory region {:x} starts at {:x?}", + reg.start_addr().raw_value(), + host_addr + ); + // Safe because the guest regions are guaranteed not to overlap. + unsafe { vmfd.set_user_memory_region(mem_region) } + .map_err(AddressManagerError::KvmSetMemorySlot)?; + } + + self.base_to_slot + .lock() + .unwrap() + .insert(reg.start_addr().raw_value(), slot as u32); + + Ok(()) + } + + /// Mmap the address space region into current process. + pub fn create_mmap_region( + &mut self, + region: Arc, + ) -> Result> { + // Special check for 32bit host with 64bit virtual machines. 
+ if region.len() > usize::MAX as u64 { + return Err(AddressManagerError::InvalidAddressRange( + region.start_addr().raw_value(), + region.len(), + )); + } + // The device MMIO regions may not be backed by memory files, so refuse to mmap them. + if region.region_type() == AddressSpaceRegionType::DeviceMemory { + return Err(AddressManagerError::InvalidOperation); + } + + // The GuestRegionMmap/MmapRegion will take ownership of the FileOffset object, + // so we have to duplicate the fd here. It's really a dirty design. + let file_offset = match region.file_offset().as_ref() { + Some(fo) => { + let fd = dup(fo.file().as_raw_fd()).map_err(AddressManagerError::DupFd)?; + // Safe because we have just duplicated the raw fd. + let file = unsafe { File::from_raw_fd(fd) }; + let file_offset = FileOffset::new(file, fo.start()); + Some(file_offset) + } + None => None, + }; + let perm_flags = if (region.perm_flags() & libc::MAP_POPULATE) != 0 && region.is_hugepage() + { + // mmap(MAP_POPULATE) conflicts with madive(MADV_HUGEPAGE) because mmap(MAP_POPULATE) + // will pre-fault in all memory with normal pages before madive(MADV_HUGEPAGE) gets + // called. So remove the MAP_POPULATE flag and memory will be faulted in by working + // threads. 
+ region.perm_flags() & (!libc::MAP_POPULATE) + } else { + region.perm_flags() + }; + let mmap_reg = MmapRegion::build( + file_offset, + region.len() as usize, + libc::PROT_READ | libc::PROT_WRITE, + perm_flags, + ) + .map_err(AddressManagerError::MmapGuestMemory)?; + + if region.is_anonpage() { + self.configure_anon_mem(&mmap_reg)?; + } + if let Some(node_id) = region.host_numa_node_id() { + self.configure_numa(&mmap_reg, node_id)?; + } + if region.is_hugepage() { + self.configure_thp_and_prealloc(®ion, &mmap_reg)?; + } + + let reg = GuestRegionImpl::new(mmap_reg, region.start_addr()) + .map_err(AddressManagerError::CreateGuestMemory)?; + Ok(Arc::new(reg)) + } + + fn configure_anon_mem(&self, mmap_reg: &MmapRegion) -> Result<()> { + unsafe { + mman::madvise( + mmap_reg.as_ptr() as *mut libc::c_void, + mmap_reg.size(), + mman::MmapAdvise::MADV_DONTFORK, + ) + } + .map_err(AddressManagerError::Madvise) + } + + fn configure_numa(&self, mmap_reg: &MmapRegion, node_id: u32) -> Result<()> { + let nodemask = 1_u64 + .checked_shl(node_id) + .ok_or_else(|| AddressManagerError::InvalidOperation)?; + let res = unsafe { + libc::syscall( + libc::SYS_mbind, + mmap_reg.as_ptr() as *mut libc::c_void, + mmap_reg.size(), + MPOL_PREFERRED, + &nodemask as *const u64, + MAX_NODE, + MPOL_MF_MOVE, + ) + }; + if res < 0 { + warn!( + "failed to mbind memory to host_numa_node_id {}: this may affect performance", + node_id + ); + } + Ok(()) + } + + // We set Transparent Huge Page (THP) through mmap to increase performance. + // In order to reduce the impact of page fault on performance, we start several threads (up to MAX_PRE_ALLOC_THREAD) to touch every 4k page of the memory region to manually do memory pre-allocation. 
+ // The reason why we don't use mmap to enable THP and pre-alloction is that THP setting won't take effect in this operation (tested in kernel 4.9) + fn configure_thp_and_prealloc( + &mut self, + region: &Arc, + mmap_reg: &MmapRegion, + ) -> Result<()> { + debug!( + "Setting MADV_HUGEPAGE on AddressSpaceRegion addr {:x?} len {:x?}", + mmap_reg.as_ptr(), + mmap_reg.size() + ); + + // Safe because we just create the MmapRegion + unsafe { + mman::madvise( + mmap_reg.as_ptr() as *mut libc::c_void, + mmap_reg.size(), + mman::MmapAdvise::MADV_HUGEPAGE, + ) + } + .map_err(AddressManagerError::Madvise)?; + + if region.perm_flags() & libc::MAP_POPULATE > 0 { + // Touch every 4k page to trigger allocation. The step is 4K instead of 2M to ensure + // pre-allocation when running out of huge pages. + const PAGE_SIZE: u64 = 4096; + const PAGE_SHIFT: u32 = 12; + let addr = mmap_reg.as_ptr() as u64; + // Here we use >> PAGE_SHIFT to calculate how many 4K pages in the memory region. + let npage = (mmap_reg.size() as u64) >> PAGE_SHIFT; + + let mut touch_thread = ((mmap_reg.size() as u64) >> PRE_ALLOC_GRANULARITY) + 1; + if touch_thread > MAX_PRE_ALLOC_THREAD { + touch_thread = MAX_PRE_ALLOC_THREAD; + } + + let per_npage = npage / touch_thread; + for n in 0..touch_thread { + let start_npage = per_npage * n; + let end_npage = if n == (touch_thread - 1) { + npage + } else { + per_npage * (n + 1) + }; + let mut per_addr = addr + (start_npage * PAGE_SIZE); + let should_stop = self.prealloc_exit.clone(); + + let handler = thread::Builder::new() + .name("PreallocThread".to_string()) + .spawn(move || { + info!("PreallocThread start start_npage: {:?}, end_npage: {:?}, per_addr: {:?}, thread_number: {:?}", + start_npage, end_npage, per_addr, touch_thread ); + for _ in start_npage..end_npage { + if should_stop.load(Ordering::Acquire) { + info!("PreallocThread stop start_npage: {:?}, end_npage: {:?}, per_addr: {:?}, thread_number: {:?}", + start_npage, end_npage, per_addr, touch_thread); + 
break; + } + + // Reading from a THP page may be served by the zero page, so only + // write operation could ensure THP memory allocation. So use + // the compare_exchange(old_val, old_val) trick to trigger allocation. + let addr_ptr = per_addr as *mut u8; + let read_byte = unsafe { std::ptr::read_volatile(addr_ptr) }; + let atomic_u8 : &AtomicU8 = unsafe {&*(addr_ptr as *mut AtomicU8)}; + let _ = atomic_u8.compare_exchange(read_byte, read_byte, Ordering::SeqCst, Ordering::SeqCst); + per_addr += PAGE_SIZE; + } + + info!("PreallocThread done start_npage: {:?}, end_npage: {:?}, per_addr: {:?}, thread_number: {:?}", + start_npage, end_npage, per_addr, touch_thread ); + }); + + match handler { + Err(e) => error!( + "Failed to create working thread for async pre-allocation, {:?}. This may affect performance stability at the start of the workload.", + e + ), + Ok(hdl) => self.prealloc_handlers.push(hdl), + } + } + } + + Ok(()) + } + + /// Get the address space object + pub fn get_address_space(&self) -> Option<&AddressSpace> { + self.address_space.as_ref() + } + + /// Get the default guest memory object, which will be used to access virtual machine's default + /// guest memory. + pub fn get_vm_as(&self) -> Option<&GuestAddressSpaceImpl> { + self.vm_as.as_ref() + } + + /// Get the base to slot map + pub fn get_base_to_slot_map(&self) -> Arc>> { + self.base_to_slot.clone() + } + + /// get numa nodes infos from address space manager. + pub fn get_numa_nodes(&self) -> &BTreeMap { + &self.numa_nodes + } + + /// add cpu and memory numa informations to BtreeMap + fn insert_into_numa_nodes( + &mut self, + region: &Arc, + guest_numa_node_id: u32, + vcpu_ids: &[u32], + ) { + let node = self + .numa_nodes + .entry(guest_numa_node_id) + .or_insert_with(NumaNode::new); + node.add_info(&NumaNodeInfo { + base: region.start_addr(), + size: region.len(), + }); + node.add_vcpu_ids(vcpu_ids); + } + + /// get address space layout from address space manager. 
+ pub fn get_layout(&self) -> Result { + self.address_space + .as_ref() + .map(|v| v.layout()) + .ok_or(AddressManagerError::GuestMemoryNotInitialized) + } + + /// Wait for the pre-allocation working threads to finish work. + /// + /// Force all working threads to exit if `stop` is true. + pub fn wait_prealloc(&mut self, stop: bool) -> Result<()> { + if stop { + self.prealloc_exit.store(true, Ordering::Release); + } + while let Some(handlers) = self.prealloc_handlers.pop() { + if let Err(e) = handlers.join() { + error!("wait_prealloc join fail {:?}", e); + return Err(AddressManagerError::JoinFail); + } + } + Ok(()) + } +} + +impl Default for AddressSpaceMgr { + /// Create a new empty AddressSpaceMgr + fn default() -> Self { + AddressSpaceMgr { + address_space: None, + vm_as: None, + base_to_slot: Arc::new(Mutex::new(HashMap::new())), + prealloc_handlers: Vec::new(), + prealloc_exit: Arc::new(AtomicBool::new(false)), + numa_nodes: BTreeMap::new(), + } + } +} + +#[cfg(test)] +mod tests { + use dbs_boot::layout::GUEST_MEM_START; + use std::ops::Deref; + + use vm_memory::{Bytes, GuestAddressSpace, GuestMemory, GuestMemoryRegion}; + use vmm_sys_util::tempfile::TempFile; + + use super::*; + + #[test] + fn test_create_address_space() { + let res_mgr = ResourceManager::new(None); + let mem_size = 128 << 20; + let numa_region_infos = vec![NumaRegionInfo { + size: mem_size >> 20, + host_numa_node_id: None, + guest_numa_node_id: Some(0), + vcpu_ids: vec![1, 2], + }]; + let builder = AddressSpaceMgrBuilder::new("shmem", "").unwrap(); + let as_mgr = builder.build(&res_mgr, &numa_region_infos).unwrap(); + let vm_as = as_mgr.get_vm_as().unwrap(); + let guard = vm_as.memory(); + let gmem = guard.deref(); + assert_eq!(gmem.num_regions(), 1); + + let reg = gmem + .find_region(GuestAddress(GUEST_MEM_START + mem_size - 1)) + .unwrap(); + assert_eq!(reg.start_addr(), GuestAddress(GUEST_MEM_START)); + assert_eq!(reg.len(), mem_size); + assert!(gmem + 
.find_region(GuestAddress(GUEST_MEM_START + mem_size)) + .is_none()); + assert!(reg.file_offset().is_some()); + + let buf = [0x1u8, 0x2u8, 0x3u8, 0x4u8, 0x5u8]; + gmem.write_slice(&buf, GuestAddress(GUEST_MEM_START)) + .unwrap(); + + // Update middle of mapped memory region + let mut val = 0xa5u8; + gmem.write_obj(val, GuestAddress(GUEST_MEM_START + 0x1)) + .unwrap(); + val = gmem.read_obj(GuestAddress(GUEST_MEM_START + 0x1)).unwrap(); + assert_eq!(val, 0xa5); + val = gmem.read_obj(GuestAddress(GUEST_MEM_START)).unwrap(); + assert_eq!(val, 1); + val = gmem.read_obj(GuestAddress(GUEST_MEM_START + 0x2)).unwrap(); + assert_eq!(val, 3); + val = gmem.read_obj(GuestAddress(GUEST_MEM_START + 0x5)).unwrap(); + assert_eq!(val, 0); + + // Read ahead of mapped memory region + assert!(gmem + .read_obj::(GuestAddress(GUEST_MEM_START + mem_size)) + .is_err()); + + let res_mgr = ResourceManager::new(None); + let mem_size = dbs_boot::layout::MMIO_LOW_START + (1 << 30); + let numa_region_infos = vec![NumaRegionInfo { + size: mem_size >> 20, + host_numa_node_id: None, + guest_numa_node_id: Some(0), + vcpu_ids: vec![1, 2], + }]; + let builder = AddressSpaceMgrBuilder::new("shmem", "").unwrap(); + let as_mgr = builder.build(&res_mgr, &numa_region_infos).unwrap(); + let vm_as = as_mgr.get_vm_as().unwrap(); + let guard = vm_as.memory(); + let gmem = guard.deref(); + #[cfg(target_arch = "x86_64")] + assert_eq!(gmem.num_regions(), 2); + #[cfg(target_arch = "aarch64")] + assert_eq!(gmem.num_regions(), 1); + + // Test dropping GuestMemoryMmap object releases all resources. 
+ for _ in 0..10000 { + let res_mgr = ResourceManager::new(None); + let mem_size = 1 << 20; + let numa_region_infos = vec![NumaRegionInfo { + size: mem_size >> 20, + host_numa_node_id: None, + guest_numa_node_id: Some(0), + vcpu_ids: vec![1, 2], + }]; + let builder = AddressSpaceMgrBuilder::new("shmem", "").unwrap(); + let _as_mgr = builder.build(&res_mgr, &numa_region_infos).unwrap(); + } + let file = TempFile::new().unwrap().into_file(); + let fd = file.as_raw_fd(); + // fd should be small enough if there's no leaking of fds. + assert!(fd < 1000); + } + + #[test] + fn test_address_space_mgr_get_boundary() { + let layout = AddressSpaceLayout::new( + *dbs_boot::layout::GUEST_PHYS_END, + dbs_boot::layout::GUEST_MEM_START, + *dbs_boot::layout::GUEST_MEM_END, + ); + let res_mgr = ResourceManager::new(None); + let mem_size = 128 << 20; + let numa_region_infos = vec![NumaRegionInfo { + size: mem_size >> 20, + host_numa_node_id: None, + guest_numa_node_id: Some(0), + vcpu_ids: vec![1, 2], + }]; + let builder = AddressSpaceMgrBuilder::new("shmem", "").unwrap(); + let as_mgr = builder.build(&res_mgr, &numa_region_infos).unwrap(); + assert_eq!(as_mgr.get_layout().unwrap(), layout); + } + + #[test] + fn test_address_space_mgr_get_numa_nodes() { + let res_mgr = ResourceManager::new(None); + let mem_size = 128 << 20; + let cpu_vec = vec![1, 2]; + let numa_region_infos = vec![NumaRegionInfo { + size: mem_size >> 20, + host_numa_node_id: None, + guest_numa_node_id: Some(0), + vcpu_ids: cpu_vec.clone(), + }]; + let builder = AddressSpaceMgrBuilder::new("shmem", "").unwrap(); + let as_mgr = builder.build(&res_mgr, &numa_region_infos).unwrap(); + let mut numa_node = NumaNode::new(); + numa_node.add_info(&NumaNodeInfo { + base: GuestAddress(GUEST_MEM_START), + size: mem_size, + }); + numa_node.add_vcpu_ids(&cpu_vec); + + assert_eq!(*as_mgr.get_numa_nodes().get(&0).unwrap(), numa_node); + } + + #[test] + fn test_address_space_mgr_async_prealloc() { + let res_mgr = 
ResourceManager::new(None); + let mem_size = 2 << 20; + let cpu_vec = vec![1, 2]; + let numa_region_infos = vec![NumaRegionInfo { + size: mem_size >> 20, + host_numa_node_id: None, + guest_numa_node_id: Some(0), + vcpu_ids: cpu_vec, + }]; + let mut builder = AddressSpaceMgrBuilder::new("hugeshmem", "").unwrap(); + builder.toggle_prealloc(true); + let mut as_mgr = builder.build(&res_mgr, &numa_region_infos).unwrap(); + as_mgr.wait_prealloc(false).unwrap(); + } + + #[test] + fn test_address_space_mgr_builder() { + let mut builder = AddressSpaceMgrBuilder::new("shmem", "/tmp/shmem").unwrap(); + + assert_eq!(builder.mem_type, "shmem"); + assert_eq!(builder.mem_file, "/tmp/shmem"); + assert_eq!(builder.mem_index, 0); + assert!(builder.mem_suffix); + assert!(!builder.mem_prealloc); + assert!(!builder.dirty_page_logging); + assert!(builder.vmfd.is_none()); + + assert_eq!(&builder.get_next_mem_file(), "/tmp/shmem0"); + assert_eq!(&builder.get_next_mem_file(), "/tmp/shmem1"); + assert_eq!(&builder.get_next_mem_file(), "/tmp/shmem2"); + assert_eq!(builder.mem_index, 3); + + builder.toggle_file_suffix(false); + assert_eq!(&builder.get_next_mem_file(), "/tmp/shmem"); + assert_eq!(&builder.get_next_mem_file(), "/tmp/shmem"); + assert_eq!(builder.mem_index, 3); + + builder.toggle_prealloc(true); + builder.toggle_dirty_page_logging(true); + assert!(builder.mem_prealloc); + assert!(builder.dirty_page_logging); + } + + #[test] + fn test_configure_invalid_numa() { + let res_mgr = ResourceManager::new(None); + let mem_size = 128 << 20; + let numa_region_infos = vec![NumaRegionInfo { + size: mem_size >> 20, + host_numa_node_id: None, + guest_numa_node_id: Some(0), + vcpu_ids: vec![1, 2], + }]; + let builder = AddressSpaceMgrBuilder::new("shmem", "").unwrap(); + let as_mgr = builder.build(&res_mgr, &numa_region_infos).unwrap(); + let mmap_reg = MmapRegion::new(8).unwrap(); + + assert!(as_mgr.configure_numa(&mmap_reg, u32::MAX).is_err()); + } +} diff --git 
a/src/dragonball/src/api/mod.rs b/src/dragonball/src/api/mod.rs new file mode 100644 index 0000000000..75ca6af690 --- /dev/null +++ b/src/dragonball/src/api/mod.rs @@ -0,0 +1,6 @@ +// Copyright (C) 2019-2022 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! API related data structures to configure the vmm. + +pub mod v1; diff --git a/src/dragonball/src/api/v1/boot_source.rs b/src/dragonball/src/api/v1/boot_source.rs new file mode 100644 index 0000000000..8ff7e030dc --- /dev/null +++ b/src/dragonball/src/api/v1/boot_source.rs @@ -0,0 +1,55 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use serde_derive::{Deserialize, Serialize}; + +/// Default guest kernel command line: +/// - `reboot=k` shutdown the guest on reboot, instead of well... rebooting; +/// - `panic=1` on panic, reboot after 1 second; +/// - `pci=off` do not scan for PCI devices (ser boot time); +/// - `nomodules` disable loadable kernel module support; +/// - `8250.nr_uarts=0` disable 8250 serial interface; +/// - `i8042.noaux` do not probe the i8042 controller for an attached mouse (ser boot time); +/// - `i8042.nomux` do not probe i8042 for a multiplexing controller (ser boot time); +/// - `i8042.nopnp` do not use ACPIPnP to discover KBD/AUX controllers (ser boot time); +/// - `i8042.dumbkbd` do not attempt to control kbd state via the i8042 (ser boot time). +pub const DEFAULT_KERNEL_CMDLINE: &str = "reboot=k panic=1 pci=off nomodules 8250.nr_uarts=0 \ + i8042.noaux i8042.nomux i8042.nopnp i8042.dumbkbd"; + +/// Strongly typed data structure used to configure the boot source of the microvm. +#[derive(Clone, Debug, Deserialize, PartialEq, Serialize, Default)] +#[serde(deny_unknown_fields)] +pub struct BootSourceConfig { + /// Path of the kernel image. + /// We only support uncompressed kernel for Dragonball. 
+ pub kernel_path: String, + /// Path of the initrd, if there is one. + /// ps. rootfs is set in BlockDeviceConfigInfo + pub initrd_path: Option, + /// The boot arguments to pass to the kernel. + #[serde(skip_serializing_if = "Option::is_none")] + pub boot_args: Option, +} + +/// Errors associated with actions on `BootSourceConfig`. +#[derive(Debug, thiserror::Error)] +pub enum BootSourceConfigError { + /// The kernel file cannot be opened. + #[error( + "the kernel file cannot be opened due to invalid kernel path or invalid permissions: {0}" + )] + InvalidKernelPath(#[source] std::io::Error), + + /// The initrd file cannot be opened. + #[error("the initrd file cannot be opened due to invalid path or invalid permissions: {0}")] + InvalidInitrdPath(#[source] std::io::Error), + + /// The kernel command line is invalid. + #[error("the kernel command line is invalid: {0}")] + InvalidKernelCommandLine(#[source] linux_loader::cmdline::Error), + + /// The boot source cannot be update post boot. + #[error("the update operation is not allowed after boot")] + UpdateNotAllowedPostBoot, +} diff --git a/src/dragonball/src/api/v1/instance_info.rs b/src/dragonball/src/api/v1/instance_info.rs new file mode 100644 index 0000000000..ae159aa614 --- /dev/null +++ b/src/dragonball/src/api/v1/instance_info.rs @@ -0,0 +1,88 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +use serde_derive::{Deserialize, Serialize}; + +/// The microvm state. +/// +/// When Dragonball starts, the instance state is Uninitialized. Once start_microvm method is +/// called, the state goes from Uninitialized to Starting. The state is changed to Running until +/// the start_microvm method ends. Halting and Halted are currently unsupported. +#[derive(Copy, Clone, Debug, Deserialize, PartialEq, Serialize)] +pub enum InstanceState { + /// Microvm is not initialized. 
+ Uninitialized, + /// Microvm is starting. + Starting, + /// Microvm is running. + Running, + /// Microvm is Paused. + Paused, + /// Microvm received a halt instruction. + Halting, + /// Microvm is halted. + Halted, + /// Microvm exit instead of process exit. + Exited(i32), +} + +/// The state of async actions +#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)] +pub enum AsyncState { + /// Uninitialized + Uninitialized, + /// Success + Success, + /// Failure + Failure, +} + +/// The strongly typed that contains general information about the microVM. +#[derive(Debug, Deserialize, Serialize)] +pub struct InstanceInfo { + /// The ID of the microVM. + pub id: String, + /// The state of the microVM. + pub state: InstanceState, + /// The version of the VMM that runs the microVM. + pub vmm_version: String, + /// The pid of the current VMM process. + pub pid: u32, + /// The state of async actions. + pub async_state: AsyncState, + /// List of tids of vcpu threads (vcpu index, tid) + pub tids: Vec<(u8, u32)>, + /// Last instance downtime + pub last_instance_downtime: u64, +} + +impl InstanceInfo { + /// create instance info object with given id, version, and platform type + pub fn new(id: String, vmm_version: String) -> Self { + InstanceInfo { + id, + state: InstanceState::Uninitialized, + vmm_version, + pid: std::process::id(), + async_state: AsyncState::Uninitialized, + tids: Vec::new(), + last_instance_downtime: 0, + } + } +} + +impl Default for InstanceInfo { + fn default() -> Self { + InstanceInfo { + id: String::from(""), + state: InstanceState::Uninitialized, + vmm_version: env!("CARGO_PKG_VERSION").to_string(), + pid: std::process::id(), + async_state: AsyncState::Uninitialized, + tids: Vec::new(), + last_instance_downtime: 0, + } + } +} diff --git a/src/dragonball/src/api/v1/machine_config.rs b/src/dragonball/src/api/v1/machine_config.rs new file mode 100644 index 0000000000..e4ae228679 --- /dev/null +++ b/src/dragonball/src/api/v1/machine_config.rs @@ -0,0 
+1,86 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +/// We only support this number of vcpus for now. Mostly because we have set all vcpu related metrics as u8 +/// and breaking u8 will take extra efforts. +pub const MAX_SUPPORTED_VCPUS: u8 = 254; + +/// Memory hotplug value should have alignment in this size (unit: MiB) +pub const MEMORY_HOTPLUG_ALIGHMENT: u8 = 64; + +/// Errors associated with configuring the microVM. +#[derive(Debug, PartialEq, thiserror::Error)] +pub enum VmConfigError { + /// Cannot update the configuration of the microvm post boot. + #[error("update operation is not allowed after boot")] + UpdateNotAllowedPostBoot, + + /// The max vcpu count is invalid. + #[error("the vCPU number shouldn't large than {}", MAX_SUPPORTED_VCPUS)] + VcpuCountExceedsMaximum, + + /// The vcpu count is invalid. When hyperthreading is enabled, the `cpu_count` must be either + /// 1 or an even number. + #[error( + "the vCPU number '{0}' can only be 1 or an even number when hyperthreading is enabled" + )] + InvalidVcpuCount(u8), + + /// The threads_per_core is invalid. It should be either 1 or 2. + #[error("the threads_per_core number '{0}' can only be 1 or 2")] + InvalidThreadsPerCore(u8), + + /// The cores_per_die is invalid. It should be larger than 0. + #[error("the cores_per_die number '{0}' can only be larger than 0")] + InvalidCoresPerDie(u8), + + /// The dies_per_socket is invalid. It should be larger than 0. + #[error("the dies_per_socket number '{0}' can only be larger than 0")] + InvalidDiesPerSocket(u8), + + /// The socket number is invalid. It should be either 1 or 2. 
+ #[error("the socket number '{0}' can only be 1 or 2")] + InvalidSocket(u8), + + /// max vcpu count inferred from cpu topology(threads_per_core * cores_per_die * dies_per_socket * sockets) should be larger or equal to vcpu_count + #[error("the max vcpu count inferred from cpu topology '{0}' (threads_per_core * cores_per_die * dies_per_socket * sockets) should be larger or equal to vcpu_count")] + InvalidCpuTopology(u8), + + /// The max vcpu count is invalid. + #[error( + "the max vCPU number '{0}' shouldn't less than vCPU count and can only be 1 or an even number when hyperthreading is enabled" + )] + InvalidMaxVcpuCount(u8), + + /// The memory size is invalid. The memory can only be an unsigned integer. + #[error("the memory size 0x{0:x}MiB is invalid")] + InvalidMemorySize(usize), + + /// The hotplug memory size is invalid. The memory can only be an unsigned integer. + #[error( + "the hotplug memory size '{0}' (MiB) is invalid, must be multiple of {}", + MEMORY_HOTPLUG_ALIGHMENT + )] + InvalidHotplugMemorySize(usize), + + /// The memory type is invalid. + #[error("the memory type '{0}' is invalid")] + InvalidMemType(String), + + /// The memory file path is invalid. 
+ #[error("the memory file path is invalid")] + InvalidMemFilePath(String), + + /// NUMA region memory size is invalid + #[error("Total size of memory in NUMA regions: {0}, should matches memory size in config")] + InvalidNumaRegionMemorySize(usize), + + /// NUMA region vCPU count is invalid + #[error("Total counts of vCPUs in NUMA regions: {0}, should matches max vcpu count in config")] + InvalidNumaRegionCpuCount(u16), + + /// NUMA region vCPU count is invalid + #[error("Max id of vCPUs in NUMA regions: {0}, should matches max vcpu count in config")] + InvalidNumaRegionCpuMaxId(u16), +} diff --git a/src/dragonball/src/api/v1/mod.rs b/src/dragonball/src/api/v1/mod.rs new file mode 100644 index 0000000000..99e3075ebb --- /dev/null +++ b/src/dragonball/src/api/v1/mod.rs @@ -0,0 +1,19 @@ +// Copyright (C) 2019-2022 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! API Version 1 related data structures to configure the vmm. + +mod vmm_action; +pub use self::vmm_action::*; + +/// Wrapper for configuring the microVM boot source. +mod boot_source; +pub use self::boot_source::{BootSourceConfig, BootSourceConfigError, DEFAULT_KERNEL_CMDLINE}; + +/// Wrapper over the microVM general information. +mod instance_info; +pub use self::instance_info::{InstanceInfo, InstanceState}; + +/// Wrapper for configuring the memory and CPU of the microVM. +mod machine_config; +pub use self::machine_config::{VmConfigError, MAX_SUPPORTED_VCPUS}; diff --git a/src/dragonball/src/api/v1/vmm_action.rs b/src/dragonball/src/api/v1/vmm_action.rs new file mode 100644 index 0000000000..06004f3f0c --- /dev/null +++ b/src/dragonball/src/api/v1/vmm_action.rs @@ -0,0 +1,636 @@ +// Copyright (C) 2020-2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +use std::fs::File; +use std::sync::mpsc::{Receiver, Sender, TryRecvError}; + +use log::{debug, error, info, warn}; + +use crate::error::{Result, StartMicroVmError, StopMicrovmError}; +use crate::event_manager::EventManager; +use crate::vm::{CpuTopology, KernelConfigInfo, VmConfigInfo}; +use crate::vmm::Vmm; + +use self::VmConfigError::*; +use self::VmmActionError::MachineConfig; + +#[cfg(feature = "virtio-blk")] +pub use crate::device_manager::blk_dev_mgr::{ + BlockDeviceConfigInfo, BlockDeviceConfigUpdateInfo, BlockDeviceError, BlockDeviceMgr, +}; +#[cfg(feature = "virtio-fs")] +pub use crate::device_manager::fs_dev_mgr::{ + FsDeviceConfigInfo, FsDeviceConfigUpdateInfo, FsDeviceError, FsDeviceMgr, FsMountConfigInfo, +}; +#[cfg(feature = "virtio-net")] +pub use crate::device_manager::virtio_net_dev_mgr::{ + VirtioNetDeviceConfigInfo, VirtioNetDeviceConfigUpdateInfo, VirtioNetDeviceError, + VirtioNetDeviceMgr, +}; +#[cfg(feature = "virtio-vsock")] +pub use crate::device_manager::vsock_dev_mgr::{VsockDeviceConfigInfo, VsockDeviceError}; + +use super::*; + +/// Wrapper for all errors associated with VMM actions. +#[derive(Debug, thiserror::Error)] +pub enum VmmActionError { + /// Invalid virtual machine instance ID. + #[error("the virtual machine instance ID is invalid")] + InvalidVMID, + + /// Failed to hotplug, due to Upcall not ready. + #[error("Upcall not ready, can't hotplug device.")] + UpcallNotReady, + + /// The action `ConfigureBootSource` failed either because of bad user input or an internal + /// error. + #[error("failed to configure boot source for VM: {0}")] + BootSource(#[source] BootSourceConfigError), + + /// The action `StartMicroVm` failed either because of bad user input or an internal error. 
+ #[error("failed to boot the VM: {0}")] + StartMicroVm(#[source] StartMicroVmError), + + /// The action `StopMicroVm` failed either because of bad user input or an internal error. + #[error("failed to shutdown the VM: {0}")] + StopMicrovm(#[source] StopMicrovmError), + + /// One of the actions `GetVmConfiguration` or `SetVmConfiguration` failed either because of bad + /// input or an internal error. + #[error("failed to set configuration for the VM: {0}")] + MachineConfig(#[source] VmConfigError), + + #[cfg(feature = "virtio-vsock")] + /// The action `InsertVsockDevice` failed either because of bad user input or an internal error. + #[error("failed to add virtio-vsock device: {0}")] + Vsock(#[source] VsockDeviceError), + + #[cfg(feature = "virtio-blk")] + /// Block device related errors. + #[error("virtio-blk device error: {0}")] + Block(#[source] BlockDeviceError), + + #[cfg(feature = "virtio-net")] + /// Net device related errors. + #[error("virtio-net device error: {0}")] + VirtioNet(#[source] VirtioNetDeviceError), + + #[cfg(feature = "virtio-fs")] + /// The action `InsertFsDevice` failed either because of bad user input or an internal error. + #[error("virtio-fs device: {0}")] + FsDevice(#[source] FsDeviceError), +} + +/// This enum represents the public interface of the VMM. Each action contains various +/// bits of information (ids, paths, etc.). +#[derive(Clone, Debug, PartialEq)] +pub enum VmmAction { + /// Configure the boot source of the microVM using `BootSourceConfig`. + /// This action can only be called before the microVM has booted. + ConfigureBootSource(BootSourceConfig), + + /// Launch the microVM. This action can only be called before the microVM has booted. + StartMicroVm, + + /// Shutdown the vmicroVM. This action can only be called after the microVM has booted. + /// When vmm is used as the crate by the other process, which is need to + /// shutdown the vcpu threads and destory all of the object. 
+ ShutdownMicroVm, + + /// Get the configuration of the microVM. + GetVmConfiguration, + + /// Set the microVM configuration (memory & vcpu) using `VmConfig` as input. This + /// action can only be called before the microVM has booted. + SetVmConfiguration(VmConfigInfo), + + #[cfg(feature = "virtio-vsock")] + /// Add a new vsock device or update one that already exists using the + /// `VsockDeviceConfig` as input. This action can only be called before the microVM has + /// booted. The response is sent using the `OutcomeSender`. + InsertVsockDevice(VsockDeviceConfigInfo), + + #[cfg(feature = "virtio-blk")] + /// Add a new block device or update one that already exists using the `BlockDeviceConfig` as + /// input. This action can only be called before the microVM has booted. + InsertBlockDevice(BlockDeviceConfigInfo), + + #[cfg(feature = "virtio-blk")] + /// Remove a new block device for according to given drive_id + RemoveBlockDevice(String), + + #[cfg(feature = "virtio-blk")] + /// Update a block device, after microVM start. Currently, the only updatable properties + /// are the RX and TX rate limiters. + UpdateBlockDevice(BlockDeviceConfigUpdateInfo), + + #[cfg(feature = "virtio-net")] + /// Add a new network interface config or update one that already exists using the + /// `NetworkInterfaceConfig` as input. This action can only be called before the microVM has + /// booted. The response is sent using the `OutcomeSender`. + InsertNetworkDevice(VirtioNetDeviceConfigInfo), + + #[cfg(feature = "virtio-net")] + /// Update a network interface, after microVM start. Currently, the only updatable properties + /// are the RX and TX rate limiters. + UpdateNetworkInterface(VirtioNetDeviceConfigUpdateInfo), + + #[cfg(feature = "virtio-fs")] + /// Add a new shared fs device or update one that already exists using the + /// `FsDeviceConfig` as input. This action can only be called before the microVM has + /// booted. 
+ InsertFsDevice(FsDeviceConfigInfo), + + #[cfg(feature = "virtio-fs")] + /// Attach a new virtiofs Backend fs or detach an existing virtiofs Backend fs using the + /// `FsMountConfig` as input. This action can only be called _after_ the microVM has + /// booted. + ManipulateFsBackendFs(FsMountConfigInfo), + + #[cfg(feature = "virtio-fs")] + /// Update fs rate limiter, after microVM start. + UpdateFsDevice(FsDeviceConfigUpdateInfo), +} + +/// The enum represents the response sent by the VMM in case of success. The response is either +/// empty, when no data needs to be sent, or an internal VMM structure. +#[derive(Debug)] +pub enum VmmData { + /// No data is sent on the channel. + Empty, + /// The microVM configuration represented by `VmConfigInfo`. + MachineConfiguration(Box), +} + +/// Request data type used to communicate between the API and the VMM. +pub type VmmRequest = Box; + +/// Data type used to communicate between the API and the VMM. +pub type VmmRequestResult = std::result::Result; + +/// Response data type used to communicate between the API and the VMM. +pub type VmmResponse = Box; + +/// VMM Service to handle requests from the API server. +/// +/// There are two levels of API servers as below: +/// API client <--> VMM API Server <--> VMM Core +pub struct VmmService { + from_api: Receiver, + to_api: Sender, + machine_config: VmConfigInfo, +} + +impl VmmService { + /// Create a new VMM API server instance. + pub fn new(from_api: Receiver, to_api: Sender) -> Self { + VmmService { + from_api, + to_api, + machine_config: VmConfigInfo::default(), + } + } + + /// Handle requests from the HTTP API Server and send back replies. 
+ pub fn run_vmm_action(&mut self, vmm: &mut Vmm, event_mgr: &mut EventManager) -> Result<()> { + let request = match self.from_api.try_recv() { + Ok(t) => *t, + Err(TryRecvError::Empty) => { + warn!("Got a spurious notification from api thread"); + return Ok(()); + } + Err(TryRecvError::Disconnected) => { + panic!("The channel's sending half was disconnected. Cannot receive data."); + } + }; + debug!("receive vmm action: {:?}", request); + + let response = match request { + VmmAction::ConfigureBootSource(boot_source_body) => { + self.configure_boot_source(vmm, boot_source_body) + } + VmmAction::StartMicroVm => self.start_microvm(vmm, event_mgr), + VmmAction::ShutdownMicroVm => self.shutdown_microvm(vmm), + VmmAction::GetVmConfiguration => Ok(VmmData::MachineConfiguration(Box::new( + self.machine_config.clone(), + ))), + VmmAction::SetVmConfiguration(machine_config) => { + self.set_vm_configuration(vmm, machine_config) + } + #[cfg(feature = "virtio-vsock")] + VmmAction::InsertVsockDevice(vsock_cfg) => self.add_vsock_device(vmm, vsock_cfg), + #[cfg(feature = "virtio-blk")] + VmmAction::InsertBlockDevice(block_device_config) => { + self.add_block_device(vmm, event_mgr, block_device_config) + } + #[cfg(feature = "virtio-blk")] + VmmAction::UpdateBlockDevice(blk_update) => { + self.update_blk_rate_limiters(vmm, blk_update) + } + #[cfg(feature = "virtio-blk")] + VmmAction::RemoveBlockDevice(drive_id) => { + self.remove_block_device(vmm, event_mgr, &drive_id) + } + #[cfg(feature = "virtio-net")] + VmmAction::InsertNetworkDevice(virtio_net_cfg) => { + self.add_virtio_net_device(vmm, event_mgr, virtio_net_cfg) + } + #[cfg(feature = "virtio-net")] + VmmAction::UpdateNetworkInterface(netif_update) => { + self.update_net_rate_limiters(vmm, netif_update) + } + #[cfg(feature = "virtio-fs")] + VmmAction::InsertFsDevice(fs_cfg) => self.add_fs_device(vmm, fs_cfg), + + #[cfg(feature = "virtio-fs")] + VmmAction::ManipulateFsBackendFs(fs_mount_cfg) => { + 
self.manipulate_fs_backend_fs(vmm, fs_mount_cfg) + } + #[cfg(feature = "virtio-fs")] + VmmAction::UpdateFsDevice(fs_update_cfg) => { + self.update_fs_rate_limiters(vmm, fs_update_cfg) + } + }; + + debug!("send vmm response: {:?}", response); + self.send_response(response) + } + + fn send_response(&self, result: VmmRequestResult) -> Result<()> { + self.to_api + .send(Box::new(result)) + .map_err(|_| ()) + .expect("vmm: one-shot API result channel has been closed"); + + Ok(()) + } + + fn configure_boot_source( + &self, + vmm: &mut Vmm, + boot_source_config: BootSourceConfig, + ) -> VmmRequestResult { + use super::BootSourceConfigError::{ + InvalidInitrdPath, InvalidKernelCommandLine, InvalidKernelPath, + UpdateNotAllowedPostBoot, + }; + use super::VmmActionError::BootSource; + + let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?; + if vm.is_vm_initialized() { + return Err(BootSource(UpdateNotAllowedPostBoot)); + } + + let kernel_file = File::open(&boot_source_config.kernel_path) + .map_err(|e| BootSource(InvalidKernelPath(e)))?; + + let initrd_file = match boot_source_config.initrd_path { + None => None, + Some(ref path) => Some(File::open(path).map_err(|e| BootSource(InvalidInitrdPath(e)))?), + }; + + let mut cmdline = linux_loader::cmdline::Cmdline::new(dbs_boot::layout::CMDLINE_MAX_SIZE); + let boot_args = boot_source_config + .boot_args + .clone() + .unwrap_or_else(|| String::from(DEFAULT_KERNEL_CMDLINE)); + cmdline + .insert_str(boot_args) + .map_err(|e| BootSource(InvalidKernelCommandLine(e)))?; + + let kernel_config = KernelConfigInfo::new(kernel_file, initrd_file, cmdline); + vm.set_kernel_config(kernel_config); + + Ok(VmmData::Empty) + } + + fn start_microvm(&mut self, vmm: &mut Vmm, event_mgr: &mut EventManager) -> VmmRequestResult { + use self::StartMicroVmError::MicroVMAlreadyRunning; + use self::VmmActionError::StartMicroVm; + + let vmm_seccomp_filter = vmm.vmm_seccomp_filter(); + let vcpu_seccomp_filter = vmm.vcpu_seccomp_filter(); + let vm = 
vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?; + if vm.is_vm_initialized() { + return Err(StartMicroVm(MicroVMAlreadyRunning)); + } + + vm.start_microvm(event_mgr, vmm_seccomp_filter, vcpu_seccomp_filter) + .map(|_| VmmData::Empty) + .map_err(StartMicroVm) + } + + fn shutdown_microvm(&mut self, vmm: &mut Vmm) -> VmmRequestResult { + vmm.event_ctx.exit_evt_triggered = true; + + Ok(VmmData::Empty) + } + + /// Set virtual machine configuration. + pub fn set_vm_configuration( + &mut self, + vmm: &mut Vmm, + machine_config: VmConfigInfo, + ) -> VmmRequestResult { + let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?; + if vm.is_vm_initialized() { + return Err(MachineConfig(UpdateNotAllowedPostBoot)); + } + + // If the check is successful, set it up together. + let mut config = vm.vm_config().clone(); + if config.vcpu_count != machine_config.vcpu_count { + let vcpu_count = machine_config.vcpu_count; + // Check that the vcpu_count value is >=1. + if vcpu_count == 0 { + return Err(MachineConfig(InvalidVcpuCount(vcpu_count))); + } + config.vcpu_count = vcpu_count; + } + + if config.cpu_topology != machine_config.cpu_topology { + let cpu_topology = &machine_config.cpu_topology; + config.cpu_topology = handle_cpu_topology(cpu_topology, config.vcpu_count)?.clone(); + } else { + // the same default + let mut default_cpu_topology = CpuTopology { + threads_per_core: 1, + cores_per_die: config.vcpu_count, + dies_per_socket: 1, + sockets: 1, + }; + if machine_config.max_vcpu_count > config.vcpu_count { + default_cpu_topology.cores_per_die = machine_config.max_vcpu_count; + } + config.cpu_topology = default_cpu_topology; + } + let cpu_topology = &config.cpu_topology; + let max_vcpu_from_topo = cpu_topology.threads_per_core + * cpu_topology.cores_per_die + * cpu_topology.dies_per_socket + * cpu_topology.sockets; + // If the max_vcpu_count inferred by cpu_topology is not equal to + // max_vcpu_count, max_vcpu_count will be changed. 
currently, max vcpu size + // is used when cpu_topology is not defined and help define the cores_per_die + // for the default cpu topology. + let mut max_vcpu_count = machine_config.max_vcpu_count; + if max_vcpu_count < config.vcpu_count { + return Err(MachineConfig(InvalidMaxVcpuCount(max_vcpu_count))); + } + if max_vcpu_from_topo != max_vcpu_count { + max_vcpu_count = max_vcpu_from_topo; + info!("Since max_vcpu_count is not equal to cpu topo information, we have changed the max vcpu count to {}", max_vcpu_from_topo); + } + config.max_vcpu_count = max_vcpu_count; + + config.cpu_pm = machine_config.cpu_pm; + config.mem_type = machine_config.mem_type; + + let mem_size_mib_value = machine_config.mem_size_mib; + // Support 1TB memory at most, 2MB aligned for huge page. + if mem_size_mib_value == 0 || mem_size_mib_value > 0x10_0000 || mem_size_mib_value % 2 != 0 + { + return Err(MachineConfig(InvalidMemorySize(mem_size_mib_value))); + } + config.mem_size_mib = mem_size_mib_value; + + config.mem_file_path = machine_config.mem_file_path.clone(); + + if config.mem_type == "hugetlbfs" && config.mem_file_path.is_empty() { + return Err(MachineConfig(InvalidMemFilePath("".to_owned()))); + } + config.vpmu_feature = machine_config.vpmu_feature; + + let vm_id = vm.shared_info().read().unwrap().id.clone(); + let serial_path = match machine_config.serial_path { + Some(value) => value, + None => { + if config.serial_path.is_none() { + String::from("/run/dragonball/") + &vm_id + "_com1" + } else { + // Safe to unwrap() because we have checked it has a value. 
+ config.serial_path.as_ref().unwrap().clone() + } + } + }; + config.serial_path = Some(serial_path); + + vm.set_vm_config(config.clone()); + self.machine_config = config; + + Ok(VmmData::Empty) + } + + #[cfg(feature = "virtio-vsock")] + fn add_vsock_device(&self, vmm: &mut Vmm, config: VsockDeviceConfigInfo) -> VmmRequestResult { + let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?; + if vm.is_vm_initialized() { + return Err(VmmActionError::Vsock( + VsockDeviceError::UpdateNotAllowedPostBoot, + )); + } + + // VMADDR_CID_ANY (-1U) means any address for binding; + // VMADDR_CID_HYPERVISOR (0) is reserved for services built into the hypervisor; + // VMADDR_CID_RESERVED (1) must not be used; + // VMADDR_CID_HOST (2) is the well-known address of the host. + if config.guest_cid <= 2 { + return Err(VmmActionError::Vsock(VsockDeviceError::GuestCIDInvalid( + config.guest_cid, + ))); + } + + info!("add_vsock_device: {:?}", config); + let ctx = vm.create_device_op_context(None).map_err(|e| { + info!("create device op context error: {:?}", e); + VmmActionError::Vsock(VsockDeviceError::UpdateNotAllowedPostBoot) + })?; + + vm.device_manager_mut() + .vsock_manager + .insert_device(ctx, config) + .map(|_| VmmData::Empty) + .map_err(VmmActionError::Vsock) + } + + #[cfg(feature = "virtio-blk")] + // Only call this function as part of the API. + // If the drive_id does not exist, a new Block Device Config is added to the list. 
+ fn add_block_device( + &mut self, + vmm: &mut Vmm, + event_mgr: &mut EventManager, + config: BlockDeviceConfigInfo, + ) -> VmmRequestResult { + let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?; + let ctx = vm + .create_device_op_context(Some(event_mgr.epoll_manager())) + .map_err(|e| { + if let StartMicroVmError::UpcallNotReady = e { + return VmmActionError::UpcallNotReady; + } + VmmActionError::Block(BlockDeviceError::UpdateNotAllowedPostBoot) + })?; + + BlockDeviceMgr::insert_device(vm.device_manager_mut(), ctx, config) + .map(|_| VmmData::Empty) + .map_err(VmmActionError::Block) + } + + #[cfg(feature = "virtio-blk")] + /// Updates configuration for an emulated net device as described in `config`. + fn update_blk_rate_limiters( + &mut self, + vmm: &mut Vmm, + config: BlockDeviceConfigUpdateInfo, + ) -> VmmRequestResult { + let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?; + + BlockDeviceMgr::update_device_ratelimiters(vm.device_manager_mut(), config) + .map(|_| VmmData::Empty) + .map_err(VmmActionError::Block) + } + + #[cfg(feature = "virtio-blk")] + // Remove the device + fn remove_block_device( + &mut self, + vmm: &mut Vmm, + event_mgr: &mut EventManager, + drive_id: &str, + ) -> VmmRequestResult { + let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?; + let ctx = vm + .create_device_op_context(Some(event_mgr.epoll_manager())) + .map_err(|_| VmmActionError::Block(BlockDeviceError::UpdateNotAllowedPostBoot))?; + + BlockDeviceMgr::remove_device(vm.device_manager_mut(), ctx, drive_id) + .map(|_| VmmData::Empty) + .map_err(VmmActionError::Block) + } + + #[cfg(feature = "virtio-net")] + fn add_virtio_net_device( + &mut self, + vmm: &mut Vmm, + event_mgr: &mut EventManager, + config: VirtioNetDeviceConfigInfo, + ) -> VmmRequestResult { + let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?; + let ctx = vm + .create_device_op_context(Some(event_mgr.epoll_manager())) + .map_err(|e| { + if let 
StartMicroVmError::MicroVMAlreadyRunning = e { + VmmActionError::VirtioNet(VirtioNetDeviceError::UpdateNotAllowedPostBoot) + } else if let StartMicroVmError::UpcallNotReady = e { + VmmActionError::UpcallNotReady + } else { + VmmActionError::StartMicroVm(e) + } + })?; + + VirtioNetDeviceMgr::insert_device(vm.device_manager_mut(), ctx, config) + .map(|_| VmmData::Empty) + .map_err(VmmActionError::VirtioNet) + } + + #[cfg(feature = "virtio-net")] + fn update_net_rate_limiters( + &mut self, + vmm: &mut Vmm, + config: VirtioNetDeviceConfigUpdateInfo, + ) -> VmmRequestResult { + let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?; + + VirtioNetDeviceMgr::update_device_ratelimiters(vm.device_manager_mut(), config) + .map(|_| VmmData::Empty) + .map_err(VmmActionError::VirtioNet) + } + + #[cfg(feature = "virtio-fs")] + fn add_fs_device(&mut self, vmm: &mut Vmm, config: FsDeviceConfigInfo) -> VmmRequestResult { + let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?; + let hotplug = vm.is_vm_initialized(); + if !cfg!(feature = "hotplug") && hotplug { + return Err(VmmActionError::FsDevice( + FsDeviceError::UpdateNotAllowedPostBoot, + )); + } + + let ctx = vm.create_device_op_context(None).map_err(|e| { + info!("create device op context error: {:?}", e); + VmmActionError::FsDevice(FsDeviceError::UpdateNotAllowedPostBoot) + })?; + FsDeviceMgr::insert_device(vm.device_manager_mut(), ctx, config) + .map(|_| VmmData::Empty) + .map_err(VmmActionError::FsDevice) + } + + #[cfg(feature = "virtio-fs")] + fn manipulate_fs_backend_fs( + &self, + vmm: &mut Vmm, + config: FsMountConfigInfo, + ) -> VmmRequestResult { + let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?; + + if !vm.is_vm_initialized() { + return Err(VmmActionError::FsDevice(FsDeviceError::MicroVMNotRunning)); + } + + FsDeviceMgr::manipulate_backend_fs(vm.device_manager_mut(), config) + .map(|_| VmmData::Empty) + .map_err(VmmActionError::FsDevice) + } + + #[cfg(feature = "virtio-fs")] + fn 
update_fs_rate_limiters( + &self, + vmm: &mut Vmm, + config: FsDeviceConfigUpdateInfo, + ) -> VmmRequestResult { + let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?; + + if !vm.is_vm_initialized() { + return Err(VmmActionError::FsDevice(FsDeviceError::MicroVMNotRunning)); + } + + FsDeviceMgr::update_device_ratelimiters(vm.device_manager_mut(), config) + .map(|_| VmmData::Empty) + .map_err(VmmActionError::FsDevice) + } +} + +fn handle_cpu_topology( + cpu_topology: &CpuTopology, + vcpu_count: u8, +) -> std::result::Result<&CpuTopology, VmmActionError> { + // Check if dies_per_socket, cores_per_die, threads_per_core and socket number is valid + if cpu_topology.threads_per_core < 1 || cpu_topology.threads_per_core > 2 { + return Err(MachineConfig(InvalidThreadsPerCore( + cpu_topology.threads_per_core, + ))); + } + let vcpu_count_from_topo = cpu_topology + .sockets + .checked_mul(cpu_topology.dies_per_socket) + .ok_or(MachineConfig(VcpuCountExceedsMaximum))? + .checked_mul(cpu_topology.cores_per_die) + .ok_or(MachineConfig(VcpuCountExceedsMaximum))? + .checked_mul(cpu_topology.threads_per_core) + .ok_or(MachineConfig(VcpuCountExceedsMaximum))?; + if vcpu_count_from_topo > MAX_SUPPORTED_VCPUS { + return Err(MachineConfig(VcpuCountExceedsMaximum)); + } + if vcpu_count_from_topo < vcpu_count { + return Err(MachineConfig(InvalidCpuTopology(vcpu_count_from_topo))); + } + + Ok(cpu_topology) +} diff --git a/src/dragonball/src/config_manager.rs b/src/dragonball/src/config_manager.rs new file mode 100644 index 0000000000..f855be1266 --- /dev/null +++ b/src/dragonball/src/config_manager.rs @@ -0,0 +1,760 @@ +// Copyright (C) 2020-2022 Alibaba Cloud. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use std::convert::TryInto; +use std::io; +use std::ops::{Index, IndexMut}; +use std::sync::Arc; + +use dbs_device::DeviceIo; +use dbs_utils::rate_limiter::{RateLimiter, TokenBucket}; +use serde_derive::{Deserialize, Serialize}; + +/// Get bucket update for rate limiter. +#[macro_export] +macro_rules! get_bucket_update { + ($self:ident, $rate_limiter: ident, $metric: ident) => {{ + match &$self.$rate_limiter { + Some(rl_cfg) => { + let tb_cfg = &rl_cfg.$metric; + dbs_utils::rate_limiter::RateLimiter::make_bucket( + tb_cfg.size, + tb_cfg.one_time_burst, + tb_cfg.refill_time, + ) + // Updated active rate-limiter. + .map(dbs_utils::rate_limiter::BucketUpdate::Update) + // Updated/deactivated rate-limiter + .unwrap_or(dbs_utils::rate_limiter::BucketUpdate::Disabled) + } + // No update to the rate-limiter. + None => dbs_utils::rate_limiter::BucketUpdate::None, + } + }}; +} + +/// Trait for generic configuration information. +pub trait ConfigItem { + /// Related errors. + type Err; + + /// Get the unique identifier of the configuration item. + fn id(&self) -> &str; + + /// Check whether current configuration item conflicts with another one. + fn check_conflicts(&self, other: &Self) -> std::result::Result<(), Self::Err>; +} + +/// Struct to manage a group of configuration items. +#[derive(Debug, Default, Deserialize, PartialEq, Serialize)] +pub struct ConfigInfos +where + T: ConfigItem + Clone, +{ + configs: Vec, +} + +impl ConfigInfos +where + T: ConfigItem + Clone + Default, +{ + /// Constructor + pub fn new() -> Self { + ConfigInfos::default() + } + + /// Insert a configuration item in the group. + pub fn insert(&mut self, config: T) -> std::result::Result<(), T::Err> { + for item in self.configs.iter() { + config.check_conflicts(item)?; + } + self.configs.push(config); + + Ok(()) + } + + /// Update a configuration item in the group. 
+ pub fn update(&mut self, config: T, err: T::Err) -> std::result::Result<(), T::Err> { + match self.get_index_by_id(&config) { + None => Err(err), + Some(index) => { + for (idx, item) in self.configs.iter().enumerate() { + if idx != index { + config.check_conflicts(item)?; + } + } + self.configs[index] = config; + Ok(()) + } + } + } + + /// Insert or update a configuration item in the group. + pub fn insert_or_update(&mut self, config: T) -> std::result::Result<(), T::Err> { + match self.get_index_by_id(&config) { + None => { + for item in self.configs.iter() { + config.check_conflicts(item)?; + } + + self.configs.push(config) + } + Some(index) => { + for (idx, item) in self.configs.iter().enumerate() { + if idx != index { + config.check_conflicts(item)?; + } + } + self.configs[index] = config; + } + } + + Ok(()) + } + + /// Remove the matching configuration entry. + pub fn remove(&mut self, config: &T) -> Option { + if let Some(index) = self.get_index_by_id(config) { + Some(self.configs.remove(index)) + } else { + None + } + } + + /// Returns an immutable iterator over the config items + pub fn iter(&self) -> ::std::slice::Iter { + self.configs.iter() + } + + /// Get the configuration entry with matching ID. + pub fn get_by_id(&self, item: &T) -> Option<&T> { + let id = item.id(); + + self.configs.iter().rfind(|cfg| cfg.id() == id) + } + + fn get_index_by_id(&self, item: &T) -> Option { + let id = item.id(); + self.configs.iter().position(|cfg| cfg.id() == id) + } +} + +impl Clone for ConfigInfos +where + T: ConfigItem + Clone, +{ + fn clone(&self) -> Self { + ConfigInfos { + configs: self.configs.clone(), + } + } +} + +/// Struct to maintain configuration information for a device. +pub struct DeviceConfigInfo +where + T: ConfigItem + Clone, +{ + /// Configuration information for the device object. + pub config: T, + /// The associated device object. 
+ pub device: Option>, +} + +impl DeviceConfigInfo +where + T: ConfigItem + Clone, +{ + /// Create a new instance of ['DeviceInfoGroup']. + pub fn new(config: T) -> Self { + DeviceConfigInfo { + config, + device: None, + } + } + + /// Create a new instance of ['DeviceInfoGroup'] with optional device. + pub fn new_with_device(config: T, device: Option>) -> Self { + DeviceConfigInfo { config, device } + } + + /// Set the device object associated with the configuration. + pub fn set_device(&mut self, device: Arc) { + self.device = Some(device); + } +} + +impl Clone for DeviceConfigInfo +where + T: ConfigItem + Clone, +{ + fn clone(&self) -> Self { + DeviceConfigInfo::new_with_device(self.config.clone(), self.device.clone()) + } +} + +/// Struct to maintain configuration information for a group of devices. +pub struct DeviceConfigInfos +where + T: ConfigItem + Clone, +{ + info_list: Vec>, +} + +impl Default for DeviceConfigInfos +where + T: ConfigItem + Clone, +{ + fn default() -> Self { + Self::new() + } +} + +impl DeviceConfigInfos +where + T: ConfigItem + Clone, +{ + /// Create a new instance of ['DeviceConfigInfos']. + pub fn new() -> Self { + DeviceConfigInfos { + info_list: Vec::new(), + } + } + + /// Insert or update configuration information for a device. + pub fn insert_or_update(&mut self, config: &T) -> std::result::Result { + let device_info = DeviceConfigInfo::new(config.clone()); + Ok(match self.get_index_by_id(config) { + Some(index) => { + for (idx, info) in self.info_list.iter().enumerate() { + if idx != index { + info.config.check_conflicts(config)?; + } + } + self.info_list[index] = device_info; + index + } + None => { + for info in self.info_list.iter() { + info.config.check_conflicts(config)?; + } + self.info_list.push(device_info); + self.info_list.len() - 1 + } + }) + } + + /// Remove a device configuration information object. 
+ pub fn remove(&mut self, index: usize) -> Option> { + if self.info_list.len() > index { + Some(self.info_list.remove(index)) + } else { + None + } + } + + /// Get number of device configuration information objects. + pub fn len(&self) -> usize { + self.info_list.len() + } + + /// Returns true if the device configuration information objects is empty. + pub fn is_empty(&self) -> bool { + self.info_list.len() == 0 + } + + /// Add a device configuration information object at the tail. + pub fn push(&mut self, info: DeviceConfigInfo) { + self.info_list.push(info); + } + + /// Iterator for configuration information objects. + pub fn iter(&self) -> std::slice::Iter> { + self.info_list.iter() + } + + /// Mutable iterator for configuration information objects. + pub fn iter_mut(&mut self) -> std::slice::IterMut> { + self.info_list.iter_mut() + } + + fn get_index_by_id(&self, config: &T) -> Option { + self.info_list + .iter() + .position(|info| info.config.id().eq(config.id())) + } +} + +impl Index for DeviceConfigInfos +where + T: ConfigItem + Clone, +{ + type Output = DeviceConfigInfo; + fn index(&self, idx: usize) -> &Self::Output { + &self.info_list[idx] + } +} + +impl IndexMut for DeviceConfigInfos +where + T: ConfigItem + Clone, +{ + fn index_mut(&mut self, idx: usize) -> &mut Self::Output { + &mut self.info_list[idx] + } +} + +impl Clone for DeviceConfigInfos +where + T: ConfigItem + Clone, +{ + fn clone(&self) -> Self { + DeviceConfigInfos { + info_list: self.info_list.clone(), + } + } +} + +/// Configuration information for RateLimiter token bucket. +#[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)] +pub struct TokenBucketConfigInfo { + /// The size for the token bucket. A TokenBucket of `size` total capacity will take `refill_time` + /// milliseconds to go from zero tokens to total capacity. + pub size: u64, + /// Number of free initial tokens, that can be consumed at no cost. 
+ pub one_time_burst: u64, + /// Complete refill time in milliseconds. + pub refill_time: u64, +} + +impl TokenBucketConfigInfo { + fn resize(&mut self, n: u64) { + if n != 0 { + self.size /= n; + self.one_time_burst /= n; + } + } +} + +impl From for TokenBucket { + fn from(t: TokenBucketConfigInfo) -> TokenBucket { + (&t).into() + } +} + +impl From<&TokenBucketConfigInfo> for TokenBucket { + fn from(t: &TokenBucketConfigInfo) -> TokenBucket { + TokenBucket::new(t.size, t.one_time_burst, t.refill_time) + } +} + +/// Configuration information for RateLimiter objects. +#[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)] +pub struct RateLimiterConfigInfo { + /// Data used to initialize the RateLimiter::bandwidth bucket. + pub bandwidth: TokenBucketConfigInfo, + /// Data used to initialize the RateLimiter::ops bucket. + pub ops: TokenBucketConfigInfo, +} + +impl RateLimiterConfigInfo { + /// Update the bandwidth budget configuration. + pub fn update_bandwidth(&mut self, new_config: TokenBucketConfigInfo) { + self.bandwidth = new_config; + } + + /// Update the ops budget configuration. + pub fn update_ops(&mut self, new_config: TokenBucketConfigInfo) { + self.ops = new_config; + } + + /// resize the limiter to its 1/n. 
+ pub fn resize(&mut self, n: u64) { + self.bandwidth.resize(n); + self.ops.resize(n); + } +} + +impl TryInto for &RateLimiterConfigInfo { + type Error = io::Error; + + fn try_into(self) -> Result { + RateLimiter::new( + self.bandwidth.size, + self.bandwidth.one_time_burst, + self.bandwidth.refill_time, + self.ops.size, + self.ops.one_time_burst, + self.ops.refill_time, + ) + } +} + +impl TryInto for RateLimiterConfigInfo { + type Error = io::Error; + + fn try_into(self) -> Result { + RateLimiter::new( + self.bandwidth.size, + self.bandwidth.one_time_burst, + self.bandwidth.refill_time, + self.ops.size, + self.ops.one_time_burst, + self.ops.refill_time, + ) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[derive(Debug, thiserror::Error)] + pub enum DummyError { + #[error("configuration entry exists")] + Exist, + } + + #[derive(Clone, Debug, Default)] + pub struct DummyConfigInfo { + id: String, + content: String, + } + + impl ConfigItem for DummyConfigInfo { + type Err = DummyError; + + fn id(&self) -> &str { + &self.id + } + + fn check_conflicts(&self, other: &Self) -> Result<(), DummyError> { + if self.id == other.id || self.content == other.content { + Err(DummyError::Exist) + } else { + Ok(()) + } + } + } + + type DummyConfigInfos = ConfigInfos; + + #[test] + fn test_insert_config_info() { + let mut configs = DummyConfigInfos::new(); + + let config1 = DummyConfigInfo { + id: "1".to_owned(), + content: "a".to_owned(), + }; + configs.insert(config1).unwrap(); + assert_eq!(configs.configs.len(), 1); + assert_eq!(configs.configs[0].id, "1"); + assert_eq!(configs.configs[0].content, "a"); + + // Test case: cannot insert new item with the same id. 
+ let config2 = DummyConfigInfo { + id: "1".to_owned(), + content: "b".to_owned(), + }; + configs.insert(config2).unwrap_err(); + assert_eq!(configs.configs.len(), 1); + assert_eq!(configs.configs[0].id, "1"); + assert_eq!(configs.configs[0].content, "a"); + + let config3 = DummyConfigInfo { + id: "2".to_owned(), + content: "c".to_owned(), + }; + configs.insert(config3).unwrap(); + assert_eq!(configs.configs.len(), 2); + assert_eq!(configs.configs[0].id, "1"); + assert_eq!(configs.configs[0].content, "a"); + assert_eq!(configs.configs[1].id, "2"); + assert_eq!(configs.configs[1].content, "c"); + + // Test case: cannot insert new item with the same content. + let config4 = DummyConfigInfo { + id: "3".to_owned(), + content: "c".to_owned(), + }; + configs.insert(config4).unwrap_err(); + assert_eq!(configs.configs.len(), 2); + assert_eq!(configs.configs[0].id, "1"); + assert_eq!(configs.configs[0].content, "a"); + assert_eq!(configs.configs[1].id, "2"); + assert_eq!(configs.configs[1].content, "c"); + } + + #[test] + fn test_update_config_info() { + let mut configs = DummyConfigInfos::new(); + + let config1 = DummyConfigInfo { + id: "1".to_owned(), + content: "a".to_owned(), + }; + configs.insert(config1).unwrap(); + assert_eq!(configs.configs.len(), 1); + assert_eq!(configs.configs[0].id, "1"); + assert_eq!(configs.configs[0].content, "a"); + + // Test case: succeed to update an existing entry + let config2 = DummyConfigInfo { + id: "1".to_owned(), + content: "b".to_owned(), + }; + configs.update(config2, DummyError::Exist).unwrap(); + assert_eq!(configs.configs.len(), 1); + assert_eq!(configs.configs[0].id, "1"); + assert_eq!(configs.configs[0].content, "b"); + + // Test case: cannot update a non-existing entry + let config3 = DummyConfigInfo { + id: "2".to_owned(), + content: "c".to_owned(), + }; + configs.update(config3, DummyError::Exist).unwrap_err(); + assert_eq!(configs.configs.len(), 1); + assert_eq!(configs.configs[0].id, "1"); + 
assert_eq!(configs.configs[0].content, "b"); + + // Test case: cannot update an entry with conflicting content + let config4 = DummyConfigInfo { + id: "2".to_owned(), + content: "c".to_owned(), + }; + configs.insert(config4).unwrap(); + let config5 = DummyConfigInfo { + id: "1".to_owned(), + content: "c".to_owned(), + }; + configs.update(config5, DummyError::Exist).unwrap_err(); + } + + #[test] + fn test_insert_or_update_config_info() { + let mut configs = DummyConfigInfos::new(); + + let config1 = DummyConfigInfo { + id: "1".to_owned(), + content: "a".to_owned(), + }; + configs.insert_or_update(config1).unwrap(); + assert_eq!(configs.configs.len(), 1); + assert_eq!(configs.configs[0].id, "1"); + assert_eq!(configs.configs[0].content, "a"); + + // Test case: succeed to update an existing entry + let config2 = DummyConfigInfo { + id: "1".to_owned(), + content: "b".to_owned(), + }; + configs.insert_or_update(config2.clone()).unwrap(); + assert_eq!(configs.configs.len(), 1); + assert_eq!(configs.configs[0].id, "1"); + assert_eq!(configs.configs[0].content, "b"); + + // Add a second entry + let config3 = DummyConfigInfo { + id: "2".to_owned(), + content: "c".to_owned(), + }; + configs.insert_or_update(config3.clone()).unwrap(); + assert_eq!(configs.configs.len(), 2); + assert_eq!(configs.configs[0].id, "1"); + assert_eq!(configs.configs[0].content, "b"); + assert_eq!(configs.configs[1].id, "2"); + assert_eq!(configs.configs[1].content, "c"); + + // Lookup the first entry + let config4 = configs + .get_by_id(&DummyConfigInfo { + id: "1".to_owned(), + content: "b".to_owned(), + }) + .unwrap(); + assert_eq!(config4.id, config2.id); + assert_eq!(config4.content, config2.content); + + // Lookup the second entry + let config5 = configs + .get_by_id(&DummyConfigInfo { + id: "2".to_owned(), + content: "c".to_owned(), + }) + .unwrap(); + assert_eq!(config5.id, config3.id); + assert_eq!(config5.content, config3.content); + + // Test case: can't insert an entry with conflicting 
content + let config6 = DummyConfigInfo { + id: "3".to_owned(), + content: "c".to_owned(), + }; + configs.insert_or_update(config6).unwrap_err(); + assert_eq!(configs.configs.len(), 2); + assert_eq!(configs.configs[0].id, "1"); + assert_eq!(configs.configs[0].content, "b"); + assert_eq!(configs.configs[1].id, "2"); + assert_eq!(configs.configs[1].content, "c"); + } + + #[test] + fn test_remove_config_info() { + let mut configs = DummyConfigInfos::new(); + + let config1 = DummyConfigInfo { + id: "1".to_owned(), + content: "a".to_owned(), + }; + configs.insert_or_update(config1).unwrap(); + let config2 = DummyConfigInfo { + id: "1".to_owned(), + content: "b".to_owned(), + }; + configs.insert_or_update(config2.clone()).unwrap(); + let config3 = DummyConfigInfo { + id: "2".to_owned(), + content: "c".to_owned(), + }; + configs.insert_or_update(config3.clone()).unwrap(); + assert_eq!(configs.configs.len(), 2); + assert_eq!(configs.configs[0].id, "1"); + assert_eq!(configs.configs[0].content, "b"); + assert_eq!(configs.configs[1].id, "2"); + assert_eq!(configs.configs[1].content, "c"); + + let config4 = configs + .remove(&DummyConfigInfo { + id: "1".to_owned(), + content: "no value".to_owned(), + }) + .unwrap(); + assert_eq!(config4.id, config2.id); + assert_eq!(config4.content, config2.content); + assert_eq!(configs.configs.len(), 1); + assert_eq!(configs.configs[0].id, "2"); + assert_eq!(configs.configs[0].content, "c"); + + let config5 = configs + .remove(&DummyConfigInfo { + id: "2".to_owned(), + content: "no value".to_owned(), + }) + .unwrap(); + assert_eq!(config5.id, config3.id); + assert_eq!(config5.content, config3.content); + assert_eq!(configs.configs.len(), 0); + } + + type DummyDeviceInfoList = DeviceConfigInfos; + + #[test] + fn test_insert_or_update_device_info() { + let mut configs = DummyDeviceInfoList::new(); + + let config1 = DummyConfigInfo { + id: "1".to_owned(), + content: "a".to_owned(), + }; + configs.insert_or_update(&config1).unwrap(); + 
assert_eq!(configs.len(), 1); + assert_eq!(configs[0].config.id, "1"); + assert_eq!(configs[0].config.content, "a"); + + // Test case: succeed to update an existing entry + let config2 = DummyConfigInfo { + id: "1".to_owned(), + content: "b".to_owned(), + }; + configs.insert_or_update(&config2 /* */).unwrap(); + assert_eq!(configs.len(), 1); + assert_eq!(configs[0].config.id, "1"); + assert_eq!(configs[0].config.content, "b"); + + // Add a second entry + let config3 = DummyConfigInfo { + id: "2".to_owned(), + content: "c".to_owned(), + }; + configs.insert_or_update(&config3).unwrap(); + assert_eq!(configs.len(), 2); + assert_eq!(configs[0].config.id, "1"); + assert_eq!(configs[0].config.content, "b"); + assert_eq!(configs[1].config.id, "2"); + assert_eq!(configs[1].config.content, "c"); + + // Lookup the first entry + let config4_id = configs + .get_index_by_id(&DummyConfigInfo { + id: "1".to_owned(), + content: "b".to_owned(), + }) + .unwrap(); + let config4 = &configs[config4_id].config; + assert_eq!(config4.id, config2.id); + assert_eq!(config4.content, config2.content); + + // Lookup the second entry + let config5_id = configs + .get_index_by_id(&DummyConfigInfo { + id: "2".to_owned(), + content: "c".to_owned(), + }) + .unwrap(); + let config5 = &configs[config5_id].config; + assert_eq!(config5.id, config3.id); + assert_eq!(config5.content, config3.content); + + // Test case: can't insert an entry with conflicting content + let config6 = DummyConfigInfo { + id: "3".to_owned(), + content: "c".to_owned(), + }; + configs.insert_or_update(&config6).unwrap_err(); + assert_eq!(configs.len(), 2); + assert_eq!(configs[0].config.id, "1"); + assert_eq!(configs[0].config.content, "b"); + assert_eq!(configs[1].config.id, "2"); + assert_eq!(configs[1].config.content, "c"); + } + + #[test] + fn test_remove_device_info() { + let mut configs = DummyDeviceInfoList::new(); + + let config1 = DummyConfigInfo { + id: "1".to_owned(), + content: "a".to_owned(), + }; + 
configs.insert_or_update(&config1).unwrap(); + let config2 = DummyConfigInfo { + id: "1".to_owned(), + content: "b".to_owned(), + }; + configs.insert_or_update(&config2).unwrap(); + let config3 = DummyConfigInfo { + id: "2".to_owned(), + content: "c".to_owned(), + }; + configs.insert_or_update(&config3).unwrap(); + assert_eq!(configs.len(), 2); + assert_eq!(configs[0].config.id, "1"); + assert_eq!(configs[0].config.content, "b"); + assert_eq!(configs[1].config.id, "2"); + assert_eq!(configs[1].config.content, "c"); + + let config4 = configs.remove(0).unwrap().config; + assert_eq!(config4.id, config2.id); + assert_eq!(config4.content, config2.content); + assert_eq!(configs.len(), 1); + assert_eq!(configs[0].config.id, "2"); + assert_eq!(configs[0].config.content, "c"); + + let config5 = configs.remove(0).unwrap().config; + assert_eq!(config5.id, config3.id); + assert_eq!(config5.content, config3.content); + assert_eq!(configs.len(), 0); + } +} diff --git a/src/dragonball/src/device_manager/blk_dev_mgr.rs b/src/dragonball/src/device_manager/blk_dev_mgr.rs new file mode 100644 index 0000000000..e4688b4f6f --- /dev/null +++ b/src/dragonball/src/device_manager/blk_dev_mgr.rs @@ -0,0 +1,773 @@ +// Copyright 2020-2022 Alibaba, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +//! Device manager for virtio-blk and vhost-user-blk devices. 
+use std::collections::{vec_deque, VecDeque}; +use std::convert::TryInto; +use std::fs::OpenOptions; +use std::os::unix::fs::OpenOptionsExt; +use std::os::unix::io::AsRawFd; +use std::path::{Path, PathBuf}; +use std::sync::Arc; + +use dbs_virtio_devices as virtio; +use dbs_virtio_devices::block::{aio::Aio, io_uring::IoUring, Block, LocalFile, Ufile}; +use serde_derive::{Deserialize, Serialize}; + +use crate::address_space_manager::GuestAddressSpaceImpl; +use crate::config_manager::{ConfigItem, DeviceConfigInfo, RateLimiterConfigInfo}; +use crate::device_manager::blk_dev_mgr::BlockDeviceError::InvalidDeviceId; +use crate::device_manager::{DeviceManager, DeviceMgrError, DeviceOpContext}; +use crate::get_bucket_update; +use crate::vm::KernelConfigInfo; + +use super::DbsMmioV2Device; + +// The flag of whether to use the shared irq. +const USE_SHARED_IRQ: bool = true; +// The flag of whether to use the generic irq. +const USE_GENERIC_IRQ: bool = true; + +macro_rules! info( + ($l:expr, $($args:tt)+) => { + slog::info!($l, $($args)+; slog::o!("subsystem" => "block_manager")) + }; +); + +macro_rules! error( + ($l:expr, $($args:tt)+) => { + slog::error!($l, $($args)+; slog::o!("subsystem" => "block_manager")) + }; +); + +/// Default queue size for VirtIo block devices. +pub const QUEUE_SIZE: u16 = 128; + +/// Errors associated with the operations allowed on a drive. +#[derive(Debug, thiserror::Error)] +pub enum BlockDeviceError { + /// Invalid VM instance ID. + #[error("invalid VM instance id")] + InvalidVMID, + + /// The block device path is invalid. + #[error("invalid block device path '{0}'")] + InvalidBlockDevicePath(PathBuf), + + /// The block device type is invalid. + #[error("invalid block device type")] + InvalidBlockDeviceType, + + /// The block device path was already used for a different drive. + #[error("block device path '{0}' already exists")] + BlockDevicePathAlreadyExists(PathBuf), + + /// The device id doesn't exist. 
+ #[error("invalid block device id '{0}'")] + InvalidDeviceId(String), + + /// Cannot perform the requested operation after booting the microVM. + #[error("block device does not support runtime update")] + UpdateNotAllowedPostBoot, + + /// A root block device was already added. + #[error("could not add multiple virtual machine root devices")] + RootBlockDeviceAlreadyAdded, + + /// Failed to send patch message to block epoll handler. + #[error("could not send patch message to the block epoll handler")] + BlockEpollHanderSendFail, + + /// Failure from device manager, + #[error("device manager errors: {0}")] + DeviceManager(#[from] DeviceMgrError), + + /// Failure from virtio subsystem. + #[error(transparent)] + Virtio(virtio::Error), + + /// Unable to seek the block device backing file due to invalid permissions or + /// the file was deleted/corrupted. + #[error("cannot create block device: {0}")] + CreateBlockDevice(#[source] virtio::Error), + + /// Cannot open the block device backing file. + #[error("cannot open the block device backing file: {0}")] + OpenBlockDevice(#[source] std::io::Error), + + /// Cannot initialize a MMIO Block Device or add a device to the MMIO Bus. + #[error("failure while registering block device: {0}")] + RegisterBlockDevice(#[source] DeviceMgrError), +} + +/// Type of low level storage device/protocol for virtio-blk devices. +#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)] +pub enum BlockDeviceType { + /// Unknown low level device type. + Unknown, + /// Vhost-user-blk based low level device. + /// SPOOL is a reliable NVMe virtualization system for the cloud environment. + /// You could learn more SPOOL here: https://www.usenix.org/conference/atc20/presentation/xue + Spool, + /// Local disk/file based low level device. + RawBlock, +} + +impl BlockDeviceType { + /// Get type of low level storage device/protocol by parsing `path`. 
+ pub fn get_type(path: &str) -> BlockDeviceType { + // SPOOL path should be started with "spool", e.g. "spool:/device1" + if path.starts_with("spool:/") { + BlockDeviceType::Spool + } else { + BlockDeviceType::RawBlock + } + } +} + +/// Configuration information for a block device. +#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] +pub struct BlockDeviceConfigUpdateInfo { + /// Unique identifier of the drive. + pub drive_id: String, + /// Rate Limiter for I/O operations. + pub rate_limiter: Option, +} + +impl BlockDeviceConfigUpdateInfo { + /// Provides a `BucketUpdate` description for the bandwidth rate limiter. + pub fn bytes(&self) -> dbs_utils::rate_limiter::BucketUpdate { + get_bucket_update!(self, rate_limiter, bandwidth) + } + /// Provides a `BucketUpdate` description for the ops rate limiter. + pub fn ops(&self) -> dbs_utils::rate_limiter::BucketUpdate { + get_bucket_update!(self, rate_limiter, ops) + } +} + +/// Configuration information for a block device. +#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] +pub struct BlockDeviceConfigInfo { + /// Unique identifier of the drive. + pub drive_id: String, + /// Type of low level storage/protocol. + pub device_type: BlockDeviceType, + /// Path of the drive. + pub path_on_host: PathBuf, + /// If set to true, it makes the current device the root block device. + /// Setting this flag to true will mount the block device in the + /// guest under /dev/vda unless the part_uuid is present. + pub is_root_device: bool, + /// Part-UUID. Represents the unique id of the boot partition of this device. + /// It is optional and it will be used only if the `is_root_device` field is true. + pub part_uuid: Option, + /// If set to true, the drive is opened in read-only mode. Otherwise, the + /// drive is opened as read-write. + pub is_read_only: bool, + /// If set to false, the drive is opened with buffered I/O mode. Otherwise, the + /// drive is opened with direct I/O mode. 
+ pub is_direct: bool, + /// Don't close `path_on_host` file when dropping the device. + pub no_drop: bool, + /// Block device multi-queue + pub num_queues: usize, + /// Virtio queue size. Size: byte + pub queue_size: u16, + /// Rate Limiter for I/O operations. + pub rate_limiter: Option, + /// Use shared irq + pub use_shared_irq: Option, + /// Use generic irq + pub use_generic_irq: Option, +} + +impl std::default::Default for BlockDeviceConfigInfo { + fn default() -> Self { + Self { + drive_id: String::default(), + device_type: BlockDeviceType::RawBlock, + path_on_host: PathBuf::default(), + is_root_device: false, + part_uuid: None, + is_read_only: false, + is_direct: Self::default_direct(), + no_drop: Self::default_no_drop(), + num_queues: Self::default_num_queues(), + queue_size: 256, + rate_limiter: None, + use_shared_irq: None, + use_generic_irq: None, + } + } +} + +impl BlockDeviceConfigInfo { + /// Get default queue numbers + pub fn default_num_queues() -> usize { + 1 + } + + /// Get default value of is_direct switch + pub fn default_direct() -> bool { + true + } + + /// Get default value of no_drop switch + pub fn default_no_drop() -> bool { + false + } + + /// Get type of low level storage/protocol. + pub fn device_type(&self) -> BlockDeviceType { + self.device_type + } + + /// Returns a reference to `path_on_host`. + pub fn path_on_host(&self) -> &PathBuf { + &self.path_on_host + } + + /// Returns a reference to the part_uuid. + pub fn get_part_uuid(&self) -> Option<&String> { + self.part_uuid.as_ref() + } + + /// Checks whether the drive had read only permissions. + pub fn is_read_only(&self) -> bool { + self.is_read_only + } + + /// Checks whether the drive uses direct I/O + pub fn is_direct(&self) -> bool { + self.is_direct + } + + /// Get number and size of queues supported. 
+ pub fn queue_sizes(&self) -> Vec { + (0..self.num_queues) + .map(|_| self.queue_size) + .collect::>() + } +} + +impl ConfigItem for BlockDeviceConfigInfo { + type Err = BlockDeviceError; + + fn id(&self) -> &str { + &self.drive_id + } + + fn check_conflicts(&self, other: &Self) -> Result<(), BlockDeviceError> { + if self.drive_id == other.drive_id { + Ok(()) + } else if self.path_on_host == other.path_on_host { + Err(BlockDeviceError::BlockDevicePathAlreadyExists( + self.path_on_host.clone(), + )) + } else { + Ok(()) + } + } +} + +impl std::fmt::Debug for BlockDeviceInfo { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self.config) + } +} + +/// Block Device Info +pub type BlockDeviceInfo = DeviceConfigInfo; + +/// Wrapper for the collection that holds all the Block Devices Configs +//#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] +#[derive(Clone)] +pub struct BlockDeviceMgr { + /// A list of `BlockDeviceInfo` objects. + info_list: VecDeque, + has_root_block: bool, + has_part_uuid_root: bool, + read_only_root: bool, + part_uuid: Option, + use_shared_irq: bool, +} + +impl BlockDeviceMgr { + /// returns a front-to-back iterator. + pub fn iter(&self) -> vec_deque::Iter { + self.info_list.iter() + } + + /// Checks whether any of the added BlockDevice is the root. + pub fn has_root_block_device(&self) -> bool { + self.has_root_block + } + + /// Checks whether the root device is configured using a part UUID. + pub fn has_part_uuid_root(&self) -> bool { + self.has_part_uuid_root + } + + /// Checks whether the root device has read-only permisssions. + pub fn is_read_only_root(&self) -> bool { + self.read_only_root + } + + /// Gets the index of the device with the specified `drive_id` if it exists in the list. 
+ pub fn get_index_of_drive_id(&self, id: &str) -> Option { + self.info_list + .iter() + .position(|info| info.config.id().eq(id)) + } + + /// Gets the 'BlockDeviceConfigInfo' of the device with the specified `drive_id` if it exists in the list. + pub fn get_config_of_drive_id(&self, drive_id: &str) -> Option { + match self.get_index_of_drive_id(drive_id) { + Some(index) => { + let config = self.info_list.get(index).unwrap().config.clone(); + Some(config) + } + None => None, + } + } + + /// Inserts `block_device_config` in the block device configuration list. + /// If an entry with the same id already exists, it will attempt to update + /// the existing entry. + /// Inserting a secondary root block device will fail. + pub fn insert_device( + device_mgr: &mut DeviceManager, + mut ctx: DeviceOpContext, + config: BlockDeviceConfigInfo, + ) -> std::result::Result<(), BlockDeviceError> { + if !cfg!(feature = "hotplug") && ctx.is_hotplug { + return Err(BlockDeviceError::UpdateNotAllowedPostBoot); + } + + let mgr = &mut device_mgr.block_manager; + + // If the id of the drive already exists in the list, the operation is update. + match mgr.get_index_of_drive_id(config.id()) { + Some(index) => { + // No support for runtime update yet. 
+ if ctx.is_hotplug { + Err(BlockDeviceError::BlockDevicePathAlreadyExists( + config.path_on_host.clone(), + )) + } else { + for (idx, info) in mgr.info_list.iter().enumerate() { + if idx != index { + info.config.check_conflicts(&config)?; + } + } + mgr.update(index, config) + } + } + None => { + for info in mgr.info_list.iter() { + info.config.check_conflicts(&config)?; + } + let index = mgr.create(config.clone())?; + if !ctx.is_hotplug { + return Ok(()); + } + + match config.device_type { + BlockDeviceType::RawBlock => { + let device = Self::create_blk_device(&config, &mut ctx) + .map_err(BlockDeviceError::Virtio)?; + let dev = DeviceManager::create_mmio_virtio_device( + device, + &mut ctx, + config.use_shared_irq.unwrap_or(mgr.use_shared_irq), + config.use_generic_irq.unwrap_or(USE_GENERIC_IRQ), + ) + .map_err(BlockDeviceError::DeviceManager)?; + mgr.update_device_by_index(index, Arc::clone(&dev))?; + // live-upgrade need save/restore device from info.device. + mgr.info_list[index].set_device(dev.clone()); + ctx.insert_hotplug_mmio_device(&dev, None).map_err(|e| { + let logger = ctx.logger().new(slog::o!()); + BlockDeviceMgr::remove_device(device_mgr, ctx, &config.drive_id) + .unwrap(); + error!( + logger, + "failed to hot-add virtio block device {}, {:?}", + &config.drive_id, + e + ); + BlockDeviceError::DeviceManager(e) + }) + } + _ => Err(BlockDeviceError::InvalidBlockDeviceType), + } + } + } + } + + /// Attaches all block devices from the BlockDevicesConfig. 
+ pub fn attach_devices( + &mut self, + ctx: &mut DeviceOpContext, + ) -> std::result::Result<(), BlockDeviceError> { + for info in self.info_list.iter_mut() { + match info.config.device_type { + BlockDeviceType::RawBlock => { + info!( + ctx.logger(), + "attach virtio-blk device, drive_id {}, path {}", + info.config.drive_id, + info.config.path_on_host.to_str().unwrap_or("") + ); + let device = Self::create_blk_device(&info.config, ctx) + .map_err(BlockDeviceError::Virtio)?; + let device = DeviceManager::create_mmio_virtio_device( + device, + ctx, + info.config.use_shared_irq.unwrap_or(self.use_shared_irq), + info.config.use_generic_irq.unwrap_or(USE_GENERIC_IRQ), + ) + .map_err(BlockDeviceError::RegisterBlockDevice)?; + info.device = Some(device); + } + _ => { + return Err(BlockDeviceError::OpenBlockDevice( + std::io::Error::from_raw_os_error(libc::EINVAL), + )); + } + } + } + + Ok(()) + } + + /// Removes all virtio-blk devices + pub fn remove_devices(&mut self, ctx: &mut DeviceOpContext) -> Result<(), DeviceMgrError> { + while let Some(mut info) = self.info_list.pop_back() { + info!(ctx.logger(), "remove drive {}", info.config.drive_id); + if let Some(device) = info.device.take() { + DeviceManager::destroy_mmio_virtio_device(device, ctx)?; + } + } + + Ok(()) + } + + fn remove(&mut self, drive_id: &str) -> Option { + match self.get_index_of_drive_id(drive_id) { + Some(index) => self.info_list.remove(index), + None => None, + } + } + + /// remove a block device, it basically is the inverse operation of `insert_device`` + pub fn remove_device( + dev_mgr: &mut DeviceManager, + mut ctx: DeviceOpContext, + drive_id: &str, + ) -> std::result::Result<(), BlockDeviceError> { + if !cfg!(feature = "hotplug") { + return Err(BlockDeviceError::UpdateNotAllowedPostBoot); + } + + let mgr = &mut dev_mgr.block_manager; + match mgr.remove(drive_id) { + Some(mut info) => { + info!(ctx.logger(), "remove drive {}", info.config.drive_id); + if let Some(device) = info.device.take() { + 
DeviceManager::destroy_mmio_virtio_device(device, &mut ctx) + .map_err(BlockDeviceError::DeviceManager)?; + } + } + None => return Err(BlockDeviceError::InvalidDeviceId(drive_id.to_owned())), + } + + Ok(()) + } + + fn create_blk_device( + cfg: &BlockDeviceConfigInfo, + ctx: &mut DeviceOpContext, + ) -> std::result::Result>, virtio::Error> { + let epoll_mgr = ctx.epoll_mgr.clone().ok_or(virtio::Error::InvalidInput)?; + + let mut block_files: Vec> = vec![]; + + match cfg.device_type { + BlockDeviceType::RawBlock => { + let custom_flags = if cfg.is_direct() { + info!( + ctx.logger(), + "Open block device \"{}\" in direct mode.", + cfg.path_on_host().display() + ); + libc::O_DIRECT + } else { + info!( + ctx.logger(), + "Open block device \"{}\" in buffer mode.", + cfg.path_on_host().display(), + ); + 0 + }; + let io_uring_supported = IoUring::is_supported(); + for i in 0..cfg.num_queues { + let queue_size = cfg.queue_sizes()[i] as u32; + let file = OpenOptions::new() + .read(true) + .custom_flags(custom_flags) + .write(!cfg.is_read_only()) + .open(cfg.path_on_host())?; + info!(ctx.logger(), "Queue {}: block file opened", i); + + if io_uring_supported { + info!( + ctx.logger(), + "Queue {}: Using io_uring Raw disk file, queue size {}.", i, queue_size + ); + let io_engine = IoUring::new(file.as_raw_fd(), queue_size)?; + block_files.push(Box::new(LocalFile::new(file, cfg.no_drop, io_engine)?)); + } else { + info!( + ctx.logger(), + "Queue {}: Since io_uring_supported is not enabled, change to default support of Aio Raw disk file, queue size {}", i, queue_size + ); + let io_engine = Aio::new(file.as_raw_fd(), queue_size)?; + block_files.push(Box::new(LocalFile::new(file, cfg.no_drop, io_engine)?)); + } + } + } + _ => { + error!( + ctx.logger(), + "invalid block device type: {:?}", cfg.device_type + ); + return Err(virtio::Error::InvalidInput); + } + }; + + let mut limiters = vec![]; + for _i in 0..cfg.num_queues { + if let Some(limiter) = cfg.rate_limiter.clone().map(|mut 
v| { + v.resize(cfg.num_queues as u64); + v.try_into().unwrap() + }) { + limiters.push(limiter); + } + } + + Ok(Box::new(Block::new( + block_files, + cfg.is_read_only, + Arc::new(cfg.queue_sizes()), + epoll_mgr, + limiters, + )?)) + } + + /// Generated guest kernel commandline related to root block device. + pub fn generate_kernel_boot_args( + &self, + kernel_config: &mut KernelConfigInfo, + ) -> std::result::Result<(), DeviceMgrError> { + // Respect user configuration if kernel_cmdline contains "root=", + // special attention for the case when kernel command line starting with "root=xxx" + let old_kernel_cmdline = format!(" {}", kernel_config.kernel_cmdline().as_str()); + if !old_kernel_cmdline.contains(" root=") && self.has_root_block { + let cmdline = kernel_config.kernel_cmdline_mut(); + if let Some(ref uuid) = self.part_uuid { + cmdline + .insert("root", &format!("PART_UUID={}", uuid)) + .map_err(DeviceMgrError::Cmdline)?; + } else { + cmdline + .insert("root", "/dev/vda") + .map_err(DeviceMgrError::Cmdline)?; + } + if self.read_only_root { + if old_kernel_cmdline.contains(" rw") { + return Err(DeviceMgrError::InvalidOperation); + } + cmdline.insert_str("ro").map_err(DeviceMgrError::Cmdline)?; + } + } + + Ok(()) + } + + /// insert a block device's config. return index on success. 
+ fn create( + &mut self, + block_device_config: BlockDeviceConfigInfo, + ) -> std::result::Result { + self.check_data_file_present(&block_device_config)?; + if self + .get_index_of_drive_path(&block_device_config.path_on_host) + .is_some() + { + return Err(BlockDeviceError::BlockDevicePathAlreadyExists( + block_device_config.path_on_host, + )); + } + + // check whether the Device Config belongs to a root device + // we need to satisfy the condition by which a VMM can only have on root device + if block_device_config.is_root_device { + if self.has_root_block { + return Err(BlockDeviceError::RootBlockDeviceAlreadyAdded); + } else { + self.has_root_block = true; + self.read_only_root = block_device_config.is_read_only; + self.has_part_uuid_root = block_device_config.part_uuid.is_some(); + self.part_uuid = block_device_config.part_uuid.clone(); + // Root Device should be the first in the list whether or not PART_UUID is specified + // in order to avoid bugs in case of switching from part_uuid boot scenarios to + // /dev/vda boot type. + self.info_list + .push_front(BlockDeviceInfo::new(block_device_config)); + Ok(0) + } + } else { + self.info_list + .push_back(BlockDeviceInfo::new(block_device_config)); + Ok(self.info_list.len() - 1) + } + } + + /// Updates a Block Device Config. The update fails if it would result in two + /// root block devices. + fn update( + &mut self, + mut index: usize, + new_config: BlockDeviceConfigInfo, + ) -> std::result::Result<(), BlockDeviceError> { + // Check if the path exists + self.check_data_file_present(&new_config)?; + if let Some(idx) = self.get_index_of_drive_path(&new_config.path_on_host) { + if idx != index { + return Err(BlockDeviceError::BlockDevicePathAlreadyExists( + new_config.path_on_host.clone(), + )); + } + } + + if self.info_list.get(index).is_none() { + return Err(InvalidDeviceId(index.to_string())); + } + // Check if the root block device is being updated. 
+ if self.info_list[index].config.is_root_device { + self.has_root_block = new_config.is_root_device; + self.read_only_root = new_config.is_root_device && new_config.is_read_only; + self.has_part_uuid_root = new_config.part_uuid.is_some(); + self.part_uuid = new_config.part_uuid.clone(); + } else if new_config.is_root_device { + // Check if a second root block device is being added. + if self.has_root_block { + return Err(BlockDeviceError::RootBlockDeviceAlreadyAdded); + } else { + // One of the non-root blocks is becoming root. + self.has_root_block = true; + self.read_only_root = new_config.is_read_only; + self.has_part_uuid_root = new_config.part_uuid.is_some(); + self.part_uuid = new_config.part_uuid.clone(); + + // Make sure the root device is on the first position. + self.info_list.swap(0, index); + // Block config to be updated has moved to first position. + index = 0; + } + } + // Update the config. + self.info_list[index].config = new_config; + + Ok(()) + } + + fn check_data_file_present( + &self, + block_device_config: &BlockDeviceConfigInfo, + ) -> std::result::Result<(), BlockDeviceError> { + if block_device_config.device_type == BlockDeviceType::RawBlock + && !block_device_config.path_on_host.exists() + { + Err(BlockDeviceError::InvalidBlockDevicePath( + block_device_config.path_on_host.clone(), + )) + } else { + Ok(()) + } + } + + fn get_index_of_drive_path(&self, drive_path: &Path) -> Option { + self.info_list + .iter() + .position(|info| info.config.path_on_host.eq(drive_path)) + } + + /// update devce information in `info_list`. The caller of this method is + /// `insert_device` when hotplug is true. + pub fn update_device_by_index( + &mut self, + index: usize, + device: Arc, + ) -> Result<(), BlockDeviceError> { + if let Some(info) = self.info_list.get_mut(index) { + info.device = Some(device); + return Ok(()); + } + + Err(BlockDeviceError::InvalidDeviceId("".to_owned())) + } + + /// Update the ratelimiter settings of a virtio blk device. 
+ pub fn update_device_ratelimiters( + device_mgr: &mut DeviceManager, + new_cfg: BlockDeviceConfigUpdateInfo, + ) -> std::result::Result<(), BlockDeviceError> { + let mgr = &mut device_mgr.block_manager; + match mgr.get_index_of_drive_id(&new_cfg.drive_id) { + Some(index) => { + let config = &mut mgr.info_list[index].config; + config.rate_limiter = new_cfg.rate_limiter.clone(); + let device = mgr.info_list[index] + .device + .as_mut() + .ok_or_else(|| BlockDeviceError::InvalidDeviceId("".to_owned()))?; + if let Some(mmio_dev) = device.as_any().downcast_ref::() { + let guard = mmio_dev.state(); + let inner_dev = guard.get_inner_device(); + if let Some(blk_dev) = inner_dev + .as_any() + .downcast_ref::>() + { + return blk_dev + .set_patch_rate_limiters(new_cfg.bytes(), new_cfg.ops()) + .map(|_p| ()) + .map_err(|_e| BlockDeviceError::BlockEpollHanderSendFail); + } + } + Ok(()) + } + None => Err(BlockDeviceError::InvalidDeviceId(new_cfg.drive_id)), + } + } +} + +impl Default for BlockDeviceMgr { + /// Constructor for the BlockDeviceMgr. It initializes an empty LinkedList. + fn default() -> BlockDeviceMgr { + BlockDeviceMgr { + info_list: VecDeque::::new(), + has_root_block: false, + has_part_uuid_root: false, + read_only_root: false, + part_uuid: None, + use_shared_irq: USE_SHARED_IRQ, + } + } +} diff --git a/src/dragonball/src/device_manager/console_manager.rs b/src/dragonball/src/device_manager/console_manager.rs new file mode 100644 index 0000000000..1e3b2a2f22 --- /dev/null +++ b/src/dragonball/src/device_manager/console_manager.rs @@ -0,0 +1,440 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +//! Virtual machine console device manager. +//! +//! 
A virtual console is composed of
+ pub fn new(epoll_mgr: EpollManager, logger: &slog::Logger) -> Self { + let logger = logger.new(slog::o!("subsystem" => "console_manager")); + ConsoleManager { + epoll_mgr, + logger, + subscriber_id: Default::default(), + backend: None, + } + } + + /// Create a console backend device by using stdio streams. + pub fn create_stdio_console(&mut self, device: Arc>) -> Result<()> { + let stdin_handle = std::io::stdin(); + stdin_handle + .lock() + .set_raw_mode() + .map_err(|e| DeviceMgrError::ConsoleManager(ConsoleManagerError::StdinHandle(e)))?; + + let handler = ConsoleEpollHandler::new(device, Some(stdin_handle), None, &self.logger); + self.subscriber_id = Some(self.epoll_mgr.add_subscriber(Box::new(handler))); + self.backend = Some(Backend::StdinHandle(std::io::stdin())); + + Ok(()) + } + + /// Create s console backend device by using Unix Domain socket. + pub fn create_socket_console( + &mut self, + device: Arc>, + sock_path: String, + ) -> Result<()> { + let sock_listener = Self::bind_domain_socket(&sock_path).map_err(|e| { + DeviceMgrError::ConsoleManager(ConsoleManagerError::CreateSerialSock(e)) + })?; + let handler = ConsoleEpollHandler::new(device, None, Some(sock_listener), &self.logger); + + self.subscriber_id = Some(self.epoll_mgr.add_subscriber(Box::new(handler))); + self.backend = Some(Backend::SockPath(sock_path)); + + Ok(()) + } + + /// Reset the host side terminal to canonical mode. 
+ pub fn reset_console(&self) -> Result<()> { + if let Some(Backend::StdinHandle(stdin_handle)) = self.backend.as_ref() { + stdin_handle + .lock() + .set_canon_mode() + .map_err(|e| DeviceMgrError::ConsoleManager(ConsoleManagerError::StdinHandle(e)))?; + } + + Ok(()) + } + + fn bind_domain_socket(serial_path: &str) -> std::result::Result { + let path = Path::new(serial_path); + if path.is_file() { + let _ = std::fs::remove_file(serial_path); + } + + UnixListener::bind(path) + } +} + +struct ConsoleEpollHandler { + device: Arc>, + stdin_handle: Option, + sock_listener: Option, + sock_conn: Option, + logger: slog::Logger, +} + +impl ConsoleEpollHandler { + fn new( + device: Arc>, + stdin_handle: Option, + sock_listener: Option, + logger: &slog::Logger, + ) -> Self { + ConsoleEpollHandler { + device, + stdin_handle, + sock_listener, + sock_conn: None, + logger: logger.new(slog::o!("subsystem" => "console_manager")), + } + } + + fn uds_listener_accept(&mut self, ops: &mut EventOps) -> std::io::Result<()> { + if self.sock_conn.is_some() { + slog::warn!(self.logger, + "UDS for serial port 1 already exists, reject the new connection"; + "subsystem" => "console_mgr", + ); + // Do not expected poisoned lock. + let _ = self.sock_listener.as_mut().unwrap().accept(); + } else { + // Safe to unwrap() because self.sock_conn is Some(). + let (conn_sock, _) = self.sock_listener.as_ref().unwrap().accept()?; + let events = Events::with_data(&conn_sock, EPOLL_EVENT_SERIAL_DATA, EventSet::IN); + if let Err(e) = ops.add(events) { + slog::error!(self.logger, + "failed to register epoll event for serial, {:?}", e; + "subsystem" => "console_mgr", + ); + return Err(std::io::Error::last_os_error()); + } + + let conn_sock_copy = conn_sock.try_clone()?; + // Do not expected poisoned lock. 
+ self.device + .lock() + .unwrap() + .set_output_stream(Some(Box::new(conn_sock_copy))); + + self.sock_conn = Some(conn_sock); + } + + Ok(()) + } + + fn uds_read_in(&mut self, ops: &mut EventOps) -> std::io::Result<()> { + let mut should_drop = true; + + if let Some(conn_sock) = self.sock_conn.as_mut() { + let mut out = [0u8; MAX_BACKEND_THROUGHPUT]; + match conn_sock.read(&mut out[..]) { + Ok(0) => { + // Zero-length read means EOF. Remove this conn sock. + self.device + .lock() + .expect("console: poisoned console lock") + .set_output_stream(None); + } + Ok(count) => { + self.device + .lock() + .expect("console: poisoned console lock") + .raw_input(&out[..count])?; + should_drop = false; + } + Err(e) => { + slog::warn!(self.logger, + "error while reading serial conn sock: {:?}", e; + "subsystem" => "console_mgr" + ); + self.device + .lock() + .expect("console: poisoned console lock") + .set_output_stream(None); + } + } + } + + if should_drop { + assert!(self.sock_conn.is_some()); + // Safe to unwrap() because self.sock_conn is Some(). + let sock_conn = self.sock_conn.take().unwrap(); + let events = Events::with_data(&sock_conn, EPOLL_EVENT_SERIAL_DATA, EventSet::IN); + if let Err(e) = ops.remove(events) { + slog::error!(self.logger, + "failed deregister epoll event for UDS, {:?}", e; + "subsystem" => "console_mgr" + ); + } + } + + Ok(()) + } + + fn stdio_read_in(&mut self, ops: &mut EventOps) -> std::io::Result<()> { + let mut should_drop = true; + + if let Some(handle) = self.stdin_handle.as_ref() { + let mut out = [0u8; MAX_BACKEND_THROUGHPUT]; + // Safe to unwrap() because self.stdin_handle is Some(). + let stdin_lock = handle.lock(); + match stdin_lock.read_raw(&mut out[..]) { + Ok(0) => { + // Zero-length read indicates EOF. Remove from pollables. 
+ self.device + .lock() + .expect("console: poisoned console lock") + .set_output_stream(None); + } + Ok(count) => { + self.device + .lock() + .expect("console: poisoned console lock") + .raw_input(&out[..count])?; + should_drop = false; + } + Err(e) => { + slog::warn!(self.logger, + "error while reading stdin: {:?}", e; + "subsystem" => "console_mgr" + ); + self.device + .lock() + .expect("console: poisoned console lock") + .set_output_stream(None); + } + } + } + + if should_drop { + let events = Events::with_data_raw(libc::STDIN_FILENO, EPOLL_EVENT_STDIN, EventSet::IN); + if let Err(e) = ops.remove(events) { + slog::error!(self.logger, + "failed to deregister epoll event for stdin, {:?}", e; + "subsystem" => "console_mgr" + ); + } + } + + Ok(()) + } +} + +impl MutEventSubscriber for ConsoleEpollHandler { + fn process(&mut self, events: Events, ops: &mut EventOps) { + slog::trace!(self.logger, "ConsoleEpollHandler::process()"); + let slot = events.data(); + match slot { + EPOLL_EVENT_SERIAL => { + if let Err(e) = self.uds_listener_accept(ops) { + slog::warn!(self.logger, "failed to accept incoming connection, {:?}", e); + } + } + EPOLL_EVENT_SERIAL_DATA => { + if let Err(e) = self.uds_read_in(ops) { + slog::warn!(self.logger, "failed to read data from UDS, {:?}", e); + } + } + EPOLL_EVENT_STDIN => { + if let Err(e) = self.stdio_read_in(ops) { + slog::warn!(self.logger, "failed to read data from stdin, {:?}", e); + } + } + _ => slog::error!(self.logger, "unknown epoll slot number {}", slot), + } + } + + fn init(&mut self, ops: &mut EventOps) { + slog::trace!(self.logger, "ConsoleEpollHandler::init()"); + + if self.stdin_handle.is_some() { + slog::info!(self.logger, "ConsoleEpollHandler: stdin handler"); + let events = Events::with_data_raw(libc::STDIN_FILENO, EPOLL_EVENT_STDIN, EventSet::IN); + if let Err(e) = ops.add(events) { + slog::error!( + self.logger, + "failed to register epoll event for stdin, {:?}", + e + ); + } + } + if let Some(sock) = 
self.sock_listener.as_ref() { + slog::info!(self.logger, "ConsoleEpollHandler: sock listener"); + let events = Events::with_data(sock, EPOLL_EVENT_SERIAL, EventSet::IN); + if let Err(e) = ops.add(events) { + slog::error!( + self.logger, + "failed to register epoll event for UDS listener, {:?}", + e + ); + } + } + + if let Some(conn) = self.sock_conn.as_ref() { + slog::info!(self.logger, "ConsoleEpollHandler: sock connection"); + let events = Events::with_data(conn, EPOLL_EVENT_SERIAL_DATA, EventSet::IN); + if let Err(e) = ops.add(events) { + slog::error!( + self.logger, + "failed to register epoll event for UDS connection, {:?}", + e + ); + } + } + } +} + +/// Writer to process guest kernel dmesg. +pub struct DmesgWriter { + buf: BytesMut, + logger: slog::Logger, +} + +impl DmesgWriter { + /// Creates a new instance. + pub fn new(logger: &slog::Logger) -> Self { + Self { + buf: BytesMut::with_capacity(1024), + logger: logger.new(slog::o!("subsystem" => "dmesg")), + } + } +} + +impl io::Write for DmesgWriter { + /// 0000000 [ 0 . 0 3 4 9 1 6 ] R + /// 5b 20 20 20 20 30 2e 30 33 34 39 31 36 5d 20 52 + /// 0000020 u n / s b i n / i n i t a s + /// 75 6e 20 2f 73 62 69 6e 2f 69 6e 69 74 20 61 73 + /// 0000040 i n i t p r o c e s s \r \n [ + /// + /// dmesg message end a line with /r/n . When redirect message to logger, we should + /// remove the /r/n . 
+ fn write(&mut self, buf: &[u8]) -> io::Result { + let arr: Vec<&[u8]> = buf.split(|c| *c == b'\n').collect(); + let count = arr.len(); + + for (i, sub) in arr.iter().enumerate() { + if sub.is_empty() { + if !self.buf.is_empty() { + slog::info!( + self.logger, + "{}", + String::from_utf8_lossy(self.buf.as_ref()).trim_end() + ); + self.buf.clear(); + } + } else if sub.len() < buf.len() && i < count - 1 { + slog::info!( + self.logger, + "{}{}", + String::from_utf8_lossy(self.buf.as_ref()).trim_end(), + String::from_utf8_lossy(sub).trim_end(), + ); + self.buf.clear(); + } else { + self.buf.put_slice(sub); + } + } + + Ok(buf.len()) + } + + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use slog::Drain; + use std::io::Write; + + fn create_logger() -> slog::Logger { + let decorator = slog_term::TermDecorator::new().build(); + let drain = slog_term::FullFormat::new(decorator).build().fuse(); + let drain = slog_async::Async::new(drain).build().fuse(); + slog::Logger::root(drain, slog::o!()) + } + + #[test] + fn test_dmesg_writer() { + let mut writer = DmesgWriter { + buf: Default::default(), + logger: create_logger(), + }; + + writer.flush().unwrap(); + writer.write_all("".as_bytes()).unwrap(); + writer.write_all("\n".as_bytes()).unwrap(); + writer.write_all("\n\n".as_bytes()).unwrap(); + writer.write_all("\n\n\n".as_bytes()).unwrap(); + writer.write_all("12\n23\n34\n56".as_bytes()).unwrap(); + writer.write_all("78".as_bytes()).unwrap(); + writer.write_all("90\n".as_bytes()).unwrap(); + writer.flush().unwrap(); + } + + // TODO: add unit tests for console manager +} diff --git a/src/dragonball/src/device_manager/fs_dev_mgr.rs b/src/dragonball/src/device_manager/fs_dev_mgr.rs new file mode 100644 index 0000000000..088dc980f1 --- /dev/null +++ b/src/dragonball/src/device_manager/fs_dev_mgr.rs @@ -0,0 +1,528 @@ +// Copyright 2020-2022 Alibaba Cloud. All Rights Reserved. +// Copyright 2019 Intel Corporation. 
All Rights Reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +use std::convert::TryInto; + +use dbs_utils::epoll_manager::EpollManager; +use dbs_virtio_devices::{self as virtio, Error as VirtIoError}; +use serde_derive::{Deserialize, Serialize}; +use slog::{error, info}; + +use crate::address_space_manager::GuestAddressSpaceImpl; +use crate::config_manager::{ + ConfigItem, DeviceConfigInfo, DeviceConfigInfos, RateLimiterConfigInfo, +}; +use crate::device_manager::{ + DbsMmioV2Device, DeviceManager, DeviceMgrError, DeviceOpContext, DeviceVirtioRegionHandler, +}; +use crate::get_bucket_update; + +use super::DbsVirtioDevice; + +// The flag of whether to use the shared irq. +const USE_SHARED_IRQ: bool = true; +// The flag of whether to use the generic irq. +const USE_GENERIC_IRQ: bool = true; +// Default cache size is 2 Gi since this is a typical VM memory size. +const DEFAULT_CACHE_SIZE: u64 = 2 * 1024 * 1024 * 1024; +// We have 2 supported fs device mode, vhostuser and virtio +const VHOSTUSER_FS_MODE: &str = "vhostuser"; +// We have 2 supported fs device mode, vhostuser and virtio +const VIRTIO_FS_MODE: &str = "virtio"; + +/// Errors associated with `FsDeviceConfig`. +#[derive(Debug, thiserror::Error)] +pub enum FsDeviceError { + /// Invalid fs, "virtio" or "vhostuser" is allowed. + #[error("the fs type is invalid, virtio or vhostuser is allowed")] + InvalidFs, + + /// Cannot access address space. + #[error("Cannot access address space.")] + AddressSpaceNotInitialized, + + /// Cannot convert RateLimterConfigInfo into RateLimiter. + #[error("failure while converting RateLimterConfigInfo into RateLimiter: {0}")] + RateLimterConfigInfoTryInto(#[source] std::io::Error), + + /// The fs device tag was already used for a different fs. + #[error("VirtioFs device tag {0} already exists")] + FsDeviceTagAlreadyExists(String), + + /// The fs device path was already used for a different fs. 
+ #[error("VirtioFs device tag {0} already exists")] + FsDevicePathAlreadyExists(String), + + /// The update is not allowed after booting the microvm. + #[error("update operation is not allowed after boot")] + UpdateNotAllowedPostBoot, + + /// The attachbackendfs operation fails. + #[error("Fs device attach a backend fs failed")] + AttachBackendFailed(String), + + /// attach backend fs must be done when vm is running. + #[error("vm is not running when attaching a backend fs")] + MicroVMNotRunning, + + /// The mount tag doesn't exist. + #[error("fs tag'{0}' doesn't exist")] + TagNotExists(String), + + /// Failed to send patch message to VirtioFs epoll handler. + #[error("could not send patch message to the VirtioFs epoll handler")] + VirtioFsEpollHanderSendFail, + + /// Creating a shared-fs device fails (if the vhost-user socket cannot be open.) + #[error("cannot create shared-fs device: {0}")] + CreateFsDevice(#[source] VirtIoError), + + /// Cannot initialize a shared-fs device or add a device to the MMIO Bus. + #[error("failure while registering shared-fs device: {0}")] + RegisterFsDevice(#[source] DeviceMgrError), + + /// The device manager errors. + #[error("DeviceManager error: {0}")] + DeviceManager(#[source] DeviceMgrError), +} + +/// Configuration information for a vhost-user-fs device. +#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] +pub struct FsDeviceConfigInfo { + /// vhost-user socket path. + pub sock_path: String, + /// virtiofs mount tag name used inside the guest. + /// used as the device name during mount. + pub tag: String, + /// Number of virtqueues to use. + pub num_queues: usize, + /// Size of each virtqueue. Unit: byte. + pub queue_size: u16, + /// DAX cache window size + pub cache_size: u64, + /// Number of thread pool workers. + pub thread_pool_size: u16, + /// The caching policy the file system should use (auto, always or never). 
+ /// This cache policy is set for virtio-fs, visit https://gitlab.com/virtio-fs/virtiofsd to get further information. + pub cache_policy: String, + /// Writeback cache + pub writeback_cache: bool, + /// Enable no_open or not + pub no_open: bool, + /// Enable xattr or not + pub xattr: bool, + /// Drop CAP_SYS_RESOURCE or not + pub drop_sys_resource: bool, + /// virtio fs or vhostuser fs. + pub mode: String, + /// Enable kill_priv_v2 or not + pub fuse_killpriv_v2: bool, + /// Enable no_readdir or not + pub no_readdir: bool, + /// Rate Limiter for I/O operations. + pub rate_limiter: Option, + /// Use shared irq + pub use_shared_irq: Option, + /// Use generic irq + pub use_generic_irq: Option, +} + +impl std::default::Default for FsDeviceConfigInfo { + fn default() -> Self { + Self { + sock_path: String::default(), + tag: String::default(), + num_queues: 1, + queue_size: 1024, + cache_size: DEFAULT_CACHE_SIZE, + thread_pool_size: 0, + cache_policy: Self::default_cache_policy(), + writeback_cache: Self::default_writeback_cache(), + no_open: Self::default_no_open(), + fuse_killpriv_v2: Self::default_fuse_killpriv_v2(), + no_readdir: Self::default_no_readdir(), + xattr: Self::default_xattr(), + drop_sys_resource: Self::default_drop_sys_resource(), + mode: Self::default_fs_mode(), + rate_limiter: Some(RateLimiterConfigInfo::default()), + use_shared_irq: None, + use_generic_irq: None, + } + } +} + +impl FsDeviceConfigInfo { + /// The default mode is set to 'virtio' for 'virtio-fs' device. 
+ pub fn default_fs_mode() -> String { + String::from(VIRTIO_FS_MODE) + } + + /// The default cache policy + pub fn default_cache_policy() -> String { + "always".to_string() + } + + /// The default setting of writeback cache + pub fn default_writeback_cache() -> bool { + true + } + + /// The default setting of no_open + pub fn default_no_open() -> bool { + true + } + + /// The default setting of killpriv_v2 + pub fn default_fuse_killpriv_v2() -> bool { + false + } + + /// The default setting of xattr + pub fn default_xattr() -> bool { + false + } + + /// The default setting of drop_sys_resource + pub fn default_drop_sys_resource() -> bool { + false + } + + /// The default setting of no_readdir + pub fn default_no_readdir() -> bool { + false + } + + /// The default setting of rate limiter + pub fn default_fs_rate_limiter() -> Option { + None + } +} + +/// Configuration information for virtio-fs. +#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] +pub struct FsDeviceConfigUpdateInfo { + /// virtiofs mount tag name used inside the guest. + /// used as the device name during mount. + pub tag: String, + /// Rate Limiter for I/O operations. + pub rate_limiter: Option, +} + +impl FsDeviceConfigUpdateInfo { + /// Provides a `BucketUpdate` description for the bandwidth rate limiter. + pub fn bytes(&self) -> dbs_utils::rate_limiter::BucketUpdate { + get_bucket_update!(self, rate_limiter, bandwidth) + } + /// Provides a `BucketUpdate` description for the ops rate limiter. 
+ pub fn ops(&self) -> dbs_utils::rate_limiter::BucketUpdate { + get_bucket_update!(self, rate_limiter, ops) + } +} + +impl ConfigItem for FsDeviceConfigInfo { + type Err = FsDeviceError; + + fn id(&self) -> &str { + &self.tag + } + + fn check_conflicts(&self, other: &Self) -> Result<(), FsDeviceError> { + if self.tag == other.tag { + Err(FsDeviceError::FsDeviceTagAlreadyExists(self.tag.clone())) + } else if self.mode.as_str() == VHOSTUSER_FS_MODE && self.sock_path == other.sock_path { + Err(FsDeviceError::FsDevicePathAlreadyExists( + self.sock_path.clone(), + )) + } else { + Ok(()) + } + } +} + +/// Configuration information of manipulating backend fs for a virtiofs device. +#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] +pub struct FsMountConfigInfo { + /// Mount operations, mount, update, umount + pub ops: String, + /// The backend fs type to mount. + pub fstype: Option, + /// the source file/directory the backend fs points to + pub source: Option, + /// where the backend fs gets mounted + pub mountpoint: String, + /// backend fs config content in json format + pub config: Option, + /// virtiofs mount tag name used inside the guest. + /// used as the device name during mount. 
+ pub tag: String, + /// Path to file that contains file lists that should be prefetched by rafs + pub prefetch_list_path: Option, + /// What size file supports dax + pub dax_threshold_size_kb: Option, +} + +pub(crate) type FsDeviceInfo = DeviceConfigInfo; + +impl ConfigItem for FsDeviceInfo { + type Err = FsDeviceError; + fn id(&self) -> &str { + &self.config.tag + } + + fn check_conflicts(&self, other: &Self) -> Result<(), FsDeviceError> { + if self.config.tag == other.config.tag { + Err(FsDeviceError::FsDeviceTagAlreadyExists( + self.config.tag.clone(), + )) + } else if self.config.sock_path == other.config.sock_path { + Err(FsDeviceError::FsDevicePathAlreadyExists( + self.config.sock_path.clone(), + )) + } else { + Ok(()) + } + } +} + +/// Wrapper for the collection that holds all the Fs Devices Configs +pub struct FsDeviceMgr { + /// A list of `FsDeviceConfig` objects. + pub(crate) info_list: DeviceConfigInfos, + pub(crate) use_shared_irq: bool, +} + +impl FsDeviceMgr { + /// Inserts `fs_cfg` in the shared-fs device configuration list. + pub fn insert_device( + device_mgr: &mut DeviceManager, + ctx: DeviceOpContext, + fs_cfg: FsDeviceConfigInfo, + ) -> std::result::Result<(), FsDeviceError> { + // It's too complicated to manage life cycle of shared-fs service process for hotplug. + if ctx.is_hotplug { + error!( + ctx.logger(), + "no support of shared-fs device hotplug"; + "subsystem" => "shared-fs", + "tag" => &fs_cfg.tag, + ); + return Err(FsDeviceError::UpdateNotAllowedPostBoot); + } + + info!( + ctx.logger(), + "add shared-fs device configuration"; + "subsystem" => "shared-fs", + "tag" => &fs_cfg.tag, + ); + device_mgr + .fs_manager + .lock() + .unwrap() + .info_list + .insert_or_update(&fs_cfg)?; + + Ok(()) + } + + /// Attaches all vhost-user-fs devices from the FsDevicesConfig. 
+ pub fn attach_devices( + &mut self, + ctx: &mut DeviceOpContext, + ) -> std::result::Result<(), FsDeviceError> { + let epoll_mgr = ctx + .epoll_mgr + .clone() + .ok_or(FsDeviceError::CreateFsDevice(virtio::Error::InvalidInput))?; + + for info in self.info_list.iter_mut() { + let device = Self::create_fs_device(&info.config, ctx, epoll_mgr.clone())?; + let mmio_device = DeviceManager::create_mmio_virtio_device( + device, + ctx, + info.config.use_shared_irq.unwrap_or(self.use_shared_irq), + info.config.use_generic_irq.unwrap_or(USE_GENERIC_IRQ), + ) + .map_err(FsDeviceError::RegisterFsDevice)?; + + info.set_device(mmio_device); + } + + Ok(()) + } + + fn create_fs_device( + config: &FsDeviceConfigInfo, + ctx: &mut DeviceOpContext, + epoll_mgr: EpollManager, + ) -> std::result::Result { + match &config.mode as &str { + VIRTIO_FS_MODE => Self::attach_virtio_fs_devices(config, ctx, epoll_mgr), + _ => Err(FsDeviceError::CreateFsDevice(virtio::Error::InvalidInput)), + } + } + + fn attach_virtio_fs_devices( + config: &FsDeviceConfigInfo, + ctx: &mut DeviceOpContext, + epoll_mgr: EpollManager, + ) -> std::result::Result { + info!( + ctx.logger(), + "add virtio-fs device configuration"; + "subsystem" => "virito-fs", + "tag" => &config.tag, + "dax_window_size" => &config.cache_size, + ); + + let limiter = if let Some(rlc) = config.rate_limiter.clone() { + Some( + rlc.try_into() + .map_err(FsDeviceError::RateLimterConfigInfoTryInto)?, + ) + } else { + None + }; + + let vm_as = ctx.get_vm_as().map_err(|e| { + error!(ctx.logger(), "virtio-fs get vm_as error: {:?}", e; + "subsystem" => "virito-fs"); + FsDeviceError::DeviceManager(e) + })?; + let address_space = match ctx.address_space.as_ref() { + Some(address_space) => address_space.clone(), + None => { + error!(ctx.logger(), "virtio-fs get address_space error"; "subsystem" => "virito-fs"); + return Err(FsDeviceError::AddressSpaceNotInitialized); + } + }; + let handler = DeviceVirtioRegionHandler { + vm_as, + address_space, + 
}; + + let device = Box::new( + virtio::fs::VirtioFs::new( + &config.tag, + config.num_queues, + config.queue_size, + config.cache_size, + &config.cache_policy, + config.thread_pool_size, + config.writeback_cache, + config.no_open, + config.fuse_killpriv_v2, + config.xattr, + config.drop_sys_resource, + config.no_readdir, + Box::new(handler), + epoll_mgr, + limiter, + ) + .map_err(FsDeviceError::CreateFsDevice)?, + ); + + Ok(device) + } + + /// Attach a backend fs to a VirtioFs device or detach a backend + /// fs from a Virtiofs device + pub fn manipulate_backend_fs( + device_mgr: &mut DeviceManager, + config: FsMountConfigInfo, + ) -> std::result::Result<(), FsDeviceError> { + let mut found = false; + + let mgr = &mut device_mgr.fs_manager.lock().unwrap(); + for info in mgr + .info_list + .iter() + .filter(|info| info.config.tag.as_str() == config.tag.as_str()) + { + found = true; + if let Some(device) = info.device.as_ref() { + if let Some(mmio_dev) = device.as_any().downcast_ref::() { + let mut guard = mmio_dev.state(); + let inner_dev = guard.get_inner_device_mut(); + if let Some(virtio_fs_dev) = inner_dev + .as_any_mut() + .downcast_mut::>() + { + return virtio_fs_dev + .manipulate_backend_fs( + config.source, + config.fstype, + &config.mountpoint, + config.config, + &config.ops, + config.prefetch_list_path, + config.dax_threshold_size_kb, + ) + .map(|_p| ()) + .map_err(|e| FsDeviceError::AttachBackendFailed(e.to_string())); + } + } + } + } + if !found { + Err(FsDeviceError::AttachBackendFailed( + "fs tag not found".to_string(), + )) + } else { + Ok(()) + } + } + + /// Gets the index of the device with the specified `tag` if it exists in the list. + pub fn get_index_of_tag(&self, tag: &str) -> Option { + self.info_list + .iter() + .position(|info| info.config.id().eq(tag)) + } + + /// Update the ratelimiter settings of a virtio fs device. 
+ pub fn update_device_ratelimiters( + device_mgr: &mut DeviceManager, + new_cfg: FsDeviceConfigUpdateInfo, + ) -> std::result::Result<(), FsDeviceError> { + let mgr = &mut device_mgr.fs_manager.lock().unwrap(); + match mgr.get_index_of_tag(&new_cfg.tag) { + Some(index) => { + let config = &mut mgr.info_list[index].config; + config.rate_limiter = new_cfg.rate_limiter.clone(); + let device = mgr.info_list[index] + .device + .as_mut() + .ok_or_else(|| FsDeviceError::TagNotExists("".to_owned()))?; + + if let Some(mmio_dev) = device.as_any().downcast_ref::() { + let guard = mmio_dev.state(); + let inner_dev = guard.get_inner_device(); + if let Some(fs_dev) = inner_dev + .as_any() + .downcast_ref::>() + { + return fs_dev + .set_patch_rate_limiters(new_cfg.bytes(), new_cfg.ops()) + .map(|_p| ()) + .map_err(|_e| FsDeviceError::VirtioFsEpollHanderSendFail); + } + } + Ok(()) + } + None => Err(FsDeviceError::TagNotExists(new_cfg.tag)), + } + } +} + +impl Default for FsDeviceMgr { + /// Create a new `FsDeviceMgr` object.. + fn default() -> Self { + FsDeviceMgr { + info_list: DeviceConfigInfos::new(), + use_shared_irq: USE_SHARED_IRQ, + } + } +} diff --git a/src/dragonball/src/device_manager/legacy.rs b/src/dragonball/src/device_manager/legacy.rs new file mode 100644 index 0000000000..50a47cab73 --- /dev/null +++ b/src/dragonball/src/device_manager/legacy.rs @@ -0,0 +1,246 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +//! Device Manager for Legacy Devices. 
+ +use std::io; +use std::sync::{Arc, Mutex}; + +use dbs_device::device_manager::Error as IoManagerError; +#[cfg(target_arch = "aarch64")] +use dbs_legacy_devices::RTCDevice; +use dbs_legacy_devices::SerialDevice; +use vmm_sys_util::eventfd::EventFd; + +// The I8042 Data Port (IO Port 0x60) is used for reading data that was received from a I8042 device or from the I8042 controller itself and writing data to a I8042 device or to the I8042 controller itself. +const I8042_DATA_PORT: u16 = 0x60; + +/// Errors generated by legacy device manager. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Cannot add legacy device to Bus. + #[error("bus failure while managing legacy device")] + BusError(#[source] IoManagerError), + + /// Cannot create EventFd. + #[error("failure while reading EventFd file descriptor")] + EventFd(#[source] io::Error), + + /// Failed to register/deregister interrupt. + #[error("failure while managing interrupt for legacy device")] + IrqManager(#[source] vmm_sys_util::errno::Error), +} + +/// The `LegacyDeviceManager` is a wrapper that is used for registering legacy devices +/// on an I/O Bus. +/// +/// It currently manages the uart and i8042 devices. The `LegacyDeviceManger` should be initialized +/// only by using the constructor. +pub struct LegacyDeviceManager { + #[cfg(target_arch = "x86_64")] + i8042_reset_eventfd: EventFd, + #[cfg(target_arch = "aarch64")] + pub(crate) _rtc_device: Arc>, + #[cfg(target_arch = "aarch64")] + _rtc_eventfd: EventFd, + pub(crate) com1_device: Arc>, + _com1_eventfd: EventFd, + pub(crate) com2_device: Arc>, + _com2_eventfd: EventFd, +} + +impl LegacyDeviceManager { + /// Get the serial device for com1. 
+ pub fn get_com1_serial(&self) -> Arc> { + self.com1_device.clone() + } + + /// Get the serial device for com2 + pub fn get_com2_serial(&self) -> Arc> { + self.com2_device.clone() + } +} + +#[cfg(target_arch = "x86_64")] +pub(crate) mod x86_64 { + use super::*; + use dbs_device::device_manager::IoManager; + use dbs_device::resources::Resource; + use dbs_legacy_devices::{EventFdTrigger, I8042Device, I8042DeviceMetrics}; + use kvm_ioctls::VmFd; + + pub(crate) const COM1_IRQ: u32 = 4; + pub(crate) const COM1_PORT1: u16 = 0x3f8; + pub(crate) const COM2_IRQ: u32 = 3; + pub(crate) const COM2_PORT1: u16 = 0x2f8; + + type Result = ::std::result::Result; + + impl LegacyDeviceManager { + /// Create a LegacyDeviceManager instance handling legacy devices (uart, i8042). + pub fn create_manager(bus: &mut IoManager, vm_fd: Option>) -> Result { + let (com1_device, com1_eventfd) = + Self::create_com_device(bus, vm_fd.as_ref(), COM1_IRQ, COM1_PORT1)?; + let (com2_device, com2_eventfd) = + Self::create_com_device(bus, vm_fd.as_ref(), COM2_IRQ, COM2_PORT1)?; + + let exit_evt = EventFd::new(libc::EFD_NONBLOCK).map_err(Error::EventFd)?; + let i8042_device = Arc::new(Mutex::new(I8042Device::new( + EventFdTrigger::new(exit_evt.try_clone().map_err(Error::EventFd)?), + Arc::new(I8042DeviceMetrics::default()), + ))); + let resources = [Resource::PioAddressRange { + // 0x60 and 0x64 are the io ports that i8042 devices used. + // We register pio address range from 0x60 - 0x64 with base I8042_DATA_PORT for i8042 to use. + base: I8042_DATA_PORT, + size: 0x5, + }]; + bus.register_device_io(i8042_device, &resources) + .map_err(Error::BusError)?; + + Ok(LegacyDeviceManager { + i8042_reset_eventfd: exit_evt, + com1_device, + _com1_eventfd: com1_eventfd, + com2_device, + _com2_eventfd: com2_eventfd, + }) + } + + /// Get the eventfd for exit notification. 
+ pub fn get_reset_eventfd(&self) -> Result { + self.i8042_reset_eventfd.try_clone().map_err(Error::EventFd) + } + + fn create_com_device( + bus: &mut IoManager, + vm_fd: Option<&Arc>, + irq: u32, + port_base: u16, + ) -> Result<(Arc>, EventFd)> { + let eventfd = EventFd::new(libc::EFD_NONBLOCK).map_err(Error::EventFd)?; + let device = Arc::new(Mutex::new(SerialDevice::new( + eventfd.try_clone().map_err(Error::EventFd)?, + ))); + // port_base defines the base port address for the COM devices. + // Since every COM device has 8 data registers so we register the pio address range as size 0x8. + let resources = [Resource::PioAddressRange { + base: port_base, + size: 0x8, + }]; + bus.register_device_io(device.clone(), &resources) + .map_err(Error::BusError)?; + + if let Some(fd) = vm_fd { + fd.register_irqfd(&eventfd, irq) + .map_err(Error::IrqManager)?; + } + + Ok((device, eventfd)) + } + } +} + +#[cfg(target_arch = "aarch64")] +pub(crate) mod aarch64 { + use super::*; + use dbs_device::device_manager::IoManager; + use dbs_device::resources::DeviceResources; + use kvm_ioctls::VmFd; + use std::collections::HashMap; + + type Result = ::std::result::Result; + + /// LegacyDeviceType: com1 + pub const COM1: &str = "com1"; + /// LegacyDeviceType: com2 + pub const COM2: &str = "com2"; + /// LegacyDeviceType: rtc + pub const RTC: &str = "rtc"; + + impl LegacyDeviceManager { + /// Create a LegacyDeviceManager instance handling legacy devices. 
+ pub fn create_manager( + bus: &mut IoManager, + vm_fd: Option>, + resources: &HashMap, + ) -> Result { + let (com1_device, com1_eventfd) = + Self::create_com_device(bus, vm_fd.as_ref(), resources.get(COM1).unwrap())?; + let (com2_device, com2_eventfd) = + Self::create_com_device(bus, vm_fd.as_ref(), resources.get(COM2).unwrap())?; + let (rtc_device, rtc_eventfd) = + Self::create_rtc_device(bus, vm_fd.as_ref(), resources.get(RTC).unwrap())?; + + Ok(LegacyDeviceManager { + _rtc_device: rtc_device, + _rtc_eventfd: rtc_eventfd, + com1_device, + _com1_eventfd: com1_eventfd, + com2_device, + _com2_eventfd: com2_eventfd, + }) + } + + fn create_com_device( + bus: &mut IoManager, + vm_fd: Option<&Arc>, + resources: &DeviceResources, + ) -> Result<(Arc>, EventFd)> { + let eventfd = EventFd::new(libc::EFD_NONBLOCK).map_err(Error::EventFd)?; + let device = Arc::new(Mutex::new(SerialDevice::new( + eventfd.try_clone().map_err(Error::EventFd)?, + ))); + + bus.register_device_io(device.clone(), resources.get_all_resources()) + .map_err(Error::BusError)?; + + if let Some(fd) = vm_fd { + let irq = resources.get_legacy_irq().unwrap(); + fd.register_irqfd(&eventfd, irq) + .map_err(Error::IrqManager)?; + } + + Ok((device, eventfd)) + } + + fn create_rtc_device( + bus: &mut IoManager, + vm_fd: Option<&Arc>, + resources: &DeviceResources, + ) -> Result<(Arc>, EventFd)> { + let eventfd = EventFd::new(libc::EFD_NONBLOCK).map_err(Error::EventFd)?; + let device = Arc::new(Mutex::new(RTCDevice::new())); + + bus.register_device_io(device.clone(), resources.get_all_resources()) + .map_err(Error::BusError)?; + + if let Some(fd) = vm_fd { + let irq = resources.get_legacy_irq().unwrap(); + fd.register_irqfd(&eventfd, irq) + .map_err(Error::IrqManager)?; + } + + Ok((device, eventfd)) + } + } +} + +#[cfg(test)] +mod tests { + #[cfg(target_arch = "x86_64")] + use super::*; + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_create_legacy_device_manager() { + let mut bus = 
dbs_device::device_manager::IoManager::new(); + let mgr = LegacyDeviceManager::create_manager(&mut bus, None).unwrap(); + let _exit_fd = mgr.get_reset_eventfd().unwrap(); + } +} diff --git a/src/dragonball/src/device_manager/memory_region_handler.rs b/src/dragonball/src/device_manager/memory_region_handler.rs new file mode 100644 index 0000000000..2be149ef97 --- /dev/null +++ b/src/dragonball/src/device_manager/memory_region_handler.rs @@ -0,0 +1,110 @@ +// Copyright 2022 Alibaba, Inc. or its affiliates. All Rights Reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +use std::io; +use std::sync::Arc; + +use dbs_address_space::{AddressSpace, AddressSpaceRegion, AddressSpaceRegionType}; +use dbs_virtio_devices::{Error as VirtIoError, VirtioRegionHandler}; +use log::{debug, error}; +use vm_memory::{FileOffset, GuestAddressSpace, GuestMemoryRegion, GuestRegionMmap}; + +use crate::address_space_manager::GuestAddressSpaceImpl; + +/// This struct implements the VirtioRegionHandler trait, which inserts the memory +/// region of the virtio device into vm_as and address_space. +/// +/// * After region is inserted into the vm_as, the virtio device can read guest memory +/// data using vm_as.get_slice with GuestAddress. +/// +/// * Insert virtio memory into address_space so that the correct guest last address can +/// be found when initializing the e820 table. The e820 table is a table that describes +/// guest memory prepared before the guest startup. we need to config the correct guest +/// memory address and length in the table. The virtio device memory belongs to the MMIO +/// space and does not belong to the Guest Memory space. Therefore, it cannot be configured +/// into the e820 table. When creating AddressSpaceRegion we use +/// AddressSpaceRegionType::ReservedMemory type, in this way, address_space will know that +/// this region a special memory, it will don't put the this memory in e820 table. +/// +/// This function relies on the atomic-guest-memory feature. 
Without this feature enabled, memory +/// regions cannot be inserted into vm_as. Because the insert_region interface of vm_as does +/// not insert regions in place, but returns an array of inserted regions. We need to manually +/// replace this array of regions with vm_as, and that's what atomic-guest-memory feature does. +/// So we rely on the atomic-guest-memory feature here +pub struct DeviceVirtioRegionHandler { + pub(crate) vm_as: GuestAddressSpaceImpl, + pub(crate) address_space: AddressSpace, +} + +impl DeviceVirtioRegionHandler { + fn insert_address_space( + &mut self, + region: Arc, + ) -> std::result::Result<(), VirtIoError> { + let file_offset = match region.file_offset() { + // TODO: use from_arc + Some(f) => Some(FileOffset::new(f.file().try_clone()?, 0)), + None => None, + }; + + let as_region = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DAXMemory, + region.start_addr(), + region.size() as u64, + None, + file_offset, + region.flags(), + false, + )); + + self.address_space.insert_region(as_region).map_err(|e| { + error!("inserting address apace error: {}", e); + // dbs-virtio-devices should not depend on dbs-address-space. + // So here io::Error is used instead of AddressSpaceError directly. + VirtIoError::IOError(io::Error::new( + io::ErrorKind::Other, + format!( + "invalid address space region ({0:#x}, {1:#x})", + region.start_addr().0, + region.len() + ), + )) + })?; + Ok(()) + } + + fn insert_vm_as( + &mut self, + region: Arc, + ) -> std::result::Result<(), VirtIoError> { + let vm_as_new = self.vm_as.memory().insert_region(region).map_err(|e| { + error!( + "DeviceVirtioRegionHandler failed to insert guest memory region: {:?}.", + e + ); + VirtIoError::InsertMmap(e) + })?; + // Do not expect poisoned lock here, so safe to unwrap(). 
+ self.vm_as.lock().unwrap().replace(vm_as_new); + + Ok(()) + } +} + +impl VirtioRegionHandler for DeviceVirtioRegionHandler { + fn insert_region( + &mut self, + region: Arc, + ) -> std::result::Result<(), VirtIoError> { + debug!( + "add geust memory region to address_space/vm_as, new region: {:?}", + region + ); + + self.insert_address_space(region.clone())?; + self.insert_vm_as(region)?; + + Ok(()) + } +} diff --git a/src/dragonball/src/device_manager/mod.rs b/src/dragonball/src/device_manager/mod.rs new file mode 100644 index 0000000000..43c237d4ce --- /dev/null +++ b/src/dragonball/src/device_manager/mod.rs @@ -0,0 +1,1003 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Device manager to manage IO devices for a virtual machine. + +#[cfg(target_arch = "aarch64")] +use std::collections::HashMap; + +use std::io; +use std::sync::{Arc, Mutex, MutexGuard}; + +use arc_swap::ArcSwap; +use dbs_address_space::AddressSpace; +#[cfg(target_arch = "aarch64")] +use dbs_arch::{DeviceType, MMIODeviceInfo}; +use dbs_device::device_manager::{Error as IoManagerError, IoManager, IoManagerContext}; +#[cfg(target_arch = "aarch64")] +use dbs_device::resources::DeviceResources; +use dbs_device::resources::Resource; +use dbs_device::DeviceIo; +use dbs_interrupt::KvmIrqManager; +use dbs_legacy_devices::ConsoleHandler; +use dbs_utils::epoll_manager::EpollManager; +use kvm_ioctls::VmFd; + +#[cfg(feature = "dbs-virtio-devices")] +use dbs_device::resources::ResourceConstraint; +#[cfg(feature = "dbs-virtio-devices")] +use dbs_virtio_devices as virtio; +#[cfg(feature = "dbs-virtio-devices")] +use dbs_virtio_devices::{ + mmio::{ + MmioV2Device, DRAGONBALL_FEATURE_INTR_USED, DRAGONBALL_FEATURE_PER_QUEUE_NOTIFY, + DRAGONBALL_MMIO_DOORBELL_SIZE, MMIO_DEFAULT_CFG_SIZE, + }, + VirtioDevice, +}; + +#[cfg(all(feature = "hotplug", feature = "dbs-upcall"))] +use dbs_upcall::{ + DevMgrRequest, DevMgrService, MmioDevRequest, UpcallClient, 
UpcallClientError, + UpcallClientRequest, UpcallClientResponse, +}; +#[cfg(feature = "hotplug")] +use dbs_virtio_devices::vsock::backend::VsockInnerConnector; + +use crate::address_space_manager::GuestAddressSpaceImpl; +use crate::error::StartMicroVmError; +use crate::resource_manager::ResourceManager; +use crate::vm::{KernelConfigInfo, Vm}; +use crate::IoManagerCached; + +/// Virtual machine console device manager. +pub mod console_manager; +/// Console Manager for virtual machines console device. +pub use self::console_manager::ConsoleManager; + +mod legacy; +pub use self::legacy::{Error as LegacyDeviceError, LegacyDeviceManager}; + +#[cfg(target_arch = "aarch64")] +pub use self::legacy::aarch64::{COM1, COM2, RTC}; + +#[cfg(feature = "virtio-vsock")] +/// Device manager for user-space vsock devices. +pub mod vsock_dev_mgr; +#[cfg(feature = "virtio-vsock")] +use self::vsock_dev_mgr::VsockDeviceMgr; + +#[cfg(feature = "virtio-blk")] +/// virtio-block device manager +pub mod blk_dev_mgr; +#[cfg(feature = "virtio-blk")] +use self::blk_dev_mgr::BlockDeviceMgr; + +#[cfg(feature = "virtio-net")] +/// Device manager for virtio-net devices. +pub mod virtio_net_dev_mgr; +#[cfg(feature = "virtio-net")] +use self::virtio_net_dev_mgr::VirtioNetDeviceMgr; + +#[cfg(feature = "virtio-fs")] +/// virtio-block device manager +pub mod fs_dev_mgr; +#[cfg(feature = "virtio-fs")] +use self::fs_dev_mgr::FsDeviceMgr; +#[cfg(feature = "virtio-fs")] +mod memory_region_handler; +#[cfg(feature = "virtio-fs")] +pub use self::memory_region_handler::*; + +macro_rules! info( + ($l:expr, $($args:tt)+) => { + slog::info!($l, $($args)+; slog::o!("subsystem" => "device_manager")) + }; +); + +/// Errors related to device manager operations. +#[derive(Debug, thiserror::Error)] +pub enum DeviceMgrError { + /// Invalid operation. + #[error("invalid device manager operation")] + InvalidOperation, + + /// Failed to get device resource. 
+ #[error("failed to get device assigned resources")] + GetDeviceResource, + + /// Appending to kernel command line failed. + #[error("failed to add kernel command line parameter for device: {0}")] + Cmdline(#[source] linux_loader::cmdline::Error), + + /// Failed to manage console devices. + #[error(transparent)] + ConsoleManager(console_manager::ConsoleManagerError), + + /// Failed to create the device. + #[error("failed to create virtual device: {0}")] + CreateDevice(#[source] io::Error), + + /// Failed to perform an operation on the bus. + #[error(transparent)] + IoManager(IoManagerError), + + /// Failure from legacy device manager. + #[error(transparent)] + LegacyManager(legacy::Error), + + #[cfg(feature = "dbs-virtio-devices")] + /// Error from Virtio subsystem. + #[error(transparent)] + Virtio(virtio::Error), + + #[cfg(all(feature = "hotplug", feature = "dbs-upcall"))] + /// Failed to hotplug the device. + #[error("failed to hotplug virtual device")] + HotplugDevice(#[source] UpcallClientError), + + /// Failed to free device resource. + #[error("failed to free device resources: {0}")] + ResourceError(#[source] crate::resource_manager::ResourceError), +} + +/// Specialized version of `std::result::Result` for device manager operations. +pub type Result = ::std::result::Result; + +/// Type of the dragonball virtio devices. +#[cfg(feature = "dbs-virtio-devices")] +pub type DbsVirtioDevice = Box< + dyn VirtioDevice, +>; + +/// Type of the dragonball virtio mmio devices. +#[cfg(feature = "dbs-virtio-devices")] +pub type DbsMmioV2Device = + MmioV2Device; + +/// Struct to support transactional operations for device management. +pub struct DeviceManagerTx { + io_manager: IoManager, + _io_lock: Arc>, + _guard: MutexGuard<'static, ()>, +} + +impl DeviceManagerTx { + fn new(mgr_ctx: &DeviceManagerContext) -> Self { + // Do not expect poisoned lock. 
+ let guard = mgr_ctx.io_lock.lock().unwrap(); + + // It's really a heavy burden to carry on a lifetime parameter for MutexGuard. + // So we play a tricky here that we hold a reference to the Arc> and transmute + // the MutexGuard<'a, ()> to MutexGuard<'static, ()>. + // It's safe because we hold a reference to the Mutex lock. + let guard = + unsafe { std::mem::transmute::, MutexGuard<'static, ()>>(guard) }; + + DeviceManagerTx { + io_manager: mgr_ctx.io_manager.load().as_ref().clone(), + _io_lock: mgr_ctx.io_lock.clone(), + _guard: guard, + } + } +} + +/// Operation context for device management. +#[derive(Clone)] +pub struct DeviceManagerContext { + io_manager: Arc>, + io_lock: Arc>, +} + +impl DeviceManagerContext { + /// Create a DeviceManagerContext object. + pub fn new(io_manager: Arc>, io_lock: Arc>) -> Self { + DeviceManagerContext { + io_manager, + io_lock, + } + } +} + +impl IoManagerContext for DeviceManagerContext { + type Context = DeviceManagerTx; + + fn begin_tx(&self) -> Self::Context { + DeviceManagerTx::new(self) + } + + fn commit_tx(&self, context: Self::Context) { + self.io_manager.store(Arc::new(context.io_manager)); + } + + fn cancel_tx(&self, context: Self::Context) { + drop(context); + } + + fn register_device_io( + &self, + ctx: &mut Self::Context, + device: Arc, + resources: &[Resource], + ) -> std::result::Result<(), dbs_device::device_manager::Error> { + ctx.io_manager.register_device_io(device, resources) + } + + fn unregister_device_io( + &self, + ctx: &mut Self::Context, + resources: &[Resource], + ) -> std::result::Result<(), dbs_device::device_manager::Error> { + ctx.io_manager.unregister_device_io(resources) + } +} + +/// Context for device addition/removal operations. 
+pub struct DeviceOpContext { + epoll_mgr: Option, + io_context: DeviceManagerContext, + irq_manager: Arc, + res_manager: Arc, + vm_fd: Arc, + vm_as: Option, + address_space: Option, + logger: slog::Logger, + is_hotplug: bool, + + #[cfg(all(feature = "hotplug", feature = "dbs-upcall"))] + upcall_client: Option>>, + #[cfg(feature = "dbs-virtio-devices")] + virtio_devices: Vec>, +} + +impl DeviceOpContext { + pub(crate) fn new( + epoll_mgr: Option, + device_mgr: &DeviceManager, + vm_as: Option, + address_space: Option, + is_hotplug: bool, + ) -> Self { + let irq_manager = device_mgr.irq_manager.clone(); + let res_manager = device_mgr.res_manager.clone(); + + let vm_fd = device_mgr.vm_fd.clone(); + let io_context = DeviceManagerContext { + io_manager: device_mgr.io_manager.clone(), + io_lock: device_mgr.io_lock.clone(), + }; + let logger = device_mgr.logger.new(slog::o!()); + + DeviceOpContext { + epoll_mgr, + io_context, + irq_manager, + res_manager, + vm_fd, + vm_as, + address_space, + logger, + is_hotplug, + #[cfg(all(feature = "hotplug", feature = "dbs-upcall"))] + upcall_client: None, + #[cfg(feature = "dbs-virtio-devices")] + virtio_devices: Vec::new(), + } + } + + pub(crate) fn create_boot_ctx(vm: &Vm, epoll_mgr: Option) -> Self { + Self::new(epoll_mgr, vm.device_manager(), None, None, false) + } + + pub(crate) fn get_vm_as(&self) -> Result { + match self.vm_as.as_ref() { + Some(v) => Ok(v.clone()), + None => Err(DeviceMgrError::InvalidOperation), + } + } + + pub(crate) fn logger(&self) -> &slog::Logger { + &self.logger + } + + #[allow(unused_variables)] + fn generate_kernel_boot_args(&mut self, kernel_config: &mut KernelConfigInfo) -> Result<()> { + if self.is_hotplug { + return Err(DeviceMgrError::InvalidOperation); + } + + #[cfg(feature = "dbs-virtio-devices")] + { + let cmdline = kernel_config.kernel_cmdline_mut(); + + for device in self.virtio_devices.iter() { + let (mmio_base, mmio_size, irq) = DeviceManager::get_virtio_device_info(device)?; + + // as per 
doc, [virtio_mmio.]device=@: needs to be appended + // to kernel commandline for virtio mmio devices to get recognized + // the size parameter has to be transformed to KiB, so dividing hexadecimal value in + // bytes to 1024; further, the '{}' formatting rust construct will automatically + // transform it to decimal + cmdline + .insert( + "virtio_mmio.device", + &format!("{}K@0x{:08x}:{}", mmio_size / 1024, mmio_base, irq), + ) + .map_err(DeviceMgrError::Cmdline)?; + } + } + + Ok(()) + } + + #[cfg(target_arch = "aarch64")] + fn generate_virtio_device_info(&self) -> Result> { + let mut dev_info = HashMap::new(); + #[cfg(feature = "dbs-virtio-devices")] + for (_index, device) in self.virtio_devices.iter().enumerate() { + let (mmio_base, mmio_size, irq) = DeviceManager::get_virtio_mmio_device_info(device)?; + let dev_type; + let device_id; + if let Some(mmiov2_device) = device.as_any().downcast_ref::() { + dev_type = mmiov2_device.get_device_type(); + device_id = None; + } else { + return Err(DeviceMgrError::InvalidOperation); + } + dev_info.insert( + ( + DeviceType::Virtio(dev_type), + format!("virtio-{}@0x{:08x?}", dev_type, mmio_base), + ), + MMIODeviceInfo::new(mmio_base, mmio_size, vec![irq], device_id), + ); + } + Ok(dev_info) + } +} + +#[cfg(all(feature = "hotplug", not(feature = "dbs-upcall")))] +impl DeviceOpContext { + pub(crate) fn insert_hotplug_mmio_device( + &self, + _dev: &Arc, + _callback: Option<()>, + ) -> Result<()> { + Err(DeviceMgrError::InvalidOperation) + } + + pub(crate) fn remove_hotplug_mmio_device( + &self, + _dev: &Arc, + _callback: Option<()>, + ) -> Result<()> { + Err(DeviceMgrError::InvalidOperation) + } +} + +#[cfg(all(feature = "hotplug", feature = "dbs-upcall"))] +impl DeviceOpContext { + pub(crate) fn create_hotplug_ctx(vm: &Vm, epoll_mgr: Option) -> Self { + let vm_as = vm.vm_as().expect("VM should have memory ready").clone(); + + let mut ctx = Self::new( + epoll_mgr, + vm.device_manager(), + Some(vm_as), + 
vm.vm_address_space().cloned(), + true, + ); + ctx.upcall_client = vm.upcall_client().clone(); + ctx + } + + fn call_hotplug_device( + &self, + req: DevMgrRequest, + callback: Option>, + ) -> Result<()> { + if let Some(upcall_client) = self.upcall_client.as_ref() { + if let Some(cb) = callback { + upcall_client + .send_request(UpcallClientRequest::DevMgr(req), cb) + .map_err(DeviceMgrError::HotplugDevice)?; + } else { + upcall_client + .send_request_without_result(UpcallClientRequest::DevMgr(req)) + .map_err(DeviceMgrError::HotplugDevice)?; + } + Ok(()) + } else { + Err(DeviceMgrError::InvalidOperation) + } + } + + pub(crate) fn insert_hotplug_mmio_device( + &self, + dev: &Arc, + callback: Option>, + ) -> Result<()> { + if !self.is_hotplug { + return Err(DeviceMgrError::InvalidOperation); + } + + let (mmio_base, mmio_size, mmio_irq) = DeviceManager::get_virtio_device_info(dev)?; + let req = DevMgrRequest::AddMmioDev(MmioDevRequest { + mmio_base, + mmio_size, + mmio_irq, + }); + + self.call_hotplug_device(req, callback) + } + + pub(crate) fn remove_hotplug_mmio_device( + &self, + dev: &Arc, + callback: Option>, + ) -> Result<()> { + if !self.is_hotplug { + return Err(DeviceMgrError::InvalidOperation); + } + let (mmio_base, mmio_size, mmio_irq) = DeviceManager::get_virtio_device_info(dev)?; + let req = DevMgrRequest::DelMmioDev(MmioDevRequest { + mmio_base, + mmio_size, + mmio_irq, + }); + + self.call_hotplug_device(req, callback) + } +} + +#[cfg(all(feature = "hotplug", feature = "acpi"))] +impl DeviceOpContext { + // TODO: We will implement this when we develop ACPI virtualization +} + +/// Device manager for virtual machines, which manages all device for a virtual machine. 
+pub struct DeviceManager { + io_manager: Arc>, + io_lock: Arc>, + irq_manager: Arc, + res_manager: Arc, + vm_fd: Arc, + pub(crate) logger: slog::Logger, + + pub(crate) con_manager: ConsoleManager, + pub(crate) legacy_manager: Option, + #[cfg(target_arch = "aarch64")] + pub(crate) mmio_device_info: HashMap<(DeviceType, String), MMIODeviceInfo>, + #[cfg(feature = "virtio-vsock")] + pub(crate) vsock_manager: VsockDeviceMgr, + + #[cfg(feature = "virtio-blk")] + // If there is a Root Block Device, this should be added as the first element of the list. + // This is necessary because we want the root to always be mounted on /dev/vda. + pub(crate) block_manager: BlockDeviceMgr, + + #[cfg(feature = "virtio-net")] + pub(crate) virtio_net_manager: VirtioNetDeviceMgr, + + #[cfg(feature = "virtio-fs")] + fs_manager: Arc>, +} + +impl DeviceManager { + /// Create a new device manager instance. + pub fn new( + vm_fd: Arc, + res_manager: Arc, + epoll_manager: EpollManager, + logger: &slog::Logger, + ) -> Self { + DeviceManager { + io_manager: Arc::new(ArcSwap::new(Arc::new(IoManager::new()))), + io_lock: Arc::new(Mutex::new(())), + irq_manager: Arc::new(KvmIrqManager::new(vm_fd.clone())), + res_manager, + vm_fd, + logger: logger.new(slog::o!()), + + con_manager: ConsoleManager::new(epoll_manager, logger), + legacy_manager: None, + #[cfg(target_arch = "aarch64")] + mmio_device_info: HashMap::new(), + #[cfg(feature = "virtio-vsock")] + vsock_manager: VsockDeviceMgr::default(), + #[cfg(feature = "virtio-blk")] + block_manager: BlockDeviceMgr::default(), + #[cfg(feature = "virtio-net")] + virtio_net_manager: VirtioNetDeviceMgr::default(), + #[cfg(feature = "virtio-fs")] + fs_manager: Arc::new(Mutex::new(FsDeviceMgr::default())), + } + } + + /// Get the underlying IoManager to dispatch IO read/write requests. + pub fn io_manager(&self) -> IoManagerCached { + IoManagerCached::new(self.io_manager.clone()) + } + + /// Create the underline interrupt manager for the device manager. 
+ pub fn create_interrupt_manager(&mut self) -> Result<()> { + self.irq_manager + .initialize() + .map_err(DeviceMgrError::CreateDevice) + } + + /// Get the underlying logger. + pub fn logger(&self) -> &slog::Logger { + &self.logger + } + + /// Create legacy devices associted virtual machine + #[allow(unused_variables)] + pub fn create_legacy_devices( + &mut self, + ctx: &mut DeviceOpContext, + ) -> std::result::Result<(), StartMicroVmError> { + #[cfg(any( + target_arch = "x86_64", + all(target_arch = "aarch64", feature = "dbs-virtio-devices") + ))] + { + let mut tx = ctx.io_context.begin_tx(); + let legacy_manager; + + #[cfg(target_arch = "x86_64")] + { + legacy_manager = LegacyDeviceManager::create_manager( + &mut tx.io_manager, + Some(self.vm_fd.clone()), + ); + } + + #[cfg(target_arch = "aarch64")] + #[cfg(feature = "dbs-virtio-devices")] + { + let resources = self.get_legacy_resources()?; + legacy_manager = LegacyDeviceManager::create_manager( + &mut tx.io_manager, + Some(self.vm_fd.clone()), + &resources, + ); + } + + match legacy_manager { + Ok(v) => { + self.legacy_manager = Some(v); + ctx.io_context.commit_tx(tx); + } + Err(e) => { + ctx.io_context.cancel_tx(tx); + return Err(StartMicroVmError::LegacyDevice(e)); + } + } + } + + Ok(()) + } + + /// Init legacy devices with logger stream in associted virtual machine + pub fn init_legacy_devices( + &mut self, + dmesg_fifo: Option>, + com1_sock_path: Option, + _ctx: &mut DeviceOpContext, + ) -> std::result::Result<(), StartMicroVmError> { + // Connect serial ports to the console and dmesg_fifo. 
+ self.set_guest_kernel_log_stream(dmesg_fifo) + .map_err(|_| StartMicroVmError::EventFd)?; + + info!(self.logger, "init console path: {:?}", com1_sock_path); + if let Some(path) = com1_sock_path { + if let Some(legacy_manager) = self.legacy_manager.as_ref() { + let com1 = legacy_manager.get_com1_serial(); + self.con_manager + .create_socket_console(com1, path) + .map_err(StartMicroVmError::DeviceManager)?; + } + } else if let Some(legacy_manager) = self.legacy_manager.as_ref() { + let com1 = legacy_manager.get_com1_serial(); + self.con_manager + .create_stdio_console(com1) + .map_err(StartMicroVmError::DeviceManager)?; + } + + Ok(()) + } + + /// Set the stream for guest kernel log. + /// + /// Note: com2 is used for guest kernel logging. + /// TODO: check whether it works with aarch64. + pub fn set_guest_kernel_log_stream( + &self, + stream: Option>, + ) -> std::result::Result<(), io::Error> { + if let Some(legacy) = self.legacy_manager.as_ref() { + legacy + .get_com2_serial() + .lock() + .unwrap() + .set_output_stream(stream); + } + Ok(()) + } + + /// Reset the console into canonical mode. + pub fn reset_console(&self) -> Result<()> { + self.con_manager.reset_console() + } + + /// Create all registered devices when booting the associated virtual machine. 
+ pub fn create_devices( + &mut self, + vm_as: GuestAddressSpaceImpl, + epoll_mgr: EpollManager, + kernel_config: &mut KernelConfigInfo, + com1_sock_path: Option, + dmesg_fifo: Option>, + address_space: Option<&AddressSpace>, + ) -> std::result::Result<(), StartMicroVmError> { + let mut ctx = DeviceOpContext::new( + Some(epoll_mgr), + self, + Some(vm_as), + address_space.cloned(), + false, + ); + + self.create_legacy_devices(&mut ctx)?; + self.init_legacy_devices(dmesg_fifo, com1_sock_path, &mut ctx)?; + + #[cfg(feature = "virtio-blk")] + self.block_manager + .attach_devices(&mut ctx) + .map_err(StartMicroVmError::BlockDeviceError)?; + + #[cfg(feature = "virtio-fs")] + { + let mut fs_manager = self.fs_manager.lock().unwrap(); + fs_manager + .attach_devices(&mut ctx) + .map_err(StartMicroVmError::FsDeviceError)?; + } + + #[cfg(feature = "virtio-net")] + self.virtio_net_manager + .attach_devices(&mut ctx) + .map_err(StartMicroVmError::VirtioNetDeviceError)?; + + #[cfg(feature = "virtio-vsock")] + self.vsock_manager.attach_devices(&mut ctx)?; + + #[cfg(feature = "virtio-blk")] + self.block_manager + .generate_kernel_boot_args(kernel_config) + .map_err(StartMicroVmError::DeviceManager)?; + ctx.generate_kernel_boot_args(kernel_config) + .map_err(StartMicroVmError::DeviceManager)?; + + #[cfg(target_arch = "aarch64")] + { + let dev_info = ctx + .generate_virtio_device_info() + .map_err(StartMicroVmError::DeviceManager)?; + self.mmio_device_info.extend(dev_info); + } + + Ok(()) + } + + /// Start all registered devices when booting the associated virtual machine. + pub fn start_devices(&mut self) -> std::result::Result<(), StartMicroVmError> { + // TODO: add vfio support here. issue #4589. 
+ Ok(()) + } + + /// Remove all devices when shutdown the associated virtual machine + pub fn remove_devices( + &mut self, + vm_as: GuestAddressSpaceImpl, + epoll_mgr: EpollManager, + address_space: Option<&AddressSpace>, + ) -> Result<()> { + // create context for removing devices + let mut ctx = DeviceOpContext::new( + Some(epoll_mgr), + self, + Some(vm_as), + address_space.cloned(), + true, + ); + + #[cfg(feature = "virtio-blk")] + self.block_manager.remove_devices(&mut ctx)?; + Ok(()) + } +} + +#[cfg(target_arch = "x86_64")] +impl DeviceManager { + /// Get the underlying eventfd for vm exit notification. + pub fn get_reset_eventfd(&self) -> Result { + if let Some(legacy) = self.legacy_manager.as_ref() { + legacy + .get_reset_eventfd() + .map_err(DeviceMgrError::LegacyManager) + } else { + Err(DeviceMgrError::LegacyManager(legacy::Error::EventFd( + io::Error::from_raw_os_error(libc::ENOENT), + ))) + } + } +} + +#[cfg(target_arch = "aarch64")] +impl DeviceManager { + /// Return mmio device info for FDT build. 
+ pub fn get_mmio_device_info(&self) -> Option<&HashMap<(DeviceType, String), MMIODeviceInfo>> { + Some(&self.mmio_device_info) + } + + #[cfg(feature = "dbs-virtio-devices")] + fn get_legacy_resources( + &mut self, + ) -> std::result::Result, StartMicroVmError> { + let mut resources = HashMap::new(); + let legacy_devices = vec![ + (DeviceType::Serial, String::from(COM1)), + (DeviceType::Serial, String::from(COM2)), + (DeviceType::RTC, String::from(RTC)), + ]; + + for (device_type, device_id) in legacy_devices { + let res = self.allocate_mmio_device_resource()?; + self.add_mmio_device_info(&res, device_type, device_id.clone(), None); + resources.insert(device_id.clone(), res); + } + + Ok(resources) + } + + fn mmio_device_info_to_resources( + &self, + key: &(DeviceType, String), + ) -> std::result::Result { + self.mmio_device_info + .get(key) + .map(|info| { + let mut resources = DeviceResources::new(); + resources.append(Resource::LegacyIrq(info.irqs[0])); + resources.append(Resource::MmioAddressRange { + base: info.base, + size: info.size, + }); + resources + }) + .ok_or(StartMicroVmError::DeviceManager( + DeviceMgrError::GetDeviceResource, + )) + } + + #[cfg(feature = "dbs-virtio-devices")] + fn allocate_mmio_device_resource( + &self, + ) -> std::result::Result { + let mut requests = Vec::new(); + requests.push(ResourceConstraint::MmioAddress { + range: None, + align: MMIO_DEFAULT_CFG_SIZE, + size: MMIO_DEFAULT_CFG_SIZE, + }); + requests.push(ResourceConstraint::LegacyIrq { irq: None }); + + self.res_manager + .allocate_device_resources(&requests, false) + .map_err(StartMicroVmError::AllocateResource) + } + + fn add_mmio_device_info( + &mut self, + resource: &DeviceResources, + device_type: DeviceType, + device_id: String, + msi_device_id: Option, + ) { + let (base, size) = resource.get_mmio_address_ranges()[0]; + let irq = resource.get_legacy_irq().unwrap(); + self.mmio_device_info.insert( + (device_type, device_id), + MMIODeviceInfo::new(base, size, vec![irq], 
msi_device_id), + ); + } + + #[cfg(feature = "dbs-virtio-devices")] + fn get_virtio_mmio_device_info(device: &Arc) -> Result<(u64, u64, u32)> { + let resources = device.get_assigned_resources(); + let irq = resources + .get_legacy_irq() + .ok_or(DeviceMgrError::GetDeviceResource)?; + + if let Some(mmio_dev) = device.as_any().downcast_ref::() { + if let Resource::MmioAddressRange { base, size } = mmio_dev.get_mmio_cfg_res() { + return Ok((base, size, irq)); + } + } + + Err(DeviceMgrError::GetDeviceResource) + } +} + +#[cfg(feature = "dbs-virtio-devices")] +impl DeviceManager { + fn get_virtio_device_info(device: &Arc) -> Result<(u64, u64, u32)> { + let resources = device.get_assigned_resources(); + let irq = resources + .get_legacy_irq() + .ok_or(DeviceMgrError::GetDeviceResource)?; + let mmio_address_range = device.get_trapped_io_resources().get_mmio_address_ranges(); + + // Assume the first MMIO region is virtio configuration region. + // Virtio-fs needs to pay attention to this assumption. + if let Some(range) = mmio_address_range.into_iter().next() { + Ok((range.0, range.1, irq)) + } else { + Err(DeviceMgrError::GetDeviceResource) + } + } + + /// Create an Virtio MMIO transport layer device for the virtio backend device. + pub fn create_mmio_virtio_device( + device: DbsVirtioDevice, + ctx: &mut DeviceOpContext, + use_shared_irq: bool, + use_generic_irq: bool, + ) -> std::result::Result, DeviceMgrError> { + let features = DRAGONBALL_FEATURE_INTR_USED | DRAGONBALL_FEATURE_PER_QUEUE_NOTIFY; + DeviceManager::create_mmio_virtio_device_with_features( + device, + ctx, + Some(features), + use_shared_irq, + use_generic_irq, + ) + } + + /// Create an Virtio MMIO transport layer device for the virtio backend device with specified + /// features. 
+ pub fn create_mmio_virtio_device_with_features( + device: DbsVirtioDevice, + ctx: &mut DeviceOpContext, + features: Option, + use_shared_irq: bool, + use_generic_irq: bool, + ) -> std::result::Result, DeviceMgrError> { + // Every emulated Virtio MMIO device needs a 4K configuration space, + // and another 4K space for per queue notification. + const MMIO_ADDRESS_DEFAULT: ResourceConstraint = ResourceConstraint::MmioAddress { + range: None, + align: 0, + size: MMIO_DEFAULT_CFG_SIZE + DRAGONBALL_MMIO_DOORBELL_SIZE, + }; + let mut requests = vec![MMIO_ADDRESS_DEFAULT]; + device.get_resource_requirements(&mut requests, use_generic_irq); + let resources = ctx + .res_manager + .allocate_device_resources(&requests, use_shared_irq) + .map_err(|_| DeviceMgrError::GetDeviceResource)?; + + let virtio_dev = match MmioV2Device::new( + ctx.vm_fd.clone(), + ctx.get_vm_as()?, + ctx.irq_manager.clone(), + device, + resources, + features, + ) { + Ok(d) => d, + Err(e) => return Err(DeviceMgrError::Virtio(e)), + }; + + Self::register_mmio_virtio_device(Arc::new(virtio_dev), ctx) + } + + /// Teardown the Virtio MMIO transport layer device associated with the virtio backend device. + pub fn destroy_mmio_virtio_device( + device: Arc, + ctx: &mut DeviceOpContext, + ) -> std::result::Result<(), DeviceMgrError> { + Self::destroy_mmio_device(device.clone(), ctx)?; + + let mmio_dev = device + .as_any() + .downcast_ref::() + .ok_or(DeviceMgrError::InvalidOperation)?; + + mmio_dev.remove(); + + Ok(()) + } + + fn destroy_mmio_device( + device: Arc, + ctx: &mut DeviceOpContext, + ) -> std::result::Result<(), DeviceMgrError> { + // unregister IoManager + Self::deregister_mmio_virtio_device(&device, ctx)?; + + // unregister Resource manager + let resources = device.get_assigned_resources(); + ctx.res_manager + .free_device_resources(&resources) + .map_err(DeviceMgrError::ResourceError)?; + + Ok(()) + } + + /// Create an Virtio MMIO transport layer device for the virtio backend device. 
+ pub fn register_mmio_virtio_device( + device: Arc, + ctx: &mut DeviceOpContext, + ) -> std::result::Result, DeviceMgrError> { + let (mmio_base, mmio_size, irq) = Self::get_virtio_device_info(&device)?; + info!( + ctx.logger(), + "create virtio mmio device 0x{:x}@0x{:x}, irq: 0x{:x}", mmio_size, mmio_base, irq + ); + let resources = device.get_trapped_io_resources(); + + let mut tx = ctx.io_context.begin_tx(); + if let Err(e) = ctx + .io_context + .register_device_io(&mut tx, device.clone(), &resources) + { + ctx.io_context.cancel_tx(tx); + Err(DeviceMgrError::IoManager(e)) + } else { + ctx.virtio_devices.push(device.clone()); + ctx.io_context.commit_tx(tx); + Ok(device) + } + } + + /// Deregister a Virtio MMIO device from IoManager + pub fn deregister_mmio_virtio_device( + device: &Arc, + ctx: &mut DeviceOpContext, + ) -> std::result::Result<(), DeviceMgrError> { + let resources = device.get_trapped_io_resources(); + info!( + ctx.logger(), + "unregister mmio virtio device: {:?}", resources + ); + let mut tx = ctx.io_context.begin_tx(); + if let Err(e) = ctx.io_context.unregister_device_io(&mut tx, &resources) { + ctx.io_context.cancel_tx(tx); + Err(DeviceMgrError::IoManager(e)) + } else { + ctx.io_context.commit_tx(tx); + Ok(()) + } + } +} + +#[cfg(feature = "hotplug")] +impl DeviceManager { + /// Get Unix Domain Socket path for the vsock device. + pub fn get_vsock_inner_connector(&mut self) -> Option { + #[cfg(feature = "virtio-vsock")] + { + self.vsock_manager + .get_default_connector() + .map(|d| Some(d)) + .unwrap_or(None) + } + #[cfg(not(feature = "virtio-vsock"))] + { + return None; + } + } +} diff --git a/src/dragonball/src/device_manager/virtio_net_dev_mgr.rs b/src/dragonball/src/device_manager/virtio_net_dev_mgr.rs new file mode 100644 index 0000000000..3e81f29487 --- /dev/null +++ b/src/dragonball/src/device_manager/virtio_net_dev_mgr.rs @@ -0,0 +1,387 @@ +// Copyright 2020-2022 Alibaba, Inc. or its affiliates. All Rights Reserved. 
+// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +use std::convert::TryInto; +use std::sync::Arc; + +use dbs_utils::net::{MacAddr, Tap, TapError}; +use dbs_utils::rate_limiter::BucketUpdate; +use dbs_virtio_devices as virtio; +use dbs_virtio_devices::net::Net; +use dbs_virtio_devices::Error as VirtioError; +use serde_derive::{Deserialize, Serialize}; + +use crate::address_space_manager::GuestAddressSpaceImpl; +use crate::config_manager::{ + ConfigItem, DeviceConfigInfo, DeviceConfigInfos, RateLimiterConfigInfo, +}; +use crate::device_manager::{DeviceManager, DeviceMgrError, DeviceOpContext}; +use crate::get_bucket_update; + +use super::DbsMmioV2Device; + +/// Default number of virtio queues, one rx/tx pair. +pub const NUM_QUEUES: usize = 2; +/// Default size of virtio queues. +pub const QUEUE_SIZE: u16 = 256; +// The flag of whether to use the shared irq. +const USE_SHARED_IRQ: bool = true; +// The flag of whether to use the generic irq. +const USE_GENERIC_IRQ: bool = true; + +/// Errors associated with virtio net device operations. +#[derive(Debug, thiserror::Error)] +pub enum VirtioNetDeviceError { + /// The virtual machine instance ID is invalid. + #[error("the virtual machine instance ID is invalid")] + InvalidVMID, + + /// The iface ID is invalid. + #[error("invalid virtio-net iface id '{0}'")] + InvalidIfaceId(String), + + /// Invalid queue number configuration for virtio_net device. + #[error("invalid queue number {0} for virtio-net device")] + InvalidQueueNum(usize), + + /// Failure from device manager, + #[error("failure in device manager operations, {0}")] + DeviceManager(#[source] DeviceMgrError), + + /// The Context Identifier is already in use. 
+ #[error("the device ID {0} already exists")] + DeviceIDAlreadyExist(String), + + /// The MAC address is already in use. + #[error("the guest MAC address {0} is already in use")] + GuestMacAddressInUse(String), + + /// The host device name is already in use. + #[error("the host device name {0} is already in use")] + HostDeviceNameInUse(String), + + /// Cannot open/create tap device. + #[error("cannot open TAP device")] + OpenTap(#[source] TapError), + + /// Failure from virtio subsystem. + #[error(transparent)] + Virtio(VirtioError), + + /// Failed to send patch message to net epoll handler. + #[error("could not send patch message to the net epoll handler")] + NetEpollHanderSendFail, + + /// The update is not allowed after booting the microvm. + #[error("update operation is not allowed after boot")] + UpdateNotAllowedPostBoot, + + /// Split this at some point. + /// Internal errors are due to resource exhaustion. + /// Users errors are due to invalid permissions. + #[error("cannot create network device: {0}")] + CreateNetDevice(#[source] VirtioError), + + /// Cannot initialize a MMIO Network Device or add a device to the MMIO Bus. + #[error("failure while registering network device: {0}")] + RegisterNetDevice(#[source] DeviceMgrError), +} + +/// Configuration information for virtio net devices. +#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] +pub struct VirtioNetDeviceConfigUpdateInfo { + /// ID of the guest network interface. + pub iface_id: String, + /// Rate Limiter for received packages. + pub rx_rate_limiter: Option, + /// Rate Limiter for transmitted packages. + pub tx_rate_limiter: Option, +} + +impl VirtioNetDeviceConfigUpdateInfo { + /// Provides a `BucketUpdate` description for the RX bandwidth rate limiter. + pub fn rx_bytes(&self) -> BucketUpdate { + get_bucket_update!(self, rx_rate_limiter, bandwidth) + } + /// Provides a `BucketUpdate` description for the RX ops rate limiter. 
+ pub fn rx_ops(&self) -> BucketUpdate { + get_bucket_update!(self, rx_rate_limiter, ops) + } + /// Provides a `BucketUpdate` description for the TX bandwidth rate limiter. + pub fn tx_bytes(&self) -> BucketUpdate { + get_bucket_update!(self, tx_rate_limiter, bandwidth) + } + /// Provides a `BucketUpdate` description for the TX ops rate limiter. + pub fn tx_ops(&self) -> BucketUpdate { + get_bucket_update!(self, tx_rate_limiter, ops) + } +} + +/// Configuration information for virtio net devices. +#[derive(Clone, Debug, Deserialize, PartialEq, Serialize, Default)] +pub struct VirtioNetDeviceConfigInfo { + /// ID of the guest network interface. + pub iface_id: String, + /// Host level path for the guest network interface. + pub host_dev_name: String, + /// Number of virtqueues to use. + pub num_queues: usize, + /// Size of each virtqueue. Unit: byte. + pub queue_size: u16, + /// Guest MAC address. + pub guest_mac: Option, + /// Rate Limiter for received packages. + pub rx_rate_limiter: Option, + /// Rate Limiter for transmitted packages. + pub tx_rate_limiter: Option, + /// allow duplicate mac + pub allow_duplicate_mac: bool, + /// Use shared irq + pub use_shared_irq: Option, + /// Use generic irq + pub use_generic_irq: Option, +} + +impl VirtioNetDeviceConfigInfo { + /// Returns the tap device that `host_dev_name` refers to. + pub fn open_tap(&self) -> std::result::Result { + Tap::open_named(self.host_dev_name.as_str(), false).map_err(VirtioNetDeviceError::OpenTap) + } + + /// Returns a reference to the mac address. It the mac address is not configured, it + /// return None. 
+ pub fn guest_mac(&self) -> Option<&MacAddr> { + self.guest_mac.as_ref() + } + + ///Rx and Tx queue and max queue sizes + pub fn queue_sizes(&self) -> Vec { + let mut queue_size = self.queue_size; + if queue_size == 0 { + queue_size = QUEUE_SIZE; + } + let num_queues = if self.num_queues > 0 { + self.num_queues + } else { + NUM_QUEUES + }; + + (0..num_queues).map(|_| queue_size).collect::>() + } +} + +impl ConfigItem for VirtioNetDeviceConfigInfo { + type Err = VirtioNetDeviceError; + + fn id(&self) -> &str { + &self.iface_id + } + + fn check_conflicts(&self, other: &Self) -> Result<(), VirtioNetDeviceError> { + if self.iface_id == other.iface_id { + Err(VirtioNetDeviceError::DeviceIDAlreadyExist( + self.iface_id.clone(), + )) + } else if !other.allow_duplicate_mac + && self.guest_mac.is_some() + && self.guest_mac == other.guest_mac + { + Err(VirtioNetDeviceError::GuestMacAddressInUse( + self.guest_mac.as_ref().unwrap().to_string(), + )) + } else if self.host_dev_name == other.host_dev_name { + Err(VirtioNetDeviceError::HostDeviceNameInUse( + self.host_dev_name.clone(), + )) + } else { + Ok(()) + } + } +} + +/// Virtio Net Device Info +pub type VirtioNetDeviceInfo = DeviceConfigInfo; + +/// Device manager to manage all virtio net devices. +pub struct VirtioNetDeviceMgr { + pub(crate) info_list: DeviceConfigInfos, + pub(crate) use_shared_irq: bool, +} + +impl VirtioNetDeviceMgr { + /// Gets the index of the device with the specified `drive_id` if it exists in the list. + pub fn get_index_of_iface_id(&self, if_id: &str) -> Option { + self.info_list + .iter() + .position(|info| info.config.iface_id.eq(if_id)) + } + + /// Insert or update a virtio net device into the manager. 
+ pub fn insert_device( + device_mgr: &mut DeviceManager, + mut ctx: DeviceOpContext, + config: VirtioNetDeviceConfigInfo, + ) -> std::result::Result<(), VirtioNetDeviceError> { + if config.num_queues % 2 != 0 { + return Err(VirtioNetDeviceError::InvalidQueueNum(config.num_queues)); + } + if !cfg!(feature = "hotplug") && ctx.is_hotplug { + return Err(VirtioNetDeviceError::UpdateNotAllowedPostBoot); + } + + let mgr = &mut device_mgr.virtio_net_manager; + + slog::info!( + ctx.logger(), + "add virtio-net device configuration"; + "subsystem" => "net_dev_mgr", + "id" => &config.iface_id, + "host_dev_name" => &config.host_dev_name, + ); + + let device_index = mgr.info_list.insert_or_update(&config)?; + + if ctx.is_hotplug { + slog::info!( + ctx.logger(), + "attach virtio-net device"; + "subsystem" => "net_dev_mgr", + "id" => &config.iface_id, + "host_dev_name" => &config.host_dev_name, + ); + + match Self::create_device(&config, &mut ctx) { + Ok(device) => { + let dev = DeviceManager::create_mmio_virtio_device( + device, + &mut ctx, + config.use_shared_irq.unwrap_or(mgr.use_shared_irq), + config.use_generic_irq.unwrap_or(USE_GENERIC_IRQ), + ) + .map_err(VirtioNetDeviceError::DeviceManager)?; + ctx.insert_hotplug_mmio_device(&dev.clone(), None) + .map_err(VirtioNetDeviceError::DeviceManager)?; + // live-upgrade need save/restore device from info.device. + mgr.info_list[device_index].set_device(dev); + } + Err(e) => { + mgr.info_list.remove(device_index); + return Err(VirtioNetDeviceError::Virtio(e)); + } + } + } + + Ok(()) + } + + /// Update the ratelimiter settings of a virtio net device. 
+ pub fn update_device_ratelimiters( + device_mgr: &mut DeviceManager, + new_cfg: VirtioNetDeviceConfigUpdateInfo, + ) -> std::result::Result<(), VirtioNetDeviceError> { + let mgr = &mut device_mgr.virtio_net_manager; + match mgr.get_index_of_iface_id(&new_cfg.iface_id) { + Some(index) => { + let config = &mut mgr.info_list[index].config; + config.rx_rate_limiter = new_cfg.rx_rate_limiter.clone(); + config.tx_rate_limiter = new_cfg.tx_rate_limiter.clone(); + let device = mgr.info_list[index].device.as_mut().ok_or_else(|| { + VirtioNetDeviceError::InvalidIfaceId(new_cfg.iface_id.clone()) + })?; + + if let Some(mmio_dev) = device.as_any().downcast_ref::() { + let guard = mmio_dev.state(); + let inner_dev = guard.get_inner_device(); + if let Some(net_dev) = inner_dev + .as_any() + .downcast_ref::>() + { + return net_dev + .set_patch_rate_limiters( + new_cfg.rx_bytes(), + new_cfg.rx_ops(), + new_cfg.tx_bytes(), + new_cfg.tx_ops(), + ) + .map(|_p| ()) + .map_err(|_e| VirtioNetDeviceError::NetEpollHanderSendFail); + } + } + Ok(()) + } + None => Err(VirtioNetDeviceError::InvalidIfaceId( + new_cfg.iface_id.clone(), + )), + } + } + + /// Attach all configured vsock device to the virtual machine instance. 
+ pub fn attach_devices( + &mut self, + ctx: &mut DeviceOpContext, + ) -> std::result::Result<(), VirtioNetDeviceError> { + for info in self.info_list.iter_mut() { + slog::info!( + ctx.logger(), + "attach virtio-net device"; + "subsystem" => "net_dev_mgr", + "id" => &info.config.iface_id, + "host_dev_name" => &info.config.host_dev_name, + ); + + let device = Self::create_device(&info.config, ctx) + .map_err(VirtioNetDeviceError::CreateNetDevice)?; + let device = DeviceManager::create_mmio_virtio_device( + device, + ctx, + info.config.use_shared_irq.unwrap_or(self.use_shared_irq), + info.config.use_generic_irq.unwrap_or(USE_GENERIC_IRQ), + ) + .map_err(VirtioNetDeviceError::RegisterNetDevice)?; + info.set_device(device); + } + + Ok(()) + } + + fn create_device( + cfg: &VirtioNetDeviceConfigInfo, + ctx: &mut DeviceOpContext, + ) -> std::result::Result>, virtio::Error> { + let epoll_mgr = ctx.epoll_mgr.clone().ok_or(virtio::Error::InvalidInput)?; + let rx_rate_limiter = match cfg.rx_rate_limiter.as_ref() { + Some(rl) => Some(rl.try_into().map_err(virtio::Error::IOError)?), + None => None, + }; + let tx_rate_limiter = match cfg.tx_rate_limiter.as_ref() { + Some(rl) => Some(rl.try_into().map_err(virtio::Error::IOError)?), + None => None, + }; + + let net_device = Net::new( + cfg.host_dev_name.clone(), + cfg.guest_mac(), + Arc::new(cfg.queue_sizes()), + epoll_mgr, + rx_rate_limiter, + tx_rate_limiter, + )?; + + Ok(Box::new(net_device)) + } +} + +impl Default for VirtioNetDeviceMgr { + /// Create a new virtio net device manager. + fn default() -> Self { + VirtioNetDeviceMgr { + info_list: DeviceConfigInfos::new(), + use_shared_irq: USE_SHARED_IRQ, + } + } +} diff --git a/src/dragonball/src/device_manager/vsock_dev_mgr.rs b/src/dragonball/src/device_manager/vsock_dev_mgr.rs new file mode 100644 index 0000000000..4f0f074134 --- /dev/null +++ b/src/dragonball/src/device_manager/vsock_dev_mgr.rs @@ -0,0 +1,299 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. 
+// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +use std::sync::Arc; + +use dbs_virtio_devices as virtio; +use dbs_virtio_devices::mmio::DRAGONBALL_FEATURE_INTR_USED; +use dbs_virtio_devices::vsock::backend::{ + VsockInnerBackend, VsockInnerConnector, VsockTcpBackend, VsockUnixStreamBackend, +}; +use dbs_virtio_devices::vsock::Vsock; +use dbs_virtio_devices::Error as VirtioError; +use serde_derive::{Deserialize, Serialize}; + +use super::StartMicroVmError; +use crate::config_manager::{ConfigItem, DeviceConfigInfo, DeviceConfigInfos}; +use crate::device_manager::{DeviceManager, DeviceOpContext}; + +pub use dbs_virtio_devices::vsock::QUEUE_SIZES; + +const SUBSYSTEM: &str = "vsock_dev_mgr"; +// The flag of whether to use the shared irq. +const USE_SHARED_IRQ: bool = true; +// The flag of whether to use the generic irq. +const USE_GENERIC_IRQ: bool = true; + +/// Errors associated with `VsockDeviceConfigInfo`. +#[derive(Debug, thiserror::Error)] +pub enum VsockDeviceError { + /// The virtual machine instance ID is invalid. + #[error("the virtual machine instance ID is invalid")] + InvalidVMID, + + /// The Context Identifier is already in use. + #[error("the device ID {0} already exists")] + DeviceIDAlreadyExist(String), + + /// The Context Identifier is invalid. + #[error("the guest CID {0} is invalid")] + GuestCIDInvalid(u32), + + /// The Context Identifier is already in use. + #[error("the guest CID {0} is already in use")] + GuestCIDAlreadyInUse(u32), + + /// The Unix Domain Socket path is already in use. + #[error("the Unix Domain Socket path {0} is already in use")] + UDSPathAlreadyInUse(String), + + /// The net address is already in use. 
+ #[error("the net address {0} is already in use")] + NetAddrAlreadyInUse(String), + + /// The update is not allowed after booting the microvm. + #[error("update operation is not allowed after boot")] + UpdateNotAllowedPostBoot, + + /// The VsockId Already Exists + #[error("vsock id {0} already exists")] + VsockIdAlreadyExists(String), + + /// Inner backend create error + #[error("vsock inner backend create error: {0}")] + CreateInnerBackend(#[source] std::io::Error), +} + +/// Configuration information for a vsock device. +#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] +pub struct VsockDeviceConfigInfo { + /// ID of the vsock device. + pub id: String, + /// A 32-bit Context Identifier (CID) used to identify the guest. + pub guest_cid: u32, + /// unix domain socket path. + pub uds_path: Option, + /// tcp socket address. + pub tcp_addr: Option, + /// Virtio queue size. + pub queue_size: Vec, + /// Use shared irq + pub use_shared_irq: Option, + /// Use generic irq + pub use_generic_irq: Option, +} + +impl Default for VsockDeviceConfigInfo { + fn default() -> Self { + Self { + id: String::default(), + guest_cid: 0, + uds_path: None, + tcp_addr: None, + queue_size: Vec::from(QUEUE_SIZES), + use_shared_irq: None, + use_generic_irq: None, + } + } +} + +impl VsockDeviceConfigInfo { + /// Get number and size of queues supported. 
+ pub fn queue_sizes(&self) -> Vec { + self.queue_size.clone() + } +} + +impl ConfigItem for VsockDeviceConfigInfo { + type Err = VsockDeviceError; + + fn id(&self) -> &str { + &self.id + } + + fn check_conflicts(&self, other: &Self) -> Result<(), VsockDeviceError> { + if self.id == other.id { + return Err(VsockDeviceError::DeviceIDAlreadyExist(self.id.clone())); + } + if self.guest_cid == other.guest_cid { + return Err(VsockDeviceError::GuestCIDAlreadyInUse(self.guest_cid)); + } + if let (Some(self_uds_path), Some(other_uds_path)) = + (self.uds_path.as_ref(), other.uds_path.as_ref()) + { + if self_uds_path == other_uds_path { + return Err(VsockDeviceError::UDSPathAlreadyInUse(self_uds_path.clone())); + } + } + if let (Some(self_net_addr), Some(other_net_addr)) = + (self.tcp_addr.as_ref(), other.tcp_addr.as_ref()) + { + if self_net_addr == other_net_addr { + return Err(VsockDeviceError::NetAddrAlreadyInUse(self_net_addr.clone())); + } + } + + Ok(()) + } +} + +/// Vsock Device Info +pub type VsockDeviceInfo = DeviceConfigInfo; + +/// Device manager to manage all vsock devices. +pub struct VsockDeviceMgr { + pub(crate) info_list: DeviceConfigInfos, + pub(crate) default_inner_backend: Option, + pub(crate) default_inner_connector: Option, + pub(crate) use_shared_irq: bool, +} + +impl VsockDeviceMgr { + /// Insert or update a vsock device into the manager. 
+ pub fn insert_device( + &mut self, + ctx: DeviceOpContext, + config: VsockDeviceConfigInfo, + ) -> std::result::Result<(), VsockDeviceError> { + if ctx.is_hotplug { + slog::error!( + ctx.logger(), + "no support of virtio-vsock device hotplug"; + "subsystem" => SUBSYSTEM, + "id" => &config.id, + "uds_path" => &config.uds_path, + ); + + return Err(VsockDeviceError::UpdateNotAllowedPostBoot); + } + + // VMADDR_CID_ANY (-1U) means any address for binding; + // VMADDR_CID_HYPERVISOR (0) is reserved for services built into the hypervisor; + // VMADDR_CID_RESERVED (1) must not be used; + // VMADDR_CID_HOST (2) is the well-known address of the host. + if config.guest_cid <= 2 { + return Err(VsockDeviceError::GuestCIDInvalid(config.guest_cid)); + } + + slog::info!( + ctx.logger(), + "add virtio-vsock device configuration"; + "subsystem" => SUBSYSTEM, + "id" => &config.id, + "uds_path" => &config.uds_path, + ); + + self.lazy_make_default_connector()?; + + self.info_list.insert_or_update(&config)?; + + Ok(()) + } + + /// Attach all configured vsock device to the virtual machine instance. 
+ pub fn attach_devices( + &mut self, + ctx: &mut DeviceOpContext, + ) -> std::result::Result<(), StartMicroVmError> { + let epoll_mgr = ctx + .epoll_mgr + .clone() + .ok_or(StartMicroVmError::CreateVsockDevice( + virtio::Error::InvalidInput, + ))?; + + for info in self.info_list.iter_mut() { + slog::info!( + ctx.logger(), + "attach virtio-vsock device"; + "subsystem" => SUBSYSTEM, + "id" => &info.config.id, + "uds_path" => &info.config.uds_path, + ); + + let mut device = Box::new( + Vsock::new( + info.config.guest_cid as u64, + Arc::new(info.config.queue_sizes()), + epoll_mgr.clone(), + ) + .map_err(VirtioError::VirtioVsockError) + .map_err(StartMicroVmError::CreateVsockDevice)?, + ); + if let Some(uds_path) = info.config.uds_path.as_ref() { + let unix_backend = VsockUnixStreamBackend::new(uds_path.clone()) + .map_err(VirtioError::VirtioVsockError) + .map_err(StartMicroVmError::CreateVsockDevice)?; + device + .add_backend(Box::new(unix_backend), true) + .map_err(VirtioError::VirtioVsockError) + .map_err(StartMicroVmError::CreateVsockDevice)?; + } + if let Some(tcp_addr) = info.config.tcp_addr.as_ref() { + let tcp_backend = VsockTcpBackend::new(tcp_addr.clone()) + .map_err(VirtioError::VirtioVsockError) + .map_err(StartMicroVmError::CreateVsockDevice)?; + device + .add_backend(Box::new(tcp_backend), false) + .map_err(VirtioError::VirtioVsockError) + .map_err(StartMicroVmError::CreateVsockDevice)?; + } + // add inner backend to the the first added vsock device + if let Some(inner_backend) = self.default_inner_backend.take() { + device + .add_backend(Box::new(inner_backend), false) + .map_err(VirtioError::VirtioVsockError) + .map_err(StartMicroVmError::CreateVsockDevice)?; + } + let device = DeviceManager::create_mmio_virtio_device_with_features( + device, + ctx, + Some(DRAGONBALL_FEATURE_INTR_USED), + info.config.use_shared_irq.unwrap_or(self.use_shared_irq), + info.config.use_generic_irq.unwrap_or(USE_GENERIC_IRQ), + ) + 
.map_err(StartMicroVmError::RegisterVsockDevice)?; + info.device = Some(device); + } + + Ok(()) + } + + // check the default connector is present, or build it. + fn lazy_make_default_connector(&mut self) -> std::result::Result<(), VsockDeviceError> { + if self.default_inner_connector.is_none() { + let inner_backend = + VsockInnerBackend::new().map_err(VsockDeviceError::CreateInnerBackend)?; + self.default_inner_connector = Some(inner_backend.get_connector()); + self.default_inner_backend = Some(inner_backend); + } + Ok(()) + } + + /// Get the default vsock inner connector. + pub fn get_default_connector( + &mut self, + ) -> std::result::Result { + self.lazy_make_default_connector()?; + + // safe to unwrap, because we created the inner connector before + Ok(self.default_inner_connector.clone().unwrap()) + } +} + +impl Default for VsockDeviceMgr { + /// Create a new Vsock device manager. + fn default() -> Self { + VsockDeviceMgr { + info_list: DeviceConfigInfos::new(), + default_inner_backend: None, + default_inner_connector: None, + use_shared_irq: USE_SHARED_IRQ, + } + } +} diff --git a/src/dragonball/src/error.rs b/src/dragonball/src/error.rs new file mode 100644 index 0000000000..35cf639bb2 --- /dev/null +++ b/src/dragonball/src/error.rs @@ -0,0 +1,224 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file + +//! Error codes for the virtual machine monitor subsystem. + +#[cfg(feature = "dbs-virtio-devices")] +use dbs_virtio_devices::Error as VirtIoError; + +use crate::{address_space_manager, device_manager, resource_manager, vcpu, vm}; + +/// Shorthand result type for internal VMM commands. 
+pub type Result = std::result::Result; + +/// Errors associated with the VMM internal logic. +/// +/// These errors cannot be generated by direct user input, but can result from bad configuration +/// of the host (for example if Dragonball doesn't have permissions to open the KVM fd). +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Empty AddressSpace from parameters. + #[error("Empty AddressSpace from parameters")] + AddressSpace, + + /// The zero page extends past the end of guest_mem. + #[error("the guest zero page extends past the end of guest memory")] + ZeroPagePastRamEnd, + + /// Error writing the zero page of guest memory. + #[error("failed to write to guest zero page")] + ZeroPageSetup, + + /// Failure occurs in issuing KVM ioctls and errors will be returned from kvm_ioctls lib. + #[error("failure in issuing KVM ioctl command: {0}")] + Kvm(#[source] kvm_ioctls::Error), + + /// The host kernel reports an unsupported KVM API version. + #[error("unsupported KVM version {0}")] + KvmApiVersion(i32), + + /// Cannot initialize the KVM context due to missing capabilities. + #[error("missing KVM capability: {0:?}")] + KvmCap(kvm_ioctls::Cap), + + #[cfg(target_arch = "x86_64")] + #[error("failed to configure MSRs: {0:?}")] + /// Cannot configure MSRs + GuestMSRs(dbs_arch::msr::Error), + + /// MSR inner error + #[error("MSR inner error")] + Msr(vmm_sys_util::fam::Error), + + /// Error writing MP table to memory. + #[cfg(target_arch = "x86_64")] + #[error("failed to write MP table to guest memory: {0}")] + MpTableSetup(#[source] dbs_boot::mptable::Error), + + /// Fail to boot system + #[error("failed to boot system: {0}")] + BootSystem(#[source] dbs_boot::Error), + + /// Cannot open the VM file descriptor. + #[error(transparent)] + Vm(vm::VmError), +} + +/// Errors associated with starting the instance. +#[derive(Debug, thiserror::Error)] +pub enum StartMicroVmError { + /// Failed to allocate resources. 
+ #[error("cannot allocate resources")] + AllocateResource(#[source] resource_manager::ResourceError), + + /// Cannot read from an Event file descriptor. + #[error("failure while reading from EventFd file descriptor")] + EventFd, + + /// Cannot add event to Epoll. + #[error("failure while registering epoll event for file descriptor")] + RegisterEvent, + + /// The start command was issued more than once. + #[error("the virtual machine is already running")] + MicroVMAlreadyRunning, + + /// Cannot start the VM because the kernel was not configured. + #[error("cannot start the virtual machine without kernel configuration")] + MissingKernelConfig, + + #[cfg(feature = "hotplug")] + /// Upcall initialize miss vsock device. + #[error("the upcall client needs a virtio-vsock device for communication")] + UpcallMissVsock, + + /// Upcall is not ready + #[error("the upcall client is not ready")] + UpcallNotReady, + + /// Configuration passed in is invalidate. + #[error("invalid virtual machine configuration: {0} ")] + ConfigureInvalid(String), + + /// This error is thrown by the minimal boot loader implementation. + /// It is related to a faulty memory configuration. + #[error("failure while configuring boot information for the virtual machine: {0}")] + ConfigureSystem(#[source] Error), + + /// Cannot configure the VM. + #[error("failure while configuring the virtual machine: {0}")] + ConfigureVm(#[source] vm::VmError), + + /// Cannot load initrd. + #[error("cannot load Initrd into guest memory: {0}")] + InitrdLoader(#[from] LoadInitrdError), + + /// Cannot load kernel due to invalid memory configuration or invalid kernel image. + #[error("cannot load guest kernel into guest memory: {0}")] + KernelLoader(#[source] linux_loader::loader::Error), + + /// Cannot load command line string. + #[error("failure while configuring guest kernel commandline: {0}")] + LoadCommandline(#[source] linux_loader::loader::Error), + + /// The device manager was not configured. 
+ #[error("the device manager failed to manage devices: {0}")] + DeviceManager(#[source] device_manager::DeviceMgrError), + + /// Cannot add devices to the Legacy I/O Bus. + #[error("failure in managing legacy device: {0}")] + LegacyDevice(#[source] device_manager::LegacyDeviceError), + + #[cfg(feature = "virtio-vsock")] + /// Failed to create the vsock device. + #[error("cannot create virtio-vsock device: {0}")] + CreateVsockDevice(#[source] VirtIoError), + + #[cfg(feature = "virtio-vsock")] + /// Cannot initialize a MMIO Vsock Device or add a device to the MMIO Bus. + #[error("failure while registering virtio-vsock device: {0}")] + RegisterVsockDevice(#[source] device_manager::DeviceMgrError), + + /// Address space manager related error, e.g.cannot access guest address space manager. + #[error("address space manager related error: {0}")] + AddressManagerError(#[source] address_space_manager::AddressManagerError), + + /// Cannot create a new vCPU file descriptor. + #[error("vCPU related error: {0}")] + Vcpu(#[source] vcpu::VcpuManagerError), + + #[cfg(all(feature = "hotplug", feature = "dbs-upcall"))] + /// Upcall initialize Error. + #[error("failure while initializing the upcall client: {0}")] + UpcallInitError(#[source] dbs_upcall::UpcallClientError), + + #[cfg(all(feature = "hotplug", feature = "dbs-upcall"))] + /// Upcall connect Error. + #[error("failure while connecting the upcall client: {0}")] + UpcallConnectError(#[source] dbs_upcall::UpcallClientError), + + #[cfg(feature = "virtio-blk")] + /// Virtio-blk errors. + #[error("virtio-blk errors: {0}")] + BlockDeviceError(#[source] device_manager::blk_dev_mgr::BlockDeviceError), + + #[cfg(feature = "virtio-net")] + /// Virtio-net errors. + #[error("virtio-net errors: {0}")] + VirtioNetDeviceError(#[source] device_manager::virtio_net_dev_mgr::VirtioNetDeviceError), + + #[cfg(feature = "virtio-fs")] + /// Virtio-fs errors. 
+ #[error("virtio-fs errors: {0}")] + FsDeviceError(#[source] device_manager::fs_dev_mgr::FsDeviceError), +} + +/// Errors associated with starting the instance. +#[derive(Debug, thiserror::Error)] +pub enum StopMicrovmError { + /// Guest memory has not been initialized. + #[error("Guest memory has not been initialized")] + GuestMemoryNotInitialized, + + /// Cannnot remove devices + #[error("Failed to remove devices in device_manager {0}")] + DeviceManager(#[source] device_manager::DeviceMgrError), +} + +/// Errors associated with loading initrd +#[derive(Debug, thiserror::Error)] +pub enum LoadInitrdError { + /// Cannot load initrd due to an invalid memory configuration. + #[error("failed to load the initrd image to guest memory")] + LoadInitrd, + /// Cannot load initrd due to an invalid image. + #[error("failed to read the initrd image: {0}")] + ReadInitrd(#[source] std::io::Error), +} + +/// A dedicated error type to glue with the vmm_epoll crate. +#[derive(Debug, thiserror::Error)] +pub enum EpollError { + /// Generic internal error. + #[error("unclassfied internal error")] + InternalError, + + /// Errors from the epoll subsystem. + #[error("failed to issue epoll syscall: {0}")] + EpollMgr(#[from] dbs_utils::epoll_manager::Error), + + /// Generic IO errors. + #[error(transparent)] + IOError(std::io::Error), + + #[cfg(feature = "dbs-virtio-devices")] + /// Errors from virtio devices. + #[error("failed to manager Virtio device: {0}")] + VirtIoDevice(#[source] VirtIoError), +} diff --git a/src/dragonball/src/event_manager.rs b/src/dragonball/src/event_manager.rs new file mode 100644 index 0000000000..f07b786506 --- /dev/null +++ b/src/dragonball/src/event_manager.rs @@ -0,0 +1,169 @@ +// Copyright (C) 2020-2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +//! Event manager to manage and handle IO events and requests from API server . + +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex}; + +use dbs_utils::epoll_manager::{ + EpollManager, EventOps, EventSet, Events, MutEventSubscriber, SubscriberId, +}; +use log::{error, warn}; +use vmm_sys_util::eventfd::EventFd; + +use crate::error::{EpollError, Result}; +use crate::vmm::Vmm; + +// Statically assigned epoll slot for VMM events. +pub(crate) const EPOLL_EVENT_EXIT: u32 = 0; +pub(crate) const EPOLL_EVENT_API_REQUEST: u32 = 1; + +/// Shared information between vmm::vmm_thread_event_loop() and VmmEpollHandler. +pub(crate) struct EventContext { + pub api_event_fd: EventFd, + pub api_event_triggered: bool, + pub exit_evt_triggered: bool, +} + +impl EventContext { + /// Create a new instance of [`EventContext`]. + pub fn new(api_event_fd: EventFd) -> Result { + Ok(EventContext { + api_event_fd, + api_event_triggered: false, + exit_evt_triggered: false, + }) + } +} + +/// Event manager for VMM to handle API requests and IO events. +pub struct EventManager { + epoll_mgr: EpollManager, + subscriber_id: SubscriberId, + vmm_event_count: Arc, +} + +impl Drop for EventManager { + fn drop(&mut self) { + // Vmm -> Vm -> EpollManager -> VmmEpollHandler -> Vmm + // We need to remove VmmEpollHandler to break the circular reference + // so that Vmm can drop. + self.epoll_mgr + .remove_subscriber(self.subscriber_id) + .map_err(|e| { + error!("event_manager: remove_subscriber err. {:?}", e); + e + }) + .ok(); + } +} + +impl EventManager { + /// Create a new event manager associated with the VMM object. 
+ pub fn new(vmm: &Arc>, epoll_mgr: EpollManager) -> Result { + let vmm_event_count = Arc::new(AtomicUsize::new(0)); + let handler: Box = Box::new(VmmEpollHandler { + vmm: vmm.clone(), + vmm_event_count: vmm_event_count.clone(), + }); + let subscriber_id = epoll_mgr.add_subscriber(handler); + + Ok(EventManager { + epoll_mgr, + subscriber_id, + vmm_event_count, + }) + } + + /// Get the underlying epoll event manager. + pub fn epoll_manager(&self) -> EpollManager { + self.epoll_mgr.clone() + } + + /// Registry the eventfd for exit notification. + pub fn register_exit_eventfd( + &mut self, + exit_evt: &EventFd, + ) -> std::result::Result<(), EpollError> { + let events = Events::with_data(exit_evt, EPOLL_EVENT_EXIT, EventSet::IN); + + self.epoll_mgr + .add_event(self.subscriber_id, events) + .map_err(EpollError::EpollMgr) + } + + /// Poll pending events and invoke registered event handler. + /// + /// # Arguments: + /// * max_events: maximum number of pending events to handle + /// * timeout: maximum time in milliseconds to wait + pub fn handle_events(&self, timeout: i32) -> std::result::Result { + self.epoll_mgr + .handle_events(timeout) + .map_err(EpollError::EpollMgr) + } + + /// Fetch the VMM event count and reset it to zero. + pub fn fetch_vmm_event_count(&self) -> usize { + self.vmm_event_count.swap(0, Ordering::AcqRel) + } +} + +struct VmmEpollHandler { + vmm: Arc>, + vmm_event_count: Arc, +} + +impl MutEventSubscriber for VmmEpollHandler { + fn process(&mut self, events: Events, _ops: &mut EventOps) { + // Do not try to recover when the lock has already been poisoned. + // And be careful to avoid deadlock between process() and vmm::vmm_thread_event_loop(). 
+ let mut vmm = self.vmm.lock().unwrap(); + + match events.data() { + EPOLL_EVENT_API_REQUEST => { + if let Err(e) = vmm.event_ctx.api_event_fd.read() { + error!("event_manager: failed to read API eventfd, {:?}", e); + } + vmm.event_ctx.api_event_triggered = true; + self.vmm_event_count.fetch_add(1, Ordering::AcqRel); + } + EPOLL_EVENT_EXIT => { + let vm = vmm.get_vm().unwrap(); + match vm.get_reset_eventfd() { + Some(ev) => { + if let Err(e) = ev.read() { + error!("event_manager: failed to read exit eventfd, {:?}", e); + } + } + None => warn!("event_manager: leftover exit event in epoll context!"), + } + vmm.event_ctx.exit_evt_triggered = true; + self.vmm_event_count.fetch_add(1, Ordering::AcqRel); + } + _ => error!("event_manager: unknown epoll slot number {}", events.data()), + } + } + + fn init(&mut self, ops: &mut EventOps) { + // Do not expect poisoned lock. + let vmm = self.vmm.lock().unwrap(); + let events = Events::with_data( + &vmm.event_ctx.api_event_fd, + EPOLL_EVENT_API_REQUEST, + EventSet::IN, + ); + if let Err(e) = ops.add(events) { + error!( + "event_manager: failed to register epoll event for API server, {:?}", + e + ); + } + } +} diff --git a/src/dragonball/src/io_manager.rs b/src/dragonball/src/io_manager.rs new file mode 100644 index 0000000000..410703bc7a --- /dev/null +++ b/src/dragonball/src/io_manager.rs @@ -0,0 +1,60 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::sync::Arc; + +use arc_swap::{ArcSwap, Cache}; +use dbs_device::device_manager::Error; +use dbs_device::device_manager::IoManager; + +/// A specialized version of [`std::result::Result`] for IO manager related operations. +pub type Result = std::result::Result; + +/// Wrapper over IoManager to support device hotplug with [`ArcSwap`] and [`Cache`]. +#[derive(Clone)] +pub struct IoManagerCached(pub(crate) Cache>, Arc>); + +impl IoManagerCached { + /// Create a new instance of [`IoManagerCached`]. 
+ pub fn new(io_manager: Arc>) -> Self { + IoManagerCached(Cache::new(io_manager)) + } + + #[cfg(target_arch = "x86_64")] + #[inline] + /// Read data from IO ports. + pub fn pio_read(&mut self, addr: u16, data: &mut [u8]) -> Result<()> { + self.0.load().pio_read(addr, data) + } + + #[cfg(target_arch = "x86_64")] + #[inline] + /// Write data to IO ports. + pub fn pio_write(&mut self, addr: u16, data: &[u8]) -> Result<()> { + self.0.load().pio_write(addr, data) + } + + #[inline] + /// Read data from MMIO address. + pub fn mmio_read(&mut self, addr: u64, data: &mut [u8]) -> Result<()> { + self.0.load().mmio_read(addr, data) + } + + #[inline] + /// Write data to MMIO address. + pub fn mmio_write(&mut self, addr: u64, data: &[u8]) -> Result<()> { + self.0.load().mmio_write(addr, data) + } + + #[inline] + /// Revalidate the inner cache. + pub fn revalidate_cache(&mut self) { + let _ = self.0.load(); + } + + #[inline] + /// Get immutable reference to underlying [`IoManager`]. + pub fn load(&mut self) -> &IoManager { + self.0.load() + } +} diff --git a/src/dragonball/src/kvm_context.rs b/src/dragonball/src/kvm_context.rs new file mode 100644 index 0000000000..f160b264b8 --- /dev/null +++ b/src/dragonball/src/kvm_context.rs @@ -0,0 +1,251 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. +#![allow(dead_code)] +use kvm_bindings::KVM_API_VERSION; +use kvm_ioctls::{Cap, Kvm, VmFd}; +use std::os::unix::io::{FromRawFd, RawFd}; + +use crate::error::{Error, Result}; + +/// Describes a KVM context that gets attached to the micro VM instance. +/// It gives access to the functionality of the KVM wrapper as long as every required +/// KVM capability is present on the host. 
+pub struct KvmContext { + kvm: Kvm, + max_memslots: usize, + #[cfg(target_arch = "x86_64")] + supported_msrs: kvm_bindings::MsrList, +} + +impl KvmContext { + /// Create a new KVM context object, using the provided `kvm_fd` if one is presented. + pub fn new(kvm_fd: Option) -> Result { + let kvm = if let Some(fd) = kvm_fd { + // Safe because we expect kvm_fd to contain a valid fd number when is_some() == true. + unsafe { Kvm::from_raw_fd(fd) } + } else { + Kvm::new().map_err(Error::Kvm)? + }; + + if kvm.get_api_version() != KVM_API_VERSION as i32 { + return Err(Error::KvmApiVersion(kvm.get_api_version())); + } + + Self::check_cap(&kvm, Cap::Irqchip)?; + Self::check_cap(&kvm, Cap::Irqfd)?; + Self::check_cap(&kvm, Cap::Ioeventfd)?; + Self::check_cap(&kvm, Cap::UserMemory)?; + #[cfg(target_arch = "x86_64")] + Self::check_cap(&kvm, Cap::SetTssAddr)?; + + #[cfg(target_arch = "x86_64")] + let supported_msrs = dbs_arch::msr::supported_guest_msrs(&kvm).map_err(Error::GuestMSRs)?; + let max_memslots = kvm.get_nr_memslots(); + + Ok(KvmContext { + kvm, + max_memslots, + #[cfg(target_arch = "x86_64")] + supported_msrs, + }) + } + + /// Get underlying KVM object to access kvm-ioctls interfaces. + pub fn kvm(&self) -> &Kvm { + &self.kvm + } + + /// Get the maximum number of memory slots reported by this KVM context. + pub fn max_memslots(&self) -> usize { + self.max_memslots + } + + /// Create a virtual machine object. 
+ pub fn create_vm(&self) -> Result { + self.kvm.create_vm().map_err(Error::Kvm) + } + + /// Get the max vcpu count supported by kvm + pub fn get_max_vcpus(&self) -> usize { + self.kvm.get_max_vcpus() + } + + fn check_cap(kvm: &Kvm, cap: Cap) -> std::result::Result<(), Error> { + if !kvm.check_extension(cap) { + return Err(Error::KvmCap(cap)); + } + Ok(()) + } +} + +#[cfg(target_arch = "x86_64")] +mod x86_64 { + use super::*; + use dbs_arch::msr::*; + use kvm_bindings::{kvm_msr_entry, CpuId, MsrList, Msrs}; + use std::collections::HashSet; + + impl KvmContext { + /// Get information about supported CPUID of x86 processor. + pub fn supported_cpuid( + &self, + max_entries_count: usize, + ) -> std::result::Result { + self.kvm.get_supported_cpuid(max_entries_count) + } + + /// Get information about supported MSRs of x86 processor. + pub fn supported_msrs( + &self, + _max_entries_count: usize, + ) -> std::result::Result { + Ok(self.supported_msrs.clone()) + } + + // It's very sensible to manipulate MSRs, so please be careful to change code below. + fn build_msrs_list(kvm: &Kvm) -> Result { + let mut mset: HashSet = HashSet::new(); + let supported_msr_list = kvm.get_msr_index_list().map_err(super::Error::Kvm)?; + for msr in supported_msr_list.as_slice() { + mset.insert(*msr); + } + + let mut msrs = vec![ + MSR_IA32_APICBASE, + MSR_IA32_SYSENTER_CS, + MSR_IA32_SYSENTER_ESP, + MSR_IA32_SYSENTER_EIP, + MSR_IA32_CR_PAT, + ]; + + let filters_list = vec![ + MSR_STAR, + MSR_VM_HSAVE_PA, + MSR_TSC_AUX, + MSR_IA32_TSC_ADJUST, + MSR_IA32_TSCDEADLINE, + MSR_IA32_MISC_ENABLE, + MSR_IA32_BNDCFGS, + MSR_IA32_SPEC_CTRL, + ]; + for msr in filters_list { + if mset.contains(&msr) { + msrs.push(msr); + } + } + + // TODO: several msrs are optional. + + // TODO: Since our guests don't support nested-vmx, LMCE nor SGX for now. 
+ // msrs.push(MSR_IA32_FEATURE_CONTROL); + + msrs.push(MSR_CSTAR); + msrs.push(MSR_KERNEL_GS_BASE); + msrs.push(MSR_SYSCALL_MASK); + msrs.push(MSR_LSTAR); + msrs.push(MSR_IA32_TSC); + + msrs.push(MSR_KVM_SYSTEM_TIME_NEW); + msrs.push(MSR_KVM_WALL_CLOCK_NEW); + + // FIXME: check if it's supported. + msrs.push(MSR_KVM_ASYNC_PF_EN); + msrs.push(MSR_KVM_PV_EOI_EN); + msrs.push(MSR_KVM_STEAL_TIME); + + msrs.push(MSR_CORE_PERF_FIXED_CTR_CTRL); + msrs.push(MSR_CORE_PERF_GLOBAL_CTRL); + msrs.push(MSR_CORE_PERF_GLOBAL_STATUS); + msrs.push(MSR_CORE_PERF_GLOBAL_OVF_CTRL); + + const MAX_FIXED_COUNTERS: u32 = 3; + for i in 0..MAX_FIXED_COUNTERS { + msrs.push(MSR_CORE_PERF_FIXED_CTR0 + i); + } + + // FIXME: skip MCE for now. + + let mtrr_msrs = vec![ + MSR_MTRRdefType, + MSR_MTRRfix64K_00000, + MSR_MTRRfix16K_80000, + MSR_MTRRfix16K_A0000, + MSR_MTRRfix4K_C0000, + MSR_MTRRfix4K_C8000, + MSR_MTRRfix4K_D0000, + MSR_MTRRfix4K_D8000, + MSR_MTRRfix4K_E0000, + MSR_MTRRfix4K_E8000, + MSR_MTRRfix4K_F0000, + MSR_MTRRfix4K_F8000, + ]; + for mtrr in mtrr_msrs { + msrs.push(mtrr); + } + + const MSR_MTRRCAP_VCNT: u32 = 8; + for i in 0..MSR_MTRRCAP_VCNT { + msrs.push(0x200 + 2 * i); + msrs.push(0x200 + 2 * i + 1); + } + + let msrs: Vec = msrs + .iter() + .map(|reg| kvm_msr_entry { + index: *reg, + reserved: 0, + data: 0, + }) + .collect(); + + Msrs::from_entries(&msrs).map_err(super::Error::Msr) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use kvm_ioctls::Kvm; + use std::fs::File; + use std::os::unix::fs::MetadataExt; + use std::os::unix::io::{AsRawFd, FromRawFd}; + + #[test] + fn test_create_kvm_context() { + let c = KvmContext::new(None).unwrap(); + + assert!(c.max_memslots >= 32); + + let kvm = Kvm::new().unwrap(); + let f = unsafe { File::from_raw_fd(kvm.as_raw_fd()) }; + let m1 = f.metadata().unwrap(); + let m2 = File::open("/dev/kvm").unwrap().metadata().unwrap(); + + assert_eq!(m1.dev(), m2.dev()); + assert_eq!(m1.ino(), m2.ino()); + } + + #[cfg(target_arch = "x86_64")] + 
#[test] + fn test_get_supported_cpu_id() { + let c = KvmContext::new(None).unwrap(); + + let _ = c + .supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES) + .expect("failed to get supported CPUID"); + assert!(c.supported_cpuid(0).is_err()); + } + + #[test] + fn test_create_vm() { + let c = KvmContext::new(None).unwrap(); + + let _ = c.create_vm().unwrap(); + } +} diff --git a/src/dragonball/src/lib.rs b/src/dragonball/src/lib.rs new file mode 100644 index 0000000000..7371e8213a --- /dev/null +++ b/src/dragonball/src/lib.rs @@ -0,0 +1,60 @@ +// Copyright (C) 2018-2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Dragonball is a light-weight virtual machine manager(VMM) based on Linux Kernel-based Virtual +//! Machine(KVM) which is optimized for container workloads. + +#![warn(missing_docs)] +//TODO: Remove this, after the rest of dragonball has been committed. +#![allow(dead_code)] + +/// Address space manager for virtual machines. +pub mod address_space_manager; +/// API to handle vmm requests. +pub mod api; +/// Structs to maintain configuration information. +pub mod config_manager; +/// Device manager for virtual machines. +pub mod device_manager; +/// Errors related to Virtual machine manager. +pub mod error; +/// KVM operation context for virtual machines. +pub mod kvm_context; +/// Metrics system. +pub mod metric; +/// Resource manager for virtual machines. +pub mod resource_manager; +/// Signal handler for virtual machines. +pub mod signal_handler; +/// Virtual CPU manager for virtual machines. +pub mod vcpu; +/// Virtual machine manager for virtual machines. +pub mod vm; + +mod event_manager; +mod io_manager; +mod vmm; + +pub use self::error::StartMicroVmError; +pub use self::io_manager::IoManagerCached; +pub use self::vmm::Vmm; + +/// Success exit code. +pub const EXIT_CODE_OK: u8 = 0; +/// Generic error exit code. 
+pub const EXIT_CODE_GENERIC_ERROR: u8 = 1; +/// Generic exit code for an error considered not possible to occur if the program logic is sound. +pub const EXIT_CODE_UNEXPECTED_ERROR: u8 = 2; +/// Dragonball was shut down after intercepting a restricted system call. +pub const EXIT_CODE_BAD_SYSCALL: u8 = 148; +/// Dragonball was shut down after intercepting `SIGBUS`. +pub const EXIT_CODE_SIGBUS: u8 = 149; +/// Dragonball was shut down after intercepting `SIGSEGV`. +pub const EXIT_CODE_SIGSEGV: u8 = 150; +/// Invalid json passed to the Dragonball process for configuring microvm. +pub const EXIT_CODE_INVALID_JSON: u8 = 151; +/// Bad configuration for microvm's resources, when using a single json. +pub const EXIT_CODE_BAD_CONFIGURATION: u8 = 152; +/// Command line arguments parsing error. +pub const EXIT_CODE_ARG_PARSING: u8 = 153; diff --git a/src/dragonball/src/metric.rs b/src/dragonball/src/metric.rs new file mode 100644 index 0000000000..716e9e0440 --- /dev/null +++ b/src/dragonball/src/metric.rs @@ -0,0 +1,58 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use dbs_utils::metric::SharedIncMetric; +use lazy_static::lazy_static; +use serde::Serialize; + +pub use dbs_utils::metric::IncMetric; + +lazy_static! { + /// Static instance used for handling metrics. + pub static ref METRICS: DragonballMetrics = DragonballMetrics::default(); +} + +/// Metrics specific to VCPUs' mode of functioning. +#[derive(Default, Serialize)] +pub struct VcpuMetrics { + /// Number of KVM exits for handling input IO. + pub exit_io_in: SharedIncMetric, + /// Number of KVM exits for handling output IO. + pub exit_io_out: SharedIncMetric, + /// Number of KVM exits for handling MMIO reads. + pub exit_mmio_read: SharedIncMetric, + /// Number of KVM exits for handling MMIO writes. + pub exit_mmio_write: SharedIncMetric, + /// Number of errors during this VCPU's run. 
+ pub failures: SharedIncMetric, + /// Failures in configuring the CPUID. + pub filter_cpuid: SharedIncMetric, +} + +/// Metrics for the seccomp filtering. +#[derive(Default, Serialize)] +pub struct SeccompMetrics { + /// Number of errors inside the seccomp filtering. + pub num_faults: SharedIncMetric, +} + +/// Metrics related to signals. +#[derive(Default, Serialize)] +pub struct SignalMetrics { + /// Number of times that SIGBUS was handled. + pub sigbus: SharedIncMetric, + /// Number of times that SIGSEGV was handled. + pub sigsegv: SharedIncMetric, +} + +/// Structure storing all metrics while enforcing serialization support on them. +#[derive(Default, Serialize)] +pub struct DragonballMetrics { + /// Metrics related to a vcpu's functioning. + pub vcpu: VcpuMetrics, + /// Metrics related to seccomp filtering. + pub seccomp: SeccompMetrics, + /// Metrics related to signals. + pub signals: SignalMetrics, +} diff --git a/src/dragonball/src/resource_manager.rs b/src/dragonball/src/resource_manager.rs new file mode 100644 index 0000000000..2cb32c0546 --- /dev/null +++ b/src/dragonball/src/resource_manager.rs @@ -0,0 +1,785 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +use std::sync::Mutex; + +use dbs_allocator::{Constraint, IntervalTree, Range}; +use dbs_boot::layout::{ + GUEST_MEM_END, GUEST_MEM_START, GUEST_PHYS_END, IRQ_BASE as LEGACY_IRQ_BASE, + IRQ_MAX as LEGACY_IRQ_MAX, MMIO_LOW_END, MMIO_LOW_START, +}; +use dbs_device::resources::{DeviceResources, MsiIrqType, Resource, ResourceConstraint}; + +// We reserve the LEGACY_IRQ_BASE(5) for shared IRQ. +const SHARED_IRQ: u32 = LEGACY_IRQ_BASE; +// Since ioapic2 have 24 pins for legacy devices, so irq number 0-23 are used. We will set MSI_IRQ_BASE at 24. +#[cfg(target_arch = "x86_64")] +const MSI_IRQ_BASE: u32 = 24; +#[cfg(target_arch = "aarch64")] +/// We define MSI_IRQ_BASE as LEGACY_IRQ_MAX for aarch64 in order not to conflict with legacy irq numbers. 
+const MSI_IRQ_BASE: u32 = LEGACY_IRQ_MAX + 1; + +// kvm max irq is defined in arch/x86/include/asm/kvm_host.h +const MSI_IRQ_MAX: u32 = 1023; +// x86's kvm user mem slots is defined in arch/x86/include/asm/kvm_host.h +#[cfg(target_arch = "x86_64")] +const KVM_USER_MEM_SLOTS: u32 = 509; +// aarch64's kvm user mem slots is defined in arch/arm64/include/asm/kvm_host.h +#[cfg(target_arch = "aarch64")] +const KVM_USER_MEM_SLOTS: u32 = 512; +const PIO_MIN: u16 = 0x0; +const PIO_MAX: u16 = 0xFFFF; +// Reserve the 64MB MMIO address range just below 4G, x86 systems have special +// devices, such as LAPIC, IOAPIC, HPET etc, in this range. And we don't explicitly +// allocate MMIO address for those devices. +const MMIO_SPACE_RESERVED: u64 = 0x400_0000; + +/// Errors associated with resource management operations +#[derive(Debug, PartialEq, thiserror::Error)] +pub enum ResourceError { + /// Unknown/unsupported resource type. + #[error("unsupported resource type")] + UnknownResourceType, + + /// Invalid resource range. + #[error("invalid resource range for resource type : {0}")] + InvalidResourceRange(String), + + /// No resource available. + #[error("no resource available")] + NoAvailResource, +} + +#[derive(Default)] +struct ResourceManagerBuilder { + // IntervalTree for allocating legacy irq number. + legacy_irq_pool: IntervalTree<()>, + // IntervalTree for allocating message signal interrupt (MSI) irq number. + msi_irq_pool: IntervalTree<()>, + // IntervalTree for allocating port-mapped io (PIO) address. + pio_pool: IntervalTree<()>, + // IntervalTree for allocating memory-mapped io (MMIO) address. + mmio_pool: IntervalTree<()>, + // IntervalTree for allocating guest memory. + mem_pool: IntervalTree<()>, + // IntervalTree for allocating kvm memory slot. + kvm_mem_slot_pool: IntervalTree<()>, +} + +impl ResourceManagerBuilder { + /// init legacy_irq_pool with arch specific constants. 
+ fn init_legacy_irq_pool(mut self) -> Self { + // The LEGACY_IRQ_BASE irq is reserved for shared IRQ and won't be allocated / reallocated, + // so we don't insert it into the legacy_irq interval tree. + self.legacy_irq_pool + .insert(Range::new(LEGACY_IRQ_BASE + 1, LEGACY_IRQ_MAX), None); + self + } + + /// init msi_irq_pool with arch specific constants. + fn init_msi_irq_pool(mut self) -> Self { + self.msi_irq_pool + .insert(Range::new(MSI_IRQ_BASE, MSI_IRQ_MAX), None); + self + } + + /// init pio_pool with arch specific constants. + fn init_pio_pool(mut self) -> Self { + self.pio_pool.insert(Range::new(PIO_MIN, PIO_MAX), None); + self + } + + /// Create mmio_pool with arch specific constants. + /// allow(clippy) is because `GUEST_MEM_START > MMIO_LOW_END`, we may modify GUEST_MEM_START or + /// MMIO_LOW_END in the future. + #[allow(clippy::absurd_extreme_comparisons)] + fn init_mmio_pool_helper(mmio: &mut IntervalTree<()>) { + mmio.insert(Range::new(MMIO_LOW_START, MMIO_LOW_END), None); + if !(*GUEST_MEM_END < MMIO_LOW_START + || GUEST_MEM_START > MMIO_LOW_END + || MMIO_LOW_START == MMIO_LOW_END) + { + #[cfg(target_arch = "x86_64")] + { + let constraint = Constraint::new(MMIO_SPACE_RESERVED) + .min(MMIO_LOW_END - MMIO_SPACE_RESERVED) + .max(0xffff_ffffu64); + let key = mmio.allocate(&constraint); + if let Some(k) = key.as_ref() { + mmio.update(k, ()); + } else { + panic!("failed to reserve MMIO address range for x86 system devices"); + } + } + } + + if *GUEST_MEM_END < *GUEST_PHYS_END { + mmio.insert(Range::new(*GUEST_MEM_END + 1, *GUEST_PHYS_END), None); + } + } + + /// init mmio_pool with helper function + fn init_mmio_pool(mut self) -> Self { + Self::init_mmio_pool_helper(&mut self.mmio_pool); + self + } + + /// Create mem_pool with arch specific constants. + /// deny(clippy) is because `GUEST_MEM_START > MMIO_LOW_END`, we may modify GUEST_MEM_START or + /// MMIO_LOW_END in the future. 
+ #[allow(clippy::absurd_extreme_comparisons)] + pub(crate) fn init_mem_pool_helper(mem: &mut IntervalTree<()>) { + if *GUEST_MEM_END < MMIO_LOW_START + || GUEST_MEM_START > MMIO_LOW_END + || MMIO_LOW_START == MMIO_LOW_END + { + mem.insert(Range::new(GUEST_MEM_START, *GUEST_MEM_END), None); + } else { + if MMIO_LOW_START > GUEST_MEM_START { + mem.insert(Range::new(GUEST_MEM_START, MMIO_LOW_START - 1), None); + } + if MMIO_LOW_END < *GUEST_MEM_END { + mem.insert(Range::new(MMIO_LOW_END + 1, *GUEST_MEM_END), None); + } + } + } + + /// init mem_pool with helper function + fn init_mem_pool(mut self) -> Self { + Self::init_mem_pool_helper(&mut self.mem_pool); + self + } + + /// init kvm_mem_slot_pool with arch specific constants. + fn init_kvm_mem_slot_pool(mut self, max_kvm_mem_slot: Option) -> Self { + let max_slots = max_kvm_mem_slot.unwrap_or(KVM_USER_MEM_SLOTS as usize); + self.kvm_mem_slot_pool + .insert(Range::new(0, max_slots as u64), None); + self + } + + fn build(self) -> ResourceManager { + ResourceManager { + legacy_irq_pool: Mutex::new(self.legacy_irq_pool), + msi_irq_pool: Mutex::new(self.msi_irq_pool), + pio_pool: Mutex::new(self.pio_pool), + mmio_pool: Mutex::new(self.mmio_pool), + mem_pool: Mutex::new(self.mem_pool), + kvm_mem_slot_pool: Mutex::new(self.kvm_mem_slot_pool), + } + } +} + +/// Resource manager manages all resources for a virtual machine instance. +pub struct ResourceManager { + legacy_irq_pool: Mutex>, + msi_irq_pool: Mutex>, + pio_pool: Mutex>, + mmio_pool: Mutex>, + mem_pool: Mutex>, + kvm_mem_slot_pool: Mutex>, +} + +impl Default for ResourceManager { + fn default() -> Self { + ResourceManagerBuilder::default().build() + } +} + +impl ResourceManager { + /// Create a resource manager instance. 
+ pub fn new(max_kvm_mem_slot: Option) -> Self { + let res_manager_builder = ResourceManagerBuilder::default(); + res_manager_builder + .init_legacy_irq_pool() + .init_msi_irq_pool() + .init_pio_pool() + .init_mmio_pool() + .init_mem_pool() + .init_kvm_mem_slot_pool(max_kvm_mem_slot) + .build() + } + + /// Init mem_pool with arch specific constants. + pub fn init_mem_pool(&self) { + let mut mem = self.mem_pool.lock().unwrap(); + ResourceManagerBuilder::init_mem_pool_helper(&mut mem); + } + + /// Check if mem_pool is empty. + pub fn is_mem_pool_empty(&self) -> bool { + self.mem_pool.lock().unwrap().is_empty() + } + + /// Allocate one legacy irq number. + /// + /// Allocate the specified irq number if `fixed` contains an irq number. + pub fn allocate_legacy_irq(&self, shared: bool, fixed: Option) -> Option { + // if shared_irq is used, just return the shared irq num. + if shared { + return Some(SHARED_IRQ); + } + + let mut constraint = Constraint::new(1u32); + if let Some(v) = fixed { + if v == SHARED_IRQ { + return None; + } + + constraint.min = v as u64; + constraint.max = v as u64; + } + // Safe to unwrap() because we don't expect poisoned lock here. + let mut legacy_irq_pool = self.legacy_irq_pool.lock().unwrap(); + let key = legacy_irq_pool.allocate(&constraint); + if let Some(k) = key.as_ref() { + legacy_irq_pool.update(k, ()); + } + key.map(|v| v.min as u32) + } + + /// Free a legacy irq number. + /// + /// Panic if the irq number is invalid. + pub fn free_legacy_irq(&self, irq: u32) -> Result<(), ResourceError> { + // if the irq number is shared_irq, we don't need to do anything. + if irq == SHARED_IRQ { + return Ok(()); + } + + if !(LEGACY_IRQ_BASE..=LEGACY_IRQ_MAX).contains(&irq) { + return Err(ResourceError::InvalidResourceRange( + "Legacy IRQ".to_string(), + )); + } + let key = Range::new(irq, irq); + // Safe to unwrap() because we don't expect poisoned lock here. 
+ self.legacy_irq_pool.lock().unwrap().free(&key); + Ok(()) + } + + /// Allocate a group of MSI irq numbers. + /// + /// The allocated MSI irq numbers may or may not be naturally aligned. + pub fn allocate_msi_irq(&self, count: u32) -> Option { + let constraint = Constraint::new(count); + // Safe to unwrap() because we don't expect poisoned lock here. + let mut msi_irq_pool = self.msi_irq_pool.lock().unwrap(); + let key = msi_irq_pool.allocate(&constraint); + if let Some(k) = key.as_ref() { + msi_irq_pool.update(k, ()); + } + key.map(|v| v.min as u32) + } + + /// Allocate a group of MSI irq numbers, naturally aligned to `count`. + /// + /// This may be used to support PCI MSI, which requires the allocated irq number is naturally + /// aligned. + pub fn allocate_msi_irq_aligned(&self, count: u32) -> Option { + let constraint = Constraint::new(count).align(count); + // Safe to unwrap() because we don't expect poisoned lock here. + let mut msi_irq_pool = self.msi_irq_pool.lock().unwrap(); + let key = msi_irq_pool.allocate(&constraint); + if let Some(k) = key.as_ref() { + msi_irq_pool.update(k, ()); + } + key.map(|v| v.min as u32) + } + + /// Free a group of MSI irq numbers. + /// + /// Panic if `irq` or `count` is invalid. + pub fn free_msi_irq(&self, irq: u32, count: u32) -> Result<(), ResourceError> { + if irq < MSI_IRQ_BASE + || count == 0 + || irq.checked_add(count).is_none() + || irq + count - 1 > MSI_IRQ_MAX + { + return Err(ResourceError::InvalidResourceRange("MSI IRQ".to_string())); + } + let key = Range::new(irq, irq + count - 1); + // Safe to unwrap() because we don't expect poisoned lock here. + self.msi_irq_pool.lock().unwrap().free(&key); + Ok(()) + } + + /// Allocate a group of PIO address and returns the allocated PIO base address. 
+    pub fn allocate_pio_address_simple(&self, size: u16) -> Option<u16> {
+        let constraint = Constraint::new(size);
+        self.allocate_pio_address(&constraint)
+    }
+
+    /// Allocate a group of PIO address and returns the allocated PIO base address.
+    pub fn allocate_pio_address(&self, constraint: &Constraint) -> Option<u16> {
+        // Safe to unwrap() because we don't expect poisoned lock here.
+        let mut pio_pool = self.pio_pool.lock().unwrap();
+        let key = pio_pool.allocate(constraint);
+        if let Some(k) = key.as_ref() {
+            pio_pool.update(k, ());
+        }
+        key.map(|v| v.min as u16)
+    }
+
+    /// Free PIO address range `[base, base + size - 1]`.
+    ///
+    /// Panic if `base` or `size` is invalid.
+    pub fn free_pio_address(&self, base: u16, size: u16) -> Result<(), ResourceError> {
+        if base.checked_add(size).is_none() {
+            return Err(ResourceError::InvalidResourceRange(
+                "PIO Address".to_string(),
+            ));
+        }
+        let key = Range::new(base, base + size - 1);
+        // Safe to unwrap() because we don't expect poisoned lock here.
+        self.pio_pool.lock().unwrap().free(&key);
+        Ok(())
+    }
+
+    /// Allocate a MMIO address range aligned to `align` and returns the allocated base address.
+    pub fn allocate_mmio_address_aligned(&self, size: u64, align: u64) -> Option<u64> {
+        let constraint = Constraint::new(size).align(align);
+        self.allocate_mmio_address(&constraint)
+    }
+
+    /// Allocate a MMIO address range and returns the allocated base address.
+    pub fn allocate_mmio_address(&self, constraint: &Constraint) -> Option<u64> {
+        // Safe to unwrap() because we don't expect poisoned lock here.
+ let mut mmio_pool = self.mmio_pool.lock().unwrap(); + let key = mmio_pool.allocate(constraint); + key.map(|v| v.min) + } + + /// Free MMIO address range `[base, base + size - 1]` + pub fn free_mmio_address(&self, base: u64, size: u64) -> Result<(), ResourceError> { + if base.checked_add(size).is_none() { + return Err(ResourceError::InvalidResourceRange( + "MMIO Address".to_string(), + )); + } + let key = Range::new(base, base + size - 1); + // Safe to unwrap() because we don't expect poisoned lock here. + self.mmio_pool.lock().unwrap().free(&key); + Ok(()) + } + + /// Allocate guest memory address range and returns the allocated base memory address. + pub fn allocate_mem_address(&self, constraint: &Constraint) -> Option { + // Safe to unwrap() because we don't expect poisoned lock here. + let mut mem_pool = self.mem_pool.lock().unwrap(); + let key = mem_pool.allocate(constraint); + + key.map(|v| v.min) + } + + /// Free the guest memory address range `[base, base + size - 1]`. + /// + /// Panic if the guest memory address range is invalid. + /// allow(clippy) is because `base < GUEST_MEM_START`, we may modify GUEST_MEM_START in the future. + #[allow(clippy::absurd_extreme_comparisons)] + pub fn free_mem_address(&self, base: u64, size: u64) -> Result<(), ResourceError> { + if base.checked_add(size).is_none() + || base < GUEST_MEM_START + || base + size > *GUEST_MEM_END + { + return Err(ResourceError::InvalidResourceRange( + "MEM Address".to_string(), + )); + } + let key = Range::new(base, base + size - 1); + // Safe to unwrap() because we don't expect poisoned lock here. + self.mem_pool.lock().unwrap().free(&key); + Ok(()) + } + + /// Allocate a kvm memory slot number. + /// + /// Allocate the specified slot if `fixed` contains a slot number. 
+ pub fn allocate_kvm_mem_slot(&self, size: u32, fixed: Option) -> Option { + let mut constraint = Constraint::new(size); + if let Some(v) = fixed { + constraint.min = v as u64; + constraint.max = v as u64; + } + // Safe to unwrap() because we don't expect poisoned lock here. + let mut kvm_mem_slot_pool = self.kvm_mem_slot_pool.lock().unwrap(); + let key = kvm_mem_slot_pool.allocate(&constraint); + if let Some(k) = key.as_ref() { + kvm_mem_slot_pool.update(k, ()); + } + key.map(|v| v.min as u32) + } + + /// Free a kvm memory slot number. + pub fn free_kvm_mem_slot(&self, slot: u32) -> Result<(), ResourceError> { + let key = Range::new(slot, slot); + // Safe to unwrap() because we don't expect poisoned lock here. + self.kvm_mem_slot_pool.lock().unwrap().free(&key); + Ok(()) + } + + /// Allocate requested resources for a device. + pub fn allocate_device_resources( + &self, + requests: &[ResourceConstraint], + shared_irq: bool, + ) -> std::result::Result { + let mut resources = DeviceResources::new(); + for resource in requests.iter() { + let res = match resource { + ResourceConstraint::PioAddress { range, align, size } => { + let mut constraint = Constraint::new(*size).align(*align); + if let Some(r) = range { + constraint.min = r.0 as u64; + constraint.max = r.1 as u64; + } + match self.allocate_pio_address(&constraint) { + Some(base) => Resource::PioAddressRange { + base: base as u16, + size: *size, + }, + None => { + if let Err(e) = self.free_device_resources(&resources) { + return Err(e); + } else { + return Err(ResourceError::NoAvailResource); + } + } + } + } + ResourceConstraint::MmioAddress { range, align, size } => { + let mut constraint = Constraint::new(*size).align(*align); + if let Some(r) = range { + constraint.min = r.0; + constraint.max = r.1; + } + match self.allocate_mmio_address(&constraint) { + Some(base) => Resource::MmioAddressRange { base, size: *size }, + None => { + if let Err(e) = self.free_device_resources(&resources) { + return Err(e); + } 
else { + return Err(ResourceError::NoAvailResource); + } + } + } + } + ResourceConstraint::MemAddress { range, align, size } => { + let mut constraint = Constraint::new(*size).align(*align); + if let Some(r) = range { + constraint.min = r.0; + constraint.max = r.1; + } + match self.allocate_mem_address(&constraint) { + Some(base) => Resource::MemAddressRange { base, size: *size }, + None => { + if let Err(e) = self.free_device_resources(&resources) { + return Err(e); + } else { + return Err(ResourceError::NoAvailResource); + } + } + } + } + ResourceConstraint::LegacyIrq { irq } => { + match self.allocate_legacy_irq(shared_irq, *irq) { + Some(v) => Resource::LegacyIrq(v), + None => { + if let Err(e) = self.free_device_resources(&resources) { + return Err(e); + } else { + return Err(ResourceError::NoAvailResource); + } + } + } + } + ResourceConstraint::PciMsiIrq { size } => { + match self.allocate_msi_irq_aligned(*size) { + Some(base) => Resource::MsiIrq { + ty: MsiIrqType::PciMsi, + base, + size: *size, + }, + None => { + if let Err(e) = self.free_device_resources(&resources) { + return Err(e); + } else { + return Err(ResourceError::NoAvailResource); + } + } + } + } + ResourceConstraint::PciMsixIrq { size } => match self.allocate_msi_irq(*size) { + Some(base) => Resource::MsiIrq { + ty: MsiIrqType::PciMsix, + base, + size: *size, + }, + None => { + if let Err(e) = self.free_device_resources(&resources) { + return Err(e); + } else { + return Err(ResourceError::NoAvailResource); + } + } + }, + ResourceConstraint::GenericIrq { size } => match self.allocate_msi_irq(*size) { + Some(base) => Resource::MsiIrq { + ty: MsiIrqType::GenericMsi, + base, + size: *size, + }, + None => { + if let Err(e) = self.free_device_resources(&resources) { + return Err(e); + } else { + return Err(ResourceError::NoAvailResource); + } + } + }, + ResourceConstraint::KvmMemSlot { slot, size } => { + match self.allocate_kvm_mem_slot(*size, *slot) { + Some(v) => Resource::KvmMemSlot(v), + None => 
{ + if let Err(e) = self.free_device_resources(&resources) { + return Err(e); + } else { + return Err(ResourceError::NoAvailResource); + } + } + } + } + }; + resources.append(res); + } + + Ok(resources) + } + + /// Free resources allocated for a device. + pub fn free_device_resources(&self, resources: &DeviceResources) -> Result<(), ResourceError> { + for res in resources.iter() { + let result = match res { + Resource::PioAddressRange { base, size } => self.free_pio_address(*base, *size), + Resource::MmioAddressRange { base, size } => self.free_mmio_address(*base, *size), + Resource::MemAddressRange { base, size } => self.free_mem_address(*base, *size), + Resource::LegacyIrq(base) => self.free_legacy_irq(*base), + Resource::MsiIrq { ty: _, base, size } => self.free_msi_irq(*base, *size), + Resource::KvmMemSlot(slot) => self.free_kvm_mem_slot(*slot), + Resource::MacAddresss(_) => Ok(()), + }; + if result.is_err() { + return result; + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_allocate_legacy_irq() { + let mgr = ResourceManager::new(None); + + // Allocate/free shared IRQs multiple times. + assert_eq!(mgr.allocate_legacy_irq(true, None).unwrap(), SHARED_IRQ); + assert_eq!(mgr.allocate_legacy_irq(true, None).unwrap(), SHARED_IRQ); + mgr.free_legacy_irq(SHARED_IRQ); + mgr.free_legacy_irq(SHARED_IRQ); + mgr.free_legacy_irq(SHARED_IRQ); + + // Allocate specified IRQs. 
+ assert_eq!( + mgr.allocate_legacy_irq(false, Some(LEGACY_IRQ_BASE + 10)) + .unwrap(), + LEGACY_IRQ_BASE + 10 + ); + mgr.free_legacy_irq(LEGACY_IRQ_BASE + 10); + assert_eq!( + mgr.allocate_legacy_irq(false, Some(LEGACY_IRQ_BASE + 10)) + .unwrap(), + LEGACY_IRQ_BASE + 10 + ); + assert!(mgr + .allocate_legacy_irq(false, Some(LEGACY_IRQ_BASE + 10)) + .is_none()); + + assert!(mgr.allocate_legacy_irq(false, None).is_some()); + + assert!(mgr + .allocate_legacy_irq(false, Some(LEGACY_IRQ_BASE - 1)) + .is_none()); + assert!(mgr + .allocate_legacy_irq(false, Some(LEGACY_IRQ_MAX + 1)) + .is_none()); + assert!(mgr.allocate_legacy_irq(false, Some(SHARED_IRQ)).is_none()); + } + + #[test] + fn test_invalid_free_legacy_irq() { + let mgr = ResourceManager::new(None); + assert_eq!( + mgr.free_legacy_irq(LEGACY_IRQ_MAX + 1), + Err(ResourceError::InvalidResourceRange( + "Legacy IRQ".to_string(), + )) + ); + } + + #[test] + fn test_allocate_msi_irq() { + let mgr = ResourceManager::new(None); + + let msi = mgr.allocate_msi_irq(3).unwrap(); + mgr.free_msi_irq(msi, 3); + let msi = mgr.allocate_msi_irq(3).unwrap(); + mgr.free_msi_irq(msi, 3); + + let irq = mgr.allocate_msi_irq_aligned(8).unwrap(); + assert_eq!(irq & 0x7, 0); + mgr.free_msi_irq(msi, 8); + let irq = mgr.allocate_msi_irq_aligned(8).unwrap(); + assert_eq!(irq & 0x7, 0); + + let irq = mgr.allocate_msi_irq_aligned(512).unwrap(); + assert_eq!(irq, 512); + mgr.free_msi_irq(irq, 512); + let irq = mgr.allocate_msi_irq_aligned(512).unwrap(); + assert_eq!(irq, 512); + + assert!(mgr.allocate_msi_irq(4099).is_none()); + } + + #[test] + fn test_invalid_free_msi_irq() { + let mgr = ResourceManager::new(None); + assert_eq!( + mgr.free_msi_irq(MSI_IRQ_MAX, 3), + Err(ResourceError::InvalidResourceRange("MSI IRQ".to_string())) + ); + } + + #[test] + fn test_allocate_pio_addr() { + let mgr = ResourceManager::new(None); + assert!(mgr.allocate_pio_address_simple(10).is_some()); + let mut requests = vec![ + ResourceConstraint::PioAddress { + 
range: None, + align: 0x1000, + size: 0x2000, + }, + ResourceConstraint::PioAddress { + range: Some((0x8000, 0x9000)), + align: 0x1000, + size: 0x1000, + }, + ResourceConstraint::PioAddress { + range: Some((0x9000, 0xa000)), + align: 0x1000, + size: 0x1000, + }, + ResourceConstraint::PioAddress { + range: Some((0xb000, 0xc000)), + align: 0x1000, + size: 0x1000, + }, + ]; + let resources = mgr.allocate_device_resources(&requests, false).unwrap(); + mgr.free_device_resources(&resources); + let resources = mgr.allocate_device_resources(&requests, false).unwrap(); + mgr.free_device_resources(&resources); + requests.push(ResourceConstraint::PioAddress { + range: Some((0xc000, 0xc000)), + align: 0x1000, + size: 0x1000, + }); + assert!(mgr.allocate_device_resources(&requests, false).is_err()); + let resources = mgr + .allocate_device_resources(&requests[0..requests.len() - 1], false) + .unwrap(); + mgr.free_device_resources(&resources); + } + + #[test] + fn test_invalid_free_pio_addr() { + let mgr = ResourceManager::new(None); + assert_eq!( + mgr.free_pio_address(u16::MAX, 3), + Err(ResourceError::InvalidResourceRange( + "PIO Address".to_string(), + )) + ); + } + + #[test] + fn test_allocate_kvm_mem_slot() { + let mgr = ResourceManager::new(None); + assert_eq!(mgr.allocate_kvm_mem_slot(1, None).unwrap(), 0); + assert_eq!(mgr.allocate_kvm_mem_slot(1, Some(200)).unwrap(), 200); + mgr.free_kvm_mem_slot(200); + assert_eq!(mgr.allocate_kvm_mem_slot(1, Some(200)).unwrap(), 200); + assert_eq!( + mgr.allocate_kvm_mem_slot(1, Some(KVM_USER_MEM_SLOTS)) + .unwrap(), + KVM_USER_MEM_SLOTS + ); + assert!(mgr + .allocate_kvm_mem_slot(1, Some(KVM_USER_MEM_SLOTS + 1)) + .is_none()); + } + + #[test] + fn test_allocate_mmio_address() { + let mgr = ResourceManager::new(None); + + #[cfg(target_arch = "x86_64")] + { + // Can't allocate from reserved region + let constraint = Constraint::new(0x100_0000u64) + .min(0x1_0000_0000u64 - 0x200_0000u64) + .max(0xffff_ffffu64); + 
assert!(mgr.allocate_mmio_address(&constraint).is_none()); + } + let constraint = Constraint::new(0x100_0000u64).min(0x1_0000_0000u64 - 0x200_0000u64); + assert!(mgr.allocate_mmio_address(&constraint).is_some()); + + #[cfg(target_arch = "x86_64")] + { + // Can't allocate from reserved region + let constraint = Constraint::new(0x100_0000u64) + .min(0x1_0000_0000u64 - 0x200_0000u64) + .max(0xffff_ffffu64); + assert!(mgr.allocate_mem_address(&constraint).is_none()); + } + #[cfg(target_arch = "aarch64")] + { + let constraint = Constraint::new(0x200_0000u64) + .min(0x1_0000_0000u64 - 0x200_0000u64) + .max(0xffff_fffeu64); + assert!(mgr.allocate_mem_address(&constraint).is_none()); + } + let constraint = Constraint::new(0x100_0000u64).min(0x1_0000_0000u64 - 0x200_0000u64); + assert!(mgr.allocate_mem_address(&constraint).is_some()); + } + + #[test] + #[should_panic] + fn test_allocate_duplicate_memory() { + let mgr = ResourceManager::new(None); + + let constraint_1 = Constraint::new(0x100_0000u64) + .min(0x1_0000_0000u64) + .max(0x1_0000_0000u64 + 0x100_0000u64); + let constraint_2 = Constraint::new(0x100_0000u64) + .min(0x1_0000_0000u64) + .max(0x1_0000_0000u64 + 0x100_0000u64); + + assert!(mgr.allocate_mem_address(&constraint_1).is_some()); + assert!(mgr.allocate_mem_address(&constraint_2).is_some()); + } +} diff --git a/src/dragonball/src/signal_handler.rs b/src/dragonball/src/signal_handler.rs new file mode 100644 index 0000000000..23e9ff3976 --- /dev/null +++ b/src/dragonball/src/signal_handler.rs @@ -0,0 +1,219 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use libc::{_exit, c_int, c_void, siginfo_t, SIGBUS, SIGSEGV, SIGSYS}; +use log::error; +use vmm_sys_util::signal::register_signal_handler; + +use crate::metric::{IncMetric, METRICS}; + +// The offset of `si_syscall` (offending syscall identifier) within the siginfo structure +// expressed as an `(u)int*`. 
+// Offset `6` for an `i32` field means that the needed information is located at `6 * sizeof(i32)`. +// See /usr/include/linux/signal.h for the C struct definition. +// See https://github.com/rust-lang/libc/issues/716 for why the offset is different in Rust. +const SI_OFF_SYSCALL: isize = 6; + +const SYS_SECCOMP_CODE: i32 = 1; + +extern "C" { + fn __libc_current_sigrtmin() -> c_int; + fn __libc_current_sigrtmax() -> c_int; +} + +/// Gets current sigrtmin +pub fn sigrtmin() -> c_int { + unsafe { __libc_current_sigrtmin() } +} + +/// Gets current sigrtmax +pub fn sigrtmax() -> c_int { + unsafe { __libc_current_sigrtmax() } +} + +/// Signal handler for `SIGSYS`. +/// +/// Increments the `seccomp.num_faults` metric, logs an error message and terminates the process +/// with a specific exit code. +extern "C" fn sigsys_handler(num: c_int, info: *mut siginfo_t, _unused: *mut c_void) { + // Safe because we're just reading some fields from a supposedly valid argument. + let si_signo = unsafe { (*info).si_signo }; + let si_code = unsafe { (*info).si_code }; + + // Sanity check. The condition should never be true. + if num != si_signo || num != SIGSYS || si_code != SYS_SECCOMP_CODE as i32 { + // Safe because we're terminating the process anyway. + unsafe { _exit(i32::from(super::EXIT_CODE_UNEXPECTED_ERROR)) }; + } + + // Other signals which might do async unsafe things incompatible with the rest of this + // function are blocked due to the sa_mask used when registering the signal handler. + let syscall = unsafe { *(info as *const i32).offset(SI_OFF_SYSCALL) as usize }; + // SIGSYS is triggered when bad syscalls are detected. num_faults is only added when SIGSYS is detected + // so it actually only collects the count for bad syscalls. + METRICS.seccomp.num_faults.inc(); + error!( + "Shutting down VM after intercepting a bad syscall ({}).", + syscall + ); + + // Safe because we're terminating the process anyway. We don't actually do anything when + // running unit tests. 
+ #[cfg(not(test))] + unsafe { + _exit(i32::from(super::EXIT_CODE_BAD_SYSCALL)) + }; +} + +/// Signal handler for `SIGBUS` and `SIGSEGV`. +/// +/// Logs an error message and terminates the process with a specific exit code. +extern "C" fn sigbus_sigsegv_handler(num: c_int, info: *mut siginfo_t, _unused: *mut c_void) { + // Safe because we're just reading some fields from a supposedly valid argument. + let si_signo = unsafe { (*info).si_signo }; + let si_code = unsafe { (*info).si_code }; + + // Sanity check. The condition should never be true. + if num != si_signo || (num != SIGBUS && num != SIGSEGV) { + // Safe because we're terminating the process anyway. + unsafe { _exit(i32::from(super::EXIT_CODE_UNEXPECTED_ERROR)) }; + } + + // Other signals which might do async unsafe things incompatible with the rest of this + // function are blocked due to the sa_mask used when registering the signal handler. + match si_signo { + SIGBUS => METRICS.signals.sigbus.inc(), + SIGSEGV => METRICS.signals.sigsegv.inc(), + _ => (), + } + + error!( + "Shutting down VM after intercepting signal {}, code {}.", + si_signo, si_code + ); + + // Safe because we're terminating the process anyway. We don't actually do anything when + // running unit tests. + #[cfg(not(test))] + unsafe { + _exit(i32::from(match si_signo { + SIGBUS => super::EXIT_CODE_SIGBUS, + SIGSEGV => super::EXIT_CODE_SIGSEGV, + _ => super::EXIT_CODE_UNEXPECTED_ERROR, + })) + }; +} + +/// Registers all the required signal handlers. +/// +/// Custom handlers are installed for: `SIGBUS`, `SIGSEGV`, `SIGSYS`. +pub fn register_signal_handlers() -> vmm_sys_util::errno::Result<()> { + // Call to unsafe register_signal_handler which is considered unsafe because it will + // register a signal handler which will be called in the current thread and will interrupt + // whatever work is done on the current thread, so we have to keep in mind that the registered + // signal handler must only do async-signal-safe operations. 
+ register_signal_handler(SIGSYS, sigsys_handler)?; + register_signal_handler(SIGBUS, sigbus_sigsegv_handler)?; + register_signal_handler(SIGSEGV, sigbus_sigsegv_handler)?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + use libc::{cpu_set_t, syscall}; + use std::convert::TryInto; + use std::{mem, process, thread}; + + use seccompiler::{apply_filter, BpfProgram, SeccompAction, SeccompFilter}; + + // This function is used when running unit tests, so all the unsafes are safe. + fn cpu_count() -> usize { + let mut cpuset: cpu_set_t = unsafe { mem::zeroed() }; + unsafe { + libc::CPU_ZERO(&mut cpuset); + } + let ret = unsafe { + libc::sched_getaffinity( + 0, + mem::size_of::(), + &mut cpuset as *mut cpu_set_t, + ) + }; + assert_eq!(ret, 0); + + let mut num = 0; + for i in 0..libc::CPU_SETSIZE as usize { + if unsafe { libc::CPU_ISSET(i, &cpuset) } { + num += 1; + } + } + num + } + + #[test] + fn test_signal_handler() { + let child = thread::spawn(move || { + assert!(register_signal_handlers().is_ok()); + + let filter = SeccompFilter::new( + vec![ + (libc::SYS_brk, vec![]), + (libc::SYS_exit, vec![]), + (libc::SYS_futex, vec![]), + (libc::SYS_getpid, vec![]), + (libc::SYS_munmap, vec![]), + (libc::SYS_kill, vec![]), + (libc::SYS_rt_sigprocmask, vec![]), + (libc::SYS_rt_sigreturn, vec![]), + (libc::SYS_sched_getaffinity, vec![]), + (libc::SYS_set_tid_address, vec![]), + (libc::SYS_sigaltstack, vec![]), + (libc::SYS_write, vec![]), + ] + .into_iter() + .collect(), + SeccompAction::Trap, + SeccompAction::Allow, + std::env::consts::ARCH.try_into().unwrap(), + ) + .unwrap(); + + assert!(apply_filter(&TryInto::::try_into(filter).unwrap()).is_ok()); + assert_eq!(METRICS.seccomp.num_faults.count(), 0); + + // Call the blacklisted `SYS_mkdirat`. + unsafe { syscall(libc::SYS_mkdirat, "/foo/bar\0") }; + + // Call SIGBUS signal handler. 
+ assert_eq!(METRICS.signals.sigbus.count(), 0); + unsafe { + syscall(libc::SYS_kill, process::id(), SIGBUS); + } + + // Call SIGSEGV signal handler. + assert_eq!(METRICS.signals.sigsegv.count(), 0); + unsafe { + syscall(libc::SYS_kill, process::id(), SIGSEGV); + } + }); + assert!(child.join().is_ok()); + + // Sanity check. + assert!(cpu_count() > 0); + // Kcov somehow messes with our handler getting the SIGSYS signal when a bad syscall + // is caught, so the following assertion no longer holds. Ideally, we'd have a surefire + // way of either preventing this behaviour, or detecting for certain whether this test is + // run by kcov or not. The best we could do so far is to look at the perceived number of + // available CPUs. Kcov seems to make a single CPU available to the process running the + // tests, so we use this as an heuristic to decide if we check the assertion. + if cpu_count() > 1 { + // The signal handler should let the program continue during unit tests. + assert!(METRICS.seccomp.num_faults.count() >= 1); + } + assert!(METRICS.signals.sigbus.count() >= 1); + assert!(METRICS.signals.sigsegv.count() >= 1); + } +} diff --git a/src/dragonball/src/vcpu/aarch64.rs b/src/dragonball/src/vcpu/aarch64.rs new file mode 100644 index 0000000000..054a1f65d4 --- /dev/null +++ b/src/dragonball/src/vcpu/aarch64.rs @@ -0,0 +1,123 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. 
+ +use std::ops::Deref; +use std::sync::mpsc::{channel, Sender}; +use std::sync::Arc; + +use crate::IoManagerCached; +use dbs_arch::regs; +use dbs_boot::get_fdt_addr; +use dbs_utils::time::TimestampUs; +use kvm_ioctls::{VcpuFd, VmFd}; +use vm_memory::{Address, GuestAddress, GuestAddressSpace}; +use vmm_sys_util::eventfd::EventFd; + +use crate::address_space_manager::GuestAddressSpaceImpl; +use crate::vcpu::vcpu_impl::{Result, Vcpu, VcpuError, VcpuStateEvent}; +use crate::vcpu::VcpuConfig; + +#[allow(unused)] +impl Vcpu { + /// Constructs a new VCPU for `vm`. + /// + /// # Arguments + /// + /// * `id` - Represents the CPU number between [0, max vcpus). + /// * `vcpu_fd` - The kvm `VcpuFd` for the vcpu. + /// * `io_mgr` - The io-manager used to access port-io and mmio devices. + /// * `exit_evt` - An `EventFd` that will be written into when this vcpu + /// exits. + /// * `vcpu_state_event` - The eventfd which can notify vmm state of some + /// vcpu should change. + /// * `vcpu_state_sender` - The channel to send state change message from + /// vcpu thread to vmm thread. + /// * `create_ts` - A timestamp used by the vcpu to calculate its lifetime. + /// * `support_immediate_exit` - whether kvm uses supports immediate_exit flag. + pub fn new_aarch64( + id: u8, + vcpu_fd: Arc, + io_mgr: IoManagerCached, + exit_evt: EventFd, + vcpu_state_event: EventFd, + vcpu_state_sender: Sender, + create_ts: TimestampUs, + support_immediate_exit: bool, + ) -> Result { + let (event_sender, event_receiver) = channel(); + let (response_sender, response_receiver) = channel(); + + Ok(Vcpu { + fd: vcpu_fd, + id, + io_mgr, + create_ts, + event_receiver, + event_sender: Some(event_sender), + response_receiver: Some(response_receiver), + response_sender, + vcpu_state_event, + vcpu_state_sender, + support_immediate_exit, + mpidr: 0, + exit_evt, + }) + } + + /// Configures an aarch64 specific vcpu. 
+ /// + /// # Arguments + /// + /// * `vcpu_config` - vCPU config for this vCPU status + /// * `vm_fd` - The kvm `VmFd` for this microvm. + /// * `vm_as` - The guest memory address space used by this microvm. + /// * `kernel_load_addr` - Offset from `guest_mem` at which the kernel is loaded. + /// * `_pgtable_addr` - pgtable address for ap vcpu (not used in aarch64) + pub fn configure( + &mut self, + _vcpu_config: &VcpuConfig, + vm_fd: &VmFd, + vm_as: &GuestAddressSpaceImpl, + kernel_load_addr: Option, + _pgtable_addr: Option, + ) -> Result<()> { + let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); + + // This reads back the kernel's preferred target type. + vm_fd + .get_preferred_target(&mut kvi) + .map_err(VcpuError::VcpuArmPreferredTarget)?; + // We already checked that the capability is supported. + kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2; + // Non-boot cpus are powered off initially. + if self.id > 0 { + kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF; + } + + self.fd.vcpu_init(&kvi).map_err(VcpuError::VcpuArmInit)?; + + if let Some(address) = kernel_load_addr { + regs::setup_regs( + &self.fd, + self.id, + address.raw_value(), + get_fdt_addr(vm_as.memory().deref()), + ) + .map_err(VcpuError::REGSConfiguration)?; + } + + self.mpidr = regs::read_mpidr(&self.fd).map_err(VcpuError::REGSConfiguration)?; + + Ok(()) + } + + /// Gets the MPIDR register value. + pub fn get_mpidr(&self) -> u64 { + self.mpidr + } +} diff --git a/src/dragonball/src/vcpu/mod.rs b/src/dragonball/src/vcpu/mod.rs new file mode 100644 index 0000000000..5b8ed397ff --- /dev/null +++ b/src/dragonball/src/vcpu/mod.rs @@ -0,0 +1,34 @@ +// Copyright (C) 2022 Alibaba Cloud Computing. All rights reserved. +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// +// SPDX-License-Identifier: Apache-2.0 + +mod sm; +mod vcpu_impl; +mod vcpu_manager; + +#[cfg(target_arch = "x86_64")] +use dbs_arch::cpuid::VpmuFeatureLevel; + +pub use vcpu_manager::{VcpuManager, VcpuManagerError}; + +/// vcpu config collection +pub struct VcpuConfig { + /// initial vcpu count + pub boot_vcpu_count: u8, + /// max vcpu count for hotplug + pub max_vcpu_count: u8, + /// threads per core for cpu topology information + pub threads_per_core: u8, + /// cores per die for cpu topology information + pub cores_per_die: u8, + /// dies per socket for cpu topology information + pub dies_per_socket: u8, + /// socket number for cpu topology information + pub sockets: u8, + /// if vpmu feature is Disabled, it means vpmu feature is off (by default) + /// if vpmu feature is LimitedlyEnabled, it means minimal vpmu counters are supported (cycles and instructions) + /// if vpmu feature is FullyEnabled, it means all vpmu counters are supported + #[cfg(target_arch = "x86_64")] + pub vpmu_feature: VpmuFeatureLevel, +} diff --git a/src/dragonball/src/vcpu/sm.rs b/src/dragonball/src/vcpu/sm.rs new file mode 100644 index 0000000000..2a51d64083 --- /dev/null +++ b/src/dragonball/src/vcpu/sm.rs @@ -0,0 +1,149 @@ +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::ops::Deref; + +/// Simple abstraction of a state machine. +/// +/// `StateMachine` is a wrapper over `T` that also encodes state information for `T`. +/// +/// Each state for `T` is represented by a `StateFn` which is a function that acts as +/// the state handler for that particular state of `T`. +/// +/// `StateFn` returns exactly one other `StateMachine` thus each state gets clearly +/// defined transitions to other states. +pub struct StateMachine { + function: StateFn, + end_state: bool, +} + +/// Type representing a state handler of a `StateMachine` machine. 
Each state handler +/// is a function from `T` that handles a specific state of `T`. +type StateFn = fn(&mut T) -> StateMachine; + +impl StateMachine { + /// Creates a new state wrapper. + /// + /// # Arguments + /// + /// `function` - the state handler for this state. + /// `end_state` - whether this state is final. + pub fn new(function: StateFn, end_state: bool) -> StateMachine { + StateMachine { + function, + end_state, + } + } + + /// Creates a new state wrapper that has further possible transitions. + /// + /// # Arguments + /// + /// `function` - the state handler for this state. + pub fn next(function: StateFn) -> StateMachine { + StateMachine::new(function, false) + } + + /// Creates a new state wrapper that has no further transitions. The state machine + /// will finish after running this handler. + /// + /// # Arguments + /// + /// `function` - the state handler for this last state. + pub fn finish(function: StateFn) -> StateMachine { + StateMachine::new(function, true) + } + + /// Runs a state machine for `T` starting from the provided state. + /// + /// # Arguments + /// + /// `machine` - a mutable reference to the object running through the various states. + /// `starting_state_fn` - a `fn(&mut T) -> StateMachine` that should be the handler for + /// the initial state. + pub fn run(machine: &mut T, starting_state_fn: StateFn) { + // Start off in the `starting_state` state. + let mut sf = StateMachine::new(starting_state_fn, false); + // While current state is not a final/end state, keep churning. + while !sf.end_state { + // Run the current state handler, and get the next one. + sf = sf(machine); + } + } +} + +// Implement Deref of `StateMachine` so that we can directly call its underlying state handler. +impl Deref for StateMachine { + type Target = StateFn; + fn deref(&self) -> &Self::Target { + &self.function + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // DummyMachine with states `s1`, `s2` and `s3`. 
+ struct DummyMachine { + private_data_s1: bool, + private_data_s2: bool, + private_data_s3: bool, + } + + impl DummyMachine { + fn new() -> Self { + DummyMachine { + private_data_s1: false, + private_data_s2: false, + private_data_s3: false, + } + } + + // DummyMachine functions here. + + // Simple state-machine: start->s1->s2->s3->done. + fn run(&mut self) { + // Verify the machine has not run yet. + assert!(!self.private_data_s1); + assert!(!self.private_data_s2); + assert!(!self.private_data_s3); + + // Run the state-machine. + StateMachine::run(self, Self::s1); + + // Verify the machine went through all states. + assert!(self.private_data_s1); + assert!(self.private_data_s2); + assert!(self.private_data_s3); + } + + fn s1(&mut self) -> StateMachine { + // Verify private data mutates along with the states. + assert!(!self.private_data_s1); + self.private_data_s1 = true; + StateMachine::next(Self::s2) + } + + fn s2(&mut self) -> StateMachine { + // Verify private data mutates along with the states. + assert!(!self.private_data_s2); + self.private_data_s2 = true; + StateMachine::next(Self::s3) + } + + fn s3(&mut self) -> StateMachine { + // Verify private data mutates along with the states. + assert!(!self.private_data_s3); + self.private_data_s3 = true; + // The machine ends here, adding `s1` as next state to validate this. + StateMachine::finish(Self::s1) + } + } + + #[test] + fn test_sm() { + let mut machine = DummyMachine::new(); + machine.run(); + } +} diff --git a/src/dragonball/src/vcpu/vcpu_impl.rs b/src/dragonball/src/vcpu/vcpu_impl.rs new file mode 100644 index 0000000000..513fa435f9 --- /dev/null +++ b/src/dragonball/src/vcpu/vcpu_impl.rs @@ -0,0 +1,975 @@ +// Copyright (C) 2019-2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be
+// found in the THIRD-PARTY file.
+
+//! The implementation for per vcpu
+
+use std::cell::Cell;
+use std::result;
+use std::sync::atomic::{fence, Ordering};
+use std::sync::mpsc::{Receiver, Sender, TryRecvError};
+use std::sync::{Arc, Barrier};
+use std::thread;
+
+use dbs_utils::time::TimestampUs;
+use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
+use kvm_ioctls::{VcpuExit, VcpuFd};
+use libc::{c_int, c_void, siginfo_t};
+use log::{error, info};
+use seccompiler::{apply_filter, BpfProgram, Error as SecError};
+use vmm_sys_util::eventfd::EventFd;
+use vmm_sys_util::signal::{register_signal_handler, Killable};
+
+use super::sm::StateMachine;
+use crate::metric::{IncMetric, METRICS};
+use crate::signal_handler::sigrtmin;
+use crate::IoManagerCached;
+
+#[cfg(target_arch = "x86_64")]
+#[path = "x86_64.rs"]
+mod x86_64;
+
+#[cfg(target_arch = "aarch64")]
+#[path = "aarch64.rs"]
+mod aarch64;
+
+#[cfg(target_arch = "x86_64")]
+const MAGIC_IOPORT_BASE: u16 = 0xdbdb;
+#[cfg(target_arch = "x86_64")]
+const MAGIC_IOPORT_DEBUG_INFO: u16 = MAGIC_IOPORT_BASE;
+
+/// Signal number (SIGRTMIN) used to kick Vcpus.
+pub const VCPU_RTSIG_OFFSET: i32 = 0;
+
+#[cfg(target_arch = "x86_64")]
+/// Errors associated with the wrappers over KVM ioctls.
+#[derive(Debug, thiserror::Error)]
+pub enum VcpuError {
+    /// Failed to signal Vcpu.
+    #[error("cannot signal the vCPU thread")]
+    SignalVcpu(#[source] vmm_sys_util::errno::Error),
+
+    /// Cannot open the vCPU file descriptor.
+    #[error("cannot open the vCPU file descriptor")]
+    VcpuFd(#[source] kvm_ioctls::Error),
+
+    /// Cannot spawn a new vCPU thread.
+    #[error("cannot spawn vCPU thread")]
+    VcpuSpawn(#[source] std::io::Error),
+
+    /// Cannot cleanly initialize vCPU TLS.
+    #[error("cannot cleanly initialize TLS for vCPU")]
+    VcpuTlsInit,
+
+    /// Vcpu not present in TLS.
+ #[error("vCPU not present in the TLS")] + VcpuTlsNotPresent, + + /// Unexpected KVM_RUN exit reason + #[error("Unexpected KVM_RUN exit reason")] + VcpuUnhandledKvmExit, + + /// Pause vcpu failed + #[error("failed to pause vcpus")] + PauseFailed, + + /// Kvm Ioctl Error + #[error("failure in issuing KVM ioctl command")] + Kvm(#[source] kvm_ioctls::Error), + + /// Msr error + #[error("failure to deal with MSRs")] + Msr(vmm_sys_util::fam::Error), + + /// A call to cpuid instruction failed on x86_64. + #[error("failure while configuring CPUID for virtual CPU on x86_64")] + CpuId(dbs_arch::cpuid::Error), + + /// Error configuring the floating point related registers on x86_64. + #[error("failure while configuring the floating point related registers on x86_64")] + FPUConfiguration(dbs_arch::regs::Error), + + /// Cannot set the local interruption due to bad configuration on x86_64. + #[error("cannot set the local interruption due to bad configuration on x86_64")] + LocalIntConfiguration(dbs_arch::interrupts::Error), + + /// Error configuring the MSR registers on x86_64. + #[error("failure while configuring the MSR registers on x86_64")] + MSRSConfiguration(dbs_arch::regs::Error), + + /// Error configuring the general purpose registers on x86_64. + #[error("failure while configuring the general purpose registers on x86_64")] + REGSConfiguration(dbs_arch::regs::Error), + + /// Error configuring the special registers on x86_64. + #[error("failure while configuring the special registers on x86_64")] + SREGSConfiguration(dbs_arch::regs::Error), + + /// Error configuring the page table on x86_64. + #[error("failure while configuring the page table on x86_64")] + PageTable(dbs_boot::Error), + + /// The call to KVM_SET_CPUID2 failed on x86_64. + #[error("failure while calling KVM_SET_CPUID2 on x86_64")] + SetSupportedCpusFailed(#[source] kvm_ioctls::Error), +} + +#[cfg(target_arch = "aarch64")] +/// Errors associated with the wrappers over KVM ioctls. 
+#[derive(Debug, thiserror::Error)]
+pub enum VcpuError {
+    /// Failed to signal Vcpu.
+    #[error("cannot signal the vCPU thread")]
+    SignalVcpu(#[source] vmm_sys_util::errno::Error),
+
+    /// Cannot open the vCPU file descriptor.
+    #[error("cannot open the vCPU file descriptor")]
+    VcpuFd(#[source] kvm_ioctls::Error),
+
+    /// Cannot spawn a new vCPU thread.
+    #[error("cannot spawn vCPU thread")]
+    VcpuSpawn(#[source] std::io::Error),
+
+    /// Cannot cleanly initialize vCPU TLS.
+    #[error("cannot cleanly initialize TLS for vCPU")]
+    VcpuTlsInit,
+
+    /// Vcpu not present in TLS.
+    #[error("vCPU not present in the TLS")]
+    VcpuTlsNotPresent,
+
+    /// Unexpected KVM_RUN exit reason
+    #[error("Unexpected KVM_RUN exit reason")]
+    VcpuUnhandledKvmExit,
+
+    /// Pause vcpu failed
+    #[error("failed to pause vcpus")]
+    PauseFailed,
+
+    /// Kvm Ioctl Error
+    #[error("failure in issuing KVM ioctl command")]
+    Kvm(#[source] kvm_ioctls::Error),
+
+    /// Msr error
+    #[error("failure to deal with MSRs")]
+    Msr(vmm_sys_util::fam::Error),
+
+    #[cfg(target_arch = "aarch64")]
+    /// Error configuring the general purpose aarch64 registers on aarch64.
+    #[error("failure while configuring the general purpose registers on aarch64")]
+    REGSConfiguration(dbs_arch::regs::Error),
+
+    #[cfg(target_arch = "aarch64")]
+    /// Error setting up the global interrupt controller on aarch64.
+    #[error("failure while setting up the global interrupt controller on aarch64")]
+    SetupGIC(dbs_arch::gic::Error),
+
+    #[cfg(target_arch = "aarch64")]
+    /// Error getting the Vcpu preferred target on aarch64.
+    #[error("failure while getting the vCPU preferred target on aarch64")]
+    VcpuArmPreferredTarget(kvm_ioctls::Error),
+
+    #[cfg(target_arch = "aarch64")]
+    /// Error doing vCPU Init on aarch64.
+    #[error("failure while doing vCPU init on aarch64")]
+    VcpuArmInit(kvm_ioctls::Error),
+}
+
+/// Result for Vcpu related operations.
+pub type Result = result::Result;
+
+/// List of events that the Vcpu can receive.
+#[derive(Debug)]
+pub enum VcpuEvent {
+    /// Kill the Vcpu.
+    Exit,
+    /// Pause the Vcpu.
+    Pause,
+    /// Event that should resume the Vcpu.
+    Resume,
+    /// Get vcpu thread tid
+    Gettid,
+
+    /// Event to revalidate vcpu IoManager cache
+    RevalidateCache,
+}
+
+/// List of responses that the Vcpu reports.
+pub enum VcpuResponse {
+    /// Vcpu is paused.
+    Paused,
+    /// Vcpu is resumed.
+    Resumed,
+    /// Vcpu index and thread tid.
+    Tid(u8, u32),
+    /// Requested Vcpu operation is not allowed.
+    NotAllowed,
+    /// Requested action encountered an error
+    Error(VcpuError),
+    /// Vcpu IoManager cache is revalidated
+    CacheRevalidated,
+}
+
+/// List of events that the vcpu_state_sender can send.
+pub enum VcpuStateEvent {
+    /// (result, response) for hotplug, result 0 means failure, 1 means success.
+    Hotplug((i32, u32)),
+}
+
+/// Wrapper over vCPU that hides the underlying interactions with the vCPU thread.
+pub struct VcpuHandle {
+    event_sender: Sender,
+    response_receiver: Receiver,
+    vcpu_thread: thread::JoinHandle<()>,
+}
+
+impl VcpuHandle {
+    /// Send event to vCPU thread
+    pub fn send_event(&self, event: VcpuEvent) -> Result<()> {
+        // Use expect() to crash if the other thread closed this channel.
+        self.event_sender
+            .send(event)
+            .expect("event sender channel closed on vcpu end.");
+        // Kick the vCPU so it picks up the message.
+        self.vcpu_thread
+            .kill(sigrtmin() + VCPU_RTSIG_OFFSET)
+            .map_err(VcpuError::SignalVcpu)?;
+        Ok(())
+    }
+
+    /// Receive response from vcpu thread
+    pub fn response_receiver(&self) -> &Receiver {
+        &self.response_receiver
+    }
+
+    #[allow(dead_code)]
+    /// Join the vcpu thread
+    pub fn join_vcpu_thread(self) -> thread::Result<()> {
+        self.vcpu_thread.join()
+    }
+}
+
+#[derive(PartialEq)]
+enum VcpuEmulation {
+    Handled,
+    Interrupted,
+    Stopped,
+}
+
+/// A wrapper around creating and using a kvm-based VCPU.
+pub struct Vcpu { + // vCPU fd used by the vCPU + fd: Arc, + // vCPU id info + id: u8, + // Io manager Cached for facilitating IO operations + io_mgr: IoManagerCached, + // Records vCPU create time stamp + create_ts: TimestampUs, + + // The receiving end of events channel owned by the vcpu side. + event_receiver: Receiver, + // The transmitting end of the events channel which will be given to the handler. + event_sender: Option>, + // The receiving end of the responses channel which will be given to the handler. + response_receiver: Option>, + // The transmitting end of the responses channel owned by the vcpu side. + response_sender: Sender, + // Event notifier for CPU hotplug. + // After arm adapts to hotplug vcpu, the dead code macro needs to be removed + #[cfg_attr(target_arch = "aarch64", allow(dead_code))] + vcpu_state_event: EventFd, + // CPU hotplug events. + // After arm adapts to hotplug vcpu, the dead code macro needs to be removed + #[cfg_attr(target_arch = "aarch64", allow(dead_code))] + vcpu_state_sender: Sender, + + // An `EventFd` that will be written into when this vcpu exits. + exit_evt: EventFd, + // Whether kvm used supports immediate_exit flag. + support_immediate_exit: bool, + + // CPUID information for the x86_64 CPU + #[cfg(target_arch = "x86_64")] + cpuid: kvm_bindings::CpuId, + + /// Multiprocessor affinity register recorded for aarch64 + #[cfg(target_arch = "aarch64")] + pub(crate) mpidr: u64, +} + +// Using this for easier explicit type-casting to help IDEs interpret the code. +type VcpuCell = Cell>; + +impl Vcpu { + thread_local!(static TLS_VCPU_PTR: VcpuCell = Cell::new(None)); + + /// Associates `self` with the current thread. + /// + /// It is a prerequisite to successfully run `init_thread_local_data()` before using + /// `run_on_thread_local()` on the current thread. + /// This function will return an error if there already is a `Vcpu` present in the TLS. 
+ fn init_thread_local_data(&mut self) -> Result<()> { + Self::TLS_VCPU_PTR.with(|cell: &VcpuCell| { + if cell.get().is_some() { + return Err(VcpuError::VcpuTlsInit); + } + cell.set(Some(self as *const Vcpu)); + Ok(()) + }) + } + + /// Deassociates `self` from the current thread. + /// + /// Should be called if the current `self` had called `init_thread_local_data()` and + /// now needs to move to a different thread. + /// + /// Fails if `self` was not previously associated with the current thread. + fn reset_thread_local_data(&mut self) -> Result<()> { + // Best-effort to clean up TLS. If the `Vcpu` was moved to another thread + // _before_ running this, then there is nothing we can do. + Self::TLS_VCPU_PTR.with(|cell: &VcpuCell| { + if let Some(vcpu_ptr) = cell.get() { + if vcpu_ptr == self as *const Vcpu { + Self::TLS_VCPU_PTR.with(|cell: &VcpuCell| cell.take()); + return Ok(()); + } + } + Err(VcpuError::VcpuTlsNotPresent) + }) + } + + /// Runs `func` for the `Vcpu` associated with the current thread. + /// + /// It requires that `init_thread_local_data()` was run on this thread. + /// + /// Fails if there is no `Vcpu` associated with the current thread. + /// + /// # Safety + /// + /// This is marked unsafe as it allows temporary aliasing through + /// dereferencing from pointer an already borrowed `Vcpu`. + unsafe fn run_on_thread_local(func: F) -> Result<()> + where + F: FnOnce(&Vcpu), + { + Self::TLS_VCPU_PTR.with(|cell: &VcpuCell| { + if let Some(vcpu_ptr) = cell.get() { + // Dereferencing here is safe since `TLS_VCPU_PTR` is populated/non-empty, + // and it is being cleared on `Vcpu::drop` so there is no dangling pointer. + let vcpu_ref: &Vcpu = &*vcpu_ptr; + func(vcpu_ref); + Ok(()) + } else { + Err(VcpuError::VcpuTlsNotPresent) + } + }) + } + + /// Registers a signal handler which makes use of TLS and kvm immediate exit to + /// kick the vcpu running on the current thread, if there is one. 
+ pub fn register_kick_signal_handler() { + extern "C" fn handle_signal(_: c_int, _: *mut siginfo_t, _: *mut c_void) { + // This is safe because it's temporarily aliasing the `Vcpu` object, but we are + // only reading `vcpu.fd` which does not change for the lifetime of the `Vcpu`. + unsafe { + let _ = Vcpu::run_on_thread_local(|vcpu| { + vcpu.fd.set_kvm_immediate_exit(1); + fence(Ordering::Release); + }); + } + } + + register_signal_handler(sigrtmin() + VCPU_RTSIG_OFFSET, handle_signal) + .expect("Failed to register vcpu signal handler"); + } + + /// Returns the cpu index as seen by the guest OS. + pub fn cpu_index(&self) -> u8 { + self.id + } + + /// Moves the vcpu to its own thread and constructs a VcpuHandle. + /// The handle can be used to control the remote vcpu. + pub fn start_threaded( + mut self, + seccomp_filter: BpfProgram, + barrier: Arc, + ) -> Result { + let event_sender = self.event_sender.take().unwrap(); + let response_receiver = self.response_receiver.take().unwrap(); + + let vcpu_thread = thread::Builder::new() + .name(format!("db_vcpu{}", self.cpu_index())) + .spawn(move || { + self.init_thread_local_data() + .expect("Cannot cleanly initialize vcpu TLS."); + barrier.wait(); + self.run(seccomp_filter); + }) + .map_err(VcpuError::VcpuSpawn)?; + + Ok(VcpuHandle { + event_sender, + response_receiver, + vcpu_thread, + }) + } + + /// Extract the vcpu running logic for test mocking. + #[cfg(not(test))] + pub fn emulate(fd: &VcpuFd) -> std::result::Result, kvm_ioctls::Error> { + fd.run() + } + + /// Runs the vCPU in KVM context and handles the kvm exit reason. + /// + /// Returns error or enum specifying whether emulation was handled or interrupted. 
+ fn run_emulation(&mut self) -> Result { + match Vcpu::emulate(&self.fd) { + Ok(run) => match run { + #[cfg(target_arch = "x86_64")] + VcpuExit::IoIn(addr, data) => { + let _ = self.io_mgr.pio_read(addr, data); + METRICS.vcpu.exit_io_in.inc(); + Ok(VcpuEmulation::Handled) + } + #[cfg(target_arch = "x86_64")] + VcpuExit::IoOut(addr, data) => { + if !self.check_io_port_info(addr, data)? { + let _ = self.io_mgr.pio_write(addr, data); + } + METRICS.vcpu.exit_io_out.inc(); + Ok(VcpuEmulation::Handled) + } + VcpuExit::MmioRead(addr, data) => { + let _ = self.io_mgr.mmio_read(addr, data); + METRICS.vcpu.exit_mmio_read.inc(); + Ok(VcpuEmulation::Handled) + } + VcpuExit::MmioWrite(addr, data) => { + let _ = self.io_mgr.mmio_write(addr, data); + METRICS.vcpu.exit_mmio_write.inc(); + Ok(VcpuEmulation::Handled) + } + VcpuExit::Hlt => { + info!("Received KVM_EXIT_HLT signal"); + Err(VcpuError::VcpuUnhandledKvmExit) + } + VcpuExit::Shutdown => { + info!("Received KVM_EXIT_SHUTDOWN signal"); + Err(VcpuError::VcpuUnhandledKvmExit) + } + // Documentation specifies that below kvm exits are considered errors. 
+ VcpuExit::FailEntry => { + METRICS.vcpu.failures.inc(); + error!("Received KVM_EXIT_FAIL_ENTRY signal"); + Err(VcpuError::VcpuUnhandledKvmExit) + } + VcpuExit::InternalError => { + METRICS.vcpu.failures.inc(); + error!("Received KVM_EXIT_INTERNAL_ERROR signal"); + Err(VcpuError::VcpuUnhandledKvmExit) + } + VcpuExit::SystemEvent(event_type, event_flags) => match event_type { + KVM_SYSTEM_EVENT_RESET | KVM_SYSTEM_EVENT_SHUTDOWN => { + info!( + "Received KVM_SYSTEM_EVENT: type: {}, event: {}", + event_type, event_flags + ); + Ok(VcpuEmulation::Stopped) + } + _ => { + METRICS.vcpu.failures.inc(); + error!( + "Received KVM_SYSTEM_EVENT signal type: {}, flag: {}", + event_type, event_flags + ); + Err(VcpuError::VcpuUnhandledKvmExit) + } + }, + r => { + METRICS.vcpu.failures.inc(); + // TODO: Are we sure we want to finish running a vcpu upon + // receiving a vm exit that is not necessarily an error? + error!("Unexpected exit reason on vcpu run: {:?}", r); + Err(VcpuError::VcpuUnhandledKvmExit) + } + }, + // The unwrap on raw_os_error can only fail if we have a logic + // error in our code in which case it is better to panic. + Err(ref e) => { + match e.errno() { + libc::EAGAIN => Ok(VcpuEmulation::Handled), + libc::EINTR => { + self.fd.set_kvm_immediate_exit(0); + // Notify that this KVM_RUN was interrupted. 
+ Ok(VcpuEmulation::Interrupted) + } + _ => { + METRICS.vcpu.failures.inc(); + error!("Failure during vcpu run: {}", e); + #[cfg(target_arch = "x86_64")] + { + error!( + "dump regs: {:?}, dump sregs: {:?}", + self.fd.get_regs(), + self.fd.get_sregs() + ); + } + Err(VcpuError::VcpuUnhandledKvmExit) + } + } + } + } + } + + #[cfg(target_arch = "x86_64")] + // checkout the io port that dragonball used only + fn check_io_port_info(&self, addr: u16, data: &[u8]) -> Result { + let mut checked = false; + + match addr { + // debug info signal + MAGIC_IOPORT_DEBUG_INFO => { + if data.len() == 4 { + let data = unsafe { std::ptr::read(data.as_ptr() as *const u32) }; + log::warn!("KDBG: guest kernel debug info: 0x{:x}", data); + checked = true; + } + } + _ => {} + }; + + Ok(checked) + } + + fn gettid() -> u32 { + nix::unistd::gettid().as_raw() as u32 + } + + fn revalidate_cache(&mut self) -> Result<()> { + self.io_mgr.revalidate_cache(); + + Ok(()) + } + + /// Main loop of the vCPU thread. + /// + /// Runs the vCPU in KVM context in a loop. Handles KVM_EXITs then goes back in. + /// Note that the state of the VCPU and associated VM must be setup first for this to do + /// anything useful. + pub fn run(&mut self, seccomp_filter: BpfProgram) { + // Load seccomp filters for this vCPU thread. + // Execution panics if filters cannot be loaded, use --seccomp-level=0 if skipping filters + // altogether is the desired behaviour. + if let Err(e) = apply_filter(&seccomp_filter) { + if matches!(e, SecError::EmptyFilter) { + info!("vCPU thread {} use empty seccomp filters.", self.id); + } else { + panic!( + "Failed to set the requested seccomp filters on vCPU {}: Error: {}", + self.id, e + ); + } + } + + info!("vcpu {} is running", self.cpu_index()); + + // Start running the machine state in the `Paused` state. + StateMachine::run(self, Self::paused); + } + + // This is the main loop of the `Running` state. 
+ fn running(&mut self) -> StateMachine { + // This loop is here just for optimizing the emulation path. + // No point in ticking the state machine if there are no external events. + loop { + match self.run_emulation() { + // Emulation ran successfully, continue. + Ok(VcpuEmulation::Handled) => { + // We need to break here if kvm doesn't support + // immediate_exit flag. Because the signal sent from vmm + // thread may occurs when handling the vcpu exit events, and + // in this case the external vcpu events may not be handled + // correctly, so we need to check the event_receiver channel + // after handle vcpu exit events to decrease the window that + // doesn't handle the vcpu external events. + if !self.support_immediate_exit { + break; + } + } + // Emulation was interrupted, check external events. + Ok(VcpuEmulation::Interrupted) => break, + // Emulation was stopped due to reset or shutdown. + Ok(VcpuEmulation::Stopped) => return StateMachine::next(Self::waiting_exit), + // Emulation errors lead to vCPU exit. + Err(e) => { + error!("vcpu: {}, run_emulation failed: {:?}", self.id, e); + return StateMachine::next(Self::waiting_exit); + } + } + } + + // By default don't change state. + let mut state = StateMachine::next(Self::running); + + // Break this emulation loop on any transition request/external event. + match self.event_receiver.try_recv() { + // Running ---- Exit ----> Exited + Ok(VcpuEvent::Exit) => { + // Move to 'exited' state. + state = StateMachine::next(Self::exited); + } + // Running ---- Pause ----> Paused + Ok(VcpuEvent::Pause) => { + // Nothing special to do. + self.response_sender + .send(VcpuResponse::Paused) + .expect("failed to send pause status"); + + // TODO: we should call `KVM_KVMCLOCK_CTRL` here to make sure + // TODO continued: the guest soft lockup watchdog does not panic on Resume. + //let _ = self.fd.kvmclock_ctrl(); + + // Move to 'paused' state. 
+ state = StateMachine::next(Self::paused); + } + Ok(VcpuEvent::Resume) => { + self.response_sender + .send(VcpuResponse::Resumed) + .expect("failed to send resume status"); + } + Ok(VcpuEvent::Gettid) => { + self.response_sender + .send(VcpuResponse::Tid(self.cpu_index(), Vcpu::gettid())) + .expect("failed to send vcpu thread tid"); + } + Ok(VcpuEvent::RevalidateCache) => { + self.revalidate_cache() + .map(|()| { + self.response_sender + .send(VcpuResponse::CacheRevalidated) + .expect("failed to revalidate vcpu IoManager cache"); + }) + .map_err(|e| self.response_sender.send(VcpuResponse::Error(e))) + .expect("failed to revalidate vcpu IoManager cache"); + } + // Unhandled exit of the other end. + Err(TryRecvError::Disconnected) => { + // Move to 'exited' state. + state = StateMachine::next(Self::exited); + } + // All other events or lack thereof have no effect on current 'running' state. + Err(TryRecvError::Empty) => (), + } + + state + } + + // This is the main loop of the `Paused` state. + fn paused(&mut self) -> StateMachine { + match self.event_receiver.recv() { + // Paused ---- Exit ----> Exited + Ok(VcpuEvent::Exit) => { + // Move to 'exited' state. + StateMachine::next(Self::exited) + } + // Paused ---- Resume ----> Running + Ok(VcpuEvent::Resume) => { + self.response_sender + .send(VcpuResponse::Resumed) + .expect("failed to send resume status"); + // Move to 'running' state. + StateMachine::next(Self::running) + } + Ok(VcpuEvent::Pause) => { + self.response_sender + .send(VcpuResponse::Paused) + .expect("failed to send pause status"); + // continue 'pause' state. 
+ StateMachine::next(Self::paused) + } + Ok(VcpuEvent::Gettid) => { + self.response_sender + .send(VcpuResponse::Tid(self.cpu_index(), Vcpu::gettid())) + .expect("failed to send vcpu thread tid"); + StateMachine::next(Self::paused) + } + Ok(VcpuEvent::RevalidateCache) => { + self.revalidate_cache() + .map(|()| { + self.response_sender + .send(VcpuResponse::CacheRevalidated) + .expect("failed to revalidate vcpu IoManager cache"); + }) + .map_err(|e| self.response_sender.send(VcpuResponse::Error(e))) + .expect("failed to revalidate vcpu IoManager cache"); + + StateMachine::next(Self::paused) + } + // Unhandled exit of the other end. + Err(_) => { + // Move to 'exited' state. + StateMachine::next(Self::exited) + } + } + } + + // This is the main loop of the `WaitingExit` state. + fn waiting_exit(&mut self) -> StateMachine { + // trigger vmm to stop machine + if let Err(e) = self.exit_evt.write(1) { + METRICS.vcpu.failures.inc(); + error!("Failed signaling vcpu exit event: {}", e); + } + + let mut state = StateMachine::next(Self::waiting_exit); + + match self.event_receiver.recv() { + Ok(VcpuEvent::Exit) => state = StateMachine::next(Self::exited), + Ok(_) => error!( + "wrong state received in waiting exit state on vcpu {}", + self.id + ), + Err(_) => { + error!( + "vcpu channel closed in waiting exit state on vcpu {}", + self.id + ); + state = StateMachine::next(Self::exited); + } + } + + state + } + + // This is the main loop of the `Exited` state. + fn exited(&mut self) -> StateMachine { + // State machine reached its end. 
+ StateMachine::finish(Self::exited) + } +} + +impl Drop for Vcpu { + fn drop(&mut self) { + let _ = self.reset_thread_local_data(); + } +} + +#[cfg(test)] +pub mod tests { + use std::os::unix::io::AsRawFd; + use std::sync::mpsc::{channel, Receiver}; + use std::sync::Mutex; + + use arc_swap::ArcSwap; + use dbs_device::device_manager::IoManager; + use kvm_ioctls::Kvm; + use lazy_static::lazy_static; + + use super::*; + use crate::kvm_context::KvmContext; + + pub enum EmulationCase { + IoIn, + IoOut, + MmioRead, + MmioWrite, + Hlt, + Shutdown, + FailEntry, + InternalError, + Unknown, + SystemEvent(u32, u64), + Error(i32), + } + + lazy_static! { + pub static ref EMULATE_RES: Mutex = Mutex::new(EmulationCase::Unknown); + } + + impl Vcpu { + pub fn emulate(_fd: &VcpuFd) -> std::result::Result, kvm_ioctls::Error> { + let res = &*EMULATE_RES.lock().unwrap(); + match res { + EmulationCase::IoIn => Ok(VcpuExit::IoIn(0, &mut [])), + EmulationCase::IoOut => Ok(VcpuExit::IoOut(0, &[])), + EmulationCase::MmioRead => Ok(VcpuExit::MmioRead(0, &mut [])), + EmulationCase::MmioWrite => Ok(VcpuExit::MmioWrite(0, &[])), + EmulationCase::Hlt => Ok(VcpuExit::Hlt), + EmulationCase::Shutdown => Ok(VcpuExit::Shutdown), + EmulationCase::FailEntry => Ok(VcpuExit::FailEntry), + EmulationCase::InternalError => Ok(VcpuExit::InternalError), + EmulationCase::Unknown => Ok(VcpuExit::Unknown), + EmulationCase::SystemEvent(event_type, event_flags) => { + Ok(VcpuExit::SystemEvent(*event_type, *event_flags)) + } + EmulationCase::Error(e) => Err(kvm_ioctls::Error::new(*e)), + } + } + } + + #[cfg(target_arch = "x86_64")] + fn create_vcpu() -> (Vcpu, Receiver) { + // Call for kvm too frequently would cause error in some host kernel. 
+ std::thread::sleep(std::time::Duration::from_millis(5)); + + let kvm = Kvm::new().unwrap(); + let vm = Arc::new(kvm.create_vm().unwrap()); + let kvm_context = KvmContext::new(Some(kvm.as_raw_fd())).unwrap(); + let vcpu_fd = Arc::new(vm.create_vcpu(0).unwrap()); + let io_manager = IoManagerCached::new(Arc::new(ArcSwap::new(Arc::new(IoManager::new())))); + let supported_cpuid = kvm_context + .supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES) + .unwrap(); + let reset_event_fd = EventFd::new(libc::EFD_NONBLOCK).unwrap(); + let vcpu_state_event = EventFd::new(libc::EFD_NONBLOCK).unwrap(); + let (tx, rx) = channel(); + let time_stamp = TimestampUs::default(); + + let vcpu = Vcpu::new_x86_64( + 0, + vcpu_fd, + io_manager, + supported_cpuid, + reset_event_fd, + vcpu_state_event, + tx, + time_stamp, + false, + ) + .unwrap(); + + (vcpu, rx) + } + + #[cfg(target_arch = "aarch64")] + fn create_vcpu() -> (Vcpu, Receiver) { + // Call for kvm too frequently would cause error in some host kernel. + std::thread::sleep(std::time::Duration::from_millis(5)); + + let kvm = Kvm::new().unwrap(); + let vm = Arc::new(kvm.create_vm().unwrap()); + let kvm_context = KvmContext::new(Some(kvm.as_raw_fd())).unwrap(); + let vcpu_fd = Arc::new(vm.create_vcpu(0).unwrap()); + let io_manager = IoManagerCached::new(Arc::new(ArcSwap::new(Arc::new(IoManager::new())))); + let reset_event_fd = EventFd::new(libc::EFD_NONBLOCK).unwrap(); + let vcpu_state_event = EventFd::new(libc::EFD_NONBLOCK).unwrap(); + let (tx, rx) = channel(); + let time_stamp = TimestampUs::default(); + + let vcpu = Vcpu::new_aarch64( + 0, + vcpu_fd, + io_manager, + reset_event_fd, + vcpu_state_event, + tx, + time_stamp, + false, + ) + .unwrap(); + + (vcpu, rx) + } + + #[test] + fn test_vcpu_run_emulation() { + let (mut vcpu, _) = create_vcpu(); + + #[cfg(target_arch = "x86_64")] + { + // Io in + *(EMULATE_RES.lock().unwrap()) = EmulationCase::IoIn; + let res = vcpu.run_emulation(); + assert!(matches!(res, 
Ok(VcpuEmulation::Handled))); + + // Io out + *(EMULATE_RES.lock().unwrap()) = EmulationCase::IoOut; + let res = vcpu.run_emulation(); + assert!(matches!(res, Ok(VcpuEmulation::Handled))); + } + + // Mmio read + *(EMULATE_RES.lock().unwrap()) = EmulationCase::MmioRead; + let res = vcpu.run_emulation(); + assert!(matches!(res, Ok(VcpuEmulation::Handled))); + + // Mmio write + *(EMULATE_RES.lock().unwrap()) = EmulationCase::MmioWrite; + let res = vcpu.run_emulation(); + assert!(matches!(res, Ok(VcpuEmulation::Handled))); + + // KVM_EXIT_HLT signal + *(EMULATE_RES.lock().unwrap()) = EmulationCase::Hlt; + let res = vcpu.run_emulation(); + assert!(matches!(res, Err(VcpuError::VcpuUnhandledKvmExit))); + + // KVM_EXIT_SHUTDOWN signal + *(EMULATE_RES.lock().unwrap()) = EmulationCase::Shutdown; + let res = vcpu.run_emulation(); + assert!(matches!(res, Err(VcpuError::VcpuUnhandledKvmExit))); + + // KVM_EXIT_FAIL_ENTRY signal + *(EMULATE_RES.lock().unwrap()) = EmulationCase::FailEntry; + let res = vcpu.run_emulation(); + assert!(matches!(res, Err(VcpuError::VcpuUnhandledKvmExit))); + + // KVM_EXIT_INTERNAL_ERROR signal + *(EMULATE_RES.lock().unwrap()) = EmulationCase::InternalError; + let res = vcpu.run_emulation(); + assert!(matches!(res, Err(VcpuError::VcpuUnhandledKvmExit))); + + // KVM_SYSTEM_EVENT_RESET + *(EMULATE_RES.lock().unwrap()) = EmulationCase::SystemEvent(KVM_SYSTEM_EVENT_RESET, 0); + let res = vcpu.run_emulation(); + assert!(matches!(res, Ok(VcpuEmulation::Stopped))); + + // KVM_SYSTEM_EVENT_SHUTDOWN + *(EMULATE_RES.lock().unwrap()) = EmulationCase::SystemEvent(KVM_SYSTEM_EVENT_SHUTDOWN, 0); + let res = vcpu.run_emulation(); + assert!(matches!(res, Ok(VcpuEmulation::Stopped))); + + // Other system event + *(EMULATE_RES.lock().unwrap()) = EmulationCase::SystemEvent(0, 0); + let res = vcpu.run_emulation(); + assert!(matches!(res, Err(VcpuError::VcpuUnhandledKvmExit))); + + // Unknown exit reason + *(EMULATE_RES.lock().unwrap()) = EmulationCase::Unknown; + let res 
= vcpu.run_emulation(); + assert!(matches!(res, Err(VcpuError::VcpuUnhandledKvmExit))); + + // Error: EAGAIN + *(EMULATE_RES.lock().unwrap()) = EmulationCase::Error(libc::EAGAIN); + let res = vcpu.run_emulation(); + assert!(matches!(res, Ok(VcpuEmulation::Handled))); + + // Error: EINTR + *(EMULATE_RES.lock().unwrap()) = EmulationCase::Error(libc::EINTR); + let res = vcpu.run_emulation(); + assert!(matches!(res, Ok(VcpuEmulation::Interrupted))); + + // other error + *(EMULATE_RES.lock().unwrap()) = EmulationCase::Error(libc::EINVAL); + let res = vcpu.run_emulation(); + assert!(matches!(res, Err(VcpuError::VcpuUnhandledKvmExit))); + } + + #[cfg(target_arch = "x86_64")] + #[test] + fn test_vcpu_check_io_port_info() { + let (vcpu, _receiver) = create_vcpu(); + + // debug info signal + let res = vcpu + .check_io_port_info(MAGIC_IOPORT_DEBUG_INFO, &[0, 0, 0, 0]) + .unwrap(); + assert!(res); + } +} diff --git a/src/dragonball/src/vcpu/vcpu_manager.rs b/src/dragonball/src/vcpu/vcpu_manager.rs new file mode 100644 index 0000000000..f6f3e93ffa --- /dev/null +++ b/src/dragonball/src/vcpu/vcpu_manager.rs @@ -0,0 +1,1043 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. +// +// Copyright © 2019 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +//! vCPU manager to enable bootstrap and CPU hotplug. 
+use std::io; +use std::os::unix::io::AsRawFd; +use std::sync::mpsc::{channel, Receiver, RecvError, RecvTimeoutError, Sender}; +use std::sync::{Arc, Barrier, Mutex, RwLock}; +use std::time::Duration; + +#[cfg(all(feature = "hotplug", feature = "dbs-upcall"))] +use dbs_upcall::{DevMgrService, UpcallClient}; +use dbs_utils::epoll_manager::{EpollManager, EventOps, EventSet, Events, MutEventSubscriber}; +use dbs_utils::time::TimestampUs; +use kvm_ioctls::{Cap, VcpuFd, VmFd}; +use log::{debug, error, info}; +use seccompiler::{apply_filter, BpfProgram, Error as SecError}; +use vm_memory::GuestAddress; +use vmm_sys_util::eventfd::EventFd; + +use crate::address_space_manager::GuestAddressSpaceImpl; +use crate::api::v1::InstanceInfo; +use crate::kvm_context::KvmContext; +use crate::vcpu::vcpu_impl::{ + Vcpu, VcpuError, VcpuEvent, VcpuHandle, VcpuResponse, VcpuStateEvent, +}; +use crate::vcpu::VcpuConfig; +use crate::vm::VmConfigInfo; +use crate::IoManagerCached; + +/// the timeout for communication with vcpu threads +const CPU_RECV_TIMEOUT_MS: u64 = 1000; + +/// vCPU manager error +#[derive(Debug, thiserror::Error)] +pub enum VcpuManagerError { + /// IO errors in vCPU manager + #[error("IO errors in vCPU manager {0}")] + VcpuIO(#[source] io::Error), + + /// vCPU manager is not initialized + #[error("vcpu manager is not initialized")] + VcpuManagerNotInitialized, + + /// Expected vcpu exceed max count + #[error("expected vcpu exceed max count")] + ExpectedVcpuExceedMax, + + /// vCPU not found + #[error("vcpu not found {0}")] + VcpuNotFound(u8), + + /// Cannot recv vCPU thread tid + #[error("cannot get vCPU thread id")] + VcpuGettid, + + /// vCPU pause failed. + #[error("failure while pausing vCPU thread")] + VcpuPause, + + /// vCPU resume failed. + #[error("failure while resuming vCPU thread")] + VcpuResume, + + /// vCPU save failed. + #[error("failure while save vCPU state")] + VcpuSave, + + /// Vcpu is in unexpected state. 
+ #[error("Vcpu is in unexpected state")] + UnexpectedVcpuResponse, + + /// Vcpu not create + #[error("Vcpu is not create")] + VcpuNotCreate, + + /// The number of max_vcpu reached kvm's limitation + #[error("specified vcpu count {0} is greater than max allowed count {1} by kvm")] + MaxVcpuLimitation(u8, usize), + + /// Revalidate vcpu IoManager cache failed. + #[error("failure while revalidating vcpu IoManager cache")] + VcpuRevalidateCache, + + /// Event fd is already set so there could be some problem in the VMM if we try to reset it. + #[error("Event fd is already set for the vcpu")] + EventAlreadyExist, + + /// Response channel error + #[error("Response channel error: {0}")] + VcpuResponseChannel(RecvError), + + /// Vcpu response timeout + #[error("Vcpu response timeout: {0}")] + VcpuResponseTimeout(RecvTimeoutError), + + /// Cannot build seccomp filters. + #[error("failure while configuring seccomp filters: {0}")] + SeccompFilters(#[source] seccompiler::Error), + + /// Cannot send event to vCPU. + #[error("failure while sending message to vCPU thread: {0}")] + VcpuEvent(#[source] VcpuError), + + /// vCPU Error + #[error("vcpu internal error: {0}")] + Vcpu(#[source] VcpuError), + + #[cfg(feature = "hotplug")] + /// vCPU resize error + #[error("resize vcpu error: {0}")] + VcpuResize(#[source] VcpuResizeError), + + /// Kvm Ioctl Error + #[error("failure in issuing KVM ioctl command: {0}")] + Kvm(#[source] kvm_ioctls::Error), +} + +#[cfg(feature = "hotplug")] +/// Errror associated with resize instance +#[derive(Debug, thiserror::Error)] +pub enum VcpuResizeError { + /// vcpu is in hotplug process + #[error("vcpu is in hotplug process")] + VcpuIsHotplugging, + + /// Cannot update the configuration of the microvm pre boot. 
+ #[error("resize vcpu operation is not allowed after boot")] + UpdateNotAllowedPostBoot, + + /// Expected vcpu exceed max count + #[error("expected vcpu exceed max count")] + ExpectedVcpuExceedMax, + + /// vcpu 0 can't be removed + #[error("vcpu 0 can't be removed")] + Vcpu0CanNotBeRemoved, + + /// Lack removable vcpu + #[error("Removable vcpu not enough, removable vcpu num: {0}, number to remove: {1}, present vcpu count {2}")] + LackRemovableVcpus(u16, u16, u16), + + #[cfg(all(feature = "hotplug", feature = "dbs-upcall"))] + /// Cannot update the configuration by upcall channel. + #[error("cannot update the configuration by upcall channel: {0}")] + Upcall(#[source] dbs_upcall::UpcallClientError), +} + +/// Result for vCPU manager operations +pub type Result = std::result::Result; + +#[derive(Debug, PartialEq, Copy, Clone)] +enum VcpuAction { + None, + Hotplug, + Hotunplug, +} + +/// Infos related to per vcpu +#[derive(Default)] +pub(crate) struct VcpuInfo { + pub(crate) vcpu: Option, + vcpu_fd: Option>, + handle: Option, + tid: u32, +} + +impl std::fmt::Debug for VcpuInfo { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("VcpuInfo") + .field("vcpu", &self.vcpu.is_some()) + .field("vcpu_fd", &self.vcpu_fd.is_some()) + .field("handle", &self.handle.is_some()) + .field("tid", &self.tid) + .finish() + } +} + +/// Manage all vcpu related actions +pub struct VcpuManager { + pub(crate) vcpu_infos: Vec, + vcpu_config: VcpuConfig, + vcpu_seccomp_filter: BpfProgram, + vcpu_state_event: EventFd, + vcpu_state_sender: Sender, + support_immediate_exit: bool, + + // The purpose of putting a reference of IoManager here is to simplify the + // design of the API when creating vcpus, and the IoManager has numerous OS + // resources that need to be released when vmm exits. 
However, since + // VcpuManager is referenced by VcpuEpollHandler and VcpuEpollHandler will + // not be released when vmm is closed, we need to release io manager + // manually when we exit all vcpus. + io_manager: Option, + shared_info: Arc>, + vm_as: GuestAddressSpaceImpl, + pub(crate) vm_fd: Arc, + + action_sycn_tx: Option>, + vcpus_in_action: (VcpuAction, Vec), + pub(crate) reset_event_fd: Option, + + #[cfg(all(feature = "hotplug", feature = "dbs-upcall"))] + upcall_channel: Option>>, + + // X86 specific fields. + #[cfg(target_arch = "x86_64")] + pub(crate) supported_cpuid: kvm_bindings::CpuId, +} + +#[allow(clippy::too_many_arguments)] +impl VcpuManager { + /// Get a new VcpuManager instance + pub fn new( + vm_fd: Arc, + kvm_context: &KvmContext, + vm_config_info: &VmConfigInfo, + vm_as: GuestAddressSpaceImpl, + vcpu_seccomp_filter: BpfProgram, + shared_info: Arc>, + io_manager: IoManagerCached, + epoll_manager: EpollManager, + ) -> Result>> { + let support_immediate_exit = kvm_context.kvm().check_extension(Cap::ImmediateExit); + let max_vcpu_count = vm_config_info.max_vcpu_count; + let kvm_max_vcpu_count = kvm_context.get_max_vcpus(); + + // check the max vcpu count in kvm. max_vcpu_count is u8 and kvm_context.get_max_vcpus() + // returns usize, so convert max_vcpu_count to usize instead of converting kvm max vcpu to + // u8, to avoid wraping usize. Otherwise if kvm_max_vcpu_count is greater than 255, it'll + // be casted into a smaller number. 
+ if max_vcpu_count as usize > kvm_max_vcpu_count { + error!( + "vcpu_manager: specified vcpu count {} is greater than max allowed count {} by kvm", + max_vcpu_count, kvm_max_vcpu_count + ); + return Err(VcpuManagerError::MaxVcpuLimitation( + max_vcpu_count, + kvm_max_vcpu_count, + )); + } + + let mut vcpu_infos = Vec::with_capacity(max_vcpu_count.into()); + vcpu_infos.resize_with(max_vcpu_count.into(), Default::default); + + let (tx, rx) = channel(); + let vcpu_state_event = + EventFd::new(libc::EFD_NONBLOCK).map_err(VcpuManagerError::VcpuIO)?; + let vcpu_state_event2 = vcpu_state_event + .try_clone() + .map_err(VcpuManagerError::VcpuIO)?; + + #[cfg(target_arch = "x86_64")] + let supported_cpuid = kvm_context + .supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES) + .map_err(VcpuManagerError::Kvm)?; + #[cfg(target_arch = "x86_64")] + let vpmu_feature_level = match vm_config_info.vpmu_feature { + 1 => dbs_arch::cpuid::VpmuFeatureLevel::LimitedlyEnabled, + 2 => dbs_arch::cpuid::VpmuFeatureLevel::FullyEnabled, + _ => dbs_arch::cpuid::VpmuFeatureLevel::Disabled, + }; + + let vcpu_manager = Arc::new(Mutex::new(VcpuManager { + vcpu_infos, + vcpu_config: VcpuConfig { + boot_vcpu_count: vm_config_info.vcpu_count, + max_vcpu_count, + threads_per_core: vm_config_info.cpu_topology.threads_per_core, + cores_per_die: vm_config_info.cpu_topology.cores_per_die, + dies_per_socket: vm_config_info.cpu_topology.dies_per_socket, + sockets: vm_config_info.cpu_topology.sockets, + #[cfg(target_arch = "x86_64")] + vpmu_feature: vpmu_feature_level, + }, + vcpu_seccomp_filter, + vcpu_state_event, + vcpu_state_sender: tx, + support_immediate_exit, + io_manager: Some(io_manager), + shared_info, + vm_as, + vm_fd, + action_sycn_tx: None, + vcpus_in_action: (VcpuAction::None, Vec::new()), + reset_event_fd: None, + #[cfg(all(feature = "hotplug", feature = "dbs-upcall"))] + upcall_channel: None, + #[cfg(target_arch = "x86_64")] + supported_cpuid, + })); + + let handler = Box::new(VcpuEpollHandler 
{ + vcpu_manager: vcpu_manager.clone(), + eventfd: vcpu_state_event2, + rx, + }); + epoll_manager.add_subscriber(handler); + + Ok(vcpu_manager) + } + + /// get vcpu instances in vcpu manager + pub fn vcpus(&self) -> Vec<&Vcpu> { + let mut vcpus = Vec::new(); + for vcpu_info in &self.vcpu_infos { + if let Some(vcpu) = &vcpu_info.vcpu { + vcpus.push(vcpu); + } + } + vcpus + } + + /// get vcpu instances in vcpu manager as mut + pub fn vcpus_mut(&mut self) -> Vec<&mut Vcpu> { + let mut vcpus = Vec::new(); + for vcpu_info in &mut self.vcpu_infos { + if let Some(vcpu) = &mut vcpu_info.vcpu { + vcpus.push(vcpu); + } + } + vcpus + } + + /// add reset event fd for each vcpu, if the reset_event_fd is already set, error will be returned. + pub fn set_reset_event_fd(&mut self, reset_event_fd: EventFd) -> Result<()> { + if self.reset_event_fd.is_some() { + return Err(VcpuManagerError::EventAlreadyExist); + } + self.reset_event_fd = Some(reset_event_fd); + Ok(()) + } + + /// create default num of vcpus for bootup + pub fn create_boot_vcpus( + &mut self, + request_ts: TimestampUs, + entry_addr: GuestAddress, + ) -> Result<()> { + info!("create boot vcpus"); + self.create_vcpus( + self.vcpu_config.boot_vcpu_count, + Some(request_ts), + Some(entry_addr), + )?; + + Ok(()) + } + + /// start the boot vcpus + pub fn start_boot_vcpus(&mut self, vmm_seccomp_filter: BpfProgram) -> Result<()> { + info!("start boot vcpus"); + self.start_vcpus(self.vcpu_config.boot_vcpu_count, vmm_seccomp_filter, true)?; + + Ok(()) + } + + /// create a specified num of vcpu + /// note: we can't create vcpus again until the previously created vcpus are + /// started + pub fn create_vcpus( + &mut self, + vcpu_count: u8, + request_ts: Option, + entry_addr: Option, + ) -> Result> { + info!("create vcpus"); + if vcpu_count > self.vcpu_config.max_vcpu_count { + return Err(VcpuManagerError::ExpectedVcpuExceedMax); + } + + let request_ts = request_ts.unwrap_or_default(); + let mut created_cpus = Vec::new(); + for 
cpu_id in self.calculate_available_vcpus(vcpu_count) { + self.create_vcpu(cpu_id, request_ts.clone(), entry_addr)?; + created_cpus.push(cpu_id); + } + + Ok(created_cpus) + } + + /// start a specified num of vcpu + pub fn start_vcpus( + &mut self, + vcpu_count: u8, + vmm_seccomp_filter: BpfProgram, + need_resume: bool, + ) -> Result<()> { + info!("start vcpus"); + Vcpu::register_kick_signal_handler(); + self.activate_vcpus(vcpu_count, need_resume)?; + + // Load seccomp filters for the VMM thread. + // Execution panics if filters cannot be loaded, use --seccomp-level=0 if skipping filters + // altogether is the desired behaviour. + if let Err(e) = apply_filter(&vmm_seccomp_filter) { + if !matches!(e, SecError::EmptyFilter) { + return Err(VcpuManagerError::SeccompFilters(e)); + } + } + + Ok(()) + } + + /// pause all vcpus + pub fn pause_all_vcpus(&mut self) -> Result<()> { + self.pause_vcpus(&self.present_vcpus()) + } + + /// resume all vcpus + pub fn resume_all_vcpus(&mut self) -> Result<()> { + self.resume_vcpus(&self.present_vcpus()) + } + + /// exit all vcpus, and never restart again + pub fn exit_all_vcpus(&mut self) -> Result<()> { + self.exit_vcpus(&self.present_vcpus())?; + // clear all vcpu infos + self.vcpu_infos.clear(); + // release io manager's reference manually + self.io_manager.take(); + + Ok(()) + } + + /// revalidate IoManager cache of all vcpus + pub fn revalidate_all_vcpus_cache(&mut self) -> Result<()> { + self.revalidate_vcpus_cache(&self.present_vcpus()) + } + + /// return all present vcpus + pub fn present_vcpus(&self) -> Vec { + self.vcpu_infos + .iter() + .enumerate() + .filter(|(_i, info)| info.handle.is_some()) + .map(|(i, _info)| i as u8) + .collect() + } + + /// Get available vcpus to create with target vcpu_count + /// Argument: + /// * vcpu_count: target vcpu_count online in VcpuManager. + /// Return: + /// * return available vcpu ids to create vcpu . 
+ fn calculate_available_vcpus(&self, vcpu_count: u8) -> Vec { + let present_vcpus_count = self.present_vcpus_count(); + let mut available_vcpus = Vec::new(); + + if present_vcpus_count < vcpu_count { + let mut size = vcpu_count - present_vcpus_count; + for cpu_id in 0..self.vcpu_config.max_vcpu_count { + let info = &self.vcpu_infos[cpu_id as usize]; + if info.handle.is_none() { + available_vcpus.push(cpu_id); + size -= 1; + if size == 0 { + break; + } + } + } + } + + available_vcpus + } + + /// Present vcpus count + fn present_vcpus_count(&self) -> u8 { + self.vcpu_infos + .iter() + .fold(0, |sum, info| sum + info.handle.is_some() as u8) + } + + /// Configure single vcpu + fn configure_single_vcpu( + &mut self, + entry_addr: Option, + vcpu: &mut Vcpu, + ) -> std::result::Result<(), VcpuError> { + vcpu.configure( + &self.vcpu_config, + &self.vm_fd, + &self.vm_as, + entry_addr, + None, + ) + } + + fn create_vcpu( + &mut self, + cpu_index: u8, + request_ts: TimestampUs, + entry_addr: Option, + ) -> Result<()> { + info!("creating vcpu {}", cpu_index); + if self.vcpu_infos.get(cpu_index as usize).is_none() { + return Err(VcpuManagerError::VcpuNotFound(cpu_index)); + } + // We will reuse the kvm's vcpufd after first creation, for we can't + // create vcpufd with same id in one kvm instance. 
+ let kvm_vcpu = match &self.vcpu_infos[cpu_index as usize].vcpu_fd { + Some(vcpu_fd) => vcpu_fd.clone(), + None => { + let vcpu_fd = Arc::new( + self.vm_fd + .create_vcpu(cpu_index as u64) + .map_err(VcpuError::VcpuFd) + .map_err(VcpuManagerError::Vcpu)?, + ); + self.vcpu_infos[cpu_index as usize].vcpu_fd = Some(vcpu_fd.clone()); + vcpu_fd + } + }; + + let mut vcpu = self.create_vcpu_arch(cpu_index, kvm_vcpu, request_ts)?; + self.configure_single_vcpu(entry_addr, &mut vcpu) + .map_err(VcpuManagerError::Vcpu)?; + self.vcpu_infos[cpu_index as usize].vcpu = Some(vcpu); + + Ok(()) + } + + fn start_vcpu(&mut self, cpu_index: u8, barrier: Arc) -> Result<()> { + info!("starting vcpu {}", cpu_index); + if self.vcpu_infos.get(cpu_index as usize).is_none() { + return Err(VcpuManagerError::VcpuNotFound(cpu_index)); + } + if let Some(vcpu) = self.vcpu_infos[cpu_index as usize].vcpu.take() { + let handle = vcpu + .start_threaded(self.vcpu_seccomp_filter.clone(), barrier) + .map_err(VcpuManagerError::Vcpu)?; + self.vcpu_infos[cpu_index as usize].handle = Some(handle); + Ok(()) + } else { + Err(VcpuManagerError::VcpuNotCreate) + } + } + + fn get_vcpus_tid(&mut self, cpu_indexes: &[u8]) -> Result<()> { + for cpu_id in cpu_indexes { + if self.vcpu_infos.get(*cpu_id as usize).is_none() { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + if let Some(handle) = &self.vcpu_infos[*cpu_id as usize].handle { + handle + .send_event(VcpuEvent::Gettid) + .map_err(VcpuManagerError::VcpuEvent)?; + } else { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + } + + for cpu_id in cpu_indexes { + if let Some(handle) = &self.vcpu_infos[*cpu_id as usize].handle { + match handle + .response_receiver() + .recv_timeout(Duration::from_millis(CPU_RECV_TIMEOUT_MS)) + { + Ok(VcpuResponse::Tid(_, id)) => self.vcpu_infos[*cpu_id as usize].tid = id, + Err(e) => { + error!("vCPU get tid error! 
{:?}", e); + return Err(VcpuManagerError::VcpuGettid); + } + _ => { + error!("vCPU get tid error!"); + return Err(VcpuManagerError::VcpuGettid); + } + } + } else { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + } + + // Save all vCPU thread ID to self.shared_info + let tids: Vec<(u8, u32)> = cpu_indexes + .iter() + .map(|cpu_id| (*cpu_id, self.vcpu_infos[*cpu_id as usize].tid)) + .collect(); + + // Append the new started vcpu thread IDs into self.shared_info + self.shared_info + .write() + .unwrap() + .tids + .extend_from_slice(&tids[..]); + + Ok(()) + } + + fn revalidate_vcpus_cache(&mut self, cpu_indexes: &[u8]) -> Result<()> { + for cpu_id in cpu_indexes { + if self.vcpu_infos.get(*cpu_id as usize).is_none() { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + if let Some(handle) = &self.vcpu_infos[*cpu_id as usize].handle { + handle + .send_event(VcpuEvent::RevalidateCache) + .map_err(VcpuManagerError::VcpuEvent)?; + } else { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + } + + Ok(()) + } + + fn pause_vcpus(&mut self, cpu_indexes: &[u8]) -> Result<()> { + for cpu_id in cpu_indexes { + if self.vcpu_infos.get(*cpu_id as usize).is_none() { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + if let Some(handle) = &self.vcpu_infos[*cpu_id as usize].handle { + handle + .send_event(VcpuEvent::Pause) + .map_err(VcpuManagerError::VcpuEvent)?; + } else { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + } + + Ok(()) + } + + fn resume_vcpus(&mut self, cpu_indexes: &[u8]) -> Result<()> { + for cpu_id in cpu_indexes { + if self.vcpu_infos.get(*cpu_id as usize).is_none() { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + if let Some(handle) = &self.vcpu_infos[*cpu_id as usize].handle { + handle + .send_event(VcpuEvent::Resume) + .map_err(VcpuManagerError::VcpuEvent)?; + } else { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + } + + Ok(()) + } + + // exit vcpus and notify the vmm exit event + 
fn exit_vcpus(&mut self, cpu_indexes: &[u8]) -> Result<()> { + info!("exiting vcpus {:?}", cpu_indexes); + for cpu_id in cpu_indexes { + if self.vcpu_infos.get(*cpu_id as usize).is_none() { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + if let Some(handle) = &self.vcpu_infos[*cpu_id as usize].handle { + handle + .send_event(VcpuEvent::Exit) + .map_err(VcpuManagerError::VcpuEvent)?; + } else { + return Err(VcpuManagerError::VcpuNotFound(*cpu_id)); + } + } + + for cpu_id in cpu_indexes { + let handle = self.vcpu_infos[*cpu_id as usize].handle.take().unwrap(); + handle + .join_vcpu_thread() + .map_err(|e| error!("vcpu exit error! {:?}", e)) + .ok(); + } + + let tids: &mut Vec<(u8, u32)> = &mut self + .shared_info + .write() + .expect( + "Failed to stop vcpus because shared info couldn't be written due to poisoned lock", + ) + .tids; + + // Here's a trick: since we always stop the vcpus started latest, + // thus it's ok here to remove the stopped vcpus from end to head. + tids.truncate(tids.len() - cpu_indexes.len()); + + Ok(()) + } + + fn stop_vcpus_in_action(&mut self) -> Result<()> { + let vcpus_in_action = self.vcpus_in_action.1.clone(); + self.exit_vcpus(&vcpus_in_action) + } + + fn activate_vcpus(&mut self, vcpu_count: u8, need_resume: bool) -> Result> { + let present_vcpus_count = self.present_vcpus_count(); + if vcpu_count > self.vcpu_config.max_vcpu_count { + return Err(VcpuManagerError::ExpectedVcpuExceedMax); + } else if vcpu_count < present_vcpus_count { + return Ok(Vec::new()); + } + + let available_vcpus = self.calculate_available_vcpus(vcpu_count); + let barrier = Arc::new(Barrier::new(available_vcpus.len() + 1_usize)); + for cpu_id in available_vcpus.iter() { + self.start_vcpu(*cpu_id, barrier.clone())?; + } + barrier.wait(); + + self.get_vcpus_tid(&available_vcpus)?; + if need_resume { + self.resume_vcpus(&available_vcpus)?; + } + + Ok(available_vcpus) + } + + fn sync_action_finish(&mut self, got_error: bool) { + if let Some(tx) = 
self.action_sycn_tx.take() { + if let Err(e) = tx.send(got_error) { + debug!("cpu sync action send to closed channel {}", e); + } + } + } + + fn set_vcpus_action(&mut self, action: VcpuAction, vcpus: Vec) { + self.vcpus_in_action = (action, vcpus); + } + + fn get_vcpus_action(&self) -> VcpuAction { + self.vcpus_in_action.0 + } +} + +#[cfg(target_arch = "x86_64")] +impl VcpuManager { + fn create_vcpu_arch( + &self, + cpu_index: u8, + vcpu_fd: Arc, + request_ts: TimestampUs, + ) -> Result { + // It's safe to unwrap because guest_kernel always exist until vcpu manager done + Vcpu::new_x86_64( + cpu_index, + vcpu_fd, + // safe to unwrap + self.io_manager.as_ref().unwrap().clone(), + self.supported_cpuid.clone(), + self.reset_event_fd.as_ref().unwrap().try_clone().unwrap(), + self.vcpu_state_event.try_clone().unwrap(), + self.vcpu_state_sender.clone(), + request_ts, + self.support_immediate_exit, + ) + .map_err(VcpuManagerError::Vcpu) + } +} + +#[cfg(target_arch = "aarch64")] +impl VcpuManager { + // On aarch64, the vCPUs need to be created (i.e call KVM_CREATE_VCPU) and configured before + // setting up the IRQ chip because the `KVM_CREATE_VCPU` ioctl will return error if the IRQCHIP + // was already initialized. + // Search for `kvm_arch_vcpu_create` in arch/arm/kvm/arm.c. 
+ fn create_vcpu_arch( + &self, + cpu_index: u8, + vcpu_fd: Arc, + request_ts: TimestampUs, + ) -> Result { + Vcpu::new_aarch64( + cpu_index, + vcpu_fd, + // safe to unwrap + self.io_manager.as_ref().unwrap().clone(), + self.reset_event_fd.as_ref().unwrap().try_clone().unwrap(), + self.vcpu_state_event.try_clone().unwrap(), + self.vcpu_state_sender.clone(), + request_ts.clone(), + self.support_immediate_exit, + ) + .map_err(VcpuManagerError::Vcpu) + } +} + +#[cfg(feature = "hotplug")] +mod hotplug { + #[cfg(feature = "dbs-upcall")] + use super::*; + #[cfg(feature = "dbs-upcall")] + use dbs_upcall::{CpuDevRequest, DevMgrRequest}; + #[cfg(feature = "dbs-upcall")] + use std::cmp::Ordering; + + #[cfg(all(target_arch = "x86_64", feature = "dbs-upcall"))] + use dbs_boot::mptable::APIC_VERSION; + #[cfg(all(target_arch = "aarch64"))] + const APIC_VERSION: u8 = 0; + + #[cfg(feature = "dbs-upcall")] + impl VcpuManager { + /// add upcall channel for vcpu manager + pub fn set_upcall_channel( + &mut self, + upcall_channel: Option>>, + ) { + self.upcall_channel = upcall_channel; + } + + /// resize the count of vcpu in runtime + pub fn resize_vcpu( + &mut self, + vcpu_count: u8, + sync_tx: Option>, + ) -> std::result::Result<(), VcpuManagerError> { + if self.get_vcpus_action() != VcpuAction::None { + return Err(VcpuManagerError::VcpuResize( + VcpuResizeError::VcpuIsHotplugging, + )); + } + self.action_sycn_tx = sync_tx; + + if let Some(upcall) = self.upcall_channel.clone() { + let now_vcpu = self.present_vcpus_count(); + info!("resize vcpu: now: {}, desire: {}", now_vcpu, vcpu_count); + match vcpu_count.cmp(&now_vcpu) { + Ordering::Equal => { + info!("resize vcpu: no need to resize"); + self.sync_action_finish(false); + Ok(()) + } + Ordering::Greater => self.do_add_vcpu(vcpu_count, upcall), + Ordering::Less => self.do_del_vcpu(vcpu_count, upcall), + } + } else { + Err(VcpuManagerError::VcpuResize( + VcpuResizeError::UpdateNotAllowedPostBoot, + )) + } + } + + fn do_add_vcpu( + 
&mut self, + vcpu_count: u8, + upcall_client: Arc>, + ) -> std::result::Result<(), VcpuManagerError> { + info!("resize vcpu: add"); + if vcpu_count > self.vcpu_config.max_vcpu_count { + return Err(VcpuManagerError::VcpuResize( + VcpuResizeError::ExpectedVcpuExceedMax, + )); + } + + let created_vcpus = self.create_vcpus(vcpu_count, None, None)?; + let cpu_ids = self.activate_vcpus(vcpu_count, true).map_err(|e| { + // we need to rollback when activate vcpu error + error!("activate vcpu error, rollback! {:?}", e); + let activated_vcpus: Vec = created_vcpus + .iter() + .filter(|&cpu_id| self.vcpu_infos[*cpu_id as usize].handle.is_some()) + .copied() + .collect(); + if let Err(e) = self.exit_vcpus(&activated_vcpus) { + error!("try to rollback error, stop_vcpu: {:?}", e); + } + e + })?; + + let mut cpu_ids_array = [0u8; (u8::MAX as usize) + 1]; + cpu_ids_array[..cpu_ids.len()].copy_from_slice(&cpu_ids[..cpu_ids.len()]); + let req = DevMgrRequest::AddVcpu(CpuDevRequest { + count: cpu_ids.len() as u8, + apic_ids: cpu_ids_array, + apic_ver: APIC_VERSION, + }); + self.send_upcall_action(upcall_client, req)?; + + self.set_vcpus_action(VcpuAction::Hotplug, cpu_ids); + + Ok(()) + } + + fn do_del_vcpu( + &mut self, + vcpu_count: u8, + upcall_client: Arc>, + ) -> std::result::Result<(), VcpuManagerError> { + info!("resize vcpu: delete"); + if vcpu_count == 0 { + return Err(VcpuManagerError::VcpuResize( + VcpuResizeError::Vcpu0CanNotBeRemoved, + )); + } + + let mut cpu_ids = self.calculate_removable_vcpus(); + let cpu_num_to_be_del = (self.present_vcpus_count() - vcpu_count) as usize; + if cpu_num_to_be_del >= cpu_ids.len() { + return Err(VcpuManagerError::VcpuResize( + VcpuResizeError::LackRemovableVcpus( + cpu_ids.len() as u16, + cpu_num_to_be_del as u16, + self.present_vcpus_count() as u16, + ), + )); + } + + cpu_ids.reverse(); + cpu_ids.truncate(cpu_num_to_be_del); + + let mut cpu_ids_array = [0u8; 256]; + 
cpu_ids_array[..cpu_ids.len()].copy_from_slice(&cpu_ids[..cpu_ids.len()]); + let req = DevMgrRequest::DelVcpu(CpuDevRequest { + count: cpu_num_to_be_del as u8, + apic_ids: cpu_ids_array, + apic_ver: APIC_VERSION, + }); + self.send_upcall_action(upcall_client, req)?; + + self.set_vcpus_action(VcpuAction::Hotunplug, cpu_ids); + + Ok(()) + } + + #[cfg(test)] + fn send_upcall_action( + &self, + _upcall_client: Arc>, + _request: DevMgrRequest, + ) -> std::result::Result<(), VcpuManagerError> { + Ok(()) + } + + #[cfg(not(test))] + fn send_upcall_action( + &self, + upcall_client: Arc>, + request: DevMgrRequest, + ) -> std::result::Result<(), VcpuManagerError> { + // This is used to fix clippy warnings. + use dbs_upcall::{DevMgrResponse, UpcallClientRequest, UpcallClientResponse}; + + let vcpu_state_event = self.vcpu_state_event.try_clone().unwrap(); + let vcpu_state_sender = self.vcpu_state_sender.clone(); + + upcall_client + .send_request( + UpcallClientRequest::DevMgr(request), + Box::new(move |result| match result { + UpcallClientResponse::DevMgr(response) => { + if let DevMgrResponse::CpuDev(resp) = response { + vcpu_state_sender + .send(VcpuStateEvent::Hotplug(( + resp.result, + resp.info.apic_id_index, + ))) + .unwrap(); + vcpu_state_event.write(1).unwrap(); + } + } + UpcallClientResponse::UpcallReset => { + vcpu_state_sender + .send(VcpuStateEvent::Hotplug((0, 0))) + .unwrap(); + vcpu_state_event.write(1).unwrap(); + } + #[cfg(test)] + UpcallClientResponse::FakeResponse => { + panic!("shouldn't happen"); + } + }), + ) + .map_err(VcpuResizeError::Upcall) + .map_err(VcpuManagerError::VcpuResize) + } + + /// Get removable vcpus. + /// Return: + /// * return removable vcpu_id with cascade order. 
+ fn calculate_removable_vcpus(&self) -> Vec { + self.present_vcpus() + } + } +} + +struct VcpuEpollHandler { + vcpu_manager: Arc>, + eventfd: EventFd, + rx: Receiver, +} + +impl VcpuEpollHandler { + fn process_cpu_state_event(&mut self, _ops: &mut EventOps) { + // It's level triggered, so it's safe to ignore the result. + let _ = self.eventfd.read(); + while let Ok(event) = self.rx.try_recv() { + match event { + VcpuStateEvent::Hotplug((success, cpu_count)) => { + info!("get vcpu event, cpu_index {}", cpu_count); + self.process_cpu_action(success != 0, cpu_count); + } + } + } + } + + fn process_cpu_action(&self, success: bool, _cpu_index: u32) { + let mut vcpu_manager = self.vcpu_manager.lock().unwrap(); + if success { + match vcpu_manager.get_vcpus_action() { + VcpuAction::Hotplug => { + // Notify hotplug success + vcpu_manager.sync_action_finish(false); + } + VcpuAction::Hotunplug => { + if let Err(e) = vcpu_manager.stop_vcpus_in_action() { + error!("stop vcpus in action error: {:?}", e); + } + // notify hotunplug success + vcpu_manager.sync_action_finish(false); + } + VcpuAction::None => { + error!("cannot be here"); + } + }; + vcpu_manager.set_vcpus_action(VcpuAction::None, Vec::new()); + + vcpu_manager.sync_action_finish(true); + // TODO(sicun): rollback + } + } +} + +impl MutEventSubscriber for VcpuEpollHandler { + fn process(&mut self, events: Events, ops: &mut EventOps) { + let vcpu_state_eventfd = self.eventfd.as_raw_fd(); + + match events.fd() { + fd if fd == vcpu_state_eventfd => self.process_cpu_state_event(ops), + _ => error!("vcpu manager epoll handler: unknown event"), + } + } + + fn init(&mut self, ops: &mut EventOps) { + ops.add(Events::new(&self.eventfd, EventSet::IN)).unwrap(); + } +} diff --git a/src/dragonball/src/vcpu/x86_64.rs b/src/dragonball/src/vcpu/x86_64.rs new file mode 100644 index 0000000000..738d574bba --- /dev/null +++ b/src/dragonball/src/vcpu/x86_64.rs @@ -0,0 +1,149 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. 
+// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +use std::sync::mpsc::{channel, Sender}; +use std::sync::Arc; + +use dbs_arch::cpuid::{process_cpuid, VmSpec}; +use dbs_arch::gdt::gdt_entry; +use dbs_utils::time::TimestampUs; +use kvm_bindings::CpuId; +use kvm_ioctls::{VcpuFd, VmFd}; +use log::error; +use vm_memory::{Address, GuestAddress, GuestAddressSpace}; +use vmm_sys_util::eventfd::EventFd; + +use crate::address_space_manager::GuestAddressSpaceImpl; +use crate::metric::{IncMetric, METRICS}; +use crate::vcpu::vcpu_impl::{Result, Vcpu, VcpuError, VcpuStateEvent}; +use crate::vcpu::VcpuConfig; +use crate::IoManagerCached; + +impl Vcpu { + /// Constructs a new VCPU for `vm`. + /// + /// # Arguments + /// + /// * `id` - Represents the CPU number between [0, max vcpus). + /// * `vcpu_fd` - The kvm `VcpuFd` for the vcpu. + /// * `io_mgr` - The io-manager used to access port-io and mmio devices. + /// * `cpuid` - The `CpuId` listing the supported capabilities of this vcpu. + /// * `exit_evt` - An `EventFd` that will be written into when this vcpu + /// exits. + /// * `vcpu_state_event` - The eventfd which can notify vmm state of some + /// vcpu should change. + /// * `vcpu_state_sender` - The channel to send state change message from + /// vcpu thread to vmm thread. + /// * `create_ts` - A timestamp used by the vcpu to calculate its lifetime. + /// * `support_immediate_exit` - whether kvm used supports immediate_exit flag. 
+ #[allow(clippy::too_many_arguments)] + pub fn new_x86_64( + id: u8, + vcpu_fd: Arc, + io_mgr: IoManagerCached, + cpuid: CpuId, + exit_evt: EventFd, + vcpu_state_event: EventFd, + vcpu_state_sender: Sender, + create_ts: TimestampUs, + support_immediate_exit: bool, + ) -> Result { + let (event_sender, event_receiver) = channel(); + let (response_sender, response_receiver) = channel(); + // Initially the cpuid per vCPU is the one supported by this VM. + Ok(Vcpu { + fd: vcpu_fd, + id, + io_mgr, + create_ts, + event_receiver, + event_sender: Some(event_sender), + response_receiver: Some(response_receiver), + response_sender, + vcpu_state_event, + vcpu_state_sender, + exit_evt, + support_immediate_exit, + cpuid, + }) + } + + /// Configures a x86_64 specific vcpu and should be called once per vcpu. + /// + /// # Arguments + /// + /// * `vm_config` - The machine configuration of this microvm needed for the CPUID configuration. + /// * `vm_fd` - The kvm `VmFd` for the virtual machine this vcpu will get attached to. + /// * `vm_memory` - The guest memory used by this microvm. + /// * `kernel_start_addr` - Offset from `guest_mem` at which the kernel starts. 
+ /// * `pgtable_addr` - pgtable address for ap vcpu + pub fn configure( + &mut self, + vcpu_config: &VcpuConfig, + _vm_fd: &VmFd, + vm_as: &GuestAddressSpaceImpl, + kernel_start_addr: Option, + _pgtable_addr: Option, + ) -> Result<()> { + self.set_cpuid(vcpu_config)?; + + dbs_arch::regs::setup_msrs(&self.fd).map_err(VcpuError::MSRSConfiguration)?; + if let Some(start_addr) = kernel_start_addr { + dbs_arch::regs::setup_regs( + &self.fd, + start_addr.raw_value() as u64, + dbs_boot::layout::BOOT_STACK_POINTER, + dbs_boot::layout::BOOT_STACK_POINTER, + dbs_boot::layout::ZERO_PAGE_START, + ) + .map_err(VcpuError::REGSConfiguration)?; + dbs_arch::regs::setup_fpu(&self.fd).map_err(VcpuError::FPUConfiguration)?; + let gdt_table: [u64; dbs_boot::layout::BOOT_GDT_MAX as usize] = [ + gdt_entry(0, 0, 0), // NULL + gdt_entry(0xa09b, 0, 0xfffff), // CODE + gdt_entry(0xc093, 0, 0xfffff), // DATA + gdt_entry(0x808b, 0, 0xfffff), // TSS + ]; + let pgtable_addr = + dbs_boot::setup_identity_mapping(&*vm_as.memory()).map_err(VcpuError::PageTable)?; + dbs_arch::regs::setup_sregs( + &*vm_as.memory(), + &self.fd, + pgtable_addr, + &gdt_table, + dbs_boot::layout::BOOT_GDT_OFFSET, + dbs_boot::layout::BOOT_IDT_OFFSET, + ) + .map_err(VcpuError::SREGSConfiguration)?; + } + dbs_arch::interrupts::set_lint(&self.fd).map_err(VcpuError::LocalIntConfiguration)?; + + Ok(()) + } + + fn set_cpuid(&mut self, vcpu_config: &VcpuConfig) -> Result<()> { + let cpuid_vm_spec = VmSpec::new( + self.id, + vcpu_config.max_vcpu_count as u8, + vcpu_config.threads_per_core, + vcpu_config.cores_per_die, + vcpu_config.dies_per_socket, + vcpu_config.vpmu_feature, + ) + .map_err(VcpuError::CpuId)?; + process_cpuid(&mut self.cpuid, &cpuid_vm_spec).map_err(|e| { + METRICS.vcpu.filter_cpuid.inc(); + error!("Failure in configuring CPUID for vcpu {}: {:?}", self.id, e); + VcpuError::CpuId(e) + })?; + + self.fd + .set_cpuid2(&self.cpuid) + .map_err(VcpuError::SetSupportedCpusFailed) + } +} diff --git 
a/src/dragonball/src/vm/aarch64.rs b/src/dragonball/src/vm/aarch64.rs new file mode 100644 index 0000000000..7e249f5016 --- /dev/null +++ b/src/dragonball/src/vm/aarch64.rs @@ -0,0 +1,159 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +use std::collections::HashMap; +use std::fmt::Debug; +use std::ops::Deref; + +use dbs_arch::gic::GICDevice; +use dbs_arch::{DeviceInfoForFDT, DeviceType}; +use dbs_boot::InitrdConfig; +use dbs_utils::epoll_manager::EpollManager; +use dbs_utils::time::TimestampUs; +use linux_loader::loader::Cmdline; +use vm_memory::{GuestAddressSpace, GuestMemory}; +use vmm_sys_util::eventfd::EventFd; + +use super::{Vm, VmError}; +use crate::address_space_manager::{GuestAddressSpaceImpl, GuestMemoryImpl}; +use crate::error::{Error, StartMicroVmError}; +use crate::event_manager::EventManager; + +/// Configures the system and should be called once per vm before starting vcpu threads. +/// For aarch64, we only setup the FDT. +/// +/// # Arguments +/// +/// * `guest_mem` - The memory to be used by the guest. +/// * `cmdline` - The kernel commandline. +/// * `vcpu_mpidr` - Array of MPIDR register values per vcpu. +/// * `device_info` - A hashmap containing the attached devices for building FDT device nodes. +/// * `gic_device` - The GIC device. +/// * `initrd` - Information about an optional initrd. 
// NOTE(review): the generic parameter list was eaten by extraction; bounds
// reconstructed from the imports (DeviceInfoForFDT, Debug, GuestMemory) and
// the dbs_boot::fdt::create_fdt signature — confirm against upstream.
fn configure_system<T: DeviceInfoForFDT + Clone + Debug, M: GuestMemory>(
    guest_mem: &M,
    cmdline: &str,
    vcpu_mpidr: Vec<u64>,
    device_info: Option<&HashMap<(DeviceType, String), T>>,
    gic_device: &Box<dyn GICDevice>,
    initrd: &Option<InitrdConfig>,
) -> super::Result<()> {
    // On aarch64 all boot configuration is carried in the FDT; build it and
    // write it into guest memory in one step.
    dbs_boot::fdt::create_fdt(
        guest_mem,
        vcpu_mpidr,
        cmdline,
        device_info,
        gic_device,
        initrd,
    )
    .map_err(Error::BootSystem)?;
    Ok(())
}

#[cfg(target_arch = "aarch64")]
impl Vm {
    /// Gets a reference to the irqchip of the VM.
    ///
    /// Panics if called before `setup_interrupt_controller()` has populated
    /// the handle.
    pub fn get_irqchip(&self) -> &Box<dyn GICDevice> {
        &self.irqchip_handle.as_ref().unwrap()
    }

    /// Creates the irq chip in-kernel device model (VGIC).
    pub fn setup_interrupt_controller(&mut self) -> std::result::Result<(), StartMicroVmError> {
        let vcpu_count = self.vm_config.vcpu_count;

        self.irqchip_handle = Some(
            dbs_arch::gic::create_gic(&self.vm_fd, vcpu_count.into())
                .map_err(|e| StartMicroVmError::ConfigureVm(VmError::SetupGIC(e)))?,
        );

        Ok(())
    }

    /// Initialize the virtual machine instance.
    ///
    /// It initialize the virtual machine instance by:
    /// 1) initialize virtual machine global state and configuration.
    /// 2) create system devices, such as interrupt controller.
    /// 3) create and start IO devices, such as serial, console, block, net, vsock etc.
    /// 4) create and initialize vCPUs.
    /// 5) configure CPU power management features.
    /// 6) load guest kernel image.
    pub fn init_microvm(
        &mut self,
        epoll_mgr: EpollManager,
        vm_as: GuestAddressSpaceImpl,
        request_ts: TimestampUs,
    ) -> Result<(), StartMicroVmError> {
        // Reset eventfd is cloned: one end kept on the Vm, the other handed
        // to the vcpu manager so vcpus can signal a guest-initiated reset.
        let reset_eventfd =
            EventFd::new(libc::EFD_NONBLOCK).map_err(|_| StartMicroVmError::EventFd)?;
        self.reset_eventfd = Some(
            reset_eventfd
                .try_clone()
                .map_err(|_| StartMicroVmError::EventFd)?,
        );
        self.vcpu_manager()
            .map_err(StartMicroVmError::Vcpu)?
            .set_reset_event_fd(reset_eventfd)
            .map_err(StartMicroVmError::Vcpu)?;

        // On aarch64, the vCPUs need to be created (i.e call KVM_CREATE_VCPU) and configured before
        // setting up the IRQ chip because the `KVM_CREATE_VCPU` ioctl will return error if the IRQCHIP
        // was already initialized.
        // Search for `kvm_arch_vcpu_create` in arch/arm/kvm/arm.c.
        let kernel_loader_result = self.load_kernel(vm_as.memory().deref())?;
        self.vcpu_manager()
            .map_err(StartMicroVmError::Vcpu)?
            .create_boot_vcpus(request_ts, kernel_loader_result.kernel_load)
            .map_err(StartMicroVmError::Vcpu)?;
        self.setup_interrupt_controller()?;
        self.init_devices(epoll_mgr)?;

        Ok(())
    }

    /// Execute system architecture specific configurations.
    ///
    /// 1) set guest kernel boot parameters
    /// 2) setup FDT data structs.
    pub fn configure_system_arch(
        &self,
        vm_memory: &GuestMemoryImpl,
        cmdline: &Cmdline,
        initrd: Option<InitrdConfig>,
    ) -> std::result::Result<(), StartMicroVmError> {
        let vcpu_manager = self.vcpu_manager().map_err(StartMicroVmError::Vcpu)?;
        // MPIDR values are needed per-vcpu to emit the FDT /cpus nodes.
        let vcpu_mpidr = vcpu_manager
            .vcpus()
            .into_iter()
            .map(|cpu| cpu.get_mpidr())
            .collect();
        let guest_memory = vm_memory.memory();

        configure_system(
            guest_memory,
            cmdline.as_str(),
            vcpu_mpidr,
            self.device_manager.get_mmio_device_info(),
            self.get_irqchip(),
            &initrd,
        )
        .map_err(StartMicroVmError::ConfigureSystem)
    }

    /// Hook the VM reset eventfd into the event manager so a guest reset
    /// request terminates the event loop.
    pub(crate) fn register_events(
        &mut self,
        event_mgr: &mut EventManager,
    ) -> std::result::Result<(), StartMicroVmError> {
        let reset_evt = self.get_reset_eventfd().ok_or(StartMicroVmError::EventFd)?;
        event_mgr
            .register_exit_eventfd(reset_evt)
            .map_err(|_| StartMicroVmError::RegisterEvent)?;

        Ok(())
    }
}
diff --git a/src/dragonball/src/vm/kernel_config.rs b/src/dragonball/src/vm/kernel_config.rs new file mode 100644 index 0000000000..4798d8da3d --- /dev/null +++ b/src/dragonball/src/vm/kernel_config.rs @@ -0,0 +1,72 @@
// Copyright (C) 2022 Alibaba Cloud.
// All rights reserved.
// SPDX-License-Identifier: Apache-2.0

use std::fs::File;

/// Structure to hold guest kernel configuration information.
pub struct KernelConfigInfo {
    /// The descriptor to the kernel file.
    kernel_file: File,
    /// The descriptor to the initrd file, if there is one
    // NOTE(review): payload type reconstructed as File from the accessor
    // return types below — confirm against upstream.
    initrd_file: Option<File>,
    /// The commandline for guest kernel.
    cmdline: linux_loader::cmdline::Cmdline,
}

impl KernelConfigInfo {
    /// Create a KernelConfigInfo instance.
    ///
    /// * `kernel_file` - open descriptor of the guest kernel image
    /// * `initrd_file` - open descriptor of the initrd image, if any
    /// * `cmdline` - guest kernel boot command line
    pub fn new(
        kernel_file: File,
        initrd_file: Option<File>,
        cmdline: linux_loader::cmdline::Cmdline,
    ) -> Self {
        KernelConfigInfo {
            kernel_file,
            initrd_file,
            cmdline,
        }
    }

    /// Get a mutable reference to the kernel file.
    pub fn kernel_file_mut(&mut self) -> &mut File {
        &mut self.kernel_file
    }

    /// Get an immutable reference to the initrd file.
    pub fn initrd_file(&self) -> Option<&File> {
        self.initrd_file.as_ref()
    }

    /// Get a mutable reference to the initrd file.
    pub fn initrd_file_mut(&mut self) -> Option<&mut File> {
        self.initrd_file.as_mut()
    }

    /// Get a shared reference to the guest kernel boot parameter object.
    pub fn kernel_cmdline(&self) -> &linux_loader::cmdline::Cmdline {
        &self.cmdline
    }

    /// Get a mutable reference to the guest kernel boot parameter object.
+ pub fn kernel_cmdline_mut(&mut self) -> &mut linux_loader::cmdline::Cmdline { + &mut self.cmdline + } +} + +#[cfg(test)] +mod tests { + use super::*; + use vmm_sys_util::tempfile::TempFile; + + #[test] + fn test_kernel_config_info() { + let kernel = TempFile::new().unwrap(); + let initrd = TempFile::new().unwrap(); + let mut cmdline = linux_loader::cmdline::Cmdline::new(1024); + cmdline.insert_str("ro").unwrap(); + let mut info = KernelConfigInfo::new(kernel.into_file(), Some(initrd.into_file()), cmdline); + + assert_eq!(info.cmdline.as_str(), "ro"); + assert!(info.initrd_file_mut().is_some()); + } +} diff --git a/src/dragonball/src/vm/mod.rs b/src/dragonball/src/vm/mod.rs new file mode 100644 index 0000000000..f5f62a0407 --- /dev/null +++ b/src/dragonball/src/vm/mod.rs @@ -0,0 +1,816 @@ +// Copyright (C) 2021 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::io::{self, Read, Seek, SeekFrom}; +use std::ops::Deref; +use std::os::unix::io::RawFd; +use std::sync::{Arc, Mutex, RwLock}; + +use dbs_address_space::AddressSpace; +#[cfg(target_arch = "aarch64")] +use dbs_arch::gic::GICDevice; +use dbs_boot::InitrdConfig; +use dbs_utils::epoll_manager::EpollManager; +use dbs_utils::time::TimestampUs; +use kvm_ioctls::VmFd; +use linux_loader::loader::{KernelLoader, KernelLoaderResult}; +use seccompiler::BpfProgram; +use serde_derive::{Deserialize, Serialize}; +use slog::{error, info}; +use vm_memory::{Bytes, GuestAddress, GuestAddressSpace}; +use vmm_sys_util::eventfd::EventFd; + +#[cfg(all(feature = "hotplug", feature = "dbs-upcall"))] +use dbs_upcall::{DevMgrService, UpcallClient}; + +use crate::address_space_manager::{ + AddressManagerError, AddressSpaceMgr, AddressSpaceMgrBuilder, GuestAddressSpaceImpl, + GuestMemoryImpl, +}; +use crate::api::v1::{InstanceInfo, InstanceState}; +use crate::device_manager::console_manager::DmesgWriter; +use crate::device_manager::{DeviceManager, DeviceMgrError, DeviceOpContext}; +use 
crate::error::{LoadInitrdError, Result, StartMicroVmError, StopMicrovmError}; +use crate::event_manager::EventManager; +use crate::kvm_context::KvmContext; +use crate::resource_manager::ResourceManager; +use crate::vcpu::{VcpuManager, VcpuManagerError}; +#[cfg(target_arch = "aarch64")] +use dbs_arch::gic::Error as GICError; + +mod kernel_config; +pub use self::kernel_config::KernelConfigInfo; + +#[cfg(target_arch = "aarch64")] +#[path = "aarch64.rs"] +mod aarch64; + +#[cfg(target_arch = "x86_64")] +#[path = "x86_64.rs"] +mod x86_64; + +/// Errors associated with virtual machine instance related operations. +#[derive(Debug, thiserror::Error)] +pub enum VmError { + /// Cannot configure the IRQ. + #[error("failed to configure IRQ fot the virtual machine: {0}")] + Irq(#[source] kvm_ioctls::Error), + + /// Cannot configure the microvm. + #[error("failed to initialize the virtual machine: {0}")] + VmSetup(#[source] kvm_ioctls::Error), + + /// Cannot setup GIC + #[cfg(target_arch = "aarch64")] + #[error("failed to configure GIC")] + SetupGIC(GICError), +} + +/// Configuration information for user defined NUMA nodes. 
+#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq)] +pub struct NumaRegionInfo { + /// memory size for this region (unit: MiB) + pub size: u64, + /// numa node id on host for this region + pub host_numa_node_id: Option, + /// numa node id on guest for this region + pub guest_numa_node_id: Option, + /// vcpu ids belonging to this region + pub vcpu_ids: Vec, +} + +/// Information for cpu topology to guide guest init +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct CpuTopology { + /// threads per core to indicate hyperthreading is enabled or not + pub threads_per_core: u8, + /// cores per die to guide guest cpu topology init + pub cores_per_die: u8, + /// dies per socket to guide guest cpu topology + pub dies_per_socket: u8, + /// number of sockets + pub sockets: u8, +} + +impl Default for CpuTopology { + fn default() -> Self { + CpuTopology { + threads_per_core: 1, + cores_per_die: 1, + dies_per_socket: 1, + sockets: 1, + } + } +} + +/// Configuration information for virtual machine instance. +#[derive(Clone, Debug, PartialEq)] +pub struct VmConfigInfo { + /// Number of vcpu to start. + pub vcpu_count: u8, + /// Max number of vcpu can be added + pub max_vcpu_count: u8, + /// cpu power management. + pub cpu_pm: String, + /// cpu topology information + pub cpu_topology: CpuTopology, + /// vpmu support level + pub vpmu_feature: u8, + + /// Memory type that can be either hugetlbfs or shmem, default is shmem + pub mem_type: String, + /// Memory file path + pub mem_file_path: String, + /// The memory size in MiB. 
+ pub mem_size_mib: usize, + + /// sock path + pub serial_path: Option, +} + +impl Default for VmConfigInfo { + fn default() -> Self { + VmConfigInfo { + vcpu_count: 1, + max_vcpu_count: 1, + cpu_pm: String::from("on"), + cpu_topology: CpuTopology { + threads_per_core: 1, + cores_per_die: 1, + dies_per_socket: 1, + sockets: 1, + }, + vpmu_feature: 0, + mem_type: String::from("shmem"), + mem_file_path: String::from(""), + mem_size_mib: 128, + serial_path: None, + } + } +} + +/// Struct to manage resources and control states of an virtual machine instance. +/// +/// An `Vm` instance holds a resources assigned to a virtual machine instance, such as CPU, memory, +/// devices etc. When an `Vm` instance gets deconstructed, all resources assigned should be +/// released. +/// +/// We have explicit build the object model as: +/// |---Vmm API Server--<-1:1-> HTTP API Server +/// | |----------<-1:1-> Shimv2/CRI API Server +/// | +/// Vmm <-1:N-> Vm <-1:1-> Address Space Manager <-1:N-> GuestMemory +/// ^ ^---1:1-> Device Manager <-1:N-> Device +/// | ^---1:1-> Resource Manager +/// | ^---1:N-> Vcpu +/// |---<-1:N-> Event Manager +pub struct Vm { + epoll_manager: EpollManager, + kvm: KvmContext, + shared_info: Arc>, + + address_space: AddressSpaceMgr, + device_manager: DeviceManager, + dmesg_fifo: Option>, + kernel_config: Option, + logger: slog::Logger, + reset_eventfd: Option, + resource_manager: Arc, + vcpu_manager: Option>>, + vm_config: VmConfigInfo, + vm_fd: Arc, + + start_instance_request_ts: u64, + start_instance_request_cpu_ts: u64, + start_instance_downtime: u64, + + // Arm specific fields. + // On aarch64 we need to keep around the fd obtained by creating the VGIC device. + #[cfg(target_arch = "aarch64")] + irqchip_handle: Option>, + + #[cfg(all(feature = "hotplug", feature = "dbs-upcall"))] + upcall_client: Option>>, +} + +impl Vm { + /// Constructs a new `Vm` instance using the given `Kvm` instance. 
    /// Create the `Vm` instance: open a KVM VM fd and wire up the resource
    /// and device managers; all other state starts empty/default.
    ///
    /// * `kvm_fd` - optional pre-opened `/dev/kvm` descriptor; `None` opens one.
    /// * `api_shared_info` - instance info shared with the API servers.
    /// * `epoll_manager` - event loop handle shared with device backends.
    // NOTE(review): parameter/return generics reconstructed
    // (Option<RawFd>, Arc<RwLock<InstanceInfo>>, Result<Self>) from the
    // surrounding imports and field types — confirm against upstream.
    pub fn new(
        kvm_fd: Option<RawFd>,
        api_shared_info: Arc<RwLock<InstanceInfo>>,
        epoll_manager: EpollManager,
    ) -> Result<Self> {
        let id = api_shared_info.read().unwrap().id.clone();
        // Every log line from this VM carries its instance id.
        let logger = slog_scope::logger().new(slog::o!("id" => id));
        let kvm = KvmContext::new(kvm_fd)?;
        let vm_fd = Arc::new(kvm.create_vm()?);
        let resource_manager = Arc::new(ResourceManager::new(Some(kvm.max_memslots())));
        let device_manager = DeviceManager::new(
            vm_fd.clone(),
            resource_manager.clone(),
            epoll_manager.clone(),
            &logger,
        );

        Ok(Vm {
            epoll_manager,
            kvm,
            shared_info: api_shared_info,

            address_space: AddressSpaceMgr::default(),
            device_manager,
            dmesg_fifo: None,
            kernel_config: None,
            logger,
            reset_eventfd: None,
            resource_manager,
            vcpu_manager: None,
            vm_config: Default::default(),
            vm_fd,

            start_instance_request_ts: 0,
            start_instance_request_cpu_ts: 0,
            start_instance_downtime: 0,

            #[cfg(target_arch = "aarch64")]
            irqchip_handle: None,
            #[cfg(all(feature = "hotplug", feature = "dbs-upcall"))]
            upcall_client: None,
        })
    }

    /// Gets a reference to the device manager by this VM.
    pub fn device_manager(&self) -> &DeviceManager {
        &self.device_manager
    }

    /// Gets a mutable reference to the device manager by this VM.
    pub fn device_manager_mut(&mut self) -> &mut DeviceManager {
        &mut self.device_manager
    }

    /// Get a reference to EpollManager.
    pub fn epoll_manager(&self) -> &EpollManager {
        &self.epoll_manager
    }

    /// Get eventfd for exit notification.
    pub fn get_reset_eventfd(&self) -> Option<&EventFd> {
        self.reset_eventfd.as_ref()
    }

    /// Set guest kernel boot configurations.
    pub fn set_kernel_config(&mut self, kernel_config: KernelConfigInfo) {
        self.kernel_config = Some(kernel_config);
    }

    /// Get virtual machine shared instance information.
    pub fn shared_info(&self) -> &Arc<RwLock<InstanceInfo>> {
        &self.shared_info
    }

    /// Gets a reference to the address_space.address_space for guest memory owned by this VM.
    pub fn vm_address_space(&self) -> Option<&AddressSpace> {
        self.address_space.get_address_space()
    }

    /// Gets a reference to the address space for guest memory owned by this VM.
    ///
    /// Note that `GuestMemory` does not include any device memory that may have been added after
    /// this VM was constructed.
    pub fn vm_as(&self) -> Option<&GuestAddressSpaceImpl> {
        self.address_space.get_vm_as()
    }

    /// Get an immutable reference to the virtual machine configuration information.
    pub fn vm_config(&self) -> &VmConfigInfo {
        &self.vm_config
    }

    /// Set the virtual machine configuration information.
    pub fn set_vm_config(&mut self, config: VmConfigInfo) {
        self.vm_config = config;
    }

    /// Gets a reference to the kvm file descriptor owned by this VM.
    pub fn vm_fd(&self) -> &VmFd {
        &self.vm_fd
    }

    /// Returns true if the system upcall service is ready.
    ///
    /// Always false when the hotplug/upcall features are compiled out or the
    /// upcall client has not been created yet.
    pub fn is_upcall_client_ready(&self) -> bool {
        #[cfg(all(feature = "hotplug", feature = "dbs-upcall"))]
        {
            if let Some(upcall_client) = self.upcall_client() {
                return upcall_client.is_ready();
            }
        }

        false
    }

    /// Check whether the VM has been initialized.
    ///
    /// True for every state except `Uninitialized`.
    pub fn is_vm_initialized(&self) -> bool {
        let instance_state = {
            // Use expect() to crash if the other thread poisoned this lock.
            let shared_info = self.shared_info.read()
                .expect("Failed to determine if instance is initialized because shared info couldn't be read due to poisoned lock");
            shared_info.state
        };
        instance_state != InstanceState::Uninitialized
    }

    /// Check whether the VM instance is running.
    pub fn is_vm_running(&self) -> bool {
        let instance_state = {
            // Use expect() to crash if the other thread poisoned this lock.
+ let shared_info = self.shared_info.read() + .expect("Failed to determine if instance is initialized because shared info couldn't be read due to poisoned lock"); + shared_info.state + }; + instance_state == InstanceState::Running + } + + /// Save VM instance exit state + pub fn vm_exit(&self, exit_code: i32) { + if let Ok(mut info) = self.shared_info.write() { + info.state = InstanceState::Exited(exit_code); + } else { + error!( + self.logger, + "Failed to save exit state, couldn't be written due to poisoned lock" + ); + } + } + + /// Create device operation context. + /// vm is not running, return false + /// vm is running, but hotplug feature is not enable, return error + /// vm is running, but upcall initialize failed, return error + /// vm is running, upcall initialize OK, return true + pub fn create_device_op_context( + &mut self, + epoll_mgr: Option, + ) -> std::result::Result { + if !self.is_vm_initialized() { + Ok(DeviceOpContext::create_boot_ctx(self, epoll_mgr)) + } else { + self.create_device_hotplug_context(epoll_mgr) + } + } + + pub(crate) fn check_health(&self) -> std::result::Result<(), StartMicroVmError> { + if self.kernel_config.is_none() { + return Err(StartMicroVmError::MissingKernelConfig); + } + Ok(()) + } + + pub(crate) fn get_dragonball_info(&self) -> (String, String) { + let guard = self.shared_info.read().unwrap(); + let instance_id = guard.id.clone(); + let dragonball_version = guard.vmm_version.clone(); + + (dragonball_version, instance_id) + } +} + +impl Vm { + pub(crate) fn init_vcpu_manager( + &mut self, + vm_as: GuestAddressSpaceImpl, + vcpu_seccomp_filter: BpfProgram, + ) -> std::result::Result<(), VcpuManagerError> { + let vcpu_manager = VcpuManager::new( + self.vm_fd.clone(), + &self.kvm, + &self.vm_config, + vm_as, + vcpu_seccomp_filter, + self.shared_info.clone(), + self.device_manager.io_manager(), + self.epoll_manager.clone(), + )?; + self.vcpu_manager = Some(vcpu_manager); + + Ok(()) + } + + /// get the cpu manager's 
    /// reference.
    ///
    /// Locks the inner mutex; returns an error if the manager was never
    /// initialized via `init_vcpu_manager()`.
    // NOTE(review): Ok-type reconstructed as a MutexGuard over VcpuManager
    // from the `.lock().unwrap()` mapping below — confirm against upstream.
    pub(crate) fn vcpu_manager(
        &self,
    ) -> std::result::Result<std::sync::MutexGuard<'_, VcpuManager>, VcpuManagerError> {
        self.vcpu_manager
            .as_ref()
            .ok_or(VcpuManagerError::VcpuManagerNotInitialized)
            .map(|mgr| mgr.lock().unwrap())
    }

    /// Pause all vcpus and record the instance downtime
    pub fn pause_all_vcpus_with_downtime(&mut self) -> std::result::Result<(), VcpuManagerError> {
        // Timestamp taken before pausing so the pause itself is counted.
        let ts = TimestampUs::default();
        self.start_instance_downtime = ts.time_us;

        self.vcpu_manager()?.pause_all_vcpus()?;

        Ok(())
    }

    /// Resume all vcpus and calculate the instance downtime
    pub fn resume_all_vcpus_with_downtime(&mut self) -> std::result::Result<(), VcpuManagerError> {
        self.vcpu_manager()?.resume_all_vcpus()?;

        // A zero start timestamp means no matching pause was recorded.
        if self.start_instance_downtime != 0 {
            let now = TimestampUs::default();
            let downtime = now.time_us - self.start_instance_downtime;
            info!(self.logger, "VM: instance downtime: {} us", downtime);
            self.start_instance_downtime = 0;
            if let Ok(mut info) = self.shared_info.write() {
                info.last_instance_downtime = downtime;
            } else {
                error!(self.logger, "Failed to update live upgrade downtime, couldn't be written due to poisoned lock");
            }
        }

        Ok(())
    }

    /// Create and start all boot-time devices (interrupt manager, serial,
    /// console, block, net, vsock, ...). Requires kernel config and guest
    /// memory to already be set.
    pub(crate) fn init_devices(
        &mut self,
        epoll_manager: EpollManager,
    ) -> std::result::Result<(), StartMicroVmError> {
        info!(self.logger, "VM: initializing devices ...");

        let com1_sock_path = self.vm_config.serial_path.clone();
        let kernel_config = self
            .kernel_config
            .as_mut()
            .ok_or(StartMicroVmError::MissingKernelConfig)?;

        info!(self.logger, "VM: create interrupt manager");
        self.device_manager
            .create_interrupt_manager()
            .map_err(StartMicroVmError::DeviceManager)?;

        info!(self.logger, "VM: create devices");
        let vm_as =
            self.address_space
                .get_vm_as()
                .ok_or(StartMicroVmError::AddressManagerError(
                    AddressManagerError::GuestMemoryNotInitialized,
                ))?;
        self.device_manager.create_devices(
            vm_as.clone(),
            epoll_manager,
            kernel_config,
            com1_sock_path,
+ self.dmesg_fifo.take(), + self.address_space.address_space(), + )?; + + info!(self.logger, "VM: start devices"); + self.device_manager.start_devices()?; + + info!(self.logger, "VM: initializing devices done"); + Ok(()) + } + + /// Remove devices when shutdown vm + pub fn remove_devices(&mut self) -> std::result::Result<(), StopMicrovmError> { + info!(self.logger, "VM: remove devices"); + let vm_as = self + .address_space + .get_vm_as() + .ok_or(StopMicrovmError::GuestMemoryNotInitialized)?; + + self.device_manager + .remove_devices( + vm_as.clone(), + self.epoll_manager.clone(), + self.address_space.address_space(), + ) + .map_err(StopMicrovmError::DeviceManager) + } + + /// Reset the console into canonical mode. + pub fn reset_console(&self) -> std::result::Result<(), DeviceMgrError> { + self.device_manager.reset_console() + } + + pub(crate) fn init_dmesg_logger(&mut self) { + let writer = self.dmesg_logger(); + self.dmesg_fifo = Some(writer); + } + + /// dmesg write to logger + fn dmesg_logger(&self) -> Box { + Box::new(DmesgWriter::new(&self.logger)) + } + + pub(crate) fn init_guest_memory(&mut self) -> std::result::Result<(), StartMicroVmError> { + info!(self.logger, "VM: initializing guest memory..."); + // We are not allowing reinitialization of vm guest memory. + if self.address_space.is_initialized() { + return Ok(()); + } + + // vcpu boot up require local memory. 
reserve 100 MiB memory + let mem_size = (self.vm_config.mem_size_mib as u64) << 20; + + let mem_type = self.vm_config.mem_type.clone(); + let mut mem_file_path = String::from(""); + if mem_type == "hugetlbfs" { + let shared_info = self.shared_info.read() + .expect("Failed to determine if instance is initialized because shared info couldn't be read due to poisoned lock"); + mem_file_path.push_str("/dragonball/"); + mem_file_path.push_str(shared_info.id.as_str()); + } + + let mut vcpu_ids: Vec = Vec::new(); + for i in 0..self.vm_config().max_vcpu_count { + vcpu_ids.push(i as u32); + } + + // init default regions. + let mut numa_regions = Vec::with_capacity(1); + let numa_node = NumaRegionInfo { + size: self.vm_config.mem_size_mib as u64, + host_numa_node_id: None, + guest_numa_node_id: Some(0), + vcpu_ids, + }; + numa_regions.push(numa_node); + + info!( + self.logger, + "VM: mem_type:{} mem_file_path:{}, mem_size:{}, numa_regions:{:?}", + mem_type, + mem_file_path, + mem_size, + numa_regions, + ); + + let mut address_space_param = AddressSpaceMgrBuilder::new(&mem_type, &mem_file_path) + .map_err(StartMicroVmError::AddressManagerError)?; + address_space_param.set_kvm_vm_fd(self.vm_fd.clone()); + self.address_space + .create_address_space(&self.resource_manager, &numa_regions, address_space_param) + .map_err(StartMicroVmError::AddressManagerError)?; + + info!(self.logger, "VM: initializing guest memory done"); + Ok(()) + } + + fn init_configure_system( + &mut self, + vm_as: &GuestAddressSpaceImpl, + ) -> std::result::Result<(), StartMicroVmError> { + let vm_memory = vm_as.memory(); + let kernel_config = self + .kernel_config + .as_ref() + .ok_or(StartMicroVmError::MissingKernelConfig)?; + //let cmdline = kernel_config.cmdline.clone(); + let initrd: Option = match kernel_config.initrd_file() { + Some(f) => { + let initrd_file = f.try_clone(); + if initrd_file.is_err() { + return Err(StartMicroVmError::InitrdLoader( + 
LoadInitrdError::ReadInitrd(io::Error::from(io::ErrorKind::InvalidData)), + )); + } + let res = self.load_initrd(vm_memory.deref(), &mut initrd_file.unwrap())?; + Some(res) + } + None => None, + }; + + self.configure_system_arch(vm_memory.deref(), kernel_config.kernel_cmdline(), initrd) + } + + /// Loads the initrd from a file into the given memory slice. + /// + /// * `vm_memory` - The guest memory the initrd is written to. + /// * `image` - The initrd image. + /// + /// Returns the result of initrd loading + fn load_initrd( + &self, + vm_memory: &GuestMemoryImpl, + image: &mut F, + ) -> std::result::Result + where + F: Read + Seek, + { + use crate::error::LoadInitrdError::*; + + let size: usize; + // Get the image size + match image.seek(SeekFrom::End(0)) { + Err(e) => return Err(ReadInitrd(e)), + Ok(0) => { + return Err(ReadInitrd(io::Error::new( + io::ErrorKind::InvalidData, + "Initrd image seek returned a size of zero", + ))) + } + Ok(s) => size = s as usize, + }; + // Go back to the image start + image.seek(SeekFrom::Start(0)).map_err(ReadInitrd)?; + + // Get the target address + let address = dbs_boot::initrd_load_addr(vm_memory, size as u64).map_err(|_| LoadInitrd)?; + + // Load the image into memory + vm_memory + .read_from(GuestAddress(address), image, size) + .map_err(|_| LoadInitrd)?; + + Ok(InitrdConfig { + address: GuestAddress(address), + size, + }) + } + + fn load_kernel( + &mut self, + vm_memory: &GuestMemoryImpl, + ) -> std::result::Result { + // This is the easy way out of consuming the value of the kernel_cmdline. 
+ let kernel_config = self + .kernel_config + .as_mut() + .ok_or(StartMicroVmError::MissingKernelConfig)?; + let high_mem_addr = GuestAddress(dbs_boot::get_kernel_start()); + + #[cfg(target_arch = "x86_64")] + return linux_loader::loader::elf::Elf::load( + vm_memory, + None, + kernel_config.kernel_file_mut(), + Some(high_mem_addr), + ) + .map_err(StartMicroVmError::KernelLoader); + + #[cfg(target_arch = "aarch64")] + return linux_loader::loader::pe::PE::load( + vm_memory, + Some(GuestAddress(dbs_boot::get_kernel_start())), + kernel_config.kernel_file_mut(), + Some(high_mem_addr), + ) + .map_err(StartMicroVmError::KernelLoader); + } + + /// Set up the initial microVM state and start the vCPU threads. + /// + /// This is the main entrance of the Vm object, to bring up the virtual machine instance into + /// running state. + pub fn start_microvm( + &mut self, + event_mgr: &mut EventManager, + vmm_seccomp_filter: BpfProgram, + vcpu_seccomp_filter: BpfProgram, + ) -> std::result::Result<(), StartMicroVmError> { + info!(self.logger, "VM: received instance start command"); + if self.is_vm_initialized() { + return Err(StartMicroVmError::MicroVMAlreadyRunning); + } + + let request_ts = TimestampUs::default(); + self.start_instance_request_ts = request_ts.time_us; + self.start_instance_request_cpu_ts = request_ts.cputime_us; + + self.init_dmesg_logger(); + self.check_health()?; + + // Use expect() to crash if the other thread poisoned this lock. 
+ self.shared_info + .write() + .expect("Failed to start microVM because shared info couldn't be written due to poisoned lock") + .state = InstanceState::Starting; + + self.init_guest_memory()?; + let vm_as = self + .vm_as() + .cloned() + .ok_or(StartMicroVmError::AddressManagerError( + AddressManagerError::GuestMemoryNotInitialized, + ))?; + + self.init_vcpu_manager(vm_as.clone(), vcpu_seccomp_filter) + .map_err(StartMicroVmError::Vcpu)?; + self.init_microvm(event_mgr.epoll_manager(), vm_as.clone(), request_ts)?; + self.init_configure_system(&vm_as)?; + #[cfg(feature = "dbs-upcall")] + self.init_upcall()?; + + info!(self.logger, "VM: register events"); + self.register_events(event_mgr)?; + + info!(self.logger, "VM: start vcpus"); + self.vcpu_manager() + .map_err(StartMicroVmError::Vcpu)? + .start_boot_vcpus(vmm_seccomp_filter) + .map_err(StartMicroVmError::Vcpu)?; + + // Use expect() to crash if the other thread poisoned this lock. + self.shared_info + .write() + .expect("Failed to start microVM because shared info couldn't be written due to poisoned lock") + .state = InstanceState::Running; + + info!(self.logger, "VM started"); + Ok(()) + } +} + +#[cfg(feature = "hotplug")] +impl Vm { + #[cfg(feature = "dbs-upcall")] + /// initialize upcall client for guest os + #[cfg(feature = "dbs-upcall")] + fn new_upcall(&mut self) -> std::result::Result<(), StartMicroVmError> { + // get vsock inner connector for upcall + let inner_connector = self + .device_manager + .get_vsock_inner_connector() + .ok_or(StartMicroVmError::UpcallMissVsock)?; + let mut upcall_client = UpcallClient::new( + inner_connector, + self.epoll_manager.clone(), + DevMgrService::default(), + ) + .map_err(StartMicroVmError::UpcallInitError)?; + + upcall_client + .connect() + .map_err(StartMicroVmError::UpcallConnectError)?; + self.upcall_client = Some(Arc::new(upcall_client)); + + info!(self.logger, "upcall client init success"); + Ok(()) + } + + #[cfg(feature = "dbs-upcall")] + fn init_upcall(&mut self) 
-> std::result::Result<(), StartMicroVmError> { + info!(self.logger, "VM upcall init"); + if let Err(e) = self.new_upcall() { + info!( + self.logger, + "VM upcall init failed, no support hotplug: {}", e + ); + Err(e) + } else { + self.vcpu_manager() + .map_err(StartMicroVmError::Vcpu)? + .set_upcall_channel(self.upcall_client().clone()); + Ok(()) + } + } + + #[cfg(feature = "dbs-upcall")] + /// Get upcall client. + #[cfg(feature = "dbs-upcall")] + pub fn upcall_client(&self) -> &Option>> { + &self.upcall_client + } + + #[cfg(feature = "dbs-upcall")] + fn create_device_hotplug_context( + &self, + epoll_mgr: Option, + ) -> std::result::Result { + if self.upcall_client().is_none() { + Err(StartMicroVmError::UpcallMissVsock) + } else if self.is_upcall_client_ready() { + Ok(DeviceOpContext::create_hotplug_ctx(self, epoll_mgr)) + } else { + Err(StartMicroVmError::UpcallNotReady) + } + } + + // We will support hotplug without upcall in future stages. + #[cfg(not(feature = "dbs-upcall"))] + fn create_device_hotplug_context( + &self, + _epoll_mgr: Option, + ) -> std::result::Result { + Err(StartMicroVmError::MicroVMAlreadyRunning) + } +} + +#[cfg(not(feature = "hotplug"))] +impl Vm { + fn init_upcall(&mut self) -> std::result::Result<(), StartMicroVmError> { + Ok(()) + } + + fn create_device_hotplug_context( + &self, + _epoll_mgr: Option, + ) -> std::result::Result { + Err(StartMicroVmError::MicroVMAlreadyRunning) + } +} diff --git a/src/dragonball/src/vm/x86_64.rs b/src/dragonball/src/vm/x86_64.rs new file mode 100644 index 0000000000..96ca0acb1f --- /dev/null +++ b/src/dragonball/src/vm/x86_64.rs @@ -0,0 +1,280 @@ +// Copyright (C) 2020-2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +use std::convert::TryInto; +use std::mem; +use std::ops::Deref; + +use dbs_address_space::AddressSpace; +use dbs_boot::{add_e820_entry, bootparam, layout, mptable, BootParamsWrapper, InitrdConfig}; +use dbs_utils::epoll_manager::EpollManager; +use dbs_utils::time::TimestampUs; +use kvm_bindings::{kvm_irqchip, kvm_pit_config, kvm_pit_state2, KVM_PIT_SPEAKER_DUMMY}; +use linux_loader::cmdline::Cmdline; +use slog::info; +use vm_memory::{Address, Bytes, GuestAddress, GuestAddressSpace, GuestMemory}; + +use crate::address_space_manager::{GuestAddressSpaceImpl, GuestMemoryImpl}; +use crate::error::{Error, Result, StartMicroVmError}; +use crate::event_manager::EventManager; +use crate::vm::{Vm, VmError}; + +/// Configures the system and should be called once per vm before starting vcpu +/// threads. +/// +/// # Arguments +/// +/// * `guest_mem` - The memory to be used by the guest. +/// * `cmdline_addr` - Address in `guest_mem` where the kernel command line was +/// loaded. +/// * `cmdline_size` - Size of the kernel command line in bytes including the +/// null terminator. +/// * `initrd` - Information about where the ramdisk image was loaded in the +/// `guest_mem`. +/// * `boot_cpus` - Number of virtual CPUs the guest will have at boot time. +/// * `max_cpus` - Max number of virtual CPUs the guest will have. +/// * `rsv_mem_bytes` - Reserve memory from microVM.. +#[allow(clippy::too_many_arguments)] +fn configure_system( + guest_mem: &M, + address_space: Option<&AddressSpace>, + cmdline_addr: GuestAddress, + cmdline_size: usize, + initrd: &Option, + boot_cpus: u8, + max_cpus: u8, +) -> super::Result<()> { + const KERNEL_BOOT_FLAG_MAGIC: u16 = 0xaa55; + const KERNEL_HDR_MAGIC: u32 = 0x5372_6448; + const KERNEL_LOADER_OTHER: u8 = 0xff; + const KERNEL_MIN_ALIGNMENT_BYTES: u32 = 0x0100_0000; // Must be non-zero. 
+ + let mmio_start = GuestAddress(layout::MMIO_LOW_START); + let mmio_end = GuestAddress(layout::MMIO_LOW_END); + let himem_start = GuestAddress(layout::HIMEM_START); + + // Note that this puts the mptable at the last 1k of Linux's 640k base RAM + mptable::setup_mptable(guest_mem, boot_cpus, max_cpus).map_err(Error::MpTableSetup)?; + + let mut params: BootParamsWrapper = BootParamsWrapper(bootparam::boot_params::default()); + + params.0.hdr.type_of_loader = KERNEL_LOADER_OTHER; + params.0.hdr.boot_flag = KERNEL_BOOT_FLAG_MAGIC; + params.0.hdr.header = KERNEL_HDR_MAGIC; + params.0.hdr.cmd_line_ptr = cmdline_addr.raw_value() as u32; + params.0.hdr.cmdline_size = cmdline_size as u32; + params.0.hdr.kernel_alignment = KERNEL_MIN_ALIGNMENT_BYTES; + if let Some(initrd_config) = initrd { + params.0.hdr.ramdisk_image = initrd_config.address.raw_value() as u32; + params.0.hdr.ramdisk_size = initrd_config.size as u32; + } + + add_e820_entry(&mut params.0, 0, layout::EBDA_START, bootparam::E820_RAM) + .map_err(Error::BootSystem)?; + + let mem_end = address_space.ok_or(Error::AddressSpace)?.last_addr(); + if mem_end < mmio_start { + add_e820_entry( + &mut params.0, + himem_start.raw_value() as u64, + // it's safe to use unchecked_offset_from because + // mem_end > himem_start + mem_end.unchecked_offset_from(himem_start) as u64 + 1, + bootparam::E820_RAM, + ) + .map_err(Error::BootSystem)?; + } else { + add_e820_entry( + &mut params.0, + himem_start.raw_value(), + // it's safe to use unchecked_offset_from because + // end_32bit_gap_start > himem_start + mmio_start.unchecked_offset_from(himem_start), + bootparam::E820_RAM, + ) + .map_err(Error::BootSystem)?; + if mem_end > mmio_end { + add_e820_entry( + &mut params.0, + mmio_end.raw_value() + 1, + // it's safe to use unchecked_offset_from because mem_end > mmio_end + mem_end.unchecked_offset_from(mmio_end) as u64, + bootparam::E820_RAM, + ) + .map_err(Error::BootSystem)?; + } + } + + let zero_page_addr = 
GuestAddress(layout::ZERO_PAGE_START); + guest_mem + .checked_offset(zero_page_addr, mem::size_of::<bootparam::boot_params>()) + .ok_or(Error::ZeroPagePastRamEnd)?; + guest_mem + .write_obj(params, zero_page_addr) + .map_err(|_| Error::ZeroPageSetup)?; + + Ok(()) +} + +impl Vm { + /// Get the status of in-kernel PIT. + pub fn get_pit_state(&self) -> Result<kvm_pit_state2> { + self.vm_fd + .get_pit2() + .map_err(|e| Error::Vm(VmError::Irq(e))) + } + + /// Set the status of in-kernel PIT. + pub fn set_pit_state(&self, pit_state: &kvm_pit_state2) -> Result<()> { + self.vm_fd + .set_pit2(pit_state) + .map_err(|e| Error::Vm(VmError::Irq(e))) + } + + /// Get the status of in-kernel ioapic. + pub fn get_irqchip_state(&self, chip_id: u32) -> Result<kvm_irqchip> { + let mut irqchip: kvm_irqchip = kvm_irqchip { + chip_id, + ..kvm_irqchip::default() + }; + self.vm_fd + .get_irqchip(&mut irqchip) + .map(|_| irqchip) + .map_err(|e| Error::Vm(VmError::Irq(e))) + } + + /// Set the status of in-kernel ioapic. + pub fn set_irqchip_state(&self, irqchip: &kvm_irqchip) -> Result<()> { + self.vm_fd + .set_irqchip(irqchip) + .map_err(|e| Error::Vm(VmError::Irq(e))) + } +} + +impl Vm { + /// Initialize the virtual machine instance. + /// + /// It initializes the virtual machine instance by: + /// 1) initialize virtual machine global state and configuration. + /// 2) create system devices, such as interrupt controller, PIT etc. + /// 3) create and start IO devices, such as serial, console, block, net, vsock etc. + /// 4) create and initialize vCPUs. + /// 5) configure CPU power management features. + /// 6) load guest kernel image. + pub fn init_microvm( + &mut self, + epoll_mgr: EpollManager, + vm_as: GuestAddressSpaceImpl, + request_ts: TimestampUs, + ) -> std::result::Result<(), StartMicroVmError> { + info!(self.logger, "VM: start initializing microvm ..."); + + self.init_tss()?; + // For x86_64 we need to create the interrupt controller before calling `KVM_CREATE_VCPUS` + // while on aarch64 we need to do it the other way around. 
+ self.setup_interrupt_controller()?; + self.create_pit()?; + self.init_devices(epoll_mgr)?; + + let reset_event_fd = self.device_manager.get_reset_eventfd().unwrap(); + self.vcpu_manager() + .map_err(StartMicroVmError::Vcpu)? + .set_reset_event_fd(reset_event_fd) + .map_err(StartMicroVmError::Vcpu)?; + + if self.vm_config.cpu_pm == "on" { + // TODO: add cpu_pm support. issue #4590. + info!(self.logger, "VM: enable CPU disable_idle_exits capability"); + } + + let vm_memory = vm_as.memory(); + let kernel_loader_result = self.load_kernel(vm_memory.deref())?; + self.vcpu_manager() + .map_err(StartMicroVmError::Vcpu)? + .create_boot_vcpus(request_ts, kernel_loader_result.kernel_load) + .map_err(StartMicroVmError::Vcpu)?; + + info!(self.logger, "VM: initializing microvm done"); + Ok(()) + } + + /// Execute system architecture specific configurations. + /// + /// 1) set guest kernel boot parameters + /// 2) setup BIOS configuration data structs, mainly implement the MPSpec. + pub fn configure_system_arch( + &self, + vm_memory: &GuestMemoryImpl, + cmdline: &Cmdline, + initrd: Option<InitrdConfig>, + ) -> std::result::Result<(), StartMicroVmError> { + let cmdline_addr = GuestAddress(dbs_boot::layout::CMDLINE_START); + linux_loader::loader::load_cmdline(vm_memory, cmdline_addr, cmdline) + .map_err(StartMicroVmError::LoadCommandline)?; + + configure_system( + vm_memory, + self.address_space.address_space(), + cmdline_addr, + cmdline.as_str().len() + 1, + &initrd, + self.vm_config.vcpu_count, + self.vm_config.max_vcpu_count, + ) + .map_err(StartMicroVmError::ConfigureSystem) + } + + /// Sets the KVM TSS address for the virtual machine. + pub(crate) fn init_tss(&mut self) -> std::result::Result<(), StartMicroVmError> { + self.vm_fd + .set_tss_address(dbs_boot::layout::KVM_TSS_ADDRESS.try_into().unwrap()) + .map_err(|e| StartMicroVmError::ConfigureVm(VmError::VmSetup(e))) + } + + /// Creates the in-kernel irq chip. 
+ pub(crate) fn setup_interrupt_controller( + &mut self, + ) -> std::result::Result<(), StartMicroVmError> { + self.vm_fd + .create_irq_chip() + .map_err(|e| StartMicroVmError::ConfigureVm(VmError::VmSetup(e))) + } + + /// Creates an in-kernel device model for the PIT. + pub(crate) fn create_pit(&self) -> std::result::Result<(), StartMicroVmError> { + info!(self.logger, "VM: create pit"); + // We need to enable the emulation of a dummy speaker port stub so that writing to port 0x61 + // (i.e. KVM_SPEAKER_BASE_ADDRESS) does not trigger an exit to user space. + let pit_config = kvm_pit_config { + flags: KVM_PIT_SPEAKER_DUMMY, + ..kvm_pit_config::default() + }; + + // Safe because we know that our file is a VM fd, we know the kernel will only read the + // correct amount of memory from our pointer, and we verify the return result. + self.vm_fd + .create_pit2(pit_config) + .map_err(|e| StartMicroVmError::ConfigureVm(VmError::VmSetup(e))) + } + + pub(crate) fn register_events( + &mut self, + event_mgr: &mut EventManager, + ) -> std::result::Result<(), StartMicroVmError> { + let reset_evt = self + .device_manager + .get_reset_eventfd() + .map_err(StartMicroVmError::DeviceManager)?; + event_mgr + .register_exit_eventfd(&reset_evt) + .map_err(|_| StartMicroVmError::RegisterEvent)?; + self.reset_eventfd = Some(reset_evt); + + Ok(()) + } +} diff --git a/src/dragonball/src/vmm.rs b/src/dragonball/src/vmm.rs new file mode 100644 index 0000000000..a25543e342 --- /dev/null +++ b/src/dragonball/src/vmm.rs @@ -0,0 +1,215 @@ +// Copyright (C) 2020-2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. 
+ +use std::os::unix::io::RawFd; +use std::sync::{Arc, Mutex, RwLock}; + +use dbs_utils::epoll_manager::EpollManager; +use log::{error, info, warn}; +use seccompiler::BpfProgram; +use vmm_sys_util::eventfd::EventFd; + +use crate::api::v1::{InstanceInfo, VmmService}; +use crate::error::{EpollError, Result}; +use crate::event_manager::{EventContext, EventManager}; +use crate::vm::Vm; +use crate::{EXIT_CODE_GENERIC_ERROR, EXIT_CODE_OK}; + +/// Global coordinator to manage API servers, virtual machines, upgrade etc. +/// +/// Originally firecracker assumed a VMM only manages a VM, and didn't distinguish VMM and VM, +/// which caused a mixed and confusing design. Now we have explicitly built the object model as: +/// |---Vmm API Server--<-1:1-> HTTP API Server +/// | |----------<-1:1-> Shimv2/CRI API Server +/// | +/// Vmm <-1:N-> Vm <-1:1-> Address Space Manager <-1:N-> GuestMemory +/// ^ ^---1:1-> Device Manager <-1:N-> Device +/// | ^---1:1-> Resource Manager +/// | ^---1:N-> Vcpu +/// |---<-1:N-> Event Manager +pub struct Vmm { + pub(crate) event_ctx: EventContext, + epoll_manager: EpollManager, + + // Will change to a HashMap when enabling 1 VMM with multiple VMs. + vm: Vm, + + vcpu_seccomp_filter: BpfProgram, + vmm_seccomp_filter: BpfProgram, +} + +impl Vmm { + /// Create a Virtual Machine Monitor instance. + pub fn new( + api_shared_info: Arc<RwLock<InstanceInfo>>, + api_event_fd: EventFd, + vmm_seccomp_filter: BpfProgram, + vcpu_seccomp_filter: BpfProgram, + kvm_fd: Option<RawFd>, + ) -> Result<Self> { + let epoll_manager = EpollManager::default(); + Self::new_with_epoll_manager( + api_shared_info, + api_event_fd, + epoll_manager, + vmm_seccomp_filter, + vcpu_seccomp_filter, + kvm_fd, + ) + } + + /// Create a Virtual Machine Monitor instance with an epoll_manager. 
+ pub fn new_with_epoll_manager( + api_shared_info: Arc<RwLock<InstanceInfo>>, + api_event_fd: EventFd, + epoll_manager: EpollManager, + vmm_seccomp_filter: BpfProgram, + vcpu_seccomp_filter: BpfProgram, + kvm_fd: Option<RawFd>, + ) -> Result<Self> { + let vm = Vm::new(kvm_fd, api_shared_info, epoll_manager.clone())?; + let event_ctx = EventContext::new(api_event_fd)?; + + Ok(Vmm { + event_ctx, + epoll_manager, + vm, + vcpu_seccomp_filter, + vmm_seccomp_filter, + }) + } + + /// Get a reference to a virtual machine managed by the VMM. + pub fn get_vm(&self) -> Option<&Vm> { + Some(&self.vm) + } + + /// Get a mutable reference to a virtual machine managed by the VMM. + pub fn get_vm_mut(&mut self) -> Option<&mut Vm> { + Some(&mut self.vm) + } + + /// Get the seccomp rules for vCPU threads. + pub fn vcpu_seccomp_filter(&self) -> BpfProgram { + self.vcpu_seccomp_filter.clone() + } + + /// Get the seccomp rules for VMM threads. + pub fn vmm_seccomp_filter(&self) -> BpfProgram { + self.vmm_seccomp_filter.clone() + } + + /// Run the event loop to service API requests. + /// + /// # Arguments + /// + /// * `vmm` - An Arc reference to the global Vmm instance. + /// * `service` - VMM Service provider. + pub fn run_vmm_event_loop(vmm: Arc<Mutex<Vmm>>, mut service: VmmService) -> i32 { + let epoll_mgr = vmm.lock().unwrap().epoll_manager.clone(); + let mut event_mgr = + EventManager::new(&vmm, epoll_mgr).expect("Cannot create epoll manager"); + + 'poll: loop { + match event_mgr.handle_events(-1) { + Ok(_) => { + // Check whether there are pending vmm events. + if event_mgr.fetch_vmm_event_count() == 0 { + continue; + } + + let mut v = vmm.lock().unwrap(); + if v.event_ctx.api_event_triggered { + // The run_vmm_action() needs to access event_mgr, so it could + // not be handled in EpollHandler::handle_events(). It has been + // delayed to the main loop. 
+ v.event_ctx.api_event_triggered = false; + service + .run_vmm_action(&mut v, &mut event_mgr) + .unwrap_or_else(|_| { + warn!("got spurious notification from api thread"); + }); + } + if v.event_ctx.exit_evt_triggered { + info!("Gracefully terminated VMM control loop"); + return v.stop(EXIT_CODE_OK as i32); + } + } + Err(e) => { + error!("Abruptly exited VMM control loop: {:?}", e); + if let EpollError::EpollMgr(dbs_utils::epoll_manager::Error::Epoll(e)) = e { + if e.errno() == libc::EAGAIN || e.errno() == libc::EINTR { + continue 'poll; + } + } + return vmm.lock().unwrap().stop(EXIT_CODE_GENERIC_ERROR as i32); + } + } + } + } + + /// Waits for all vCPUs to exit and terminates the Dragonball process. + fn stop(&mut self, exit_code: i32) -> i32 { + info!("Vmm is stopping."); + if let Some(vm) = self.get_vm_mut() { + if vm.is_vm_initialized() { + if let Err(e) = vm.remove_devices() { + warn!("failed to remove devices: {:?}", e); + } + + if let Err(e) = vm.reset_console() { + warn!("Cannot set canonical mode for the terminal. {:?}", e); + } + + // Now, we use exit_code instead of invoking _exit to + // terminate process, so all of vcpu threads should be stopped + // prior to vmm event loop. + match vm.vcpu_manager() { + Ok(mut mgr) => { + if let Err(e) = mgr.exit_all_vcpus() { + warn!("Failed to exit vcpu thread. {:?}", e); + } + } + Err(e) => warn!("Failed to get vcpu manager {:?}", e), + } + + // save exit state to VM, instead of exit process. 
+ vm.vm_exit(exit_code); + } + } + + exit_code + } +} + +#[cfg(test)] +pub(crate) mod tests { + use super::*; + + pub fn create_vmm_instance() -> Vmm { + let info = Arc::new(RwLock::new(InstanceInfo::default())); + let event_fd = EventFd::new(libc::EFD_NONBLOCK).unwrap(); + let seccomp_filter: BpfProgram = Vec::new(); + let epoll_manager = EpollManager::default(); + + Vmm::new_with_epoll_manager( + info, + event_fd, + epoll_manager, + seccomp_filter.clone(), + seccomp_filter, + None, + ) + .unwrap() + } + + #[test] + fn test_create_vmm_instance() { + create_vmm_instance(); + } +} diff --git a/src/libs/Cargo.lock b/src/libs/Cargo.lock index 99a395749b..b82c108c44 100644 --- a/src/libs/Cargo.lock +++ b/src/libs/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "aho-corasick" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" +dependencies = [ + "memchr", +] + [[package]] name = "anyhow" version = "1.0.57" @@ -27,9 +36,9 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.0.1" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "bitflags" @@ -37,6 +46,12 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" +[[package]] +name = "byte-unit" +version = "3.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "415301c9de11005d4b92193c0eb7ac7adc37e5a49e0ac9bed0a42343512744b8" + [[package]] name = "byteorder" version = "1.4.3" @@ -71,6 +86,18 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "cgroups-rs" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b827f9d9f6c2fff719d25f5d44cbc8d2ef6df1ef00d055c5c14d5dc25529579" +dependencies = [ + "libc", + "log", + "nix 0.23.1", + "regex", +] + [[package]] name = "chrono" version = "0.4.19" @@ -84,6 +111,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "common-path" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2382f75942f4b3be3690fe4f86365e9c853c1587d6ee58212cebf6e2a9ccd101" + [[package]] name = "crossbeam-channel" version = "0.5.2" @@ -121,6 +154,17 @@ version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +[[package]] +name = "fail" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec3245a0ca564e7f3c797d20d833a6870f57a728ac967d5225b3ffdef4465011" +dependencies = [ + "lazy_static", + "log", + "rand 0.8.5", +] + [[package]] name = "fastrand" version = "1.6.0" @@ -225,6 +269,34 @@ dependencies = [ "slab", ] +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + +[[package]] +name = "getrandom" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9be70c98951c83b8d2f8f60d7065fa6d5146873094452a1008da8c2f1e4205ad" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.10.2+wasi-snapshot-preview1", +] + +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + [[package]] name = 
"hashbrown" version = "0.11.2" @@ -240,6 +312,15 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + [[package]] name = "indexmap" version = "1.8.1" @@ -283,6 +364,50 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" +[[package]] +name = "kata-sys-util" +version = "0.1.0" +dependencies = [ + "byteorder", + "cgroups-rs", + "chrono", + "common-path", + "fail", + "kata-types", + "lazy_static", + "libc", + "nix 0.24.2", + "num_cpus", + "oci", + "once_cell", + "rand 0.7.3", + "serde_json", + "serial_test", + "slog", + "slog-scope", + "subprocess", + "tempfile", + "thiserror", +] + +[[package]] +name = "kata-types" +version = "0.1.0" +dependencies = [ + "byte-unit", + "glob", + "lazy_static", + "num_cpus", + "oci", + "regex", + "serde", + "serde_json", + "slog", + "slog-scope", + "thiserror", + "toml", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -295,6 +420,16 @@ version = "0.2.124" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "21a41fed9d98f27ab1c6d161da622a4fa35e8a54a8adc24bbf3ddd0ef70b0e50" +[[package]] +name = "lock_api" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "327fa5b6a6940e4699ec49a9beae1ea4845c6bab9314e4f84ac68742139d8c53" +dependencies = [ + "autocfg", + "scopeguard", +] + [[package]] name = "log" version = "0.4.16" @@ -362,9 +497,9 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" [[package]] name = "nix" -version = "0.20.2" +version = "0.23.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5e06129fb611568ef4e868c14b326274959aa70ff7776e9d55323531c374945" +checksum = 
"9f866317acbd3a240710c63f065ffb1e4fd466259045ccb504130b7f668f35c6" dependencies = [ "bitflags", "cc", @@ -375,12 +510,11 @@ dependencies = [ [[package]] name = "nix" -version = "0.23.1" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f866317acbd3a240710c63f065ffb1e4fd466259045ccb504130b7f668f35c6" +checksum = "195cdbc1741b8134346d515b3a56a1c94b0912758009cfd53f99ea0f57b065fc" dependencies = [ "bitflags", - "cc", "cfg-if", "libc", "memoffset", @@ -414,12 +548,57 @@ dependencies = [ "autocfg", ] +[[package]] +name = "num_cpus" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "oci" +version = "0.1.0" +dependencies = [ + "libc", + "serde", + "serde_derive", + "serde_json", +] + [[package]] name = "once_cell" version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" +[[package]] +name = "parking_lot" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216" +dependencies = [ + "cfg-if", + "instant", + "libc", + "redox_syscall", + "smallvec", + "winapi", +] + [[package]] name = "petgraph" version = "0.5.1" @@ -442,6 +621,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "ppv-lite86" +version = "0.2.16" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" + [[package]] name = "proc-macro2" version = "1.0.37" @@ -504,9 +689,9 @@ dependencies = [ [[package]] name = "protobuf" -version = "2.14.0" +version = "2.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e86d370532557ae7573551a1ec8235a0f8d6cb276c7c9e6aa490b511c447485" +checksum = "cf7e6d18738ecd0902d30d1ad232c9125985a3422929b16c65517b38adc14f96" dependencies = [ "serde", "serde_derive", @@ -514,18 +699,18 @@ dependencies = [ [[package]] name = "protobuf-codegen" -version = "2.14.0" +version = "2.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de113bba758ccf2c1ef816b127c958001b7831136c9bc3f8e9ec695ac4e82b0c" +checksum = "aec1632b7c8f2e620343439a7dfd1f3c47b18906c4be58982079911482b5d707" dependencies = [ "protobuf", ] [[package]] name = "protobuf-codegen-pure" -version = "2.14.0" +version = "2.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d1a4febc73bf0cada1d77c459a0c8e5973179f1cfd5b0f1ab789d45b17b6440" +checksum = "9f8122fdb18e55190c796b088a16bdb70cd7acdcd48f7a8b796b58c62e532cc6" dependencies = [ "protobuf", "protobuf-codegen", @@ -536,6 +721,7 @@ name = "protocols" version = "0.1.0" dependencies = [ "async-trait", + "oci", "protobuf", "serde", "serde_json", @@ -552,6 +738,77 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", 
+ "rand_chacha 0.3.1", + "rand_core 0.6.3", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.3", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", +] + +[[package]] +name = "rand_core" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" +dependencies = [ + "getrandom 0.2.6", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", +] + [[package]] name = "redox_syscall" version = "0.2.10" @@ -561,6 +818,23 @@ dependencies = [ "bitflags", ] +[[package]] +name = "regex" +version = "1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" + [[package]] name = "remove_dir_all" version = "0.5.3" @@ -585,19 +859,25 @@ dependencies = [ ] [[package]] -name = "serde" -version = 
"1.0.133" +name = "scopeguard" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97565067517b60e2d1ea8b268e59ce036de907ac523ad83a0475da04e818989a" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "serde" +version = "1.0.136" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce31e24b01e1e524df96f1c2fdd054405f8d7376249a5110886fb4b658484789" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.133" +version = "1.0.136" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed201699328568d8d08208fdd080e3ff594e6c422e438b6705905da01005d537" +checksum = "08597e7152fcd306f41838ed3e37be9eaeed2b61c42e2117266a554fab4662f9" dependencies = [ "proc-macro2", "quote", @@ -615,6 +895,28 @@ dependencies = [ "serde", ] +[[package]] +name = "serial_test" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0bccbcf40c8938196944a3da0e133e031a33f4d6b72db3bda3cc556e361905d" +dependencies = [ + "lazy_static", + "parking_lot", + "serial_test_derive", +] + +[[package]] +name = "serial_test_derive" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2acd6defeddb41eb60bb468f8825d0cfd0c2a76bc03bfd235b6a1dc4f6a1ad5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "slab" version = "0.4.6" @@ -641,9 +943,9 @@ dependencies = [ [[package]] name = "slog-json" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52e9b96fb6b5e80e371423b4aca6656eb537661ce8f82c2697e619f8ca85d043" +checksum = "0f7f7a952ce80fca9da17bf0a53895d11f8aa1ba063668ca53fc72e7869329e9" dependencies = [ "chrono", "serde", @@ -662,6 +964,12 @@ dependencies = [ "slog", ] +[[package]] +name = "smallvec" +version = "1.8.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" + [[package]] name = "socket2" version = "0.4.4" @@ -672,6 +980,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "subprocess" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c2e86926081dda636c546d8c5e641661049d7562a68f5488be4a1f7f66f6086" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "syn" version = "1.0.91" @@ -725,21 +1043,20 @@ dependencies = [ [[package]] name = "thread_local" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8018d24e04c95ac8790716a5987d0fec4f8b27249ffa0f7d33f1369bdfb88cbd" +checksum = "5516c27b78311c50bf42c071425c560ac799b11c30b31f87e3081965fe5e0180" dependencies = [ "once_cell", ] [[package]] name = "time" -version = "0.1.44" +version = "0.1.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" +checksum = "ca8a50ef2360fbd1eeb0ecd46795a87a19024eb4b53c5dc916ca1fd95fe62438" dependencies = [ "libc", - "wasi 0.10.0+wasi-snapshot-preview1", "winapi", ] @@ -784,17 +1101,26 @@ dependencies = [ ] [[package]] -name = "ttrpc" -version = "0.5.2" +name = "toml" +version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66a973ce6d5eaa20c173635b29ffb660dafbc7ef109172c0015ba44e47a23711" +checksum = "8d82e1a7758622a465f8cee077614c73484dac5b836c02ff6a40d5d1010324d7" +dependencies = [ + "serde", +] + +[[package]] +name = "ttrpc" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ecfff459a859c6ba6668ff72b34c2f1d94d9d58f7088414c2674ad0f31cc7d8" dependencies = [ "async-trait", "byteorder", "futures", "libc", "log", - "nix 0.20.2", + "nix 0.23.1", "protobuf", "protobuf-codegen-pure", "thiserror", @@ -853,9 +1179,15 
@@ dependencies = [ [[package]] name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" +version = "0.9.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + +[[package]] +name = "wasi" +version = "0.10.2+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" [[package]] name = "wasi" diff --git a/src/libs/Cargo.toml b/src/libs/Cargo.toml index 16eedb91f2..b485eaa43e 100644 --- a/src/libs/Cargo.toml +++ b/src/libs/Cargo.toml @@ -1,7 +1,10 @@ [workspace] members = [ "logging", + "kata-types", + "kata-sys-util", "safe-path", "protocols", + "oci", ] resolver = "2" diff --git a/src/libs/Makefile b/src/libs/Makefile new file mode 100644 index 0000000000..9ce0be19de --- /dev/null +++ b/src/libs/Makefile @@ -0,0 +1,42 @@ +# Copyright (c) 2021 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +EXTRA_RUSTFEATURES := + +EXTRA_TEST_FLAGS := +USERID=$(shell id -u) +ifeq ($(USERID), 0) + override EXTRA_TEST_FLAGS = --ignored +endif + +default: build + +build: + cargo build --all-features + +check: clippy format + +clippy: + @echo "INFO: cargo clippy..." + cargo clippy --all-targets --all-features --release \ + -- \ + -D warnings + +format: + @echo "INFO: cargo fmt..." + cargo fmt -- --check + +clean: + cargo clean + +# It is essential to run these tests using *both* build profiles. +# See the `test_logger_levels()` test for further information. 
+test: + @echo "INFO: testing libraries for development build" + cargo test --all $(EXTRA_RUSTFEATURES) -- --nocapture $(EXTRA_TEST_FLAGS) + @echo "INFO: testing libraries for release build" + cargo test --release --all $(EXTRA_RUSTFEATURES) -- --nocapture $(EXTRA_TEST_FLAGS) + +.PHONY: install vendor diff --git a/src/libs/README.md b/src/libs/README.md index a04a12fafe..bb1a655c38 100644 --- a/src/libs/README.md +++ b/src/libs/README.md @@ -6,5 +6,7 @@ Currently it provides following library crates: | Library | Description | |-|-| -| [logging](logging/) | Facilities to setup logging subsystem based slog. | +| [logging](logging/) | Facilities to setup logging subsystem based on slog. | +| [system utilities](kata-sys-util/) | Collection of facilities and helpers to access system services. | +| [types](kata-types/) | Collection of constants and data types shared by multiple Kata Containers components. | | [safe-path](safe-path/) | Utilities to safely resolve filesystem paths. | diff --git a/src/libs/kata-sys-util/Cargo.toml b/src/libs/kata-sys-util/Cargo.toml new file mode 100644 index 0000000000..eb8759e6f5 --- /dev/null +++ b/src/libs/kata-sys-util/Cargo.toml @@ -0,0 +1,36 @@ +[package] +name = "kata-sys-util" +version = "0.1.0" +description = "System Utilities for Kata Containers" +keywords = ["kata", "container", "runtime"] +authors = ["The Kata Containers community "] +repository = "https://github.com/kata-containers/kata-containers.git" +homepage = "https://katacontainers.io/" +readme = "README.md" +license = "Apache-2.0" +edition = "2018" + +[dependencies] +byteorder = "1.4.3" +cgroups = { package = "cgroups-rs", version = "0.2.7" } +chrono = "0.4.0" +common-path = "=1.0.0" +fail = "0.5.0" +lazy_static = "1.4.0" +libc = "0.2.100" +nix = "0.24.1" +once_cell = "1.9.0" +serde_json = "1.0.73" +slog = "2.5.2" +slog-scope = "4.4.0" +subprocess = "0.2.8" +rand = "0.7.2" +thiserror = "1.0.30" + +kata-types = { path = "../kata-types" } +oci = { path = "../oci" } + 
+[dev-dependencies] +num_cpus = "1.13.1" +serial_test = "0.5.1" +tempfile = "3.2.0" diff --git a/src/libs/kata-sys-util/README.md b/src/libs/kata-sys-util/README.md new file mode 100644 index 0000000000..0c3f887bcb --- /dev/null +++ b/src/libs/kata-sys-util/README.md @@ -0,0 +1,19 @@ +# kata-sys-util + +This crate is a collection of utilities and helpers for +[Kata Containers](https://github.com/kata-containers/kata-containers/) components to access system services. + +It provides safe wrappers over system services, such as: +- cgroups +- file systems +- mount +- NUMA + +## Support + +**Operating Systems**: +- Linux + +## License + +This code is licensed under [Apache-2.0](../../../LICENSE). diff --git a/src/libs/kata-sys-util/src/device.rs b/src/libs/kata-sys-util/src/device.rs new file mode 100644 index 0000000000..00a2ade127 --- /dev/null +++ b/src/libs/kata-sys-util/src/device.rs @@ -0,0 +1,104 @@ +// Copyright (c) 2019-2021 Alibaba Cloud +// Copyright (c) 2019-2021 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::fs; +use std::io::Result; +use std::os::unix::fs::{FileTypeExt, MetadataExt}; +use std::path::{Path, PathBuf}; + +use nix::sys::stat; + +use crate::{eother, sl}; + +const SYS_DEV_BLOCK_PATH: &str = "/sys/dev/block"; +const BLKDEV_PARTITION: &str = "partition"; +const BLKDEV_DEV_FILE: &str = "dev"; + +/// Get major and minor number of the device or of the device hosting the regular file/directory. +pub fn get_devid_for_blkio_cgroup>(path: P) -> Result> { + let md = fs::metadata(path)?; + + if md.is_dir() || md.is_file() { + // For regular file/directory, get major/minor of the block device hosting it. + // Note that we need to get the major/minor of the block device instead of partition, + // e.g. /dev/sda instead of /dev/sda3, because blkio cgroup works with block major/minor. 
+ let id = md.dev(); + Ok(Some((stat::major(id), stat::minor(id)))) + } else if md.file_type().is_block_device() { + // For block device, get major/minor of the device special file itself + get_block_device_id(md.rdev()) + } else { + Ok(None) + } +} + +/// Get the block device major/minor number from a partition/block device(itself). +/// +/// For example, given the dev_t of /dev/sda3 returns major and minor of /dev/sda. We rely on the +/// fact that if /sys/dev/block/$major:$minor/partition exists, then it's a partition, and find its +/// parent for the real device. +fn get_block_device_id(dev: stat::dev_t) -> Result> { + let major = stat::major(dev); + let minor = stat::minor(dev); + let mut blk_dev_path = PathBuf::from(SYS_DEV_BLOCK_PATH) + .join(format!("{}:{}", major, minor)) + .canonicalize()?; + + // If 'partition' file exists, then it's a partition of the real device, take its parent. + // Otherwise it's already the real device. + loop { + if !blk_dev_path.join(BLKDEV_PARTITION).exists() { + break; + } + blk_dev_path = match blk_dev_path.parent() { + Some(p) => p.to_path_buf(), + None => { + return Err(eother!( + "Can't find real device for dev {}:{}", + major, + minor + )) + } + }; + } + + // Parse major:minor in dev file + let dev_path = blk_dev_path.join(BLKDEV_DEV_FILE); + let dev_buf = fs::read_to_string(&dev_path)?; + let dev_buf = dev_buf.trim_end(); + debug!( + sl!(), + "get_real_devid: dev {}:{} -> {:?} ({})", major, minor, blk_dev_path, dev_buf + ); + + if let Some((major, minor)) = dev_buf.split_once(':') { + let major = major + .parse::() + .map_err(|_e| eother!("Failed to parse major number: {}", major))?; + let minor = minor + .parse::() + .map_err(|_e| eother!("Failed to parse minor number: {}", minor))?; + Ok(Some((major, minor))) + } else { + Err(eother!( + "Wrong format in {}: {}", + dev_path.to_string_lossy(), + dev_buf + )) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_devid() { + //let (major, minor) = 
get_devid_for_blkio_cgroup("/dev/vda1").unwrap().unwrap(); + assert!(get_devid_for_blkio_cgroup("/dev/tty").unwrap().is_none()); + get_devid_for_blkio_cgroup("/do/not/exist/file_______name").unwrap_err(); + } +} diff --git a/src/libs/kata-sys-util/src/fs.rs b/src/libs/kata-sys-util/src/fs.rs new file mode 100644 index 0000000000..1d85fa61cd --- /dev/null +++ b/src/libs/kata-sys-util/src/fs.rs @@ -0,0 +1,217 @@ +// Copyright (c) 2019-2021 Alibaba Cloud +// Copyright (c) 2019-2021 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::ffi::OsString; +use std::fs::{self, File}; +use std::io::{Error, Result}; +use std::os::unix::io::AsRawFd; +use std::path::{Path, PathBuf}; +use std::process::Command; + +use crate::{eother, sl}; + +// nix filesystem_type for different libc and architectures +#[cfg(all(target_os = "linux", target_env = "musl"))] +type FsType = libc::c_ulong; +#[cfg(all( + target_os = "linux", + not(any(target_env = "musl", target_arch = "s390x")) +))] +type FsType = libc::__fsword_t; +#[cfg(all(target_os = "linux", not(target_env = "musl"), target_arch = "s390x"))] +type FsType = libc::c_uint; + +// from linux.git/fs/fuse/inode.c: #define FUSE_SUPER_MAGIC 0x65735546 +const FUSE_SUPER_MAGIC: FsType = 0x65735546; +// from linux.git/include/uapi/linux/magic.h +const OVERLAYFS_SUPER_MAGIC: FsType = 0x794c7630; + +/// Get bundle path (current working directory). +pub fn get_bundle_path() -> Result { + std::env::current_dir() +} + +/// Get the basename of the canonicalized path +pub fn get_base_name>(src: P) -> Result { + let s = src.as_ref().canonicalize()?; + s.file_name().map(|v| v.to_os_string()).ok_or_else(|| { + eother!( + "failed to get base name of path {}", + src.as_ref().to_string_lossy() + ) + }) +} + +/// Check whether `path` is on a fuse filesystem. 
+pub fn is_fuse_fs>(path: P) -> bool { + if let Ok(st) = nix::sys::statfs::statfs(path.as_ref()) { + if st.filesystem_type().0 == FUSE_SUPER_MAGIC { + return true; + } + } + false +} + +/// Check whether `path` is on a overlay filesystem. +pub fn is_overlay_fs>(path: P) -> bool { + if let Ok(st) = nix::sys::statfs::statfs(path.as_ref()) { + if st.filesystem_type().0 == OVERLAYFS_SUPER_MAGIC { + return true; + } + } + false +} + +/// Check whether the given path is a symlink. +pub fn is_symlink>(path: P) -> std::io::Result { + let path = path.as_ref(); + let meta = fs::symlink_metadata(path)?; + + Ok(meta.file_type().is_symlink()) +} + +/// Reflink copy src to dst, and falls back to regular copy if reflink copy fails. +/// +/// # Safety +/// The `reflink_copy()` doesn't preserve permission/security context for the copied file, +/// so caller needs to take care of it. +pub fn reflink_copy, D: AsRef>(src: S, dst: D) -> Result<()> { + let src_path = src.as_ref(); + let dst_path = dst.as_ref(); + let src = src_path.to_string_lossy(); + let dst = dst_path.to_string_lossy(); + + if !src_path.is_file() { + return Err(eother!("reflink_copy src {} is not a regular file", src)); + } + + // Make sure dst's parent exist. If dst is a regular file, then unlink it for later copy. + if dst_path.exists() { + if !dst_path.is_file() { + return Err(eother!("reflink_copy dst {} is not a regular file", dst)); + } else { + fs::remove_file(dst_path)?; + } + } else if let Some(dst_parent) = dst_path.parent() { + if !dst_parent.exists() { + if let Err(e) = fs::create_dir_all(dst_parent) { + return Err(eother!( + "reflink_copy: create_dir_all {} failed: {:?}", + dst_parent.to_str().unwrap(), + e + )); + } + } else if !dst_parent.is_dir() { + return Err(eother!("reflink_copy parent of {} is not a directory", dst)); + } + } + + // Reflink copy, and fallback to regular copy if reflink fails. 
+ let src_file = fs::File::open(src_path)?; + let dst_file = fs::File::create(dst_path)?; + if let Err(e) = do_reflink_copy(src_file, dst_file) { + match e.raw_os_error() { + // Cross dev copy or filesystem doesn't support reflink, do regular copy + Some(os_err) + if os_err == nix::Error::EXDEV as i32 + || os_err == nix::Error::EOPNOTSUPP as i32 => + { + warn!( + sl!(), + "reflink_copy: reflink is not supported ({:?}), do regular copy instead", e, + ); + if let Err(e) = do_regular_copy(src.as_ref(), dst.as_ref()) { + return Err(eother!( + "reflink_copy: regular copy {} to {} failed: {:?}", + src, + dst, + e + )); + } + } + // Reflink copy failed + _ => { + return Err(eother!( + "reflink_copy: copy {} to {} failed: {:?}", + src, + dst, + e, + )) + } + } + } + + Ok(()) +} + +// Copy file using cp command, which handles sparse file copy. +fn do_regular_copy(src: &str, dst: &str) -> Result<()> { + let mut cmd = Command::new("/bin/cp"); + cmd.args(&["--sparse=auto", src, dst]); + + match cmd.output() { + Ok(output) => match output.status.success() { + true => Ok(()), + false => Err(eother!("`{:?}` failed: {:?}", cmd, output)), + }, + Err(e) => Err(eother!("`{:?}` failed: {:?}", cmd, e)), + } +} + +/// Copy file by reflink +fn do_reflink_copy(src: File, dst: File) -> Result<()> { + use nix::ioctl_write_int; + // FICLONE ioctl number definition, from include/linux/fs.h + const FS_IOC_MAGIC: u8 = 0x94; + const FS_IOC_FICLONE: u8 = 9; + // Define FICLONE ioctl using nix::ioctl_write_int! macro. + // The generated function has the following signature: + // pub unsafe fn ficlone(fd: libc::c_int, data: libc::c_ulong) -> Result + ioctl_write_int!(ficlone, FS_IOC_MAGIC, FS_IOC_FICLONE); + + // Safe because the `src` and `dst` are valid file objects and we have checked the result.
+ unsafe { ficlone(dst.as_raw_fd(), src.as_raw_fd() as u64) } + .map(|_| ()) + .map_err(|e| Error::from_raw_os_error(e as i32)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_base_name() { + assert_eq!(&get_base_name("/etc/hostname").unwrap(), "hostname"); + assert_eq!(&get_base_name("/bin").unwrap(), "bin"); + assert!(&get_base_name("/").is_err()); + assert!(&get_base_name("").is_err()); + assert!(get_base_name("/no/such/path________yeah").is_err()); + } + + #[test] + fn test_is_symlink() { + let tmpdir = tempfile::tempdir().unwrap(); + let path = tmpdir.path(); + + std::os::unix::fs::symlink(path, path.join("a")).unwrap(); + assert!(is_symlink(path.join("a")).unwrap()); + } + + #[test] + fn test_reflink_copy() { + let tmpdir = tempfile::tempdir().unwrap(); + let path = tmpdir.path().join("mounts"); + reflink_copy("/proc/mounts", &path).unwrap(); + let content = fs::read_to_string(&path).unwrap(); + assert!(!content.is_empty()); + reflink_copy("/proc/mounts", &path).unwrap(); + let content = fs::read_to_string(&path).unwrap(); + assert!(!content.is_empty()); + + reflink_copy("/proc/mounts", tmpdir.path()).unwrap_err(); + reflink_copy("/proc/mounts_not_exist", &path).unwrap_err(); + } +} diff --git a/src/libs/kata-sys-util/src/hooks.rs b/src/libs/kata-sys-util/src/hooks.rs new file mode 100644 index 0000000000..78e3ae662e --- /dev/null +++ b/src/libs/kata-sys-util/src/hooks.rs @@ -0,0 +1,541 @@ +// Copyright (c) 2019-2021 Alibaba Cloud +// Copyright (c) 2019-2021 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::collections::HashMap; +use std::ffi::OsString; +use std::hash::{Hash, Hasher}; +use std::io::{self, Read, Result}; +use std::path::Path; +use std::time::Duration; + +use subprocess::{ExitStatus, Popen, PopenConfig, PopenError, Redirection}; + +use crate::{eother, sl}; + +const DEFAULT_HOOK_TIMEOUT_SEC: i32 = 10; + +/// A simple wrapper over `oci::Hook` to provide `Hash, Eq`. 
+/// +/// The `oci::Hook` is auto-generated from protobuf source file, which doesn't implement `Hash, Eq`. +#[derive(Debug, Default, Clone)] +struct HookKey(oci::Hook); + +impl From<&oci::Hook> for HookKey { + fn from(hook: &oci::Hook) -> Self { + HookKey(hook.clone()) + } +} + +impl PartialEq for HookKey { + fn eq(&self, other: &Self) -> bool { + self.0 == other.0 + } +} + +impl Eq for HookKey {} + +impl Hash for HookKey { + fn hash(&self, state: &mut H) { + self.0.path.hash(state); + self.0.args.hash(state); + self.0.env.hash(state); + self.0.timeout.hash(state); + } +} + +/// Execution state of OCI hooks. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum HookState { + /// Hook is pending for executing/retry. + Pending, + /// Hook has been successfully executed. + Done, + /// Hook has been marked as ignore. + Ignored, +} + +/// Structure to maintain state for hooks. +#[derive(Default)] +pub struct HookStates { + states: HashMap, +} + +impl HookStates { + /// Create a new instance of [`HookStates`]. + pub fn new() -> Self { + Self::default() + } + + /// Get execution state of a hook. + pub fn get(&self, hook: &oci::Hook) -> HookState { + self.states + .get(&hook.into()) + .copied() + .unwrap_or(HookState::Pending) + } + + /// Update execution state of a hook. + pub fn update(&mut self, hook: &oci::Hook, state: HookState) { + self.states.insert(hook.into(), state); + } + + /// Remove an execution state of a hook. + pub fn remove(&mut self, hook: &oci::Hook) { + self.states.remove(&hook.into()); + } + + /// Check whether some hooks are still pending and should retry execution. + pub fn should_retry(&self) -> bool { + for state in self.states.values() { + if *state == HookState::Pending { + return true; + } + } + false + } + + /// Execute an OCI hook. + /// + /// If `state` is valid, it will be sent to subprocess' STDIN. 
+ /// + /// The [OCI Runtime specification 1.0.0](https://github.com/opencontainers/runtime-spec/releases/download/v1.0.0/oci-runtime-spec-v1.0.0.pdf) + /// states: + /// - path (string, REQUIRED) with similar semantics to IEEE Std 1003.1-2008 execv's path. + /// This specification extends the IEEE standard in that path MUST be absolute. + /// - args (array of strings, OPTIONAL) with the same semantics as IEEE Std 1003.1-2008 execv's + /// argv. + /// - env (array of strings, OPTIONAL) with the same semantics as IEEE Std 1003.1-2008's environ. + /// - timeout (int, OPTIONAL) is the number of seconds before aborting the hook. If set, timeout + /// MUST be greater than zero. + /// + /// The OCI spec also defines the context to invoke hooks, caller needs to take the responsibility + /// to setup execution context, such as namespace etc. + pub fn execute_hook(&mut self, hook: &oci::Hook, state: Option) -> Result<()> { + if self.get(hook) != HookState::Pending { + return Ok(()); + } + + fail::fail_point!("execute_hook", |_| { + Err(eother!("execute hook fail point injection")) + }); + info!(sl!(), "execute hook {:?}", hook); + + self.states.insert(hook.into(), HookState::Pending); + + let mut executor = HookExecutor::new(hook)?; + let stdin = if state.is_some() { + Redirection::Pipe + } else { + Redirection::None + }; + let mut popen = Popen::create( + &executor.args, + PopenConfig { + stdin, + stdout: Redirection::Pipe, + stderr: Redirection::Pipe, + executable: executor.executable.to_owned(), + detached: true, + env: Some(executor.envs.clone()), + ..Default::default() + }, + ) + .map_err(|e| eother!("failed to create subprocess for hook {:?}: {}", hook, e))?; + + if let Some(state) = state { + executor.execute_with_input(&mut popen, state)?; + } + executor.execute_and_wait(&mut popen)?; + info!(sl!(), "hook {} finished", hook.path); + self.states.insert(hook.into(), HookState::Done); + + Ok(()) + } + + /// Try to execute hooks and remember execution result. 
+ /// + /// The `execute_hooks()` will be called multiple times. + /// It will first be called before creating the VMM when creating the sandbox, so hooks could be + /// used to setup environment for the VMM, such as creating tap device etc. + /// It will also be called during starting containers, to setup environment for those containers. + /// + /// The execution result will be recorded for each hook. Once a hook returns success, it will not + /// be invoked anymore. + pub fn execute_hooks(&mut self, hooks: &[oci::Hook], state: Option) -> Result<()> { + for hook in hooks.iter() { + if let Err(e) = self.execute_hook(hook, state.clone()) { + // Ignore error and try next hook, the caller should retry. + error!(sl!(), "hook {} failed: {}", hook.path, e); + } + } + + Ok(()) + } +} + +struct HookExecutor<'a> { + hook: &'a oci::Hook, + executable: Option, + args: Vec, + envs: Vec<(OsString, OsString)>, + timeout: u64, +} + +impl<'a> HookExecutor<'a> { + fn new(hook: &'a oci::Hook) -> Result { + // Ensure Hook.path is present and is an absolute path. + let executable = if hook.path.is_empty() { + return Err(eother!("path of hook {:?} is empty", hook)); + } else { + let path = Path::new(&hook.path); + if !path.is_absolute() { + return Err(eother!("path of hook {:?} is not absolute", hook)); + } + Some(path.as_os_str().to_os_string()) + }; + + // Hook.args is optional, use Hook.path as arg0 if Hook.args is empty. + let args = if hook.args.is_empty() { + vec![hook.path.clone()] + } else { + hook.args.clone() + }; + + let mut envs: Vec<(OsString, OsString)> = Vec::new(); + for e in hook.env.iter() { + match e.split_once('=') { + Some((key, value)) => envs.push((OsString::from(key), OsString::from(value))), + None => warn!(sl!(), "env {} of hook {:?} is invalid", e, hook), + } + } + + // Use Hook.timeout if it's valid, otherwise default to 10s. 
+ let mut timeout = DEFAULT_HOOK_TIMEOUT_SEC as u64; + if let Some(t) = hook.timeout { + if t > 0 { + timeout = t as u64; + } + } + + Ok(HookExecutor { + hook, + executable, + args, + envs, + timeout, + }) + } + + fn execute_with_input(&mut self, popen: &mut Popen, state: oci::State) -> Result<()> { + let st = serde_json::to_string(&state)?; + let (stdout, stderr) = popen + .communicate_start(Some(st.as_bytes().to_vec())) + .limit_time(Duration::from_secs(self.timeout)) + .read_string() + .map_err(|e| e.error)?; + if let Some(err) = stderr { + if !err.is_empty() { + error!(sl!(), "hook {} exec failed: {}", self.hook.path, err); + } + } + if let Some(out) = stdout { + if !out.is_empty() { + info!(sl!(), "hook {} exec stdout: {}", self.hook.path, out); + } + } + // Give a grace period for `execute_and_wait()`. + self.timeout = 1; + Ok(()) + } + + fn execute_and_wait(&mut self, popen: &mut Popen) -> Result<()> { + match popen.wait_timeout(Duration::from_secs(self.timeout)) { + Ok(v) => self.handle_exit_status(v, popen), + Err(e) => self.handle_popen_wait_error(e, popen), + } + } + + fn handle_exit_status(&mut self, result: Option, popen: &mut Popen) -> Result<()> { + if let Some(exit_status) = result { + // the process has finished + info!( + sl!(), + "exit status of hook {:?} : {:?}", self.hook, exit_status + ); + self.print_result(popen); + match exit_status { + subprocess::ExitStatus::Exited(code) => { + if code == 0 { + info!(sl!(), "hook {:?} succeeds", self.hook); + Ok(()) + } else { + warn!(sl!(), "hook {:?} exit status with {}", self.hook, code,); + Err(eother!("hook {:?} exit status with {}", self.hook, code)) + } + } + _ => { + error!( + sl!(), + "no exit code for hook {:?}: {:?}", self.hook, exit_status + ); + Err(eother!( + "no exit code for hook {:?}: {:?}", + self.hook, + exit_status + )) + } + } + } else { + // may be timeout + error!(sl!(), "hook poll failed, kill it"); + // it is still running, kill it + popen.kill()?; + let _ = popen.wait(); + 
self.print_result(popen); + Err(io::Error::from(io::ErrorKind::TimedOut)) + } + } + + fn handle_popen_wait_error(&mut self, e: PopenError, popen: &mut Popen) -> Result<()> { + self.print_result(popen); + error!(sl!(), "wait_timeout for hook {:?} failed: {}", self.hook, e); + Err(eother!( + "wait_timeout for hook {:?} failed: {}", + self.hook, + e + )) + } + + fn print_result(&mut self, popen: &mut Popen) { + if let Some(file) = popen.stdout.as_mut() { + let mut buffer = String::new(); + file.read_to_string(&mut buffer).ok(); + if !buffer.is_empty() { + info!(sl!(), "hook stdout: {}", buffer); + } + } + if let Some(file) = popen.stderr.as_mut() { + let mut buffer = String::new(); + file.read_to_string(&mut buffer).ok(); + if !buffer.is_empty() { + info!(sl!(), "hook stderr: {}", buffer); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs::{self, set_permissions, File, Permissions}; + use std::io::Write; + use std::os::unix::fs::PermissionsExt; + use std::time::Instant; + + fn test_hook_eq(hook1: &oci::Hook, hook2: &oci::Hook, expected: bool) { + let key1 = HookKey::from(hook1); + let key2 = HookKey::from(hook2); + + assert_eq!(key1 == key2, expected); + } + #[test] + fn test_hook_key() { + let hook = oci::Hook { + path: "1".to_string(), + args: vec!["2".to_string(), "3".to_string()], + env: vec![], + timeout: Some(0), + }; + let cases = [ + ( + oci::Hook { + path: "1000".to_string(), + args: vec!["2".to_string(), "3".to_string()], + env: vec![], + timeout: Some(0), + }, + false, + ), + ( + oci::Hook { + path: "1".to_string(), + args: vec!["2".to_string(), "4".to_string()], + env: vec![], + timeout: Some(0), + }, + false, + ), + ( + oci::Hook { + path: "1".to_string(), + args: vec!["2".to_string()], + env: vec![], + timeout: Some(0), + }, + false, + ), + ( + oci::Hook { + path: "1".to_string(), + args: vec!["2".to_string(), "3".to_string()], + env: vec!["5".to_string()], + timeout: Some(0), + }, + false, + ), + ( + oci::Hook { + path: 
"1".to_string(), + args: vec!["2".to_string(), "3".to_string()], + env: vec![], + timeout: Some(6), + }, + false, + ), + ( + oci::Hook { + path: "1".to_string(), + args: vec!["2".to_string(), "3".to_string()], + env: vec![], + timeout: None, + }, + false, + ), + ( + oci::Hook { + path: "1".to_string(), + args: vec!["2".to_string(), "3".to_string()], + env: vec![], + timeout: Some(0), + }, + true, + ), + ]; + + for case in cases.iter() { + test_hook_eq(&hook, &case.0, case.1); + } + } + + #[test] + fn test_execute_hook() { + // test need root permission + if !nix::unistd::getuid().is_root() { + println!("test need root permission"); + return; + } + + let tmpdir = tempfile::tempdir().unwrap(); + let file = tmpdir.path().join("data"); + let file_str = file.to_string_lossy(); + let mut states = HookStates::new(); + + // test case 1: normal + // execute hook + let hook = oci::Hook { + path: "/bin/touch".to_string(), + args: vec!["touch".to_string(), file_str.to_string()], + env: vec![], + timeout: Some(0), + }; + let ret = states.execute_hook(&hook, None); + assert!(ret.is_ok()); + assert!(fs::metadata(&file).is_ok()); + assert!(!states.should_retry()); + + // test case 2: timeout in 10s + let hook = oci::Hook { + path: "/bin/sleep".to_string(), + args: vec!["sleep".to_string(), "3600".to_string()], + env: vec![], + timeout: Some(0), // default timeout is 10 seconds + }; + let start = Instant::now(); + let ret = states.execute_hook(&hook, None).unwrap_err(); + let duration = start.elapsed(); + let used = duration.as_secs(); + assert!((10..12u64).contains(&used)); + assert_eq!(ret.kind(), io::ErrorKind::TimedOut); + assert_eq!(states.get(&hook), HookState::Pending); + assert!(states.should_retry()); + states.remove(&hook); + + // test case 3: timeout in 5s + let hook = oci::Hook { + path: "/bin/sleep".to_string(), + args: vec!["sleep".to_string(), "3600".to_string()], + env: vec![], + timeout: Some(5), // timeout is set to 5 seconds + }; + let start = Instant::now(); + 
let ret = states.execute_hook(&hook, None).unwrap_err(); + let duration = start.elapsed(); + let used = duration.as_secs(); + assert!((5..7u64).contains(&used)); + assert_eq!(ret.kind(), io::ErrorKind::TimedOut); + assert_eq!(states.get(&hook), HookState::Pending); + assert!(states.should_retry()); + states.remove(&hook); + + // test case 4: with envs + let create_shell = |shell_path: &str, data_path: &str| -> Result<()> { + let shell = format!( + r#"#!/bin/sh +echo -n "K1=${{K1}}" > {} +"#, + data_path + ); + let mut output = File::create(shell_path)?; + output.write_all(shell.as_bytes())?; + + // set to executable + let permissions = Permissions::from_mode(0o755); + set_permissions(shell_path, permissions)?; + + Ok(()) + }; + let shell_path = format!("{}/test.sh", tmpdir.path().to_string_lossy()); + let ret = create_shell(&shell_path, file_str.as_ref()); + assert!(ret.is_ok()); + let hook = oci::Hook { + path: shell_path, + args: vec![], + env: vec!["K1=V1".to_string()], + timeout: Some(5), + }; + let ret = states.execute_hook(&hook, None); + assert!(ret.is_ok()); + assert!(!states.should_retry()); + let contents = fs::read_to_string(file); + match contents { + Err(e) => panic!("got error {}", e), + Ok(s) => assert_eq!(s, "K1=V1"), + } + + // test case 5: timeout in 5s with state + let hook = oci::Hook { + path: "/bin/sleep".to_string(), + args: vec!["sleep".to_string(), "3600".to_string()], + env: vec![], + timeout: Some(6), // timeout is set to 5 seconds + }; + let state = oci::State { + version: "".to_string(), + id: "".to_string(), + status: oci::ContainerState::Creating, + pid: 10, + bundle: "nouse".to_string(), + annotations: Default::default(), + }; + let start = Instant::now(); + let ret = states.execute_hook(&hook, Some(state)).unwrap_err(); + let duration = start.elapsed(); + let used = duration.as_secs(); + assert!((6..8u64).contains(&used)); + assert_eq!(ret.kind(), io::ErrorKind::TimedOut); + assert!(states.should_retry()); + } +} diff --git 
a/src/libs/kata-sys-util/src/k8s.rs b/src/libs/kata-sys-util/src/k8s.rs new file mode 100644 index 0000000000..be95d5d330 --- /dev/null +++ b/src/libs/kata-sys-util/src/k8s.rs @@ -0,0 +1,69 @@ +// Copyright (c) 2019-2021 Alibaba Cloud +// Copyright (c) 2019-2021 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! Utilities to support Kubernetes (K8s). +//! +//! This module depends on kubelet internal implementation details, a better way is needed +//! to detect K8S EmptyDir medium type from `oci::spec::Mount` objects. + +use kata_types::mount; +use oci::Spec; + +use crate::mount::get_linux_mount_info; + +pub use kata_types::k8s::is_empty_dir; + +/// Check whether the given path is a kubernetes ephemeral volume. +/// +/// This method depends on a specific path used by k8s to detect if it's type of ephemeral. +/// As of now, this is a very k8s specific solution that works but in future there should be a +/// better way for this method to determine if the path is for ephemeral volume type. +pub fn is_ephemeral_volume(path: &str) -> bool { + if is_empty_dir(path) { + if let Ok(info) = get_linux_mount_info(path) { + if info.fs_type == "tmpfs" { + return true; + } + } + } + + false +} + +/// Check whether the given path is a kubernetes empty-dir volume of medium "default". +/// +/// K8s `EmptyDir` volumes are directories on the host. If the fs type is tmpfs, it's a ephemeral +/// volume instead of a `EmptyDir` volume. +pub fn is_host_empty_dir(path: &str) -> bool { + if is_empty_dir(path) { + if let Ok(info) = get_linux_mount_info(path) { + if info.fs_type != "tmpfs" { + return true; + } + } + } + + false +} + +// set_ephemeral_storage_type sets the mount type to 'ephemeral' +// if the mount source path is provisioned by k8s for ephemeral storage. +// For the given pod ephemeral volume is created only once +// backed by tmpfs inside the VM. For successive containers +// of the same pod the already existing volume is reused. 
+pub fn update_ephemeral_storage_type(oci_spec: &mut Spec) { + for m in oci_spec.mounts.iter_mut() { + if mount::is_kata_guest_mount_volume(&m.r#type) { + continue; + } + + if is_ephemeral_volume(&m.source) { + m.r#type = String::from(mount::KATA_EPHEMERAL_VOLUME_TYPE); + } else if is_host_empty_dir(&m.source) { + m.r#type = String::from(mount::KATA_HOST_DIR_VOLUME_TYPE); + } + } +} diff --git a/src/libs/kata-sys-util/src/lib.rs b/src/libs/kata-sys-util/src/lib.rs new file mode 100644 index 0000000000..2c90adb7c4 --- /dev/null +++ b/src/libs/kata-sys-util/src/lib.rs @@ -0,0 +1,33 @@ +// Copyright (c) 2021 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// + +#[macro_use] +extern crate slog; + +pub mod device; +pub mod fs; +pub mod hooks; +pub mod k8s; +pub mod mount; +pub mod numa; +pub mod rand; +pub mod spec; +pub mod validate; + +// Convenience macro to obtain the scoped logger +#[macro_export] +macro_rules! sl { + () => { + slog_scope::logger() + }; +} + +#[macro_export] +macro_rules! eother { + () => (std::io::Error::new(std::io::ErrorKind::Other, "")); + ($fmt:expr, $($arg:tt)*) => ({ + std::io::Error::new(std::io::ErrorKind::Other, format!($fmt, $($arg)*)) + }) +} diff --git a/src/libs/kata-sys-util/src/mount.rs b/src/libs/kata-sys-util/src/mount.rs new file mode 100644 index 0000000000..2bc8c07a5a --- /dev/null +++ b/src/libs/kata-sys-util/src/mount.rs @@ -0,0 +1,1089 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2021 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! Utilities and helpers to execute mount operations on Linux systems. +//! +//! These utilities and helpers are specially designed and implemented to support container runtimes +//! on Linux systems, so they may not be generic enough. +//! +//! # Quotation from [mount(2)](https://man7.org/linux/man-pages/man2/mount.2.html) +//! +//! A call to mount() performs one of a number of general types of operation, depending on the bits +//! 
specified in mountflags. The choice of which operation to perform is determined by testing the +//! bits set in mountflags, with the tests being conducted in the order listed here: +//! - Remount an existing mount: mountflags includes MS_REMOUNT. +//! - Create a bind mount: mountflags includes MS_BIND. +//! - Change the propagation type of an existing mount: mountflags includes one of MS_SHARED, +//! MS_PRIVATE, MS_SLAVE, or MS_UNBINDABLE. +//! - Move an existing mount to a new location: mountflags includes MS_MOVE. +//! - Create a new mount: mountflags includes none of the above flags. +//! +//! Since Linux 2.6.26, the MS_REMOUNT flag can be used with MS_BIND to modify only the +//! per-mount-point flags. This is particularly useful for setting or clearing the "read-only" +//! flag on a mount without changing the underlying filesystem. Specifying mountflags as: +//! MS_REMOUNT | MS_BIND | MS_RDONLY +//! will make access through this mountpoint read-only, without affecting other mounts. +//! +//! # Safety +//! +//! Mount related operations are sensitive to security flaws, especially when dealing with symlinks. +//! There are several CVEs related to file path handling, for example +//! [CVE-2021-30465](https://github.com/opencontainers/runc/security/advisories/GHSA-c3xm-pvg7-gh7r). +//! +//! So some design rules are adopted here: +//! - all mount variants (`bind_remount_read_only()`, `bind_mount()`, `Mounter::mount()`) assume +//! that all received paths are safe. +//! - the caller must ensure safe version of `PathBuf` are passed to mount variants. +//! - `create_mount_destination()` may be used to generated safe `PathBuf` for mount destinations. +//! - the `safe_path` crate should be used to generate safe `PathBuf` for general cases. 
+ +use std::fmt::Debug; +use std::fs; +use std::io::{self, BufRead}; +use std::os::raw::c_char; +use std::os::unix::ffi::OsStrExt; +use std::os::unix::fs::{DirBuilderExt, OpenOptionsExt}; +use std::os::unix::io::AsRawFd; +use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; +use std::time::Instant; + +use lazy_static::lazy_static; +use nix::mount::{mount, MntFlags, MsFlags}; +use nix::{unistd, NixPath}; + +use crate::fs::is_symlink; +use crate::sl; + +/// Default permission for directories created for mountpoint. +const MOUNT_PERM: u32 = 0o755; + +const PROC_MOUNTS_FILE: &str = "/proc/mounts"; +const PROC_FIELDS_PER_LINE: usize = 6; +const PROC_DEVICE_INDEX: usize = 0; +const PROC_PATH_INDEX: usize = 1; +const PROC_TYPE_INDEX: usize = 2; + +lazy_static! { + static ref MAX_MOUNT_PARAM_SIZE: usize = + if let Ok(Some(v)) = unistd::sysconf(unistd::SysconfVar::PAGE_SIZE) { + v as usize + } else { + panic!("cannot get PAGE_SIZE by sysconf()"); + }; + +// Propagation flags for mounting container volumes. + static ref PROPAGATION_FLAGS: MsFlags = + MsFlags::MS_SHARED | MsFlags::MS_PRIVATE | MsFlags::MS_SLAVE | MsFlags::MS_UNBINDABLE; + +} + +/// Errors related to filesystem mount operations. 
+#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("Can not bind mount {0} to {1}: {2}")] + BindMount(PathBuf, PathBuf, nix::Error), + #[error("Failure injection: {0}")] + FailureInject(String), + #[error(transparent)] + Io(#[from] std::io::Error), + #[error("Invalid mountpoint entry (expected {0} fields, got {1}) fields: {2}")] + InvalidMountEntry(usize, usize, String), + #[error("Invalid mount option: {0}")] + InvalidMountOption(String), + #[error("Invalid path: {0}")] + InvalidPath(PathBuf), + #[error("Failure in waiting for thread: {0}")] + Join(String), + #[error("Can not mount {0} to {1}: {2}")] + Mount(PathBuf, PathBuf, nix::Error), + #[error("Mount option exceeds 4K size")] + MountOptionTooBig, + #[error("Path for mountpoint is null")] + NullMountPointPath, + #[error("Faile to open file {0} by path, {1}")] + OpenByPath(PathBuf, io::Error), + #[error("Can not read metadata of {0}, {1}")] + ReadMetadata(PathBuf, io::Error), + #[error("Can not remount {0}: {1}")] + Remount(PathBuf, nix::Error), + #[error("Can not find mountpoint for {0}")] + NoMountEntry(String), + #[error("Can not umount {0}, {1}")] + Umount(PathBuf, io::Error), +} + +/// A specialized version of `std::result::Result` for mount operations. +pub type Result = std::result::Result; + +/// Information of mount record from `/proc/mounts`. +pub struct LinuxMountInfo { + /// Source of mount, first field of records from `/proc/mounts`. + pub device: String, + /// Destination of mount, second field of records from `/proc/mounts`. + pub path: String, + /// Filesystem type of mount, third field of records from `/proc/mounts`. + pub fs_type: String, +} + +/// Get the device and file system type of a mount point by parsing `/proc/mounts`. 
+pub fn get_linux_mount_info(mount_point: &str) -> Result { + let mount_file = fs::File::open(PROC_MOUNTS_FILE)?; + let lines = io::BufReader::new(mount_file).lines(); + + for mount in lines.flatten() { + let fields: Vec<&str> = mount.split(' ').collect(); + + if fields.len() != PROC_FIELDS_PER_LINE { + return Err(Error::InvalidMountEntry( + PROC_FIELDS_PER_LINE, + fields.len(), + mount, + )); + } + + if mount_point == fields[PROC_PATH_INDEX] { + return Ok(LinuxMountInfo { + device: fields[PROC_DEVICE_INDEX].to_string(), + path: fields[PROC_PATH_INDEX].to_string(), + fs_type: fields[PROC_TYPE_INDEX].to_string(), + }); + } + } + + Err(Error::NoMountEntry(mount_point.to_owned())) +} + +/// Recursively create destination for a mount. +/// +/// For a normal mount, the destination will always be a directory. For bind mount, the destination +/// must be a directory if the source is a directory, otherwise the destination must be a normal +/// file. If directories are created, their permissions are initialized to MountPerm. +/// +/// # Safety +/// +/// Every container has a root filesystems `rootfs`. When creating bind mounts for a container, +/// the destination should always be within the container's `rootfs`. Otherwise it's a serious +/// security flaw for container to read/override host side filesystem contents. Please refer to +/// following CVEs for example: +/// - [CVE-2021-30465](https://github.com/opencontainers/runc/security/advisories/GHSA-c3xm-pvg7-gh7r) +/// +/// To ensure security, the `create_mount_destination()` function takes an extra parameter `root`, +/// which is used to ensure that `dst` is within the specified directory. And a safe version of +/// `PathBuf` is returned to avoid TOCTTOU type of flaws. 
+pub fn create_mount_destination, D: AsRef, R: AsRef>( + src: S, + dst: D, + _root: R, + fs_type: &str, +) -> Result + Debug> { + // TODO: https://github.com/kata-containers/kata-containers/issues/3473 + let dst = dst.as_ref(); + let parent = dst + .parent() + .ok_or_else(|| Error::InvalidPath(dst.to_path_buf()))?; + let mut builder = fs::DirBuilder::new(); + builder.mode(MOUNT_PERM).recursive(true).create(parent)?; + + if fs_type == "bind" { + // The source and destination for bind mounting must be the same type: file or directory. + if !src.as_ref().is_dir() { + fs::OpenOptions::new() + .mode(MOUNT_PERM) + .write(true) + .create(true) + .open(dst)?; + return Ok(dst.to_path_buf()); + } + } + + if let Err(e) = builder.create(dst) { + if e.kind() != std::io::ErrorKind::AlreadyExists { + return Err(e.into()); + } + } + if !dst.is_dir() { + Err(Error::InvalidPath(dst.to_path_buf())) + } else { + Ok(dst.to_path_buf()) + } +} + +/// Remount a bind mount into readonly mode. +/// +/// # Safety +/// Caller needs to ensure safety of the `dst` to avoid possible file path based attacks. +pub fn bind_remount_read_only>(dst: P) -> Result<()> { + let dst = dst.as_ref(); + if dst.is_empty() { + return Err(Error::NullMountPointPath); + } + let dst = dst + .canonicalize() + .map_err(|_e| Error::InvalidPath(dst.to_path_buf()))?; + + do_rebind_mount_read_only(dst, MsFlags::empty()) +} + +/// Bind mount `src` to `dst` in slave mode, optionally in readonly mode if `readonly` is true. +/// +/// # Safety +/// Caller needs to ensure: +/// - `src` exists. +/// - `dst` exists, and is suitable as destination for bind mount. +/// - `dst` is free of file path based attacks. 
+pub fn bind_mount_unchecked, D: AsRef>( + src: S, + dst: D, + read_only: bool, +) -> Result<()> { + fail::fail_point!("bind_mount", |_| { + Err(Error::FailureInject( + "Bind mount fail point injection".to_string(), + )) + }); + + let src = src.as_ref(); + let dst = dst.as_ref(); + if src.is_empty() { + return Err(Error::NullMountPointPath); + } + if dst.is_empty() { + return Err(Error::NullMountPointPath); + } + let abs_src = src + .canonicalize() + .map_err(|_e| Error::InvalidPath(src.to_path_buf()))?; + + create_mount_destination(src, dst, "/", "bind")?; + // Bind mount `src` to `dst`. + mount( + Some(&abs_src), + dst, + Some("bind"), + MsFlags::MS_BIND, + Some(""), + ) + .map_err(|e| Error::BindMount(abs_src, dst.to_path_buf(), e))?; + + // Change into slave propagation mode. + mount(Some(""), dst, Some(""), MsFlags::MS_SLAVE, Some("")) + .map_err(|e| Error::Mount(PathBuf::new(), dst.to_path_buf(), e))?; + + // Optionally rebind into readonly mode. + if read_only { + do_rebind_mount_read_only(dst, MsFlags::empty())?; + } + + Ok(()) +} + +/// Trait to mount a `kata_types::mount::Mount`. +pub trait Mounter { + /// Mount to the specified `target`. + /// + /// # Safety + /// Caller needs to ensure: + /// - `target` exists, and is suitable as destination for mount. + /// - `target` is free of file path based attacks. + fn mount>(&self, target: P) -> Result<()>; +} + +impl Mounter for kata_types::mount::Mount { + // This function is modelled after + // [Mount::Mount()](https://github.com/containerd/containerd/blob/main/mount/mount_linux.go) + // from [Containerd](https://github.com/containerd/containerd) project. 
+ fn mount>(&self, target: P) -> Result<()> { + fail::fail_point!("Mount::mount", |_| { + Err(Error::FailureInject( + "Mount::mount() fail point injection".to_string(), + )) + }); + + let target = target.as_ref().to_path_buf(); + let (chdir, (flags, data)) = + // Follow the same algorithm as Containerd: reserve 512 bytes to avoid hitting one page + // limit of mounting argument buffer. + if self.fs_type == "overlay" && self.option_size() >= *MAX_MOUNT_PARAM_SIZE - 512 { + info!( + sl!(), + "overlay mount option too long, maybe failed to mount" + ); + let (chdir, options) = compact_lowerdir_option(&self.options); + (chdir, parse_mount_options(&options)?) + } else { + (None, parse_mount_options(&self.options)?) + }; + + // Ensure propagation type change flags aren't included in other calls. + let o_flag = flags & (!*PROPAGATION_FLAGS); + + // - Normal mount without MS_REMOUNT flag + // - In the case of remounting with changed data (data != ""), need to call mount + if (flags & MsFlags::MS_REMOUNT) == MsFlags::empty() || !data.is_empty() { + mount_at( + chdir, + &self.source, + target.clone(), + &self.fs_type, + o_flag, + &data, + )?; + } + + // Change mount propagation type. + if (flags & *PROPAGATION_FLAGS) != MsFlags::empty() { + let propagation_flag = *PROPAGATION_FLAGS | MsFlags::MS_REC | MsFlags::MS_SILENT; + debug!( + sl!(), + "Change mount propagation flags to: 0x{:x}", + propagation_flag.bits() + ); + mount( + Some(""), + &target, + Some(""), + flags & propagation_flag, + Some(""), + ) + .map_err(|e| Error::Mount(PathBuf::new(), target.clone(), e))?; + } + + // Bind mount readonly. 
+ let bro_flag = MsFlags::MS_BIND | MsFlags::MS_RDONLY; + if (o_flag & bro_flag) == bro_flag { + do_rebind_mount_read_only(target, o_flag)?; + } + + Ok(()) + } +} + +#[inline] +fn do_rebind_mount_read_only>(path: P, flags: MsFlags) -> Result<()> { + mount( + Some(""), + path.as_ref(), + Some(""), + flags | MsFlags::MS_BIND | MsFlags::MS_REMOUNT | MsFlags::MS_RDONLY, + Some(""), + ) + .map_err(|e| Error::Remount(path.as_ref().to_path_buf(), e)) +} + +/// Take fstab style mount options and parses them for use with a standard mount() syscall. +fn parse_mount_options(options: &[String]) -> Result<(MsFlags, String)> { + let mut flags: MsFlags = MsFlags::empty(); + let mut data: Vec = Vec::new(); + + for opt in options.iter() { + if opt == "defaults" { + continue; + } else if opt == "loop" { + return Err(Error::InvalidMountOption("loop".to_string())); + } else if let Some(v) = parse_mount_flags(flags, opt) { + flags = v; + } else { + data.push(opt.clone()); + } + } + + let data = data.join(","); + if data.len() > *MAX_MOUNT_PARAM_SIZE { + return Err(Error::MountOptionTooBig); + } + + Ok((flags, data)) +} + +fn parse_mount_flags(mut flags: MsFlags, flag_str: &str) -> Option { + // Following mount options are applicable to fstab only. + // - _netdev: The filesystem resides on a device that requires network access (used to prevent + // the system from attempting to mount these filesystems until the network has been enabled + // on the system). + // - auto: Can be mounted with the -a option. + // - group: Allow an ordinary user to mount the filesystem if one of that user’s groups matches + // the group of the device. This option implies the options nosuid and nodev (unless + // overridden by subsequent options, as in the option line group,dev,suid). + // - noauto: Can only be mounted explicitly (i.e., the -a option will not cause the filesystem + // to be mounted). + // - nofail: Do not report errors for this device if it does not exist. 
+ // - owner: Allow an ordinary user to mount the filesystem if that user is the owner of the + // device. This option implies the options nosuid and nodev (unless overridden by subsequent + // options, as in the option line owner,dev,suid). + // - user: Allow an ordinary user to mount the filesystem. The name of the mounting user is + // written to the mtab file (or to the private libmount file in /run/mount on systems without + // a regular mtab) so that this same user can unmount the filesystem again. This option + // implies the options noexec, nosuid, and nodev (unless overridden by subsequent options, + // as in the option line user,exec,dev,suid). + // - nouser: Forbid an ordinary user to mount the filesystem. This is the default; it does not + // imply any other options. + // - users: Allow any user to mount and to unmount the filesystem, even when some other ordinary + // user mounted it. This option implies the options noexec, nosuid, and nodev (unless + // overridden by subsequent options, as in the option line users,exec,dev,suid). 
+ match flag_str { + // Clear flags + "async" => flags &= !MsFlags::MS_SYNCHRONOUS, + "atime" => flags &= !MsFlags::MS_NOATIME, + "dev" => flags &= !MsFlags::MS_NODEV, + "diratime" => flags &= !MsFlags::MS_NODIRATIME, + "exec" => flags &= !MsFlags::MS_NOEXEC, + "loud" => flags &= !MsFlags::MS_SILENT, + "noiversion" => flags &= !MsFlags::MS_I_VERSION, + "nomand" => flags &= !MsFlags::MS_MANDLOCK, + "norelatime" => flags &= !MsFlags::MS_RELATIME, + "nostrictatime" => flags &= !MsFlags::MS_STRICTATIME, + "rw" => flags &= !MsFlags::MS_RDONLY, + "suid" => flags &= !MsFlags::MS_NOSUID, + // Set flags + "bind" => flags |= MsFlags::MS_BIND, + "dirsync" => flags |= MsFlags::MS_DIRSYNC, + "iversion" => flags |= MsFlags::MS_I_VERSION, + "mand" => flags |= MsFlags::MS_MANDLOCK, + "noatime" => flags |= MsFlags::MS_NOATIME, + "nodev" => flags |= MsFlags::MS_NODEV, + "nodiratime" => flags |= MsFlags::MS_NODIRATIME, + "noexec" => flags |= MsFlags::MS_NOEXEC, + "nosuid" => flags |= MsFlags::MS_NOSUID, + "rbind" => flags |= MsFlags::MS_BIND | MsFlags::MS_REC, + "relatime" => flags |= MsFlags::MS_RELATIME, + "remount" => flags |= MsFlags::MS_REMOUNT, + "ro" => flags |= MsFlags::MS_RDONLY, + "silent" => flags |= MsFlags::MS_SILENT, + "strictatime" => flags |= MsFlags::MS_STRICTATIME, + "sync" => flags |= MsFlags::MS_SYNCHRONOUS, + flag_str => { + warn!(sl!(), "BUG: unknown mount flag: {:?}", flag_str); + return None; + } + } + Some(flags) +} + +// Do mount, optionally change current working directory if `chdir` is not empty. 
+fn mount_at>( + chdir: Option, + source: P, + target: PathBuf, + fstype: &str, + flags: MsFlags, + data: &str, +) -> Result<()> { + let chdir = match chdir { + Some(v) => v, + None => { + return mount( + Some(source.as_ref()), + &target, + Some(fstype), + flags, + Some(data), + ) + .map_err(|e| Error::Mount(PathBuf::new(), target, e)); + } + }; + + info!( + sl!(), + "mount_at: chdir {}, source {}, target {} , fstype {}, data {}", + chdir.display(), + source.as_ref().display(), + target.display(), + fstype, + data + ); + + // TODO: https://github.com/kata-containers/kata-containers/issues/3473 + let o_flags = nix::fcntl::OFlag::O_PATH | nix::fcntl::OFlag::O_CLOEXEC; + let file = fs::OpenOptions::new() + .read(true) + .custom_flags(o_flags.bits()) + .open(&chdir) + .map_err(|e| Error::OpenByPath(chdir.to_path_buf(), e))?; + match file.metadata() { + Ok(md) => { + if !md.is_dir() { + return Err(Error::InvalidPath(chdir)); + } + } + Err(e) => return Err(Error::ReadMetadata(chdir, e)), + } + + let cwd = unistd::getcwd().map_err(|e| Error::Io(io::Error::from_raw_os_error(e as i32)))?; + let src = source.as_ref().to_path_buf(); + let tgt = target.clone(); + let ftype = String::from(fstype); + let d = String::from(data); + let rx = Arc::new(AtomicBool::new(false)); + let tx = rx.clone(); + + // A working thread is spawned to ease error handling. 
+ let child = std::thread::Builder::new() + .name("async_mount".to_string()) + .spawn(move || { + match unistd::fchdir(file.as_raw_fd()) { + Ok(_) => info!(sl!(), "chdir from {} to {}", cwd.display(), chdir.display()), + Err(e) => { + error!( + sl!(), + "failed to chdir from {} to {} error {:?}", + cwd.display(), + chdir.display(), + e + ); + return; + } + } + match mount( + Some(src.as_path()), + &tgt, + Some(ftype.as_str()), + flags, + Some(d.as_str()), + ) { + Ok(_) => tx.store(true, Ordering::Release), + Err(e) => error!(sl!(), "failed to mount in chdir {}: {}", chdir.display(), e), + } + match unistd::chdir(&cwd) { + Ok(_) => info!(sl!(), "chdir from {} to {}", chdir.display(), cwd.display()), + Err(e) => { + error!( + sl!(), + "failed to chdir from {} to {} error {:?}", + chdir.display(), + cwd.display(), + e + ); + } + } + })?; + child.join().map_err(|e| Error::Join(format!("{:?}", e)))?; + + if !rx.load(Ordering::Acquire) { + Err(Error::Mount( + source.as_ref().to_path_buf(), + target, + nix::Error::EIO, + )) + } else { + Ok(()) + } +} + +/// When the size of mount options is bigger than one page, try to reduce the size by compressing +/// the `lowerdir` option for overlayfs. The assumption is that lower directories for overlayfs +/// often have a common prefix. 
+fn compact_lowerdir_option(opts: &[String]) -> (Option, Vec) { + let mut n_opts = opts.to_vec(); + // No need to compact if there is no overlay or only one lowerdir + let (idx, lower_opts) = match find_overlay_lowerdirs(opts) { + None => return (None, n_opts), + Some(v) => { + if v.1.len() <= 1 { + return (None, n_opts); + } + v + } + }; + + let idx = idx as usize; + let common_dir = match get_longest_common_prefix(&lower_opts) { + None => return (None, n_opts), + Some(v) => { + if v.is_absolute() && v.parent().is_none() { + return (None, n_opts); + } + v + } + }; + let common_prefix = match common_dir.as_os_str().to_str() { + None => return (None, n_opts), + Some(v) => { + let mut p = v.to_string(); + p.push('/'); + p + } + }; + + info!( + sl!(), + "compact_lowerdir_option get common prefix: {}", + common_dir.display() + ); + let lower: Vec = lower_opts + .iter() + .map(|c| c.replace(&common_prefix, "")) + .collect(); + n_opts[idx as usize] = format!("lowerdir={}", lower.join(":")); + + (Some(common_dir), n_opts) +} + +fn find_overlay_lowerdirs(opts: &[String]) -> Option<(usize, Vec)> { + for (idx, o) in opts.iter().enumerate() { + if let Some(lower) = o.strip_prefix("lowerdir=") { + if !lower.is_empty() { + let c_opts: Vec = lower.split(':').map(|c| c.to_string()).collect(); + return Some((idx, c_opts)); + } + } + } + + None +} + +fn get_longest_common_prefix(opts: &[String]) -> Option { + if opts.is_empty() { + return None; + } + + let mut paths = Vec::with_capacity(opts.len()); + for opt in opts.iter() { + match Path::new(opt).parent() { + None => return None, + Some(v) => paths.push(v), + } + } + + let mut path = PathBuf::new(); + paths.sort_unstable(); + for (first, last) in paths[0] + .components() + .zip(paths[paths.len() - 1].components()) + { + if first != last { + break; + } + path.push(first); + } + + Some(path) +} + +/// Umount a mountpoint with timeout. 
+/// +/// # Safety +/// Caller needs to ensure safety of the `path` to avoid possible file path based attacks. +pub fn umount_timeout>(path: P, timeout: u64) -> Result<()> { + // Protect from symlink based attacks, please refer to: + // https://github.com/kata-containers/runtime/issues/2474 + // For Kata specific, we do extra protection for parent directory too. + let path = path.as_ref(); + let parent = path + .parent() + .ok_or_else(|| Error::InvalidPath(path.to_path_buf()))?; + // TODO: https://github.com/kata-containers/kata-containers/issues/3473 + if is_symlink(path).map_err(|e| Error::ReadMetadata(path.to_owned(), e))? + || is_symlink(parent).map_err(|e| Error::ReadMetadata(path.to_owned(), e))? + { + warn!( + sl!(), + "unable to umount {} which is a symbol link", + path.display() + ); + return Ok(()); + } + + if timeout == 0 { + // Lazy unmounting the mountpoint with the MNT_DETACH flag. + umount2(path, true).map_err(|e| Error::Umount(path.to_owned(), e))?; + info!(sl!(), "lazy umount for {}", path.display()); + } else { + let start_time = Instant::now(); + while let Err(e) = umount2(path, false) { + match e.kind() { + // The mountpoint has been concurrently unmounted by other threads. + io::ErrorKind::InvalidInput => break, + io::ErrorKind::WouldBlock => { + let time_now = Instant::now(); + if time_now.duration_since(start_time).as_millis() > timeout as u128 { + warn!(sl!(), + "failed to umount {} in {} ms because of EBUSY, try again with lazy umount", + path.display(), + Instant::now().duration_since(start_time).as_millis()); + return umount2(path, true).map_err(|e| Error::Umount(path.to_owned(), e)); + } + } + _ => return Err(Error::Umount(path.to_owned(), e)), + } + } + + info!( + sl!(), + "umount {} in {} ms", + path.display(), + Instant::now().duration_since(start_time).as_millis() + ); + } + + Ok(()) +} + +/// Umount all filesystems mounted at the `mountpoint`. +/// +/// If `mountpoint` is empty or doesn't exist, `umount_all()` is a noop. 
Otherwise it will try to +/// unmount all filesystems mounted at `mountpoint` repeatedly. For example: +/// - bind mount /dev/sda to /tmp/mnt +/// - bind mount /tmp/b to /tmp/mnt +/// - umount_all("tmp/mnt") will umount both /tmp/b and /dev/sda +/// +/// # Safety +/// Caller needs to ensure safety of the `path` to avoid possible file path based attacks. +pub fn umount_all>(mountpoint: P, lazy_umount: bool) -> Result<()> { + if mountpoint.as_ref().is_empty() || !mountpoint.as_ref().exists() { + return Ok(()); + } + + loop { + if let Err(e) = umount2(mountpoint.as_ref(), lazy_umount) { + // EINVAL is returned if the target is not a mount point, indicating that we are + // done. It can also indicate a few other things (such as invalid flags) which we + // unfortunately end up squelching here too. + if e.kind() == io::ErrorKind::InvalidInput { + break; + } else { + return Err(Error::Umount(mountpoint.as_ref().to_path_buf(), e)); + } + } + } + + Ok(()) +} + +// Counterpart of nix::umount2, with support of `UMOUNT_FOLLOW`. +fn umount2>(path: P, lazy_umount: bool) -> std::io::Result<()> { + let path_ptr = path.as_ref().as_os_str().as_bytes().as_ptr() as *const c_char; + let mut flags = MntFlags::UMOUNT_NOFOLLOW.bits(); + if lazy_umount { + flags |= MntFlags::MNT_DETACH.bits(); + } + + // Safe because parameter is valid and we have checked the reuslt. 
+ if unsafe { libc::umount2(path_ptr, flags) } < 0 { + Err(io::Error::last_os_error()) + } else { + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::path::PathBuf; + + #[test] + fn test_get_linux_mount_info() { + let info = get_linux_mount_info("/sys/fs/cgroup").unwrap(); + + assert_eq!(&info.device, "tmpfs"); + assert_eq!(&info.fs_type, "tmpfs"); + assert_eq!(&info.path, "/sys/fs/cgroup"); + + assert!(matches!( + get_linux_mount_info(""), + Err(Error::NoMountEntry(_)) + )); + assert!(matches!( + get_linux_mount_info("/sys/fs/cgroup/do_not_exist/____hi"), + Err(Error::NoMountEntry(_)) + )); + } + + #[test] + fn test_create_mount_destination() { + let tmpdir = tempfile::tempdir().unwrap(); + let src = Path::new("/proc/mounts"); + let mut dst = tmpdir.path().to_owned(); + dst.push("proc"); + dst.push("mounts"); + let dst = create_mount_destination(src, dst.as_path(), tmpdir.path(), "bind").unwrap(); + let abs_dst = dst.as_ref().canonicalize().unwrap(); + assert!(abs_dst.is_file()); + + let dst = Path::new("/"); + assert!(matches!( + create_mount_destination(src, dst, "/", "bind"), + Err(Error::InvalidPath(_)) + )); + + let src = Path::new("/proc"); + let dst = Path::new("/proc/mounts"); + assert!(matches!( + create_mount_destination(src, dst, "/", "bind"), + Err(Error::InvalidPath(_)) + )); + } + + #[test] + #[ignore] + fn test_bind_remount_read_only() { + let tmpdir = tempfile::tempdir().unwrap(); + let tmpdir2 = tempfile::tempdir().unwrap(); + + assert!(matches!( + bind_remount_read_only(&PathBuf::from("")), + Err(Error::NullMountPointPath) + )); + assert!(matches!( + bind_remount_read_only(&PathBuf::from("../______doesn't____exist____nnn")), + Err(Error::InvalidPath(_)) + )); + + bind_mount_unchecked(tmpdir2.path(), tmpdir.path(), true).unwrap(); + bind_remount_read_only(tmpdir.path()).unwrap(); + umount_timeout(tmpdir.path().to_str().unwrap(), 0).unwrap(); + } + + #[test] + #[ignore] + fn test_bind_mount() { + let tmpdir = 
tempfile::tempdir().unwrap(); + let tmpdir2 = tempfile::tempdir().unwrap(); + let mut src = tmpdir.path().to_owned(); + src.push("src"); + let mut dst = tmpdir.path().to_owned(); + dst.push("src"); + + assert!(matches!( + bind_mount_unchecked(Path::new(""), Path::new(""), false), + Err(Error::NullMountPointPath) + )); + assert!(matches!( + bind_mount_unchecked(tmpdir2.path(), Path::new(""), false), + Err(Error::NullMountPointPath) + )); + assert!(matches!( + bind_mount_unchecked( + Path::new("/_does_not_exist_/___aahhhh"), + Path::new("/tmp/_does_not_exist/___bbb"), + false + ), + Err(Error::InvalidPath(_)) + )); + + let dst = create_mount_destination(tmpdir2.path(), &dst, tmpdir.path(), "bind").unwrap(); + bind_mount_unchecked(tmpdir2.path(), dst.as_ref(), true).unwrap(); + bind_mount_unchecked(&src, dst.as_ref(), false).unwrap(); + umount_all(dst.as_ref(), false).unwrap(); + + let mut src = tmpdir.path().to_owned(); + src.push("file"); + fs::write(&src, "test").unwrap(); + let mut dst = tmpdir.path().to_owned(); + dst.push("file"); + let dst = create_mount_destination(&src, &dst, tmpdir.path(), "bind").unwrap(); + bind_mount_unchecked(&src, dst.as_ref(), false).unwrap(); + assert!(dst.as_ref().is_file()); + umount_timeout(dst.as_ref(), 0).unwrap(); + } + + #[test] + fn test_compact_overlay_lowerdirs() { + let options = vec![ + "workdir=/a/b/c/xxxx/workdir".to_string(), + "upperdir=/a/b/c/xxxx/upper".to_string(), + "lowerdir=/a/b/c/xxxx/1l:/a/b/c/xxxx/2l:/a/b/c/xxxx/3l:/a/b/c/xxxx/4l".to_string(), + ]; + let (prefix, n_options) = compact_lowerdir_option(&options); + assert_eq!(&prefix.unwrap(), Path::new("/a/b/c/xxxx/")); + assert_eq!(n_options.len(), 3); + assert_eq!(n_options[2], "lowerdir=1l:2l:3l:4l"); + + let options = vec![ + "workdir=/a/b/c/xxxx/workdir".to_string(), + "upperdir=/a/b/c/xxxx/upper".to_string(), + "lowerdir=/a/b/c/xxxx:/a/b/c/xxxx/2l:/a/b/c/xxxx/3l:/a/b/c/xxxx/4l".to_string(), + ]; + let (prefix, n_options) = 
compact_lowerdir_option(&options); + assert_eq!(&prefix.unwrap(), Path::new("/a/b/c/")); + assert_eq!(n_options.len(), 3); + assert_eq!(n_options[2], "lowerdir=xxxx:xxxx/2l:xxxx/3l:xxxx/4l"); + + let options = vec![ + "workdir=/a/b/c/xxxx/workdir".to_string(), + "upperdir=/a/b/c/xxxx/upper".to_string(), + "lowerdir=/1l:/2l:/3l:/4l".to_string(), + ]; + let (prefix, n_options) = compact_lowerdir_option(&options); + assert!(prefix.is_none()); + assert_eq!(n_options, options); + + let options = vec![ + "workdir=/a/b/c/xxxx/workdir".to_string(), + "upperdir=/a/b/c/xxxx/upper".to_string(), + ]; + let (prefix, n_options) = compact_lowerdir_option(&options); + assert!(prefix.is_none()); + assert_eq!(n_options, options); + + let options = vec![ + "workdir=/a/b/c/xxxx/workdir".to_string(), + "lowerdir=".to_string(), + "upperdir=/a/b/c/xxxx/upper".to_string(), + ]; + let (prefix, n_options) = compact_lowerdir_option(&options); + assert!(prefix.is_none()); + assert_eq!(n_options, options); + } + + #[test] + fn test_find_overlay_lowerdirs() { + let options = vec![ + "workdir=/a/b/c/xxxx/workdir".to_string(), + "upperdir=/a/b/c/xxxx/upper".to_string(), + "lowerdir=/a/b/c/xxxx/1l:/a/b/c/xxxx/2l:/a/b/c/xxxx/3l:/a/b/c/xxxx/4l".to_string(), + ]; + let lower_expect = vec![ + "/a/b/c/xxxx/1l".to_string(), + "/a/b/c/xxxx/2l".to_string(), + "/a/b/c/xxxx/3l".to_string(), + "/a/b/c/xxxx/4l".to_string(), + ]; + + let (idx, lower) = find_overlay_lowerdirs(&options).unwrap(); + assert_eq!(idx, 2); + assert_eq!(lower, lower_expect); + + let common_prefix = get_longest_common_prefix(&lower).unwrap(); + assert_eq!(Path::new("/a/b/c/xxxx/"), &common_prefix); + + let options = vec![ + "workdir=/a/b/c/xxxx/workdir".to_string(), + "upperdir=/a/b/c/xxxx/upper".to_string(), + ]; + let v = find_overlay_lowerdirs(&options); + assert!(v.is_none()); + + let options = vec![ + "workdir=/a/b/c/xxxx/workdir".to_string(), + "lowerdir=".to_string(), + "upperdir=/a/b/c/xxxx/upper".to_string(), + ]; + 
find_overlay_lowerdirs(&options); + assert!(v.is_none()); + } + + #[test] + fn test_get_common_prefix() { + let lower1 = vec![ + "/a/b/c/xxxx/1l/fs".to_string(), + "/a/b/c/////xxxx/11l/fs".to_string(), + "/a/b/c/././xxxx/13l/fs".to_string(), + "/a/b/c/.////xxxx/14l/fs".to_string(), + ]; + let common_prefix = get_longest_common_prefix(&lower1).unwrap(); + assert_eq!(Path::new("/a/b/c/xxxx/"), &common_prefix); + + let lower2 = vec![ + "/fs".to_string(), + "/s".to_string(), + "/sa".to_string(), + "/s".to_string(), + ]; + let common_prefix = get_longest_common_prefix(&lower2).unwrap(); + assert_eq!(Path::new("/"), &common_prefix); + + let lower3 = vec!["".to_string(), "".to_string()]; + let common_prefix = get_longest_common_prefix(&lower3); + assert!(common_prefix.is_none()); + + let lower = vec!["/".to_string(), "/".to_string()]; + let common_prefix = get_longest_common_prefix(&lower); + assert!(common_prefix.is_none()); + + let lower = vec![ + "/a/b/c".to_string(), + "/a/b/c/d".to_string(), + "/a/b///c".to_string(), + ]; + let common_prefix = get_longest_common_prefix(&lower).unwrap(); + assert_eq!(Path::new("/a/b"), &common_prefix); + + let lower = vec!["a/b/c/e".to_string(), "a/b/c/d".to_string()]; + let common_prefix = get_longest_common_prefix(&lower).unwrap(); + assert_eq!(Path::new("a/b/c"), &common_prefix); + + let lower = vec!["a/b/c".to_string(), "a/b/c/d".to_string()]; + let common_prefix = get_longest_common_prefix(&lower).unwrap(); + assert_eq!(Path::new("a/b"), &common_prefix); + + let lower = vec!["/test".to_string()]; + let common_prefix = get_longest_common_prefix(&lower).unwrap(); + assert_eq!(Path::new("/"), &common_prefix); + + let lower = vec![]; + let common_prefix = get_longest_common_prefix(&lower); + assert!(&common_prefix.is_none()); + } + + #[test] + fn test_parse_mount_options() { + let options = vec![]; + let (flags, data) = parse_mount_options(&options).unwrap(); + assert!(flags.is_empty()); + assert!(data.is_empty()); + + let mut 
options = vec![ + "dev".to_string(), + "ro".to_string(), + "defaults".to_string(), + "data-option".to_string(), + ]; + let (flags, data) = parse_mount_options(&options).unwrap(); + assert_eq!(flags, MsFlags::MS_RDONLY); + assert_eq!(&data, "data-option"); + + options.push("loop".to_string()); + assert!(parse_mount_options(&options).is_err()); + + let idx = options.len() - 1; + options[idx] = " ".repeat(4097); + assert!(parse_mount_options(&options).is_err()); + } + + #[test] + #[ignore] + fn test_mount_at() { + let tmpdir = tempfile::tempdir().unwrap(); + let path = tmpdir.path().to_path_buf(); + mount_at( + Some(path.clone()), + "/___does_not_exist____a___", + PathBuf::from("/tmp/etc/host.conf"), + "", + MsFlags::empty(), + "", + ) + .unwrap_err(); + + mount_at( + Some(PathBuf::from("/___does_not_exist____a___")), + "/etc/host.conf", + PathBuf::from("/tmp/etc/host.conf"), + "", + MsFlags::empty(), + "", + ) + .unwrap_err(); + + let src = path.join("src"); + fs::write(&src, "test").unwrap(); + let dst = path.join("dst"); + fs::write(&dst, "test1").unwrap(); + mount_at( + Some(path), + "src", + PathBuf::from("dst"), + "bind", + MsFlags::MS_BIND, + "", + ) + .unwrap(); + let content = fs::read_to_string(&dst).unwrap(); + assert_eq!(&content, "test"); + } +} diff --git a/src/libs/kata-sys-util/src/numa.rs b/src/libs/kata-sys-util/src/numa.rs new file mode 100644 index 0000000000..ece5cd8e7f --- /dev/null +++ b/src/libs/kata-sys-util/src/numa.rs @@ -0,0 +1,221 @@ +// Copyright (c) 2021 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::collections::HashMap; +use std::fs::DirEntry; +use std::io::Read; +use std::path::PathBuf; + +use kata_types::cpu::CpuSet; +use lazy_static::lazy_static; + +use crate::sl; +use std::str::FromStr; + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("Invalid CPU number {0}")] + InvalidCpu(u32), + #[error("Invalid node file name {0}")] + InvalidNodeFileName(String), + #[error("Can not read directory {1}: 
{0}")] + ReadDirectory(#[source] std::io::Error, String), + #[error("Can not read from file {0}, {1:?}")] + ReadFile(String, #[source] std::io::Error), + #[error("Can not open from file {0}, {1:?}")] + OpenFile(String, #[source] std::io::Error), + #[error("Can not parse CPU info, {0:?}")] + ParseCpuInfo(#[from] kata_types::Error), +} + +pub type Result = std::result::Result; + +// global config in UT +#[cfg(test)] +lazy_static! { + static ref SYS_FS_PREFIX: PathBuf = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("test/texture"); + // numa node file for UT, we can mock data + static ref NUMA_NODE_PATH: PathBuf = (&*SYS_FS_PREFIX).join("sys/devices/system/node"); + // sysfs directory for CPU devices + static ref NUMA_CPU_PATH: PathBuf = (&*SYS_FS_PREFIX).join("sys/devices/system/cpu"); +} + +// global config in release +#[cfg(not(test))] +lazy_static! { + // numa node file for UT, we can mock data + static ref NUMA_NODE_PATH: PathBuf = PathBuf::from("/sys/devices/system/node"); + // sysfs directory for CPU devices + static ref NUMA_CPU_PATH: PathBuf = PathBuf::from("/sys/devices/system/cpu"); +} + +const NUMA_NODE_PREFIX: &str = "node"; +const NUMA_NODE_CPU_LIST_NAME: &str = "cpulist"; + +/// Get numa node id for a CPU +pub fn get_node_id(cpu: u32) -> Result { + let path = NUMA_CPU_PATH.join(format!("cpu{}", cpu)); + let dirs = path.read_dir().map_err(|_| Error::InvalidCpu(cpu))?; + + for d in dirs { + let d = d.map_err(|e| Error::ReadDirectory(e, path.to_string_lossy().to_string()))?; + if let Some(file_name) = d.file_name().to_str() { + if !file_name.starts_with(NUMA_NODE_PREFIX) { + continue; + } + let index_str = file_name.trim_start_matches(NUMA_NODE_PREFIX); + if let Ok(i) = index_str.parse::() { + return Ok(i); + } + } + } + + // Default to node 0 on UMA systems. + Ok(0) +} + +/// Map cpulist to NUMA node, returns a HashMap>. 
+pub fn get_node_map(cpus: &str) -> Result<HashMap<u32, Vec<u32>>> {
+    // <node_id, Vec<cpu_id>>
+    let mut node_map: HashMap<u32, Vec<u32>> = HashMap::new();
+    let cpuset = CpuSet::from_str(cpus)?;
+
+    for c in cpuset.iter() {
+        let node_id = get_node_id(*c)?;
+        node_map.entry(node_id).or_insert_with(Vec::new).push(*c);
+    }
+
+    Ok(node_map)
+}
+
+/// Get CPU to NUMA node mapping by reading `/sys/devices/system/node/nodex/cpulist`.
+///
+/// Return a HashMap<cpu_id, node_id>. The hashmap will be empty if NUMA is not enabled on the
+/// system.
+pub fn get_numa_nodes() -> Result<HashMap<u32, u32>> {
+    let mut numa_nodes = HashMap::new();
+    let numa_node_path = &*NUMA_NODE_PATH;
+    if !numa_node_path.exists() {
+        debug!(sl!(), "no numa node available on this system");
+        return Ok(numa_nodes);
+    }
+
+    let dirs = numa_node_path
+        .read_dir()
+        .map_err(|e| Error::ReadDirectory(e, numa_node_path.to_string_lossy().to_string()))?;
+    for d in dirs {
+        match d {
+            Err(e) => {
+                return Err(Error::ReadDirectory(
+                    e,
+                    numa_node_path.to_string_lossy().to_string(),
+                ))
+            }
+            Ok(d) => {
+                if let Ok(file_name) = d.file_name().into_string() {
+                    if file_name.starts_with(NUMA_NODE_PREFIX) {
+                        let index_string = file_name.trim_start_matches(NUMA_NODE_PREFIX);
+                        info!(
+                            sl!(),
+                            "get node dir {} node index {}", &file_name, index_string
+                        );
+                        match index_string.parse::<u32>() {
+                            Ok(nid) => read_cpu_info_from_node(&d, nid, &mut numa_nodes)?,
+                            Err(_e) => {
+                                return Err(Error::InvalidNodeFileName(file_name.to_string()))
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    Ok(numa_nodes)
+}
+
+fn read_cpu_info_from_node(
+    d: &DirEntry,
+    node_index: u32,
+    numa_nodes: &mut HashMap<u32, u32>,
+) -> Result<()> {
+    let cpu_list_path = d.path().join(NUMA_NODE_CPU_LIST_NAME);
+    let mut file = std::fs::File::open(&cpu_list_path)
+        .map_err(|e| Error::OpenFile(cpu_list_path.to_string_lossy().to_string(), e))?;
+    let mut cpu_list_string = String::new();
+    if let Err(e) = file.read_to_string(&mut cpu_list_string) {
+        return Err(Error::ReadFile(
+            cpu_list_path.to_string_lossy().to_string(),
+            e,
+        ));
+    }
+    let 
split_cpus = CpuSet::from_str(cpu_list_string.trim())?; + info!( + sl!(), + "node {} list {:?} from {}", node_index, split_cpus, &cpu_list_string + ); + for split_cpu_id in split_cpus.iter() { + numa_nodes.insert(*split_cpu_id, node_index); + } + + Ok(()) +} + +/// Check whether all specified CPUs have associated NUMA node. +pub fn is_valid_numa_cpu(cpus: &[u32]) -> Result { + let numa_nodes = get_numa_nodes()?; + + for cpu in cpus { + if numa_nodes.get(cpu).is_none() { + return Ok(false); + } + } + + Ok(true) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_node_id() { + assert_eq!(get_node_id(0).unwrap(), 0); + assert_eq!(get_node_id(1).unwrap(), 0); + assert_eq!(get_node_id(64).unwrap(), 1); + get_node_id(65).unwrap_err(); + } + + #[test] + fn test_get_node_map() { + let map = get_node_map("0-1,64").unwrap(); + assert_eq!(map.len(), 2); + assert_eq!(map.get(&0).unwrap().len(), 2); + assert_eq!(map.get(&1).unwrap().len(), 1); + + get_node_map("0-1,64,65").unwrap_err(); + } + + #[test] + fn test_get_numa_nodes() { + let map = get_numa_nodes().unwrap(); + assert_eq!(map.len(), 65); + assert_eq!(*map.get(&0).unwrap(), 0); + assert_eq!(*map.get(&1).unwrap(), 0); + assert_eq!(*map.get(&63).unwrap(), 0); + assert_eq!(*map.get(&64).unwrap(), 1); + } + + #[test] + fn test_is_valid_numa_cpu() { + assert!(is_valid_numa_cpu(&[0]).unwrap()); + assert!(is_valid_numa_cpu(&[1]).unwrap()); + assert!(is_valid_numa_cpu(&[63]).unwrap()); + assert!(is_valid_numa_cpu(&[64]).unwrap()); + assert!(is_valid_numa_cpu(&[0, 1, 64]).unwrap()); + assert!(!is_valid_numa_cpu(&[0, 1, 64, 65]).unwrap()); + assert!(!is_valid_numa_cpu(&[65]).unwrap()); + } +} diff --git a/src/libs/kata-sys-util/src/rand/mod.rs b/src/libs/kata-sys-util/src/rand/mod.rs new file mode 100644 index 0000000000..adc098ff68 --- /dev/null +++ b/src/libs/kata-sys-util/src/rand/mod.rs @@ -0,0 +1,10 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// 
SPDX-License-Identifier: Apache-2.0
+//
+
+mod random_bytes;
+pub use random_bytes::RandomBytes;
+mod uuid;
+pub use uuid::UUID;
diff --git a/src/libs/kata-sys-util/src/rand/random_bytes.rs b/src/libs/kata-sys-util/src/rand/random_bytes.rs
new file mode 100644
index 0000000000..183856d6b5
--- /dev/null
+++ b/src/libs/kata-sys-util/src/rand/random_bytes.rs
@@ -0,0 +1,62 @@
+// Copyright (c) 2019-2022 Alibaba Cloud
+// Copyright (c) 2019-2022 Ant Group
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+use std::fmt;
+
+use rand::RngCore;
+
+pub struct RandomBytes {
+    pub bytes: Vec<u8>,
+}
+
+impl RandomBytes {
+    pub fn new(n: usize) -> Self {
+        let mut bytes = vec![0u8; n];
+        rand::thread_rng().fill_bytes(&mut bytes);
+        Self { bytes }
+    }
+}
+
+impl fmt::LowerHex for RandomBytes {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        for byte in &self.bytes {
+            write!(f, "{:x}", byte)?;
+        }
+        Ok(())
+    }
+}
+
+impl fmt::UpperHex for RandomBytes {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        for byte in &self.bytes {
+            write!(f, "{:X}", byte)?;
+        }
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn random_bytes() {
+        let b = RandomBytes::new(16);
+        assert_eq!(b.bytes.len(), 16);
+
+        // check lower hex
+        let lower_hex = format!("{:x}", b);
+        assert_eq!(lower_hex, lower_hex.to_lowercase());
+
+        // check upper hex
+        let upper_hex = format!("{:X}", b);
+        assert_eq!(upper_hex, upper_hex.to_uppercase());
+
+        // check new random bytes
+        let b1 = RandomBytes::new(16);
+        assert_ne!(b.bytes, b1.bytes);
+    }
+}
diff --git a/src/libs/kata-sys-util/src/rand/uuid.rs b/src/libs/kata-sys-util/src/rand/uuid.rs
new file mode 100644
index 0000000000..905ba05e24
--- /dev/null
+++ b/src/libs/kata-sys-util/src/rand/uuid.rs
@@ -0,0 +1,74 @@
+// Copyright (c) 2019-2022 Alibaba Cloud
+// Copyright (c) 2019-2022 Ant Group
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+use std::{convert::From, fmt};
+
+use byteorder::{BigEndian, ByteOrder};
+use 
rand::RngCore; + +pub struct UUID([u8; 16]); + +impl Default for UUID { + fn default() -> Self { + Self::new() + } +} + +impl UUID { + pub fn new() -> Self { + let mut b = [0u8; 16]; + rand::thread_rng().fill_bytes(&mut b); + b[6] = (b[6] & 0x0f) | 0x40; + b[8] = (b[8] & 0x3f) | 0x80; + Self(b) + } +} + +/// From: convert UUID to string +impl From<&UUID> for String { + fn from(from: &UUID) -> Self { + let time_low = BigEndian::read_u32(&from.0[..4]); + let time_mid = BigEndian::read_u16(&from.0[4..6]); + let time_hi = BigEndian::read_u16(&from.0[6..8]); + let clk_seq_hi = from.0[8]; + let clk_seq_low = from.0[9]; + let mut buf = [0u8; 8]; + buf[2..].copy_from_slice(&from.0[10..]); + let node = BigEndian::read_u64(&buf); + + format!( + "{:08x}-{:04x}-{:04x}-{:02x}{:02x}-{:012x}", + time_low, time_mid, time_hi, clk_seq_hi, clk_seq_low, node + ) + } +} + +impl fmt::Display for UUID { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", String::from(self)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_uuid() { + let uuid1 = UUID::new(); + let s1: String = String::from(&uuid1); + + let uuid2 = UUID::new(); + let s2: String = String::from(&uuid2); + + assert_eq!(s1.len(), s2.len()); + assert_ne!(s1, s2); + + let uuid3 = UUID([0u8, 1u8, 2u8, 3u8, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); + let s3 = String::from(&uuid3); + assert_eq!(&s3, "00010203-0405-0607-0809-0a0b0c0d0e0f"); + } +} diff --git a/src/libs/kata-sys-util/src/spec.rs b/src/libs/kata-sys-util/src/spec.rs new file mode 100644 index 0000000000..3aa32434b1 --- /dev/null +++ b/src/libs/kata-sys-util/src/spec.rs @@ -0,0 +1,94 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::path::PathBuf; + +use kata_types::container::ContainerType; + +#[derive(thiserror::Error, Debug)] +pub enum Error { + /// unknow container type + #[error("unknow container type {0}")] + 
UnknowContainerType(String), + /// missing sandboxID + #[error("missing sandboxID")] + MissingSandboxID, + /// oci error + #[error("oci error")] + Oci(#[from] oci::Error), +} + +const CRI_CONTAINER_TYPE_KEY_LIST: &[&str] = &[ + // cri containerd + "io.kubernetes.cri.container-type", + // cri-o + "io.kubernetes.cri-o.ContainerType", + // docker shim + "io.kubernetes.docker.type", +]; + +const CRI_SANDBOX_ID_KEY_LIST: &[&str] = &[ + // cri containerd + "io.kubernetes.cri.sandbox-id", + // cri-o + "io.kubernetes.cri-o.SandboxID", + // docker shim + "io.kubernetes.sandbox.id", +]; + +/// container sandbox info +#[derive(Debug, Clone)] +pub enum ShimIdInfo { + /// Sandbox + Sandbox, + /// Container + Container(String), +} + +/// get container type +pub fn get_contaier_type(spec: &oci::Spec) -> Result { + for k in CRI_CONTAINER_TYPE_KEY_LIST.iter() { + if let Some(type_value) = spec.annotations.get(*k) { + match type_value.as_str() { + "sandbox" => return Ok(ContainerType::PodSandbox), + "podsandbox" => return Ok(ContainerType::PodSandbox), + "container" => return Ok(ContainerType::PodContainer), + _ => return Err(Error::UnknowContainerType(type_value.clone())), + } + } + } + + Ok(ContainerType::PodSandbox) +} + +/// get shim id info +pub fn get_shim_id_info() -> Result { + let spec = load_oci_spec()?; + match get_contaier_type(&spec)? 
{ + ContainerType::PodSandbox => Ok(ShimIdInfo::Sandbox), + ContainerType::PodContainer => { + for k in CRI_SANDBOX_ID_KEY_LIST { + if let Some(sandbox_id) = spec.annotations.get(*k) { + return Ok(ShimIdInfo::Container(sandbox_id.into())); + } + } + Err(Error::MissingSandboxID) + } + } +} + +/// get bundle path +pub fn get_bundle_path() -> std::io::Result { + std::env::current_dir() +} + +/// load oci spec +pub fn load_oci_spec() -> oci::Result { + let bundle_path = get_bundle_path()?; + let spec_file = bundle_path.join("config.json"); + + oci::Spec::load(spec_file.to_str().unwrap_or_default()) +} diff --git a/src/libs/kata-sys-util/src/validate.rs b/src/libs/kata-sys-util/src/validate.rs new file mode 100644 index 0000000000..0847398cef --- /dev/null +++ b/src/libs/kata-sys-util/src/validate.rs @@ -0,0 +1,267 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("invalid container ID {0}")] + InvalidContainerID(String), +} + +// A container ID or exec ID must match this regex: +// +// ^[a-zA-Z0-9][a-zA-Z0-9_.-]+$ +// +pub fn verify_id(id: &str) -> Result<(), Error> { + let mut chars = id.chars(); + + let valid = match chars.next() { + Some(first) + if first.is_alphanumeric() + && id.len() > 1 + && chars.all(|c| c.is_alphanumeric() || ['.', '-', '_'].contains(&c)) => + { + true + } + _ => false, + }; + + match valid { + true => Ok(()), + false => Err(Error::InvalidContainerID(id.to_string())), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_verify_cid() { + #[derive(Debug)] + struct TestData<'a> { + id: &'a str, + expect_error: bool, + } + + let tests = &[ + TestData { + // Cannot be blank + id: "", + expect_error: true, + }, + TestData { + // Cannot be a space + id: " ", + expect_error: true, + }, + TestData { + // Must start with an alphanumeric + id: ".", + expect_error: true, + }, + TestData { + 
// Must start with an alphanumeric + id: "-", + expect_error: true, + }, + TestData { + // Must start with an alphanumeric + id: "_", + expect_error: true, + }, + TestData { + // Must start with an alphanumeric + id: " a", + expect_error: true, + }, + TestData { + // Must start with an alphanumeric + id: ".a", + expect_error: true, + }, + TestData { + // Must start with an alphanumeric + id: "-a", + expect_error: true, + }, + TestData { + // Must start with an alphanumeric + id: "_a", + expect_error: true, + }, + TestData { + // Must start with an alphanumeric + id: "..", + expect_error: true, + }, + TestData { + // Too short + id: "a", + expect_error: true, + }, + TestData { + // Too short + id: "z", + expect_error: true, + }, + TestData { + // Too short + id: "A", + expect_error: true, + }, + TestData { + // Too short + id: "Z", + expect_error: true, + }, + TestData { + // Too short + id: "0", + expect_error: true, + }, + TestData { + // Too short + id: "9", + expect_error: true, + }, + TestData { + // Must start with an alphanumeric + id: "-1", + expect_error: true, + }, + TestData { + id: "/", + expect_error: true, + }, + TestData { + id: "a/", + expect_error: true, + }, + TestData { + id: "a/../", + expect_error: true, + }, + TestData { + id: "../a", + expect_error: true, + }, + TestData { + id: "../../a", + expect_error: true, + }, + TestData { + id: "../../../a", + expect_error: true, + }, + TestData { + id: "foo/../bar", + expect_error: true, + }, + TestData { + id: "foo bar", + expect_error: true, + }, + TestData { + id: "a.", + expect_error: false, + }, + TestData { + id: "a..", + expect_error: false, + }, + TestData { + id: "aa", + expect_error: false, + }, + TestData { + id: "aa.", + expect_error: false, + }, + TestData { + id: "hello..world", + expect_error: false, + }, + TestData { + id: "hello/../world", + expect_error: true, + }, + TestData { + id: "aa1245124sadfasdfgasdga.", + expect_error: false, + }, + TestData { + id: "aAzZ0123456789_.-", + 
expect_error: false, + }, + TestData { + id: "abcdefghijklmnopqrstuvwxyz0123456789.-_", + expect_error: false, + }, + TestData { + id: "0123456789abcdefghijklmnopqrstuvwxyz.-_", + expect_error: false, + }, + TestData { + id: " abcdefghijklmnopqrstuvwxyz0123456789.-_", + expect_error: true, + }, + TestData { + id: ".abcdefghijklmnopqrstuvwxyz0123456789.-_", + expect_error: true, + }, + TestData { + id: "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.-_", + expect_error: false, + }, + TestData { + id: "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ.-_", + expect_error: false, + }, + TestData { + id: " ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.-_", + expect_error: true, + }, + TestData { + id: ".ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.-_", + expect_error: true, + }, + TestData { + id: "/a/b/c", + expect_error: true, + }, + TestData { + id: "a/b/c", + expect_error: true, + }, + TestData { + id: "foo/../../../etc/passwd", + expect_error: true, + }, + TestData { + id: "../../../../../../etc/motd", + expect_error: true, + }, + TestData { + id: "/etc/passwd", + expect_error: true, + }, + ]; + + for (i, d) in tests.iter().enumerate() { + let msg = format!("test[{}]: {:?}", i, d); + + let result = verify_id(d.id); + + let msg = format!("{}, result: {:?}", msg, result); + + if result.is_ok() { + assert!(!d.expect_error, "{}", msg); + } else { + assert!(d.expect_error, "{}", msg); + } + } + } +} diff --git a/src/libs/kata-sys-util/test/texture/sys/devices/system/cpu/cpu0/node0/cpulist b/src/libs/kata-sys-util/test/texture/sys/devices/system/cpu/cpu0/node0/cpulist new file mode 100644 index 0000000000..4cfe9ed52f --- /dev/null +++ b/src/libs/kata-sys-util/test/texture/sys/devices/system/cpu/cpu0/node0/cpulist @@ -0,0 +1 @@ +0,1-63 diff --git a/src/libs/kata-sys-util/test/texture/sys/devices/system/cpu/cpu0/node0/cpumap b/src/libs/kata-sys-util/test/texture/sys/devices/system/cpu/cpu0/node0/cpumap new file mode 100644 index 0000000000..2f3bb0650d --- /dev/null +++ 
b/src/libs/kata-sys-util/test/texture/sys/devices/system/cpu/cpu0/node0/cpumap @@ -0,0 +1 @@ +ffffffff,ffffffff diff --git a/src/libs/kata-sys-util/test/texture/sys/devices/system/cpu/cpu1/node0/cpulist b/src/libs/kata-sys-util/test/texture/sys/devices/system/cpu/cpu1/node0/cpulist new file mode 100644 index 0000000000..4cfe9ed52f --- /dev/null +++ b/src/libs/kata-sys-util/test/texture/sys/devices/system/cpu/cpu1/node0/cpulist @@ -0,0 +1 @@ +0,1-63 diff --git a/src/libs/kata-sys-util/test/texture/sys/devices/system/cpu/cpu64/node1/cpulist b/src/libs/kata-sys-util/test/texture/sys/devices/system/cpu/cpu64/node1/cpulist new file mode 100644 index 0000000000..900731ffd5 --- /dev/null +++ b/src/libs/kata-sys-util/test/texture/sys/devices/system/cpu/cpu64/node1/cpulist @@ -0,0 +1 @@ +64 diff --git a/src/libs/kata-sys-util/test/texture/sys/devices/system/node/node0/cpulist b/src/libs/kata-sys-util/test/texture/sys/devices/system/node/node0/cpulist new file mode 100644 index 0000000000..3498c1f2da --- /dev/null +++ b/src/libs/kata-sys-util/test/texture/sys/devices/system/node/node0/cpulist @@ -0,0 +1 @@ +0-63 diff --git a/src/libs/kata-sys-util/test/texture/sys/devices/system/node/node0/cpumap b/src/libs/kata-sys-util/test/texture/sys/devices/system/node/node0/cpumap new file mode 100644 index 0000000000..2f3bb0650d --- /dev/null +++ b/src/libs/kata-sys-util/test/texture/sys/devices/system/node/node0/cpumap @@ -0,0 +1 @@ +ffffffff,ffffffff diff --git a/src/libs/kata-sys-util/test/texture/sys/devices/system/node/node1/cpulist b/src/libs/kata-sys-util/test/texture/sys/devices/system/node/node1/cpulist new file mode 100644 index 0000000000..900731ffd5 --- /dev/null +++ b/src/libs/kata-sys-util/test/texture/sys/devices/system/node/node1/cpulist @@ -0,0 +1 @@ +64 diff --git a/src/libs/kata-sys-util/test/texture/sys/devices/system/node/node1/cpumap b/src/libs/kata-sys-util/test/texture/sys/devices/system/node/node1/cpumap new file mode 100644 index 0000000000..62fe293eb1 --- 
/dev/null +++ b/src/libs/kata-sys-util/test/texture/sys/devices/system/node/node1/cpumap @@ -0,0 +1 @@ +1,00000000,00000000 diff --git a/src/libs/kata-types/.gitignore b/src/libs/kata-types/.gitignore new file mode 100644 index 0000000000..03314f77b5 --- /dev/null +++ b/src/libs/kata-types/.gitignore @@ -0,0 +1 @@ +Cargo.lock diff --git a/src/libs/kata-types/Cargo.toml b/src/libs/kata-types/Cargo.toml new file mode 100644 index 0000000000..ce7dcaf069 --- /dev/null +++ b/src/libs/kata-types/Cargo.toml @@ -0,0 +1,31 @@ +[package] +name = "kata-types" +version = "0.1.0" +description = "Constants and data types shared by Kata Containers components" +keywords = ["kata", "container", "runtime"] +authors = ["The Kata Containers community "] +repository = "https://github.com/kata-containers/kata-containers.git" +homepage = "https://katacontainers.io/" +readme = "README.md" +license = "Apache-2.0" +edition = "2018" + +[dependencies] +byte-unit = "3.1.4" +glob = "0.3.0" +lazy_static = "1.4.0" +num_cpus = "1.13.1" +regex = "1.5.4" +serde = { version = "1.0.100", features = ["derive"] } +slog = "2.5.2" +slog-scope = "4.4.0" +serde_json = "1.0.73" +thiserror = "1.0" +toml = "0.5.8" + +oci = { path = "../oci" } + +[dev-dependencies] +[features] +default = [] +enable-vendor = [] diff --git a/src/libs/kata-types/README.md b/src/libs/kata-types/README.md new file mode 100644 index 0000000000..334c879e20 --- /dev/null +++ b/src/libs/kata-types/README.md @@ -0,0 +1,18 @@ +# kata-types + +This crate is a collection of constants and data types shared by multiple +[Kata Containers](https://github.com/kata-containers/kata-containers/) components. + +It defines constants and data types used by multiple Kata Containers components. 
Those constants +and data types may be defined by Kata Containers or by other projects/specifications, such as: +- [Containerd](https://github.com/containerd/containerd) +- [Kubelet](https://github.com/kubernetes/kubelet) + +## Support + +**Operating Systems**: +- Linux + +## License + +This code is licensed under [Apache-2.0](../../../LICENSE). diff --git a/src/libs/kata-types/src/annotations/cri_containerd.rs b/src/libs/kata-types/src/annotations/cri_containerd.rs new file mode 100644 index 0000000000..db6462a8c8 --- /dev/null +++ b/src/libs/kata-types/src/annotations/cri_containerd.rs @@ -0,0 +1,13 @@ +// Copyright (c) 2019 Alibaba Cloud +// Copyright (c) 2019 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +#![allow(missing_docs)] + +pub const CONTAINER_TYPE_LABEL_KEY: &str = "io.kubernetes.cri.container-type"; +pub const SANDBOX: &str = "sandbox"; +pub const CONTAINER: &str = "container"; + +pub const SANDBOX_ID_LABEL_KEY: &str = "io.kubernetes.cri.sandbox-id"; diff --git a/src/libs/kata-types/src/annotations/crio.rs b/src/libs/kata-types/src/annotations/crio.rs new file mode 100644 index 0000000000..c8b2311f84 --- /dev/null +++ b/src/libs/kata-types/src/annotations/crio.rs @@ -0,0 +1,13 @@ +// Copyright (c) 2019 Alibaba Cloud +// Copyright (c) 2019 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +#![allow(missing_docs)] + +pub const CONTAINER_TYPE_LABEL_KEY: &str = "io.kubernetes.cri.container-type"; +pub const SANDBOX: &str = "sandbox"; +pub const CONTAINER: &str = "container"; + +pub const SANDBOX_ID_LABEL_KEY: &str = "io.kubernetes.cri-o.SandboxID"; diff --git a/src/libs/kata-types/src/annotations/dockershim.rs b/src/libs/kata-types/src/annotations/dockershim.rs new file mode 100644 index 0000000000..df1279dc5a --- /dev/null +++ b/src/libs/kata-types/src/annotations/dockershim.rs @@ -0,0 +1,23 @@ +// Copyright (c) 2019 Alibaba Cloud +// Copyright (c) 2019 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + 
+#![allow(missing_docs)]
+
+//! Copied from k8s.io/pkg/kubelet/dockershim/docker_service.go, used to identify whether a docker
+//! container is a sandbox or a regular container, will be removed after defining those as public
+//! fields in dockershim.
+
+/// ContainerTypeLabelKey is the container type (podsandbox or container) annotation key.
+pub const CONTAINER_TYPE_LABEL_KEY: &str = "io.kubernetes.docker.type";
+
+/// ContainerTypeLabelSandbox represents a sandbox container.
+pub const SANDBOX: &str = "podsandbox";
+
+/// ContainerTypeLabelContainer represents a container running within a sandbox.
+pub const CONTAINER: &str = "container";
+
+/// SandboxIDLabelKey is the sandbox ID annotation.
+pub const SANDBOX_ID_LABEL_KEY: &str = "io.kubernetes.sandbox.id";
diff --git a/src/libs/kata-types/src/annotations/mod.rs b/src/libs/kata-types/src/annotations/mod.rs
new file mode 100644
index 0000000000..0a517e2216
--- /dev/null
+++ b/src/libs/kata-types/src/annotations/mod.rs
@@ -0,0 +1,910 @@
+// Copyright (c) 2019-2021 Alibaba Cloud
+// Copyright (c) 2019 Ant Group
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::{self, BufReader, Result};
+use std::result::{self};
+use std::u32;
+
+use serde::Deserialize;
+
+use crate::config::hypervisor::get_hypervisor_plugin;
+use crate::config::TomlConfig;
+use crate::sl;
+
+/// CRI-containerd specific annotations.
+pub mod cri_containerd;
+
+/// CRI-O specific annotations.
+pub mod crio;
+
+/// Dockershim specific annotations.
+pub mod dockershim;
+
+/// Third-party annotations.
+pub mod thirdparty; + +// Common section +/// Prefix for Kata specific annotations +pub const KATA_ANNO_PREFIX: &str = "io.katacontainers."; +/// Prefix for Kata configuration annotations +pub const KATA_ANNO_CFG_PREFIX: &str = "io.katacontainers.config."; +/// Prefix for Kata container annotations +pub const KATA_ANNO_CONTAINER_PREFIX: &str = "io.katacontainers.container."; +/// The annotation key to fetch runtime configuration file. +pub const SANDBOX_CFG_PATH_KEY: &str = "io.katacontainers.config_path"; + +// OCI section +/// The annotation key to fetch the OCI configuration file path. +pub const BUNDLE_PATH_KEY: &str = "io.katacontainers.pkg.oci.bundle_path"; +/// The annotation key to fetch container type. +pub const CONTAINER_TYPE_KEY: &str = "io.katacontainers.pkg.oci.container_type"; + +// Container resource related annotations +/// Prefix for Kata container resource related annotations. +pub const KATA_ANNO_CONTAINER_RES_PREFIX: &str = "io.katacontainers.container.resource"; +/// A container annotation to specify the Resources.Memory.Swappiness. +pub const KATA_ANNO_CONTAINER_RES_SWAPPINESS: &str = + "io.katacontainers.container.resource.swappiness"; +/// A container annotation to specify the Resources.Memory.Swap. +pub const KATA_ANNO_CONTAINER_RES_SWAP_IN_BYTES: &str = + "io.katacontainers.container.resource.swap_in_bytes"; + +// Agent related annotations +/// Prefix for Agent configurations. +pub const KATA_ANNO_CFG_AGENT_PREFIX: &str = "io.katacontainers.config.agent."; +/// KernelModules is the annotation key for passing the list of kernel modules and their parameters +/// that will be loaded in the guest kernel. +/// +/// Semicolon separated list of kernel modules and their parameters. These modules will be loaded +/// in the guest kernel using modprobe(8). 
+/// The following example can be used to load two kernel modules with parameters +/// +/// annotations: +/// io.katacontainers.config.agent.kernel_modules: "e1000e InterruptThrottleRate=3000,3000,3000 EEE=1; i915 enable_ppgtt=0" +/// +/// The first word is considered as the module name and the rest as its parameters. +pub const KATA_ANNO_CFG_KERNEL_MODULES: &str = "io.katacontainers.config.agent.kernel_modules"; +/// A sandbox annotation to enable tracing for the agent. +pub const KATA_ANNO_CFG_AGENT_TRACE: &str = "io.katacontainers.config.agent.enable_tracing"; +/// An annotation to specify the size of the pipes created for containers. +pub const KATA_ANNO_CFG_AGENT_CONTAINER_PIPE_SIZE: &str = + "io.katacontainers.config.agent.container_pipe_size"; +/// An annotation key to specify the size of the pipes created for containers. +pub const CONTAINER_PIPE_SIZE_KERNEL_PARAM: &str = "agent.container_pipe_size"; + +// Hypervisor related annotations +/// Prefix for Hypervisor configurations. +pub const KATA_ANNO_CFG_HYPERVISOR_PREFIX: &str = "io.katacontainers.config.hypervisor."; +/// A sandbox annotation for passing a per container path pointing at the hypervisor that will run +/// the container VM. +pub const KATA_ANNO_CFG_HYPERVISOR_PATH: &str = "io.katacontainers.config.hypervisor.path"; +/// A sandbox annotation for passing a container hypervisor binary SHA-512 hash value. +pub const KATA_ANNO_CFG_HYPERVISOR_HASH: &str = "io.katacontainers.config.hypervisor.path_hash"; +/// A sandbox annotation for passing a per container path pointing at the hypervisor control binary +/// that will run the container VM. +pub const KATA_ANNO_CFG_HYPERVISOR_CTLPATH: &str = "io.katacontainers.config.hypervisor.ctlpath"; +/// A sandbox annotation for passing a container hypervisor control binary SHA-512 hash value. 
+pub const KATA_ANNO_CFG_HYPERVISOR_CTLHASH: &str = + "io.katacontainers.config.hypervisor.hypervisorctl_hash"; +/// A sandbox annotation for passing a per container path pointing at the jailer that will constrain +/// the container VM. +pub const KATA_ANNO_CFG_HYPERVISOR_JAILER_PATH: &str = + "io.katacontainers.config.hypervisor.jailer_path"; +/// A sandbox annotation for passing a jailer binary SHA-512 hash value. +pub const KATA_ANNO_CFG_HYPERVISOR_JAILER_HASH: &str = + "io.katacontainers.config.hypervisor.jailer_hash"; +/// A sandbox annotation to enable IO to be processed in a separate thread. +/// Supported currently for virtio-scsi driver. +pub const KATA_ANNO_CFG_HYPERVISOR_ENABLE_IO_THREADS: &str = + "io.katacontainers.config.hypervisor.enable_iothreads"; +/// The hash type used for assets verification +pub const KATA_ANNO_CFG_HYPERVISOR_ASSET_HASH_TYPE: &str = + "io.katacontainers.config.hypervisor.asset_hash_type"; +/// SHA512 is the SHA-512 (64) hash algorithm +pub const SHA512: &str = "sha512"; + +// Hypervisor Block Device related annotations +/// Specify the driver to be used for block device either VirtioSCSI or VirtioBlock +pub const KATA_ANNO_CFG_HYPERVISOR_BLOCK_DEV_DRIVER: &str = + "io.katacontainers.config.hypervisor.block_device_driver"; +/// A sandbox annotation that disallows a block device from being used. +pub const KATA_ANNO_CFG_HYPERVISOR_DISABLE_BLOCK_DEV_USE: &str = + "io.katacontainers.config.hypervisor.disable_block_device_use"; +/// A sandbox annotation that specifies cache-related options will be set to block devices or not. +pub const KATA_ANNO_CFG_HYPERVISOR_BLOCK_DEV_CACHE_SET: &str = + "io.katacontainers.config.hypervisor.block_device_cache_set"; +/// A sandbox annotation that specifies cache-related options for block devices. +/// Denotes whether use of O_DIRECT (bypass the host page cache) is enabled. 
+pub const KATA_ANNO_CFG_HYPERVISOR_BLOCK_DEV_CACHE_DIRECT: &str = + "io.katacontainers.config.hypervisor.block_device_cache_direct"; +/// A sandbox annotation that specifies cache-related options for block devices. +/// Denotes whether flush requests for the device are ignored. +pub const KATA_ANNO_CFG_HYPERVISOR_BLOCK_DEV_CACHE_NOFLUSH: &str = + "io.katacontainers.config.hypervisor.block_device_cache_noflush"; +/// A sandbox annotation to specify use of nvdimm device for guest rootfs image. +pub const KATA_ANNO_CFG_HYPERVISOR_DISABLE_IMAGE_NVDIMM: &str = + "io.katacontainers.config.hypervisor.disable_image_nvdimm"; +/// A sandbox annotation that specifies the memory space used for nvdimm device by the hypervisor. +pub const KATA_ANNO_CFG_HYPERVISOR_MEMORY_OFFSET: &str = + "io.katacontainers.config.hypervisor.memory_offset"; +/// A sandbox annotation to specify if vhost-user-blk/scsi is abailable on the host +pub const KATA_ANNO_CFG_HYPERVISOR_ENABLE_VHOSTUSER_STORE: &str = + "io.katacontainers.config.hypervisor.enable_vhost_user_store"; +/// A sandbox annotation to specify the directory path where vhost-user devices related folders, +/// sockets and device nodes should be. +pub const KATA_ANNO_CFG_HYPERVISOR_VHOSTUSER_STORE_PATH: &str = + "io.katacontainers.config.hypervisor.vhost_user_store_path"; + +// Hypervisor Guest Boot related annotations +/// A sandbox annotation for passing a per container path pointing at the kernel needed to boot +/// the container VM. +pub const KATA_ANNO_CFG_HYPERVISOR_KERNEL_PATH: &str = "io.katacontainers.config.hypervisor.kernel"; +/// A sandbox annotation for passing a container kernel image SHA-512 hash value. +pub const KATA_ANNO_CFG_HYPERVISOR_KERNEL_HASH: &str = + "io.katacontainers.config.hypervisor.kernel_hash"; +/// A sandbox annotation for passing a per container path pointing at the guest image that will run +/// in the container VM. +/// A sandbox annotation for passing additional guest kernel parameters. 
+pub const KATA_ANNO_CFG_HYPERVISOR_KERNEL_PARAMS: &str = + "io.katacontainers.config.hypervisor.kernel_params"; +/// A sandbox annotation for passing a container guest image path. +pub const KATA_ANNO_CFG_HYPERVISOR_IMAGE_PATH: &str = "io.katacontainers.config.hypervisor.image"; +/// A sandbox annotation for passing a container guest image SHA-512 hash value. +pub const KATA_ANNO_CFG_HYPERVISOR_IMAGE_HASH: &str = + "io.katacontainers.config.hypervisor.image_hash"; +/// A sandbox annotation for passing a per container path pointing at the initrd that will run +/// in the container VM. +pub const KATA_ANNO_CFG_HYPERVISOR_INITRD_PATH: &str = "io.katacontainers.config.hypervisor.initrd"; +/// A sandbox annotation for passing a container guest initrd SHA-512 hash value. +pub const KATA_ANNO_CFG_HYPERVISOR_INITRD_HASH: &str = + "io.katacontainers.config.hypervisor.initrd_hash"; +/// A sandbox annotation for passing a per container path pointing at the guest firmware that will +/// run the container VM. +pub const KATA_ANNO_CFG_HYPERVISOR_FIRMWARE_PATH: &str = + "io.katacontainers.config.hypervisor.firmware"; +/// A sandbox annotation for passing a container guest firmware SHA-512 hash value. +pub const KATA_ANNO_CFG_HYPERVISOR_FIRMWARE_HASH: &str = + "io.katacontainers.config.hypervisor.firmware_hash"; + +// Hypervisor CPU related annotations +/// A sandbox annotation to specify cpu specific features. +pub const KATA_ANNO_CFG_HYPERVISOR_CPU_FEATURES: &str = + "io.katacontainers.config.hypervisor.cpu_features"; +/// A sandbox annotation for passing the default vcpus assigned for a VM by the hypervisor. +pub const KATA_ANNO_CFG_HYPERVISOR_DEFAULT_VCPUS: &str = + "io.katacontainers.config.hypervisor.default_vcpus"; +/// A sandbox annotation that specifies the maximum number of vCPUs allocated for the VM by the hypervisor. 
+pub const KATA_ANNO_CFG_HYPERVISOR_DEFAULT_MAX_VCPUS: &str = + "io.katacontainers.config.hypervisor.default_max_vcpus"; + +// Hypervisor Device related annotations +/// A sandbox annotation used to indicate if devices need to be hotplugged on the root bus instead +/// of a bridge. +pub const KATA_ANNO_CFG_HYPERVISOR_HOTPLUG_VFIO_ON_ROOT_BUS: &str = + "io.katacontainers.config.hypervisor.hotplug_vfio_on_root_bus"; +/// PCIeRootPort is used to indicate the number of PCIe Root Port devices +pub const KATA_ANNO_CFG_HYPERVISOR_PCIE_ROOT_PORT: &str = + "io.katacontainers.config.hypervisor.pcie_root_port"; +/// A sandbox annotation to specify if the VM should have a vIOMMU device. +pub const KATA_ANNO_CFG_HYPERVISOR_IOMMU: &str = "io.katacontainers.config.hypervisor.enable_iommu"; +/// Enable Hypervisor Devices IOMMU_PLATFORM +pub const KATA_ANNO_CFG_HYPERVISOR_IOMMU_PLATFORM: &str = + "io.katacontainers.config.hypervisor.enable_iommu_platform"; + +// Hypervisor Machine related annotations +/// A sandbox annotation to specify the type of machine being emulated by the hypervisor. +pub const KATA_ANNO_CFG_HYPERVISOR_MACHINE_TYPE: &str = + "io.katacontainers.config.hypervisor.machine_type"; +/// A sandbox annotation to specify machine specific accelerators for the hypervisor. +pub const KATA_ANNO_CFG_HYPERVISOR_MACHINE_ACCELERATORS: &str = + "io.katacontainers.config.hypervisor.machine_accelerators"; +/// EntropySource is a sandbox annotation to specify the path to a host source of +/// entropy (/dev/random, /dev/urandom or real hardware RNG device) +pub const KATA_ANNO_CFG_HYPERVISOR_ENTROPY_SOURCE: &str = + "io.katacontainers.config.hypervisor.entropy_source"; + +// Hypervisor Memory related annotations +/// A sandbox annotation for the memory assigned for a VM by the hypervisor. 
+pub const KATA_ANNO_CFG_HYPERVISOR_DEFAULT_MEMORY: &str = + "io.katacontainers.config.hypervisor.default_memory"; +/// A sandbox annotation to specify the memory slots assigned to the VM by the hypervisor. +pub const KATA_ANNO_CFG_HYPERVISOR_MEMORY_SLOTS: &str = + "io.katacontainers.config.hypervisor.memory_slots"; +/// A sandbox annotation that specifies the memory space used for nvdimm device by the hypervisor. +pub const KATA_ANNO_CFG_HYPERVISOR_MEMORY_PREALLOC: &str = + "io.katacontainers.config.hypervisor.enable_mem_prealloc"; +/// A sandbox annotation to specify if the memory should be pre-allocated from huge pages. +pub const KATA_ANNO_CFG_HYPERVISOR_HUGE_PAGES: &str = + "io.katacontainers.config.hypervisor.enable_hugepages"; +/// A sandbox annotation to soecify file based memory backend root directory. +pub const KATA_ANNO_CFG_HYPERVISOR_FILE_BACKED_MEM_ROOT_DIR: &str = + "io.katacontainers.config.hypervisor.file_mem_backend"; +/// A sandbox annotation that is used to enable/disable virtio-mem. +pub const KATA_ANNO_CFG_HYPERVISOR_VIRTIO_MEM: &str = + "io.katacontainers.config.hypervisor.enable_virtio_mem"; +/// A sandbox annotation to enable swap of vm memory. +pub const KATA_ANNO_CFG_HYPERVISOR_ENABLE_SWAP: &str = + "io.katacontainers.config.hypervisor.enable_swap"; +/// A sandbox annotation to enable swap in the guest. +pub const KATA_ANNO_CFG_HYPERVISOR_ENABLE_GUEST_SWAP: &str = + "io.katacontainers.config.hypervisor.enable_guest_swap"; + +// Hypervisor Network related annotations +/// A sandbox annotation to specify if vhost-net is not available on the host. +pub const KATA_ANNO_CFG_HYPERVISOR_DISABLE_VHOST_NET: &str = + "io.katacontainers.config.hypervisor.disable_vhost_net"; +/// A sandbox annotation that specifies max rate on network I/O inbound bandwidth. 
+pub const KATA_ANNO_CFG_HYPERVISOR_RX_RATE_LIMITER_MAX_RATE: &str = + "io.katacontainers.config.hypervisor.rx_rate_limiter_max_rate"; +/// A sandbox annotation that specifies max rate on network I/O outbound bandwidth. +pub const KATA_ANNO_CFG_HYPERVISOR_TX_RATE_LIMITER_MAX_RATE: &str = + "io.katacontainers.config.hypervisor.tx_rate_limiter_max_rate"; + +// Hypervisor Security related annotations +/// A sandbox annotation to specify the path within the VM that will be used for 'drop-in' hooks. +pub const KATA_ANNO_CFG_HYPERVISOR_GUEST_HOOK_PATH: &str = + "io.katacontainers.config.hypervisor.guest_hook_path"; +/// A sandbox annotation to enable rootless hypervisor (only supported in QEMU currently). +pub const KATA_ANNO_CFG_HYPERVISOR_ENABLE_ROOTLESS_HYPERVISOR: &str = + "io.katacontainers.config.hypervisor.rootless"; + +// Hypervisor Shared File System related annotations +/// A sandbox annotation to specify the shared file system type, either virtio-9p or virtio-fs. +pub const KATA_ANNO_CFG_HYPERVISOR_SHARED_FS: &str = + "io.katacontainers.config.hypervisor.shared_fs"; +/// A sandbox annotations to specify virtio-fs vhost-user daemon path. +pub const KATA_ANNO_CFG_HYPERVISOR_VIRTIO_FS_DAEMON: &str = + "io.katacontainers.config.hypervisor.virtio_fs_daemon"; +/// A sandbox annotation to specify the cache mode for fs version cache or "none". +pub const KATA_ANNO_CFG_HYPERVISOR_VIRTIO_FS_CACHE: &str = + "io.katacontainers.config.hypervisor.virtio_fs_cache"; +/// A sandbox annotation to specify the DAX cache size in MiB. +pub const KATA_ANNO_CFG_HYPERVISOR_VIRTIO_FS_CACHE_SIZE: &str = + "io.katacontainers.config.hypervisor.virtio_fs_cache_size"; +/// A sandbox annotation to pass options to virtiofsd daemon. +pub const KATA_ANNO_CFG_HYPERVISOR_VIRTIO_FS_EXTRA_ARGS: &str = + "io.katacontainers.config.hypervisor.virtio_fs_extra_args"; +/// A sandbox annotation to specify as the msize for 9p shares. 
+pub const KATA_ANNO_CFG_HYPERVISOR_MSIZE_9P: &str = "io.katacontainers.config.hypervisor.msize_9p"; + +// Runtime related annotations +/// Prefix for Runtime configurations. +pub const KATA_ANNO_CFG_RUNTIME_PREFIX: &str = "io.katacontainers.config.runtime."; +/// runtime name +pub const KATA_ANNO_CFG_RUNTIME_NAME: &str = "io.katacontainers.config.runtime.name"; +/// hypervisor name +pub const KATA_ANNO_CFG_RUNTIME_HYPERVISOR: &str = + "io.katacontainers.config.runtime.hypervisor_name"; +/// agent name +pub const KATA_ANNO_CFG_RUNTIME_AGENT: &str = "io.katacontainers.config.runtime.agent_name"; +/// A sandbox annotation that determines if seccomp should be applied inside guest. +pub const KATA_ANNO_CFG_DISABLE_GUEST_SECCOMP: &str = + "io.katacontainers.config.runtime.disable_guest_seccomp"; +/// A sandbox annotation that determines if pprof enabled. +pub const KATA_ANNO_CFG_ENABLE_PPROF: &str = "io.katacontainers.config.runtime.enable_pprof"; +/// A sandbox annotation that determines if experimental features enabled. +pub const KATA_ANNO_CFG_EXPERIMENTAL: &str = "io.katacontainers.config.runtime.experimental"; +/// A sandbox annotaion that determines how the VM should be connected to the the container network +/// interface. +pub const KATA_ANNO_CFG_INTER_NETWORK_MODEL: &str = + "io.katacontainers.config.runtime.internetworking_model"; +/// SandboxCgroupOnly is a sandbox annotation that determines if kata processes are managed only in sandbox cgroup. +pub const KATA_ANNO_CFG_SANDBOX_CGROUP_ONLY: &str = + "io.katacontainers.config.runtime.sandbox_cgroup_only"; +/// A sandbox annotation that determines if create a netns for hypervisor process. +pub const KATA_ANNO_CFG_DISABLE_NEW_NETNS: &str = + "io.katacontainers.config.runtime.disable_new_netns"; +/// A sandbox annotation to specify how attached VFIO devices should be treated. 
+pub const KATA_ANNO_CFG_VFIO_MODE: &str = "io.katacontainers.config.runtime.vfio_mode"; + +/// A helper structure to query configuration information by check annotations. +#[derive(Debug, Default, Deserialize)] +pub struct Annotation { + annotations: HashMap, +} + +impl From> for Annotation { + fn from(annotations: HashMap) -> Self { + Annotation { annotations } + } +} + +impl Annotation { + /// Create a new instance of [`Annotation`]. + pub fn new(annotations: HashMap) -> Annotation { + Annotation { annotations } + } + + /// Deserialize an object from a json string. + pub fn deserialize(path: &str) -> Result + where + for<'a> T: Deserialize<'a>, + { + let f = BufReader::new(File::open(path)?); + Ok(serde_json::from_reader(f)?) + } + + /// Get an immutable reference to the annotation hashmap. + pub fn get_annotations(&self) -> &HashMap { + &self.annotations + } + + /// Get a mutable reference to the annotation hashmap. + pub fn get_annotations_mut(&mut self) -> &mut HashMap { + &mut self.annotations + } + + /// Get the value of annotation with `key` + pub fn get_value( + &self, + key: &str, + ) -> result::Result, ::Err> + where + T: std::str::FromStr, + { + if let Some(value) = self.get(key) { + return value.parse::().map(Some); + } + Ok(None) + } + + /// Get the value of annotation with `key` as string. + pub fn get(&self, key: &str) -> Option { + self.annotations.get(key).map(|v| String::from(v.trim())) + } +} + +// Miscellaneous annotations. +impl Annotation { + /// Get the annotation of sandbox configuration file path. + pub fn get_sandbox_config_path(&self) -> Option { + self.get(SANDBOX_CFG_PATH_KEY) + } + + /// Get the annotation of bundle path. + pub fn get_bundle_path(&self) -> Option { + self.get(BUNDLE_PATH_KEY) + } + + /// Get the annotation of container type. + pub fn get_container_type(&self) -> Option { + self.get(CONTAINER_TYPE_KEY) + } + + /// Get the annotation to specify the Resources.Memory.Swappiness. 
+ pub fn get_container_resource_swappiness(&self) -> Result> { + match self.get_value::(KATA_ANNO_CONTAINER_RES_SWAPPINESS) { + Ok(r) => { + if r.unwrap_or_default() > 100 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("{} greater than 100", r.unwrap_or_default()), + )); + } else { + Ok(r) + } + } + Err(_e) => Err(io::Error::new( + io::ErrorKind::InvalidData, + "parse u32 error".to_string(), + )), + } + } + + /// Get the annotation to specify the Resources.Memory.Swap. + pub fn get_container_resource_swap_in_bytes(&self) -> Option { + self.get(KATA_ANNO_CONTAINER_RES_SWAP_IN_BYTES) + } +} + +impl Annotation { + /// update config info by annotation + pub fn update_config_by_annotation(&self, config: &mut TomlConfig) -> Result<()> { + if let Some(hv) = self.annotations.get(KATA_ANNO_CFG_RUNTIME_HYPERVISOR) { + if config.hypervisor.get(hv).is_some() { + config.runtime.hypervisor_name = hv.to_string(); + } + } + if let Some(ag) = self.annotations.get(KATA_ANNO_CFG_RUNTIME_AGENT) { + if config.agent.get(ag).is_some() { + config.runtime.agent_name = ag.to_string(); + } + } + let hypervisor_name = &config.runtime.hypervisor_name; + let agent_name = &config.runtime.agent_name; + + let bool_err = io::Error::new(io::ErrorKind::InvalidData, "parse bool error".to_string()); + let u32_err = io::Error::new(io::ErrorKind::InvalidData, "parse u32 error".to_string()); + let u64_err = io::Error::new(io::ErrorKind::InvalidData, "parse u64 error".to_string()); + let i32_err = io::Error::new(io::ErrorKind::InvalidData, "parse i32 error".to_string()); + let mut hv = config.hypervisor.get_mut(hypervisor_name).unwrap(); + let mut ag = config.agent.get_mut(agent_name).unwrap(); + for (key, value) in &self.annotations { + if hv.security_info.is_annotation_enabled(key) { + match key.as_str() { + // update hypervisor config + // Hypervisor related annotations + KATA_ANNO_CFG_HYPERVISOR_PATH => { + hv.validate_hypervisor_path(value)?; + hv.path = value.to_string(); + } 
+ KATA_ANNO_CFG_HYPERVISOR_CTLPATH => { + hv.validate_hypervisor_ctlpath(value)?; + hv.ctlpath = value.to_string(); + } + + KATA_ANNO_CFG_HYPERVISOR_JAILER_PATH => { + hv.validate_jailer_path(value)?; + hv.jailer_path = value.to_string(); + } + KATA_ANNO_CFG_HYPERVISOR_ENABLE_IO_THREADS => match self.get_value::(key) + { + Ok(r) => { + hv.enable_iothreads = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + }, + // Hypervisor Block Device related annotations + KATA_ANNO_CFG_HYPERVISOR_BLOCK_DEV_DRIVER => { + hv.blockdev_info.block_device_driver = value.to_string(); + } + KATA_ANNO_CFG_HYPERVISOR_DISABLE_BLOCK_DEV_USE => { + match self.get_value::(key) { + Ok(r) => { + hv.blockdev_info.disable_block_device_use = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + } + } + KATA_ANNO_CFG_HYPERVISOR_BLOCK_DEV_CACHE_SET => { + match self.get_value::(key) { + Ok(r) => { + hv.blockdev_info.block_device_cache_set = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + } + } + KATA_ANNO_CFG_HYPERVISOR_BLOCK_DEV_CACHE_DIRECT => { + match self.get_value::(key) { + Ok(r) => { + hv.blockdev_info.block_device_cache_direct = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + } + } + KATA_ANNO_CFG_HYPERVISOR_BLOCK_DEV_CACHE_NOFLUSH => { + match self.get_value::(key) { + Ok(r) => { + hv.blockdev_info.block_device_cache_noflush = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + } + } + KATA_ANNO_CFG_HYPERVISOR_DISABLE_IMAGE_NVDIMM => { + match self.get_value::(key) { + Ok(r) => { + hv.blockdev_info.disable_image_nvdimm = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + } + } + KATA_ANNO_CFG_HYPERVISOR_MEMORY_OFFSET => match self.get_value::(key) { + Ok(r) => { + hv.blockdev_info.memory_offset = r.unwrap_or_default(); + } + Err(_e) => { + return Err(u64_err); + } + }, + KATA_ANNO_CFG_HYPERVISOR_ENABLE_VHOSTUSER_STORE => { + match self.get_value::(key) { + Ok(r) 
=> { + hv.blockdev_info.enable_vhost_user_store = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + } + } + KATA_ANNO_CFG_HYPERVISOR_VHOSTUSER_STORE_PATH => { + hv.blockdev_info.validate_vhost_user_store_path(value)?; + hv.blockdev_info.vhost_user_store_path = value.to_string(); + } + // Hypervisor Guest Boot related annotations + KATA_ANNO_CFG_HYPERVISOR_KERNEL_PATH => { + hv.boot_info.validate_boot_path(value)?; + hv.boot_info.kernel = value.to_string(); + } + KATA_ANNO_CFG_HYPERVISOR_KERNEL_PARAMS => { + hv.boot_info.kernel_params = value.to_string(); + } + KATA_ANNO_CFG_HYPERVISOR_IMAGE_PATH => { + hv.boot_info.validate_boot_path(value)?; + hv.boot_info.image = value.to_string(); + } + KATA_ANNO_CFG_HYPERVISOR_INITRD_PATH => { + hv.boot_info.validate_boot_path(value)?; + hv.boot_info.initrd = value.to_string(); + } + KATA_ANNO_CFG_HYPERVISOR_FIRMWARE_PATH => { + hv.boot_info.validate_boot_path(value)?; + hv.boot_info.firmware = value.to_string(); + } + // Hypervisor CPU related annotations + KATA_ANNO_CFG_HYPERVISOR_CPU_FEATURES => { + hv.cpu_info.cpu_features = value.to_string(); + } + KATA_ANNO_CFG_HYPERVISOR_DEFAULT_VCPUS => match self.get_value::(key) { + Ok(num_cpus) => { + let num_cpus = num_cpus.unwrap_or_default(); + if num_cpus + > get_hypervisor_plugin(hypervisor_name) + .unwrap() + .get_max_cpus() as i32 + { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "Vcpus specified in annotation {} is more than maximum limitation {}", + num_cpus, + get_hypervisor_plugin(hypervisor_name) + .unwrap() + .get_max_cpus() + ), + )); + } else { + hv.cpu_info.default_vcpus = num_cpus; + } + } + Err(_e) => { + return Err(i32_err); + } + }, + KATA_ANNO_CFG_HYPERVISOR_DEFAULT_MAX_VCPUS => { + match self.get_value::(key) { + Ok(r) => { + hv.cpu_info.default_maxvcpus = r.unwrap_or_default(); + } + Err(_e) => { + return Err(u32_err); + } + } + } + // Hypervisor Device related annotations + 
KATA_ANNO_CFG_HYPERVISOR_HOTPLUG_VFIO_ON_ROOT_BUS => { + match self.get_value::(key) { + Ok(r) => { + hv.device_info.hotplug_vfio_on_root_bus = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + } + } + KATA_ANNO_CFG_HYPERVISOR_PCIE_ROOT_PORT => match self.get_value::(key) { + Ok(r) => { + hv.device_info.pcie_root_port = r.unwrap_or_default(); + } + Err(_e) => { + return Err(u32_err); + } + }, + KATA_ANNO_CFG_HYPERVISOR_IOMMU => match self.get_value::(key) { + Ok(r) => { + hv.device_info.enable_iommu = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + }, + KATA_ANNO_CFG_HYPERVISOR_IOMMU_PLATFORM => match self.get_value::(key) { + Ok(r) => { + hv.device_info.enable_iommu_platform = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + }, + // Hypervisor Machine related annotations + KATA_ANNO_CFG_HYPERVISOR_MACHINE_TYPE => { + hv.machine_info.machine_type = value.to_string(); + } + KATA_ANNO_CFG_HYPERVISOR_MACHINE_ACCELERATORS => { + hv.machine_info.machine_accelerators = value.to_string(); + } + KATA_ANNO_CFG_HYPERVISOR_ENTROPY_SOURCE => { + hv.machine_info.validate_entropy_source(value)?; + hv.machine_info.entropy_source = value.to_string(); + } + // Hypervisor Memory related annotations + KATA_ANNO_CFG_HYPERVISOR_DEFAULT_MEMORY => { + match byte_unit::Byte::from_str(value) { + Ok(mem_bytes) => { + let memory_size = mem_bytes + .get_adjusted_unit(byte_unit::ByteUnit::MiB) + .get_value() + as u32; + info!(sl!(), "get mem {} from annotations: {}", memory_size, value); + if memory_size + < get_hypervisor_plugin(hypervisor_name) + .unwrap() + .get_min_memory() + { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "memory specified in annotation {} is less than minimum limitation {}", + memory_size, + get_hypervisor_plugin(hypervisor_name) + .unwrap() + .get_min_memory() + ), + )); + } + hv.memory_info.default_memory = memory_size; + } + Err(error) => { + error!( + sl!(), + "failed to 
parse byte from string {} error {:?}", value, error + ); + } + } + } + KATA_ANNO_CFG_HYPERVISOR_MEMORY_SLOTS => match self.get_value::(key) { + Ok(v) => { + hv.memory_info.memory_slots = v.unwrap_or_default(); + } + Err(_e) => { + return Err(u32_err); + } + }, + + KATA_ANNO_CFG_HYPERVISOR_MEMORY_PREALLOC => match self.get_value::(key) { + Ok(r) => { + hv.memory_info.enable_mem_prealloc = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + }, + KATA_ANNO_CFG_HYPERVISOR_HUGE_PAGES => match self.get_value::(key) { + Ok(r) => { + hv.memory_info.enable_hugepages = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + }, + KATA_ANNO_CFG_HYPERVISOR_FILE_BACKED_MEM_ROOT_DIR => { + hv.memory_info.validate_memory_backend_path(value)?; + hv.memory_info.file_mem_backend = value.to_string(); + } + KATA_ANNO_CFG_HYPERVISOR_VIRTIO_MEM => match self.get_value::(key) { + Ok(r) => { + hv.memory_info.enable_virtio_mem = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + }, + KATA_ANNO_CFG_HYPERVISOR_ENABLE_SWAP => match self.get_value::(key) { + Ok(r) => { + hv.memory_info.enable_swap = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + }, + KATA_ANNO_CFG_HYPERVISOR_ENABLE_GUEST_SWAP => match self.get_value::(key) + { + Ok(r) => { + hv.memory_info.enable_guest_swap = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + }, + // Hypervisor Network related annotations + KATA_ANNO_CFG_HYPERVISOR_DISABLE_VHOST_NET => match self.get_value::(key) + { + Ok(r) => { + hv.network_info.disable_vhost_net = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + }, + KATA_ANNO_CFG_HYPERVISOR_RX_RATE_LIMITER_MAX_RATE => { + match self.get_value::(key) { + Ok(r) => { + hv.network_info.rx_rate_limiter_max_rate = r.unwrap_or_default(); + } + Err(_e) => { + return Err(u64_err); + } + } + } + KATA_ANNO_CFG_HYPERVISOR_TX_RATE_LIMITER_MAX_RATE => { + match self.get_value::(key) { + Ok(r) => { 
+ hv.network_info.tx_rate_limiter_max_rate = r.unwrap_or_default(); + } + Err(_e) => { + return Err(u64_err); + } + } + } + // Hypervisor Security related annotations + KATA_ANNO_CFG_HYPERVISOR_GUEST_HOOK_PATH => { + hv.security_info.validate_path(value)?; + hv.security_info.guest_hook_path = value.to_string(); + } + KATA_ANNO_CFG_HYPERVISOR_ENABLE_ROOTLESS_HYPERVISOR => { + match self.get_value::(key) { + Ok(r) => { + hv.security_info.rootless = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + } + } + // Hypervisor Shared File System related annotations + KATA_ANNO_CFG_HYPERVISOR_SHARED_FS => { + hv.shared_fs.shared_fs = self.get(key); + } + + KATA_ANNO_CFG_HYPERVISOR_VIRTIO_FS_DAEMON => { + hv.shared_fs.validate_virtiofs_daemon_path(value)?; + hv.shared_fs.virtio_fs_daemon = value.to_string(); + } + + KATA_ANNO_CFG_HYPERVISOR_VIRTIO_FS_CACHE => { + hv.shared_fs.virtio_fs_cache = value.to_string(); + } + KATA_ANNO_CFG_HYPERVISOR_VIRTIO_FS_CACHE_SIZE => { + match self.get_value::(key) { + Ok(r) => { + hv.shared_fs.virtio_fs_cache_size = r.unwrap_or_default(); + } + Err(_e) => { + return Err(u32_err); + } + } + } + KATA_ANNO_CFG_HYPERVISOR_VIRTIO_FS_EXTRA_ARGS => { + let args: Vec = + value.to_string().split(',').map(str::to_string).collect(); + for arg in args { + hv.shared_fs.virtio_fs_extra_args.push(arg.to_string()); + } + } + KATA_ANNO_CFG_HYPERVISOR_MSIZE_9P => match self.get_value::(key) { + Ok(v) => { + hv.shared_fs.msize_9p = v.unwrap_or_default(); + } + Err(_e) => { + return Err(u32_err); + } + }, + + _ => { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("Invalid annotation type {}", key), + )); + } + } + } else { + match key.as_str() { + //update agent config + KATA_ANNO_CFG_KERNEL_MODULES => { + let kernel_mod: Vec = + value.to_string().split(';').map(str::to_string).collect(); + for modules in kernel_mod { + ag.kernel_modules.push(modules.to_string()); + } + } + KATA_ANNO_CFG_AGENT_TRACE => match 
self.get_value::(key) { + Ok(r) => { + ag.enable_tracing = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + }, + KATA_ANNO_CFG_AGENT_CONTAINER_PIPE_SIZE => match self.get_value::(key) { + Ok(v) => { + ag.container_pipe_size = v.unwrap_or_default(); + } + Err(_e) => { + return Err(u32_err); + } + }, + //update runtime config + KATA_ANNO_CFG_RUNTIME_NAME => { + let runtime = vec!["virt-container", "linux-container", "wasm-container"]; + if runtime.contains(&value.as_str()) { + config.runtime.name = value.to_string(); + } else { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "runtime specified in annotation {} is not in {:?}", + &value, &runtime + ), + )); + } + } + KATA_ANNO_CFG_DISABLE_GUEST_SECCOMP => match self.get_value::(key) { + Ok(r) => { + config.runtime.disable_guest_seccomp = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + }, + KATA_ANNO_CFG_ENABLE_PPROF => match self.get_value::(key) { + Ok(r) => { + config.runtime.enable_pprof = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + }, + KATA_ANNO_CFG_EXPERIMENTAL => { + let args: Vec = + value.to_string().split(',').map(str::to_string).collect(); + for arg in args { + config.runtime.experimental.push(arg.to_string()); + } + } + KATA_ANNO_CFG_INTER_NETWORK_MODEL => { + config.runtime.internetworking_model = value.to_string(); + } + KATA_ANNO_CFG_SANDBOX_CGROUP_ONLY => match self.get_value::(key) { + Ok(r) => { + config.runtime.sandbox_cgroup_only = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + }, + KATA_ANNO_CFG_DISABLE_NEW_NETNS => match self.get_value::(key) { + Ok(r) => { + config.runtime.disable_new_netns = r.unwrap_or_default(); + } + Err(_e) => { + return Err(bool_err); + } + }, + KATA_ANNO_CFG_VFIO_MODE => { + config.runtime.vfio_mode = value.to_string(); + } + _ => { + warn!(sl!(), "Annotation {} not enabled", key); + } + } + } + } + Ok(()) + } +} diff --git 
a/src/libs/kata-types/src/annotations/thirdparty.rs b/src/libs/kata-types/src/annotations/thirdparty.rs new file mode 100644 index 0000000000..e8f2a71683 --- /dev/null +++ b/src/libs/kata-types/src/annotations/thirdparty.rs @@ -0,0 +1,12 @@ +// Copyright (c) 2021 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! Third-party annotations - annotations defined by other projects or k8s plugins but that can +//! change Kata Containers behaviour. + +/// Annotation to enable SGX. +/// +/// Hardware-based isolation and memory encryption. +pub const SGX_EPC: &str = "sgx.intel.com/epc"; diff --git a/src/libs/kata-types/src/config/agent.rs b/src/libs/kata-types/src/config/agent.rs new file mode 100644 index 0000000000..b5bcf5b3b5 --- /dev/null +++ b/src/libs/kata-types/src/config/agent.rs @@ -0,0 +1,123 @@ +// Copyright (c) 2021 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::io::Result; + +use crate::config::{ConfigOps, TomlConfig}; + +pub use vendor::AgentVendor; + +/// Kata agent configuration information. +#[derive(Debug, Default, Deserialize, Serialize)] +pub struct Agent { + /// If enabled, the agent will log additional debug messages to the system log. + #[serde(default, rename = "enable_debug")] + pub debug: bool, + + /// Enable agent tracing. + /// + /// If enabled, the agent will generate OpenTelemetry trace spans. + /// # Notes: + /// - If the runtime also has tracing enabled, the agent spans will be associated with the + /// appropriate runtime parent span. + /// - If enabled, the runtime will wait for the container to shutdown, increasing the container + /// shutdown time slightly. + #[serde(default)] + pub enable_tracing: bool, + + /// Enable debug console. 
+ /// If enabled, user can connect guest OS running inside hypervisor through + /// "kata-runtime exec " command + #[serde(default)] + pub debug_console_enabled: bool, + + /// Agent server port + #[serde(default)] + pub server_port: u32, + + /// Agent log port + #[serde(default)] + pub log_port: u32, + + /// Agent connection dialing timeout value in millisecond + #[serde(default = "default_dial_timeout")] + pub dial_timeout_ms: u32, + + /// Agent reconnect timeout value in millisecond + #[serde(default = "default_reconnect_timeout")] + pub reconnect_timeout_ms: u32, + + /// Agent request timeout value in millisecond + #[serde(default = "default_request_timeout")] + pub request_timeout_ms: u32, + + /// Agent health check request timeout value in millisecond + #[serde(default = "default_health_check_timeout")] + pub health_check_request_timeout_ms: u32, + + /// Comma separated list of kernel modules and their parameters. + /// + /// These modules will be loaded in the guest kernel using modprobe(8). + /// The following example can be used to load two kernel modules with parameters: + /// - kernel_modules=["e1000e InterruptThrottleRate=3000,3000,3000 EEE=1", "i915 enable_ppgtt=0"] + /// The first word is considered as the module name and the rest as its parameters. + /// Container will not be started when: + /// - A kernel module is specified and the modprobe command is not installed in the guest + /// or it fails loading the module. + /// - The module is not available in the guest or it doesn't met the guest kernel + /// requirements, like architecture and version. 
+ #[serde(default)] + pub kernel_modules: Vec, + + /// container pipe size + pub container_pipe_size: u32, +} + +fn default_dial_timeout() -> u32 { + // 10ms + 10 +} + +fn default_reconnect_timeout() -> u32 { + // 3s + 3_000 +} + +fn default_request_timeout() -> u32 { + // 30s + 30_000 +} + +fn default_health_check_timeout() -> u32 { + // 90s + 90_000 +} + +impl ConfigOps for Agent { + fn adjust_config(conf: &mut TomlConfig) -> Result<()> { + AgentVendor::adjust_config(conf)?; + Ok(()) + } + + fn validate(conf: &TomlConfig) -> Result<()> { + AgentVendor::validate(conf)?; + Ok(()) + } +} + +#[cfg(not(feature = "enable-vendor"))] +mod vendor { + use super::*; + + /// Vendor customization agent configuration. + #[derive(Debug, Default, Deserialize, Serialize)] + pub struct AgentVendor {} + + impl ConfigOps for AgentVendor {} +} + +#[cfg(feature = "enable-vendor")] +#[path = "agent_vendor.rs"] +mod vendor; diff --git a/src/libs/kata-types/src/config/agent_vendor.rs b/src/libs/kata-types/src/config/agent_vendor.rs new file mode 100644 index 0000000000..62ce710d01 --- /dev/null +++ b/src/libs/kata-types/src/config/agent_vendor.rs @@ -0,0 +1,12 @@ +// Copyright (c) 2021 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// + +use super::*; + +/// Vendor customization agent configuration. +#[derive(Debug, Default, Deserialize, Serialize)] +pub struct AgentVendor {} + +impl ConfigOps for AgentVendor {} diff --git a/src/libs/kata-types/src/config/default.rs b/src/libs/kata-types/src/config/default.rs new file mode 100644 index 0000000000..d55ccf7e97 --- /dev/null +++ b/src/libs/kata-types/src/config/default.rs @@ -0,0 +1,58 @@ +// Copyright (c) 2021 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! Default configuration values. +#![allow(missing_docs)] + +use lazy_static::lazy_static; + +lazy_static! 
{ + /// Default configuration file paths, vendor may extend the list + pub static ref DEFAULT_RUNTIME_CONFIGURATIONS: Vec::<&'static str> = vec![ + "/etc/kata-containers/configuration.toml", + "/usr/share/defaults/kata-containers/configuration.toml", + ]; +} +pub const DEFAULT_AGENT_NAME: &str = "kata-agent"; + +pub const DEFAULT_INTERNETWORKING_MODEL: &str = "tcfilter"; + +pub const DEFAULT_BLOCK_DEVICE_TYPE: &str = "virtio-blk"; +pub const DEFAULT_VHOST_USER_STORE_PATH: &str = "/var/run/vhost-user"; +pub const DEFAULT_BLOCK_NVDIMM_MEM_OFFSET: u64 = 0; + +pub const DEFAULT_SHARED_FS_TYPE: &str = "virtio-fs"; +pub const DEFAULT_VIRTIO_FS_CACHE_MODE: &str = "none"; +pub const DEFAULT_VIRTIO_FS_DAX_SIZE_MB: u32 = 1024; +pub const DEFAULT_SHARED_9PFS_SIZE_MB: u32 = 128 * 1024; +pub const MIN_SHARED_9PFS_SIZE_MB: u32 = 4 * 1024; +pub const MAX_SHARED_9PFS_SIZE_MB: u32 = 8 * 1024 * 1024; + +pub const DEFAULT_GUEST_HOOK_PATH: &str = "/opt/kata/hooks"; + +pub const DEFAULT_GUEST_VCPUS: u32 = 1; + +// Default configuration for dragonball +pub const DEFAULT_DRAGONBALL_GUEST_KERNEL_IMAGE: &str = "vmlinuz"; +pub const DEFAULT_DRAGONBALL_GUEST_KERNEL_PARAMS: &str = ""; +pub const DEFAULT_DRAGONBALL_ENTROPY_SOURCE: &str = "/dev/urandom"; +pub const DEFAULT_DRAGONBALL_MEMORY_SIZE_MB: u32 = 128; +pub const DEFAULT_DRAGONBALL_MEMORY_SLOTS: u32 = 128; +pub const MAX_DRAGONBALL_VCPUS: u32 = 256; +pub const MIN_DRAGONBALL_MEMORY_SIZE_MB: u32 = 64; +// Default configuration for qemu +pub const DEFAULT_QEMU_BINARY_PATH: &str = "/usr/bin/qemu-system-x86_64"; +pub const DEFAULT_QEMU_CONTROL_PATH: &str = ""; +pub const DEFAULT_QEMU_MACHINE_TYPE: &str = "q35"; +pub const DEFAULT_QEMU_ENTROPY_SOURCE: &str = "/dev/urandom"; +pub const DEFAULT_QEMU_GUEST_KERNEL_IMAGE: &str = "vmlinuz"; +pub const DEFAULT_QEMU_GUEST_KERNEL_PARAMS: &str = ""; +pub const DEFAULT_QEMU_FIRMWARE_PATH: &str = ""; +pub const DEFAULT_QEMU_MEMORY_SIZE_MB: u32 = 128; +pub const DEFAULT_QEMU_MEMORY_SLOTS: u32 = 128; +pub 
const DEFAULT_QEMU_PCI_BRIDGES: u32 = 2; +pub const MAX_QEMU_PCI_BRIDGES: u32 = 5; +pub const MAX_QEMU_VCPUS: u32 = 256; +pub const MIN_QEMU_MEMORY_SIZE_MB: u32 = 64; diff --git a/src/libs/kata-types/src/config/hypervisor/dragonball.rs b/src/libs/kata-types/src/config/hypervisor/dragonball.rs new file mode 100644 index 0000000000..bb72944a2e --- /dev/null +++ b/src/libs/kata-types/src/config/hypervisor/dragonball.rs @@ -0,0 +1,198 @@ +// Copyright (c) 2019-2021 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::io::Result; +use std::path::Path; +use std::sync::Arc; +use std::u32; + +use super::{default, register_hypervisor_plugin}; +use crate::config::default::MAX_DRAGONBALL_VCPUS; +use crate::config::default::MIN_DRAGONBALL_MEMORY_SIZE_MB; +use crate::config::hypervisor::{ + VIRTIO_BLK, VIRTIO_BLK_MMIO, VIRTIO_FS, VIRTIO_FS_INLINE, VIRTIO_PMEM, +}; +use crate::config::{ConfigPlugin, TomlConfig}; +use crate::{eother, resolve_path, validate_path}; + +/// Hypervisor name for dragonball, used to index `TomlConfig::hypervisor`. +pub const HYPERVISOR_NAME_DRAGONBALL: &str = "dragonball"; + +/// Configuration information for dragonball. +#[derive(Default, Debug)] +pub struct DragonballConfig {} + +impl DragonballConfig { + /// Create a new instance of `DragonballConfig`. + pub fn new() -> Self { + DragonballConfig {} + } + + /// Register the dragonball plugin. + pub fn register(self) { + let plugin = Arc::new(self); + register_hypervisor_plugin(HYPERVISOR_NAME_DRAGONBALL, plugin); + } +} + +impl ConfigPlugin for DragonballConfig { + fn get_max_cpus(&self) -> u32 { + MAX_DRAGONBALL_VCPUS + } + fn get_min_memory(&self) -> u32 { + MIN_DRAGONBALL_MEMORY_SIZE_MB + } + fn name(&self) -> &str { + HYPERVISOR_NAME_DRAGONBALL + } + + /// Adjust the configuration information after loading from configuration file. 
+ fn adjust_config(&self, conf: &mut TomlConfig) -> Result<()> { + if let Some(db) = conf.hypervisor.get_mut(HYPERVISOR_NAME_DRAGONBALL) { + resolve_path!(db.jailer_path, "dragonball jailer path {} is invalid: {}")?; + + if db.boot_info.kernel.is_empty() { + db.boot_info.kernel = default::DEFAULT_DRAGONBALL_GUEST_KERNEL_IMAGE.to_string(); + } + if db.boot_info.kernel_params.is_empty() { + db.boot_info.kernel_params = + default::DEFAULT_DRAGONBALL_GUEST_KERNEL_PARAMS.to_string(); + } + + if db.cpu_info.default_maxvcpus > default::MAX_DRAGONBALL_VCPUS { + db.cpu_info.default_maxvcpus = default::MAX_DRAGONBALL_VCPUS; + } + + if db.cpu_info.default_vcpus as u32 > db.cpu_info.default_maxvcpus { + db.cpu_info.default_vcpus = db.cpu_info.default_maxvcpus as i32; + } + + if db.machine_info.entropy_source.is_empty() { + db.machine_info.entropy_source = + default::DEFAULT_DRAGONBALL_ENTROPY_SOURCE.to_string(); + } + + if db.memory_info.default_memory == 0 { + db.memory_info.default_memory = default::DEFAULT_DRAGONBALL_MEMORY_SIZE_MB; + } + if db.memory_info.memory_slots == 0 { + db.memory_info.memory_slots = default::DEFAULT_DRAGONBALL_MEMORY_SLOTS; + } + } + Ok(()) + } + + /// Validate the configuration information. 
+ fn validate(&self, conf: &TomlConfig) -> Result<()> { + if let Some(db) = conf.hypervisor.get(HYPERVISOR_NAME_DRAGONBALL) { + if !db.path.is_empty() { + return Err(eother!("Path for dragonball hypervisor should be empty")); + } + if !db.valid_hypervisor_paths.is_empty() { + return Err(eother!( + "Valid hypervisor path for dragonball hypervisor should be empty" + )); + } + if !db.ctlpath.is_empty() { + return Err(eother!("CtlPath for dragonball hypervisor should be empty")); + } + if !db.valid_ctlpaths.is_empty() { + return Err(eother!("CtlPath for dragonball hypervisor should be empty")); + } + validate_path!(db.jailer_path, "dragonball jailer path {} is invalid: {}")?; + if db.enable_iothreads { + return Err(eother!("dragonball hypervisor doesn't support IO threads.")); + } + + if !db.blockdev_info.disable_block_device_use + && db.blockdev_info.block_device_driver != VIRTIO_BLK + && db.blockdev_info.block_device_driver != VIRTIO_BLK_MMIO + && db.blockdev_info.block_device_driver != VIRTIO_PMEM + { + return Err(eother!( + "{} is unsupported block device type.", + db.blockdev_info.block_device_driver + )); + } + + if db.boot_info.kernel.is_empty() { + return Err(eother!( + "Guest kernel image for dragonball hypervisor is empty" + )); + } + if db.boot_info.image.is_empty() { + return Err(eother!( + "Guest boot image for dragonball hypervisor is empty" + )); + } + if !db.boot_info.initrd.is_empty() { + return Err(eother!("Initrd for dragonball hypervisor should be empty")); + } + if !db.boot_info.firmware.is_empty() { + return Err(eother!( + "Firmware for dragonball hypervisor should be empty" + )); + } + + if (db.cpu_info.default_vcpus > 0 + && db.cpu_info.default_vcpus as u32 > default::MAX_DRAGONBALL_VCPUS) + || db.cpu_info.default_maxvcpus > default::MAX_DRAGONBALL_VCPUS + { + return Err(eother!( + "dragonball hypervisor can not support {} vCPUs", + db.cpu_info.default_maxvcpus + )); + } + + if db.device_info.enable_iommu || db.device_info.enable_iommu_platform 
{ + return Err(eother!("dragonball hypervisor does not support vIOMMU")); + } + if db.device_info.hotplug_vfio_on_root_bus + || db.device_info.default_bridges > 0 + || db.device_info.pcie_root_port > 0 + { + return Err(eother!( + "dragonball hypervisor does not support PCI hotplug options" + )); + } + + if !db.machine_info.machine_type.is_empty() { + return Err(eother!( + "dragonball hypervisor does not support machine_type" + )); + } + if !db.machine_info.pflashes.is_empty() { + return Err(eother!("dragonball hypervisor does not support pflashes")); + } + + if db.memory_info.enable_guest_swap { + return Err(eother!( + "dragonball hypervisor doesn't support enable_guest_swap" + )); + } + + if db.security_info.rootless { + return Err(eother!( + "dragonball hypervisor does not support rootless mode" + )); + } + + if let Some(v) = db.shared_fs.shared_fs.as_ref() { + if v != VIRTIO_FS && v != VIRTIO_FS_INLINE { + return Err(eother!("dragonball hypervisor doesn't support {}", v)); + } + } + + if db.memory_info.default_memory < MIN_DRAGONBALL_MEMORY_SIZE_MB { + return Err(eother!( + "dragonball hypervisor has minimal memory limitation {}", + MIN_DRAGONBALL_MEMORY_SIZE_MB + )); + } + } + + Ok(()) + } +} diff --git a/src/libs/kata-types/src/config/hypervisor/mod.rs b/src/libs/kata-types/src/config/hypervisor/mod.rs new file mode 100644 index 0000000000..01ac15f66e --- /dev/null +++ b/src/libs/kata-types/src/config/hypervisor/mod.rs @@ -0,0 +1,1069 @@ +// Copyright (c) 2019-2021 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! Configuration information for hypervisors. +//! +//! The configuration information for hypervisors is complex, and different hypervisor requires +//! different configuration information. To make it flexible and extensible, we build a multi-layer +//! architecture to manipulate hypervisor configuration information. +//! - the vendor layer. The `HypervisorVendor` structure provides hook points for vendors to +//! 
customize the configuration for its deployment. +//! - the hypervisor plugin layer. The hypervisor plugin layer provides hook points for different +//! hypervisors to manipulate the configuration information. +//! - the hypervisor common layer. This layer handles generic logic for all types of hypervisors. +//! +//! These three layers are applied in order. So changes made by the vendor layer will be visible +//! to the hypervisor plugin layer and the common layer. And changes made by the plugin layer will +//! only be visible to the common layer. +//! +//! Ideally the hypervisor configuration information should be split into hypervisor specific +//! part and common part. But the Kata 2.0 has adopted a policy to build a superset for all +//! hypervisors, so let's contain it... + +use std::collections::HashMap; +use std::io::{self, Result}; +use std::path::Path; +use std::sync::{Arc, Mutex}; + +use lazy_static::lazy_static; +use regex::RegexSet; + +use super::{default, ConfigOps, ConfigPlugin, TomlConfig}; +use crate::annotations::KATA_ANNO_CFG_HYPERVISOR_PREFIX; +use crate::{eother, resolve_path, validate_path}; + +mod dragonball; +pub use self::dragonball::{DragonballConfig, HYPERVISOR_NAME_DRAGONBALL}; + +mod qemu; +pub use self::qemu::{QemuConfig, HYPERVISOR_NAME_QEMU}; + +const VIRTIO_BLK: &str = "virtio-blk"; +const VIRTIO_BLK_MMIO: &str = "virtio-mmio"; +const VIRTIO_BLK_CCW: &str = "virtio-blk-ccw"; +const VIRTIO_SCSI: &str = "virtio-scsi"; +const VIRTIO_PMEM: &str = "nvdimm"; +const VIRTIO_9P: &str = "virtio-9p"; +const VIRTIO_FS: &str = "virtio-fs"; +const VIRTIO_FS_INLINE: &str = "inline-virtio-fs"; +const MAX_BRIDGE_SIZE: u32 = 5; + +lazy_static! { + static ref HYPERVISOR_PLUGINS: Mutex>> = + Mutex::new(HashMap::new()); +} + +/// Register a hypervisor plugin with `name`. 
+pub fn register_hypervisor_plugin(name: &str, plugin: Arc) { + let mut hypervisors = HYPERVISOR_PLUGINS.lock().unwrap(); + hypervisors.insert(name.to_string(), plugin); +} + +/// Get the hypervisor plugin with `name`. +pub fn get_hypervisor_plugin(name: &str) -> Option> { + let hypervisors = HYPERVISOR_PLUGINS.lock().unwrap(); + hypervisors.get(name).cloned() +} + +/// Configuration information for block device. +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub struct BlockDeviceInfo { + /// Disable block device from being used for a container's rootfs. + /// + /// In case of a storage driver like devicemapper where a container's root file system is + /// backed by a block device, the block device is passed directly to the hypervisor for + /// performance reasons. This flag prevents the block device from being passed to the + /// hypervisor, shared fs is used instead to pass the rootfs. + #[serde(default)] + pub disable_block_device_use: bool, + + /// Block storage driver to be used for the hypervisor in case the container rootfs is backed + /// by a block device. This is virtio-scsi, virtio-blk or nvdimm. + #[serde(default)] + pub block_device_driver: String, + + /// Specifies cache-related options will be set to block devices or not. + #[serde(default)] + pub block_device_cache_set: bool, + + /// Specifies cache-related options for block devices. + /// + /// Denotes whether use of O_DIRECT (bypass the host page cache) is enabled. + #[serde(default)] + pub block_device_cache_direct: bool, + + /// Specifies cache-related options for block devices. + /// Denotes whether flush requests for the device are ignored. + #[serde(default)] + pub block_device_cache_noflush: bool, + + /// If false and nvdimm is supported, use nvdimm device to plug guest image. + #[serde(default)] + pub disable_image_nvdimm: bool, + + /// The size in MiB will be plused to max memory of hypervisor. + /// + /// It is the memory address space for the NVDIMM devie. 
If set block storage driver + /// (block_device_driver) to "nvdimm", should set memory_offset to the size of block device. + #[serde(default)] + pub memory_offset: u64, + + /// Enable vhost-user storage device, default false + /// + /// Enabling this will result in some Linux reserved block type major range 240-254 being + /// chosen to represent vhost-user devices. + #[serde(default)] + pub enable_vhost_user_store: bool, + + /// The base directory specifically used for vhost-user devices. + /// + /// Its sub-path "block" is used for block devices; "block/sockets" is where we expect + /// vhost-user sockets to live; "block/devices" is where simulated block device nodes for + /// vhost-user devices to live. + #[serde(default)] + pub vhost_user_store_path: String, + + /// List of valid annotations values for the vhost user store path. + /// + /// The default if not set is empty (all annotations rejected.) + #[serde(default)] + pub valid_vhost_user_store_paths: Vec, +} + +impl BlockDeviceInfo { + /// Adjust the configuration information after loading from configuration file. + pub fn adjust_config(&mut self) -> Result<()> { + if self.disable_block_device_use { + self.block_device_driver = "".to_string(); + self.enable_vhost_user_store = false; + self.memory_offset = 0; + return Ok(()); + } + + if self.block_device_driver.is_empty() { + self.block_device_driver = default::DEFAULT_BLOCK_DEVICE_TYPE.to_string(); + } + if self.memory_offset == 0 { + self.memory_offset = default::DEFAULT_BLOCK_NVDIMM_MEM_OFFSET; + } + if !self.enable_vhost_user_store { + self.vhost_user_store_path = String::new(); + } else if self.vhost_user_store_path.is_empty() { + self.vhost_user_store_path = default::DEFAULT_VHOST_USER_STORE_PATH.to_string(); + } + resolve_path!( + self.vhost_user_store_path, + "Invalid vhost-user-store-path {}: {}" + )?; + + Ok(()) + } + + /// Validate the configuration information. 
+ pub fn validate(&self) -> Result<()> { + if self.disable_block_device_use { + return Ok(()); + } + let l = [ + VIRTIO_BLK, + VIRTIO_BLK_CCW, + VIRTIO_BLK_MMIO, + VIRTIO_PMEM, + VIRTIO_SCSI, + ]; + if !l.contains(&self.block_device_driver.as_str()) { + return Err(eother!( + "{} is unsupported block device type.", + self.block_device_driver + )); + } + validate_path!( + self.vhost_user_store_path, + "Invalid vhost-user-store-path {}: {}" + )?; + + Ok(()) + } + + /// Validate path of vhost-user storage backend. + pub fn validate_vhost_user_store_path>(&self, path: P) -> Result<()> { + validate_path_pattern(&self.valid_vhost_user_store_paths, path) + } +} + +/// Guest kernel boot information. +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub struct BootInfo { + /// Path to guest kernel file on host + #[serde(default)] + pub kernel: String, + /// Guest kernel commandline. + #[serde(default)] + pub kernel_params: String, + /// Path to initrd file on host + #[serde(default)] + pub initrd: String, + /// Path to root device on host + #[serde(default)] + pub image: String, + /// Path to the firmware. + /// + /// If you want that qemu uses the default firmware leave this option empty. + #[serde(default)] + pub firmware: String, +} + +impl BootInfo { + /// Adjust the configuration information after loading from configuration file. + pub fn adjust_config(&mut self) -> Result<()> { + resolve_path!(self.kernel, "guest kernel image file {} is invalid: {}")?; + resolve_path!(self.image, "guest boot image file {} is invalid: {}")?; + resolve_path!(self.initrd, "guest initrd image file {} is invalid: {}")?; + resolve_path!(self.firmware, "firmware image file {} is invalid: {}")?; + Ok(()) + } + + /// Validate the configuration information. 
+ pub fn validate(&self) -> Result<()> { + validate_path!(self.kernel, "guest kernel image file {} is invalid: {}")?; + validate_path!(self.image, "guest boot image file {} is invalid: {}")?; + validate_path!(self.initrd, "guest initrd image file {} is invalid: {}")?; + validate_path!(self.firmware, "firmware image file {} is invalid: {}")?; + if !self.image.is_empty() && !self.initrd.is_empty() { + return Err(eother!("Can not configure both initrd and image for boot")); + } + Ok(()) + } + + /// Validate guest kernel image annotaion + pub fn validate_boot_path(&self, path: &str) -> Result<()> { + validate_path!(path, "path {} is invalid{}")?; + Ok(()) + } +} + +/// Virtual CPU configuration information. +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub struct CpuInfo { + /// CPU features, comma-separated list of cpu features to pass to the cpu. + /// For example, `cpu_features = "pmu=off,vmx=off" + #[serde(default)] + pub cpu_features: String, + + /// Default number of vCPUs per SB/VM: + /// - unspecified or 0 --> will be set to @DEFVCPUS@ + /// - < 0 --> will be set to the actual number of physical cores + /// > 0 <= number of physical cores --> will be set to the specified number + /// > number of physical cores --> will be set to the actual number of physical cores + #[serde(default)] + pub default_vcpus: i32, + + /// Default maximum number of vCPUs per SB/VM: + /// - unspecified or == 0 --> will be set to the actual number of physical cores or + /// to the maximum number of vCPUs supported by KVM + /// if that number is exceeded + /// - > 0 <= number of physical cores --> will be set to the specified number + /// - > number of physical cores --> will be set to the actual number of physical cores or + /// to the maximum number of vCPUs supported by KVM + /// if that number is exceeded + /// + /// WARNING: Depending of the architecture, the maximum number of vCPUs supported by KVM is used + /// when the actual number of physical cores is greater than 
it. + /// + /// WARNING: Be aware that this value impacts the virtual machine's memory footprint and CPU + /// the hotplug functionality. For example, `default_maxvcpus = 240` specifies that until 240 + /// vCPUs can be added to a SB/VM, but the memory footprint will be big. Another example, with + /// `default_maxvcpus = 8` the memory footprint will be small, but 8 will be the maximum number + /// of vCPUs supported by the SB/VM. In general, we recommend that you do not edit this + /// variable, unless you know what are you doing. + /// + /// NOTICE: on arm platform with gicv2 interrupt controller, set it to 8. + #[serde(default)] + pub default_maxvcpus: u32, +} + +impl CpuInfo { + /// Adjust the configuration information after loading from configuration file. + pub fn adjust_config(&mut self) -> Result<()> { + let features: Vec<&str> = self.cpu_features.split(',').map(|v| v.trim()).collect(); + self.cpu_features = features.join(","); + Ok(()) + } + + /// Validate the configuration information. + pub fn validate(&self) -> Result<()> { + Ok(()) + } + + /// Get default number of guest vCPUs. + pub fn get_default_vcpus(&self) -> u32 { + let cpus = num_cpus::get() as u32; + if self.default_vcpus < 0 || self.default_vcpus as u32 > cpus { + cpus + } else if self.default_vcpus == 0 { + default::DEFAULT_GUEST_VCPUS + } else { + self.default_vcpus as u32 + } + } + + /// Get default maximal number of guest vCPUs. + pub fn get_default_max_vcpus(&self) -> u32 { + let cpus = num_cpus::get() as u32; + if self.default_maxvcpus == 0 || self.default_maxvcpus > cpus { + cpus + } else { + self.default_maxvcpus + } + } +} + +/// Configuration information for shared filesystem, such virtio-9p and virtio-fs. +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub struct DebugInfo { + /// This option changes the default hypervisor and kernel parameters to enable debug output + /// where available. 
+ #[serde(default)] + pub enable_debug: bool, + + /// Enable dumping information about guest page structures if true. + #[serde(default)] + pub guest_memory_dump_paging: bool, + + /// Set where to save the guest memory dump file. + /// + /// If set, when GUEST_PANICKED event occurred, guest memory will be dumped to host filesystem + /// under guest_memory_dump_path. This directory will be created automatically if it does not + /// exist. The dumped file(also called vmcore) can be processed with crash or gdb. + /// + /// # WARNING: + /// Dump guest's memory can take very long depending on the amount of guest memory and use + /// much disk space. + #[serde(default)] + pub guest_memory_dump_path: String, +} + +impl DebugInfo { + /// Adjust the configuration information after loading from configuration file. + pub fn adjust_config(&mut self) -> Result<()> { + Ok(()) + } + + /// Validate the configuration information. + pub fn validate(&self) -> Result<()> { + Ok(()) + } +} + +/// Virtual machine device configuration information. +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub struct DeviceInfo { + /// Bridges can be used to hot plug devices. + /// + /// Limitations: + /// - Currently only pci bridges are supported + /// - Until 30 devices per bridge can be hot plugged. + /// - Until 5 PCI bridges can be cold plugged per VM. + /// + /// This limitation could be a bug in qemu or in the kernel + /// Default number of bridges per SB/VM: + /// - unspecified or 0 --> will be set to @DEFBRIDGES@ + /// - > 1 <= 5 --> will be set to the specified number + /// - > 5 --> will be set to 5 + #[serde(default)] + pub default_bridges: u32, + + /// VFIO devices are hotplugged on a bridge by default. + /// + /// Enable hotplugging on root bus. This may be required for devices with a large PCI bar, + /// as this is a current limitation with hotplugging on a bridge. 
+ #[serde(default)] + pub hotplug_vfio_on_root_bus: bool, + + /// Before hot plugging a PCIe device, you need to add a pcie_root_port device. + /// + /// Use this parameter when using some large PCI bar devices, such as Nvidia GPU. + /// The value means the number of pcie_root_port. + /// This value is valid when hotplug_vfio_on_root_bus is true and machine_type is "q35" + #[serde(default)] + pub pcie_root_port: u32, + + /// Enable vIOMMU, default false + /// + /// Enabling this will result in the VM having a vIOMMU device. This will also add the + /// following options to the kernel's command line: intel_iommu=on,iommu=pt + #[serde(default)] + pub enable_iommu: bool, + + /// Enable IOMMU_PLATFORM, default false + /// + /// Enabling this will result in the VM device having iommu_platform=on set + #[serde(default)] + pub enable_iommu_platform: bool, +} + +impl DeviceInfo { + /// Adjust the configuration information after loading from configuration file. + pub fn adjust_config(&mut self) -> Result<()> { + if self.default_bridges > MAX_BRIDGE_SIZE { + self.default_bridges = MAX_BRIDGE_SIZE; + } + + Ok(()) + } + + /// Validate the configuration information. + pub fn validate(&self) -> Result<()> { + if self.default_bridges > MAX_BRIDGE_SIZE { + return Err(eother!( + "The configured PCI bridges {} are too many", + self.default_bridges + )); + } + Ok(()) + } +} + +/// Configuration information for virtual machine. +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub struct MachineInfo { + /// Virtual machine model/type. + #[serde(default)] + pub machine_type: String, + + /// Machine accelerators. + /// Comma-separated list of machine accelerators to pass to the hypervisor. + /// For example, `machine_accelerators = "nosmm,nosmbus,nosata,nopit,static-prt,nofw"` + #[serde(default)] + pub machine_accelerators: String, + + /// Add flash image file to VM. + /// + /// The arguments of it should be in format of ["/path/to/flash0.img", "/path/to/flash1.img"]. 
+ #[serde(default)] + pub pflashes: Vec, + + /// Default entropy source. + /// The path to a host source of entropy (including a real hardware RNG). + /// `/dev/urandom` and `/dev/random` are two main options. Be aware that `/dev/random` is a + /// blocking source of entropy. If the host runs out of entropy, the VMs boot time will + /// increase leading to get startup timeouts. The source of entropy `/dev/urandom` is + /// non-blocking and provides a generally acceptable source of entropy. It should work well + /// for pretty much all practical purposes. + #[serde(default)] + pub entropy_source: String, + + /// List of valid annotations values for entropy_source. + /// The default if not set is empty (all annotations rejected.) + #[serde(default)] + pub valid_entropy_sources: Vec, +} + +impl MachineInfo { + /// Adjust the configuration information after loading from configuration file. + pub fn adjust_config(&mut self) -> Result<()> { + let accelerators: Vec<&str> = self + .machine_accelerators + .split(',') + .map(|v| v.trim()) + .collect(); + self.machine_accelerators = accelerators.join(","); + + for pflash in self.pflashes.iter_mut() { + resolve_path!(*pflash, "Flash image file {} is invalide: {}")?; + } + resolve_path!(self.entropy_source, "Entropy source {} is invalid: {}")?; + + Ok(()) + } + + /// Validate the configuration information. + pub fn validate(&self) -> Result<()> { + for pflash in self.pflashes.iter() { + validate_path!(*pflash, "Flash image file {} is invalid: {}")?; + } + validate_path!(self.entropy_source, "Entropy source {} is invalid: {}")?; + Ok(()) + } + + /// Validate path of entropy source. + pub fn validate_entropy_source>(&self, path: P) -> Result<()> { + validate_path_pattern(&self.valid_entropy_sources, path) + } +} + +/// Virtual machine memory configuration information. +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub struct MemoryInfo { + /// Default memory size in MiB for SB/VM. 
+ #[serde(default)] + pub default_memory: u32, + + /// Default memory slots per SB/VM. + /// + /// This is will determine the times that memory will be hotadded to sandbox/VM. + #[serde(default)] + pub memory_slots: u32, + + /// Enable file based guest memory support. + /// + /// The default is an empty string which will disable this feature. In the case of virtio-fs, + /// this is enabled automatically and '/dev/shm' is used as the backing folder. This option + /// will be ignored if VM templating is enabled. + #[serde(default)] + pub file_mem_backend: String, + + /// List of valid annotations values for the file_mem_backend annotation + /// + /// The default if not set is empty (all annotations rejected.) + #[serde(default)] + pub valid_file_mem_backends: Vec, + + /// Enable pre allocation of VM RAM, default false + /// + /// Enabling this will result in lower container density as all of the memory will be allocated + /// and locked. This is useful when you want to reserve all the memory upfront or in the cases + /// where you want memory latencies to be very predictable + #[serde(default)] + pub enable_mem_prealloc: bool, + + /// Enable huge pages for VM RAM, default false + /// + /// Enabling this will result in the VM memory being allocated using huge pages. This is useful + /// when you want to use vhost-user network stacks within the container. This will automatically + /// result in memory pre allocation. + #[serde(default)] + pub enable_hugepages: bool, + + /// Specifies virtio-mem will be enabled or not. + /// + /// Please note that this option should be used with the command + /// "echo 1 > /proc/sys/vm/overcommit_memory". + #[serde(default)] + pub enable_virtio_mem: bool, + + /// Enable swap of vm memory. Default false. + /// + /// The behaviour is undefined if mem_prealloc is also set to true + #[serde(default)] + pub enable_swap: bool, + + /// Enable swap in the guest. Default false. 
+ /// + /// When enable_guest_swap is enabled, insert a raw file to the guest as the swap device if the + /// swappiness of a container (set by annotation "io.katacontainers.container.resource.swappiness") + /// is bigger than 0. + /// + /// The size of the swap device should be swap_in_bytes (set by annotation + /// "io.katacontainers.container.resource.swap_in_bytes") - memory_limit_in_bytes. + /// If swap_in_bytes is not set, the size should be memory_limit_in_bytes. + /// If swap_in_bytes and memory_limit_in_bytes is not set, the size should be default_memory. + #[serde(default)] + pub enable_guest_swap: bool, +} + +impl MemoryInfo { + /// Adjust the configuration information after loading from configuration file. + pub fn adjust_config(&mut self) -> Result<()> { + resolve_path!( + self.file_mem_backend, + "Memory backend file {} is invalid: {}" + )?; + Ok(()) + } + + /// Validate the configuration information. + pub fn validate(&self) -> Result<()> { + validate_path!( + self.file_mem_backend, + "Memory backend file {} is invalid: {}" + )?; + if self.default_memory == 0 { + return Err(eother!("Configured memory size for guest VM is zero")); + } + if self.memory_slots == 0 { + return Err(eother!("Configured memory slots for guest VM are zero")); + } + + Ok(()) + } + + /// Validate path of memory backend files. + pub fn validate_memory_backend_path>(&self, path: P) -> Result<()> { + validate_path_pattern(&self.valid_file_mem_backends, path) + } +} + +/// Configuration information for virtual machine. +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub struct NetworkInfo { + /// If vhost-net backend for virtio-net is not desired, set to true. + /// + /// Default is false, which trades off security (vhost-net runs ring0) for network I/O + /// performance. + #[serde(default)] + pub disable_vhost_net: bool, + + /// Use rx Rate Limiter to control network I/O inbound bandwidth(size in bits/sec for SB/VM). 
+ /// + /// In Qemu, we use classful qdiscs HTB(Hierarchy Token Bucket) to discipline traffic. + /// Default 0-sized value means unlimited rate. + #[serde(default)] + pub rx_rate_limiter_max_rate: u64, + + /// Use tx Rate Limiter to control network I/O outbound bandwidth(size in bits/sec for SB/VM). + /// + /// In Qemu, we use classful qdiscs HTB(Hierarchy Token Bucket) and ifb(Intermediate Functional + /// Block) to discipline traffic. + /// Default 0-sized value means unlimited rate. + #[serde(default)] + pub tx_rate_limiter_max_rate: u64, + + /// network queues + #[serde(default)] + pub network_queues: u32, +} + +impl NetworkInfo { + /// Adjust the configuration information after loading from configuration file. + pub fn adjust_config(&mut self) -> Result<()> { + Ok(()) + } + + /// Validate the configuration information. + pub fn validate(&self) -> Result<()> { + Ok(()) + } +} + +/// Configuration information for virtual machine. +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub struct SecurityInfo { + /// Enable running QEMU VMM as a non-root user. + /// + /// By default QEMU VMM run as root. When this is set to true, QEMU VMM process runs as + /// a non-root random user. See documentation for the limitations of this mode. + #[serde(default)] + pub rootless: bool, + + /// Disable seccomp. + #[serde(default)] + pub disable_seccomp: bool, + + /// Enable confidential guest support. + /// + /// Toggling that setting may trigger different hardware features, ranging from memory + /// encryption to both memory and CPU-state encryption and integrity.The Kata Containers + /// runtime dynamically detects the available feature set and aims at enabling the largest + /// possible one. + #[serde(default)] + pub confidential_guest: bool, + + /// Path to OCI hook binaries in the *guest rootfs*. + /// + /// This does not affect host-side hooks which must instead be added to the OCI spec passed to + /// the runtime. 
+ /// + /// You can create a rootfs with hooks by customizing the osbuilder scripts: + /// https://github.com/kata-containers/kata-containers/tree/main/tools/osbuilder + /// + /// Hooks must be stored in a subdirectory of guest_hook_path according to their hook type, + /// i.e. "guest_hook_path/{prestart,poststart,poststop}". The agent will scan these directories + /// for executable files and add them, in lexicographical order, to the lifecycle of the guest + /// container. + /// + /// Hooks are executed in the runtime namespace of the guest. See the official documentation: + /// https://github.com/opencontainers/runtime-spec/blob/v1.0.1/config.md#posix-platform-hooks + /// + /// Warnings will be logged if any error is encountered while scanning for hooks, but it will + /// not abort container execution. + #[serde(default)] + pub guest_hook_path: String, + + /// List of valid annotation names for the hypervisor. + /// + /// Each member of the list is a regular expression, which is the base name of the annotation, + /// e.g. "path" for io.katacontainers.config.hypervisor.path" + #[serde(default)] + pub enable_annotations: Vec, +} + +impl SecurityInfo { + /// Adjust the configuration information after loading from configuration file. + pub fn adjust_config(&mut self) -> Result<()> { + if self.guest_hook_path.is_empty() { + self.guest_hook_path = default::DEFAULT_GUEST_HOOK_PATH.to_string(); + } + Ok(()) + } + + /// Validate the configuration information. + pub fn validate(&self) -> Result<()> { + Ok(()) + } + + /// Check whether annotation key is enabled or not. 
+ pub fn is_annotation_enabled(&self, path: &str) -> bool { + if !path.starts_with(KATA_ANNO_CFG_HYPERVISOR_PREFIX) { + return false; + } + let pos = KATA_ANNO_CFG_HYPERVISOR_PREFIX.len(); + let key = &path[pos..]; + if let Ok(set) = RegexSet::new(&self.enable_annotations) { + return set.is_match(key); + } + false + } + + /// Validate path + pub fn validate_path(&self, path: &str) -> Result<()> { + validate_path!(path, "path {} is invalid{}")?; + Ok(()) + } +} + +/// Configuration information for shared filesystem, such virtio-9p and virtio-fs. +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub struct SharedFsInfo { + /// Shared file system type: + /// - virtio-fs (default) + /// - virtio-9p` + pub shared_fs: Option, + + /// Path to vhost-user-fs daemon. + #[serde(default)] + pub virtio_fs_daemon: String, + + /// List of valid annotations values for the virtiofs daemon + /// The default if not set is empty (all annotations rejected.) + #[serde(default)] + pub valid_virtio_fs_daemon_paths: Vec, + + /// Extra args for virtiofsd daemon + /// + /// Format example: + /// ["-o", "arg1=xxx,arg2", "-o", "hello world", "--arg3=yyy"] + /// + /// see `virtiofsd -h` for possible options. + #[serde(default)] + pub virtio_fs_extra_args: Vec, + + /// Cache mode: + /// - none: Metadata, data, and pathname lookup are not cached in guest. They are always + /// fetched from host and any changes are immediately pushed to host. + /// - auto: Metadata and pathname lookup cache expires after a configured amount of time + /// (default is 1 second). Data is cached while the file is open (close to open consistency). + /// - always: Metadata, data, and pathname lookup are cached in guest and never expire. + #[serde(default)] + pub virtio_fs_cache: String, + + /// Default size of DAX cache in MiB + #[serde(default)] + pub virtio_fs_cache_size: u32, + + /// Enable virtio-fs DAX window if true. 
+ #[serde(default)] + pub virtio_fs_is_dax: bool, + + /// This is the msize used for 9p shares. It is the number of bytes used for 9p packet payload. + #[serde(default)] + pub msize_9p: u32, +} + +impl SharedFsInfo { + /// Adjust the configuration information after loading from configuration file. + pub fn adjust_config(&mut self) -> Result<()> { + if self.shared_fs.as_deref() == Some("") { + self.shared_fs = Some(default::DEFAULT_SHARED_FS_TYPE.to_string()); + } + match self.shared_fs.as_deref() { + Some(VIRTIO_FS) => self.adjust_virtio_fs(false)?, + Some(VIRTIO_FS_INLINE) => self.adjust_virtio_fs(true)?, + Some(VIRTIO_9P) => { + if self.msize_9p == 0 { + self.msize_9p = default::DEFAULT_SHARED_9PFS_SIZE_MB; + } + } + _ => {} + } + + Ok(()) + } + + /// Validate the configuration information. + pub fn validate(&self) -> Result<()> { + match self.shared_fs.as_deref() { + None => Ok(()), + Some(VIRTIO_FS) => self.validate_virtio_fs(false), + Some(VIRTIO_FS_INLINE) => self.validate_virtio_fs(true), + Some(VIRTIO_9P) => { + if self.msize_9p < default::MIN_SHARED_9PFS_SIZE_MB + || self.msize_9p > default::MAX_SHARED_9PFS_SIZE_MB + { + return Err(eother!( + "Invalid 9p configuration msize 0x{:x}, min value is 0x{:x}, max value is 0x{:x}", + self.msize_9p,default::MIN_SHARED_9PFS_SIZE_MB, default::MAX_SHARED_9PFS_SIZE_MB + )); + } + Ok(()) + } + Some(v) => Err(eother!("Invalid shared_fs type {}", v)), + } + } + + /// Validate path of virtio-fs daemon, especially for annotations. 
+    pub fn validate_virtiofs_daemon_path<P: AsRef<Path>>(&self, path: P) -> Result<()> {
+        validate_path_pattern(&self.valid_virtio_fs_daemon_paths, path)
+    }
+
+    fn adjust_virtio_fs(&mut self, _inline: bool) -> Result<()> {
+        resolve_path!(
+            self.virtio_fs_daemon,
+            "Virtio-fs daemon path {} is invalid: {}"
+        )?;
+        if self.virtio_fs_cache.is_empty() {
+            self.virtio_fs_cache = default::DEFAULT_VIRTIO_FS_CACHE_MODE.to_string();
+        }
+        if self.virtio_fs_is_dax && self.virtio_fs_cache_size == 0 {
+            self.virtio_fs_cache_size = default::DEFAULT_VIRTIO_FS_DAX_SIZE_MB;
+        }
+        if !self.virtio_fs_is_dax && self.virtio_fs_cache_size != 0 {
+            self.virtio_fs_is_dax = true;
+        }
+        Ok(())
+    }
+
+    fn validate_virtio_fs(&self, inline: bool) -> Result<()> {
+        if inline && !self.virtio_fs_daemon.is_empty() {
+            return Err(eother!(
+                "Executable path for inline-virtio-fs is not empty: {}",
+                &self.virtio_fs_daemon
+            ));
+        }
+        validate_path!(
+            self.virtio_fs_daemon,
+            "Virtio-fs daemon path {} is invalid: {}"
+        )?;
+
+        let l = ["none", "auto", "always"];
+
+        if !l.contains(&self.virtio_fs_cache.as_str()) {
+            return Err(eother!(
+                "Invalid virtio-fs cache mode: {}",
+                &self.virtio_fs_cache
+            ));
+        }
+        if self.virtio_fs_is_dax && self.virtio_fs_cache_size == 0 {
+            return Err(eother!(
+                "Invalid virtio-fs DAX window size: {}",
+                &self.virtio_fs_cache_size
+            ));
+        }
+        Ok(())
+    }
+}
+
+/// Common configuration information for hypervisors.
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub struct Hypervisor {
+    /// Path to the hypervisor executable.
+    #[serde(default)]
+    pub path: String,
+    /// List of valid annotations values for the hypervisor.
+    ///
+    /// Each member of the list is a path pattern as described by glob(3). The default if not set
+    /// is empty (all annotations rejected.)
+    #[serde(default)]
+    pub valid_hypervisor_paths: Vec<String>,
+
+    /// Hypervisor control executable path.
+ #[serde(default)] + pub ctlpath: String, + /// List of valid annotations values for the hypervisor control executable. + /// + /// Each member of the list is a path pattern as described by glob(3). The default if not set + /// is empty (all annotations rejected.) + #[serde(default)] + pub valid_ctlpaths: Vec, + + /// Control channel path. + #[serde(default)] + pub jailer_path: String, + /// List of valid annotations values for the hypervisor jailer path. + /// + /// Each member of the list is a path pattern as described by glob(3). The default if not set + /// is empty (all annotations rejected.) + #[serde(default)] + pub valid_jailer_paths: Vec, + + /// Disable the customizations done in the runtime when it detects that it is running on top + /// a VMM. This will result in the runtime behaving as it would when running on bare metal. + #[serde(default)] + pub disable_nesting_checks: bool, + + /// Enable iothreads (data-plane) to be used. This causes IO to be handled in a separate IO + /// thread. This is currently only implemented for SCSI. + #[serde(default)] + pub enable_iothreads: bool, + + /// Block device configuration information. + #[serde(default, flatten)] + pub blockdev_info: BlockDeviceInfo, + + /// Guest system boot information. + #[serde(default, flatten)] + pub boot_info: BootInfo, + + /// Guest virtual CPU configuration information. + #[serde(default, flatten)] + pub cpu_info: CpuInfo, + + /// Debug configuration information. + #[serde(default, flatten)] + pub debug_info: DebugInfo, + + /// Device configuration information. + #[serde(default, flatten)] + pub device_info: DeviceInfo, + + /// Virtual machine configuration information. + #[serde(default, flatten)] + pub machine_info: MachineInfo, + + /// Virtual machine memory configuration information. + #[serde(default, flatten)] + pub memory_info: MemoryInfo, + + /// Network configuration information. 
+    #[serde(default, flatten)]
+    pub network_info: NetworkInfo,
+
+    /// Security configuration information.
+    #[serde(default, flatten)]
+    pub security_info: SecurityInfo,
+
+    /// Shared file system configuration information.
+    #[serde(default, flatten)]
+    pub shared_fs: SharedFsInfo,
+
+    /// Vendor customized runtime configuration.
+    #[serde(default, flatten)]
+    pub vendor: HypervisorVendor,
+}
+
+impl Hypervisor {
+    /// Validate path of hypervisor executable.
+    pub fn validate_hypervisor_path<P: AsRef<Path>>(&self, path: P) -> Result<()> {
+        validate_path_pattern(&self.valid_hypervisor_paths, path)
+    }
+
+    /// Validate path of hypervisor control executable.
+    pub fn validate_hypervisor_ctlpath<P: AsRef<Path>>(&self, path: P) -> Result<()> {
+        validate_path_pattern(&self.valid_ctlpaths, path)
+    }
+
+    /// Validate path of jailer executable.
+    pub fn validate_jailer_path<P: AsRef<Path>>(&self, path: P) -> Result<()> {
+        validate_path_pattern(&self.valid_jailer_paths, path)
+    }
+}
+
+impl ConfigOps for Hypervisor {
+    fn adjust_config(conf: &mut TomlConfig) -> Result<()> {
+        HypervisorVendor::adjust_config(conf)?;
+        let hypervisors: Vec<String> = conf.hypervisor.keys().cloned().collect();
+        for hypervisor in hypervisors.iter() {
+            if let Some(plugin) = get_hypervisor_plugin(hypervisor) {
+                plugin.adjust_config(conf)?;
+                // `hypervisor` is taken from the map's own keys, so the lookup should
+                // succeed; fail gracefully instead of panicking if it is missing.
+                let hv = conf.hypervisor.get_mut(hypervisor).ok_or_else(|| {
+                    io::Error::new(io::ErrorKind::NotFound, "hypervisor not found".to_string())
+                })?;
+                hv.blockdev_info.adjust_config()?;
+                hv.boot_info.adjust_config()?;
+                hv.cpu_info.adjust_config()?;
+                hv.debug_info.adjust_config()?;
+                hv.device_info.adjust_config()?;
+                hv.machine_info.adjust_config()?;
+                hv.memory_info.adjust_config()?;
+                hv.network_info.adjust_config()?;
+                hv.security_info.adjust_config()?;
+                hv.shared_fs.adjust_config()?;
+            } else {
+                return Err(eother!("Can not find plugin for hypervisor {}", hypervisor));
+            }
+        }
+
+        Ok(())
+    }
+
+    fn validate(conf: &TomlConfig) -> Result<()> {
+        HypervisorVendor::validate(conf)?;
+
+        let hypervisors: Vec<String> = conf.hypervisor.keys().cloned().collect();
+        for hypervisor in hypervisors.iter() {
+            if let Some(plugin) = get_hypervisor_plugin(hypervisor) {
+                plugin.validate(conf)?;
+
+                // Safe to unwrap() because `hypervisor` is a valid key in the hash map.
+                let hv = conf.hypervisor.get(hypervisor).unwrap();
+                hv.blockdev_info.validate()?;
+                hv.boot_info.validate()?;
+                hv.cpu_info.validate()?;
+                hv.debug_info.validate()?;
+                hv.device_info.validate()?;
+                hv.machine_info.validate()?;
+                hv.memory_info.validate()?;
+                hv.network_info.validate()?;
+                hv.security_info.validate()?;
+                hv.shared_fs.validate()?;
+                validate_path!(hv.path, "Hypervisor binary path `{}` is invalid: {}")?;
+                validate_path!(
+                    hv.ctlpath,
+                    "Hypervisor control executable `{}` is invalid: {}"
+                )?;
+                validate_path!(hv.jailer_path, "Hypervisor jailer path `{}` is invalid: {}")?;
+            } else {
+                return Err(eother!("Can not find plugin for hypervisor {}", hypervisor));
+            }
+        }
+
+        Ok(())
+    }
+}
+
+#[cfg(not(feature = "enable-vendor"))]
+mod vendor {
+    use super::*;
+
+    /// Vendor customization runtime configuration.
+ #[derive(Clone, Debug, Default, Deserialize, Serialize)] + pub struct HypervisorVendor {} + + impl ConfigOps for HypervisorVendor {} +} + +#[cfg(feature = "enable-vendor")] +#[path = "vendor.rs"] +mod vendor; + +pub use self::vendor::HypervisorVendor; +use crate::config::validate_path_pattern; +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_register_plugin() { + let db = DragonballConfig::new(); + db.register(); + + let db = Arc::new(DragonballConfig::new()); + register_hypervisor_plugin("dragonball", db); + + assert!(get_hypervisor_plugin("dragonball").is_some()); + assert!(get_hypervisor_plugin("dragonball2").is_none()); + } +} diff --git a/src/libs/kata-types/src/config/hypervisor/qemu.rs b/src/libs/kata-types/src/config/hypervisor/qemu.rs new file mode 100644 index 0000000000..945abc4b48 --- /dev/null +++ b/src/libs/kata-types/src/config/hypervisor/qemu.rs @@ -0,0 +1,150 @@ +// Copyright (c) 2019-2021 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::io::Result; +use std::path::Path; +use std::sync::Arc; + +use super::{default, register_hypervisor_plugin}; + +use crate::config::default::MAX_QEMU_VCPUS; +use crate::config::default::MIN_QEMU_MEMORY_SIZE_MB; + +use crate::config::hypervisor::VIRTIO_BLK_MMIO; +use crate::config::{ConfigPlugin, TomlConfig}; +use crate::{eother, resolve_path, validate_path}; + +/// Hypervisor name for qemu, used to index `TomlConfig::hypervisor`. +pub const HYPERVISOR_NAME_QEMU: &str = "qemu"; + +/// Configuration information for qemu. +#[derive(Default, Debug)] +pub struct QemuConfig {} + +impl QemuConfig { + /// Create a new instance of `QemuConfig`. + pub fn new() -> Self { + QemuConfig {} + } + + /// Register the qemu plugin. 
+ pub fn register(self) { + let plugin = Arc::new(self); + register_hypervisor_plugin(HYPERVISOR_NAME_QEMU, plugin); + } +} + +impl ConfigPlugin for QemuConfig { + fn get_max_cpus(&self) -> u32 { + MAX_QEMU_VCPUS + } + + fn get_min_memory(&self) -> u32 { + MIN_QEMU_MEMORY_SIZE_MB + } + fn name(&self) -> &str { + HYPERVISOR_NAME_QEMU + } + + /// Adjust the configuration information after loading from configuration file. + fn adjust_config(&self, conf: &mut TomlConfig) -> Result<()> { + if let Some(qemu) = conf.hypervisor.get_mut(HYPERVISOR_NAME_QEMU) { + if qemu.path.is_empty() { + qemu.path = default::DEFAULT_QEMU_BINARY_PATH.to_string(); + } + resolve_path!(qemu.path, "Qemu binary path `{}` is invalid: {}")?; + if qemu.ctlpath.is_empty() { + qemu.ctlpath = default::DEFAULT_QEMU_CONTROL_PATH.to_string(); + } + resolve_path!(qemu.ctlpath, "Qemu ctlpath `{}` is invalid: {}")?; + + if qemu.boot_info.kernel.is_empty() { + qemu.boot_info.kernel = default::DEFAULT_QEMU_GUEST_KERNEL_IMAGE.to_string(); + } + if qemu.boot_info.kernel_params.is_empty() { + qemu.boot_info.kernel_params = + default::DEFAULT_QEMU_GUEST_KERNEL_PARAMS.to_string(); + } + if qemu.boot_info.firmware.is_empty() { + qemu.boot_info.firmware = default::DEFAULT_QEMU_FIRMWARE_PATH.to_string(); + } + + if qemu.device_info.default_bridges == 0 { + qemu.device_info.default_bridges = default::DEFAULT_QEMU_PCI_BRIDGES; + } + + if qemu.machine_info.machine_type.is_empty() { + qemu.machine_info.machine_type = default::DEFAULT_QEMU_MACHINE_TYPE.to_string(); + } + if qemu.machine_info.entropy_source.is_empty() { + qemu.machine_info.entropy_source = default::DEFAULT_QEMU_ENTROPY_SOURCE.to_string(); + } + + if qemu.memory_info.default_memory == 0 { + qemu.memory_info.default_memory = default::DEFAULT_QEMU_MEMORY_SIZE_MB; + } + if qemu.memory_info.memory_slots == 0 { + qemu.memory_info.memory_slots = default::DEFAULT_QEMU_MEMORY_SLOTS; + } + } + + Ok(()) + } + + /// Validate the configuration information. 
+ fn validate(&self, conf: &TomlConfig) -> Result<()> { + if let Some(qemu) = conf.hypervisor.get(HYPERVISOR_NAME_QEMU) { + validate_path!(qemu.path, "QEMU binary path `{}` is invalid: {}")?; + validate_path!(qemu.ctlpath, "QEMU control path `{}` is invalid: {}")?; + if !qemu.jailer_path.is_empty() { + return Err(eother!("Path for QEMU jailer should be empty")); + } + if !qemu.valid_jailer_paths.is_empty() { + return Err(eother!("Valid Qemu jailer path list should be empty")); + } + + if !qemu.blockdev_info.disable_block_device_use + && qemu.blockdev_info.block_device_driver == VIRTIO_BLK_MMIO + { + return Err(eother!("Qemu doesn't support virtio-blk-mmio")); + } + + if qemu.boot_info.kernel.is_empty() { + return Err(eother!("Guest kernel image for qemu is empty")); + } + if qemu.boot_info.image.is_empty() && qemu.boot_info.initrd.is_empty() { + return Err(eother!( + "Both guest boot image and initrd for qemu are empty" + )); + } + + if (qemu.cpu_info.default_vcpus > 0 + && qemu.cpu_info.default_vcpus as u32 > default::MAX_QEMU_VCPUS) + || qemu.cpu_info.default_maxvcpus > default::MAX_QEMU_VCPUS + { + return Err(eother!( + "Qemu hypervisor can not support {} vCPUs", + qemu.cpu_info.default_maxvcpus + )); + } + + if qemu.device_info.default_bridges > default::MAX_QEMU_PCI_BRIDGES { + return Err(eother!( + "Qemu hypervisor can not support {} PCI bridges", + qemu.device_info.default_bridges + )); + } + + if qemu.memory_info.default_memory < MIN_QEMU_MEMORY_SIZE_MB { + return Err(eother!( + "Qemu hypervisor has minimal memory limitation {}", + MIN_QEMU_MEMORY_SIZE_MB + )); + } + } + + Ok(()) + } +} diff --git a/src/libs/kata-types/src/config/hypervisor/vendor.rs b/src/libs/kata-types/src/config/hypervisor/vendor.rs new file mode 100644 index 0000000000..39f5779a45 --- /dev/null +++ b/src/libs/kata-types/src/config/hypervisor/vendor.rs @@ -0,0 +1,14 @@ +// Copyright (c) 2021 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! 
A sample for vendor to customize the hypervisor implementation. + +use super::*; + +/// Vendor customization runtime configuration. +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub struct HypervisorVendor {} + +impl ConfigOps for HypervisorVendor {} diff --git a/src/libs/kata-types/src/config/mod.rs b/src/libs/kata-types/src/config/mod.rs new file mode 100644 index 0000000000..52c9a0e3c0 --- /dev/null +++ b/src/libs/kata-types/src/config/mod.rs @@ -0,0 +1,307 @@ +// Copyright (c) 2019-2021 Ant Financial +// Copyright (c) 2019-2021 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::collections::HashMap; +use std::fs; +use std::io::{self, Result}; +use std::path::{Path, PathBuf}; +use std::sync::{Arc, Mutex}; +use std::u32; + +use lazy_static::lazy_static; + +use crate::{eother, sl}; + +/// Default configuration values. +pub mod default; + +mod agent; +pub mod hypervisor; + +pub use self::agent::Agent; +pub use self::hypervisor::{ + BootInfo, DragonballConfig, Hypervisor, QemuConfig, HYPERVISOR_NAME_DRAGONBALL, + HYPERVISOR_NAME_QEMU, +}; + +mod runtime; +pub use self::runtime::{Runtime, RuntimeVendor}; + +/// Trait to manipulate global Kata configuration information. +pub trait ConfigPlugin: Send + Sync { + /// Get the plugin name. + fn name(&self) -> &str; + + /// Adjust the configuration information after loading from configuration file. + fn adjust_config(&self, _conf: &mut TomlConfig) -> Result<()>; + + /// Validate the configuration information. + fn validate(&self, _conf: &TomlConfig) -> Result<()>; + + /// Get the minmum memory for hypervisor + fn get_min_memory(&self) -> u32; + + /// Get the max defualt cpus + fn get_max_cpus(&self) -> u32; +} + +/// Trait to manipulate Kata configuration information. +pub trait ConfigOps { + /// Adjust the configuration information after loading from configuration file. + fn adjust_config(_conf: &mut TomlConfig) -> Result<()> { + Ok(()) + } + + /// Validate the configuration information. 
+    fn validate(_conf: &TomlConfig) -> Result<()> {
+        Ok(())
+    }
+}
+
+/// Trait to manipulate global Kata configuration information.
+pub trait ConfigObjectOps {
+    /// Adjust the configuration information after loading from configuration file.
+    fn adjust_config(&mut self) -> Result<()> {
+        Ok(())
+    }
+
+    /// Validate the configuration information.
+    fn validate(&self) -> Result<()> {
+        Ok(())
+    }
+}
+
+/// Kata configuration information.
+#[derive(Debug, Default, Deserialize, Serialize)]
+pub struct TomlConfig {
+    /// Configuration information for agents.
+    #[serde(default)]
+    pub agent: HashMap<String, Agent>,
+    /// Configuration information for hypervisors.
+    #[serde(default)]
+    pub hypervisor: HashMap<String, Hypervisor>,
+    /// Kata runtime configuration information.
+    #[serde(default)]
+    pub runtime: Runtime,
+}
+
+impl TomlConfig {
+    /// Load Kata configuration information from configuration files.
+    ///
+    /// If `config_file` is valid, it will be used, otherwise a built-in default path list will be
+    /// scanned.
+    pub fn load_from_file<P: AsRef<Path>>(config_file: P) -> Result<(TomlConfig, PathBuf)> {
+        let file_path = if !config_file.as_ref().as_os_str().is_empty() {
+            fs::canonicalize(config_file)?
+        } else {
+            Self::get_default_config_file()?
+        };
+
+        info!(
+            sl!(),
+            "load configuration from: {}",
+            file_path.to_string_lossy()
+        );
+        let content = fs::read_to_string(&file_path)?;
+        let config = Self::load(&content)?;
+
+        Ok((config, file_path))
+    }
+
+    /// Load raw Kata configuration information from configuration files.
+    ///
+    /// If `config_file` is valid, it will be used, otherwise a built-in default path list will be
+    /// scanned.
+    pub fn load_raw_from_file<P: AsRef<Path>>(config_file: P) -> Result<(TomlConfig, PathBuf)> {
+        let file_path = if !config_file.as_ref().as_os_str().is_empty() {
+            fs::canonicalize(config_file)?
+        } else {
+            Self::get_default_config_file()?
+        };
+
+        info!(
+            sl!(),
+            "load configuration from: {}",
+            file_path.to_string_lossy()
+        );
+        let content = fs::read_to_string(&file_path)?;
+        let config: TomlConfig = toml::from_str(&content)?;
+
+        Ok((config, file_path))
+    }
+
+    /// Load Kata configuration information from string.
+    pub fn load(content: &str) -> Result<TomlConfig> {
+        let mut config: TomlConfig = toml::from_str(content)?;
+        Hypervisor::adjust_config(&mut config)?;
+        Runtime::adjust_config(&mut config)?;
+        Agent::adjust_config(&mut config)?;
+        info!(sl!(), "get kata config: {:?}", config);
+        Ok(config)
+    }
+
+    /// Validate Kata configuration information.
+    pub fn validate(&self) -> Result<()> {
+        Hypervisor::validate(self)?;
+        Runtime::validate(self)?;
+        Agent::validate(self)?;
+
+        Ok(())
+    }
+
+    /// Probe configuration file according to the default configuration file list.
+    fn get_default_config_file() -> Result<PathBuf> {
+        for f in default::DEFAULT_RUNTIME_CONFIGURATIONS.iter() {
+            if let Ok(path) = fs::canonicalize(f) {
+                return Ok(path);
+            }
+        }
+
+        Err(io::Error::from(io::ErrorKind::NotFound))
+    }
+}
+
+/// Validate the `path` matches one of the patterns in `patterns`.
+///
+/// Each member in `patterns` is a path pattern as described by glob(3)
+pub fn validate_path_pattern<P: AsRef<Path>>(patterns: &[String], path: P) -> Result<()> {
+    let path = path
+        .as_ref()
+        .to_str()
+        .ok_or_else(|| eother!("Invalid path {}", path.as_ref().to_string_lossy()))?;
+    for p in patterns.iter() {
+        if let Ok(glob) = glob::Pattern::new(p) {
+            if glob.matches(path) {
+                return Ok(());
+            }
+        }
+    }
+
+    Err(eother!("Path {} is not permitted", path))
+}
+
+/// Kata configuration information.
+pub struct KataConfig {
+    config: Option<TomlConfig>,
+    agent: String,
+    hypervisor: String,
+}
+
+impl KataConfig {
+    /// Set the default Kata configuration object.
+    ///
+    /// The default Kata configuration information is loaded from system configuration file.
+    pub fn set_default_config(config: Option<TomlConfig>, hypervisor: &str, agent: &str) {
+        let kata = KataConfig {
+            config,
+            agent: agent.to_string(),
+            hypervisor: hypervisor.to_string(),
+        };
+        *KATA_DEFAULT_CONFIG.lock().unwrap() = Arc::new(kata);
+    }
+
+    /// Get the default Kata configuration object.
+    ///
+    /// The default Kata configuration information is loaded from system configuration file.
+    pub fn get_default_config() -> Arc<KataConfig> {
+        KATA_DEFAULT_CONFIG.lock().unwrap().clone()
+    }
+
+    /// Set the active Kata configuration object.
+    ///
+    /// The active Kata configuration information is default configuration information patched
+    /// with tunable configuration information from annotations.
+    pub fn set_active_config(config: Option<TomlConfig>, hypervisor: &str, agent: &str) {
+        let kata = KataConfig {
+            config,
+            agent: agent.to_string(),
+            hypervisor: hypervisor.to_string(),
+        };
+        *KATA_ACTIVE_CONFIG.lock().unwrap() = Arc::new(kata);
+    }
+
+    /// Get the active Kata configuration object.
+    ///
+    /// The active Kata configuration information is default configuration information patched
+    /// with tunable configuration information from annotations.
+    pub fn get_active_config() -> Arc<KataConfig> {
+        KATA_ACTIVE_CONFIG.lock().unwrap().clone()
+    }
+    /// Get the config in use.
+    pub fn get_config(&self) -> &TomlConfig {
+        self.config.as_ref().unwrap()
+    }
+
+    /// Get the agent configuration in use.
+    pub fn get_agent(&self) -> Option<&Agent> {
+        if !self.agent.is_empty() {
+            self.config.as_ref().unwrap().agent.get(&self.agent)
+        } else {
+            None
+        }
+    }
+
+    /// Get the hypervisor configuration in use.
+    pub fn get_hypervisor(&self) -> Option<&Hypervisor> {
+        if !self.hypervisor.is_empty() {
+            self.config
+                .as_ref()
+                .unwrap()
+                .hypervisor
+                .get(&self.hypervisor)
+        } else {
+            None
+        }
+    }
+}
+
+lazy_static! {
+    static ref KATA_DEFAULT_CONFIG: Mutex<Arc<KataConfig>> = {
+        let config = Some(TomlConfig::load("").unwrap());
+        let kata = KataConfig {
+            config,
+            agent: String::new(),
+            hypervisor: String::new(),
+        };
+
+        Mutex::new(Arc::new(kata))
+    };
+    static ref KATA_ACTIVE_CONFIG: Mutex<Arc<KataConfig>> = {
+        let config = Some(TomlConfig::load("").unwrap());
+        let kata = KataConfig {
+            config,
+            agent: String::new(),
+            hypervisor: String::new(),
+        };
+        Mutex::new(Arc::new(kata))
+    };
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_validate_path_pattern() {
+        let patterns = [];
+        validate_path_pattern(&patterns, "/bin/ls").unwrap_err();
+
+        let patterns = ["/bin".to_string()];
+        validate_path_pattern(&patterns, "/bin/ls").unwrap_err();
+
+        let patterns = ["/bin/*/ls".to_string()];
+        validate_path_pattern(&patterns, "/bin/ls").unwrap_err();
+
+        let patterns = ["/bin/*".to_string()];
+        validate_path_pattern(&patterns, "/bin/ls").unwrap();
+
+        let patterns = ["/*".to_string()];
+        validate_path_pattern(&patterns, "/bin/ls").unwrap();
+
+        let patterns = ["/usr/share".to_string(), "/bin/*".to_string()];
+        validate_path_pattern(&patterns, "/bin/ls").unwrap();
+    }
+}
diff --git a/src/libs/kata-types/src/config/runtime.rs b/src/libs/kata-types/src/config/runtime.rs
new file mode 100644
index 0000000000..ce8e9efa59
--- /dev/null
+++ b/src/libs/kata-types/src/config/runtime.rs
@@ -0,0 +1,285 @@
+// Copyright (c) 2021 Alibaba Cloud
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+use std::io::Result;
+use std::path::Path;
+
+use super::default;
+use crate::config::{ConfigOps, TomlConfig};
+use crate::{eother, resolve_path, validate_path};
+
+/// Kata runtime configuration information.
+#[derive(Debug, Default, Deserialize, Serialize)] +pub struct Runtime { + /// Runtime name: Plan to support virt-container, linux-container, wasm-container + #[serde(default)] + pub name: String, + + /// Hypervisor name: Plan to support dragonball, qemu + #[serde(default)] + pub hypervisor_name: String, + + /// Agent name + #[serde(default)] + pub agent_name: String, + + /// If enabled, the runtime will log additional debug messages to the system log. + #[serde(default, rename = "enable_debug")] + pub debug: bool, + + /// Enabled experimental feature list, format: ["a", "b"]. + /// + /// Experimental features are features not stable enough for production, they may break + /// compatibility, and are prepared for a big version bump. + #[serde(default)] + pub experimental: Vec, + + /// Determines how the VM should be connected to the container network interface. + /// + /// Options: + /// - macvtap: used when the Container network interface can be bridged using macvtap. + /// - none: used when customize network. Only creates a tap device. No veth pair. + /// - tcfilter: uses tc filter rules to redirect traffic from the network interface provided + /// by plugin to a tap interface connected to the VM. + #[serde(default)] + pub internetworking_model: String, + + /// If enabled, the runtime won't create a network namespace for shim and hypervisor processes. + /// + /// This option may have some potential impacts to your host. It should only be used when you + /// know what you're doing. + /// + /// `disable_new_netns` conflicts with `internetworking_model=tcfilter` and + /// `internetworking_model=macvtap`. It works only with `internetworking_model=none`. + /// The tap device will be in the host network namespace and can connect to a bridge (like OVS) + /// directly. 
+ /// + /// If you are using docker, `disable_new_netns` only works with `docker run --net=none` + #[serde(default)] + pub disable_new_netns: bool, + + /// If specified, sandbox_bind_mounts identifies host paths to be mounted into the sandboxes + /// shared path. + /// + /// This is only valid if filesystem sharing is utilized. The provided path(s) will be bind + /// mounted into the shared fs directory. If defaults are utilized, these mounts should be + /// available in the guest at `/run/kata-containers/shared/containers/passthrough/sandbox-mounts`. + /// These will not be exposed to the container workloads, and are only provided for potential + /// guest services. + #[serde(default)] + pub sandbox_bind_mounts: Vec, + + /// If enabled, the runtime will add all the kata processes inside one dedicated cgroup. + /// + /// The container cgroups in the host are not created, just one single cgroup per sandbox. + /// The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox. + /// The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation. + /// The sandbox cgroup is constrained if there is no container type annotation. + /// See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType + #[serde(default)] + pub sandbox_cgroup_only: bool, + + /// If enabled, the runtime will create opentracing.io traces and spans. + /// See https://www.jaegertracing.io/docs/getting-started. + #[serde(default)] + pub enable_tracing: bool, + /// The full url to the Jaeger HTTP Thrift collector. + #[serde(default)] + pub jaeger_endpoint: String, + /// The username to be used if basic auth is required for Jaeger. + #[serde(default)] + pub jaeger_user: String, + /// The password to be used if basic auth is required for Jaeger. + #[serde(default)] + pub jaeger_password: String, + + /// If enabled, user can run pprof tools with shim v2 process through kata-monitor. 
+ #[serde(default)] + pub enable_pprof: bool, + + /// Determines whether container seccomp profiles are passed to the virtual machine and + /// applied by the kata agent. If set to true, seccomp is not applied within the guest. + #[serde(default)] + pub disable_guest_seccomp: bool, + + /// Determines how VFIO devices should be be presented to the container. + /// + /// Options: + /// - vfio: Matches behaviour of OCI runtimes (e.g. runc) as much as possible. VFIO devices + /// will appear in the container as VFIO character devices under /dev/vfio. The exact names + /// may differ from the host (they need to match the VM's IOMMU group numbers rather than + /// the host's) + /// - guest-kernel: This is a Kata-specific behaviour that's useful in certain cases. + /// The VFIO device is managed by whatever driver in the VM kernel claims it. This means + /// it will appear as one or more device nodes or network interfaces depending on the nature + /// of the device. Using this mode requires specially built workloads that know how to locate + /// the relevant device interfaces within the VM. + #[serde(default)] + pub vfio_mode: String, + + /// Vendor customized runtime configuration. 
+ #[serde(default, flatten)] + pub vendor: RuntimeVendor, +} + +impl ConfigOps for Runtime { + fn adjust_config(conf: &mut TomlConfig) -> Result<()> { + RuntimeVendor::adjust_config(conf)?; + if conf.runtime.internetworking_model.is_empty() { + conf.runtime.internetworking_model = default::DEFAULT_INTERNETWORKING_MODEL.to_owned(); + } + + for bind in conf.runtime.sandbox_bind_mounts.iter_mut() { + resolve_path!(*bind, "sandbox bind mount `{}` is invalid: {}")?; + } + + Ok(()) + } + + fn validate(conf: &TomlConfig) -> Result<()> { + RuntimeVendor::validate(conf)?; + + let net_model = &conf.runtime.internetworking_model; + if !net_model.is_empty() + && net_model != "macvtap" + && net_model != "none" + && net_model != "tcfilter" + { + return Err(eother!( + "Invalid internetworking_model `{}` in configuration file", + net_model + )); + } + + let vfio_mode = &conf.runtime.vfio_mode; + if !vfio_mode.is_empty() && vfio_mode != "vfio" && vfio_mode != "guest-kernel" { + return Err(eother!( + "Invalid vfio_mode `{}` in configuration file", + vfio_mode + )); + } + + for bind in conf.runtime.sandbox_bind_mounts.iter() { + validate_path!(*bind, "sandbox bind mount `{}` is invalid: {}")?; + } + + Ok(()) + } +} + +impl Runtime { + /// Check whether experiment `feature` is enabled or not. + pub fn is_experiment_enabled(&self, feature: &str) -> bool { + self.experimental.contains(&feature.to_string()) + } +} + +#[cfg(not(feature = "enable-vendor"))] +mod vendor { + use super::*; + + /// Vendor customization runtime configuration. 
+ #[derive(Debug, Default, Deserialize, Serialize)] + pub struct RuntimeVendor {} + + impl ConfigOps for RuntimeVendor {} +} + +#[cfg(feature = "enable-vendor")] +#[path = "runtime_vendor.rs"] +mod vendor; + +pub use vendor::RuntimeVendor; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_invalid_config() { + let content = r#" +[runtime] +enable_debug = 10 +"#; + TomlConfig::load(content).unwrap_err(); + + let content = r#" +[runtime] +enable_debug = true +internetworking_model = "test" +"#; + let config: TomlConfig = TomlConfig::load(content).unwrap(); + config.validate().unwrap_err(); + + let content = r#" +[runtime] +enable_debug = true +internetworking_model = "macvtap,none" +"#; + let config: TomlConfig = TomlConfig::load(content).unwrap(); + config.validate().unwrap_err(); + + let content = r#" +[runtime] +enable_debug = true +vfio_mode = "none" +"#; + let config: TomlConfig = TomlConfig::load(content).unwrap(); + config.validate().unwrap_err(); + + let content = r#" +[runtime] +enable_debug = true +vfio_mode = "vfio,guest-kernel" +"#; + let config: TomlConfig = TomlConfig::load(content).unwrap(); + config.validate().unwrap_err(); + + let content = r#" +[runtime] +enable_debug = true +vfio_mode = "guest_kernel" +"#; + let config: TomlConfig = TomlConfig::load(content).unwrap(); + config.validate().unwrap_err(); + } + + #[test] + fn test_config() { + let content = r#" +[runtime] +name = "virt-container" +enable_debug = true +experimental = ["a", "b"] +internetworking_model = "macvtap" +disable_new_netns = true +sandbox_bind_mounts = [] +sandbox_cgroup_only = true +enable_tracing = true +jaeger_endpoint = "localhost:1234" +jaeger_user = "user" +jaeger_password = "pw" +enable_pprof = true +disable_guest_seccomp = true +vfio_mode = "vfio" +field_should_be_ignored = true +"#; + let config: TomlConfig = TomlConfig::load(content).unwrap(); + config.validate().unwrap(); + assert_eq!(&config.runtime.name, "virt-container"); + 
assert!(config.runtime.debug); + assert_eq!(config.runtime.experimental.len(), 2); + assert_eq!(&config.runtime.experimental[0], "a"); + assert_eq!(&config.runtime.experimental[1], "b"); + assert_eq!(&config.runtime.internetworking_model, "macvtap"); + assert!(config.runtime.disable_new_netns); + assert_eq!(config.runtime.sandbox_bind_mounts.len(), 0); + assert!(config.runtime.sandbox_cgroup_only); + assert!(config.runtime.enable_tracing); + assert!(config.runtime.is_experiment_enabled("a")); + assert!(config.runtime.is_experiment_enabled("b")); + assert!(!config.runtime.is_experiment_enabled("c")); + } +} diff --git a/src/libs/kata-types/src/config/runtime_vendor.rs b/src/libs/kata-types/src/config/runtime_vendor.rs new file mode 100644 index 0000000000..e12a63f399 --- /dev/null +++ b/src/libs/kata-types/src/config/runtime_vendor.rs @@ -0,0 +1,83 @@ +// Copyright (c) 2021 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! A sample for vendor to customize the runtime implementation. + +use super::*; +use slog::Level; +/// Vendor customization runtime configuration. +#[derive(Debug, Default, Deserialize, Serialize)] +pub struct RuntimeVendor { + /// Log level + #[serde(default)] + pub log_level: u32, + + /// Prefix for log messages + #[serde(default)] + pub log_prefix: String, +} + +impl ConfigOps for RuntimeVendor { + fn adjust_config(conf: &mut TomlConfig) -> Result<()> { + if conf.runtime.vendor.log_level > Level::Debug as u32 { + conf.runtime.debug = true; + } + + Ok(()) + } + + /// Validate the configuration information. 
+ fn validate(conf: &TomlConfig) -> Result<()> { + if conf.runtime.vendor.log_level > 10 { + return Err(eother!( + "log level {} in configuration file is invalid", + conf.runtime.vendor.log_level + )); + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_invalid_vendor_config() { + let content = r#" +[runtime] +debug = false +log_level = 20 +log_prefix = "test" +"#; + let config: TomlConfig = TomlConfig::load(content).unwrap(); + config.validate().unwrap_err(); + + let content = r#" +[runtime] +debug = false +log_level = "test" +log_prefix = "test" +"#; + TomlConfig::load(content).unwrap_err(); + } + + #[test] + fn test_vendor_config() { + let content = r#" +[runtime] +debug = false +log_level = 10 +log_prefix = "test" +log_fmt = "nouse" +"#; + let config: TomlConfig = TomlConfig::load(content).unwrap(); + config.validate().unwrap(); + assert!(config.runtime.debug); + assert_eq!(config.runtime.vendor.log_level, 10); + assert_eq!(&config.runtime.vendor.log_prefix, "test"); + } +} diff --git a/src/libs/kata-types/src/container.rs b/src/libs/kata-types/src/container.rs new file mode 100644 index 0000000000..3a64a4dd7f --- /dev/null +++ b/src/libs/kata-types/src/container.rs @@ -0,0 +1,209 @@ +// Copyright (c) 2019-2021 Alibaba Cloud +// Copyright (c) 2019-2021 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::fmt::{Display, Formatter}; +use std::str::FromStr; + +// a container running within a pod +pub(crate) const POD_CONTAINER: &str = "pod_container"; +// cri containerd/crio/docker: a container running within a pod +pub(crate) const CONTAINER: &str = "container"; + +// a pod sandbox container +pub(crate) const POD_SANDBOX: &str = "pod_sandbox"; +// cri containerd/crio: a pod sandbox container +pub(crate) const SANDBOX: &str = "sandbox"; +// docker: a sandbox sandbox container +pub(crate) const PODSANDBOX: &str = "podsandbox"; + +const STATE_READY: &str = "ready"; +const STATE_RUNNING: &str = "running"; +const 
STATE_STOPPED: &str = "stopped";
+const STATE_PAUSED: &str = "paused";
+
+/// Error codes for container related operations.
+#[derive(thiserror::Error, Debug)]
+pub enum Error {
+    /// Invalid container type
+    #[error("Invalid container type {0}")]
+    InvalidContainerType(String),
+    /// Invalid container state
+    #[error("Invalid sandbox state {0}")]
+    InvalidState(String),
+    /// Invalid container state transition
+    #[error("Can not transit from {0} to {1}")]
+    InvalidStateTransition(State, State),
+}
+
+/// Types of pod containers: container or sandbox.
+#[derive(PartialEq, Debug, Clone)]
+pub enum ContainerType {
+    /// A pod container.
+    PodContainer,
+    /// A pod sandbox.
+    PodSandbox,
+}
+
+impl ContainerType {
+    /// Check whether it's a pod container.
+    pub fn is_pod_container(&self) -> bool {
+        matches!(self, ContainerType::PodContainer)
+    }
+
+    /// Check whether it's a pod sandbox.
+    pub fn is_pod_sandbox(&self) -> bool {
+        matches!(self, ContainerType::PodSandbox)
+    }
+}
+
+impl Display for ContainerType {
+    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
+        match self {
+            ContainerType::PodContainer => write!(f, "{}", POD_CONTAINER),
+            ContainerType::PodSandbox => write!(f, "{}", POD_SANDBOX),
+        }
+    }
+}
+
+impl FromStr for ContainerType {
+    type Err = Error;
+
+    fn from_str(value: &str) -> Result<Self, Self::Err> {
+        match value {
+            POD_CONTAINER | CONTAINER => Ok(ContainerType::PodContainer),
+            POD_SANDBOX | PODSANDBOX | SANDBOX => Ok(ContainerType::PodSandbox),
+            _ => Err(Error::InvalidContainerType(value.to_owned())),
+        }
+    }
+}
+
+/// Process states.
+#[derive(Clone, Copy, PartialEq, Debug)]
+pub enum State {
+    /// The container is ready to run.
+    Ready,
+    /// The container executed the user-specified program but has not exited
+    Running,
+    /// The container has exited
+    Stopped,
+    /// The container has been paused.
+    Paused,
+}
+
+impl Display for State {
+    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
+        match self {
+            State::Ready => write!(f, "{}", STATE_READY),
+            State::Running => write!(f, "{}", STATE_RUNNING),
+            State::Stopped => write!(f, "{}", STATE_STOPPED),
+            State::Paused => write!(f, "{}", STATE_PAUSED),
+        }
+    }
+}
+
+impl FromStr for State {
+    type Err = Error;
+
+    fn from_str(value: &str) -> Result<Self, Self::Err> {
+        match value {
+            STATE_READY => Ok(State::Ready),
+            STATE_RUNNING => Ok(State::Running),
+            STATE_STOPPED => Ok(State::Stopped),
+            STATE_PAUSED => Ok(State::Paused),
+            _ => Err(Error::InvalidState(value.to_owned())),
+        }
+    }
+}
+
+impl State {
+    /// Check whether it's a valid state transition from self to the `new_state`.
+    ///
+    /// Allowed transitions:
+    /// - Ready -> Running | Stopped
+    /// - Running -> Paused | Stopped
+    /// - Paused -> Running | Stopped (Paused -> Paused is also kept for compatibility)
+    /// - Stopped -> Running
+    pub fn check_transition(self, new_state: State) -> Result<(), Error> {
+        match self {
+            State::Ready if new_state == State::Running || new_state == State::Stopped => Ok(()),
+            // A running container may be paused (pause/freeze) or stopped. Without
+            // Running -> Paused the Paused state would be unreachable.
+            State::Running if new_state == State::Paused || new_state == State::Stopped => Ok(()),
+            State::Stopped if new_state == State::Running => Ok(()),
+            // A paused container may be resumed or stopped. Paused -> Paused was the only
+            // transition previously accepted; keep it for backward compatibility.
+            State::Paused
+                if new_state == State::Running
+                    || new_state == State::Stopped
+                    || new_state == State::Paused =>
+            {
+                Ok(())
+            }
+            _ => Err(Error::InvalidStateTransition(self, new_state)),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_container_type() {
+        assert!(ContainerType::PodContainer.is_pod_container());
+        assert!(!ContainerType::PodContainer.is_pod_sandbox());
+
+        assert!(ContainerType::PodSandbox.is_pod_sandbox());
+        assert!(!ContainerType::PodSandbox.is_pod_container());
+    }
+
+    #[test]
+    fn test_container_type_display() {
+        assert_eq!(format!("{}", ContainerType::PodContainer), POD_CONTAINER);
+        assert_eq!(format!("{}", ContainerType::PodSandbox), POD_SANDBOX);
+    }
+
+    #[test]
+    fn test_container_type_from_str() {
+        assert_eq!(
+            ContainerType::from_str("pod_container").unwrap(),
+            ContainerType::PodContainer
+        );
+        assert_eq!(
+            ContainerType::from_str("container").unwrap(),
+            ContainerType::PodContainer
+        );
+        assert_eq!(
+            ContainerType::from_str("pod_sandbox").unwrap(),
+            
ContainerType::PodSandbox + ); + assert_eq!( + ContainerType::from_str("podsandbox").unwrap(), + ContainerType::PodSandbox + ); + assert_eq!( + ContainerType::from_str("sandbox").unwrap(), + ContainerType::PodSandbox + ); + ContainerType::from_str("test").unwrap_err(); + } + + #[test] + fn test_valid() { + let mut state = State::from_str("invalid_state"); + assert!(state.is_err()); + + state = State::from_str("ready"); + assert!(state.is_ok()); + + state = State::from_str("running"); + assert!(state.is_ok()); + + state = State::from_str("stopped"); + assert!(state.is_ok()); + } + + #[test] + fn test_valid_transition() { + use State::*; + + assert!(Ready.check_transition(Ready).is_err()); + assert!(Ready.check_transition(Running).is_ok()); + assert!(Ready.check_transition(Stopped).is_ok()); + + assert!(Running.check_transition(Ready).is_err()); + assert!(Running.check_transition(Running).is_err()); + assert!(Running.check_transition(Stopped).is_ok()); + + assert!(Stopped.check_transition(Ready).is_err()); + assert!(Stopped.check_transition(Running).is_ok()); + assert!(Stopped.check_transition(Stopped).is_err()); + } +} diff --git a/src/libs/kata-types/src/cpu.rs b/src/libs/kata-types/src/cpu.rs new file mode 100644 index 0000000000..0020de097b --- /dev/null +++ b/src/libs/kata-types/src/cpu.rs @@ -0,0 +1,255 @@ +// Copyright (c) 2022 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::convert::TryFrom; +use std::str::FromStr; + +use oci::LinuxCpu; + +/// A set of CPU ids. +pub type CpuSet = crate::utils::u32_set::U32Set; + +/// A set of NUMA memory nodes. +pub type NumaNodeSet = crate::utils::u32_set::U32Set; + +/// Error code for CPU related operations. +#[derive(thiserror::Error, Debug)] +pub enum Error { + /// Invalid CPU list. + #[error("Invalid CPU list: {0}")] + InvalidCpuSet(crate::Error), + /// Invalid NUMA memory node list. 
+ #[error("Invalid NUMA memory node list: {0}")] + InvalidNodeSet(crate::Error), +} + +/// Assigned CPU resources for a Linux container. +#[derive(Default, Debug)] +pub struct LinuxContainerCpuResources { + shares: u64, + period: u64, + quota: i64, + cpuset: CpuSet, + nodeset: NumaNodeSet, + calculated_vcpu_time_ms: Option, +} + +impl LinuxContainerCpuResources { + /// Get the CPU shares. + pub fn shares(&self) -> u64 { + self.shares + } + + /// Get the CPU schedule period. + pub fn period(&self) -> u64 { + self.period + } + + /// Get the CPU schedule quota. + pub fn quota(&self) -> i64 { + self.quota + } + + /// Get the CPU set. + pub fn cpuset(&self) -> &CpuSet { + &self.cpuset + } + + /// Get the NUMA memory node set. + pub fn nodeset(&self) -> &NumaNodeSet { + &self.nodeset + } + + /// Get number of vCPUs to fulfill the CPU resource request, `None` means unconstrained. + pub fn get_vcpus(&self) -> Option { + self.calculated_vcpu_time_ms + .map(|v| v.saturating_add(999) / 1000) + } +} + +impl TryFrom<&LinuxCpu> for LinuxContainerCpuResources { + type Error = Error; + + // Unhandled fields: realtime_runtime, realtime_period, mems + fn try_from(value: &LinuxCpu) -> Result { + let period = value.period.unwrap_or(0); + let quota = value.quota.unwrap_or(-1); + let cpuset = CpuSet::from_str(&value.cpus).map_err(Error::InvalidCpuSet)?; + let nodeset = NumaNodeSet::from_str(&value.mems).map_err(Error::InvalidNodeSet)?; + + // If quota is -1, it means the CPU resource request is unconstrained. In that case, + // we don't currently assign additional CPUs. + let milli_sec = if quota >= 0 && period != 0 { + Some((quota as u64).saturating_mul(1000) / period) + } else { + None + }; + + Ok(LinuxContainerCpuResources { + shares: value.shares.unwrap_or(0), + period, + quota, + cpuset, + nodeset, + calculated_vcpu_time_ms: milli_sec, + }) + } +} + +/// Assigned CPU resources for a Linux sandbox/pod. 
+#[derive(Default, Debug)] +pub struct LinuxSandboxCpuResources { + shares: u64, + calculated_vcpu_time_ms: u64, + cpuset: CpuSet, + nodeset: NumaNodeSet, +} + +impl LinuxSandboxCpuResources { + /// Create a new instance of `LinuxSandboxCpuResources`. + pub fn new(shares: u64) -> Self { + Self { + shares, + ..Default::default() + } + } + + /// Get the CPU shares. + pub fn shares(&self) -> u64 { + self.shares + } + + /// Get assigned vCPU time in ms. + pub fn calculated_vcpu_time_ms(&self) -> u64 { + self.calculated_vcpu_time_ms + } + + /// Get the CPU set. + pub fn cpuset(&self) -> &CpuSet { + &self.cpuset + } + + /// Get the NUMA memory node set. + pub fn nodeset(&self) -> &NumaNodeSet { + &self.nodeset + } + + /// Get number of vCPUs to fulfill the CPU resource request. + pub fn get_vcpus(&self) -> u64 { + if self.calculated_vcpu_time_ms == 0 && !self.cpuset.is_empty() { + self.cpuset.len() as u64 + } else { + self.calculated_vcpu_time_ms.saturating_add(999) / 1000 + } + } + + /// Merge resources assigned to a container into the sandbox/pod resources. 
+ pub fn merge(&mut self, container_resource: &LinuxContainerCpuResources) -> &mut Self { + if let Some(v) = container_resource.calculated_vcpu_time_ms.as_ref() { + self.calculated_vcpu_time_ms += v; + } + self.cpuset.extend(&container_resource.cpuset); + self.nodeset.extend(&container_resource.nodeset); + self + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_linux_container_cpu_resources() { + let resources = LinuxContainerCpuResources::default(); + + assert_eq!(resources.shares(), 0); + assert_eq!(resources.calculated_vcpu_time_ms, None); + assert!(resources.cpuset.is_empty()); + assert!(resources.nodeset.is_empty()); + assert!(resources.calculated_vcpu_time_ms.is_none()); + + let oci = oci::LinuxCpu { + shares: Some(2048), + quota: Some(1001), + period: Some(100), + realtime_runtime: None, + realtime_period: None, + cpus: "1,2,3".to_string(), + mems: "1".to_string(), + }; + let resources = LinuxContainerCpuResources::try_from(&oci).unwrap(); + assert_eq!(resources.shares(), 2048); + assert_eq!(resources.period(), 100); + assert_eq!(resources.quota(), 1001); + assert_eq!(resources.calculated_vcpu_time_ms, Some(10010)); + assert_eq!(resources.get_vcpus().unwrap(), 11); + assert_eq!(resources.cpuset().len(), 3); + assert_eq!(resources.nodeset().len(), 1); + + let oci = oci::LinuxCpu { + shares: Some(2048), + quota: None, + period: None, + realtime_runtime: None, + realtime_period: None, + cpus: "1".to_string(), + mems: "1-2".to_string(), + }; + let resources = LinuxContainerCpuResources::try_from(&oci).unwrap(); + assert_eq!(resources.shares(), 2048); + assert_eq!(resources.period(), 0); + assert_eq!(resources.quota(), -1); + assert_eq!(resources.calculated_vcpu_time_ms, None); + assert!(resources.get_vcpus().is_none()); + assert_eq!(resources.cpuset().len(), 1); + assert_eq!(resources.nodeset().len(), 2); + } + + #[test] + fn test_linux_sandbox_cpu_resources() { + let mut sandbox = LinuxSandboxCpuResources::new(1024); + + 
assert_eq!(sandbox.shares(), 1024); + assert_eq!(sandbox.get_vcpus(), 0); + assert_eq!(sandbox.calculated_vcpu_time_ms(), 0); + assert!(sandbox.cpuset().is_empty()); + assert!(sandbox.nodeset().is_empty()); + + let oci = oci::LinuxCpu { + shares: Some(2048), + quota: Some(1001), + period: Some(100), + realtime_runtime: None, + realtime_period: None, + cpus: "1,2,3".to_string(), + mems: "1".to_string(), + }; + let resources = LinuxContainerCpuResources::try_from(&oci).unwrap(); + sandbox.merge(&resources); + assert_eq!(sandbox.shares(), 1024); + assert_eq!(sandbox.get_vcpus(), 11); + assert_eq!(sandbox.calculated_vcpu_time_ms(), 10010); + assert_eq!(sandbox.cpuset().len(), 3); + assert_eq!(sandbox.nodeset().len(), 1); + + let oci = oci::LinuxCpu { + shares: Some(2048), + quota: None, + period: None, + realtime_runtime: None, + realtime_period: None, + cpus: "1,4".to_string(), + mems: "1-2".to_string(), + }; + let resources = LinuxContainerCpuResources::try_from(&oci).unwrap(); + sandbox.merge(&resources); + + assert_eq!(sandbox.shares(), 1024); + assert_eq!(sandbox.get_vcpus(), 11); + assert_eq!(sandbox.calculated_vcpu_time_ms(), 10010); + assert_eq!(sandbox.cpuset().len(), 4); + assert_eq!(sandbox.nodeset().len(), 2); + } +} diff --git a/src/libs/kata-types/src/k8s.rs b/src/libs/kata-types/src/k8s.rs new file mode 100644 index 0000000000..7e53601b88 --- /dev/null +++ b/src/libs/kata-types/src/k8s.rs @@ -0,0 +1,213 @@ +// Copyright (c) 2019-2021 Alibaba Cloud +// Copyright (c) 2019-2021 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::path::Path; + +use crate::annotations; +use crate::container::ContainerType; +use std::str::FromStr; + +// K8S_EMPTY_DIR is the k8s specific path for `empty-dir` volumes +const K8S_EMPTY_DIR: &str = "kubernetes.io~empty-dir"; + +/// Check whether the path is a K8S empty directory. +/// +/// For a K8S EmptyDir, Kubernetes mounts +/// "/var/lib/kubelet/pods//volumes/kubernetes.io~empty-dir/" +/// to "/". 
+pub fn is_empty_dir<P: AsRef<Path>>(path: P) -> bool {
+    let path = path.as_ref();
+
+    if let Some(parent) = path.parent() {
+        if let Some(pname) = parent.file_name() {
+            if pname == K8S_EMPTY_DIR && parent.parent().is_some() {
+                return true;
+            }
+        }
+    }
+
+    false
+}
+
+/// Get K8S container type from OCI annotations.
+pub fn container_type(spec: &oci::Spec) -> ContainerType {
+    // PodSandbox: "sandbox" (Containerd & CRI-O), "podsandbox" (dockershim)
+    // PodContainer: "container" (Containerd & CRI-O & dockershim)
+    for k in [
+        annotations::crio::CONTAINER_TYPE_LABEL_KEY,
+        annotations::cri_containerd::CONTAINER_TYPE_LABEL_KEY,
+        annotations::dockershim::CONTAINER_TYPE_LABEL_KEY,
+    ]
+    .iter()
+    {
+        if let Some(v) = spec.annotations.get(k.to_owned()) {
+            if let Ok(t) = ContainerType::from_str(v) {
+                return t;
+            }
+        }
+    }
+
+    ContainerType::PodSandbox
+}
+
+/// Determine the k8s sandbox ID from OCI annotations.
+///
+/// This function is expected to be called only when the container type is "PodContainer".
+pub fn container_type_with_id(spec: &oci::Spec) -> (ContainerType, Option<String>) {
+    let container_type = container_type(spec);
+    let mut sid = None;
+    if container_type == ContainerType::PodContainer {
+        for k in [
+            annotations::crio::SANDBOX_ID_LABEL_KEY,
+            annotations::cri_containerd::SANDBOX_ID_LABEL_KEY,
+            annotations::dockershim::SANDBOX_ID_LABEL_KEY,
+        ]
+        .iter()
+        {
+            if let Some(id) = spec.annotations.get(k.to_owned()) {
+                sid = Some(id.to_string());
+                break;
+            }
+        }
+    }
+
+    (container_type, sid)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::{annotations, container};
+
+    #[test]
+    fn test_is_empty_dir() {
+        let empty_dir = "/volumes/kubernetes.io~empty-dir/shm";
+        assert!(is_empty_dir(empty_dir));
+
+        let empty_dir = "/volumes/kubernetes.io~empty-dir//shm";
+        assert!(is_empty_dir(empty_dir));
+
+        let empty_dir = "/volumes/kubernetes.io~empty-dir-test/shm";
+        assert!(!is_empty_dir(empty_dir));
+
+        let empty_dir = "/volumes/kubernetes.io~empty-dir";
+        assert!(!is_empty_dir(empty_dir));
+
+        let empty_dir = "kubernetes.io~empty-dir";
+        assert!(!is_empty_dir(empty_dir));
+
+        let empty_dir = "/kubernetes.io~empty-dir/shm";
+        assert!(is_empty_dir(empty_dir));
+    }
+
+    #[test]
+    fn test_container_type() {
+        let sid = "sid".to_string();
+        let mut spec = oci::Spec::default();
+
+        // default
+        assert_eq!(
+            container_type_with_id(&spec),
+            (ContainerType::PodSandbox, None)
+        );
+
+        // crio sandbox
+        spec.annotations = [(
+            annotations::crio::CONTAINER_TYPE_LABEL_KEY.to_string(),
+            container::SANDBOX.to_string(),
+        )]
+        .iter()
+        .cloned()
+        .collect();
+        assert_eq!(
+            container_type_with_id(&spec),
+            (ContainerType::PodSandbox, None)
+        );
+
+        // cri containerd sandbox
+        spec.annotations = [(
+            annotations::crio::CONTAINER_TYPE_LABEL_KEY.to_string(),
+            container::POD_SANDBOX.to_string(),
+        )]
+        .iter()
+        .cloned()
+        .collect();
+        assert_eq!(
+            container_type_with_id(&spec),
+            (ContainerType::PodSandbox, None)
+        );
+
+        // docker shim sandbox
+        
spec.annotations = [( + annotations::crio::CONTAINER_TYPE_LABEL_KEY.to_string(), + container::PODSANDBOX.to_string(), + )] + .iter() + .cloned() + .collect(); + assert_eq!( + container_type_with_id(&spec), + (ContainerType::PodSandbox, None) + ); + + // crio container + spec.annotations = [ + ( + annotations::crio::CONTAINER_TYPE_LABEL_KEY.to_string(), + container::CONTAINER.to_string(), + ), + ( + annotations::crio::SANDBOX_ID_LABEL_KEY.to_string(), + sid.clone(), + ), + ] + .iter() + .cloned() + .collect(); + assert_eq!( + container_type_with_id(&spec), + (ContainerType::PodContainer, Some(sid.clone())) + ); + + // cri containerd container + spec.annotations = [ + ( + annotations::cri_containerd::CONTAINER_TYPE_LABEL_KEY.to_string(), + container::POD_CONTAINER.to_string(), + ), + ( + annotations::cri_containerd::SANDBOX_ID_LABEL_KEY.to_string(), + sid.clone(), + ), + ] + .iter() + .cloned() + .collect(); + assert_eq!( + container_type_with_id(&spec), + (ContainerType::PodContainer, Some(sid.clone())) + ); + + // docker shim container + spec.annotations = [ + ( + annotations::dockershim::CONTAINER_TYPE_LABEL_KEY.to_string(), + container::CONTAINER.to_string(), + ), + ( + annotations::dockershim::SANDBOX_ID_LABEL_KEY.to_string(), + sid.clone(), + ), + ] + .iter() + .cloned() + .collect(); + assert_eq!( + container_type_with_id(&spec), + (ContainerType::PodContainer, Some(sid)) + ); + } +} diff --git a/src/libs/kata-types/src/lib.rs b/src/libs/kata-types/src/lib.rs new file mode 100644 index 0000000000..ce43d29607 --- /dev/null +++ b/src/libs/kata-types/src/lib.rs @@ -0,0 +1,93 @@ +// Copyright (c) 2021 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// + +//! Constants and Data Types shared by Kata Containers components. + +#![deny(missing_docs)] +#[macro_use] +extern crate slog; +#[macro_use] +extern crate serde; + +/// Constants and data types related to annotations. +pub mod annotations; + +/// Kata configuration information from configuration file. 
+pub mod config; + +/// Constants and data types related to container. +pub mod container; + +/// Constants and data types related to CPU. +pub mod cpu; + +/// Constants and data types related to Kubernetes/kubelet. +pub mod k8s; + +/// Constants and data types related to mount point. +pub mod mount; + +pub(crate) mod utils; + +/// Common error codes. +#[derive(thiserror::Error, Debug)] +pub enum Error { + /// Invalid configuration list. + #[error("invalid list {0}")] + InvalidList(String), +} + +/// Convenience macro to obtain the scoped logger +#[macro_export] +macro_rules! sl { + () => { + slog_scope::logger() + }; +} + +/// Helper to create std::io::Error(std::io::ErrorKind::Other) +#[macro_export] +macro_rules! eother { + () => (std::io::Error::new(std::io::ErrorKind::Other, "")); + ($fmt:expr) => ({ + std::io::Error::new(std::io::ErrorKind::Other, format!($fmt)) + }); + ($fmt:expr, $($arg:tt)*) => ({ + std::io::Error::new(std::io::ErrorKind::Other, format!($fmt, $($arg)*)) + }); +} + +/// Resolve a path to its final value. +#[macro_export] +macro_rules! resolve_path { + ($field:expr, $fmt:expr) => {{ + if !$field.is_empty() { + match Path::new(&$field).canonicalize() { + Err(e) => Err(eother!($fmt, &$field, e)), + Ok(path) => { + $field = path.to_string_lossy().to_string(); + Ok(()) + } + } + } else { + Ok(()) + } + }}; +} + +/// Validate a path. +#[macro_export] +macro_rules! 
validate_path { + ($field:expr, $fmt:expr) => {{ + if !$field.is_empty() { + Path::new(&$field) + .canonicalize() + .map_err(|e| eother!($fmt, &$field, e)) + .map(|_| ()) + } else { + Ok(()) + } + }}; +} diff --git a/src/libs/kata-types/src/mount.rs b/src/libs/kata-types/src/mount.rs new file mode 100644 index 0000000000..2ccc0feed2 --- /dev/null +++ b/src/libs/kata-types/src/mount.rs @@ -0,0 +1,88 @@ +// Copyright (c) 2019-2021 Alibaba Cloud +// Copyright (c) 2019-2021 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::path::PathBuf; + +/// Prefix to mark a volume as Kata special. +pub const KATA_VOLUME_TYPE_PREFIX: &str = "kata:"; + +/// The Mount should be ignored by the host and handled by the guest. +pub const KATA_GUEST_MOUNT_PREFIX: &str = "kata:guest-mount:"; + +/// KATA_EPHEMERAL_DEV_TYPE creates a tmpfs backed volume for sharing files between containers. +pub const KATA_EPHEMERAL_VOLUME_TYPE: &str = "kata:ephemeral"; + +/// KATA_HOST_DIR_TYPE use for host empty dir +pub const KATA_HOST_DIR_VOLUME_TYPE: &str = "kata:hostdir"; + +/// Information about a mount. +#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)] +pub struct Mount { + /// A device name, but can also be a file or directory name for bind mounts or a dummy. + /// Path values for bind mounts are either absolute or relative to the bundle. A mount is a + /// bind mount if it has either bind or rbind in the options. + pub source: String, + /// Destination of mount point: path inside container. This value MUST be an absolute path. + pub destination: PathBuf, + /// The type of filesystem for the mountpoint. + pub fs_type: String, + /// Mount options for the mountpoint. 
+    pub options: Vec<String>,
+    /// Optional device id for the block device when:
+    /// - the source is a block device or a mountpoint for a block device
+    /// - block device direct assignment is enabled
+    pub device_id: Option<String>,
+    /// Intermediate path to mount the source on host side and then passthrough to vm by shared fs.
+    pub host_shared_fs_path: Option<PathBuf>,
+    /// Whether to mount the mountpoint in readonly mode
+    pub read_only: bool,
+}
+
+impl Mount {
+    /// Get size of mount options.
+    pub fn option_size(&self) -> usize {
+        self.options.iter().map(|v| v.len() + 1).sum()
+    }
+}
+
+/// Check whether a mount type is a marker for Kata specific volume.
+pub fn is_kata_special_volume(ty: &str) -> bool {
+    ty.len() > KATA_VOLUME_TYPE_PREFIX.len() && ty.starts_with(KATA_VOLUME_TYPE_PREFIX)
+}
+
+/// Check whether a mount type is a marker for Kata guest mount volume.
+pub fn is_kata_guest_mount_volume(ty: &str) -> bool {
+    ty.len() > KATA_GUEST_MOUNT_PREFIX.len() && ty.starts_with(KATA_GUEST_MOUNT_PREFIX)
+}
+
+/// Check whether a mount type is a marker for Kata ephemeral volume.
+pub fn is_kata_ephemeral_volume(ty: &str) -> bool {
+    ty == KATA_EPHEMERAL_VOLUME_TYPE
+}
+
+/// Check whether a mount type is a marker for Kata hostdir volume.
+pub fn is_kata_host_dir_volume(ty: &str) -> bool {
+    ty == KATA_HOST_DIR_VOLUME_TYPE
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_is_kata_special_volume() {
+        assert!(is_kata_special_volume("kata:guest-mount:nfs"));
+        assert!(!is_kata_special_volume("kata:"));
+    }
+
+    #[test]
+    fn test_is_kata_guest_mount_volume() {
+        assert!(is_kata_guest_mount_volume("kata:guest-mount:nfs"));
+        assert!(!is_kata_guest_mount_volume("kata:guest-mount"));
+        assert!(!is_kata_guest_mount_volume("kata:guest-moun"));
+        assert!(!is_kata_guest_mount_volume("Kata:guest-mount:nfs"));
+    }
+}
diff --git a/src/libs/kata-types/src/utils/mod.rs b/src/libs/kata-types/src/utils/mod.rs
new file mode 100644
index 0000000000..abcb4c2277
--- /dev/null
+++ b/src/libs/kata-types/src/utils/mod.rs
@@ -0,0 +1,6 @@
+// Copyright (c) 2022 Alibaba Cloud
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+pub(crate) mod u32_set;
diff --git a/src/libs/kata-types/src/utils/u32_set.rs b/src/libs/kata-types/src/utils/u32_set.rs
new file mode 100644
index 0000000000..3742e4d54f
--- /dev/null
+++ b/src/libs/kata-types/src/utils/u32_set.rs
@@ -0,0 +1,163 @@
+// Copyright (c) 2022 Alibaba Cloud
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+use std::ops::Deref;
+use std::slice::Iter;
+use std::str::FromStr;
+
+use crate::Error;
+
+/// A set of unique `u32` IDs.
+///
+/// The `U32Set` may be used to save CPUs parsed from a CPU list file or NUMA nodes parsed from
+/// a NUMA node list file.
+#[derive(Default, Debug)]
+pub struct U32Set(Vec<u32>);
+
+impl U32Set {
+    /// Create a new instance of `U32Set`.
+    pub fn new() -> Self {
+        U32Set(vec![])
+    }
+
+    /// Add the `cpu` to the CPU set.
+    pub fn add(&mut self, cpu: u32) {
+        self.0.push(cpu);
+        self.0.sort_unstable();
+        self.0.dedup();
+    }
+
+    /// Add new CPUs into the set.
+ pub fn extend(&mut self, cpus: &[u32]) { + self.0.extend_from_slice(cpus); + self.0.sort_unstable(); + self.0.dedup(); + } + + /// Returns true if the CPU set contains elements. + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + /// Get number of elements in the CPU set. + pub fn len(&self) -> usize { + self.0.len() + } + + /// Get an iterator over the CPU set. + pub fn iter(&self) -> Iter { + self.0.iter() + } +} + +impl From> for U32Set { + fn from(mut cpus: Vec) -> Self { + cpus.sort_unstable(); + cpus.dedup(); + U32Set(cpus) + } +} + +impl FromStr for U32Set { + type Err = Error; + + fn from_str(cpus_str: &str) -> Result { + if cpus_str.is_empty() { + return Ok(U32Set::new()); + } + + let mut cpus = Vec::new(); + for split_cpu in cpus_str.split(',') { + if !split_cpu.contains('-') { + if !split_cpu.is_empty() { + if let Ok(cpu_id) = split_cpu.parse::() { + cpus.push(cpu_id); + continue; + } + } + } else { + let fields: Vec<&str> = split_cpu.split('-').collect(); + if fields.len() == 2 { + if let Ok(start) = fields[0].parse::() { + if let Ok(end) = fields[1].parse::() { + if start < end { + for cpu in start..=end { + cpus.push(cpu); + } + continue; + } + } + } + } + } + + return Err(Error::InvalidList(cpus_str.to_string())); + } + + Ok(U32Set::from(cpus)) + } +} + +impl Deref for U32Set { + type Target = [u32]; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +/// Test whether two CPU sets are equal. 
+impl PartialEq for U32Set { + fn eq(&self, other: &Self) -> bool { + self.0 == other.0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_is_cpuset_equal() { + let cpuset1 = U32Set::from(vec![1, 2, 3]); + let cpuset2 = U32Set::from(vec![3, 2, 1]); + let cpuset3 = U32Set::from(vec![]); + let cpuset4 = U32Set::from(vec![3, 2, 4]); + let cpuset5 = U32Set::from(vec![1, 2, 3, 3, 2, 1]); + + assert_eq!(cpuset1.len(), 3); + assert!(cpuset3.is_empty()); + assert_eq!(cpuset5.len(), 3); + + assert_eq!(cpuset1, cpuset2); + assert_eq!(cpuset1, cpuset5); + assert_ne!(cpuset1, cpuset3); + assert_ne!(cpuset1, cpuset4); + } + + #[test] + fn test_cpuset_from_str() { + assert!(U32Set::from_str("").unwrap().is_empty()); + + let support_cpus1 = U32Set::from(vec![1, 2, 3]); + assert_eq!(support_cpus1, U32Set::from_str("1,2,3").unwrap()); + assert_eq!(support_cpus1, U32Set::from_str("1-2,3").unwrap()); + + let support_cpus2 = U32Set::from(vec![1, 3, 4, 6, 7, 8]); + assert_eq!(support_cpus2, U32Set::from_str("1,3,4,6,7,8").unwrap()); + assert_eq!(support_cpus2, U32Set::from_str("1,3-4,6-8").unwrap()); + + assert!(U32Set::from_str("1-2-3,3").is_err()); + assert!(U32Set::from_str("1-2,,3").is_err()); + assert!(U32Set::from_str("1-2.5,3").is_err()); + assert!(U32Set::from_str("1-1").is_err()); + assert!(U32Set::from_str("2-1").is_err()); + assert!(U32Set::from_str("0,,1").is_err()); + assert!(U32Set::from_str("-1").is_err()); + assert!(U32Set::from_str("1-").is_err()); + assert!(U32Set::from_str("-1--2").is_err()); + assert!(U32Set::from_str("999999999999999999999999999999999999999999999").is_err()); + } +} diff --git a/src/libs/kata-types/tests/test_config.rs b/src/libs/kata-types/tests/test_config.rs new file mode 100644 index 0000000000..b7d5f953b1 --- /dev/null +++ b/src/libs/kata-types/tests/test_config.rs @@ -0,0 +1,493 @@ +// Copyright (c) 2019-2021 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// +#[cfg(test)] +mod tests { + use 
kata_types::annotations::{ + Annotation, KATA_ANNO_CFG_AGENT_CONTAINER_PIPE_SIZE, KATA_ANNO_CFG_AGENT_TRACE, + KATA_ANNO_CFG_DISABLE_GUEST_SECCOMP, KATA_ANNO_CFG_ENABLE_PPROF, + KATA_ANNO_CFG_EXPERIMENTAL, KATA_ANNO_CFG_HYPERVISOR_BLOCK_DEV_CACHE_NOFLUSH, + KATA_ANNO_CFG_HYPERVISOR_BLOCK_DEV_DRIVER, KATA_ANNO_CFG_HYPERVISOR_CTLPATH, + KATA_ANNO_CFG_HYPERVISOR_DEFAULT_MEMORY, KATA_ANNO_CFG_HYPERVISOR_DEFAULT_VCPUS, + KATA_ANNO_CFG_HYPERVISOR_ENABLE_GUEST_SWAP, KATA_ANNO_CFG_HYPERVISOR_ENABLE_IO_THREADS, + KATA_ANNO_CFG_HYPERVISOR_ENABLE_SWAP, KATA_ANNO_CFG_HYPERVISOR_FILE_BACKED_MEM_ROOT_DIR, + KATA_ANNO_CFG_HYPERVISOR_GUEST_HOOK_PATH, KATA_ANNO_CFG_HYPERVISOR_HUGE_PAGES, + KATA_ANNO_CFG_HYPERVISOR_JAILER_PATH, KATA_ANNO_CFG_HYPERVISOR_KERNEL_PATH, + KATA_ANNO_CFG_HYPERVISOR_MEMORY_PREALLOC, KATA_ANNO_CFG_HYPERVISOR_MEMORY_SLOTS, + KATA_ANNO_CFG_HYPERVISOR_PATH, KATA_ANNO_CFG_HYPERVISOR_VHOSTUSER_STORE_PATH, + KATA_ANNO_CFG_HYPERVISOR_VIRTIO_FS_DAEMON, KATA_ANNO_CFG_HYPERVISOR_VIRTIO_FS_EXTRA_ARGS, + KATA_ANNO_CFG_HYPERVISOR_VIRTIO_MEM, KATA_ANNO_CFG_KERNEL_MODULES, + KATA_ANNO_CFG_RUNTIME_NAME, + }; + use kata_types::config::KataConfig; + use kata_types::config::{QemuConfig, TomlConfig}; + use std::collections::HashMap; + use std::fs; + use std::path::Path; + #[test] + fn test_change_config_annotation() { + let content = include_str!("texture/configuration-anno-0.toml"); + let qemu = QemuConfig::new(); + qemu.register(); + + let config = TomlConfig::load(content).unwrap(); + KataConfig::set_active_config(Some(config), "qemu", "agent0"); + + std::process::Command::new("mkdir") + .arg("./hypervisor_path") + .output() + .expect("failed to execute process"); + std::process::Command::new("mkdir") + .arg("./store_path") + .output() + .expect("failed to execute process"); + std::process::Command::new("mkdir") + .arg("./test_hypervisor_hook_path") + .output() + .expect("failed to execute process"); + std::process::Command::new("mkdir") + .arg("./jvm") + .output() + 
.expect("failed to execute process"); + std::process::Command::new("mkdir") + .arg("./test_file_backend_mem_root") + .output() + .expect("failed to execute process"); + std::process::Command::new("mkdir") + .arg("./test_jailer_path") + .output() + .expect("failed to execute process"); + std::process::Command::new("mkdir") + .arg("./test_kernel_path") + .output() + .expect("failed to execute process"); + std::process::Command::new("mkdir") + .arg("./virtio_fs") + .output() + .expect("failed to execute process"); + let mut anno_hash = HashMap::new(); + anno_hash.insert( + KATA_ANNO_CFG_KERNEL_MODULES.to_string(), + "j465 aaa=1;r33w".to_string(), + ); + anno_hash.insert(KATA_ANNO_CFG_AGENT_TRACE.to_string(), "false".to_string()); + anno_hash.insert( + KATA_ANNO_CFG_AGENT_CONTAINER_PIPE_SIZE.to_string(), + "3".to_string(), + ); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_PATH.to_string(), + "./hypervisor_path".to_string(), + ); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_BLOCK_DEV_DRIVER.to_string(), + "device".to_string(), + ); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_BLOCK_DEV_CACHE_NOFLUSH.to_string(), + "false".to_string(), + ); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_VHOSTUSER_STORE_PATH.to_string(), + "./store_path".to_string(), + ); + anno_hash.insert( + KATA_ANNO_CFG_DISABLE_GUEST_SECCOMP.to_string(), + "true".to_string(), + ); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_GUEST_HOOK_PATH.to_string(), + "./test_hypervisor_hook_path".to_string(), + ); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_MEMORY_PREALLOC.to_string(), + "false".to_string(), + ); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_CTLPATH.to_string(), + "./jvm".to_string(), + ); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_DEFAULT_VCPUS.to_string(), + "12".to_string(), + ); + anno_hash.insert(KATA_ANNO_CFG_ENABLE_PPROF.to_string(), "false".to_string()); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_ENABLE_GUEST_SWAP.to_string(), + "false".to_string(), + ); + 
anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_DEFAULT_MEMORY.to_string(), + "100MiB".to_string(), + ); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_ENABLE_IO_THREADS.to_string(), + "false".to_string(), + ); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_ENABLE_IO_THREADS.to_string(), + "false".to_string(), + ); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_ENABLE_SWAP.to_string(), + "false".to_string(), + ); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_FILE_BACKED_MEM_ROOT_DIR.to_string(), + "./test_file_backend_mem_root".to_string(), + ); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_HUGE_PAGES.to_string(), + "false".to_string(), + ); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_JAILER_PATH.to_string(), + "./test_jailer_path".to_string(), + ); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_KERNEL_PATH.to_string(), + "./test_kernel_path".to_string(), + ); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_MEMORY_SLOTS.to_string(), + "100".to_string(), + ); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_VIRTIO_FS_EXTRA_ARGS.to_string(), + "rr,dg,er".to_string(), + ); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_VIRTIO_MEM.to_string(), + "false".to_string(), + ); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_VIRTIO_FS_DAEMON.to_string(), + "./virtio_fs".to_string(), + ); + anno_hash.insert(KATA_ANNO_CFG_EXPERIMENTAL.to_string(), "c,d,e".to_string()); + + let anno = Annotation::new(anno_hash); + let mut config = TomlConfig::load(content).unwrap(); + + assert!(anno.update_config_by_annotation(&mut config).is_ok()); + KataConfig::set_active_config(Some(config), "qemu", "agnet0"); + if let Some(ag) = KataConfig::get_default_config().get_agent() { + assert_eq!( + ag.kernel_modules[0], + "e1000e InterruptThrottleRate=3000,3000,3000 EEE=1" + ); + + assert_eq!(ag.kernel_modules[1], "i915_enabled_ppgtt=0"); + assert_eq!(ag.kernel_modules[2], "j465 aaa=1"); + assert_eq!(ag.kernel_modules[3], "r33w"); + assert!(!ag.enable_tracing); + assert_eq!(ag.container_pipe_size, 3); 
+ } + if let Some(hv) = KataConfig::get_default_config().get_hypervisor() { + assert_eq!(hv.path, "./hypervisor_path".to_string()); + assert_eq!(hv.blockdev_info.block_device_driver, "device"); + assert!(!hv.blockdev_info.block_device_cache_noflush); + assert!(hv.blockdev_info.block_device_cache_set); + assert_eq!(hv.blockdev_info.vhost_user_store_path, "./store_path"); + assert_eq!( + hv.security_info.guest_hook_path, + "./test_hypervisor_hook_path" + ); + assert!(!hv.memory_info.enable_mem_prealloc); + assert_eq!(hv.ctlpath, "./jvm".to_string()); + assert_eq!(hv.cpu_info.default_vcpus, 12); + assert!(!hv.memory_info.enable_guest_swap); + assert_eq!(hv.memory_info.default_memory, 100); + assert!(!hv.enable_iothreads); + assert!(!hv.enable_iothreads); + assert!(!hv.memory_info.enable_swap); + assert_eq!( + hv.memory_info.file_mem_backend, + "./test_file_backend_mem_root" + ); + assert!(!hv.memory_info.enable_hugepages); + assert_eq!(hv.jailer_path, "./test_jailer_path".to_string()); + assert_eq!(hv.boot_info.kernel, "./test_kernel_path"); + assert_eq!(hv.memory_info.memory_slots, 100); + assert_eq!(hv.shared_fs.virtio_fs_extra_args[5], "rr"); + assert_eq!(hv.shared_fs.virtio_fs_extra_args[6], "dg"); + assert_eq!(hv.shared_fs.virtio_fs_extra_args[7], "er"); + assert!(!hv.memory_info.enable_virtio_mem); + assert_eq!(hv.shared_fs.virtio_fs_daemon, "./virtio_fs"); + } + + assert!( + KataConfig::get_active_config() + .get_config() + .runtime + .disable_guest_seccomp + ); + + assert!( + !KataConfig::get_active_config() + .get_config() + .runtime + .enable_pprof + ); + assert_eq!( + KataConfig::get_active_config() + .get_config() + .runtime + .experimental, + ["a", "b", "c", "d", "e"] + ); + std::process::Command::new("rmdir") + .arg("./hypervisor_path") + .output() + .expect("failed to execute process"); + std::process::Command::new("rmdir") + .arg("./test_hypervisor_hook_path") + .output() + .expect("failed to execute process"); + + std::process::Command::new("rmdir") + 
.arg("./test_file_backend_mem_root") + .output() + .expect("failed to execute process"); + + std::process::Command::new("rmdir") + .arg("./test_jailer_path") + .output() + .expect("failed to execute process"); + std::process::Command::new("rmdir") + .arg("./test_kernel_path") + .output() + .expect("failed to execute process"); + std::process::Command::new("rmdir") + .arg("./virtio_fs") + .output() + .expect("failed to execute process"); + std::process::Command::new("rmdir") + .arg("./jvm") + .output() + .expect("failed to execute process"); + std::process::Command::new("rmdir") + .arg("./store_path") + .output() + .expect("failed to execute process"); + } + + #[test] + fn test_fail_to_change_block_device_driver_because_not_enabled() { + let content = include_str!("texture/configuration-anno-1.toml"); + + let qemu = QemuConfig::new(); + qemu.register(); + + let config = TomlConfig::load(content).unwrap(); + KataConfig::set_active_config(Some(config), "qemu", "agent0"); + + let mut anno_hash = HashMap::new(); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_BLOCK_DEV_DRIVER.to_string(), + "fvfvfvfvf".to_string(), + ); + let anno = Annotation::new(anno_hash); + let mut config = TomlConfig::load(content).unwrap(); + + assert!(anno.update_config_by_annotation(&mut config).is_ok()); + if let Some(hv) = KataConfig::get_default_config().get_hypervisor() { + assert_eq!(hv.blockdev_info.block_device_driver, "virtio-blk"); + } + } + + #[test] + fn test_fail_to_change_enable_guest_swap_because_not_enabled() { + let content = include_str!("texture/configuration-anno-1.toml"); + + let qemu = QemuConfig::new(); + qemu.register(); + + let config = TomlConfig::load(content).unwrap(); + KataConfig::set_active_config(Some(config), "qemu", "agent0"); + + let mut anno_hash = HashMap::new(); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_ENABLE_GUEST_SWAP.to_string(), + "false".to_string(), + ); + let anno = Annotation::new(anno_hash); + let mut config = 
TomlConfig::load(content).unwrap(); + + assert!(anno.update_config_by_annotation(&mut config).is_ok()); + if let Some(hv) = KataConfig::get_default_config().get_hypervisor() { + assert!(hv.memory_info.enable_guest_swap) + } + } + + #[test] + fn test_fail_to_change_hypervisor_path_because_of_invalid_path() { + let content = include_str!("texture/configuration-anno-0.toml"); + + let qemu = QemuConfig::new(); + qemu.register(); + + let config = TomlConfig::load(content).unwrap(); + KataConfig::set_active_config(Some(config), "qemu", "agent0"); + + let mut anno_hash = HashMap::new(); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_PATH.to_string(), + "/usr/bin/nle".to_string(), + ); + let anno = Annotation::new(anno_hash); + + let path = env!("CARGO_MANIFEST_DIR"); + let path = Path::new(path).join("tests/texture/configuration-anno-0.toml"); + let content = fs::read_to_string(&path).unwrap(); + let mut config = TomlConfig::load(&content).unwrap(); + assert!(anno.update_config_by_annotation(&mut config).is_err()); + } + + #[test] + fn test_fail_to_change_kernel_path_because_of_invalid_path() { + let path = env!("CARGO_MANIFEST_DIR"); + let path = Path::new(path).join("tests/texture/configuration-anno-0.toml"); + let content = fs::read_to_string(&path).unwrap(); + + let qemu = QemuConfig::new(); + qemu.register(); + + let config = TomlConfig::load(&content).unwrap(); + KataConfig::set_active_config(Some(config), "qemu", "agent0"); + + let mut anno_hash = HashMap::new(); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_KERNEL_PATH.to_string(), + "/usr/bin/cdcd".to_string(), + ); + let anno = Annotation::new(anno_hash); + let mut config = TomlConfig::load(&content).unwrap(); + + assert!(anno.update_config_by_annotation(&mut config).is_err()); + } + + #[test] + fn test_fail_to_change_memory_slots_because_of_less_than_zero() { + let content = include_str!("texture/configuration-anno-0.toml"); + let config = TomlConfig::load(content).unwrap(); + 
KataConfig::set_active_config(Some(config), "qemu", "agent0"); + + let qemu = QemuConfig::new(); + qemu.register(); + + let mut anno_hash = HashMap::new(); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_MEMORY_SLOTS.to_string(), + "-1".to_string(), + ); + let anno = Annotation::new(anno_hash); + let mut config = TomlConfig::load(content).unwrap(); + + assert!(anno.update_config_by_annotation(&mut config).is_err()); + } + + #[test] + fn test_fail_to_change_default_memory_because_less_than_min_memory_size() { + let content = include_str!("texture/configuration-anno-0.toml"); + + let qemu = QemuConfig::new(); + qemu.register(); + + let config = TomlConfig::load(content).unwrap(); + KataConfig::set_active_config(Some(config), "qemu", "agent0"); + + let mut anno_hash = HashMap::new(); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_DEFAULT_MEMORY.to_string(), + "10".to_string(), + ); + let anno = Annotation::new(anno_hash); + let mut config = TomlConfig::load(content).unwrap(); + + assert!(anno.update_config_by_annotation(&mut config).is_err()); + } + + #[test] + fn test_fail_to_change_default_vcpus_becuase_more_than_max_cpu_size() { + let content = include_str!("texture/configuration-anno-0.toml"); + + let qemu = QemuConfig::new(); + qemu.register(); + + let config = TomlConfig::load(content).unwrap(); + KataConfig::set_active_config(Some(config), "qemu", "agent0"); + + let mut anno_hash = HashMap::new(); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_DEFAULT_VCPUS.to_string(), + "400".to_string(), + ); + let anno = Annotation::new(anno_hash); + let mut config = TomlConfig::load(content).unwrap(); + + assert!(anno.update_config_by_annotation(&mut config).is_err()); + } + + #[test] + fn test_fail_to_change_enable_guest_swap_because_invalid_input() { + let content = include_str!("texture/configuration-anno-0.toml"); + + let qemu = QemuConfig::new(); + qemu.register(); + + let config = TomlConfig::load(content).unwrap(); + KataConfig::set_active_config(Some(config), 
"qemu", "agent0"); + + let mut anno_hash = HashMap::new(); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_ENABLE_GUEST_SWAP.to_string(), + "false1".to_string(), + ); + let anno = Annotation::new(anno_hash); + let mut config = TomlConfig::load(content).unwrap(); + + assert!(anno.update_config_by_annotation(&mut config).is_err()); + } + + #[test] + fn test_fail_to_change_default_vcpus_becuase_invalid_input() { + let content = include_str!("texture/configuration-anno-0.toml"); + + let qemu = QemuConfig::new(); + qemu.register(); + + let config = TomlConfig::load(content).unwrap(); + KataConfig::set_active_config(Some(config), "qemu", "agent0"); + + let mut anno_hash = HashMap::new(); + anno_hash.insert( + KATA_ANNO_CFG_HYPERVISOR_DEFAULT_VCPUS.to_string(), + "ddc".to_string(), + ); + let anno = Annotation::new(anno_hash); + let mut config = TomlConfig::load(content).unwrap(); + + assert!(anno.update_config_by_annotation(&mut config).is_err()); + } + + #[test] + fn test_fail_to_change_runtime_name() { + let content = include_str!("texture/configuration-anno-0.toml"); + + let qemu = QemuConfig::new(); + qemu.register(); + + let config = TomlConfig::load(content).unwrap(); + KataConfig::set_active_config(Some(config), "qemu", "agent0"); + let mut anno_hash = HashMap::new(); + anno_hash.insert( + KATA_ANNO_CFG_RUNTIME_NAME.to_string(), + "other-container".to_string(), + ); + let anno = Annotation::new(anno_hash); + let mut config = TomlConfig::load(content).unwrap(); + assert!(anno.update_config_by_annotation(&mut config).is_err()); + } +} diff --git a/src/libs/kata-types/tests/texture/configuration-anno-0.toml b/src/libs/kata-types/tests/texture/configuration-anno-0.toml new file mode 100644 index 0000000000..807de57b69 --- /dev/null +++ b/src/libs/kata-types/tests/texture/configuration-anno-0.toml @@ -0,0 +1,90 @@ +[hypervisor.qemu] +path = "/usr/bin/lsns" +valid_hypervisor_paths = ["/usr/bin/qemu*", "/opt/qemu?","/usr/bin/ls*","./hypervisor_path"] +valid_jailer_paths = 
["/usr/lib/rust","./test_jailer_path"] +ctlpath = "/usr/bin/" +valid_ctlpaths = ["/usr/lib/jvm","usr/bin/qemu-io","./jvm"] +disable_nesting_checks = true +enable_iothreads = true +jailer_path = "/usr/local" +kernel = "/usr/bin/../bin/zcmp" +image = "/usr/bin/./tabs" +kernel_params = "ro" +firmware = "/etc/hostname" + +cpu_features="pmu=off,vmx=off" +default_vcpus = 2 +default_maxvcpus = 64 + +machine_type = "q35" +confidential_guest = true +rootless = true +enable_annotations = ["shared_fs","path", "ctlpath","jailer_path","enable_iothreads","default_memory","memory_slots","enable_mem_prealloc","enable_hugepages","file_mem_backend","enable_virtio_mem","enable_swap","enable_guest_swap","default_vcpus","virtio_fs_extra_args","block_device_driver","vhost_user_store_path","kernel","guest_hook_path","block_device_cache_noflush","virtio_fs_daemon"] +machine_accelerators="noapic" +default_bridges = 2 +default_memory = 128 +memory_slots = 128 +memory_offset = 0x100000 +enable_virtio_mem = true +disable_block_device_use = false +shared_fs = "virtio-fs" +virtio_fs_daemon = "/usr/bin/uptime" +valid_virtio_fs_daemon_paths = ["/usr/local/bin/virtiofsd*","./virtio_fs"] +virtio_fs_cache_size = 512 +virtio_fs_extra_args = ["-o", "arg1=xxx,arg2", "-o", "hello world", "--arg3=yyy"] +virtio_fs_cache = "always" +block_device_driver = "virtio-blk" +block_device_cache_set = true +block_device_cache_direct = true +block_device_cache_noflush = true +enable_mem_prealloc = true +enable_hugepages = true +enable_vhost_user_store = true +vhost_user_store_path = "/tmp" +valid_vhost_user_store_paths = ["/var/kata/vhost-user-store*", "/tmp/kata?","/var/tmp","./store_path"] +enable_iommu = true +enable_iommu_platform = true +file_mem_backend = "/dev/shm" +valid_file_mem_backends = ["/dev/shm","/dev/snd","./test_file_backend_mem_root"] +enable_swap = true +pflashes = ["/proc/mounts"] +enable_debug = true +msize_9p = 16384 +disable_image_nvdimm = true +hotplug_vfio_on_root_bus = true +pcie_root_port 
= 2 +disable_vhost_net = true +entropy_source= "/dev/urandom" +valid_entropy_sources = ["/dev/urandom", "/dev/random"] +guest_hook_path = "/usr/share" +rx_rate_limiter_max_rate = 10000 +tx_rate_limiter_max_rate = 10000 +guest_memory_dump_path="/var/crash/kata" +guest_memory_dump_paging = true +enable_guest_swap = true + +[agent.agent0] +enable_tracing = true +debug_console_enabled = true +debug = true +dial_timeout = 1 +kernel_modules = ["e1000e InterruptThrottleRate=3000,3000,3000 EEE=1","i915_enabled_ppgtt=0"] +container_pipe_size = 2 +[runtime] +enable_debug = true +internetworking_model="macvtap" +disable_guest_seccomp=false +enable_tracing = true +jaeger_endpoint = "localhost:1234" +jaeger_user = "user" +jaeger_password = "pw" +disable_new_netns = true +sandbox_cgroup_only=true +sandbox_bind_mounts=["/proc/self"] +vfio_mode="vfio" +experimental=["a", "b"] +enable_pprof = true +hypervisor_name = "qemu" +agent_name = "agent0" + + diff --git a/src/libs/kata-types/tests/texture/configuration-anno-1.toml b/src/libs/kata-types/tests/texture/configuration-anno-1.toml new file mode 100644 index 0000000000..66e11dbe5c --- /dev/null +++ b/src/libs/kata-types/tests/texture/configuration-anno-1.toml @@ -0,0 +1,88 @@ +[hypervisor.qemu] +path = "/usr/bin/lsns" +valid_hypervisor_paths = ["/usr/bin/qemu*", "/opt/qemu?","/usr/bin/lsns","./hypervisor_path"] +valid_jailer_paths = ["/usr/lib/rust"] +ctlpath = "/usr/bin" +disable_nesting_checks = true +enable_iothreads = true +jailer_path = "/usr/local" +kernel = "/usr/bin/../bin/uptime" +image = "/usr/bin/./lessecho" +kernel_params = "ro" +firmware = "/etc/hostname" + +cpu_features="pmu=off,vmx=off" +default_vcpus = 2 +default_maxvcpus = 64 + +machine_type = "q35" +confidential_guest = true +rootless = true +enable_annotations = ["path", "ctlpath","jailer_path"] +machine_accelerators="noapic" +default_bridges = 2 +default_memory = 128 +memory_slots = 128 +memory_offset = 0x100000 +enable_virtio_mem = true 
+disable_block_device_use = false +shared_fs = "virtio-fs" +virtio_fs_daemon = "/usr/bin/uptime" +valid_virtio_fs_daemon_paths = ["/usr/local/bin/virtiofsd*"] +virtio_fs_cache_size = 512 +virtio_fs_extra_args = ["-o", "arg1=xxx,arg2", "-o", "hello world", "--arg3=yyy"] +virtio_fs_cache = "always" +block_device_driver = "virtio-blk" +block_device_cache_set = true +block_device_cache_direct = true +block_device_cache_noflush = true +enable_mem_prealloc = true +enable_hugepages = true +enable_vhost_user_store = true +vhost_user_store_path = "/tmp" +valid_vhost_user_store_paths = ["/var/kata/vhost-user-store*", "/tmp/kata?"] +enable_iommu = true +enable_iommu_platform = true +file_mem_backend = "/dev/shm" +valid_file_mem_backends = ["/dev/shm"] +enable_swap = true +pflashes = ["/proc/mounts"] +enable_debug = true +msize_9p = 16384 +disable_image_nvdimm = true +hotplug_vfio_on_root_bus = true +pcie_root_port = 2 +disable_vhost_net = true +entropy_source= "/dev/urandom" +valid_entropy_sources = ["/dev/urandom", "/dev/random"] +guest_hook_path = "/usr/share/oci/hooks" +rx_rate_limiter_max_rate = 10000 +tx_rate_limiter_max_rate = 10000 +guest_memory_dump_path="/var/crash/kata" +guest_memory_dump_paging = true +enable_guest_swap = true + +[agent.agent0] +enable_tracing = true +debug_console_enabled = true +debug = true +dial_timeout = 1 +kernel_modules = ["e1000e InterruptThrottleRate=3000,3000,3000 EEE=1","i915_enabled_ppgtt=0"] +container_pipe_size = 2 +[runtime] +enable_debug = true +internetworking_model="macvtap" +pdisable_guest_seccomp=true +enable_tracing = true +jaeger_endpoint = "localhost:1234" +jaeger_user = "user" +jaeger_password = "pw" +disable_new_netns = true +sandbox_cgroup_only=true +sandbox_bind_mounts=["/proc/self"] +vfio_mode="vfio" +experimental=["a", "b"] +enable_pprof = true +hypervisor_name = "qemu" +agent_name = "agent0" + diff --git a/src/libs/logging/Cargo.toml b/src/libs/logging/Cargo.toml index 36685c15a3..3457072bc6 100644 --- 
a/src/libs/logging/Cargo.toml +++ b/src/libs/logging/Cargo.toml @@ -12,7 +12,7 @@ serde_json = "1.0.73" # - Dynamic keys required to allow HashMap keys to be slog::Serialized. # - The 'max_*' features allow changing the log level at runtime # (by stopping the compiler from removing log calls). -slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug"] } +slog = { version = "2.5.2", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug"] } slog-json = "2.4.0" slog-async = "2.7.0" slog-scope = "4.4.0" diff --git a/src/libs/logging/Makefile b/src/libs/logging/Makefile deleted file mode 100644 index 74c917ab88..0000000000 --- a/src/libs/logging/Makefile +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2021 Intel Corporation -# -# SPDX-License-Identifier: Apache-2.0 -# - -# It is not necessary to have a build target as this crate is built -# automatically by the consumers of it. -# -# However, it is essential that the crate be tested. -default: test - -# It is essential to run these tests using *both* build profiles. -# See the `test_logger_levels()` test for further information. -test: - @echo "INFO: testing log levels for development build" - @cargo test - @echo "INFO: testing log levels for release build" - @cargo test --release diff --git a/src/libs/logging/src/file_rotate.rs b/src/libs/logging/src/file_rotate.rs new file mode 100644 index 0000000000..444297e53d --- /dev/null +++ b/src/libs/logging/src/file_rotate.rs @@ -0,0 +1,315 @@ +// Copyright (c) 2020 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 or MIT +// +// Partial code are extracted from +// https://github.com/sile/sloggers/blob/153c00a59f7218c1d96f522fb7a95c80bb0d530c/src/file.rs +// with following license and copyright. 
+// The MIT License +// +// Copyright (c) 2017 Takeru Ohta +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +use std::fs::{self, File, OpenOptions}; +use std::io::{self, LineWriter, Result, Write}; +use std::path::{Path, PathBuf}; + +/// Default rotate size for logger files. +const DEFAULT_LOG_FILE_SIZE_TO_ROTATE: u64 = 10485760; + +/// Default number of log files to keep. +const DEFAULT_HISTORY_LOG_FILES: usize = 3; + +/// Writer with file rotation for log files. +/// +/// This is a modified version of `FileAppender` from +/// https://github.com/sile/sloggers/blob/153c00a59f7218c1d96f522fb7a95c80bb0d530c/src/file.rs#L190 +#[derive(Debug)] +pub struct FileRotator { + path: PathBuf, + file: Option>, + ignore_errors: bool, + rotate_size: u64, + rotate_keep: usize, + truncate: bool, + written_size: u64, + #[cfg(test)] + fail_rename: bool, +} + +impl FileRotator { + /// Create a new instance of [`FileRotator`] to write log file at `path`. 
+ /// + /// It returns `std::io::Error` if the path is not a normal file or the parent directory does + /// not exist. + pub fn new>(path: P) -> Result { + let p = Path::new(path.as_ref()); + match p.metadata() { + Ok(md) => { + if !md.is_file() { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("path '{}' is not a file", p.to_string_lossy()), + )); + } + } + Err(e) if e.kind() == io::ErrorKind::NotFound => {} + Err(e) => return Err(e), + } + if let Some(parent) = p.parent() { + if p.has_root() || !parent.as_os_str().is_empty() { + let md = parent.metadata()?; + if !md.is_dir() { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("'{}' is not a directory", parent.to_string_lossy()), + )); + } + } + } + + Ok(FileRotator { + path: p.to_path_buf(), + file: None, + ignore_errors: false, + rotate_size: DEFAULT_LOG_FILE_SIZE_TO_ROTATE, + rotate_keep: DEFAULT_HISTORY_LOG_FILES, + truncate: false, + written_size: 0, + #[cfg(test)] + fail_rename: false, + }) + } + + /// Use "truncate" or "append" mode when opening the log file. + pub fn truncate_mode(&mut self, truncate: bool) -> &mut Self { + self.truncate = truncate; + self + } + + /// Set the threshold size to rotate log files. + pub fn rotate_threshold(&mut self, size: u64) -> &mut Self { + self.rotate_size = size; + self + } + + /// Set number of rotated log files to keep. + pub fn rotate_count(&mut self, count: usize) -> &mut Self { + self.rotate_keep = count; + self + } + + /// Ignore all errors and try best effort to log messages but without guarantee. + pub fn ignore_errors(&mut self, ignore_errors: bool) -> &mut Self { + self.ignore_errors = ignore_errors; + self + } + + /// Open the log file if + /// - it hasn't been opened yet. + /// - current log file has been rotated and needs to open a new log file. 
+ fn reopen_if_needed(&mut self) -> Result<()> { + if self.file.is_none() || !self.path.exists() { + let file = OpenOptions::new() + .create(true) + .write(true) + .truncate(self.truncate) + .append(!self.truncate) + .open(&self.path)?; + match file.metadata() { + Ok(md) => self.written_size = md.len(), + Err(e) => { + if self.ignore_errors { + // Pretend as an empty file. + // It's better to permit over-sized log file instead of disabling rotation. + self.written_size = 0; + } else { + return Err(e); + } + } + } + self.file = Some(LineWriter::new(file)); + } + + Ok(()) + } + + /// Try to rotate log files. + /// + /// When failed to rotate the log files, we choose to ignore the error instead of possibly + /// panicking the whole program. This may cause over-sized log files, but that should be easy + /// to recover. + fn rotate(&mut self) -> Result<()> { + for i in (1..=self.rotate_keep).rev() { + let from = self.rotated_path(i); + let to = self.rotated_path(i + 1); + if from.exists() { + let _ = fs::rename(from, to); + } + } + + #[cfg(test)] + if !self.fail_rename && self.path.exists() { + let rotated_path = self.rotated_path(1); + let _ = fs::rename(&self.path, &rotated_path); + } + #[cfg(not(test))] + if self.path.exists() { + let rotated_path = self.rotated_path(1); + let _ = fs::rename(&self.path, &rotated_path); + } + + let delete_path = self.rotated_path(self.rotate_keep + 1); + if delete_path.exists() { + let _ = fs::remove_file(delete_path); + } + + // Reset the `written_size` so only try to rotate again when another `rotate_size` bytes + // of log messages have been written to the log file. 
+ self.written_size = 0; + self.reopen_if_needed()?; + + Ok(()) + } + + fn rotated_path(&self, i: usize) -> PathBuf { + let mut path = self.path.clone().into_os_string(); + path.push(format!(".{}", i)); + PathBuf::from(path) + } +} + +impl Write for FileRotator { + fn write(&mut self, buf: &[u8]) -> Result { + if self.ignore_errors { + let _ = self.reopen_if_needed(); + if let Some(file) = self.file.as_mut() { + let _ = file.write_all(buf); + } + } else { + self.reopen_if_needed()?; + match self.file.as_mut() { + Some(file) => file.write_all(buf)?, + None => { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("Cannot open file: {:?}", self.path), + )) + } + } + } + + self.written_size += buf.len() as u64; + Ok(buf.len()) + } + + fn flush(&mut self) -> Result<()> { + if let Some(f) = self.file.as_mut() { + if let Err(e) = f.flush() { + if !self.ignore_errors { + return Err(e); + } + } + } + if self.written_size >= self.rotate_size { + if let Err(e) = self.rotate() { + if !self.ignore_errors { + return Err(e); + } + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::os::unix::fs::MetadataExt; + + #[test] + fn test_rotator_valid_path() { + FileRotator::new("/proc/self").unwrap_err(); + FileRotator::new("/proc/self/__does_not_exist__/log.txt").unwrap_err(); + + let _ = FileRotator::new("log.txt").unwrap(); + } + + #[test] + fn test_rotator_rotate() { + let tmpdir = tempfile::tempdir().unwrap(); + let mut path = tmpdir.path().to_path_buf(); + path.push("log.txt"); + + let mut rotator = FileRotator::new(&path).unwrap(); + rotator.truncate_mode(false); + rotator.rotate_threshold(4); + rotator.rotate_count(1); + assert_eq!(rotator.rotate_size, 4); + assert_eq!(rotator.rotate_keep, 1); + assert!(!rotator.truncate); + + rotator.write_all("test".as_bytes()).unwrap(); + rotator.flush().unwrap(); + rotator.write_all("test1".as_bytes()).unwrap(); + rotator.flush().unwrap(); + rotator.write_all("t2".as_bytes()).unwrap(); + 
rotator.flush().unwrap(); + + let content = fs::read_to_string(path).unwrap(); + assert_eq!(content, "t2"); + + let mut path1 = tmpdir.path().to_path_buf(); + path1.push("log.txt.1"); + let content = fs::read_to_string(path1).unwrap(); + assert_eq!(content, "test1"); + + let mut path2 = tmpdir.path().to_path_buf(); + path2.push("log.txt.2"); + fs::read_to_string(path2).unwrap_err(); + } + + #[test] + fn test_rotator_rotate_fail() { + let tmpdir = tempfile::tempdir().unwrap(); + let mut path = tmpdir.path().to_path_buf(); + path.push("log.txt"); + + let mut rotator = FileRotator::new(&path).unwrap(); + rotator.truncate_mode(false); + rotator.rotate_threshold(1); + rotator.rotate_count(1); + rotator.fail_rename = true; + + rotator.write_all("test".as_bytes()).unwrap(); + rotator.flush().unwrap(); + let size1 = path.metadata().unwrap().size(); + + rotator.write_all("test1".as_bytes()).unwrap(); + rotator.flush().unwrap(); + let size2 = path.metadata().unwrap().size(); + assert!(size2 > size1); + + rotator.write_all("test2".as_bytes()).unwrap(); + rotator.flush().unwrap(); + let size3 = path.metadata().unwrap().size(); + assert!(size3 > size2); + } +} diff --git a/src/libs/logging/src/lib.rs b/src/libs/logging/src/lib.rs index 3c9bf5e993..d72292a2c4 100644 --- a/src/libs/logging/src/lib.rs +++ b/src/libs/logging/src/lib.rs @@ -11,6 +11,23 @@ use std::process; use std::result; use std::sync::Mutex; +mod file_rotate; +mod log_writer; + +pub use file_rotate::FileRotator; +pub use log_writer::LogWriter; + +#[macro_export] +macro_rules! logger_with_subsystem { + ($name: ident, $subsystem: expr) => { + macro_rules! 
$name { + () => { + slog_scope::logger().new(slog::o!("subsystem" => $subsystem)) + }; + } + }; +} + const LOG_LEVELS: &[(&str, slog::Level)] = &[ ("trace", slog::Level::Trace), ("debug", slog::Level::Debug), @@ -528,8 +545,8 @@ mod tests { let msg = format!("test[{}]", i); // Create a writer for the logger drain to use - let writer = - NamedTempFile::new().unwrap_or_else(|_| panic!("{:}: failed to create tempfile", msg)); + let writer = NamedTempFile::new() + .unwrap_or_else(|_| panic!("{:}: failed to create tempfile", msg)); // Used to check file contents before the temp file is unlinked let mut writer_ref = writer diff --git a/src/libs/logging/src/log_writer.rs b/src/libs/logging/src/log_writer.rs new file mode 100644 index 0000000000..53e6d541e0 --- /dev/null +++ b/src/libs/logging/src/log_writer.rs @@ -0,0 +1,66 @@ +// Copyright (c) 2020 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::io::{Result, Write}; + +use slog::{info, Logger}; + +/// Writer to convert each line written to it to a log record. +#[derive(Debug)] +pub struct LogWriter(Logger); + +impl LogWriter { + /// Create a new isntance of ['LogWriter']. 
+ pub fn new(logger: Logger) -> Self { + LogWriter(logger) + } +} + +impl Write for LogWriter { + fn write(&mut self, buf: &[u8]) -> Result { + buf.split(|b| *b == b'\n').for_each(|it| { + if !it.is_empty() { + info!(self.0, "{}", String::from_utf8_lossy(it)) + } + }); + + Ok(buf.len()) + } + + fn flush(&mut self) -> Result<()> { + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{create_logger, FileRotator}; + use std::fs; + + #[test] + fn test_log_writer() { + let tmpdir = tempfile::tempdir().unwrap(); + let mut path = tmpdir.path().to_path_buf(); + path.push("log.txt"); + + let mut rotator = FileRotator::new(&path).unwrap(); + rotator.truncate_mode(false); + rotator.rotate_threshold(4); + rotator.rotate_count(1); + + let (logger, guard) = create_logger("test", "hi", slog::Level::Info, rotator); + let mut writer = LogWriter::new(logger); + + writer.write_all("test1\nblabla".as_bytes()).unwrap(); + writer.flush().unwrap(); + writer.write_all("test2".as_bytes()).unwrap(); + writer.flush().unwrap(); + drop(guard); + + let content = fs::read_to_string(path).unwrap(); + assert!(!content.is_empty()); + } +} diff --git a/src/libs/oci/src/lib.rs b/src/libs/oci/src/lib.rs index 3998b166c1..3bcaefa162 100644 --- a/src/libs/oci/src/lib.rs +++ b/src/libs/oci/src/lib.rs @@ -14,6 +14,8 @@ use std::collections::HashMap; mod serialize; pub use serialize::{to_string, to_writer, Error, Result}; +pub const OCI_SPEC_CONFIG_FILE_NAME: &str = "config.json"; + #[allow(dead_code)] fn is_false(b: bool) -> bool { !b diff --git a/src/libs/protocols/.gitignore b/src/libs/protocols/.gitignore index ce4964c4f0..0a83b1689a 100644 --- a/src/libs/protocols/.gitignore +++ b/src/libs/protocols/.gitignore @@ -1,9 +1,11 @@ Cargo.lock src/agent.rs src/agent_ttrpc.rs +src/agent_ttrpc_async.rs src/csi.rs src/empty.rs src/health.rs src/health_ttrpc.rs +src/health_ttrpc_async.rs src/oci.rs src/types.rs diff --git a/src/libs/protocols/Cargo.toml b/src/libs/protocols/Cargo.toml index 
ae93e7fa19..6853e9c259 100644 --- a/src/libs/protocols/Cargo.toml +++ b/src/libs/protocols/Cargo.toml @@ -7,13 +7,15 @@ edition = "2018" [features] default = [] with-serde = [ "serde", "serde_json" ] +async = ["ttrpc/async", "async-trait"] [dependencies] -ttrpc = { version = "0.5.0", features = ["async"] } -async-trait = "0.1.42" -protobuf = { version = "=2.14.0", features = ["with-serde"] } +ttrpc = { version = "0.6.0" } +async-trait = { version = "0.1.42", optional = true } +protobuf = { version = "2.27.0", features = ["with-serde"] } serde = { version = "1.0.130", features = ["derive"], optional = true } serde_json = { version = "1.0.68", optional = true } +oci = { path = "../oci" } [build-dependencies] ttrpc-codegen = "0.2.0" diff --git a/src/libs/protocols/build.rs b/src/libs/protocols/build.rs index 4a43f36777..ebb6ef1269 100644 --- a/src/libs/protocols/build.rs +++ b/src/libs/protocols/build.rs @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 // -use std::fs::File; +use std::fs::{self, File}; use std::io::{BufRead, BufReader, Read, Write}; use std::path::Path; use std::process::exit; @@ -90,17 +90,8 @@ fn handle_file(autogen_comment: &str, rust_filename: &str) -> Result<(), std::io Ok(()) } -fn real_main() -> Result<(), std::io::Error> { - let autogen_comment = format!("\n//! Generated by {:?} ({:?})", file!(), module_path!()); - - let protos = vec![ - "protos/agent.proto", - "protos/csi.proto", - "protos/google/protobuf/empty.proto", - "protos/health.proto", - "protos/oci.proto", - "protos/types.proto", - ]; +fn codegen(path: &str, protos: &[&str], async_all: bool) -> Result<(), std::io::Error> { + fs::create_dir_all(path).unwrap(); // Tell Cargo that if the .proto files changed, to rerun this build script. 
protos @@ -108,7 +99,7 @@ fn real_main() -> Result<(), std::io::Error> { .for_each(|p| println!("cargo:rerun-if-changed={}", &p)); let ttrpc_options = Customize { - async_server: true, + async_all, ..Default::default() }; @@ -121,13 +112,14 @@ fn real_main() -> Result<(), std::io::Error> { Codegen::new() .out_dir(out_dir) - .inputs(&protos) + .inputs(protos) .include("protos") .customize(ttrpc_options) .rust_protobuf() .rust_protobuf_customize(protobuf_options) .run()?; + let autogen_comment = format!("\n//! Generated by {:?} ({:?})", file!(), module_path!()); for file in protos.iter() { let proto_filename = Path::new(file).file_name().unwrap(); @@ -147,6 +139,32 @@ fn real_main() -> Result<(), std::io::Error> { handle_file(&autogen_comment, out_file_str)?; } + use_serde(protos, out_dir)?; + Ok(()) +} +fn real_main() -> Result<(), std::io::Error> { + codegen( + "src", + &[ + "protos/google/protobuf/empty.proto", + "protos/oci.proto", + "protos/types.proto", + "protos/csi.proto", + ], + false, + )?; + + // generate async + #[cfg(feature = "async")] + { + codegen("src", &["protos/agent.proto", "protos/health.proto"], true)?; + + fs::rename("src/agent_ttrpc.rs", "src/agent_ttrpc_async.rs")?; + fs::rename("src/health_ttrpc.rs", "src/health_ttrpc_async.rs")?; + } + + codegen("src", &["protos/agent.proto", "protos/health.proto"], false)?; + // There is a message named 'Box' in oci.proto // so there is a struct named 'Box', we should replace Box to ::std::boxed::Box // to avoid the conflict. 
@@ -156,8 +174,6 @@ fn real_main() -> Result<(), std::io::Error> { "self: ::std::boxed::Box", )?; - use_serde(&protos, out_dir)?; - Ok(()) } diff --git a/src/libs/protocols/src/lib.rs b/src/libs/protocols/src/lib.rs index 14298e52d9..0c62b8a933 100644 --- a/src/libs/protocols/src/lib.rs +++ b/src/libs/protocols/src/lib.rs @@ -7,9 +7,14 @@ pub mod agent; pub mod agent_ttrpc; +#[cfg(feature = "async")] +pub mod agent_ttrpc_async; pub mod csi; pub mod empty; pub mod health; pub mod health_ttrpc; +#[cfg(feature = "async")] +pub mod health_ttrpc_async; pub mod oci; +pub mod trans; pub mod types; diff --git a/src/libs/protocols/src/trans.rs b/src/libs/protocols/src/trans.rs new file mode 100644 index 0000000000..e9ecfe7859 --- /dev/null +++ b/src/libs/protocols/src/trans.rs @@ -0,0 +1,1085 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::collections::HashMap; +use std::convert::From; + +use oci::{ + Hook, Hooks, Linux, LinuxBlockIo, LinuxCapabilities, LinuxCpu, LinuxDevice, LinuxHugepageLimit, + LinuxIdMapping, LinuxIntelRdt, LinuxInterfacePriority, LinuxMemory, LinuxNamespace, + LinuxNetwork, LinuxPids, LinuxResources, LinuxSeccomp, LinuxSeccompArg, LinuxSyscall, + LinuxThrottleDevice, LinuxWeightDevice, Mount, PosixRlimit, Process, Root, Spec, User, +}; + +// translate from interface to ttprc tools +fn from_option>(from: Option) -> ::protobuf::SingularPtrField { + match from { + Some(f) => ::protobuf::SingularPtrField::from_option(Some(T::from(f))), + None => ::protobuf::SingularPtrField::none(), + } +} + +fn from_vec>(from: Vec) -> ::protobuf::RepeatedField { + let mut to: Vec = vec![]; + for data in from { + to.push(T::from(data)); + } + ::protobuf::RepeatedField::from_vec(to) +} + +impl From for crate::oci::Box { + fn from(from: oci::Box) -> Self { + crate::oci::Box { + Height: from.height, + Width: from.width, + unknown_fields: Default::default(), + cached_size: 
Default::default(), + } + } +} + +impl From for crate::oci::User { + fn from(from: User) -> Self { + crate::oci::User { + UID: from.uid, + GID: from.gid, + AdditionalGids: from.additional_gids, + Username: from.username, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::LinuxCapabilities { + fn from(from: LinuxCapabilities) -> Self { + crate::oci::LinuxCapabilities { + Bounding: from_vec(from.bounding), + Effective: from_vec(from.effective), + Inheritable: from_vec(from.inheritable), + Permitted: from_vec(from.permitted), + Ambient: from_vec(from.ambient), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::POSIXRlimit { + fn from(from: PosixRlimit) -> Self { + crate::oci::POSIXRlimit { + Type: from.r#type, + Hard: from.hard, + Soft: from.soft, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::Process { + fn from(from: Process) -> Self { + crate::oci::Process { + Terminal: from.terminal, + ConsoleSize: from_option(from.console_size), + User: from_option(Some(from.user)), + Args: from_vec(from.args), + Env: from_vec(from.env), + Cwd: from.cwd, + Capabilities: from_option(from.capabilities), + Rlimits: from_vec(from.rlimits), + NoNewPrivileges: from.no_new_privileges, + ApparmorProfile: from.apparmor_profile, + OOMScoreAdj: from.oom_score_adj.map_or(0, |t| t as i64), + SelinuxLabel: from.selinux_label, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::LinuxDeviceCgroup { + fn from(from: oci::LinuxDeviceCgroup) -> Self { + crate::oci::LinuxDeviceCgroup { + Allow: from.allow, + Type: from.r#type, + Major: from.major.map_or(0, |t| t as i64), + Minor: from.minor.map_or(0, |t| t as i64), + Access: from.access, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for 
crate::oci::LinuxMemory { + fn from(from: LinuxMemory) -> Self { + crate::oci::LinuxMemory { + Limit: from.limit.map_or(0, |t| t), + Reservation: from.reservation.map_or(0, |t| t), + Swap: from.swap.map_or(0, |t| t), + Kernel: from.kernel.map_or(0, |t| t), + KernelTCP: from.kernel_tcp.map_or(0, |t| t), + Swappiness: from.swappiness.map_or(0, |t| t as u64), + DisableOOMKiller: from.disable_oom_killer.map_or(false, |t| t), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::LinuxCPU { + fn from(from: LinuxCpu) -> Self { + crate::oci::LinuxCPU { + Shares: from.shares.map_or(0, |t| t), + Quota: from.quota.map_or(0, |t| t), + Period: from.period.map_or(0, |t| t), + RealtimeRuntime: from.realtime_runtime.map_or(0, |t| t), + RealtimePeriod: from.realtime_period.map_or(0, |t| t), + Cpus: from.cpus, + Mems: from.mems, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::LinuxPids { + fn from(from: LinuxPids) -> Self { + crate::oci::LinuxPids { + Limit: from.limit, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::LinuxWeightDevice { + fn from(from: LinuxWeightDevice) -> Self { + crate::oci::LinuxWeightDevice { + // TODO : check + Major: 0, + Minor: 0, + Weight: from.weight.map_or(0, |t| t as u32), + LeafWeight: from.leaf_weight.map_or(0, |t| t as u32), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::LinuxThrottleDevice { + fn from(from: LinuxThrottleDevice) -> Self { + crate::oci::LinuxThrottleDevice { + // TODO : check + Major: 0, + Minor: 0, + Rate: from.rate, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::LinuxBlockIO { + fn from(from: LinuxBlockIo) -> Self { + crate::oci::LinuxBlockIO { + Weight: from.weight.map_or(0, |t| t as u32), + LeafWeight: 
from.leaf_weight.map_or(0, |t| t as u32), + WeightDevice: from_vec(from.weight_device), + ThrottleReadBpsDevice: from_vec(from.throttle_read_bps_device), + ThrottleWriteBpsDevice: from_vec(from.throttle_write_bps_device), + ThrottleReadIOPSDevice: from_vec(from.throttle_read_iops_device), + ThrottleWriteIOPSDevice: from_vec(from.throttle_write_iops_device), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::LinuxHugepageLimit { + fn from(from: LinuxHugepageLimit) -> Self { + crate::oci::LinuxHugepageLimit { + Pagesize: from.page_size, + Limit: from.limit, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::LinuxInterfacePriority { + fn from(from: LinuxInterfacePriority) -> Self { + crate::oci::LinuxInterfacePriority { + Name: from.name, + Priority: from.priority, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::LinuxNetwork { + fn from(from: LinuxNetwork) -> Self { + crate::oci::LinuxNetwork { + ClassID: from.class_id.map_or(0, |t| t), + Priorities: from_vec(from.priorities), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::LinuxResources { + fn from(from: LinuxResources) -> Self { + crate::oci::LinuxResources { + Devices: from_vec(from.devices), + Memory: from_option(from.memory), + CPU: from_option(from.cpu), + Pids: from_option(from.pids), + BlockIO: from_option(from.block_io), + HugepageLimits: from_vec(from.hugepage_limits), + Network: from_option(from.network), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::Root { + fn from(from: Root) -> Self { + crate::oci::Root { + Path: from.path, + Readonly: from.readonly, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::Mount { + fn from(from: 
Mount) -> Self { + crate::oci::Mount { + destination: from.destination, + source: from.source, + field_type: from.r#type, + options: from_vec(from.options), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::Hook { + fn from(from: Hook) -> Self { + let mut timeout: i64 = 0; + if let Some(v) = from.timeout { + timeout = v as i64; + } + crate::oci::Hook { + Path: from.path, + Args: from_vec(from.args), + Env: from_vec(from.env), + Timeout: timeout, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::Hooks { + fn from(from: Hooks) -> Self { + crate::oci::Hooks { + Prestart: from_vec(from.prestart), + Poststart: from_vec(from.poststart), + Poststop: from_vec(from.poststop), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::LinuxIDMapping { + fn from(from: LinuxIdMapping) -> Self { + crate::oci::LinuxIDMapping { + HostID: from.host_id, + ContainerID: from.container_id, + Size: from.size, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::LinuxNamespace { + fn from(from: LinuxNamespace) -> Self { + crate::oci::LinuxNamespace { + Type: from.r#type, + Path: from.path, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::LinuxDevice { + fn from(from: LinuxDevice) -> Self { + crate::oci::LinuxDevice { + Path: from.path, + Type: from.r#type, + Major: from.major, + Minor: from.minor, + FileMode: from.file_mode.map_or(0, |v| v as u32), + UID: from.uid.map_or(0, |v| v), + GID: from.gid.map_or(0, |v| v), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::LinuxSeccompArg { + fn from(from: LinuxSeccompArg) -> Self { + crate::oci::LinuxSeccompArg { + Index: from.index as u64, + Value: from.value, + ValueTwo: 
from.value_two, + Op: from.op, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::LinuxSyscall { + fn from(from: LinuxSyscall) -> Self { + crate::oci::LinuxSyscall { + Names: from_vec(from.names), + Action: from.action, + Args: from_vec(from.args), + ErrnoRet: Some(crate::oci::LinuxSyscall_oneof_ErrnoRet::errnoret( + from.errno_ret, + )), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::LinuxSeccomp { + fn from(from: LinuxSeccomp) -> Self { + crate::oci::LinuxSeccomp { + DefaultAction: from.default_action, + Architectures: from_vec(from.architectures), + Syscalls: from_vec(from.syscalls), + Flags: from_vec(from.flags), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::LinuxIntelRdt { + fn from(from: LinuxIntelRdt) -> Self { + crate::oci::LinuxIntelRdt { + L3CacheSchema: from.l3_cache_schema, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::Linux { + fn from(from: Linux) -> Self { + crate::oci::Linux { + UIDMappings: from_vec(from.uid_mappings), + GIDMappings: from_vec(from.gid_mappings), + Sysctl: from.sysctl, + Resources: from_option(from.resources), + CgroupsPath: from.cgroups_path, + Namespaces: from_vec(from.namespaces), + Devices: from_vec(from.devices), + Seccomp: from_option(from.seccomp), + RootfsPropagation: from.rootfs_propagation, + MaskedPaths: from_vec(from.masked_paths), + ReadonlyPaths: from_vec(from.readonly_paths), + MountLabel: from.mount_label, + IntelRdt: from_option(from.intel_rdt), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for crate::oci::Spec { + fn from(from: Spec) -> Self { + crate::oci::Spec { + Version: from.version, + Process: from_option(from.process), + Root: from_option(from.root), + Hostname: from.hostname, + Mounts: 
from_vec(from.mounts), + Hooks: from_option(from.hooks), + Annotations: from.annotations, + Linux: from_option(from.linux), + Solaris: Default::default(), + Windows: Default::default(), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for oci::Root { + fn from(from: crate::oci::Root) -> Self { + Self { + path: from.Path, + readonly: from.Readonly, + } + } +} + +impl From for oci::Mount { + fn from(mut from: crate::oci::Mount) -> Self { + let options = from.take_options().to_vec(); + Self { + r#type: from.take_field_type(), + destination: from.take_destination(), + source: from.take_source(), + options, + } + } +} + +impl From for oci::LinuxIdMapping { + fn from(from: crate::oci::LinuxIDMapping) -> Self { + LinuxIdMapping { + container_id: from.get_ContainerID(), + host_id: from.get_HostID(), + size: from.get_Size(), + } + } +} + +impl From for oci::LinuxDeviceCgroup { + fn from(mut from: crate::oci::LinuxDeviceCgroup) -> Self { + let mut major = None; + if from.get_Major() > 0 { + major = Some(from.get_Major() as i64); + } + + let mut minor = None; + if from.get_Minor() > 0 { + minor = Some(from.get_Minor() as i64) + } + + oci::LinuxDeviceCgroup { + allow: from.get_Allow(), + r#type: from.take_Type(), + major, + minor, + access: from.take_Access(), + } + } +} + +impl From for oci::LinuxMemory { + fn from(from: crate::oci::LinuxMemory) -> Self { + let mut limit = None; + if from.get_Limit() > 0 { + limit = Some(from.get_Limit()); + } + + let mut reservation = None; + if from.get_Reservation() > 0 { + reservation = Some(from.get_Reservation()); + } + + let mut swap = None; + if from.get_Swap() > 0 { + swap = Some(from.get_Swap()); + } + + let mut kernel = None; + if from.get_Kernel() > 0 { + kernel = Some(from.get_Kernel()); + } + + let mut kernel_tcp = None; + if from.get_KernelTCP() > 0 { + kernel_tcp = Some(from.get_KernelTCP()); + } + + let mut swappiness = None; + if from.get_Swappiness() > 0 { + swappiness = 
Some(from.get_Swappiness()); + } + + let disable_oom_killer = Some(from.get_DisableOOMKiller()); + + oci::LinuxMemory { + limit, + reservation, + swap, + kernel, + kernel_tcp, + swappiness, + disable_oom_killer, + } + } +} + +impl From for oci::LinuxCpu { + fn from(mut from: crate::oci::LinuxCPU) -> Self { + let mut shares = None; + if from.get_Shares() > 0 { + shares = Some(from.get_Shares()); + } + + let mut quota = None; + if from.get_Quota() > 0 { + quota = Some(from.get_Quota()); + } + + let mut period = None; + if from.get_Period() > 0 { + period = Some(from.get_Period()); + } + + let mut realtime_runtime = None; + if from.get_RealtimeRuntime() > 0 { + realtime_runtime = Some(from.get_RealtimeRuntime()); + } + + let mut realtime_period = None; + if from.get_RealtimePeriod() > 0 { + realtime_period = Some(from.get_RealtimePeriod()); + } + + let cpus = from.take_Cpus(); + let mems = from.take_Mems(); + + oci::LinuxCpu { + shares, + quota, + period, + realtime_runtime, + realtime_period, + cpus, + mems, + } + } +} + +impl From for oci::LinuxPids { + fn from(from: crate::oci::LinuxPids) -> Self { + oci::LinuxPids { + limit: from.get_Limit(), + } + } +} + +impl From for oci::LinuxBlockIo { + fn from(from: crate::oci::LinuxBlockIO) -> Self { + let mut weight = None; + if from.get_Weight() > 0 { + weight = Some(from.get_Weight() as u16); + } + let mut leaf_weight = None; + if from.get_LeafWeight() > 0 { + leaf_weight = Some(from.get_LeafWeight() as u16); + } + let mut weight_device = Vec::new(); + for wd in from.get_WeightDevice() { + weight_device.push(wd.clone().into()); + } + + let mut throttle_read_bps_device = Vec::new(); + for td in from.get_ThrottleReadBpsDevice() { + throttle_read_bps_device.push(td.clone().into()); + } + + let mut throttle_write_bps_device = Vec::new(); + for td in from.get_ThrottleWriteBpsDevice() { + throttle_write_bps_device.push(td.clone().into()); + } + + let mut throttle_read_iops_device = Vec::new(); + for td in 
from.get_ThrottleReadIOPSDevice() { + throttle_read_iops_device.push(td.clone().into()); + } + + let mut throttle_write_iops_device = Vec::new(); + for td in from.get_ThrottleWriteIOPSDevice() { + throttle_write_iops_device.push(td.clone().into()); + } + + oci::LinuxBlockIo { + weight, + leaf_weight, + weight_device, + throttle_read_bps_device, + throttle_write_bps_device, + throttle_read_iops_device, + throttle_write_iops_device, + } + } +} + +impl From for oci::LinuxThrottleDevice { + fn from(from: crate::oci::LinuxThrottleDevice) -> Self { + oci::LinuxThrottleDevice { + blk: oci::LinuxBlockIoDevice { + major: from.Major, + minor: from.Minor, + }, + rate: from.Rate, + } + } +} + +impl From for oci::LinuxWeightDevice { + fn from(from: crate::oci::LinuxWeightDevice) -> Self { + oci::LinuxWeightDevice { + blk: oci::LinuxBlockIoDevice { + major: from.Major, + minor: from.Minor, + }, + weight: Some(from.Weight as u16), + leaf_weight: Some(from.LeafWeight as u16), + } + } +} + +impl From for oci::LinuxInterfacePriority { + fn from(mut from: crate::oci::LinuxInterfacePriority) -> Self { + oci::LinuxInterfacePriority { + name: from.take_Name(), + priority: from.get_Priority(), + } + } +} + +impl From for oci::LinuxNetwork { + fn from(mut from: crate::oci::LinuxNetwork) -> Self { + let mut class_id = None; + if from.get_ClassID() > 0 { + class_id = Some(from.get_ClassID()); + } + let mut priorities = Vec::new(); + for p in from.take_Priorities().to_vec() { + priorities.push(p.into()) + } + + oci::LinuxNetwork { + class_id, + priorities, + } + } +} + +impl From for oci::LinuxHugepageLimit { + fn from(mut from: crate::oci::LinuxHugepageLimit) -> Self { + oci::LinuxHugepageLimit { + page_size: from.take_Pagesize(), + limit: from.get_Limit(), + } + } +} + +impl From for oci::LinuxResources { + fn from(mut from: crate::oci::LinuxResources) -> Self { + let mut devices = Vec::new(); + for d in from.take_Devices().to_vec() { + devices.push(d.into()); + } + + let mut memory = 
None; + if from.has_Memory() { + memory = Some(from.take_Memory().into()); + } + + let mut cpu = None; + if from.has_CPU() { + cpu = Some(from.take_CPU().into()); + } + + let mut pids = None; + if from.has_Pids() { + pids = Some(from.get_Pids().clone().into()) + } + + let mut block_io = None; + if from.has_BlockIO() { + block_io = Some(from.get_BlockIO().clone().into()); + } + + let mut hugepage_limits = Vec::new(); + for hl in from.get_HugepageLimits() { + hugepage_limits.push(hl.clone().into()); + } + + let mut network = None; + if from.has_Network() { + network = Some(from.take_Network().into()); + } + + let rdma = HashMap::new(); + + LinuxResources { + devices, + memory, + cpu, + pids, + block_io, + hugepage_limits, + network, + rdma, + } + } +} + +impl From for oci::LinuxDevice { + fn from(mut from: crate::oci::LinuxDevice) -> Self { + oci::LinuxDevice { + path: from.take_Path(), + r#type: from.take_Type(), + major: from.get_Major(), + minor: from.get_Minor(), + file_mode: Some(from.get_FileMode()), + uid: Some(from.get_UID()), + gid: Some(from.get_GID()), + } + } +} + +impl From for oci::LinuxSeccompArg { + fn from(mut from: crate::oci::LinuxSeccompArg) -> Self { + oci::LinuxSeccompArg { + index: from.get_Index() as u32, + value: from.get_Value(), + value_two: from.get_ValueTwo(), + op: from.take_Op(), + } + } +} + +impl From for oci::LinuxSyscall { + fn from(mut from: crate::oci::LinuxSyscall) -> Self { + let mut args = Vec::new(); + for ag in from.take_Args().to_vec() { + args.push(ag.into()); + } + oci::LinuxSyscall { + names: from.take_Names().to_vec(), + action: from.take_Action(), + args, + errno_ret: from.get_errnoret(), + } + } +} + +impl From for oci::LinuxSeccomp { + fn from(mut from: crate::oci::LinuxSeccomp) -> Self { + let mut syscalls = Vec::new(); + for s in from.take_Syscalls().to_vec() { + syscalls.push(s.into()); + } + + oci::LinuxSeccomp { + default_action: from.take_DefaultAction(), + architectures: from.take_Architectures().to_vec(), + 
syscalls, + flags: from.take_Flags().to_vec(), + } + } +} + +impl From for oci::LinuxNamespace { + fn from(mut from: crate::oci::LinuxNamespace) -> Self { + oci::LinuxNamespace { + r#type: from.take_Type(), + path: from.take_Path(), + } + } +} + +impl From for oci::Linux { + fn from(mut from: crate::oci::Linux) -> Self { + let mut uid_mappings = Vec::new(); + for id_map in from.take_UIDMappings().to_vec() { + uid_mappings.push(id_map.into()) + } + + let mut gid_mappings = Vec::new(); + for id_map in from.take_GIDMappings().to_vec() { + gid_mappings.push(id_map.into()) + } + + let sysctl = from.get_Sysctl().clone(); + let mut resources = None; + if from.has_Resources() { + resources = Some(from.take_Resources().into()); + } + + let cgroups_path = from.take_CgroupsPath(); + let mut namespaces = Vec::new(); + for ns in from.take_Namespaces().to_vec() { + namespaces.push(ns.into()) + } + + let mut devices = Vec::new(); + for d in from.take_Devices().to_vec() { + devices.push(d.into()); + } + + let mut seccomp = None; + if from.has_Seccomp() { + seccomp = Some(from.take_Seccomp().into()); + } + + let rootfs_propagation = from.take_RootfsPropagation(); + let masked_paths = from.take_MaskedPaths().to_vec(); + + let readonly_paths = from.take_ReadonlyPaths().to_vec(); + + let mount_label = from.take_MountLabel(); + let intel_rdt = None; + + oci::Linux { + uid_mappings, + gid_mappings, + sysctl, + resources, + cgroups_path, + namespaces, + devices, + seccomp, + rootfs_propagation, + masked_paths, + readonly_paths, + mount_label, + intel_rdt, + } + } +} + +impl From for oci::PosixRlimit { + fn from(mut from: crate::oci::POSIXRlimit) -> Self { + oci::PosixRlimit { + r#type: from.take_Type(), + hard: from.get_Hard(), + soft: from.get_Soft(), + } + } +} + +impl From for oci::LinuxCapabilities { + fn from(mut from: crate::oci::LinuxCapabilities) -> Self { + oci::LinuxCapabilities { + bounding: from.take_Bounding().to_vec(), + effective: from.take_Effective().to_vec(), + 
inheritable: from.take_Inheritable().to_vec(), + permitted: from.take_Permitted().to_vec(), + ambient: from.take_Ambient().to_vec(), + } + } +} + +impl From for oci::User { + fn from(mut from: crate::oci::User) -> Self { + oci::User { + uid: from.get_UID(), + gid: from.get_GID(), + additional_gids: from.take_AdditionalGids().to_vec(), + username: from.take_Username(), + } + } +} + +impl From for oci::Box { + fn from(from: crate::oci::Box) -> Self { + oci::Box { + height: from.get_Height(), + width: from.get_Width(), + } + } +} + +impl From for oci::Process { + fn from(mut from: crate::oci::Process) -> Self { + let mut console_size = None; + if from.has_ConsoleSize() { + console_size = Some(from.take_ConsoleSize().into()); + } + + let user = from.take_User().into(); + let args = from.take_Args().into_vec(); + let env = from.take_Env().into_vec(); + let cwd = from.take_Cwd(); + let mut capabilities = None; + if from.has_Capabilities() { + capabilities = Some(from.take_Capabilities().into()); + } + let mut rlimits = Vec::new(); + for rl in from.take_Rlimits().to_vec() { + rlimits.push(rl.into()); + } + let no_new_privileges = from.get_NoNewPrivileges(); + let apparmor_profile = from.take_ApparmorProfile(); + let mut oom_score_adj = None; + if from.get_OOMScoreAdj() != 0 { + oom_score_adj = Some(from.get_OOMScoreAdj() as i32); + } + let selinux_label = from.take_SelinuxLabel(); + + oci::Process { + terminal: from.Terminal, + console_size, + user, + args, + env, + cwd, + capabilities, + rlimits, + no_new_privileges, + apparmor_profile, + oom_score_adj, + selinux_label, + } + } +} + +impl From for oci::Hook { + fn from(mut from: crate::oci::Hook) -> Self { + let mut timeout = None; + if from.get_Timeout() > 0 { + timeout = Some(from.get_Timeout() as i32); + } + oci::Hook { + path: from.take_Path(), + args: from.take_Args().to_vec(), + env: from.take_Env().to_vec(), + timeout, + } + } +} + +impl From for oci::Hooks { + fn from(mut from: crate::oci::Hooks) -> Self { + let 
mut prestart = Vec::new(); + for hook in from.take_Prestart().to_vec() { + prestart.push(hook.into()) + } + let mut poststart = Vec::new(); + for hook in from.take_Poststart().to_vec() { + poststart.push(hook.into()); + } + let mut poststop = Vec::new(); + for hook in from.take_Poststop().to_vec() { + poststop.push(hook.into()); + } + oci::Hooks { + prestart, + poststart, + poststop, + } + } +} + +impl From for oci::Spec { + fn from(mut from: crate::oci::Spec) -> Self { + let mut process = None; + if from.has_Process() { + process = Some(from.take_Process().into()); + } + + let mut root = None; + if from.has_Root() { + root = Some(from.take_Root().into()); + } + + let mut mounts = Vec::new(); + for m in from.take_Mounts().into_vec() { + mounts.push(m.into()) + } + + let mut hooks: Option = None; + if from.has_Hooks() { + hooks = Some(from.take_Hooks().into()); + } + + let annotations = from.take_Annotations(); + + let mut linux = None; + if from.has_Linux() { + linux = Some(from.take_Linux().into()); + } + + oci::Spec { + version: from.take_Version(), + process, + root, + hostname: from.take_Hostname(), + mounts, + hooks, + annotations, + linux, + solaris: None, + windows: None, + vm: None, + } + } +} + +#[cfg(test)] +mod tests { + use crate::trans::from_vec; + + #[derive(Clone)] + struct TestA { + pub from: String, + } + + #[derive(Clone)] + struct TestB { + pub to: String, + } + + impl From for TestB { + fn from(from: TestA) -> Self { + TestB { to: from.from } + } + } + + #[test] + fn test_from() { + let from = TestA { + from: "a".to_string(), + }; + let to: TestB = TestB::from(from.clone()); + + assert_eq!(from.from, to.to); + } + + #[test] + fn test_from_vec_len_0() { + let from: Vec = vec![]; + let to: ::protobuf::RepeatedField = from_vec(from.clone()); + assert_eq!(from.len(), to.len()); + } + + #[test] + fn test_from_vec_len_1() { + let from: Vec = vec![TestA { + from: "a".to_string(), + }]; + let to: ::protobuf::RepeatedField = from_vec(from.clone()); + + 
assert_eq!(from.len(), to.len()); + assert_eq!(from[0].from, to[0].to); + } +} diff --git a/src/runtime-rs/.gitignore b/src/runtime-rs/.gitignore new file mode 100644 index 0000000000..0e5a39c11f --- /dev/null +++ b/src/runtime-rs/.gitignore @@ -0,0 +1,3 @@ +target +crates/shim/src/config.rs +/config/*.toml diff --git a/src/runtime-rs/Cargo.lock b/src/runtime-rs/Cargo.lock new file mode 100644 index 0000000000..5bfe7cbfe9 --- /dev/null +++ b/src/runtime-rs/Cargo.lock @@ -0,0 +1,3062 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "actix-macros" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "465a6172cf69b960917811022d8f29bc0b7fa1398bc4f78b3c466673db1213b6" +dependencies = [ + "quote", + "syn", +] + +[[package]] +name = "actix-rt" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ea16c295198e958ef31930a6ef37d0fb64e9ca3b6116e6b93a8bdae96ee1000" +dependencies = [ + "actix-macros", + "futures-core", + "tokio", +] + +[[package]] +name = "addr2line" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9ecd88a8c8378ca913a680cd98f0f13ac67383d35993f86c90a70e3f137816b" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "agent" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "futures 0.1.31", + "kata-types", + "log", + "logging", + "oci", + "protobuf", + "protocols", + "serde", + "serde_json", + "slog", + "slog-scope", + "tokio", + "ttrpc", + "url", +] + +[[package]] +name = "aho-corasick" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" +dependencies = [ + "memchr", +] + +[[package]] +name = "anyhow" +version = "1.0.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08f9b8508dccb7687a1d6c4ce66b2b0ecef467c94667de27d8d7fe1f8d2a9cdc" + +[[package]] +name = "arc-swap" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dabe5a181f83789739c194cbe5a897dde195078fac08568d09221fd6137a7ba8" + +[[package]] +name = "arc-swap" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5d78ce20460b82d3fa150275ed9d55e21064fc7951177baacf86a145c4a4b1f" + +[[package]] +name = "arrayref" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4c527152e37cf757a3f78aae5a06fbeefdb07ccc535c980a3208ee3060dd544" + +[[package]] +name = "arrayvec" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" + +[[package]] +name = "async-trait" +version = "0.1.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96cf8829f67d2eab0b2dfa42c5d0ef737e0724e4a82b01b3e292456202b19716" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "awaitgroup" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc17ab023b4091c10ff099f9deebaeeb59b5189df07e554c4fef042b70745d68" + +[[package]] +name = "backtrace" +version = "0.3.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11a17d453482a265fd5f8479f2a3f405566e6ca627837aaddb85af8b1ab8ef61" +dependencies = [ + "addr2line", + "cc", + "cfg-if 
1.0.0", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "blake3" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a08e53fc5a564bb15bfe6fae56bd71522205f1f91893f9c0116edad6496c183f" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if 1.0.0", + "constant_time_eq", + "digest 0.10.3", +] + +[[package]] +name = "blobfs" +version = "0.1.0" +source = "git+https://github.com/dragonflyoss/image-service.git?rev=316380792092f73c99f832c4cb44ef4319d6f76b#316380792092f73c99f832c4cb44ef4319d6f76b" +dependencies = [ + "fuse-backend-rs", + "libc", + "log", + "nydus-error", + "rafs", + "serde", + "serde_json", + "serde_with", + "storage", + "vm-memory", +] + +[[package]] +name = "block-buffer" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" +dependencies = [ + "generic-array", +] + +[[package]] +name = "block-buffer" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf7fe51849ea569fd452f37822f606a5cabb684dc918707a0193fd4664ff324" +dependencies = [ + "generic-array", +] + +[[package]] +name = "bumpalo" +version = "3.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37ccbd214614c6783386c1af30caf03192f17891059cecc394b4fb119e363de3" + +[[package]] +name = "byte-unit" +version = "3.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "415301c9de11005d4b92193c0eb7ac7adc37e5a49e0ac9bed0a42343512744b8" + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "bytes" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "206fdffcfa2df7cbe15601ef46c813fce0965eb3286db6b56c583b814b51c81c" +dependencies = [ + "byteorder", + "iovec", +] + +[[package]] +name = "bytes" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8" + +[[package]] +name = "caps" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61bf7211aad104ce2769ec05efcdfabf85ee84ac92461d142f22cf8badd0e54c" +dependencies = [ + "errno", + "libc", + "thiserror", +] + +[[package]] +name = "cc" +version = "1.0.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" +dependencies = [ + "jobserver", +] + +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "cgroups-rs" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdae996d9638ba03253ffa1c93345a585974a97abbdeab9176c77922f3efc1e8" +dependencies = [ + "libc", + "log", + "nix 0.23.1", + "regex", +] + +[[package]] +name = "chrono" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" +dependencies = [ + "libc", + "num-integer", + "num-traits", + "time 0.1.43", + "winapi", +] + +[[package]] +name = "common" +version = "0.1.0" +dependencies 
= [ + "agent", + "anyhow", + "async-trait", + "containerd-shim-protos", + "kata-sys-util", + "kata-types", + "lazy_static", + "nix 0.24.1", + "oci", + "protobuf", + "serde_json", + "slog", + "slog-scope", + "strum", + "thiserror", + "tokio", + "ttrpc", +] + +[[package]] +name = "common-path" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2382f75942f4b3be3690fe4f86365e9c853c1587d6ee58212cebf6e2a9ccd101" + +[[package]] +name = "constant_time_eq" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" + +[[package]] +name = "containerd-shim-protos" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "077ec778a0835d9d85502e8535362130187759b69eddabe2bdb3a68ffb575bd0" +dependencies = [ + "async-trait", + "protobuf", + "ttrpc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "065a5d7ffdcbc8fa145d6f0746f3555025b9097a9e9cda59f7467abae670c78d" +dependencies = [ + "libc", +] + +[[package]] +name = "cpuid-bool" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8aebca1129a03dc6dc2b127edd729435bbc4a37e1d5f4d7513165089ceb02634" + +[[package]] +name = "crc32fast" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aaa7bd5fb665c6864b5f963dd9097905c54125909c7aa94c9e18507cdbe6c53" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.8" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf124c720b7686e3c2663cf54062ab0f68a88af2fb6a030e87e30bf721fcb38" +dependencies = [ + "cfg-if 1.0.0", + "lazy_static", +] + +[[package]] +name = "crypto-common" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57952ca27b5e3606ff4dd79b0020231aaf9d6aa76dc05fd30137538c50bd3ce8" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "darling" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a01d95850c592940db9b8194bc39f4bc0e89dee5c4265e4b1807c34a9aba453c" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "859d65a907b6852c9361e3185c862aae7fafd2887876799fa55f5f99dc40d610" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c972679f83bdf9c42bd905396b6c3588a843a17f0f16dfcfa3e2c5d57441835" +dependencies = [ + "darling_core", + "quote", + "syn", +] + +[[package]] +name = "dashmap" +version = "5.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c8858831f7781322e539ea39e72449c46b059638250c14344fec8d0aa6e539c" +dependencies = [ + "cfg-if 1.0.0", + "num_cpus", + "parking_lot 0.12.1", +] + +[[package]] +name = "dbs-address-space" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9acd47f8b1ad8a6a62450d2d83ced5452dbf9549e2b98709d945554b22a45ed7" +dependencies = [ + "arc-swap 1.5.0", + "libc", + "nix 0.23.1", + "thiserror", + "vm-memory", + "vmm-sys-util", +] + +[[package]] +name = "dbs-allocator" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"92a384ac9bd9c22c486c7a66e68cdc6cd504da7793b69bd891f3d85070c999b6" +dependencies = [ + "thiserror", +] + +[[package]] +name = "dbs-arch" +version = "0.1.0" +source = "git+https://github.com/openanolis/dragonball-sandbox.git?rev=7a8e832b53d66994d6a16f0513d69f540583dcd0#7a8e832b53d66994d6a16f0513d69f540583dcd0" +dependencies = [ + "kvm-bindings", + "kvm-ioctls", + "libc", + "memoffset", + "vm-memory", + "vmm-sys-util", +] + +[[package]] +name = "dbs-boot" +version = "0.2.0" +source = "git+https://github.com/openanolis/dragonball-sandbox.git?rev=7a8e832b53d66994d6a16f0513d69f540583dcd0#7a8e832b53d66994d6a16f0513d69f540583dcd0" +dependencies = [ + "dbs-arch", + "kvm-bindings", + "kvm-ioctls", + "lazy_static", + "libc", + "thiserror", + "vm-fdt", + "vm-memory", +] + +[[package]] +name = "dbs-device" +version = "0.1.0" +source = "git+https://github.com/openanolis/dragonball-sandbox.git?rev=7a8e832b53d66994d6a16f0513d69f540583dcd0#7a8e832b53d66994d6a16f0513d69f540583dcd0" +dependencies = [ + "thiserror", +] + +[[package]] +name = "dbs-interrupt" +version = "0.1.0" +source = "git+https://github.com/openanolis/dragonball-sandbox.git?rev=7a8e832b53d66994d6a16f0513d69f540583dcd0#7a8e832b53d66994d6a16f0513d69f540583dcd0" +dependencies = [ + "dbs-device", + "kvm-bindings", + "kvm-ioctls", + "libc", + "vmm-sys-util", +] + +[[package]] +name = "dbs-legacy-devices" +version = "0.1.0" +source = "git+https://github.com/openanolis/dragonball-sandbox.git?rev=7a8e832b53d66994d6a16f0513d69f540583dcd0#7a8e832b53d66994d6a16f0513d69f540583dcd0" +dependencies = [ + "dbs-device", + "dbs-utils", + "log", + "serde", + "vm-superio", + "vmm-sys-util", +] + +[[package]] +name = "dbs-uhttp" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b773f7f1b9088438e9746890c7c0836b133b07935812867a33e06e81c92c0cdc" +dependencies = [ + "libc", + "mio", +] + +[[package]] +name = "dbs-utils" +version = "0.1.0" +source = 
"git+https://github.com/openanolis/dragonball-sandbox.git?rev=7a8e832b53d66994d6a16f0513d69f540583dcd0#7a8e832b53d66994d6a16f0513d69f540583dcd0" +dependencies = [ + "anyhow", + "event-manager", + "libc", + "log", + "serde", + "thiserror", + "timerfd", + "vmm-sys-util", +] + +[[package]] +name = "dbs-virtio-devices" +version = "0.1.0" +source = "git+https://github.com/openanolis/dragonball-sandbox.git?rev=7a8e832b53d66994d6a16f0513d69f540583dcd0#7a8e832b53d66994d6a16f0513d69f540583dcd0" +dependencies = [ + "blobfs", + "byteorder", + "caps", + "dbs-device", + "dbs-interrupt", + "dbs-utils", + "epoll", + "fuse-backend-rs", + "io-uring", + "kvm-bindings", + "kvm-ioctls", + "libc", + "log", + "nix 0.23.1", + "rafs", + "rlimit", + "serde", + "serde_json", + "thiserror", + "threadpool", + "virtio-bindings", + "virtio-queue", + "vm-memory", + "vmm-sys-util", +] + +[[package]] +name = "derive-new" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3418329ca0ad70234b9735dc4ceed10af4df60eff9c8e7b06cb5e520d92c3535" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "digest" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" +dependencies = [ + "generic-array", +] + +[[package]] +name = "digest" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2fb860ca6fafa5552fb6d0e816a69c8e49f0908bf524e30a90d97c85892d506" +dependencies = [ + "block-buffer 0.10.2", + "crypto-common", + "subtle", +] + +[[package]] +name = "diskarbitration-sys" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f82432ae94d42f160b6e17389d6e1c1eee29827b99ad32d35a0a96bb98bedb5" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "dragonball" +version = "0.1.0" +dependencies = [ + "arc-swap 1.5.0", + 
"bytes 1.1.0", + "dbs-address-space", + "dbs-allocator", + "dbs-arch", + "dbs-boot", + "dbs-device", + "dbs-interrupt", + "dbs-legacy-devices", + "dbs-utils", + "dbs-virtio-devices", + "kvm-bindings", + "kvm-ioctls", + "lazy_static", + "libc", + "linux-loader", + "log", + "nix 0.23.1", + "seccompiler", + "serde", + "serde_derive", + "serde_json", + "slog", + "slog-scope", + "thiserror", + "virtio-queue", + "vm-memory", + "vmm-sys-util", +] + +[[package]] +name = "either" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" + +[[package]] +name = "enum-iterator" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2953d1df47ac0eb70086ccabf0275aa8da8591a28bd358ee2b52bd9f9e3ff9e9" +dependencies = [ + "enum-iterator-derive", +] + +[[package]] +name = "enum-iterator-derive" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8958699f9359f0b04e691a13850d48b7de329138023876d07cbd024c2c820598" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "epoll" +version = "4.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20df693c700404f7e19d4d6fae6b15215d2913c27955d2b9d6f2c0f537511cd0" +dependencies = [ + "bitflags", + "libc", +] + +[[package]] +name = "errno" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +dependencies = [ + "errno-dragonfly", + "libc", + "winapi", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "event-manager" +version = "0.2.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "377fa591135fbe23396a18e2655a6d5481bf7c5823cdfa3cc81b01a229cbe640" +dependencies = [ + "libc", + "vmm-sys-util", +] + +[[package]] +name = "fail" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec3245a0ca564e7f3c797d20d833a6870f57a728ac967d5225b3ffdef4465011" +dependencies = [ + "lazy_static", + "log", + "rand 0.8.5", +] + +[[package]] +name = "fastrand" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3fcf0cee53519c866c09b5de1f6c56ff9d647101f81c1964fa632e148896cdf" +dependencies = [ + "instant", +] + +[[package]] +name = "fixedbitset" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37ab347416e802de484e4d03c7316c48f1ecb56574dfd4a46a80f173ce1de04d" + +[[package]] +name = "flate2" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f82b0f4c27ad9f8bfd1f3208d882da2b09c301bc1c828fd3a00d0216d2fbbff6" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "form_urlencoded" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fc25a87fa4fd2094bffb06925852034d90a17f0d1e05197d4956d3555752191" +dependencies = [ + "matches", + "percent-encoding", +] + +[[package]] +name = "fuchsia-cprng" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" + +[[package]] +name = "fuse-backend-rs" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a96ec48cd39ee2504eaa4a31b88262b7d13151a4da0b53af8fd212c7c9ffa5d" +dependencies 
= [ + "arc-swap 1.5.0", + "bitflags", + "caps", + "core-foundation-sys", + "diskarbitration-sys", + "lazy_static", + "libc", + "log", + "mio", + "nix 0.23.1", + "virtio-queue", + "vm-memory", + "vmm-sys-util", +] + +[[package]] +name = "futures" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a471a38ef8ed83cd6e40aa59c1ffe17db6855c18e3604d9c4ed8c08ebc28678" + +[[package]] +name = "futures" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f73fe65f54d1e12b726f517d3e2135ca3125a437b6d998caf1962961f7172d9e" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3083ce4b914124575708913bca19bfe887522d6e2e6d0952943f5eac4a74010" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3" + +[[package]] +name = "futures-executor" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9420b90cfa29e327d0429f19be13e7ddb68fa1cccb09d65e5706b8c7a749b8a6" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc4045962a5a5e935ee2fdedaa4e08284547402885ab326734432bed5d12966b" + +[[package]] +name = "futures-macro" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33c1e13800337f4d4d7a316bf45a567dbcb6ffe087f16424852d97e97a91f512" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = 
"futures-sink" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21163e139fa306126e6eedaf49ecdb4588f939600f0b1e770f4205ee4b7fa868" + +[[package]] +name = "futures-task" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c66a976bf5909d801bbef33416c41372779507e7a6b3a5e25e4749c58f776a" + +[[package]] +name = "futures-timer" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" + +[[package]] +name = "futures-util" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "generic-array" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd48d33ec7f05fbfa152300fdad764757cbded343c1aa1cff2fbaf4134851803" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + +[[package]] +name = "getrandom" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9be70c98951c83b8d2f8f60d7065fa6d5146873094452a1008da8c2f1e4205ad" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "wasi 0.10.2+wasi-snapshot-preview1", +] + +[[package]] +name = "getset" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e45727250e75cc04ff2846a66397da8ef2b3db8e40e0cef4df67950a07621eb9" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "gimli" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78cc372d058dcf6d5ecd98510e7fbc9e5aec4d21de70f65fea8fecebcd881bd4" + +[[package]] +name = "git2" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3826a6e0e2215d7a41c2bfc7c9244123969273f3476b939a226aac0ab56e9e3c" +dependencies = [ + "bitflags", + "libc", + "libgit2-sys", + "log", + "url", +] + +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + +[[package]] +name = "go-flag" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b4a40c9ca507513f573aabaf6a8558173a1ac9aa1363d8de30c7f89b34f8d2b" +dependencies = [ + "cfg-if 0.1.10", +] + +[[package]] +name = "governor" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19775995ee20209163239355bc3ad2f33f83da35d9ef72dea26e5af753552c87" +dependencies = [ + "dashmap", + "futures 0.3.21", + "futures-timer", + "no-std-compat", + "nonzero_ext", + "parking_lot 0.12.1", + "quanta", + "rand 0.8.5", + "smallvec", +] + +[[package]] +name = "hashbrown" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" + +[[package]] +name = "heck" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "heck" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "httpdate" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421" + +[[package]] +name = "hypervisor" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "dbs-utils", + "dragonball", + "go-flag", + "kata-sys-util", + "kata-types", + "libc", + "logging", + "nix 0.24.1", + "seccompiler", + "serde_json", + "slog", + "slog-scope", + "thiserror", + "tokio", + "vmm-sys-util", +] + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "idna" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8" +dependencies = [ + "matches", + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "indexmap" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6012d540c5baa3589337a98ce73408de9b5a25ec9fc2c6fd6be8f0d39e0ca5a" +dependencies = [ + "autocfg", + "hashbrown", +] + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "io-lifetimes" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9448015e586b611e5d322f6703812bbca2f1e709d5773ecd38ddb4e3bb649504" + +[[package]] 
+name = "io-uring" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d75829ed9377bab6c90039fe47b9d84caceb4b5063266142e21bcce6550cda8" +dependencies = [ + "bitflags", + "libc", +] + +[[package]] +name = "iovec" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" +dependencies = [ + "libc", +] + +[[package]] +name = "itertools" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9a9d19fa1e79b6215ff29b9d6880b706147f16e9b1dbb1e4e5947b5b02bc5e3" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112c678d4050afce233f4f2852bb2eb519230b3cf12f33585275537d7e41578d" + +[[package]] +name = "jobserver" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af25a77299a7f711a01975c35a6a424eb6862092cc2d6c72c4ed6cbc56dfc1fa" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "671a26f820db17c2a2750743f1dd03bafd15b98c9f30c7c2628c024c05d73397" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "kata-sys-util" +version = "0.1.0" +dependencies = [ + "byteorder", + "cgroups-rs", + "chrono", + "common-path", + "fail", + "kata-types", + "lazy_static", + "libc", + "nix 0.24.1", + "oci", + "once_cell", + "rand 0.7.3", + "serde_json", + "slog", + "slog-scope", + "subprocess", + "thiserror", +] + +[[package]] +name = "kata-types" +version = "0.1.0" +dependencies = [ + "byte-unit", + "glob", + "lazy_static", + "num_cpus", + "oci", + "regex", + "serde", + "serde_json", + "slog", + "slog-scope", + "thiserror", + "toml 0.5.9", +] + +[[package]] +name = "kvm-bindings" +version = "0.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a78c049190826fff959994b7c1d8a2930d0a348f1b8f3aa4f9bb34cd5d7f2952" +dependencies = [ + "vmm-sys-util", +] + +[[package]] +name = "kvm-ioctls" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97422ba48d7ffb66fd4d18130f72ab66f9bbbf791fb7a87b9291cdcfec437593" +dependencies = [ + "kvm-bindings", + "libc", + "vmm-sys-util", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.126" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836" + +[[package]] +name = "libgit2-sys" +version = "0.13.2+1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a42de9a51a5c12e00fc0e4ca6bc2ea43582fc6418488e8f615e905d886f258b" +dependencies = [ + "cc", + "libc", + "libz-sys", + "pkg-config", +] + +[[package]] +name = "libz-sys" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9702761c3935f8cc2f101793272e202c72b99da8f4224a19ddcf1279a6450bbf" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "linux-loader" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a5e77493808403a6bd56a301a64ea6b9342e36ea845044bf0dfdf56fe52fa08" +dependencies = [ + "vm-memory", +] + +[[package]] +name = "linux-raw-sys" +version = "0.0.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4d2456c373231a208ad294c33dc5bff30051eafd954cd4caae83a712b12854d" + +[[package]] +name = "linux_container" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "common", + "kata-types", + "tokio", +] + +[[package]] +name = 
"lock_api" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "327fa5b6a6940e4699ec49a9beae1ea4845c6bab9314e4f84ac68742139d8c53" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "logging" +version = "0.1.0" +dependencies = [ + "serde_json", + "slog", + "slog-async", + "slog-json", + "slog-scope", +] + +[[package]] +name = "lz4-sys" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7be8908e2ed6f31c02db8a9fa962f03e36c53fbfde437363eae3306b85d7e17" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "mach" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b823e83b2affd8f40a9ee8c29dbc56404c1e34cd2710921f2801e2cf29527afa" +dependencies = [ + "libc", +] + +[[package]] +name = "matches" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f" + +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "memoffset" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" +dependencies = [ + "autocfg", +] + +[[package]] +name = "miniz_oxide" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f5c75688da582b8ffc1f1799e9db273f32133c49e048f614d22ec3256773ccc" +dependencies = [ + "adler", +] + +[[package]] +name = "mio" +version = "0.8.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "713d550d9b44d89174e066b7a6217ae06234c10cb47819a88290d2b353c31799" +dependencies = [ + "libc", + "log", + "wasi 0.11.0+wasi-snapshot-preview1", + "windows-sys", +] + +[[package]] +name = "multimap" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" + +[[package]] +name = "netlink-packet-core" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "345b8ab5bd4e71a2986663e88c56856699d060e78e152e6e9d7966fcd5491297" +dependencies = [ + "anyhow", + "byteorder", + "libc", + "netlink-packet-utils", +] + +[[package]] +name = "netlink-packet-route" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9ea4302b9759a7a88242299225ea3688e63c85ea136371bb6cf94fd674efaab" +dependencies = [ + "anyhow", + "bitflags", + "byteorder", + "libc", + "netlink-packet-core", + "netlink-packet-utils", +] + +[[package]] +name = "netlink-packet-utils" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25af9cf0dc55498b7bd94a1508af7a78706aa0ab715a73c5169273e03c84845e" +dependencies = [ + "anyhow", + "byteorder", + "paste", + "thiserror", +] + +[[package]] +name = "netlink-proto" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65b4b14489ab424703c092062176d52ba55485a89c076b4f9db05092b7223aa6" +dependencies = [ + "bytes 1.1.0", + "futures 0.3.21", + "log", + "netlink-packet-core", + "netlink-sys", + "thiserror", + "tokio", +] + +[[package]] +name = "netlink-sys" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92b654097027250401127914afb37cb1f311df6610a9891ff07a757e94199027" +dependencies = [ + "bytes 1.1.0", + "futures 0.3.21", + "libc", + "log", + "tokio", +] + +[[package]] +name = "nix" 
+version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd0eaf8df8bab402257e0a5c17a254e4cc1f72a93588a1ddfb5d356c801aa7cb" +dependencies = [ + "bitflags", + "cc", + "cfg-if 0.1.10", + "libc", + "void", +] + +[[package]] +name = "nix" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f866317acbd3a240710c63f065ffb1e4fd466259045ccb504130b7f668f35c6" +dependencies = [ + "bitflags", + "cc", + "cfg-if 1.0.0", + "libc", + "memoffset", +] + +[[package]] +name = "nix" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f17df307904acd05aa8e32e97bb20f2a0df1728bbc2d771ae8f9a90463441e9" +dependencies = [ + "bitflags", + "cfg-if 1.0.0", + "libc", + "memoffset", +] + +[[package]] +name = "no-std-compat" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b93853da6d84c2e3c7d730d6473e8817692dd89be387eb01b94d7f108ecb5b8c" + +[[package]] +name = "nonzero_ext" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38bf9645c8b145698bb0b18a4637dcacbc421ea49bef2317e4fd8065a387cf21" + +[[package]] +name = "num-integer" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "num_threads" +version = "0.1.6" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" +dependencies = [ + "libc", +] + +[[package]] +name = "nydus-error" +version = "0.2.0" +source = "git+https://github.com/dragonflyoss/image-service.git?rev=316380792092f73c99f832c4cb44ef4319d6f76b#316380792092f73c99f832c4cb44ef4319d6f76b" +dependencies = [ + "backtrace", + "httpdate", + "libc", + "log", + "serde", + "serde_json", +] + +[[package]] +name = "nydus-utils" +version = "0.1.0" +source = "git+https://github.com/dragonflyoss/image-service.git?rev=316380792092f73c99f832c4cb44ef4319d6f76b#316380792092f73c99f832c4cb44ef4319d6f76b" +dependencies = [ + "blake3", + "flate2", + "fuse-backend-rs", + "lazy_static", + "libc", + "log", + "lz4-sys", + "nydus-error", + "serde", + "serde_json", + "sha2", + "zstd", +] + +[[package]] +name = "object" +version = "0.28.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e42c982f2d955fac81dd7e1d0e1426a7d702acd9c98d19ab01083a6a0328c424" +dependencies = [ + "memchr", +] + +[[package]] +name = "oci" +version = "0.1.0" +dependencies = [ + "libc", + "serde", + "serde_derive", + "serde_json", +] + +[[package]] +name = "once_cell" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7709cef83f0c1f58f666e746a08b21e0085f7440fa6a29cc194d68aac97a4225" + +[[package]] +name = "opaque-debug" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" + +[[package]] +name = "parking_lot" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core 0.8.5", +] + +[[package]] +name = "parking_lot" +version = "0.12.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core 0.9.3", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216" +dependencies = [ + "cfg-if 1.0.0", + "instant", + "libc", + "redox_syscall", + "smallvec", + "winapi", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09a279cbf25cb0757810394fbc1e359949b59e348145c643a939a525692e6929" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "redox_syscall", + "smallvec", + "windows-sys", +] + +[[package]] +name = "paste" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c520e05135d6e763148b6426a837e239041653ba7becd2e538c076c738025fc" + +[[package]] +name = "percent-encoding" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" + +[[package]] +name = "petgraph" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "467d164a6de56270bd7c4d070df81d07beace25012d5103ced4e9ff08d6afdb7" +dependencies = [ + "fixedbitset", + "indexmap", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkg-config" +version = "0.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "1df8c4ec4b0627e53bdf214615ad287367e482558cf84b109250b37464dc03ae" + +[[package]] +name = "ppv-lite86" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro2" +version = "1.0.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c54b25569025b7fc9651de43004ae593a75ad88543b17178aa5e1b9c4f15f56f" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "prost" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de5e2533f59d08fcf364fd374ebda0692a70bd6d7e66ef97f306f45c6c5d8020" +dependencies = [ + "bytes 1.1.0", + "prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "355f634b43cdd80724ee7848f95770e7e70eefa6dcf14fea676216573b8fd603" +dependencies = [ + "bytes 1.1.0", + "heck 0.3.3", + "itertools", + "log", + "multimap", + "petgraph", + "prost", + "prost-types", + "tempfile", + "which", +] + +[[package]] +name = "prost-derive" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "600d2f334aa05acb02a755e217ef1ab6dea4d51b58b7846588b747edec04efba" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + 
"syn", +] + +[[package]] +name = "prost-types" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "603bbd6394701d13f3f25aada59c7de9d35a6a5887cfc156181234a44002771b" +dependencies = [ + "bytes 1.1.0", + "prost", +] + +[[package]] +name = "protobuf" +version = "2.27.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf7e6d18738ecd0902d30d1ad232c9125985a3422929b16c65517b38adc14f96" +dependencies = [ + "serde", + "serde_derive", +] + +[[package]] +name = "protobuf-codegen" +version = "2.27.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aec1632b7c8f2e620343439a7dfd1f3c47b18906c4be58982079911482b5d707" +dependencies = [ + "protobuf", +] + +[[package]] +name = "protobuf-codegen-pure" +version = "2.27.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f8122fdb18e55190c796b088a16bdb70cd7acdcd48f7a8b796b58c62e532cc6" +dependencies = [ + "protobuf", + "protobuf-codegen", +] + +[[package]] +name = "protocols" +version = "0.1.0" +dependencies = [ + "async-trait", + "oci", + "protobuf", + "ttrpc", + "ttrpc-codegen", +] + +[[package]] +name = "quanta" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20afe714292d5e879d8b12740aa223c6a88f118af41870e8b6196e39a02238a8" +dependencies = [ + "crossbeam-utils", + "libc", + "mach", + "once_cell", + "raw-cpuid", + "wasi 0.10.2+wasi-snapshot-preview1", + "web-sys", + "winapi", +] + +[[package]] +name = "quote" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1feb54ed693b93a84e14094943b84b7c4eae204c512b7ccb95ab0c66d278ad1" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rafs" +version = "0.1.0" +source = "git+https://github.com/dragonflyoss/image-service.git?rev=316380792092f73c99f832c4cb44ef4319d6f76b#316380792092f73c99f832c4cb44ef4319d6f76b" +dependencies = [ + "anyhow", + "arc-swap 
0.4.8", + "bitflags", + "blake3", + "flate2", + "fuse-backend-rs", + "futures 0.3.21", + "lazy_static", + "libc", + "log", + "lz4-sys", + "nix 0.23.1", + "nydus-error", + "nydus-utils", + "serde", + "serde_json", + "serde_with", + "sha2", + "spmc", + "storage", + "vm-memory", +] + +[[package]] +name = "rand" +version = "0.3.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64ac302d8f83c0c1974bf758f6b041c6c8ada916fbb44a609158ca8b064cc76c" +dependencies = [ + "libc", + "rand 0.4.6", +] + +[[package]] +name = "rand" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "552840b97013b1a26992c11eac34bdd778e464601a4c2054b5f0bff7c6761293" +dependencies = [ + "fuchsia-cprng", + "libc", + "rand_core 0.3.1", + "rdrand", + "winapi", +] + +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.3", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.3", +] + +[[package]] +name = "rand_core" +version = "0.3.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" +dependencies = [ + "rand_core 0.4.2", +] + +[[package]] +name = "rand_core" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc" + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", +] + +[[package]] +name = "rand_core" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" +dependencies = [ + "getrandom 0.2.6", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", +] + +[[package]] +name = "raw-cpuid" +version = "10.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "738bc47119e3eeccc7e94c4a506901aea5e7b4944ecd0829cbebf4af04ceda12" +dependencies = [ + "bitflags", +] + +[[package]] +name = "rdrand" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" +dependencies = [ + "rand_core 0.3.1", +] + +[[package]] +name = "redox_syscall" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62f25bc4c7e55e0b0b7a1d43fb893f4fa1361d0abe38b9ce4f323c2adfe6ef42" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d83f127d94bdbcda4c8cc2e50f6f84f4b611f69c902699ca385a39c3a75f9ff1" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49b3de9ec5dc0a3417da371aab17d729997c15010e7fd24ff707773a33bddb64" + +[[package]] +name = "remove_dir_all" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" +dependencies = [ + "winapi", +] + +[[package]] +name = "resource" +version = "0.1.0" +dependencies = [ + "actix-rt", + "agent", + "anyhow", + "async-trait", + "bitflags", + "cgroups-rs", + "futures 0.3.21", + "hypervisor", + "kata-sys-util", + "kata-types", + "lazy_static", + "libc", + "logging", + "netlink-packet-route", + "netlink-sys", + "nix 0.24.1", + "oci", + "rand 0.7.3", + "rtnetlink", + "scopeguard", + "slog", + "slog-scope", + "tokio", + "uuid", +] + +[[package]] +name = "rlimit" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "347703a5ae47adf1e693144157be231dde38c72bd485925cae7407ad3e52480b" +dependencies = [ + "libc", +] + +[[package]] +name = "rtnetlink" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "322c53fd76a18698f1c27381d58091de3a043d356aa5bd0d510608b565f469a0" +dependencies = [ + "futures 0.3.21", + "log", + "netlink-packet-route", + "netlink-proto", + "nix 0.24.1", + "thiserror", + "tokio", +] + +[[package]] +name = "runtimes" +version = "0.1.0" +dependencies = [ + "anyhow", + "common", + "kata-types", + "lazy_static", + "linux_container", + "logging", + "oci", + "slog", + "slog-scope", + "tokio", + "virt_container", + "wasm_container", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342" + +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + +[[package]] +name = "rustix" +version = "0.34.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2079c267b8394eb529872c3cf92e181c378b41fea36e68130357b52493701d2e" +dependencies = [ + "bitflags", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys", + "winapi", +] + +[[package]] +name = "rustversion" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2cc38e8fa666e2de3c4aba7edeb5ffc5246c1c2ed0e3d17e560aeeba736b23f" + +[[package]] +name = "ryu" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3f6f92acf49d1b98f7a81226834412ada05458b7364277387724a237f062695" + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "seccompiler" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e01d1292a1131b22ccea49f30bd106f1238b5ddeec1a98d39268dcc31d540e68" +dependencies = [ + "libc", +] + +[[package]] +name = "semver" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8cb243bdfdb5936c8dc3c45762a19d12ab4550cdc753bc247637d4ec35a040fd" + +[[package]] +name = "serde" +version = "1.0.137" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61ea8d54c77f8315140a05f4c7237403bf38b72704d031543aa1d16abbf517d1" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.137" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "1f26faba0c3959972377d3b2d306ee9f71faee9714294e41bb777f83f88578be" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.81" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b7ce2b32a1aed03c558dc61a5cd328f15aff2dbc17daad8fb8af04d2100e15c" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_with" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "678b5a069e50bf00ecd22d0cd8ddf7c236f68581b03db652061ed5eb13a312ff" +dependencies = [ + "serde", + "serde_with_macros", +] + +[[package]] +name = "serde_with_macros" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e182d6ec6f05393cc0e5ed1bf81ad6db3a8feedf8ee515ecdd369809bcce8082" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serial_test" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0bccbcf40c8938196944a3da0e133e031a33f4d6b72db3bda3cc556e361905d" +dependencies = [ + "lazy_static", + "parking_lot 0.11.2", + "serial_test_derive", +] + +[[package]] +name = "serial_test_derive" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2acd6defeddb41eb60bb468f8825d0cfd0c2a76bc03bfd235b6a1dc4f6a1ad5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "service" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "common", + "containerd-shim-protos", + "logging", + "runtimes", + "slog", + "slog-scope", + "tokio", + "ttrpc", +] + +[[package]] +name = "sha2" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa827a14b29ab7f44778d14a88d3cb76e949c45083f7dbfa507d0cb699dc12de" +dependencies = [ + "block-buffer 0.9.0", + "cfg-if 1.0.0", + "cpuid-bool", + "digest 0.9.0", + "opaque-debug", +] + 
+[[package]] +name = "shim" +version = "0.1.0" +dependencies = [ + "anyhow", + "backtrace", + "containerd-shim-protos", + "go-flag", + "kata-sys-util", + "kata-types", + "libc", + "log", + "logging", + "nix 0.24.1", + "oci", + "protobuf", + "rand 0.8.5", + "serial_test", + "service", + "sha2", + "slog", + "slog-async", + "slog-scope", + "slog-stdlog", + "tempfile", + "tests_utils", + "thiserror", + "tokio", + "unix_socket2", + "vergen", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" +dependencies = [ + "libc", +] + +[[package]] +name = "slab" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb703cfe953bccee95685111adeedb76fabe4e97549a58d16f03ea7b9367bb32" + +[[package]] +name = "slog" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8347046d4ebd943127157b94d63abb990fcf729dc4e9978927fdf4ac3c998d06" + +[[package]] +name = "slog-async" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "766c59b252e62a34651412870ff55d8c4e6d04df19b43eecb2703e417b097ffe" +dependencies = [ + "crossbeam-channel", + "slog", + "take_mut", + "thread_local", +] + +[[package]] +name = "slog-json" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e1e53f61af1e3c8b852eef0a9dee29008f55d6dd63794f3f12cef786cf0f219" +dependencies = [ + "serde", + "serde_json", + "slog", + "time 0.3.9", +] + +[[package]] +name = "slog-scope" +version = "4.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f95a4b4c3274cd2869549da82b57ccc930859bdbf5bcea0424bc5f140b3c786" +dependencies = [ + "arc-swap 1.5.0", + "lazy_static", + "slog", +] + +[[package]] +name = "slog-stdlog" +version = "4.1.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6706b2ace5bbae7291d3f8d2473e2bfab073ccd7d03670946197aec98471fa3e" +dependencies = [ + "log", + "slog", + "slog-scope", +] + +[[package]] +name = "smallvec" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" + +[[package]] +name = "socket2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66d72b759436ae32898a2af0a14218dbf55efde3feeb170eb623637db85ee1e0" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "spmc" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02a8428da277a8e3a15271d79943e80ccc2ef254e78813a166a08d65e4c3ece5" + +[[package]] +name = "storage" +version = "0.5.0" +source = "git+https://github.com/dragonflyoss/image-service.git?rev=316380792092f73c99f832c4cb44ef4319d6f76b#316380792092f73c99f832c4cb44ef4319d6f76b" +dependencies = [ + "anyhow", + "arc-swap 0.4.8", + "bitflags", + "dbs-uhttp", + "fuse-backend-rs", + "futures 0.3.21", + "governor", + "lazy_static", + "libc", + "log", + "nix 0.23.1", + "nydus-error", + "nydus-utils", + "serde", + "serde_json", + "serde_with", + "sha2", + "spmc", + "tokio", + "vm-memory", + "vmm-sys-util", +] + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "strum" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e96acfc1b70604b8b2f1ffa4c57e59176c7dbb05d556c71ecd2f5498a1dee7f8" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6878079b17446e4d3eba6192bb0a2950d5b14f0ed8424b852310e5a94345d0ef" 
+dependencies = [ + "heck 0.4.0", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + +[[package]] +name = "subprocess" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c2e86926081dda636c546d8c5e641661049d7562a68f5488be4a1f7f66f6086" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "subtle" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" + +[[package]] +name = "syn" +version = "1.0.96" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0748dd251e24453cb8717f0354206b91557e4ec8703673a4b30208f2abaf1ebf" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "take_mut" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60" + +[[package]] +name = "tempfile" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4" +dependencies = [ + "cfg-if 1.0.0", + "fastrand", + "libc", + "redox_syscall", + "remove_dir_all", + "winapi", +] + +[[package]] +name = "tests_utils" +version = "0.1.0" +dependencies = [ + "rand 0.8.5", +] + +[[package]] +name = "thiserror" +version = "1.0.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd829fe32373d27f76265620b5309d0340cb8550f523c1dda251d6298069069a" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0396bc89e626244658bef819e22d0cc459e795a5ebe878e6ec336d1674a8d79a" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thread_local" +version = "1.1.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "5516c27b78311c50bf42c071425c560ac799b11c30b31f87e3081965fe5e0180" +dependencies = [ + "once_cell", +] + +[[package]] +name = "threadpool" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa" +dependencies = [ + "num_cpus", +] + +[[package]] +name = "time" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca8a50ef2360fbd1eeb0ecd46795a87a19024eb4b53c5dc916ca1fd95fe62438" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "time" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2702e08a7a860f005826c6815dcac101b19b5eb330c27fe4a5928fec1d20ddd" +dependencies = [ + "itoa", + "libc", + "num_threads", +] + +[[package]] +name = "timerfd" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29f85a7c965b8e7136952f59f2a359694c78f105b2d2ff99cf6c2c404bf7e33f" +dependencies = [ + "rustix", +] + +[[package]] +name = "tinyvec" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" + +[[package]] +name = "tokio" +version = "1.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95eec79ea28c00a365f539f1961e9278fbcaf81c0ff6aaf0e93c181352446948" +dependencies = [ + "bytes 1.1.0", + "libc", + "memchr", + "mio", + "num_cpus", + "once_cell", + "parking_lot 0.12.1", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "winapi", +] + +[[package]] +name = 
"tokio-macros" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9724f9a975fb987ef7a3cd9be0350edcbe130698af5b8f7a631e23d42d052484" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-vsock" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e0723fc001950a3b018947b05eeb45014fd2b7c6e8f292502193ab74486bdb6" +dependencies = [ + "bytes 0.4.12", + "futures 0.3.21", + "libc", + "tokio", + "vsock", +] + +[[package]] +name = "toml" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "758664fc71a3a69038656bee8b6be6477d2a6c315a6b81f7081f591bffa4111f" +dependencies = [ + "serde", +] + +[[package]] +name = "toml" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d82e1a7758622a465f8cee077614c73484dac5b836c02ff6a40d5d1010324d7" +dependencies = [ + "serde", +] + +[[package]] +name = "ttrpc" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ecfff459a859c6ba6668ff72b34c2f1d94d9d58f7088414c2674ad0f31cc7d8" +dependencies = [ + "async-trait", + "byteorder", + "futures 0.3.21", + "libc", + "log", + "nix 0.23.1", + "protobuf", + "protobuf-codegen-pure", + "thiserror", + "tokio", + "tokio-vsock", +] + +[[package]] +name = "ttrpc-codegen" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809eda4e459820237104e4b61d6b41bbe6c9e1ce6adf4057955e6e6722a90408" +dependencies = [ + "protobuf", + "protobuf-codegen", + "protobuf-codegen-pure", + "ttrpc-compiler", +] + +[[package]] +name = "ttrpc-compiler" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2978ed3fa047d8fd55cbeb4d4a61d461fb3021a90c9618519c73ce7e5bb66c15" +dependencies = [ + "derive-new", + "prost", + "prost-build", + "prost-types", + "protobuf", + "protobuf-codegen", + 
"tempfile", +] + +[[package]] +name = "typenum" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" + +[[package]] +name = "unicode-bidi" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992" + +[[package]] +name = "unicode-ident" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d22af068fba1eb5edcb4aea19d382b2a3deb4c8f9d475c589b6ada9e0fd493ee" + +[[package]] +name = "unicode-normalization" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e8820f5d777f6224dc4be3632222971ac30164d4a258d595640799554ebfd99" + +[[package]] +name = "unix_socket2" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b57c6eace16c00eccb98a28e85db3370eab0685bdd5e13831d59e2bcb49a1d8a" +dependencies = [ + "libc", +] + +[[package]] +name = "url" +version = "2.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c" +dependencies = [ + "form_urlencoded", + "idna", + "matches", + "percent-encoding", +] + +[[package]] +name = "uuid" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cfec50b0842181ba6e713151b72f4ec84a6a7e2c9c8a8a3ffc37bb1cd16b231" +dependencies = [ + "rand 0.3.23", +] + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "vergen" +version = "6.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3893329bee75c101278e0234b646fa72221547d63f97fb66ac112a0569acd110" +dependencies = [ + "anyhow", + "cfg-if 1.0.0", + "chrono", + "enum-iterator", + "getset", + "git2", + "rustc_version", + "rustversion", + "thiserror", +] + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "virt_container" +version = "0.1.0" +dependencies = [ + "agent", + "anyhow", + "async-trait", + "awaitgroup", + "common", + "containerd-shim-protos", + "futures 0.3.21", + "hypervisor", + "kata-sys-util", + "kata-types", + "lazy_static", + "libc", + "logging", + "nix 0.16.1", + "oci", + "protobuf", + "resource", + "serde", + "serde_derive", + "serde_json", + "slog", + "slog-scope", + "tokio", + "toml 0.4.10", + "url", +] + +[[package]] +name = "virtio-bindings" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ff512178285488516ed85f15b5d0113a7cdb89e9e8a760b269ae4f02b84bd6b" + +[[package]] +name = "virtio-queue" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f90da9e627f6aaf667cc7b6548a28be332d3e1f058f4ceeb46ab6bcee5c4b74d" +dependencies = [ + "log", + "vm-memory", + "vmm-sys-util", +] + +[[package]] +name = "vm-fdt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f43fb5a6bd1a7d423ad72802801036719b7546cf847a103f8fe4575f5b0d45a6" + +[[package]] +name = "vm-memory" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "339d4349c126fdcd87e034631d7274370cf19eb0e87b33166bcd956589fc72c5" +dependencies = [ + "arc-swap 1.5.0", + "libc", + "winapi", +] + 
+[[package]] +name = "vm-superio" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4b5231d334edbc03b22704caa1a022e4c07491d6df736593f26094df8b04a51" + +[[package]] +name = "vmm-sys-util" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "733537bded03aaa93543f785ae997727b30d1d9f4a03b7861d23290474242e11" +dependencies = [ + "bitflags", + "libc", +] + +[[package]] +name = "void" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" + +[[package]] +name = "vsock" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e32675ee2b3ce5df274c0ab52d19b28789632406277ca26bffee79a8e27dc133" +dependencies = [ + "libc", + "nix 0.23.1", +] + +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + +[[package]] +name = "wasi" +version = "0.10.2+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27370197c907c55e3f1a9fbe26f44e937fe6451368324e009cba39e139dc08ad" +dependencies = [ + "cfg-if 1.0.0", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"53e04185bfa3a779273da532f5025e33398409573f348985af9a1cbf3774d3f4" +dependencies = [ + "bumpalo", + "lazy_static", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17cae7ff784d7e83a2fe7611cfe766ecf034111b49deb850a3dc7699c08251f5" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99ec0dc7a4756fffc231aab1b9f2f578d23cd391390ab27f952ae0c9b3ece20b" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d554b7f530dee5964d9a9468d95c1f8b8acae4f282807e7d27d4b03099a46744" + +[[package]] +name = "wasm_container" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "common", + "kata-types", + "tokio", +] + +[[package]] +name = "web-sys" +version = "0.3.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b17e741662c70c8bd24ac5c5b18de314a2c26c32bf8346ee1e6f53de919c283" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "which" +version = "4.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c4fb54e6113b6a8772ee41c3404fb0301ac79604489467e0a9ce1f3e97c24ae" +dependencies = [ + "either", + "lazy_static", + "libc", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = 
"0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea04155a16a59f9eab786fe12a4a450e75cdb175f9e0d80da1e17db09f55b8d2" +dependencies = [ + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_msvc" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bb8c3fd39ade2d67e9874ac4f3db21f0d710bee00fe7cab16949ec184eeaa47" + +[[package]] +name = "windows_i686_gnu" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "180e6ccf01daf4c426b846dfc66db1fc518f074baa793aa7d9b9aaeffad6a3b6" + +[[package]] +name = "windows_i686_msvc" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2e7917148b2812d1eeafaeb22a97e4813dfa60a3f8f78ebe204bcc88f12f024" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dcd171b8776c41b97521e5da127a2d86ad280114807d0b2ab1e462bc764d9e1" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680" + +[[package]] +name = "zstd" +version = "0.11.2+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" +dependencies = [ + "zstd-safe", 
+] + +[[package]] +name = "zstd-safe" +version = "5.0.2+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" +dependencies = [ + "libc", + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.1+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fd07cbbc53846d9145dbffdf6dd09a7a0aa52be46741825f5c97bdd4f73f12b" +dependencies = [ + "cc", + "libc", +] + +[[patch.unused]] +name = "dbs-upcall" +version = "0.1.0" +source = "git+https://github.com/openanolis/dragonball-sandbox.git?rev=7a8e832b53d66994d6a16f0513d69f540583dcd0#7a8e832b53d66994d6a16f0513d69f540583dcd0" diff --git a/src/runtime-rs/Cargo.toml b/src/runtime-rs/Cargo.toml new file mode 100644 index 0000000000..470b29a64d --- /dev/null +++ b/src/runtime-rs/Cargo.toml @@ -0,0 +1,14 @@ +[workspace] +members = [ + "crates/shim", +] + +[patch.'crates-io'] +dbs-device = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" } +dbs-utils = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" } +dbs-interrupt = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" } +dbs-legacy-devices = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" } +dbs-virtio-devices = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" } +dbs-boot = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" } +dbs-arch = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = "7a8e832b53d66994d6a16f0513d69f540583dcd0" } +dbs-upcall = { git = "https://github.com/openanolis/dragonball-sandbox.git", rev = 
"7a8e832b53d66994d6a16f0513d69f540583dcd0" } diff --git a/src/runtime-rs/Makefile b/src/runtime-rs/Makefile new file mode 100644 index 0000000000..48c5703541 --- /dev/null +++ b/src/runtime-rs/Makefile @@ -0,0 +1,443 @@ +# Copyright (c) 2019-2022 Alibaba Cloud +# Copyright (c) 2019-2022 Ant Group +# +# SPDX-License-Identifier: Apache-2.0 +# + +# To show variables or targets help on `make help` +# Use the following format: +# '##VAR VARIABLE_NAME: help about variable' +# '##TARGET TARGET_NAME: help about target' +PROJECT_TYPE = kata +PROJECT_NAME = Kata Containers +PROJECT_TAG = kata-containers +PROJECT_URL = https://github.com/kata-containers +PROJECT_COMPONENT = containerd-shim-kata-v2 +CONTAINERD_RUNTIME_NAME = io.containerd.kata.v2 + +include ../../utils.mk + +ARCH_DIR = arch +ARCH_FILE_SUFFIX = -options.mk +ARCH_FILE = $(ARCH_DIR)/$(ARCH)$(ARCH_FILE_SUFFIX) + +ifeq ($(ARCH), s390x) +default: + @echo "s390x not support currently" + exit 0 +test: + @echo "s390x not support currently" + exit 0 +else +##TARGET default: build code +default: runtime show-header +#TARGET test: run cargo tests +test: + @cargo test --all --target $(TRIPLE) $(EXTRA_RUSTFEATURES) -- --nocapture +endif + +ifeq (,$(realpath $(ARCH_FILE))) + $(error "ERROR: invalid architecture: '$(ARCH)'") +else + # Load architecture-dependent settings + include $(ARCH_FILE) +endif + +ifeq ($(PREFIX),) + PREFIX := /usr +endif + +PREFIXDEPS := $(PREFIX) +LIBEXECDIR := $(PREFIXDEPS)/libexec +SHAREDIR := $(PREFIX)/share +DEFAULTSDIR := $(SHAREDIR)/defaults +PROJECT_DIR = $(PROJECT_TAG) +IMAGENAME = $(PROJECT_TAG).img +TARGET = $(PROJECT_COMPONENT) + +CONFIG_FILE = configuration.toml +HYPERVISOR_DB = dragonball + + +DEFAULT_HYPERVISOR ?= $(HYPERVISOR_DB) + +HYPERVISORS := $(HYPERVISOR_DB) + +DBVALIDHYPERVISORPATHS := [] +PKGDATADIR := $(PREFIXDEPS)/share/$(PROJECT_DIR) +KERNELDIR := $(PKGDATADIR) +IMAGEPATH := $(PKGDATADIR)/$(IMAGENAME) +FIRMWAREPATH := +FIRMWAREVOLUMEPATH := + +# Default number of vCPUs 
+DEFVCPUS := 1 +# Default maximum number of vCPUs +DEFMAXVCPUS := 0 +# Default memory size in MiB +DEFMEMSZ := 2048 +# Default memory slots +# Cases to consider : +# - nvdimm rootfs image +# - preallocated memory +# - vm template memory +# - hugepage memory +DEFMEMSLOTS := 10 +#Default number of bridges +DEFBRIDGES := 1 +DEFENABLEANNOTATIONS := [] +DEFDISABLEGUESTSECCOMP := true +DEFDISABLEGUESTEMPTYDIR := false +#Default experimental features enabled +DEFAULTEXPFEATURES := [] +DEFDISABLESELINUX := false +#Default entropy source +DEFENTROPYSOURCE := /dev/urandom +DEFVALIDENTROPYSOURCES := [\"/dev/urandom\",\"/dev/random\",\"\"] +DEFDISABLEBLOCK := false +DEFSHAREDFS_CLH_VIRTIOFS := virtio-fs +DEFSHAREDFS_QEMU_VIRTIOFS := virtio-fs +DEFVIRTIOFSDAEMON := $(LIBEXECDIR)/kata-qemu/virtiofsd +ifeq ($(ARCH),x86_64) +DEFVIRTIOFSDAEMON := $(LIBEXECDIR)/virtiofsd +endif +DEFVALIDVIRTIOFSDAEMONPATHS := [\"$(DEFVIRTIOFSDAEMON)\"] +# Default DAX mapping cache size in MiB +#if value is 0, DAX is not enabled +DEFVIRTIOFSCACHESIZE ?= 0 +DEFVIRTIOFSCACHE ?= auto +# Format example: +# [\"-o\", \"arg1=xxx,arg2\", \"-o\", \"hello world\", \"--arg3=yyy\"] +# +# see `virtiofsd -h` for possible options. +# Make sure you quote args. +DEFVIRTIOFSEXTRAARGS ?= [\"--thread-pool-size=1\", \"-o\", \"announce_submounts\"] +DEFENABLEIOTHREADS := false +DEFENABLEVHOSTUSERSTORE := false +DEFVHOSTUSERSTOREPATH := $(PKGRUNDIR)/vhost-user +DEFVALIDVHOSTUSERSTOREPATHS := [\"$(DEFVHOSTUSERSTOREPATH)\"] +DEFFILEMEMBACKEND := "" +DEFVALIDFILEMEMBACKENDS := [\"$(DEFFILEMEMBACKEND)\"] +DEFMSIZE9P := 8192 +DEFVFIOMODE := guest-kernel +# Default cgroup model +DEFSANDBOXCGROUPONLY ?= false +DEFSTATICRESOURCEMGMT ?= false +DEFBINDMOUNTS := [] +SED = sed +CLI_DIR = cmd +SHIMV2 = containerd-shim-kata-v2 +SHIMV2_OUTPUT = $(CURDIR)/$(SHIMV2) +SHIMV2_DIR = $(CLI_DIR)/$(SHIMV2) +MONITOR = kata-monitor +MONITOR_OUTPUT = $(CURDIR)/$(MONITOR) +MONITOR_DIR = $(CLI_DIR)/kata-monitor +SOURCES := $(shell find . 
2>&1 | grep -E '.*\.(c|h|go)$$') +VERSION := ${shell cat ./VERSION} + +# List of configuration files to build and install +CONFIGS = +CONFIG_PATHS = +SYSCONFIG_PATHS = +# List of hypervisors known for the current architecture +KNOWN_HYPERVISORS = +# List of hypervisors known for the current architecture +KNOWN_HYPERVISORS = + +CONFDIR := $(DEFAULTSDIR)/$(PROJECT_DIR) +SYSCONFDIR := $(SYSCONFDIR)/$(PROJECT_DIR) +# Main configuration file location for stateless systems +CONFIG_PATH := $(abspath $(CONFDIR)/$(CONFIG_FILE)) +# Secondary configuration file location. Note that this takes precedence +# over CONFIG_PATH. +SYSCONFIG := $(abspath $(SYSCONFDIR)/$(CONFIG_FILE)) +SHAREDIR := $(SHAREDIR) + +ifneq (,$(DBCMD)) + KNOWN_HYPERVISORS += $(HYPERVISOR_DB) + CONFIG_FILE_DB = configuration-dragonball.toml + CONFIG_DB = config/$(CONFIG_FILE_DB) + CONFIG_DB_IN = $(CONFIG_DB).in + CONFIG_PATH_DB = $(abspath $(CONFDIR)/$(CONFIG_FILE_DB)) + CONFIG_PATHS += $(CONFIG_PATH_DB) + SYSCONFIG_DB = $(abspath $(SYSCONFDIR)/$(CONFIG_FILE_DB)) + SYSCONFIG_PATHS += $(SYSCONFIG_DB) + CONFIGS += $(CONFIG_DB) + # dragonball-specific options (all should be suffixed by "_dragonball") + DEFMAXVCPUS_DB := 1 + DEFBLOCKSTORAGEDRIVER_DB := virtio-blk + DEFNETWORKMODEL_DB := tcfilter + KERNELPARAMS = console=ttyS1 agent.log_vport=1025 + KERNELTYPE_DB = uncompressed + KERNEL_NAME_DB = $(call MAKE_KERNEL_NAME,$(KERNELTYPE_DB)) + KERNELPATH_DB = $(KERNELDIR)/$(KERNEL_NAME_DB) + DEFSANDBOXCGROUPONLY = true + RUNTIMENAME := virt_container + PIPESIZE := 1 + DBSHAREDFS := inline-virtio-fs +endif + +ifeq ($(DEFAULT_HYPERVISOR),$(HYPERVISOR_DB)) + DEFAULT_HYPERVISOR_CONFIG = $(CONFIG_FILE_DB) +endif +# list of variables the user may wish to override +USER_VARS += ARCH +USER_VARS += BINDIR +USER_VARS += CONFIG_DB_IN +USER_VARS += CONFIG_PATH +USER_VARS += DESTDIR +USER_VARS += DEFAULT_HYPERVISOR +USER_VARS += DBCMD +USER_VARS += DBCTLCMD +USER_VARS += DBPATH +USER_VARS += DBVALIDHYPERVISORPATHS +USER_VARS += 
DBCTLPATH +USER_VARS += DBVALIDCTLPATHS +USER_VARS += SYSCONFIG +USER_VARS += IMAGENAME +USER_VARS += IMAGEPATH +USER_VARS += MACHINETYPE +USER_VARS += KERNELDIR +USER_VARS += KERNELTYPE +USER_VARS += KERNELPATH_DB +USER_VARS += KERNELPATH +USER_VARS += KERNELVIRTIOFSPATH +USER_VARS += FIRMWAREPATH +USER_VARS += FIRMWAREVOLUMEPATH +USER_VARS += MACHINEACCELERATORS +USER_VARS += CPUFEATURES +USER_VARS += DEFMACHINETYPE_CLH +USER_VARS += KERNELPARAMS +USER_VARS += LIBEXECDIR +USER_VARS += LOCALSTATEDIR +USER_VARS += PKGDATADIR +USER_VARS += PKGLIBEXECDIR +USER_VARS += PKGRUNDIR +USER_VARS += PREFIX +USER_VARS += PROJECT_BUG_URL +USER_VARS += PROJECT_NAME +USER_VARS += PROJECT_ORG +USER_VARS += PROJECT_PREFIX +USER_VARS += PROJECT_TAG +USER_VARS += PROJECT_TYPE +USER_VARS += RUNTIME_NAME +USER_VARS += SHAREDIR +USER_VARS += SYSCONFDIR +USER_VARS += DEFVCPUS +USER_VARS += DEFMAXVCPUS +USER_VARS += DEFMAXVCPUS_ACRN +USER_VARS += DEFMAXVCPUS_DB +USER_VARS += DEFMEMSZ +USER_VARS += DEFMEMSLOTS +USER_VARS += DEFBRIDGES +USER_VARS += DEFNETWORKMODEL_DB +USER_VARS += DEFDISABLEGUESTEMPTYDIR +USER_VARS += DEFDISABLEGUESTSECCOMP +USER_VARS += DEFDISABLESELINUX +USER_VARS += DEFAULTEXPFEATURES +USER_VARS += DEFDISABLEBLOCK +USER_VARS += DEFBLOCKSTORAGEDRIVER_DB +USER_VARS += DEFSHAREDFS_CLH_VIRTIOFS +USER_VARS += DEFSHAREDFS_QEMU_VIRTIOFS +USER_VARS += DEFVIRTIOFSDAEMON +USER_VARS += DEFVALIDVIRTIOFSDAEMONPATHS +USER_VARS += DEFVIRTIOFSCACHESIZE +USER_VARS += DEFVIRTIOFSCACHE +USER_VARS += DEFVIRTIOFSEXTRAARGS +USER_VARS += DEFENABLEANNOTATIONS +USER_VARS += DEFENABLEIOTHREADS +USER_VARS += DEFENABLEVHOSTUSERSTORE +USER_VARS += DEFVHOSTUSERSTOREPATH +USER_VARS += DEFVALIDVHOSTUSERSTOREPATHS +USER_VARS += DEFFILEMEMBACKEND +USER_VARS += DEFVALIDFILEMEMBACKENDS +USER_VARS += DEFMSIZE9P +USER_VARS += DEFENTROPYSOURCE +USER_VARS += DEFVALIDENTROPYSOURCES +USER_VARS += DEFSANDBOXCGROUPONLY +USER_VARS += DEFSTATICRESOURCEMGMT +USER_VARS += DEFBINDMOUNTS +USER_VARS += DEFVFIOMODE 
+USER_VARS += BUILDFLAGS +USER_VARS += RUNTIMENAME +USER_VARS += HYPERVISOR_DB +USER_VARS += PIPESIZE +USER_VARS += DBSHAREDFS +USER_VARS += KATA_INSTALL_GROUP +USER_VARS += KATA_INSTALL_OWNER +USER_VARS += KATA_INSTALL_CFG_PERMS + +SOURCES := \ + $(shell find . 2>&1 | grep -E '.*\.rs$$') \ + Cargo.toml + +VERSION_FILE := ./VERSION +VERSION := $(shell grep -v ^\# $(VERSION_FILE)) +COMMIT_NO := $(shell git rev-parse HEAD 2>/dev/null || true) +COMMIT := $(if $(shell git status --porcelain --untracked-files=no 2>/dev/null || true),${COMMIT_NO}-dirty,${COMMIT_NO}) +COMMIT_MSG = $(if $(COMMIT),$(COMMIT),unknown) + +# Exported to allow cargo to see it +export VERSION_COMMIT := $(if $(COMMIT),$(VERSION)-$(COMMIT),$(VERSION)) + +EXTRA_RUSTFEATURES := + +ifneq ($(EXTRA_RUSTFEATURES),) + override EXTRA_RUSTFEATURES := --features $(EXTRA_RUSTFEATURES) +endif + + +TARGET_PATH = target/$(TRIPLE)/$(BUILD_TYPE)/$(TARGET) + +##VAR DESTDIR= is a directory prepended to each installed target file +DESTDIR := +##VAR BINDIR= is a directory for installing executable programs +BINDIR := /usr/local/bin + +GENERATED_CODE = crates/shim/src/config.rs + +RUNTIME_NAME=$(TARGET) +RUNTIME_VERSION=$(VERSION) + +GENERATED_VARS = \ + VERSION \ + CONFIG_DB_IN \ + $(USER_VARS) + + +GENERATED_REPLACEMENTS= \ + PROJECT_NAME \ + RUNTIME_NAME \ + CONTAINERD_RUNTIME_NAME \ + RUNTIME_VERSION \ + BINDIR \ + COMMIT \ + VERSION_COMMIT +GENERATED_FILES := + +GENERATED_FILES += $(GENERATED_CODE) + +# Display name of command and it's version (or a message if not available). 
+# +# Arguments: +# +# 1: Name of command +define get_command_version +$(shell printf "%s: %s\\n" $(1) "$(or $(shell $(1) --version 2>/dev/null), (not available))") +endef + +define get_toolchain_version +$(shell printf "%s: %s\\n" "toolchain" "$(or $(shell rustup show active-toolchain 2>/dev/null), (unknown))") +endef + +define INSTALL_FILE + install -D -m 644 $1 $(DESTDIR)$2/$1 || exit 1; +endef + +# Returns the name of the kernel file to use based on the provided KERNELTYPE. +# $1 : KERNELTYPE (compressed or uncompressed) +define MAKE_KERNEL_NAME +$(if $(findstring uncompressed,$1),vmlinux.container,vmlinuz.container) +endef + +.DEFAULT_GOAL := default + +runtime: $(TARGET) + +$(TARGET): $(GENERATED_CODE) $(TARGET_PATH) + +$(TARGET_PATH): $(SOURCES) | show-summary + @RUSTFLAGS="$(EXTRA_RUSTFLAGS) --deny warnings" cargo build --target $(TRIPLE) --$(BUILD_TYPE) $(EXTRA_RUSTFEATURES) + +GENERATED_FILES += $(CONFIGS) + +$(GENERATED_FILES): %: %.in + @sed \ + $(foreach r,$(GENERATED_REPLACEMENTS),-e 's|@$r@|$($r)|g') \ + $(foreach v,$(GENERATED_VARS),-e "s|@$v@|$($v)|g") \ + $< > $@ + +##TARGET optimize: optimized build +optimize: $(SOURCES) | show-summary show-header + @RUSTFLAGS="-C link-arg=-s $(EXTRA_RUSTFLAGS) --deny warnings" cargo build --target $(TRIPLE) --$(BUILD_TYPE) $(EXTRA_RUSTFEATURES) + +##TARGET clean: clean build +clean: + @cargo clean + @rm -f $(GENERATED_FILES) + @rm -f tarpaulin-report.html + @rm -f $(CONFIGS) + +vendor: + @cargo vendor + +##TARGET check: run test +check: $(GENERATED_FILES) standard_rust_check + +##TARGET run: build and run agent +run: + @cargo run --target $(TRIPLE) + +show-header: + @printf "%s - version %s (commit %s)\n\n" "$(TARGET)" "$(VERSION)" "$(COMMIT_MSG)" + +show-summary: show-header + @printf "project:\n" + @printf " name: $(PROJECT_NAME)\n" + @printf " url: $(PROJECT_URL)\n" + @printf " component: $(PROJECT_COMPONENT)\n" + @printf "target: $(TARGET)\n" + @printf "architecture:\n" + @printf " host: $(ARCH)\n" + @printf 
"rust:\n" + @printf " %s\n" "$(call get_command_version,cargo)" + @printf " %s\n" "$(call get_command_version,rustc)" + @printf " %s\n" "$(call get_command_version,rustup)" + @printf " %s\n" "$(call get_toolchain_version)" + @printf "\n" + +## help: Show help comments that start with `##VAR` and `##TARGET` +help: Makefile show-summary + @echo "========================== Help =============================" + @echo "Variables:" + @sed -n 's/^##VAR//p' $< | sort + @echo "" + @echo "Targets:" + @sed -n 's/^##TARGET//p' $< | sort + +TARPAULIN_ARGS:=-v --workspace +install-tarpaulin: + cargo install cargo-tarpaulin + +# Check if cargo tarpaulin is installed +HAS_TARPAULIN:= $(shell cargo --list | grep tarpaulin 2>/dev/null) +check_tarpaulin: +ifndef HAS_TARPAULIN + $(error "tarpaulin is not available please: run make install-tarpaulin ") +else + $(info OK: tarpaulin installed) +endif + +##TARGET codecov: Generate code coverage report +codecov: check_tarpaulin + cargo tarpaulin $(TARPAULIN_ARGS) + +##TARGET codecov-html: Generate code coverage html report +codecov-html: check_tarpaulin + cargo tarpaulin $(TARPAULIN_ARGS) -o Html + +install: install-runtime install-configs + +install-runtime: runtime + install -D $(TARGET_PATH) $(BINDIR) + +install-configs: $(CONFIGS) + $(foreach f,$(CONFIGS),$(call INSTALL_CONFIG,$f,$(dir $(CONFIG_PATH)))) \ + sudo ln -sf $(DEFAULT_HYPERVISOR_CONFIG) $(DESTDIR)/$(CONFIG_PATH) + +.PHONY: \ + help \ + optimize \ + show-header \ + show-summary \ + vendor diff --git a/src/runtime-rs/VERSION b/src/runtime-rs/VERSION new file mode 120000 index 0000000000..558194c5a5 --- /dev/null +++ b/src/runtime-rs/VERSION @@ -0,0 +1 @@ +../../VERSION \ No newline at end of file diff --git a/src/runtime-rs/arch/aarch64-options.mk b/src/runtime-rs/arch/aarch64-options.mk new file mode 100644 index 0000000000..2e9e5759b7 --- /dev/null +++ b/src/runtime-rs/arch/aarch64-options.mk @@ -0,0 +1,15 @@ +# Copyright (c) 2019-2022 Alibaba Cloud +# Copyright (c) 
2019-2022 Ant Group +# +# SPDX-License-Identifier: Apache-2.0 +# + +MACHINETYPE := +KERNELPARAMS := +MACHINEACCELERATORS := +CPUFEATURES := pmu=off + +QEMUCMD := qemu-system-aarch64 + +# dragonball binary name +DBCMD := dragonball diff --git a/src/runtime-rs/arch/s390x-options.mk b/src/runtime-rs/arch/s390x-options.mk new file mode 100644 index 0000000000..f6381eee22 --- /dev/null +++ b/src/runtime-rs/arch/s390x-options.mk @@ -0,0 +1,15 @@ +# Copyright (c) 2019-2022 Alibaba Cloud +# Copyright (c) 2019-2022 Ant Group +# +# SPDX-License-Identifier: Apache-2.0 +# + +MACHINETYPE := +KERNELPARAMS := +MACHINEACCELERATORS := +CPUFEATURES := pmu=off + +QEMUCMD := qemu-system-s390x + +# dragonball binary name +DBCMD := dragonball diff --git a/src/runtime-rs/arch/x86_64-options.mk b/src/runtime-rs/arch/x86_64-options.mk new file mode 100644 index 0000000000..0e837f0657 --- /dev/null +++ b/src/runtime-rs/arch/x86_64-options.mk @@ -0,0 +1,15 @@ +# Copyright (c) 2019-2022 Alibaba Cloud +# Copyright (c) 2019-2022 Ant Group +# +# SPDX-License-Identifier: Apache-2.0 +# + +MACHINETYPE := q35 +KERNELPARAMS := +MACHINEACCELERATORS := +CPUFEATURES := pmu=off + +QEMUCMD := qemu-system-x86_64 + +# dragonball binary name +DBCMD := dragonball diff --git a/src/runtime-rs/config/configuration-dragonball.toml.in b/src/runtime-rs/config/configuration-dragonball.toml.in new file mode 100644 index 0000000000..bda6a8d3a1 --- /dev/null +++ b/src/runtime-rs/config/configuration-dragonball.toml.in @@ -0,0 +1,249 @@ +# Copyright (c) 2019-2022 Alibaba Cloud +# Copyright (c) 2019-2022 Ant Group +# +# SPDX-License-Identifier: Apache-2.0 +# + +# XXX: WARNING: this file is auto-generated. 
+# XXX: +# XXX: Source file: "@CONFIG_DB_IN@" +# XXX: Project: +# XXX: Name: @PROJECT_NAME@ +# XXX: Type: @PROJECT_TYPE@ + +[hypervisor.dragonball] +path = "@DBPATH@" +ctlpath = "@DBCTLPATH@" +kernel = "@KERNELPATH_DB@" +image = "@IMAGEPATH@" + +# List of valid annotation names for the hypervisor +# Each member of the list is a regular expression, which is the base name +# of the annotation, e.g. "path" for io.katacontainers.config.hypervisor.path" +enable_annotations = @DEFENABLEANNOTATIONS@ + +# List of valid annotations values for the hypervisor +# Each member of the list is a path pattern as described by glob(3). +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: @DBVALIDHYPERVISORPATHS@ +valid_hypervisor_paths = @DBVALIDHYPERVISORPATHS@ + +# List of valid annotations values for ctlpath +# The default if not set is empty (all annotations rejected.) +# Your distribution recommends: +# valid_ctlpaths = + +# Optional space-separated list of options to pass to the guest kernel. +# For example, use `kernel_params = "vsyscall=emulate"` if you are having +# trouble running pre-2.15 glibc. +# +# WARNING: - any parameter specified here will take priority over the default +# parameter value of the same name used to start the virtual machine. +# Do not set values here unless you understand the impact of doing so as you +# may stop the virtual machine from booting. +# To see the list of default parameters, enable hypervisor debug, create a +# container and look for 'default-kernel-parameters' log entries. +kernel_params = "@KERNELPARAMS@" + +# Path to the firmware. 
+# If you want that DB uses the default firmware leave this option empty +firmware = "@FIRMWAREPATH@" + + +# Default number of vCPUs per SB/VM: +# unspecified or 0 --> will be set to 1 +# < 0 --> will be set to the actual number of physical cores +# > 0 <= number of physical cores --> will be set to the specified number +# > number of physical cores --> will be set to the actual number of physical cores +default_vcpus = @DEFVCPUS@ + + +# Default maximum number of vCPUs per SB/VM: +# unspecified or == 0 --> will be set to the actual number of physical cores or to the maximum number +# of vCPUs supported by KVM if that number is exceeded +# > 0 <= number of physical cores --> will be set to the specified number +# > number of physical cores --> will be set to the actual number of physical cores or to the maximum number +# of vCPUs supported by KVM if that number is exceeded +# WARNING: Depending of the architecture, the maximum number of vCPUs supported by KVM is used when +# the actual number of physical cores is greater than it. +# WARNING: Be aware that this value impacts the virtual machine's memory footprint and CPU +# the hotplug functionality. For example, `default_maxvcpus = 240` specifies that until 240 vCPUs +# can be added to a SB/VM, but the memory footprint will be big. Another example, with +# `default_maxvcpus = 8` the memory footprint will be small, but 8 will be the maximum number of +# vCPUs supported by the SB/VM. In general, we recommend that you do not edit this variable, +# unless you know what are you doing. +default_maxvcpus = @DEFMAXVCPUS_DB@ + +# Bridges can be used to hot plug devices. +# Limitations: +# * Currently only pci bridges are supported +# * Until 30 devices per bridge can be hot plugged. +# * Until 5 PCI bridges can be cold plugged per VM. 
+# This limitation could be a bug in the kernel +# Default number of bridges per SB/VM: +# unspecified or 0 --> will be set to @DEFBRIDGES@ +# > 1 <= 5 --> will be set to the specified number +# > 5 --> will be set to 5 +default_bridges = @DEFBRIDGES@ + +# Default memory size in MiB for SB/VM. +# If unspecified then it will be set @DEFMEMSZ@ MiB. +default_memory = @DEFMEMSZ@ + +# Block storage driver to be used for the hypervisor in case the container +# rootfs is backed by a block device. DB only supports virtio-blk. +block_device_driver = "@DEFBLOCKSTORAGEDRIVER_DB@" + +# This option changes the default hypervisor and kernel parameters +# to enable debug output where available. +# +# Default false +#enable_debug = true + +# Disable the customizations done in the runtime when it detects +# that it is running on top a VMM. This will result in the runtime +# behaving as it would when running on bare metal. +# +#disable_nesting_checks = true + +# If host doesn't support vhost_net, set to true. Thus we won't create vhost fds for nics. +# Default false +#disable_vhost_net = true + +# Path to OCI hook binaries in the *guest rootfs*. +# This does not affect host-side hooks which must instead be added to +# the OCI spec passed to the runtime. +# +# You can create a rootfs with hooks by customizing the osbuilder scripts: +# https://github.com/kata-containers/kata-containers/tree/main/tools/osbuilder +# +# Hooks must be stored in a subdirectory of guest_hook_path according to their +# hook type, i.e. "guest_hook_path/{prestart,poststart,poststop}". +# The agent will scan these directories for executable files and add them, in +# lexicographical order, to the lifecycle of the guest container. +# Hooks are executed in the runtime namespace of the guest. 
See the official documentation: +# https://github.com/opencontainers/runtime-spec/blob/v1.0.1/config.md#posix-platform-hooks +# Warnings will be logged if any error is encountered while scanning for hooks, +# but it will not abort container execution. +#guest_hook_path = "/usr/share/oci/hooks" + +# Shared file system type: +# - virtio-fs (default) +# - virtio-9p +# - virtio-fs-nydus +shared_fs = "@DBSHAREDFS@" + +[agent.@PROJECT_TYPE@] +container_pipe_size=@PIPESIZE@ +# If enabled, make the agent display debug-level messages. +# (default: disabled) +#enable_debug = true + +# Enable agent tracing. +# +# If enabled, the agent will generate OpenTelemetry trace spans. +# +# Notes: +# +# - If the runtime also has tracing enabled, the agent spans will be +# associated with the appropriate runtime parent span. +# - If enabled, the runtime will wait for the container to shutdown, +# increasing the container shutdown time slightly. +# +# (default: disabled) +#enable_tracing = true + +# Enable debug console. + +# If enabled, user can connect guest OS running inside hypervisor +# through "kata-runtime exec " command + +#debug_console_enabled = true + +# Agent connection dialing timeout value in seconds +# (default: 30) +#dial_timeout = 30 + +[runtime] +# If enabled, the runtime will log additional debug messages to the +# system log +# (default: disabled) +#enable_debug = true +# +# Internetworking model +# Determines how the VM should be connected to the +# the container network interface +# Options: +# +# - bridged (Deprecated) +# Uses a linux bridge to interconnect the container interface to +# the VM. Works for most cases except macvlan and ipvlan. +# ***NOTE: This feature has been deprecated with plans to remove this +# feature in the future. Please use other network models listed below. +# +# +# - macvtap +# Used when the Container network interface can be bridged using +# macvtap. +# +# - none +# Used when customize network. Only creates a tap device. No veth pair. 
+# +# - tcfilter +# Uses tc filter rules to redirect traffic from the network interface +# provided by plugin to a tap interface connected to the VM. +# +internetworking_model="@DEFNETWORKMODEL_DB@" + +name="@RUNTIMENAME@" +hypervisor_name="@HYPERVISOR_DB@" +agent_name="@PROJECT_TYPE@" + +# disable guest seccomp +# Determines whether container seccomp profiles are passed to the virtual +# machine and applied by the kata agent. If set to true, seccomp is not applied +# within the guest +# (default: true) +disable_guest_seccomp=@DEFDISABLEGUESTSECCOMP@ + +# If enabled, the runtime will create opentracing.io traces and spans. +# (See https://www.jaegertracing.io/docs/getting-started). +# (default: disabled) +#enable_tracing = true + +# Set the full url to the Jaeger HTTP Thrift collector. +# The default if not set will be "http://localhost:14268/api/traces" +#jaeger_endpoint = "" + +# Sets the username to be used if basic auth is required for Jaeger. +#jaeger_user = "" + +# Sets the password to be used if basic auth is required for Jaeger. +#jaeger_password = "" + +# If enabled, the runtime will not create a network namespace for shim and hypervisor processes. +# This option may have some potential impacts to your host. It should only be used when you know what you're doing. +# `disable_new_netns` conflicts with `internetworking_model=bridged` and `internetworking_model=macvtap`. It works only +# with `internetworking_model=none`. The tap device will be in the host network namespace and can connect to a bridge +# (like OVS) directly. +# (default: false) +#disable_new_netns = true + +# if enabled, the runtime will add all the kata processes inside one dedicated cgroup. +# The container cgroups in the host are not created, just one single cgroup per sandbox. +# The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox. +# The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation. 
+# The sandbox cgroup is constrained if there is no container type annotation. +# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType +sandbox_cgroup_only=@DEFSANDBOXCGROUPONLY@ + +# Enabled experimental feature list, format: ["a", "b"]. +# Experimental features are features not stable enough for production, +# they may break compatibility, and are prepared for a big version bump. +# Supported experimental features: +# (default: []) +experimental=@DEFAULTEXPFEATURES@ + +# If enabled, user can run pprof tools with shim v2 process through kata-monitor. +# (default: false) +# enable_pprof = true diff --git a/src/runtime-rs/crates/agent/Cargo.toml b/src/runtime-rs/crates/agent/Cargo.toml new file mode 100644 index 0000000000..c5febe43d7 --- /dev/null +++ b/src/runtime-rs/crates/agent/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "agent" +version = "0.1.0" +authors = ["The Kata Containers community "] +edition = "2018" + +[dev-dependencies] +futures = "0.1.27" + +[dependencies] +anyhow = "1.0.26" +async-trait = "0.1.48" +log = "0.4.14" +protobuf = "2.27.0" +serde = { version = "^1.0", features = ["derive"] } +serde_json = ">=1.0.9" +slog = "2.5.2" +slog-scope = "4.4.0" +ttrpc = { version = "0.6.1" } +tokio = { version = "1.8.0", features = ["fs", "rt"] } +url = "2.2.2" + +kata-types = { path = "../../../libs/kata-types"} +logging = { path = "../../../libs/logging"} +oci = { path = "../../../libs/oci" } +protocols = { path = "../../../libs/protocols", features=["async"] } + +[features] +default = [] diff --git a/src/runtime-rs/crates/agent/src/kata/agent.rs b/src/runtime-rs/crates/agent/src/kata/agent.rs new file mode 100644 index 0000000000..90a812d441 --- /dev/null +++ b/src/runtime-rs/crates/agent/src/kata/agent.rs @@ -0,0 +1,107 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::{Context, Result}; +use 
async_trait::async_trait; +use ttrpc::context as ttrpc_ctx; + +use crate::{kata::KataAgent, Agent, AgentManager, HealthService}; + +/// millisecond to nanosecond +const MILLISECOND_TO_NANOSECOND: i64 = 1_000_000; + +/// new ttrpc context with timeout +fn new_ttrpc_ctx(timeout: i64) -> ttrpc_ctx::Context { + ttrpc_ctx::with_timeout(timeout) +} + +#[async_trait] +impl AgentManager for KataAgent { + async fn start(&self, address: &str) -> Result<()> { + info!(sl!(), "begin to connect agent {:?}", address); + self.set_socket_address(address) + .await + .context("set socket")?; + self.connect_agent_server() + .await + .context("connect agent server")?; + self.start_log_forwarder() + .await + .context("connect log forwarder")?; + Ok(()) + } + + async fn stop(&self) { + self.stop_log_forwarder().await; + } +} + +// implement for health service +macro_rules! impl_health_service { + ($($name: tt | $req: ty | $resp: ty),*) => { + #[async_trait] + impl HealthService for KataAgent { + $(async fn $name(&self, req: $req) -> Result<$resp> { + let r = req.into(); + let (mut client, timeout, _) = self.get_health_client().await.context("get health client")?; + let resp = client.$name(new_ttrpc_ctx(timeout * MILLISECOND_TO_NANOSECOND), &r).await?; + Ok(resp.into()) + })* + } + }; +} + +impl_health_service!( + check | crate::CheckRequest | crate::HealthCheckResponse, + version | crate::CheckRequest | crate::VersionCheckResponse +); + +macro_rules! 
impl_agent { + ($($name: tt | $req: ty | $resp: ty | $new_timeout: expr),*) => { + #[async_trait] + impl Agent for KataAgent { + $(async fn $name(&self, req: $req) -> Result<$resp> { + let r = req.into(); + let (mut client, mut timeout, _) = self.get_agent_client().await.context("get client")?; + + // update new timeout + if let Some(v) = $new_timeout { + timeout = v; + } + + let resp = client.$name(new_ttrpc_ctx(timeout * MILLISECOND_TO_NANOSECOND), &r).await?; + Ok(resp.into()) + })* + } + }; +} + +impl_agent!( + create_container | crate::CreateContainerRequest | crate::Empty | None, + start_container | crate::ContainerID | crate::Empty | None, + remove_container | crate::RemoveContainerRequest | crate::Empty | None, + exec_process | crate::ExecProcessRequest | crate::Empty | None, + signal_process | crate::SignalProcessRequest | crate::Empty | None, + wait_process | crate::WaitProcessRequest | crate::WaitProcessResponse | Some(0), + update_container | crate::UpdateContainerRequest | crate::Empty | None, + stats_container | crate::ContainerID | crate::StatsContainerResponse | None, + pause_container | crate::ContainerID | crate::Empty | None, + resume_container | crate::ContainerID | crate::Empty | None, + write_stdin | crate::WriteStreamRequest | crate::WriteStreamResponse | None, + read_stdout | crate::ReadStreamRequest | crate::ReadStreamResponse | None, + read_stderr | crate::ReadStreamRequest | crate::ReadStreamResponse | None, + close_stdin | crate::CloseStdinRequest | crate::Empty | None, + tty_win_resize | crate::TtyWinResizeRequest | crate::Empty | None, + update_interface | crate::UpdateInterfaceRequest | crate::Interface | None, + update_routes | crate::UpdateRoutesRequest | crate::Routes | None, + add_arp_neighbors | crate::AddArpNeighborRequest | crate::Empty | None, + list_interfaces | crate::Empty | crate::Interfaces | None, + list_routes | crate::Empty | crate::Routes | None, + create_sandbox | crate::CreateSandboxRequest | crate::Empty | None, + 
destroy_sandbox | crate::Empty | crate::Empty | None, + copy_file | crate::CopyFileRequest | crate::Empty | None, + get_oom_event | crate::Empty | crate::OomEventResponse | Some(0) +); diff --git a/src/runtime-rs/crates/agent/src/kata/mod.rs b/src/runtime-rs/crates/agent/src/kata/mod.rs new file mode 100644 index 0000000000..dd7831d35b --- /dev/null +++ b/src/runtime-rs/crates/agent/src/kata/mod.rs @@ -0,0 +1,129 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +mod agent; +mod trans; + +use std::os::unix::io::{IntoRawFd, RawFd}; + +use anyhow::{Context, Result}; +use kata_types::config::Agent as AgentConfig; +use protocols::{agent_ttrpc_async as agent_ttrpc, health_ttrpc_async as health_ttrpc}; +use tokio::sync::Mutex; +use ttrpc::asynchronous::Client; + +use crate::{log_forwarder::LogForwarder, sock}; + +// https://github.com/firecracker-microvm/firecracker/blob/master/docs/vsock.md +#[derive(Debug, Default)] +pub struct Vsock { + pub context_id: u64, + pub port: u32, +} + +pub(crate) struct KataAgentInner { + /// TTRPC client + pub client: Option, + + /// Client fd + pub client_fd: RawFd, + + /// Unix domain socket address + pub socket_address: String, + + /// Agent config + config: AgentConfig, + + /// Log forwarder + log_forwarder: LogForwarder, +} + +unsafe impl Send for KataAgent {} +unsafe impl Sync for KataAgent {} +pub struct KataAgent { + pub(crate) inner: Mutex, +} + +impl KataAgent { + pub fn new(config: AgentConfig) -> Self { + KataAgent { + inner: Mutex::new(KataAgentInner { + client: None, + client_fd: -1, + socket_address: "".to_string(), + config, + log_forwarder: LogForwarder::new(), + }), + } + } + + pub async fn get_health_client(&self) -> Option<(health_ttrpc::HealthClient, i64, RawFd)> { + let inner = self.inner.lock().await; + inner.client.as_ref().map(|c| { + ( + health_ttrpc::HealthClient::new(c.clone()), + inner.config.health_check_request_timeout_ms as 
i64, + inner.client_fd, + ) + }) + } + + pub async fn get_agent_client(&self) -> Option<(agent_ttrpc::AgentServiceClient, i64, RawFd)> { + let inner = self.inner.lock().await; + inner.client.as_ref().map(|c| { + ( + agent_ttrpc::AgentServiceClient::new(c.clone()), + inner.config.request_timeout_ms as i64, + inner.client_fd, + ) + }) + } + + pub(crate) async fn set_socket_address(&self, address: &str) -> Result<()> { + let mut inner = self.inner.lock().await; + inner.socket_address = address.to_string(); + Ok(()) + } + + pub(crate) async fn connect_agent_server(&self) -> Result<()> { + let mut inner = self.inner.lock().await; + + let config = sock::ConnectConfig::new( + inner.config.dial_timeout_ms as u64, + inner.config.reconnect_timeout_ms as u64, + ); + let sock = + sock::new(&inner.socket_address, inner.config.server_port).context("new sock")?; + let stream = sock.connect(&config).await.context("connect")?; + let fd = stream.into_raw_fd(); + info!(sl!(), "get stream raw fd {:?}", fd); + let c = Client::new(fd); + inner.client = Some(c); + inner.client_fd = fd; + Ok(()) + } + + pub(crate) async fn start_log_forwarder(&self) -> Result<()> { + let mut inner = self.inner.lock().await; + let config = sock::ConnectConfig::new( + inner.config.dial_timeout_ms as u64, + inner.config.reconnect_timeout_ms as u64, + ); + let address = inner.socket_address.clone(); + let port = inner.config.log_port; + inner + .log_forwarder + .start(&address, port, config) + .await + .context("start log forwarder")?; + Ok(()) + } + + pub(crate) async fn stop_log_forwarder(&self) { + let mut inner = self.inner.lock().await; + inner.log_forwarder.stop(); + } +} diff --git a/src/runtime-rs/crates/agent/src/kata/trans.rs b/src/runtime-rs/crates/agent/src/kata/trans.rs new file mode 100644 index 0000000000..e7fdaa9448 --- /dev/null +++ b/src/runtime-rs/crates/agent/src/kata/trans.rs @@ -0,0 +1,812 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// 
SPDX-License-Identifier: Apache-2.0 +// + +use std::convert::Into; + +use protocols::{ + agent::{self, OOMEvent}, + empty, health, types, +}; + +use crate::{ + types::{ + ARPNeighbor, ARPNeighbors, AddArpNeighborRequest, AgentDetails, BlkioStats, + BlkioStatsEntry, CgroupStats, CheckRequest, CloseStdinRequest, ContainerID, + CopyFileRequest, CpuStats, CpuUsage, CreateContainerRequest, CreateSandboxRequest, Device, + Empty, ExecProcessRequest, FSGroup, FSGroupChangePolicy, GuestDetailsResponse, + HealthCheckResponse, HugetlbStats, IPAddress, IPFamily, Interface, Interfaces, + KernelModule, MemHotplugByProbeRequest, MemoryData, MemoryStats, NetworkStats, + OnlineCPUMemRequest, PidsStats, ReadStreamRequest, ReadStreamResponse, + RemoveContainerRequest, ReseedRandomDevRequest, Route, Routes, SetGuestDateTimeRequest, + SignalProcessRequest, StatsContainerResponse, Storage, StringUser, ThrottlingData, + TtyWinResizeRequest, UpdateContainerRequest, UpdateInterfaceRequest, UpdateRoutesRequest, + VersionCheckResponse, WaitProcessRequest, WriteStreamRequest, + }, + OomEventResponse, WaitProcessResponse, WriteStreamResponse, +}; + +fn from_vec, T: Sized>(from: Vec) -> ::protobuf::RepeatedField { + let mut to: Vec = vec![]; + for data in from { + to.push(data.into()); + } + ::protobuf::RepeatedField::from_vec(to) +} + +fn into_vec>(from: ::protobuf::RepeatedField) -> Vec { + let mut to: Vec = vec![]; + for data in from.to_vec() { + to.push(data.into()); + } + to +} + +fn from_option>(from: Option) -> ::protobuf::SingularPtrField { + match from { + Some(f) => ::protobuf::SingularPtrField::from_option(Some(T::from(f))), + None => ::protobuf::SingularPtrField::none(), + } +} + +fn into_option, T: Sized>(from: ::protobuf::SingularPtrField) -> Option { + from.into_option().map(|f| f.into()) +} + +fn into_hash_map, T>( + from: std::collections::HashMap, +) -> std::collections::HashMap { + let mut to: std::collections::HashMap = Default::default(); + + for (key, value) in from { + 
to.insert(key, value.into()); + } + + to +} + +impl From for Empty { + fn from(_: empty::Empty) -> Self { + Self {} + } +} + +impl From for agent::FSGroup { + fn from(from: FSGroup) -> Self { + let policy = match from.group_change_policy { + FSGroupChangePolicy::Always => types::FSGroupChangePolicy::Always, + FSGroupChangePolicy::OnRootMismatch => types::FSGroupChangePolicy::OnRootMismatch, + }; + + Self { + group_id: from.group_id, + group_change_policy: policy, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::StringUser { + fn from(from: StringUser) -> Self { + Self { + uid: from.uid, + gid: from.gid, + additionalGids: ::protobuf::RepeatedField::from_vec(from.additional_gids), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::Device { + fn from(from: Device) -> Self { + Self { + id: from.id, + field_type: from.field_type, + vm_path: from.vm_path, + container_path: from.container_path, + options: from_vec(from.options), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::Storage { + fn from(from: Storage) -> Self { + Self { + driver: from.driver, + driver_options: from_vec(from.driver_options), + source: from.source, + fstype: from.fs_type, + fs_group: from_option(from.fs_group), + options: from_vec(from.options), + mount_point: from.mount_point, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::KernelModule { + fn from(from: KernelModule) -> Self { + Self { + name: from.name, + parameters: from_vec(from.parameters), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for types::IPFamily { + fn from(from: IPFamily) -> Self { + if from == IPFamily::V4 { + types::IPFamily::v4 + } else { + types::IPFamily::v6 + } + } +} + +impl From for IPFamily { + fn from(src: types::IPFamily) -> Self { + match 
src { + types::IPFamily::v4 => IPFamily::V4, + types::IPFamily::v6 => IPFamily::V6, + } + } +} + +impl From for types::IPAddress { + fn from(from: IPAddress) -> Self { + Self { + family: from.family.into(), + address: from.address, + mask: from.mask, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for IPAddress { + fn from(src: types::IPAddress) -> Self { + Self { + family: src.family.into(), + address: "".to_string(), + mask: "".to_string(), + } + } +} + +impl From for types::Interface { + fn from(from: Interface) -> Self { + Self { + device: from.device, + name: from.name, + IPAddresses: from_vec(from.ip_addresses), + mtu: from.mtu, + hwAddr: from.hw_addr, + pciPath: from.pci_addr, + field_type: from.field_type, + raw_flags: from.raw_flags, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for Interface { + fn from(src: types::Interface) -> Self { + Self { + device: src.device, + name: src.name, + ip_addresses: into_vec(src.IPAddresses), + mtu: src.mtu, + hw_addr: src.hwAddr, + pci_addr: src.pciPath, + field_type: src.field_type, + raw_flags: src.raw_flags, + } + } +} + +impl From for Interfaces { + fn from(src: agent::Interfaces) -> Self { + Self { + interfaces: into_vec(src.Interfaces), + } + } +} + +impl From for types::Route { + fn from(from: Route) -> Self { + Self { + dest: from.dest, + gateway: from.gateway, + device: from.device, + source: from.source, + scope: from.scope, + family: from.family.into(), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for Route { + fn from(src: types::Route) -> Self { + Self { + dest: src.dest, + gateway: src.gateway, + device: src.device, + source: src.source, + scope: src.scope, + family: src.family.into(), + } + } +} + +impl From for agent::Routes { + fn from(from: Routes) -> Self { + Self { + Routes: from_vec(from.routes), + unknown_fields: Default::default(), + cached_size: 
Default::default(), + } + } +} + +impl From for Routes { + fn from(src: agent::Routes) -> Self { + Self { + routes: into_vec(src.Routes), + } + } +} + +impl From for agent::CreateContainerRequest { + fn from(from: CreateContainerRequest) -> Self { + Self { + container_id: from.process_id.container_id(), + exec_id: from.process_id.exec_id(), + string_user: from_option(from.string_user), + devices: from_vec(from.devices), + storages: from_vec(from.storages), + OCI: from_option(from.oci), + sandbox_pidns: from.sandbox_pidns, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::RemoveContainerRequest { + fn from(from: RemoveContainerRequest) -> Self { + Self { + container_id: from.container_id, + timeout: from.timeout, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::StartContainerRequest { + fn from(from: ContainerID) -> Self { + Self { + container_id: from.container_id, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::StatsContainerRequest { + fn from(from: ContainerID) -> Self { + Self { + container_id: from.container_id, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::PauseContainerRequest { + fn from(from: ContainerID) -> Self { + Self { + container_id: from.container_id, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::ResumeContainerRequest { + fn from(from: ContainerID) -> Self { + Self { + container_id: from.container_id, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::SignalProcessRequest { + fn from(from: SignalProcessRequest) -> Self { + Self { + container_id: from.process_id.container_id(), + exec_id: from.process_id.exec_id(), + signal: from.signal, + unknown_fields: Default::default(), + cached_size: 
Default::default(), + } + } +} + +impl From for agent::WaitProcessRequest { + fn from(from: WaitProcessRequest) -> Self { + Self { + container_id: from.process_id.container_id(), + exec_id: from.process_id.exec_id(), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::UpdateContainerRequest { + fn from(from: UpdateContainerRequest) -> Self { + Self { + container_id: from.container_id, + resources: from_option(Some(from.resources)), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::WriteStreamRequest { + fn from(from: WriteStreamRequest) -> Self { + Self { + container_id: from.process_id.container_id(), + exec_id: from.process_id.exec_id(), + data: from.data, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for WriteStreamResponse { + fn from(from: agent::WriteStreamResponse) -> Self { + Self { length: from.len } + } +} + +impl From for agent::ExecProcessRequest { + fn from(from: ExecProcessRequest) -> Self { + Self { + container_id: from.process_id.container_id(), + exec_id: from.process_id.exec_id(), + string_user: from_option(from.string_user), + process: from_option(from.process), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for CpuUsage { + fn from(src: agent::CpuUsage) -> Self { + Self { + total_usage: src.total_usage, + percpu_usage: src.percpu_usage, + usage_in_kernelmode: src.usage_in_kernelmode, + usage_in_usermode: src.usage_in_usermode, + } + } +} + +impl From for ThrottlingData { + fn from(src: agent::ThrottlingData) -> Self { + Self { + periods: src.periods, + throttled_periods: src.throttled_periods, + throttled_time: src.throttled_time, + } + } +} + +impl From for CpuStats { + fn from(src: agent::CpuStats) -> Self { + Self { + cpu_usage: into_option(src.cpu_usage), + throttling_data: into_option(src.throttling_data), + } + } +} + +impl From 
for MemoryData { + fn from(src: agent::MemoryData) -> Self { + Self { + usage: src.usage, + max_usage: src.max_usage, + failcnt: src.failcnt, + limit: src.limit, + } + } +} + +impl From for MemoryStats { + fn from(src: agent::MemoryStats) -> Self { + Self { + cache: src.cache, + usage: into_option(src.usage), + swap_usage: into_option(src.swap_usage), + kernel_usage: into_option(src.kernel_usage), + use_hierarchy: src.use_hierarchy, + stats: into_hash_map(src.stats), + } + } +} + +impl From for PidsStats { + fn from(src: agent::PidsStats) -> Self { + Self { + current: src.current, + limit: src.limit, + } + } +} + +impl From for BlkioStatsEntry { + fn from(src: agent::BlkioStatsEntry) -> Self { + Self { + major: src.major, + minor: src.minor, + op: src.op, + value: src.value, + } + } +} + +impl From for BlkioStats { + fn from(src: agent::BlkioStats) -> Self { + Self { + io_service_bytes_recursive: into_vec(src.io_service_bytes_recursive), + io_serviced_recursive: into_vec(src.io_serviced_recursive), + io_queued_recursive: into_vec(src.io_queued_recursive), + io_service_time_recursive: into_vec(src.io_service_time_recursive), + io_wait_time_recursive: into_vec(src.io_wait_time_recursive), + io_merged_recursive: into_vec(src.io_merged_recursive), + io_time_recursive: into_vec(src.io_time_recursive), + sectors_recursive: into_vec(src.sectors_recursive), + } + } +} + +impl From for HugetlbStats { + fn from(src: agent::HugetlbStats) -> Self { + Self { + usage: src.usage, + max_usage: src.max_usage, + failcnt: src.failcnt, + } + } +} + +impl From for CgroupStats { + fn from(src: agent::CgroupStats) -> Self { + Self { + cpu_stats: into_option(src.cpu_stats), + memory_stats: into_option(src.memory_stats), + pids_stats: into_option(src.pids_stats), + blkio_stats: into_option(src.blkio_stats), + hugetlb_stats: into_hash_map(src.hugetlb_stats), + } + } +} + +impl From for NetworkStats { + fn from(src: agent::NetworkStats) -> Self { + Self { + name: src.name, + rx_bytes: 
src.rx_bytes, + rx_packets: src.rx_packets, + rx_errors: src.rx_errors, + rx_dropped: src.rx_dropped, + tx_bytes: src.tx_bytes, + tx_packets: src.tx_packets, + tx_errors: src.tx_errors, + tx_dropped: src.tx_dropped, + } + } +} + +// translate ttrpc::agent response to interface::agent response +impl From for StatsContainerResponse { + fn from(src: agent::StatsContainerResponse) -> Self { + Self { + cgroup_stats: into_option(src.cgroup_stats), + network_stats: into_vec(src.network_stats), + } + } +} + +impl From for agent::ReadStreamRequest { + fn from(from: ReadStreamRequest) -> Self { + Self { + container_id: from.process_id.container_id(), + exec_id: from.process_id.exec_id(), + len: from.len, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for ReadStreamResponse { + fn from(from: agent::ReadStreamResponse) -> Self { + Self { data: from.data } + } +} + +impl From for agent::CloseStdinRequest { + fn from(from: CloseStdinRequest) -> Self { + Self { + container_id: from.process_id.container_id(), + exec_id: from.process_id.exec_id(), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::TtyWinResizeRequest { + fn from(from: TtyWinResizeRequest) -> Self { + Self { + container_id: from.process_id.container_id(), + exec_id: from.process_id.exec_id(), + row: from.row, + column: from.column, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::UpdateInterfaceRequest { + fn from(from: UpdateInterfaceRequest) -> Self { + Self { + interface: from_option(from.interface), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::ListInterfacesRequest { + fn from(_: Empty) -> Self { + Self { + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::UpdateRoutesRequest { + fn from(from: UpdateRoutesRequest) -> Self { + Self { 
+ routes: from_option(from.route), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::ListRoutesRequest { + fn from(_: Empty) -> Self { + Self { + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for types::ARPNeighbor { + fn from(from: ARPNeighbor) -> Self { + Self { + toIPAddress: from_option(from.to_ip_address), + device: from.device, + lladdr: from.ll_addr, + state: from.state, + flags: from.flags, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::ARPNeighbors { + fn from(from: ARPNeighbors) -> Self { + Self { + ARPNeighbors: from_vec(from.neighbors), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::AddARPNeighborsRequest { + fn from(from: AddArpNeighborRequest) -> Self { + Self { + neighbors: from_option(from.neighbors), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::CreateSandboxRequest { + fn from(from: CreateSandboxRequest) -> Self { + Self { + hostname: from.hostname, + dns: from_vec(from.dns), + storages: from_vec(from.storages), + sandbox_pidns: from.sandbox_pidns, + sandbox_id: from.sandbox_id, + guest_hook_path: from.guest_hook_path, + kernel_modules: from_vec(from.kernel_modules), + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::DestroySandboxRequest { + fn from(_: Empty) -> Self { + Self { + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::OnlineCPUMemRequest { + fn from(from: OnlineCPUMemRequest) -> Self { + Self { + wait: from.wait, + nb_cpus: from.nb_cpus, + cpu_only: from.cpu_only, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::ReseedRandomDevRequest { + fn from(from: ReseedRandomDevRequest) -> 
Self { + Self { + data: from.data, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::MemHotplugByProbeRequest { + fn from(from: MemHotplugByProbeRequest) -> Self { + Self { + memHotplugProbeAddr: from.mem_hotplug_probe_addr, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for agent::SetGuestDateTimeRequest { + fn from(from: SetGuestDateTimeRequest) -> Self { + Self { + Sec: from.sec, + Usec: from.usec, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for AgentDetails { + fn from(src: agent::AgentDetails) -> Self { + Self { + version: src.version, + init_daemon: src.init_daemon, + device_handlers: into_vec(src.device_handlers), + storage_handlers: into_vec(src.storage_handlers), + supports_seccomp: src.supports_seccomp, + } + } +} + +impl From for GuestDetailsResponse { + fn from(src: agent::GuestDetailsResponse) -> Self { + Self { + mem_block_size_bytes: src.mem_block_size_bytes, + agent_details: into_option(src.agent_details), + support_mem_hotplug_probe: src.support_mem_hotplug_probe, + } + } +} + +impl From for agent::CopyFileRequest { + fn from(from: CopyFileRequest) -> Self { + Self { + path: from.path, + file_size: from.file_size, + file_mode: from.file_mode, + dir_mode: from.dir_mode, + uid: from.uid, + gid: from.gid, + offset: from.offset, + data: from.data, + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for WaitProcessResponse { + fn from(from: agent::WaitProcessResponse) -> Self { + Self { + status: from.status, + } + } +} + +impl From for agent::GetOOMEventRequest { + fn from(_: Empty) -> Self { + Self { + unknown_fields: Default::default(), + cached_size: Default::default(), + } + } +} + +impl From for health::CheckRequest { + fn from(from: CheckRequest) -> Self { + Self { + service: from.service, + unknown_fields: Default::default(), + cached_size: 
Default::default(), + } + } +} + +impl From for HealthCheckResponse { + fn from(from: health::HealthCheckResponse) -> Self { + Self { + status: from.status as u32, + } + } +} + +impl From for VersionCheckResponse { + fn from(from: health::VersionCheckResponse) -> Self { + Self { + grpc_version: from.grpc_version, + agent_version: from.agent_version, + } + } +} + +impl From for OomEventResponse { + fn from(from: OOMEvent) -> Self { + Self { + container_id: from.container_id, + } + } +} diff --git a/src/runtime-rs/crates/agent/src/lib.rs b/src/runtime-rs/crates/agent/src/lib.rs new file mode 100644 index 0000000000..1e28cc2b8f --- /dev/null +++ b/src/runtime-rs/crates/agent/src/lib.rs @@ -0,0 +1,81 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +#[macro_use] +extern crate slog; + +logging::logger_with_subsystem!(sl, "agent"); + +pub mod kata; +mod log_forwarder; +mod sock; +pub mod types; +pub use types::{ + ARPNeighbor, ARPNeighbors, AddArpNeighborRequest, BlkioStatsEntry, CheckRequest, + CloseStdinRequest, ContainerID, ContainerProcessID, CopyFileRequest, CreateContainerRequest, + CreateSandboxRequest, Empty, ExecProcessRequest, GetGuestDetailsRequest, GuestDetailsResponse, + HealthCheckResponse, IPAddress, IPFamily, Interface, Interfaces, ListProcessesRequest, + MemHotplugByProbeRequest, OnlineCPUMemRequest, OomEventResponse, ReadStreamRequest, + ReadStreamResponse, RemoveContainerRequest, ReseedRandomDevRequest, Route, Routes, + SetGuestDateTimeRequest, SignalProcessRequest, StatsContainerResponse, Storage, + TtyWinResizeRequest, UpdateContainerRequest, UpdateInterfaceRequest, UpdateRoutesRequest, + VersionCheckResponse, WaitProcessRequest, WaitProcessResponse, WriteStreamRequest, + WriteStreamResponse, +}; + +use anyhow::Result; +use async_trait::async_trait; + +#[async_trait] +pub trait AgentManager: Send + Sync { + async fn start(&self, address: &str) -> Result<()>; + async 
fn stop(&self); +} + +#[async_trait] +pub trait HealthService: Send + Sync { + async fn check(&self, req: CheckRequest) -> Result; + async fn version(&self, req: CheckRequest) -> Result; +} + +#[async_trait] +pub trait Agent: AgentManager + HealthService + Send + Sync { + // sandbox + async fn create_sandbox(&self, req: CreateSandboxRequest) -> Result; + async fn destroy_sandbox(&self, req: Empty) -> Result; + + // network + async fn add_arp_neighbors(&self, req: AddArpNeighborRequest) -> Result; + async fn list_interfaces(&self, req: Empty) -> Result; + async fn list_routes(&self, req: Empty) -> Result; + async fn update_interface(&self, req: UpdateInterfaceRequest) -> Result; + async fn update_routes(&self, req: UpdateRoutesRequest) -> Result; + + // container + async fn create_container(&self, req: CreateContainerRequest) -> Result; + async fn pause_container(&self, req: ContainerID) -> Result; + async fn remove_container(&self, req: RemoveContainerRequest) -> Result; + async fn resume_container(&self, req: ContainerID) -> Result; + async fn start_container(&self, req: ContainerID) -> Result; + async fn stats_container(&self, req: ContainerID) -> Result; + async fn update_container(&self, req: UpdateContainerRequest) -> Result; + + // process + async fn exec_process(&self, req: ExecProcessRequest) -> Result; + async fn signal_process(&self, req: SignalProcessRequest) -> Result; + async fn wait_process(&self, req: WaitProcessRequest) -> Result; + + // io and tty + async fn close_stdin(&self, req: CloseStdinRequest) -> Result; + async fn read_stderr(&self, req: ReadStreamRequest) -> Result; + async fn read_stdout(&self, req: ReadStreamRequest) -> Result; + async fn tty_win_resize(&self, req: TtyWinResizeRequest) -> Result; + async fn write_stdin(&self, req: WriteStreamRequest) -> Result; + + // utils + async fn copy_file(&self, req: CopyFileRequest) -> Result; + async fn get_oom_event(&self, req: Empty) -> Result; +} diff --git 
a/src/runtime-rs/crates/agent/src/log_forwarder.rs b/src/runtime-rs/crates/agent/src/log_forwarder.rs new file mode 100644 index 0000000000..73c668f2be --- /dev/null +++ b/src/runtime-rs/crates/agent/src/log_forwarder.rs @@ -0,0 +1,159 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::Result; +use tokio::io::{AsyncBufReadExt, BufReader}; + +use crate::sock; + +// https://github.com/slog-rs/slog/blob/master/src/lib.rs#L2082 +const LOG_LEVEL_TRACE: &str = "TRCE"; +const LOG_LEVEL_DEBUG: &str = "DEBG"; +const LOG_LEVEL_INFO: &str = "INFO"; +const LOG_LEVEL_WARNING: &str = "WARN"; +const LOG_LEVEL_ERROR: &str = "ERRO"; +const LOG_LEVEL_CRITICAL: &str = "CRIT"; + +pub(crate) struct LogForwarder { + task_handler: Option>, +} + +impl LogForwarder { + pub(crate) fn new() -> Self { + Self { task_handler: None } + } + + pub(crate) fn stop(&mut self) { + let task_handler = self.task_handler.take(); + if let Some(handler) = task_handler { + handler.abort(); + info!(sl!(), "abort log forwarder thread"); + } + } + + // start connect kata-agent log vsock and copy data to hypervisor's log stream + pub(crate) async fn start( + &mut self, + address: &str, + port: u32, + config: sock::ConnectConfig, + ) -> Result<()> { + let logger = sl!().clone(); + let address = address.to_string(); + let task_handler = tokio::spawn(async move { + loop { + info!(logger, "try to connect to get agent log"); + let sock = match sock::new(&address, port) { + Ok(sock) => sock, + Err(err) => { + error!( + sl!(), + "failed to new sock for address {:?} port {} error {:?}", + address, + port, + err + ); + return; + } + }; + + match sock.connect(&config).await { + Ok(stream) => { + let stream = BufReader::new(stream); + let mut lines = stream.lines(); + while let Ok(line) = lines.next_line().await { + if let Some(l) = line { + match parse_agent_log_level(&l) { + LOG_LEVEL_TRACE => trace!(sl!(), "{}", l), + 
LOG_LEVEL_DEBUG => debug!(sl!(), "{}", l), + LOG_LEVEL_WARNING => warn!(sl!(), "{}", l), + LOG_LEVEL_ERROR => error!(sl!(), "{}", l), + LOG_LEVEL_CRITICAL => crit!(sl!(), "{}", l), + _ => info!(sl!(), "{}", l), + } + } + } + } + Err(err) => { + warn!(logger, "connect agent vsock failed: {:?}", err); + } + } + } + }); + self.task_handler = Some(task_handler); + Ok(()) + } +} + +pub fn parse_agent_log_level(s: &str) -> &str { + let v: serde_json::Result = serde_json::from_str(s); + match v { + Err(_err) => LOG_LEVEL_INFO, + Ok(val) => { + match &val["level"] { + serde_json::Value::String(s) => match s.as_str() { + LOG_LEVEL_TRACE => LOG_LEVEL_TRACE, + LOG_LEVEL_DEBUG => LOG_LEVEL_DEBUG, + LOG_LEVEL_WARNING => LOG_LEVEL_WARNING, + LOG_LEVEL_ERROR => LOG_LEVEL_ERROR, + LOG_LEVEL_CRITICAL => LOG_LEVEL_CRITICAL, + _ => LOG_LEVEL_INFO, // info or other values will return info, + }, + _ => LOG_LEVEL_INFO, // info or other values will return info, + } + } + } +} + +#[cfg(test)] +mod tests { + use super::parse_agent_log_level; + + #[test] + fn test_parse_agent_log_level() { + let cases = vec![ + // normal cases + ( + r#"{"msg":"child exited unexpectedly","level":"TRCE"}"#, + super::LOG_LEVEL_TRACE, + ), + ( + r#"{"msg":"child exited unexpectedly","level":"DEBG"}"#, + super::LOG_LEVEL_DEBUG, + ), + ( + r#"{"msg":"child exited unexpectedly","level":"INFO"}"#, + super::LOG_LEVEL_INFO, + ), + ( + r#"{"msg":"child exited unexpectedly","level":"WARN"}"#, + super::LOG_LEVEL_WARNING, + ), + ( + r#"{"msg":"child exited unexpectedly","level":"ERRO"}"#, + super::LOG_LEVEL_ERROR, + ), + ( + r#"{"msg":"child exited unexpectedly","level":"CRIT"}"#, + super::LOG_LEVEL_CRITICAL, + ), + ( + r#"{"msg":"child exited unexpectedly","level":"abc"}"#, + super::LOG_LEVEL_INFO, + ), + // exception cases + (r#"{"not a valid json struct"}"#, super::LOG_LEVEL_INFO), + ("not a valid json struct", super::LOG_LEVEL_INFO), + ]; + + for case in cases.iter() { + let s = case.0; + let result = 
parse_agent_log_level(s); + let excepted = case.1; + assert_eq!(result, excepted); + } + } +} diff --git a/src/runtime-rs/crates/agent/src/sock/hybrid_vsock.rs b/src/runtime-rs/crates/agent/src/sock/hybrid_vsock.rs new file mode 100644 index 0000000000..59e93a64d2 --- /dev/null +++ b/src/runtime-rs/crates/agent/src/sock/hybrid_vsock.rs @@ -0,0 +1,81 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::os::unix::prelude::AsRawFd; + +use anyhow::{anyhow, Context, Result}; +use async_trait::async_trait; +use tokio::{ + io::{AsyncBufReadExt, AsyncWriteExt, BufReader}, + net::UnixStream, +}; + +use super::{ConnectConfig, Sock, Stream}; + +unsafe impl Send for HybridVsock {} +unsafe impl Sync for HybridVsock {} + +#[derive(Debug, PartialEq)] +pub struct HybridVsock { + uds: String, + port: u32, +} + +impl HybridVsock { + pub fn new(uds: &str, port: u32) -> Self { + Self { + uds: uds.to_string(), + port, + } + } +} + +#[async_trait] +impl Sock for HybridVsock { + async fn connect(&self, config: &ConnectConfig) -> Result { + let retry_times = config.reconnect_timeout_ms / config.dial_timeout_ms; + for i in 0..retry_times { + match connect_helper(&self.uds, self.port).await { + Ok(stream) => { + info!( + sl!(), + "connect success on {} current client fd {}", + i, + stream.as_raw_fd() + ); + return Ok(Stream::Unix(stream)); + } + Err(err) => { + debug!(sl!(), "connect on {} err : {:?}", i, err); + tokio::time::sleep(std::time::Duration::from_millis(config.dial_timeout_ms)) + .await; + continue; + } + } + } + Err(anyhow!("cannot connect to agent ttrpc server {:?}", config)) + } +} + +async fn connect_helper(uds: &str, port: u32) -> Result { + info!(sl!(), "connect uds {:?} port {}", &uds, port); + let mut stream = UnixStream::connect(&uds).await.context("connect")?; + stream + .write_all(format!("connect {}\n", port).as_bytes()) + .await + .context("write all")?; + let mut reads = 
BufReader::new(&mut stream); + let mut response = String::new(); + reads.read_line(&mut response).await.context("read line")?; + //info!(sl!(), "get socket resp: {}", response); + if !response.contains("OK") { + return Err(anyhow!( + "handshake error: malformed response code: {:?}", + response + )); + } + Ok(stream) +} diff --git a/src/runtime-rs/crates/agent/src/sock/mod.rs b/src/runtime-rs/crates/agent/src/sock/mod.rs new file mode 100644 index 0000000000..371f62cd44 --- /dev/null +++ b/src/runtime-rs/crates/agent/src/sock/mod.rs @@ -0,0 +1,160 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +mod hybrid_vsock; +pub use hybrid_vsock::HybridVsock; +mod vsock; +pub use vsock::Vsock; + +use std::{ + pin::Pin, + task::{Context as TaskContext, Poll}, + { + os::unix::{io::IntoRawFd, prelude::RawFd}, + sync::Arc, + }, +}; + +use anyhow::{anyhow, Context, Result}; +use async_trait::async_trait; +use tokio::{ + io::{AsyncRead, ReadBuf}, + net::UnixStream, +}; +use url::Url; + +const VSOCK_SCHEME: &str = "vsock"; +const HYBRID_VSOCK_SCHEME: &str = "hvsock"; + +/// Socket stream +pub enum Stream { + // hvsock://:. Firecracker/Dragonball implements the virtio-vsock device + // model, and mediates communication between AF_UNIX sockets (on the host end) + // and AF_VSOCK sockets (on the guest end). 
+ Unix(UnixStream), + // TODO: support vsock + // vsock://: +} + +impl Stream { + fn poll_read_priv( + &mut self, + cx: &mut TaskContext<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + // Safety: `UnixStream::read` correctly handles reads into uninitialized memory + match self { + Stream::Unix(stream) => Pin::new(stream).poll_read(cx, buf), + } + } +} + +impl IntoRawFd for Stream { + fn into_raw_fd(self) -> RawFd { + match self { + Stream::Unix(stream) => match stream.into_std() { + Ok(stream) => stream.into_raw_fd(), + Err(err) => { + error!(sl!(), "failed to into std unix stream {:?}", err); + -1 + } + }, + } + } +} + +impl AsyncRead for Stream { + fn poll_read( + self: Pin<&mut Self>, + cx: &mut TaskContext<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + // we know this is safe because doesn't moved + let me = unsafe { self.get_unchecked_mut() }; + me.poll_read_priv(cx, buf) + } +} + +/// Connect config +#[derive(Debug)] +pub struct ConnectConfig { + dial_timeout_ms: u64, + reconnect_timeout_ms: u64, +} + +impl ConnectConfig { + pub fn new(dial_timeout_ms: u64, reconnect_timeout_ms: u64) -> Self { + Self { + dial_timeout_ms, + reconnect_timeout_ms, + } + } +} + +#[derive(Debug, PartialEq)] +enum SockType { + Vsock(Vsock), + HybridVsock(HybridVsock), +} + +#[async_trait] +pub trait Sock: Send + Sync { + async fn connect(&self, config: &ConnectConfig) -> Result; +} + +// Supported sock address formats are: +// - vsock://: +// - hvsock://:. Firecracker implements the virtio-vsock device +// model, and mediates communication between AF_UNIX sockets (on the host end) +// and AF_VSOCK sockets (on the guest end). +pub fn new(address: &str, port: u32) -> Result> { + match parse(address, port).context("parse url")? 
{ + SockType::Vsock(sock) => Ok(Arc::new(sock)), + SockType::HybridVsock(sock) => Ok(Arc::new(sock)), + } +} + +fn parse(address: &str, port: u32) -> Result { + let url = Url::parse(address).context("parse url")?; + match url.scheme() { + VSOCK_SCHEME => { + let vsock_cid = url + .host_str() + .unwrap_or_default() + .parse::() + .context("parse vsock cid")?; + Ok(SockType::Vsock(Vsock::new(vsock_cid, port))) + } + HYBRID_VSOCK_SCHEME => { + let path: Vec<&str> = url.path().split(':').collect(); + if path.len() != 1 { + return Err(anyhow!("invalid path {:?}", path)); + } + let uds = path[0]; + Ok(SockType::HybridVsock(HybridVsock::new(uds, port))) + } + _ => Err(anyhow!("Unsupported scheme")), + } +} + +#[cfg(test)] +mod test { + use super::{hybrid_vsock::HybridVsock, parse, vsock::Vsock, SockType}; + + #[test] + fn test_parse_url() { + // check vsock + let vsock = parse("vsock://123", 456).unwrap(); + assert_eq!(vsock, SockType::Vsock(Vsock::new(123, 456))); + + // check hybrid vsock + let hvsock = parse("hvsock:///tmp/test.hvsock", 456).unwrap(); + assert_eq!( + hvsock, + SockType::HybridVsock(HybridVsock::new("/tmp/test.hvsock", 456)) + ); + } +} diff --git a/src/runtime-rs/crates/agent/src/sock/vsock.rs b/src/runtime-rs/crates/agent/src/sock/vsock.rs new file mode 100644 index 0000000000..9b62bb9766 --- /dev/null +++ b/src/runtime-rs/crates/agent/src/sock/vsock.rs @@ -0,0 +1,32 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::Result; +use async_trait::async_trait; + +use super::{ConnectConfig, Sock, Stream}; + +unsafe impl Send for Vsock {} +unsafe impl Sync for Vsock {} + +#[derive(Debug, PartialEq)] +pub struct Vsock { + vsock_cid: u32, + port: u32, +} + +impl Vsock { + pub fn new(vsock_cid: u32, port: u32) -> Self { + Self { vsock_cid, port } + } +} + +#[async_trait] +impl Sock for Vsock { + async fn connect(&self, _config: &ConnectConfig) -> Result { + todo!() + 
} +} diff --git a/src/runtime-rs/crates/agent/src/types.rs b/src/runtime-rs/crates/agent/src/types.rs new file mode 100644 index 0000000000..49319c2f07 --- /dev/null +++ b/src/runtime-rs/crates/agent/src/types.rs @@ -0,0 +1,488 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use serde::Deserialize; + +#[derive(PartialEq, Clone, Default)] +pub struct Empty {} + +impl Empty { + pub fn new() -> Self { + Self::default() + } +} + +impl Default for FSGroupChangePolicy { + fn default() -> Self { + FSGroupChangePolicy::Always + } +} + +#[derive(Debug, Clone, PartialEq)] +pub enum FSGroupChangePolicy { + Always = 0, + OnRootMismatch = 1, +} + +#[derive(Debug, PartialEq, Clone, Default)] +pub struct FSGroup { + pub group_id: u32, + pub group_change_policy: FSGroupChangePolicy, +} + +#[derive(PartialEq, Clone, Default)] +pub struct StringUser { + pub uid: String, + pub gid: String, + pub additional_gids: Vec, +} + +#[derive(PartialEq, Clone, Default)] +pub struct Device { + pub id: String, + pub field_type: String, + pub vm_path: String, + pub container_path: String, + pub options: Vec, +} + +#[derive(Debug, PartialEq, Clone, Default)] +pub struct Storage { + pub driver: String, + pub driver_options: Vec, + pub source: String, + pub fs_type: String, + pub fs_group: Option, + pub options: Vec, + pub mount_point: String, +} + +#[derive(Deserialize, Clone, PartialEq, Eq, Debug, Hash)] +pub enum IPFamily { + V4 = 0, + V6 = 1, +} + +impl ::std::default::Default for IPFamily { + fn default() -> Self { + IPFamily::V4 + } +} + +#[derive(Deserialize, Debug, PartialEq, Clone, Default)] +pub struct IPAddress { + pub family: IPFamily, + pub address: String, + pub mask: String, +} + +#[derive(Deserialize, Debug, PartialEq, Clone, Default)] +pub struct Interface { + pub device: String, + pub name: String, + pub ip_addresses: Vec, + pub mtu: u64, + pub hw_addr: String, + #[serde(default)] + pub pci_addr: 
String, + #[serde(default)] + pub field_type: String, + #[serde(default)] + pub raw_flags: u32, +} + +#[derive(PartialEq, Clone, Default)] +pub struct Interfaces { + pub interfaces: Vec, +} + +#[derive(Deserialize, Debug, PartialEq, Clone, Default)] +pub struct Route { + pub dest: String, + pub gateway: String, + pub device: String, + pub source: String, + pub scope: u32, + pub family: IPFamily, +} + +#[derive(Deserialize, Debug, PartialEq, Clone, Default)] +pub struct Routes { + pub routes: Vec, +} + +#[derive(PartialEq, Clone, Default)] +pub struct CreateContainerRequest { + pub process_id: ContainerProcessID, + pub string_user: Option, + pub devices: Vec, + pub storages: Vec, + pub oci: Option, + pub guest_hooks: Option, + pub sandbox_pidns: bool, + pub rootfs_mounts: Vec, +} + +#[derive(PartialEq, Clone, Default)] +pub struct ContainerID { + pub container_id: String, +} + +impl ContainerID { + pub fn new(id: &str) -> Self { + Self { + container_id: id.to_string(), + } + } +} + +#[derive(PartialEq, Clone, Default)] +pub struct ContainerProcessID { + pub container_id: ContainerID, + pub exec_id: String, +} + +impl ContainerProcessID { + pub fn new(container_id: &str, exec_id: &str) -> Self { + Self { + container_id: ContainerID::new(container_id), + exec_id: exec_id.to_string(), + } + } + + pub fn container_id(&self) -> String { + self.container_id.container_id.clone() + } + + pub fn exec_id(&self) -> String { + self.exec_id.clone() + } +} + +#[derive(PartialEq, Clone, Debug, Default)] +pub struct RemoveContainerRequest { + pub container_id: String, + pub timeout: u32, +} + +impl RemoveContainerRequest { + pub fn new(id: &str, timeout: u32) -> Self { + Self { + container_id: id.to_string(), + timeout, + } + } +} + +#[derive(PartialEq, Clone, Default)] +pub struct SignalProcessRequest { + pub process_id: ContainerProcessID, + pub signal: u32, +} + +#[derive(PartialEq, Clone, Default)] +pub struct WaitProcessRequest { + pub process_id: ContainerProcessID, +} + 
+#[derive(PartialEq, Clone, Default)] +pub struct ListProcessesRequest { + pub container_id: String, + pub format: String, + pub args: Vec, +} + +#[derive(PartialEq, Clone, Default)] +pub struct UpdateContainerRequest { + pub container_id: String, + pub resources: oci::LinuxResources, + pub mounts: Vec, +} + +#[derive(PartialEq, Clone, Default)] +pub struct WriteStreamRequest { + pub process_id: ContainerProcessID, + pub data: Vec, +} + +#[derive(PartialEq, Clone, Default)] +pub struct WriteStreamResponse { + pub length: u32, +} + +#[derive(PartialEq, Clone, Default)] +pub struct ExecProcessRequest { + pub process_id: ContainerProcessID, + pub string_user: Option, + pub process: Option, +} + +#[derive(PartialEq, Clone, Default, Debug)] +pub struct CpuUsage { + pub total_usage: u64, + pub percpu_usage: ::std::vec::Vec, + pub usage_in_kernelmode: u64, + pub usage_in_usermode: u64, +} + +#[derive(PartialEq, Clone, Default, Debug)] +pub struct ThrottlingData { + pub periods: u64, + pub throttled_periods: u64, + pub throttled_time: u64, +} + +#[derive(PartialEq, Clone, Default, Debug)] +pub struct LoadData { + pub one: String, + pub five: String, + pub fifteen: String, +} + +#[derive(PartialEq, Clone, Default, Debug)] +pub struct CpuStats { + pub cpu_usage: Option, + pub throttling_data: Option, +} + +#[derive(PartialEq, Clone, Default, Debug)] +pub struct MemoryData { + pub usage: u64, + pub max_usage: u64, + pub failcnt: u64, + pub limit: u64, +} + +#[derive(PartialEq, Clone, Default, Debug)] +pub struct MemoryStats { + pub cache: u64, + pub usage: Option, + pub swap_usage: Option, + pub kernel_usage: Option, + pub use_hierarchy: bool, + pub stats: ::std::collections::HashMap, +} + +#[derive(PartialEq, Clone, Default, Debug)] +pub struct PidsStats { + pub current: u64, + pub limit: u64, +} + +#[derive(PartialEq, Clone, Default, Debug)] +pub struct BlkioStatsEntry { + pub major: u64, + pub minor: u64, + pub op: String, + pub value: u64, +} + +#[derive(PartialEq, Clone, 
Default, Debug)] +pub struct BlkioStats { + pub io_service_bytes_recursive: Vec, + pub io_serviced_recursive: Vec, + pub io_queued_recursive: Vec, + pub io_service_time_recursive: Vec, + pub io_wait_time_recursive: Vec, + pub io_merged_recursive: Vec, + pub io_time_recursive: Vec, + pub sectors_recursive: Vec, +} + +#[derive(PartialEq, Clone, Default, Debug)] +pub struct HugetlbStats { + pub usage: u64, + pub max_usage: u64, + pub failcnt: u64, +} + +#[derive(PartialEq, Clone, Default, Debug)] +pub struct CgroupStats { + pub cpu_stats: Option, + pub memory_stats: Option, + pub pids_stats: Option, + pub blkio_stats: Option, + pub hugetlb_stats: ::std::collections::HashMap, +} + +#[derive(PartialEq, Clone, Default, Debug)] +pub struct NetworkStats { + pub name: String, + pub rx_bytes: u64, + pub rx_packets: u64, + pub rx_errors: u64, + pub rx_dropped: u64, + pub tx_bytes: u64, + pub tx_packets: u64, + pub tx_errors: u64, + pub tx_dropped: u64, +} + +#[derive(PartialEq, Clone, Default, Debug)] +pub struct StatsContainerResponse { + pub cgroup_stats: Option, + pub network_stats: Vec, +} + +#[derive(PartialEq, Clone, Default, Debug)] +pub struct WaitProcessResponse { + pub status: i32, +} + +#[derive(PartialEq, Clone, Default)] +pub struct ReadStreamRequest { + pub process_id: ContainerProcessID, + pub len: u32, +} + +#[derive(PartialEq, Clone, Default)] +pub struct ReadStreamResponse { + pub data: Vec, +} + +#[derive(PartialEq, Clone, Default)] +pub struct CloseStdinRequest { + pub process_id: ContainerProcessID, +} + +#[derive(PartialEq, Clone, Default)] +pub struct TtyWinResizeRequest { + pub process_id: ContainerProcessID, + pub row: u32, + pub column: u32, +} + +#[derive(Debug, PartialEq, Clone, Default)] +pub struct UpdateInterfaceRequest { + pub interface: Option, +} + +#[derive(PartialEq, Clone, Default, Debug)] +pub struct UpdateRoutesRequest { + pub route: Option, +} + +#[derive(Deserialize, PartialEq, Clone, Default, Debug)] +pub struct ARPNeighbor { + pub 
to_ip_address: Option, + pub device: String, + pub ll_addr: String, + pub state: i32, + pub flags: i32, +} + +#[derive(PartialEq, Clone, Default, Debug)] +pub struct ARPNeighbors { + pub neighbors: Vec, +} + +#[derive(PartialEq, Clone, Default, Debug)] +pub struct AddArpNeighborRequest { + pub neighbors: Option, +} + +#[derive(PartialEq, Clone, Default)] +pub struct KernelModule { + pub name: String, + pub parameters: Vec, +} + +#[derive(PartialEq, Clone, Default)] +pub struct CreateSandboxRequest { + pub hostname: String, + pub dns: Vec, + pub storages: Vec, + pub sandbox_pidns: bool, + pub sandbox_id: String, + pub guest_hook_path: String, + pub kernel_modules: Vec, +} + +#[derive(PartialEq, Clone, Default)] +pub struct OnlineCPUMemRequest { + pub wait: bool, + pub nb_cpus: u32, + pub cpu_only: bool, +} + +#[derive(PartialEq, Clone, Default)] +pub struct ReseedRandomDevRequest { + pub data: ::std::vec::Vec, +} + +#[derive(PartialEq, Clone, Default)] +pub struct GetGuestDetailsRequest { + pub mem_block_size: bool, + pub mem_hotplug_probe: bool, +} + +#[derive(PartialEq, Clone, Default)] +pub struct MemHotplugByProbeRequest { + pub mem_hotplug_probe_addr: ::std::vec::Vec, +} + +#[derive(PartialEq, Clone, Default)] +pub struct SetGuestDateTimeRequest { + pub sec: i64, + pub usec: i64, +} + +#[derive(PartialEq, Clone, Default)] +pub struct AgentDetails { + pub version: String, + pub init_daemon: bool, + pub device_handlers: Vec, + pub storage_handlers: Vec, + pub supports_seccomp: bool, +} + +#[derive(PartialEq, Clone, Default)] +pub struct GuestDetailsResponse { + pub mem_block_size_bytes: u64, + pub agent_details: Option, + pub support_mem_hotplug_probe: bool, +} + +#[derive(PartialEq, Clone, Default)] +pub struct CopyFileRequest { + pub path: String, + pub file_size: i64, + pub file_mode: u32, + pub dir_mode: u32, + pub uid: i32, + pub gid: i32, + pub offset: i64, + pub data: ::std::vec::Vec, +} + +#[derive(PartialEq, Clone, Default, Debug)] +pub struct 
CheckRequest { + pub service: String, +} + +impl CheckRequest { + pub fn new(service: &str) -> Self { + Self { + service: service.to_string(), + } + } +} + +#[derive(PartialEq, Clone, Default, Debug)] +pub struct HealthCheckResponse { + pub status: u32, +} + +#[derive(PartialEq, Clone, Default, Debug)] +pub struct VersionCheckResponse { + pub grpc_version: String, + pub agent_version: String, +} + +#[derive(PartialEq, Clone, Default, Debug)] +pub struct OomEventResponse { + pub container_id: String, +} diff --git a/src/runtime-rs/crates/hypervisor/Cargo.toml b/src/runtime-rs/crates/hypervisor/Cargo.toml new file mode 100644 index 0000000000..0de423b6bf --- /dev/null +++ b/src/runtime-rs/crates/hypervisor/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "hypervisor" +version = "0.1.0" +authors = ["The Kata Containers community "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow = "^1.0" +async-trait = "0.1.48" +dbs-utils = "0.1.0" +go-flag = "0.1.0" +libc = ">=0.2.39" +nix = "0.24.1" +seccompiler = "0.2.0" +serde_json = ">=1.0.9" +slog = "2.5.2" +slog-scope = "4.4.0" +thiserror = "1.0" +tokio = { version = "1.8.0", features = ["sync"] } +vmm-sys-util = "0.9.0" + +kata-sys-util = { path = "../../../libs/kata-sys-util" } +kata-types = { path = "../../../libs/kata-types" } +logging = { path = "../../../libs/logging" } + +dragonball = { path = "../../../dragonball", features = ["atomic-guest-memory", "virtio-vsock", "hotplug", "virtio-blk", "virtio-net", "virtio-fs"] } + +[features] diff --git a/src/runtime-rs/crates/hypervisor/src/device/block.rs b/src/runtime-rs/crates/hypervisor/src/device/block.rs new file mode 100644 index 0000000000..4f59cc0ea3 --- /dev/null +++ b/src/runtime-rs/crates/hypervisor/src/device/block.rs @@ -0,0 +1,24 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + 
/// A MAC address stored as six raw octets
/// (used as the guest MAC in `NetworkConfig`).
pub struct Address(pub [u8; 6]);

impl fmt::Debug for Address {
    /// Format as colon-separated lowercase hex, e.g. `de:ad:be:ef:00:1f`.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let parts: Vec<String> = self.0.iter().map(|b| format!("{:02x}", b)).collect();
        write!(f, "{}", parts.join(":"))
    }
}
, +} diff --git a/src/runtime-rs/crates/hypervisor/src/device/share_fs_device.rs b/src/runtime-rs/crates/hypervisor/src/device/share_fs_device.rs new file mode 100644 index 0000000000..4bf73eab73 --- /dev/null +++ b/src/runtime-rs/crates/hypervisor/src/device/share_fs_device.rs @@ -0,0 +1,27 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +/// ShareFsDeviceConfig: share fs device config +#[derive(Debug)] +pub struct ShareFsDeviceConfig { + /// fs_type: virtiofs or inline-virtiofs + pub fs_type: String, + + /// socket_path: socket path for virtiofs + pub sock_path: String, + + /// mount_tag: a label used as a hint to the guest. + pub mount_tag: String, + + /// host_path: the host filesystem path for this volume. + pub host_path: String, + + /// queue_size: queue size + pub queue_size: u64, + + /// queue_num: queue number + pub queue_num: u64, +} diff --git a/src/runtime-rs/crates/hypervisor/src/device/share_fs_mount.rs b/src/runtime-rs/crates/hypervisor/src/device/share_fs_mount.rs new file mode 100644 index 0000000000..85f5164562 --- /dev/null +++ b/src/runtime-rs/crates/hypervisor/src/device/share_fs_mount.rs @@ -0,0 +1,43 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +#[derive(Copy, Clone, Debug)] +pub enum ShareFsOperation { + Mount, + Umount, + Update, +} + +#[derive(Debug)] +pub enum ShareFsMountType { + PASSTHROUGH, + RAFS, +} + +/// ShareFsMountConfig: share fs mount config +#[derive(Debug)] +pub struct ShareFsMountConfig { + /// source: the passthrough fs exported dir or rafs meta file of rafs + pub source: String, + + /// fstype: specifies the type of this sub-fs, could be passthrough-fs or rafs + pub fstype: ShareFsMountType, + + /// mount_point: the mount point inside guest + pub mount_point: String, + + /// config: the rafs backend config file + pub config: Option, + + /// tag: is the 
tag used inside the kata guest. + pub tag: String, + + /// op: the operation to take, e.g. mount, umount or update + pub op: ShareFsOperation, + + /// prefetch_list_path: path to file that contains file lists that should be prefetched by rafs + pub prefetch_list_path: Option, +} diff --git a/src/runtime-rs/crates/hypervisor/src/device/vfio.rs b/src/runtime-rs/crates/hypervisor/src/device/vfio.rs new file mode 100644 index 0000000000..fcbaeb19fe --- /dev/null +++ b/src/runtime-rs/crates/hypervisor/src/device/vfio.rs @@ -0,0 +1,147 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{fs, path::Path, process::Command}; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use anyhow::anyhow; +use anyhow::{Context, Result}; + +fn override_driver(bdf: &str, driver: &str) -> Result<()> { + let driver_override = format!("/sys/bus/pci/devices/{}/driver_override", bdf); + fs::write(&driver_override, driver) + .with_context(|| format!("echo {} > {}", driver, &driver_override))?; + info!(sl!(), "echo {} > {}", driver, driver_override); + Ok(()) +} + +const SYS_PCI_DEVICES_PATH: &str = "/sys/bus/pci/devices"; +const PCI_DRIVER_PROBE: &str = "/sys/bus/pci/drivers_probe"; +const VFIO_NEW_ID_PATH: &str = "/sys/bus/pci/drivers/vfio-pci/new_id"; + +pub const VFIO_PCI: &str = "vfio-pci"; + +#[derive(Debug)] +pub enum VfioBusMode { + PCI, + MMIO, +} + +impl VfioBusMode { + pub fn new(mode: &str) -> Result { + Ok(match mode { + "mmio" => VfioBusMode::MMIO, + _ => VfioBusMode::PCI, + }) + } +} + +#[derive(Debug)] +pub struct VfioConfig { + /// Unique identifier of the device + pub id: String, + + /// Sysfs path for mdev bus type device + pub sysfs_path: String, + + /// PCI device information: "bus:slot:function" + pub bus_slot_func: String, + + /// Bus Mode, PCI or MMIO + pub mode: VfioBusMode, +} + +/// binds the device to vfio driver after unbinding from host. 
+/// Will be called by a network interface or a generic pcie device. +pub fn bind_device_to_vfio(bdf: &str, host_driver: &str, _vendor_device_id: &str) -> Result<()> { + // modprobe vfio-pci + if !Path::new(VFIO_NEW_ID_PATH).exists() { + Command::new("modprobe") + .arg(VFIO_PCI) + .output() + .expect("Failed to run modprobe vfio-pci"); + } + + // Arm does not need cmdline to open iommu, just set it through bios. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + // check intel_iommu=on + let cmdline = fs::read_to_string("/proc/cmdline").unwrap(); + if cmdline.contains("iommu=off") || !cmdline.contains("iommu=") { + return Err(anyhow!("iommu isn't set on kernel cmdline")); + } + } + + // if it's already bound to vfio + if is_equal_driver(bdf, VFIO_PCI) { + info!(sl!(), "bdf : {} was already bound to vfio-pci", bdf); + return Ok(()); + } + + info!(sl!(), "host driver : {}", host_driver); + override_driver(bdf, VFIO_PCI).context("override driver")?; + + let unbind_path = format!("/sys/bus/pci/devices/{}/driver/unbind", bdf); + + // echo bdf > /sys/bus/pci/drivers/virtio-pci/unbind" + fs::write(&unbind_path, bdf) + .with_context(|| format!("Failed to echo {} > {}", bdf, &unbind_path))?; + + info!(sl!(), "{} is unbound from {}", bdf, host_driver); + + // echo bdf > /sys/bus/pci/drivers_probe + fs::write(PCI_DRIVER_PROBE, bdf) + .with_context(|| format!("Failed to echo {} > {}", bdf, PCI_DRIVER_PROBE))?; + + info!(sl!(), "echo {} > /sys/bus/pci/drivers_probe", bdf); + Ok(()) +} + +pub fn is_equal_driver(bdf: &str, host_driver: &str) -> bool { + let sys_pci_devices_path = Path::new(SYS_PCI_DEVICES_PATH); + let driver_file = sys_pci_devices_path.join(bdf).join("driver"); + + if driver_file.exists() { + let driver_path = fs::read_link(driver_file).unwrap_or_default(); + let driver_name = driver_path + .file_name() + .map_or(String::new(), |v| v.to_str().unwrap().to_owned()); + return driver_name.eq(host_driver); + } + + false +} + +/// bind_device_to_host 
binds the device to the host driver after unbinding from vfio-pci. +pub fn bind_device_to_host(bdf: &str, host_driver: &str, _vendor_device_id: &str) -> Result<()> { + // Unbind from vfio-pci driver to the original host driver + + info!(sl!(), "bind {} to {}", bdf, host_driver); + + // if it's already bound to host_driver + if is_equal_driver(bdf, host_driver) { + info!( + sl!(), + "bdf {} was already unbound to host driver {}", bdf, host_driver + ); + return Ok(()); + } + + override_driver(bdf, host_driver).context("override driver")?; + + let unbind_path = "/sys/bus/pci/drivers/vfio-pci/unbind"; + + // echo bdf > /sys/bus/pci/drivers/vfio-pci/unbind" + std::fs::write(unbind_path, bdf).with_context(|| format!("echo {}> {}", bdf, unbind_path))?; + info!(sl!(), "echo {} > {}", bdf, unbind_path); + + // echo bdf > /sys/bus/pci/drivers_probe + std::fs::write(PCI_DRIVER_PROBE, bdf) + .with_context(|| format!("echo {} > {}", bdf, PCI_DRIVER_PROBE))?; + info!(sl!(), "echo {} > {}", bdf, PCI_DRIVER_PROBE); + + Ok(()) +} diff --git a/src/runtime-rs/crates/hypervisor/src/device/vsock.rs b/src/runtime-rs/crates/hypervisor/src/device/vsock.rs new file mode 100644 index 0000000000..3a5b7c8b3c --- /dev/null +++ b/src/runtime-rs/crates/hypervisor/src/device/vsock.rs @@ -0,0 +1,17 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +#[derive(Debug)] +pub struct VsockConfig { + /// Unique identifier of the device + pub id: String, + + /// A 32-bit Context Identifier (CID) used to identify the guest. 
+ pub guest_cid: u32, + + /// unix domain socket path + pub uds_path: String, +} diff --git a/src/runtime-rs/crates/hypervisor/src/dragonball/inner.rs b/src/runtime-rs/crates/hypervisor/src/dragonball/inner.rs new file mode 100644 index 0000000000..aef8d3352d --- /dev/null +++ b/src/runtime-rs/crates/hypervisor/src/dragonball/inner.rs @@ -0,0 +1,309 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{collections::HashSet, fs::create_dir_all, path::PathBuf}; + +use anyhow::{anyhow, Context, Result}; +use dragonball::{ + api::v1::{BlockDeviceConfigInfo, BootSourceConfig}, + vm::VmConfigInfo, +}; +use kata_sys_util::mount; +use kata_types::config::hypervisor::Hypervisor as HypervisorConfig; + +use super::{vmm_instance::VmmInstance, RUN_PATH_PREFIX}; +use crate::{device::Device, kernel_param::KernelParams, VmmState, VM_ROOTFS_DRIVER_BLK}; + +const DRAGONBALL_KERNEL: &str = "vmlinux"; +const DRAGONBALL_ROOT_FS: &str = "rootfs"; + +unsafe impl Send for DragonballInner {} +unsafe impl Sync for DragonballInner {} +pub struct DragonballInner { + /// sandbox id + pub(crate) id: String, + + /// vm path + pub(crate) vm_path: String, + + /// jailed flag + pub(crate) jailed: bool, + + /// chroot base for the jailer + pub(crate) jailer_root: String, + + /// netns + pub(crate) netns: Option, + + /// hypervisor config + pub(crate) config: HypervisorConfig, + + /// vmm state + pub(crate) state: VmmState, + + /// vmm instance + pub(crate) vmm_instance: VmmInstance, + + /// hypervisor run dir + pub(crate) run_dir: String, + + /// pending device + pub(crate) pending_devices: Vec, + + /// cached block device + pub(crate) cached_block_devices: HashSet, +} + +impl DragonballInner { + pub fn new() -> DragonballInner { + DragonballInner { + id: "".to_string(), + vm_path: "".to_string(), + jailer_root: "".to_string(), + netns: None, + config: Default::default(), + pending_devices: vec![], + state: 
VmmState::NotReady, + jailed: false, + vmm_instance: VmmInstance::new(""), + run_dir: "".to_string(), + cached_block_devices: Default::default(), + } + } + + pub(crate) async fn cold_start_vm(&mut self, timeout: i32) -> Result<()> { + info!(sl!(), "start sandbox cold"); + + self.set_vm_base_config().context("set vm base config")?; + + // get rootfs driver + let rootfs_driver = self.config.blockdev_info.block_device_driver.clone(); + + // get kernel params + let mut kernel_params = KernelParams::new(self.config.debug_info.enable_debug); + kernel_params.append(&mut KernelParams::new_rootfs_kernel_params(&rootfs_driver)); + kernel_params.append(&mut KernelParams::from_string( + &self.config.boot_info.kernel_params, + )); + + // set boot source + let kernel_path = self.config.boot_info.kernel.clone(); + self.set_boot_source( + &kernel_path, + &kernel_params + .to_string() + .context("kernel params to string")?, + ) + .context("set_boot_source")?; + + // get vm rootfs + let image = { + let initrd_path = self.config.boot_info.initrd.clone(); + let image_path = self.config.boot_info.image.clone(); + if !initrd_path.is_empty() { + Ok(initrd_path) + } else if !image_path.is_empty() { + Ok(image_path) + } else { + Err(anyhow!("failed to get image")) + } + } + .context("get image")?; + self.set_vm_rootfs(&image, &rootfs_driver) + .context("set vm rootfs")?; + + // add pending devices + while let Some(dev) = self.pending_devices.pop() { + self.add_device(dev).await.context("add_device")?; + } + + // start vmm and wait ready + self.start_vmm_instance().context("start vmm instance")?; + self.wait_vmm_ready(timeout).context("wait vmm")?; + + Ok(()) + } + + pub(crate) fn run_vmm_server(&mut self) -> Result<()> { + if !self.config.jailer_path.is_empty() { + self.jailed = true; + } + + // create jailer root + create_dir_all(self.jailer_root.as_str()) + .map_err(|e| anyhow!("Failed to create dir {} err : {:?}", self.jailer_root, e))?; + + // create run dir + self.run_dir = 
[RUN_PATH_PREFIX, self.id.as_str()].join("/"); + create_dir_all(self.run_dir.as_str()) + .with_context(|| format!("failed to create dir {}", self.run_dir.as_str()))?; + + // run vmm server + self.vmm_instance + .run_vmm_server(&self.id, self.netns.clone()) + .context("run vmm server")?; + self.state = VmmState::VmmServerReady; + + Ok(()) + } + + pub(crate) fn cleanup_resource(&self) { + if self.jailed { + self.umount_jail_resource(DRAGONBALL_KERNEL).ok(); + self.umount_jail_resource(DRAGONBALL_ROOT_FS).ok(); + for id in &self.cached_block_devices { + self.umount_jail_resource(id.as_str()).ok(); + } + } + + std::fs::remove_dir_all(&self.vm_path) + .map_err(|err| { + error!(sl!(), "failed to remove dir all for {}", &self.vm_path); + err + }) + .ok(); + } + + fn set_vm_base_config(&mut self) -> Result<()> { + let serial_path = [&self.run_dir, "console.sock"].join("/"); + let vm_config = VmConfigInfo { + serial_path: Some(serial_path), + mem_size_mib: self.config.memory_info.default_memory as usize, + vcpu_count: self.config.cpu_info.default_vcpus as u8, + ..Default::default() + }; + info!(sl!(), "vm config: {:?}", vm_config); + + self.vmm_instance + .set_vm_configuration(vm_config) + .context("set vm configuration") + } + + pub(crate) fn umount_jail_resource(&self, jailed_path: &str) -> Result<()> { + let path = [self.jailer_root.as_str(), jailed_path].join("/"); + nix::mount::umount2(path.as_str(), nix::mount::MntFlags::MNT_DETACH) + .with_context(|| format!("umount path {}", &path)) + } + + pub(crate) fn get_resource(&self, src: &str, dst: &str) -> Result { + if self.jailed { + self.jail_resource(src, dst) + } else { + Ok(src.to_string()) + } + } + + fn jail_resource(&self, src: &str, dst: &str) -> Result { + info!(sl!(), "jail resource: src {} dst {}", src, dst); + if src.is_empty() || dst.is_empty() { + return Err(anyhow!("invalid param src {} dst {}", src, dst)); + } + + let jailed_location = [self.jailer_root.as_str(), dst].join("/"); + 
mount::bind_mount_unchecked(src, jailed_location.as_str(), false).context("bind_mount")?; + + let mut abs_path = String::from("/"); + abs_path.push_str(dst); + Ok(abs_path) + } + + fn set_boot_source(&mut self, kernel_path: &str, kernel_params: &str) -> Result<()> { + info!( + sl!(), + "kernel path {} kernel params {}", kernel_path, kernel_params + ); + + let mut boot_cfg = BootSourceConfig { + kernel_path: self + .get_resource(kernel_path, DRAGONBALL_KERNEL) + .context("get resource")?, + ..Default::default() + }; + + if !kernel_params.is_empty() { + boot_cfg.boot_args = Some(kernel_params.to_string()); + } + + self.vmm_instance + .put_boot_source(boot_cfg) + .context("put boot source") + } + + fn set_vm_rootfs(&mut self, path: &str, driver: &str) -> Result<()> { + info!(sl!(), "set vm rootfs {} {}", path, driver); + let jail_drive = self + .get_resource(path, DRAGONBALL_ROOT_FS) + .context("get resource")?; + + if driver == VM_ROOTFS_DRIVER_BLK { + let blk_cfg = BlockDeviceConfigInfo { + path_on_host: PathBuf::from(jail_drive), + drive_id: DRAGONBALL_ROOT_FS.to_string(), + is_root_device: false, + // Add it as a regular block device + // This allows us to use a partitioned root block device + // is_read_only + is_read_only: true, + is_direct: false, + ..Default::default() + }; + + self.vmm_instance + .insert_block_device(blk_cfg) + .context("inert block device") + } else { + Err(anyhow!( + "Unknown vm_rootfs driver {} path {:?}", + driver, + path + )) + } + } + + fn start_vmm_instance(&mut self) -> Result<()> { + info!(sl!(), "Starting VM"); + self.vmm_instance + .instance_start() + .context("Failed to start vmm")?; + self.state = VmmState::VmRunning; + Ok(()) + } + + // wait_vmm_ready will wait for timeout seconds for the VMM to be up and running. + // This does not mean that the VM is up and running. 
It only indicates that the VMM is up and + // running and able to handle commands to setup and launch a VM + fn wait_vmm_ready(&mut self, timeout: i32) -> Result<()> { + if timeout < 0 { + return Err(anyhow!("Invalid param timeout {}", timeout)); + } + + let time_start = std::time::Instant::now(); + loop { + match self.vmm_instance.is_running() { + Ok(_) => return Ok(()), + Err(err) => { + let time_now = std::time::Instant::now(); + if time_now.duration_since(time_start).as_millis() > timeout as u128 { + return Err(anyhow!( + "waiting vmm ready timeout {} err: {:?}", + timeout, + err + )); + } + std::thread::sleep(std::time::Duration::from_millis(10)); + } + } + } + } + + pub fn set_hypervisor_config(&mut self, config: HypervisorConfig) { + self.config = config; + } + + pub fn hypervisor_config(&self) -> HypervisorConfig { + self.config.clone() + } +} diff --git a/src/runtime-rs/crates/hypervisor/src/dragonball/inner_device.rs b/src/runtime-rs/crates/hypervisor/src/dragonball/inner_device.rs new file mode 100644 index 0000000000..d47cac5698 --- /dev/null +++ b/src/runtime-rs/crates/hypervisor/src/dragonball/inner_device.rs @@ -0,0 +1,316 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::path::PathBuf; + +use anyhow::{anyhow, Context, Result}; +use dbs_utils::net::MacAddr; +use dragonball::api::v1::{ + BlockDeviceConfigInfo, FsDeviceConfigInfo, FsMountConfigInfo, VirtioNetDeviceConfigInfo, + VsockDeviceConfigInfo, +}; + +use super::DragonballInner; +use crate::{ + device::Device, NetworkConfig, ShareFsDeviceConfig, ShareFsMountConfig, ShareFsMountType, + ShareFsOperation, VmmState, VsockConfig, +}; + +const MB_TO_B: u32 = 1024 * 1024; +const DEFAULT_VIRTIO_FS_NUM_QUEUES: i32 = 1; +const DEFAULT_VIRTIO_FS_QUEUE_SIZE: i32 = 1024; + +const VIRTIO_FS: &str = "virtio-fs"; +const INLINE_VIRTIO_FS: &str = "inline-virtio-fs"; + +pub(crate) fn drive_index_to_id(index: u64) -> String 
{ + format!("drive_{}", index) +} + +impl DragonballInner { + pub(crate) async fn add_device(&mut self, device: Device) -> Result<()> { + if self.state == VmmState::NotReady { + info!(sl!(), "VMM not ready, queueing device {}", device); + + // add the pending device by reverse order, thus the + // start_vm would pop the devices in an right order + // to add the devices. + self.pending_devices.insert(0, device); + return Ok(()); + } + + info!(sl!(), "dragonball add device {:?}", &device); + match device { + Device::Network(config) => self.add_net_device(&config).context("add net device"), + Device::Vfio(_config) => { + todo!() + } + Device::Block(config) => self + .add_block_device( + config.path_on_host.as_str(), + config.id.as_str(), + config.is_readonly, + config.no_drop, + ) + .context("add block device"), + Device::Vsock(config) => self.add_vsock(&config).context("add vsock"), + Device::ShareFsDevice(config) => self + .add_share_fs_device(&config) + .context("add share fs device"), + Device::ShareFsMount(config) => self + .add_share_fs_mount(&config) + .context("add share fs mount"), + } + } + + pub(crate) async fn remove_device(&mut self, device: Device) -> Result<()> { + info!(sl!(), "remove device {} ", device); + + match device { + Device::Block(config) => { + let drive_id = drive_index_to_id(config.index); + self.remove_block_drive(drive_id.as_str()) + .context("remove block drive") + } + Device::Vfio(_config) => { + todo!() + } + _ => Err(anyhow!("unsupported device {:?}", device)), + } + } + + fn add_block_device( + &mut self, + path: &str, + id: &str, + read_only: bool, + no_drop: bool, + ) -> Result<()> { + let jailed_drive = self.get_resource(path, id).context("get resource")?; + self.cached_block_devices.insert(id.to_string()); + + let blk_cfg = BlockDeviceConfigInfo { + drive_id: id.to_string(), + path_on_host: PathBuf::from(jailed_drive), + is_direct: self.config.blockdev_info.block_device_cache_direct, + no_drop, + is_read_only: read_only, + 
..Default::default() + }; + self.vmm_instance + .insert_block_device(blk_cfg) + .context("insert block device") + } + + fn remove_block_drive(&mut self, id: &str) -> Result<()> { + self.vmm_instance + .remove_block_device(id) + .context("remove block device")?; + + if self.cached_block_devices.contains(id) && self.jailed { + self.umount_jail_resource(id) + .context("umount jail resource")?; + self.cached_block_devices.remove(id); + } + Ok(()) + } + + fn add_net_device(&mut self, config: &NetworkConfig) -> Result<()> { + let iface_cfg = VirtioNetDeviceConfigInfo { + iface_id: config.id.clone(), + host_dev_name: config.host_dev_name.clone(), + guest_mac: match &config.guest_mac { + Some(mac) => MacAddr::from_bytes(&mac.0).ok(), + None => None, + }, + ..Default::default() + }; + + info!( + sl!(), + "add {} endpoint to {}", iface_cfg.host_dev_name, iface_cfg.iface_id + ); + + self.vmm_instance + .insert_network_device(iface_cfg) + .context("insert network device") + } + + fn add_vsock(&mut self, config: &VsockConfig) -> Result<()> { + let vsock_cfg = VsockDeviceConfigInfo { + id: String::from("root"), + guest_cid: config.guest_cid, + uds_path: Some(config.uds_path.clone()), + ..Default::default() + }; + + self.vmm_instance + .insert_vsock(vsock_cfg) + .context("insert vsock") + } + + fn parse_inline_virtiofs_args(&self, fs_cfg: &mut FsDeviceConfigInfo) -> Result<()> { + let mut debug = false; + let mut opt_list = String::new(); + + fs_cfg.mode = String::from("virtio"); + fs_cfg.cache_policy = self.config.shared_fs.virtio_fs_cache.clone(); + fs_cfg.fuse_killpriv_v2 = true; + + info!( + sl!(), + "args: {:?}", &self.config.shared_fs.virtio_fs_extra_args + ); + let args = &self.config.shared_fs.virtio_fs_extra_args; + let _ = go_flag::parse_args_with_warnings::(args, None, |flags| { + flags.add_flag("d", &mut debug); + flags.add_flag("thread-pool-size", &mut fs_cfg.thread_pool_size); + flags.add_flag("drop-sys-resource", &mut fs_cfg.drop_sys_resource); + 
flags.add_flag("o", &mut opt_list); + }) + .with_context(|| format!("parse args: {:?}", args))?; + + if debug { + warn!( + sl!(), + "Inline virtiofs \"-d\" option not implemented, ignore" + ); + } + + // Parse comma separated option list + if !opt_list.is_empty() { + let args: Vec<&str> = opt_list.split(',').collect(); + for arg in args { + match arg { + "no_open" => fs_cfg.no_open = true, + "open" => fs_cfg.no_open = false, + "writeback_cache" => fs_cfg.writeback_cache = true, + "no_writeback_cache" => fs_cfg.writeback_cache = false, + "writeback" => fs_cfg.writeback_cache = true, + "no_writeback" => fs_cfg.writeback_cache = false, + "xattr" => fs_cfg.xattr = true, + "no_xattr" => fs_cfg.xattr = false, + "cache_symlinks" => {} // inline virtiofs always cache symlinks + "trace" => warn!( + sl!(), + "Inline virtiofs \"-o trace\" option not supported yet, ignored." + ), + _ => warn!(sl!(), "Inline virtiofs unsupported option: {}", arg), + } + } + } + + debug!(sl!(), "Inline virtiofs config {:?}", fs_cfg); + Ok(()) + } + + fn add_share_fs_device(&self, config: &ShareFsDeviceConfig) -> Result<()> { + let mut fs_cfg = FsDeviceConfigInfo { + sock_path: config.sock_path.clone(), + tag: config.mount_tag.clone(), + num_queues: if config.queue_num > 0 { + config.queue_size as usize + } else { + DEFAULT_VIRTIO_FS_NUM_QUEUES as usize + }, + queue_size: if config.queue_size > 0 { + config.queue_size as u16 + } else { + DEFAULT_VIRTIO_FS_QUEUE_SIZE as u16 + }, + cache_size: (self.config.shared_fs.virtio_fs_cache_size as u64) + .saturating_mul(MB_TO_B as u64), + ..Default::default() + }; + self.do_add_fs_device(&config.fs_type, &mut fs_cfg) + } + + fn do_add_fs_device(&self, fs_type: &str, fs_cfg: &mut FsDeviceConfigInfo) -> Result<()> { + match fs_type { + VIRTIO_FS => { + fs_cfg.mode = String::from("vhostuser"); + } + INLINE_VIRTIO_FS => { + self.parse_inline_virtiofs_args(fs_cfg)?; + } + _ => { + return Err(anyhow!( + "hypervisor isn't configured with shared_fs supported" + 
)); + } + } + self.vmm_instance + .insert_fs(fs_cfg) + .map_err(|e| anyhow!("insert {} fs error. {:?}", fs_cfg.mode, e)) + } + + fn add_share_fs_mount(&mut self, config: &ShareFsMountConfig) -> Result<()> { + let ops = match config.op { + ShareFsOperation::Mount => "mount", + ShareFsOperation::Umount => "umount", + ShareFsOperation::Update => "update", + }; + + let fstype = match config.fstype { + ShareFsMountType::PASSTHROUGH => "passthroughfs", + ShareFsMountType::RAFS => "rafs", + }; + + let cfg = FsMountConfigInfo { + ops: ops.to_string(), + fstype: Some(fstype.to_string()), + source: Some(config.source.clone()), + mountpoint: config.mount_point.clone(), + config: None, + tag: config.tag.clone(), + prefetch_list_path: config.prefetch_list_path.clone(), + dax_threshold_size_kb: None, + }; + + self.vmm_instance.patch_fs(&cfg, config.op).map_err(|e| { + anyhow!( + "{:?} {} at {} error: {:?}", + config.op, + fstype, + config.mount_point.clone(), + e + ) + }) + } +} + +#[cfg(test)] +mod tests { + use dragonball::api::v1::FsDeviceConfigInfo; + + use crate::dragonball::DragonballInner; + + #[test] + fn test_parse_inline_virtiofs_args() { + let mut dragonball = DragonballInner::new(); + let mut fs_cfg = FsDeviceConfigInfo::default(); + + // no_open and writeback_cache is the default, so test open and no_writeback_cache. "-d" + // and "trace" are ignored for now, but should not return error. 
+ dragonball.config.shared_fs.virtio_fs_extra_args = vec![ + "-o".to_string(), + "open,no_writeback_cache,xattr,trace".to_string(), + "--thread-pool-size=128".to_string(), + "--drop-sys-resource".to_string(), + "-d".to_string(), + ]; + dragonball.config.shared_fs.virtio_fs_cache = "auto".to_string(); + dragonball.parse_inline_virtiofs_args(&mut fs_cfg).unwrap(); + + assert!(!fs_cfg.no_open); + assert!(fs_cfg.xattr); + assert!(fs_cfg.fuse_killpriv_v2); + assert!(!fs_cfg.writeback_cache); + assert_eq!(fs_cfg.cache_policy, "auto".to_string()); + assert!(fs_cfg.drop_sys_resource); + assert!(fs_cfg.thread_pool_size == 128); + } +} diff --git a/src/runtime-rs/crates/hypervisor/src/dragonball/inner_hypervisor.rs b/src/runtime-rs/crates/hypervisor/src/dragonball/inner_hypervisor.rs new file mode 100644 index 0000000000..2b9c3c77ce --- /dev/null +++ b/src/runtime-rs/crates/hypervisor/src/dragonball/inner_hypervisor.rs @@ -0,0 +1,137 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{ + collections::{HashMap, HashSet}, + iter::FromIterator, +}; + +use anyhow::{Context, Result}; + +use super::inner::DragonballInner; +use crate::{utils, VcpuThreadIds, VmmState}; + +const KATA_PATH: &str = "/run/kata"; +const DEFAULT_HYBRID_VSOCK_NAME: &str = "kata.hvsock"; + +fn get_vsock_path(root: &str) -> String { + [root, DEFAULT_HYBRID_VSOCK_NAME].join("/") +} + +impl DragonballInner { + pub(crate) async fn prepare_vm(&mut self, id: &str, netns: Option) -> Result<()> { + self.id = id.to_string(); + self.state = VmmState::NotReady; + + self.vm_path = [KATA_PATH, id].join("/"); + self.jailer_root = [self.vm_path.as_str(), "root"].join("/"); + self.netns = netns; + + // prepare vsock + let uds_path = [&self.jailer_root, DEFAULT_HYBRID_VSOCK_NAME].join("/"); + let d = crate::device::Device::Vsock(crate::device::VsockConfig { + id: format!("vsock-{}", &self.id), + guest_cid: 3, + uds_path, + }); + + 
self.add_device(d).await.context("add device")?; + Ok(()) + } + + // start_vm will start the hypervisor for the given sandbox. + // In the context of dragonball, this will start the hypervisor + pub(crate) async fn start_vm(&mut self, timeout: i32) -> Result<()> { + self.run_vmm_server().context("start vmm server")?; + self.cold_start_vm(timeout).await.map_err(|error| { + error!(sl!(), "start micro vm error {:?}", error); + if let Err(err) = self.stop_vm() { + error!(sl!(), "failed to call end err : {:?}", err); + } + error + })?; + + Ok(()) + } + + pub(crate) fn stop_vm(&mut self) -> Result<()> { + info!(sl!(), "Stopping dragonball VM"); + self.vmm_instance.stop().context("stop")?; + Ok(()) + } + + pub(crate) fn pause_vm(&self) -> Result<()> { + info!(sl!(), "do pause vm"); + self.vmm_instance.pause().context("pause vm")?; + Ok(()) + } + + pub(crate) fn resume_vm(&self) -> Result<()> { + info!(sl!(), "do resume vm"); + self.vmm_instance.resume().context("resume vm")?; + Ok(()) + } + + pub(crate) async fn save_vm(&self) -> Result<()> { + todo!() + } + + pub(crate) async fn get_agent_socket(&self) -> Result { + const HYBRID_VSOCK_SCHEME: &str = "hvsock"; + Ok(format!( + "{}://{}", + HYBRID_VSOCK_SCHEME, + get_vsock_path(&self.jailer_root), + )) + } + + pub(crate) async fn disconnect(&mut self) { + self.state = VmmState::NotReady; + } + + pub(crate) async fn get_thread_ids(&self) -> Result { + let mut vcpu_thread_ids: VcpuThreadIds = VcpuThreadIds { + vcpus: HashMap::new(), + }; + + for tid in self.vmm_instance.get_vcpu_tids() { + vcpu_thread_ids.vcpus.insert(tid.0 as u32, tid.1 as u32); + } + info!(sl!(), "get thread ids {:?}", vcpu_thread_ids); + Ok(vcpu_thread_ids) + } + + pub(crate) async fn cleanup(&self) -> Result<()> { + self.cleanup_resource(); + Ok(()) + } + + pub(crate) async fn get_pids(&self) -> Result> { + let mut pids = HashSet::new(); + // get shim thread ids + pids.insert(self.vmm_instance.pid()); + + for tid in 
utils::get_child_threads(self.vmm_instance.pid()) { + pids.insert(tid); + } + + // remove vcpus + for tid in self.vmm_instance.get_vcpu_tids() { + pids.remove(&tid.1); + } + + info!(sl!(), "get pids {:?}", pids); + Ok(Vec::from_iter(pids.into_iter())) + } + + pub(crate) async fn check(&self) -> Result<()> { + Ok(()) + } + + pub(crate) async fn get_jailer_root(&self) -> Result { + Ok(self.jailer_root.clone()) + } +} diff --git a/src/runtime-rs/crates/hypervisor/src/dragonball/mod.rs b/src/runtime-rs/crates/hypervisor/src/dragonball/mod.rs new file mode 100644 index 0000000000..27adfd73b0 --- /dev/null +++ b/src/runtime-rs/crates/hypervisor/src/dragonball/mod.rs @@ -0,0 +1,130 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +mod inner; +mod inner_device; +mod inner_hypervisor; +use inner::DragonballInner; +pub mod vmm_instance; + +pub const RUN_PATH_PREFIX: &str = "/run/kata"; + +use std::sync::Arc; + +use anyhow::Result; +use async_trait::async_trait; +use kata_types::config::hypervisor::Hypervisor as HypervisorConfig; +use tokio::sync::RwLock; + +use crate::{device::Device, Hypervisor, VcpuThreadIds}; + +unsafe impl Send for Dragonball {} +unsafe impl Sync for Dragonball {} +pub struct Dragonball { + inner: Arc>, +} + +impl Default for Dragonball { + fn default() -> Self { + Self::new() + } +} + +impl Dragonball { + pub fn new() -> Self { + Self { + inner: Arc::new(RwLock::new(DragonballInner::new())), + } + } + + pub async fn set_hypervisor_config(&mut self, config: HypervisorConfig) { + let mut inner = self.inner.write().await; + inner.set_hypervisor_config(config) + } +} + +#[async_trait] +impl Hypervisor for Dragonball { + async fn prepare_vm(&self, id: &str, netns: Option) -> Result<()> { + let mut inner = self.inner.write().await; + inner.prepare_vm(id, netns).await + } + + async fn start_vm(&self, timeout: i32) -> Result<()> { + let mut inner = self.inner.write().await; + 
inner.start_vm(timeout).await + } + + async fn stop_vm(&self) -> Result<()> { + let mut inner = self.inner.write().await; + inner.stop_vm() + } + + async fn pause_vm(&self) -> Result<()> { + let inner = self.inner.read().await; + inner.pause_vm() + } + + async fn resume_vm(&self) -> Result<()> { + let inner = self.inner.read().await; + inner.resume_vm() + } + + async fn save_vm(&self) -> Result<()> { + let inner = self.inner.read().await; + inner.save_vm().await + } + + async fn add_device(&self, device: Device) -> Result<()> { + let mut inner = self.inner.write().await; + inner.add_device(device).await + } + + async fn remove_device(&self, device: Device) -> Result<()> { + let mut inner = self.inner.write().await; + inner.remove_device(device).await + } + + async fn get_agent_socket(&self) -> Result { + let inner = self.inner.read().await; + inner.get_agent_socket().await + } + + async fn disconnect(&self) { + let mut inner = self.inner.write().await; + inner.disconnect().await + } + + async fn hypervisor_config(&self) -> HypervisorConfig { + let inner = self.inner.read().await; + inner.hypervisor_config() + } + + async fn get_thread_ids(&self) -> Result { + let inner = self.inner.read().await; + inner.get_thread_ids().await + } + + async fn cleanup(&self) -> Result<()> { + let inner = self.inner.read().await; + inner.cleanup().await + } + + async fn get_pids(&self) -> Result> { + let inner = self.inner.read().await; + inner.get_pids().await + } + + async fn check(&self) -> Result<()> { + let inner = self.inner.read().await; + inner.check().await + } + + async fn get_jailer_root(&self) -> Result { + let inner = self.inner.read().await; + inner.get_jailer_root().await + } +} diff --git a/src/runtime-rs/crates/hypervisor/src/dragonball/vmm_instance.rs b/src/runtime-rs/crates/hypervisor/src/dragonball/vmm_instance.rs new file mode 100644 index 0000000000..70172c73a9 --- /dev/null +++ b/src/runtime-rs/crates/hypervisor/src/dragonball/vmm_instance.rs @@ -0,0 +1,335 @@ 
+// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{ + fs::{File, OpenOptions}, + os::unix::{io::IntoRawFd, prelude::AsRawFd}, + sync::{ + mpsc::{channel, Receiver, Sender}, + Arc, Mutex, RwLock, + }, + thread, +}; + +use anyhow::{anyhow, Context, Result}; +use dragonball::{ + api::v1::{ + BlockDeviceConfigInfo, BootSourceConfig, FsDeviceConfigInfo, FsMountConfigInfo, + InstanceInfo, InstanceState, VirtioNetDeviceConfigInfo, VmmAction, VmmActionError, VmmData, + VmmRequest, VmmResponse, VmmService, VsockDeviceConfigInfo, + }, + vm::VmConfigInfo, + Vmm, +}; +use nix::sched::{setns, CloneFlags}; +use seccompiler::BpfProgram; +use vmm_sys_util::eventfd::EventFd; + +use crate::ShareFsOperation; + +pub enum Request { + Sync(VmmAction), +} + +const DRAGONBALL_VERSION: &str = env!("CARGO_PKG_VERSION"); +const REQUEST_RETRY: u32 = 500; +const KVM_DEVICE: &str = "/dev/kvm"; + +pub struct VmmInstance { + /// VMM instance info directly accessible from runtime + vmm_shared_info: Arc>, + to_vmm: Option>, + from_vmm: Option>, + to_vmm_fd: EventFd, + seccomp: BpfProgram, + vmm_thread: Option>>, +} + +impl VmmInstance { + pub fn new(id: &str) -> Self { + let vmm_shared_info = Arc::new(RwLock::new(InstanceInfo::new( + String::from(id), + DRAGONBALL_VERSION.to_string(), + ))); + + let to_vmm_fd = EventFd::new(libc::EFD_NONBLOCK) + .unwrap_or_else(|_| panic!("Failed to create eventfd for vmm {}", id)); + + VmmInstance { + vmm_shared_info, + to_vmm: None, + from_vmm: None, + to_vmm_fd, + seccomp: vec![], + vmm_thread: None, + } + } + + pub fn get_shared_info(&self) -> Arc> { + self.vmm_shared_info.clone() + } + + fn set_instance_id(&mut self, id: &str) { + let share_info_lock = self.vmm_shared_info.clone(); + share_info_lock.write().unwrap().id = String::from(id); + } + + pub fn get_vcpu_tids(&self) -> Vec<(u8, u32)> { + let info = self.vmm_shared_info.clone(); + let result = 
info.read().unwrap().tids.clone(); + result + } + + pub fn run_vmm_server(&mut self, id: &str, netns: Option) -> Result<()> { + let kvm = OpenOptions::new().read(true).write(true).open(KVM_DEVICE)?; + + let (to_vmm, from_runtime) = channel(); + let (to_runtime, from_vmm) = channel(); + + self.set_instance_id(id); + + let vmm_service = VmmService::new(from_runtime, to_runtime); + + self.to_vmm = Some(to_vmm); + self.from_vmm = Some(from_vmm); + + let api_event_fd2 = self.to_vmm_fd.try_clone().expect("Failed to dup eventfd"); + let vmm = Vmm::new( + self.vmm_shared_info.clone(), + api_event_fd2, + self.seccomp.clone(), + self.seccomp.clone(), + Some(kvm.into_raw_fd()), + ) + .expect("Failed to start vmm"); + + self.vmm_thread = Some( + thread::Builder::new() + .name("vmm_master".to_owned()) + .spawn(move || { + || -> Result { + debug!(sl!(), "run vmm thread start"); + if let Some(netns_path) = netns { + info!(sl!(), "set netns for vmm master {}", &netns_path); + let netns_fd = File::open(&netns_path) + .with_context(|| format!("open netns path {}", &netns_path))?; + setns(netns_fd.as_raw_fd(), CloneFlags::CLONE_NEWNET) + .context("set netns ")?; + } + let exit_code = + Vmm::run_vmm_event_loop(Arc::new(Mutex::new(vmm)), vmm_service); + debug!(sl!(), "run vmm thread exited: {}", exit_code); + Ok(exit_code) + }() + .map_err(|e| { + error!(sl!(), "run vmm thread err. 
{:?}", e); + e + }) + }) + .expect("Failed to start vmm event loop"), + ); + + Ok(()) + } + + pub fn put_boot_source(&self, boot_source_cfg: BootSourceConfig) -> Result<()> { + self.handle_request(Request::Sync(VmmAction::ConfigureBootSource( + boot_source_cfg, + ))) + .context("Failed to configure boot source")?; + Ok(()) + } + + pub fn instance_start(&self) -> Result<()> { + self.handle_request(Request::Sync(VmmAction::StartMicroVm)) + .context("Failed to start MicroVm")?; + Ok(()) + } + + pub fn is_uninitialized(&self) -> bool { + let share_info = self + .vmm_shared_info + .read() + .expect("Failed to read share_info due to poisoned lock"); + matches!(share_info.state, InstanceState::Uninitialized) + } + + pub fn is_running(&self) -> Result<()> { + let share_info_lock = self.vmm_shared_info.clone(); + let share_info = share_info_lock + .read() + .expect("Failed to read share_info due to poisoned lock"); + if let InstanceState::Running = share_info.state { + return Ok(()); + } + Err(anyhow!("vmm is not running")) + } + + pub fn get_machine_info(&self) -> Result> { + if let Ok(VmmData::MachineConfiguration(vm_config)) = + self.handle_request(Request::Sync(VmmAction::GetVmConfiguration)) + { + return Ok(vm_config); + } + Err(anyhow!("Failed to get machine info")) + } + + pub fn insert_block_device(&self, device_cfg: BlockDeviceConfigInfo) -> Result<()> { + self.handle_request_with_retry(Request::Sync(VmmAction::InsertBlockDevice( + device_cfg.clone(), + ))) + .with_context(|| format!("Failed to insert block device {:?}", device_cfg))?; + Ok(()) + } + + pub fn remove_block_device(&self, id: &str) -> Result<()> { + info!(sl!(), "remove block device {}", id); + self.handle_request(Request::Sync(VmmAction::RemoveBlockDevice(id.to_string()))) + .with_context(|| format!("Failed to remove block device {:?}", id))?; + Ok(()) + } + + pub fn set_vm_configuration(&self, vm_config: VmConfigInfo) -> Result<()> { + self.handle_request(Request::Sync(VmmAction::SetVmConfiguration( 
+ vm_config.clone(), + ))) + .with_context(|| format!("Failed to set vm configuration {:?}", vm_config))?; + Ok(()) + } + + pub fn insert_network_device(&self, net_cfg: VirtioNetDeviceConfigInfo) -> Result<()> { + self.handle_request_with_retry(Request::Sync(VmmAction::InsertNetworkDevice( + net_cfg.clone(), + ))) + .with_context(|| format!("Failed to insert network device {:?}", net_cfg))?; + Ok(()) + } + + pub fn insert_vsock(&self, vsock_cfg: VsockDeviceConfigInfo) -> Result<()> { + self.handle_request(Request::Sync(VmmAction::InsertVsockDevice( + vsock_cfg.clone(), + ))) + .with_context(|| format!("Failed to insert vsock device {:?}", vsock_cfg))?; + Ok(()) + } + + pub fn insert_fs(&self, fs_cfg: &FsDeviceConfigInfo) -> Result<()> { + self.handle_request(Request::Sync(VmmAction::InsertFsDevice(fs_cfg.clone()))) + .with_context(|| format!("Failed to insert {} fs device {:?}", fs_cfg.mode, fs_cfg))?; + Ok(()) + } + + pub fn patch_fs(&self, cfg: &FsMountConfigInfo, op: ShareFsOperation) -> Result<()> { + self.handle_request(Request::Sync(VmmAction::ManipulateFsBackendFs(cfg.clone()))) + .with_context(|| { + format!( + "Failed to {:?} backend {:?} at {} mount config {:?}", + op, cfg.fstype, cfg.mountpoint, cfg + ) + })?; + Ok(()) + } + + pub fn pause(&self) -> Result<()> { + todo!() + } + + pub fn resume(&self) -> Result<()> { + todo!() + } + + pub fn pid(&self) -> u32 { + std::process::id() + } + + pub fn stop(&mut self) -> Result<()> { + self.handle_request(Request::Sync(VmmAction::ShutdownMicroVm)) + .map_err(|e| { + warn!(sl!(), "Failed to shutdown MicroVM. {}", e); + e + }) + .ok(); + // vmm is not running, join thread will be hang. + if self.is_uninitialized() || self.vmm_thread.is_none() { + debug!(sl!(), "vmm-master thread is uninitialized or has exited."); + return Ok(()); + } + debug!(sl!(), "join vmm-master thread exit."); + + // vmm_thread must be exited, otherwise there will be other sync issues. 
+ // unwrap is safe, if vmm_thread is None, impossible run to here. + self.vmm_thread.take().unwrap().join().ok(); + info!(sl!(), "vmm-master thread join succeed."); + Ok(()) + } + + fn send_request(&self, vmm_action: VmmAction) -> Result { + if let Some(ref to_vmm) = self.to_vmm { + to_vmm + .send(Box::new(vmm_action.clone())) + .with_context(|| format!("Failed to send {:?} via channel ", vmm_action))?; + } else { + return Err(anyhow!("to_vmm is None")); + } + + //notify vmm action + if let Err(e) = self.to_vmm_fd.write(1) { + return Err(anyhow!("failed to notify vmm: {}", e)); + } + + if let Some(from_vmm) = self.from_vmm.as_ref() { + match from_vmm.recv() { + Err(e) => Err(anyhow!("vmm recv err: {}", e)), + Ok(vmm_outcome) => Ok(vmm_outcome), + } + } else { + Err(anyhow!("from_vmm is None")) + } + } + + fn handle_request(&self, req: Request) -> Result { + let Request::Sync(vmm_action) = req; + match self.send_request(vmm_action) { + Ok(vmm_outcome) => match *vmm_outcome { + Ok(vmm_data) => Ok(vmm_data), + Err(vmm_action_error) => Err(anyhow!("vmm action error: {:?}", vmm_action_error)), + }, + Err(e) => Err(e), + } + } + + fn handle_request_with_retry(&self, req: Request) -> Result { + let Request::Sync(vmm_action) = req; + for count in 0..REQUEST_RETRY { + match self.send_request(vmm_action.clone()) { + Ok(vmm_outcome) => match *vmm_outcome { + Ok(vmm_data) => { + info!( + sl!(), + "success to send {:?} after retry {}", &vmm_action, count + ); + return Ok(vmm_data); + } + Err(vmm_action_error) => { + if let VmmActionError::UpcallNotReady = vmm_action_error { + std::thread::sleep(std::time::Duration::from_millis(10)); + continue; + } else { + return Err(vmm_action_error.into()); + } + } + }, + Err(err) => { + return Err(err); + } + } + } + return Err(anyhow::anyhow!( + "After {} attempts, it still doesn't work.", + REQUEST_RETRY + )); + } +} diff --git a/src/runtime-rs/crates/hypervisor/src/kernel_param.rs b/src/runtime-rs/crates/hypervisor/src/kernel_param.rs 
new file mode 100644
index 0000000000..d8b20b5972
--- /dev/null
+++ b/src/runtime-rs/crates/hypervisor/src/kernel_param.rs
@@ -0,0 +1,177 @@
+// Copyright (c) 2019-2022 Alibaba Cloud
+// Copyright (c) 2019-2022 Ant Group
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+use anyhow::{anyhow, Result};
+
+use crate::{VM_ROOTFS_DRIVER_BLK, VM_ROOTFS_DRIVER_PMEM};
+
+// Port where the agent will send the logs. Logs are sent through the vsock in cases
+// where the hypervisor has no console.sock, i.e dragonball
+const VSOCK_LOGS_PORT: &str = "1025";
+
+const KERNEL_KV_DELIMITER: &str = "=";
+const KERNEL_PARAM_DELIMITER: &str = " ";
+
+#[derive(Debug, Clone, PartialEq)]
+pub struct Param {
+    pub key: String,
+    pub value: String,
+}
+
+impl Param {
+    pub fn new(key: &str, value: &str) -> Self {
+        Param {
+            key: key.to_owned(),
+            value: value.to_owned(),
+        }
+    }
+}
+
+#[derive(Debug, PartialEq)]
+pub(crate) struct KernelParams {
+    params: Vec<Param>,
+}
+
+impl KernelParams {
+    pub(crate) fn new(debug: bool) -> Self {
+        // default kernel params
+        let mut params = vec![
+            Param::new("reboot", "k"),
+            Param::new("earlyprintk", "ttyS0"),
+            Param::new("initcall_debug", ""),
+            Param::new("panic", "1"),
+            Param::new("systemd.unit", "kata-containers.target"),
+            Param::new("systemd.mask", "systemd-networkd.service"),
+        ];
+
+        if debug {
+            params.push(Param::new("agent.log_vport", VSOCK_LOGS_PORT));
+        }
+
+        Self { params }
+    }
+
+    pub(crate) fn new_rootfs_kernel_params(rootfs_driver: &str) -> Self {
+        let params = match rootfs_driver {
+            VM_ROOTFS_DRIVER_BLK => {
+                vec![
+                    Param {
+                        key: "root".to_string(),
+                        value: "/dev/vda1".to_string(),
+                    },
+                    Param {
+                        key: "rootflags".to_string(),
+                        value: "data=ordered,errors=remount-ro ro".to_string(),
+                    },
+                    Param {
+                        key: "rootfstype".to_string(),
+                        value: "ext4".to_string(),
+                    },
+                ]
+            }
+            VM_ROOTFS_DRIVER_PMEM => {
+                vec![
+                    Param {
+                        key: "root".to_string(),
+                        value: "/dev/pmem0p1".to_string(),
+                    },
+                    Param {
+                        key: "rootflags".to_string(),
+                        value: "data=ordered,errors=remount-ro,dax ro".to_string(),
+                    },
+                    Param {
+                        key: "rootfstype".to_string(),
+                        value: "ext4".to_string(),
+                    },
+                ]
+            }
+            _ => vec![],
+        };
+        Self { params }
+    }
+
+    pub(crate) fn append(&mut self, params: &mut KernelParams) {
+        self.params.append(&mut params.params);
+    }
+
+    pub(crate) fn from_string(params_string: &str) -> Self {
+        let mut params = vec![];
+
+        let parameters_vec: Vec<&str> = params_string.split(KERNEL_PARAM_DELIMITER).collect();
+
+        for param in parameters_vec.iter() {
+            if param.is_empty() {
+                continue;
+            }
+
+            let ps: Vec<&str> = param.splitn::<_>(2, KERNEL_KV_DELIMITER).collect();
+
+            if ps.len() == 2 {
+                params.push(Param {
+                    key: String::from(ps[0]),
+                    value: String::from(ps[1]),
+                });
+            } else {
+                params.push(Param {
+                    key: String::from(ps[0]),
+                    value: String::from(""),
+                });
+            }
+        }
+
+        Self { params }
+    }
+
+    pub(crate) fn to_string(&self) -> Result<String> {
+        let mut parameters: Vec<String> = Vec::new();
+
+        for param in &self.params {
+            if param.key.is_empty() && param.value.is_empty() {
+                return Err(anyhow!("Empty key and value"));
+            } else if param.key.is_empty() {
+                return Err(anyhow!("Empty key"));
+            } else if param.value.is_empty() {
+                parameters.push(param.key.to_string());
+            } else {
+                parameters.push(format!(
+                    "{}{}{}",
+                    param.key, KERNEL_KV_DELIMITER, param.value
+                ));
+            }
+        }
+
+        Ok(parameters.join(KERNEL_PARAM_DELIMITER))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use anyhow::Result;
+
+    use super::*;
+
+    #[test]
+    fn test_kernel_params() -> Result<()> {
+        let expect_params_string = "k1=v1 k2=v2 k3=v3".to_string();
+        let expect_params = KernelParams {
+            params: vec![
+                Param::new("k1", "v1"),
+                Param::new("k2", "v2"),
+                Param::new("k3", "v3"),
+            ],
+        };
+
+        // check kernel params from string
+        let kernel_params = KernelParams::from_string(&expect_params_string);
+        assert_eq!(kernel_params, expect_params);
+
+        // check kernel params to string
+        let kernel_params_string = expect_params.to_string()?;
+        assert_eq!(kernel_params_string, expect_params_string);
+
+        Ok(())
+    }
+}
diff --git a/src/runtime-rs/crates/hypervisor/src/lib.rs b/src/runtime-rs/crates/hypervisor/src/lib.rs
new file mode 100644
index 0000000000..095ebd6629
--- /dev/null
+++ b/src/runtime-rs/crates/hypervisor/src/lib.rs
@@ -0,0 +1,65 @@
+// Copyright (c) 2019-2022 Alibaba Cloud
+// Copyright (c) 2019-2022 Ant Group
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#[macro_use]
+extern crate slog;
+
+logging::logger_with_subsystem!(sl, "hypervisor");
+
+pub mod device;
+pub use device::*;
+pub mod dragonball;
+mod kernel_param;
+pub use kernel_param::Param;
+mod utils;
+
+use std::collections::HashMap;
+
+use anyhow::Result;
+use async_trait::async_trait;
+use kata_types::config::hypervisor::Hypervisor as HypervisorConfig;
+
+// Config which driver to use as vm root dev
+const VM_ROOTFS_DRIVER_BLK: &str = "virtio-blk";
+const VM_ROOTFS_DRIVER_PMEM: &str = "virtio-pmem";
+
+#[derive(PartialEq)]
+pub(crate) enum VmmState {
+    NotReady,
+    VmmServerReady,
+    VmRunning,
+}
+
+// vcpu mapping from vcpu number to thread number
+#[derive(Debug)]
+pub struct VcpuThreadIds {
+    pub vcpus: HashMap<u32, u32>,
+}
+
+#[async_trait]
+pub trait Hypervisor: Send + Sync {
+    // vm manager
+    async fn prepare_vm(&self, id: &str, netns: Option<String>) -> Result<()>;
+    async fn start_vm(&self, timeout: i32) -> Result<()>;
+    async fn stop_vm(&self) -> Result<()>;
+    async fn pause_vm(&self) -> Result<()>;
+    async fn save_vm(&self) -> Result<()>;
+    async fn resume_vm(&self) -> Result<()>;
+
+    // device manager
+    async fn add_device(&self, device: device::Device) -> Result<()>;
+    async fn remove_device(&self, device: device::Device) -> Result<()>;
+
+    // utils
+    async fn get_agent_socket(&self) -> Result<String>;
+    async fn disconnect(&self);
+    async fn hypervisor_config(&self) -> HypervisorConfig;
+    async fn get_thread_ids(&self) -> Result<VcpuThreadIds>;
+    async fn get_pids(&self) -> Result<Vec<u32>>;
+    async fn cleanup(&self) -> Result<()>;
+    async fn check(&self) ->
Result<()>;
+    async fn get_jailer_root(&self) -> Result<String>;
+}
diff --git a/src/runtime-rs/crates/hypervisor/src/utils.rs b/src/runtime-rs/crates/hypervisor/src/utils.rs
new file mode 100644
index 0000000000..8ecf989500
--- /dev/null
+++ b/src/runtime-rs/crates/hypervisor/src/utils.rs
@@ -0,0 +1,27 @@
+// Copyright (c) 2019-2022 Alibaba Cloud
+// Copyright (c) 2019-2022 Ant Group
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+use std::collections::HashSet;
+
+pub fn get_child_threads(pid: u32) -> HashSet<u32> {
+    let mut result = HashSet::new();
+    let path_name = format!("/proc/{}/task", pid);
+    let path = std::path::Path::new(path_name.as_str());
+    if path.is_dir() {
+        if let Ok(dir) = path.read_dir() {
+            for entity in dir {
+                if let Ok(entity) = entity.as_ref() {
+                    let file_name = entity.file_name();
+                    let file_name = file_name.to_str().unwrap_or_default();
+                    if let Ok(tid) = file_name.parse::<u32>() {
+                        result.insert(tid);
+                    }
+                }
+            }
+        }
+    }
+    result
+}
diff --git a/src/runtime-rs/crates/resource/Cargo.toml b/src/runtime-rs/crates/resource/Cargo.toml
new file mode 100644
index 0000000000..60b761b96b
--- /dev/null
+++ b/src/runtime-rs/crates/resource/Cargo.toml
@@ -0,0 +1,33 @@
+[package]
+name = "resource"
+version = "0.1.0"
+authors = ["The Kata Containers community <kata-dev@lists.katacontainers.io>"]
+edition = "2018"
+
+[dependencies]
+anyhow = "^1.0"
+async-trait = "0.1.48"
+bitflags = "1.2.1"
+cgroups-rs = "0.2.9"
+futures = "0.3.11"
+lazy_static = "1.4.0"
+libc = ">=0.2.39"
+netlink-sys = "0.8.3"
+netlink-packet-route = "0.13.0"
+nix = "0.24.1"
+rand = "^0.7.2"
+rtnetlink = "0.11.0"
+scopeguard = "1.0.0"
+slog = "2.5.2"
+slog-scope = "4.4.0"
+tokio = { version = "1.8.0", features = ["process"] }
+uuid = { version = "0.4", features = ["v4"] }
+
+agent = { path = "../agent" }
+hypervisor = { path = "../hypervisor" }
+kata-types = { path = "../../../libs/kata-types" }
+kata-sys-util = { path = "../../../libs/kata-sys-util" }
+logging = { path = "../../../libs/logging" }
+oci = { path =
"../../../libs/oci" } +actix-rt = "2.7.0" +[features] diff --git a/src/runtime-rs/crates/resource/src/cgroups/mod.rs b/src/runtime-rs/crates/resource/src/cgroups/mod.rs new file mode 100644 index 0000000000..9a176c2fd8 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/cgroups/mod.rs @@ -0,0 +1,220 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +mod utils; + +use std::{ + collections::{HashMap, HashSet}, + iter::FromIterator, + sync::Arc, +}; + +use anyhow::{anyhow, Context, Result}; +use cgroups_rs::{cgroup_builder::CgroupBuilder, Cgroup, CgroupPid, CpuResources, Resources}; +use hypervisor::Hypervisor; +use kata_sys_util::spec::load_oci_spec; +use kata_types::config::TomlConfig; +use oci::LinuxResources; +use tokio::sync::RwLock; + +pub struct CgroupConfig { + pub path: String, + pub overhead_path: String, + pub sandbox_cgroup_only: bool, +} + +impl CgroupConfig { + fn new(sid: &str, toml_config: &TomlConfig) -> Result { + let overhead_path = utils::gen_overhead_path(sid); + let spec = load_oci_spec()?; + let path = spec + .linux + // The trim of '/' is important, because cgroup_path is a relative path. + .map(|linux| linux.cgroups_path.trim_start_matches('/').to_string()) + .unwrap_or_default(); + + Ok(Self { + path, + overhead_path, + sandbox_cgroup_only: toml_config.runtime.sandbox_cgroup_only, + }) + } +} + +pub struct CgroupsResource { + resources: Arc>>, + cgroup_manager: Cgroup, + overhead_cgroup_manager: Option, +} + +impl CgroupsResource { + pub fn new(sid: &str, toml_config: &TomlConfig) -> Result { + let config = CgroupConfig::new(sid, toml_config)?; + + // Create the sandbox cgroups manager (cgroups on Linux). + // Depending on the sandbox_cgroup_only value, this cgroup + // will either hold all the pod threads (sandbox_cgroup_only is true) + // or only the virtual CPU ones (sandbox_cgroup_only is false). 
+ let hier = cgroups_rs::hierarchies::auto(); + let cgroup_manager = CgroupBuilder::new(&config.path).build(hier); + + // The shim configuration is requesting that we do not put all threads + // into the sandbox resource controller. + // We're creating an overhead controller, with no constraints. Everything but + // the vCPU threads will eventually make it there. + let overhead_cgroup_manager = if !config.sandbox_cgroup_only { + let hier = cgroups_rs::hierarchies::auto(); + Some(CgroupBuilder::new(&config.overhead_path).build(hier)) + } else { + None + }; + + // Add the runtime to the VMM sandbox resource controller + + // By adding the runtime process to either the sandbox or overhead controller, we are making + // sure that any child process of the runtime (i.e. *all* processes serving a Kata pod) + // will initially live in this controller. Depending on the sandbox_cgroup_only settings, we will + // then move the vCPU threads between resource controllers. + let pid = CgroupPid { pid: 0 }; + if let Some(manager) = overhead_cgroup_manager.as_ref() { + manager.add_task_by_tgid(pid).context("add task by tgid")?; + } else { + cgroup_manager + .add_task_by_tgid(pid) + .context("add task by tgid with sandbox only")?; + } + + Ok(Self { + cgroup_manager, + resources: Arc::new(RwLock::new(HashMap::new())), + overhead_cgroup_manager, + }) + } + + /// delete will move the running processes in the cgroup_manager and + /// overhead_cgroup_manager to the parent and then delete the cgroups. 
+ pub async fn delete(&self) -> Result<()> { + for cg_pid in self.cgroup_manager.tasks() { + self.cgroup_manager.remove_task(cg_pid); + } + + self.cgroup_manager + .delete() + .context("delete cgroup manager")?; + + if let Some(overhead) = self.overhead_cgroup_manager.as_ref() { + for cg_pid in overhead.tasks() { + overhead.remove_task(cg_pid); + } + overhead.delete().context("delete overhead")?; + } + + Ok(()) + } + + pub async fn update_cgroups( + &self, + cid: &str, + linux_resources: Option<&LinuxResources>, + h: &dyn Hypervisor, + ) -> Result<()> { + let resource = self.calc_resource(linux_resources); + let changed = self.update_resources(cid, resource).await; + + if !changed { + return Ok(()); + } + + self.do_update_cgroups(h).await + } + + async fn update_resources(&self, cid: &str, new_resource: Resources) -> bool { + let mut resources = self.resources.write().await; + let old_resource = resources.insert(cid.to_owned(), new_resource.clone()); + + if let Some(old_resource) = old_resource { + if old_resource == new_resource { + return false; + } + } + + true + } + + async fn do_update_cgroups(&self, h: &dyn Hypervisor) -> Result<()> { + let merged_resources = self.merge_resources().await; + self.cgroup_manager + .apply(&merged_resources) + .map_err(|e| anyhow!(e))?; + + if self.overhead_cgroup_manager.is_some() { + // If we have an overhead controller, new vCPU threads would start there, + // as being children of the VMM PID. + // We need to constrain them by moving them into the sandbox controller. + self.constrain_hypervisor(h).await? + } + + Ok(()) + } + + /// constrain_hypervisor will place the VMM and vCPU threads into resource controllers (cgroups on Linux). + async fn constrain_hypervisor(&self, h: &dyn Hypervisor) -> Result<()> { + let tids = h.get_thread_ids().await?; + let tids = tids.vcpus.values(); + + // All vCPU threads move to the sandbox controller. + for tid in tids { + self.cgroup_manager + .add_task_by_tgid(CgroupPid { pid: *tid as u64 })? 
+ } + + Ok(()) + } + + async fn merge_resources(&self) -> Resources { + let resources = self.resources.read().await; + + let mut cpu_list: HashSet = HashSet::new(); + let mut mem_list: HashSet = HashSet::new(); + + resources.values().for_each(|r| { + if let Some(cpus) = &r.cpu.cpus { + cpu_list.insert(cpus.clone()); + } + if let Some(mems) = &r.cpu.mems { + mem_list.insert(mems.clone()); + } + }); + + let cpu_resource = CpuResources { + cpus: Some(Vec::from_iter(cpu_list.into_iter()).join(",")), + mems: Some(Vec::from_iter(mem_list.into_iter()).join(",")), + ..Default::default() + }; + + Resources { + cpu: cpu_resource, + ..Default::default() + } + } + + fn calc_cpu_resources(&self, linux_resources: Option<&LinuxResources>) -> CpuResources { + let cpu = || -> Option { linux_resources.as_ref()?.cpu.clone() }(); + + CpuResources { + cpus: cpu.clone().map(|cpu| cpu.cpus), + mems: cpu.map(|cpu| cpu.mems), + ..Default::default() + } + } + + fn calc_resource(&self, linux_resources: Option<&LinuxResources>) -> Resources { + Resources { + cpu: self.calc_cpu_resources(linux_resources), + ..Default::default() + } + } +} diff --git a/src/runtime-rs/crates/resource/src/cgroups/utils.rs b/src/runtime-rs/crates/resource/src/cgroups/utils.rs new file mode 100644 index 0000000000..7a2d630982 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/cgroups/utils.rs @@ -0,0 +1,16 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +// When the Kata overhead threads (I/O, VMM, etc) are not +// placed in the sandbox resource controller (A cgroup on Linux), +// they are moved to a specific, unconstrained resource controller. +// On Linux, assuming the cgroup mount point is at /sys/fs/cgroup/, +// on a cgroup v1 system, the Kata overhead memory cgroup will be at +// /sys/fs/cgroup/memory/kata_overhead/$CGPATH where $CGPATH is +// defined by the orchestrator. 
+pub(crate) fn gen_overhead_path(path: &str) -> String {
+    format!("kata_overhead/{}", path.trim_start_matches('/'))
+}
diff --git a/src/runtime-rs/crates/resource/src/lib.rs b/src/runtime-rs/crates/resource/src/lib.rs
new file mode 100644
index 0000000000..28ffc56019
--- /dev/null
+++ b/src/runtime-rs/crates/resource/src/lib.rs
@@ -0,0 +1,31 @@
+// Copyright (c) 2019-2022 Alibaba Cloud
+// Copyright (c) 2019-2022 Ant Group
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#[macro_use]
+extern crate lazy_static;
+
+#[macro_use]
+extern crate slog;
+
+logging::logger_with_subsystem!(sl, "resource");
+
+pub mod cgroups;
+pub mod manager;
+mod manager_inner;
+pub mod network;
+use network::NetworkConfig;
+pub mod rootfs;
+pub mod share_fs;
+pub mod volume;
+pub use manager::ResourceManager;
+
+use kata_types::config::hypervisor::SharedFsInfo;
+
+#[derive(Debug)]
+pub enum ResourceConfig {
+    Network(NetworkConfig),
+    ShareFs(SharedFsInfo),
+}
diff --git a/src/runtime-rs/crates/resource/src/manager.rs b/src/runtime-rs/crates/resource/src/manager.rs
new file mode 100644
index 0000000000..94cf3138f0
--- /dev/null
+++ b/src/runtime-rs/crates/resource/src/manager.rs
@@ -0,0 +1,97 @@
+// Copyright (c) 2019-2022 Alibaba Cloud
+// Copyright (c) 2019-2022 Ant Group
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+use std::sync::Arc;
+
+use agent::{Agent, Storage};
+use anyhow::Result;
+use hypervisor::Hypervisor;
+use kata_types::config::TomlConfig;
+use kata_types::mount::Mount;
+use oci::LinuxResources;
+use tokio::sync::RwLock;
+
+use crate::{manager_inner::ResourceManagerInner, rootfs::Rootfs, volume::Volume, ResourceConfig};
+
+pub struct ResourceManager {
+    inner: Arc<RwLock<ResourceManagerInner>>,
+}
+
+impl ResourceManager {
+    pub fn new(
+        sid: &str,
+        agent: Arc<dyn Agent>,
+        hypervisor: Arc<dyn Hypervisor>,
+        toml_config: Arc<TomlConfig>,
+    ) -> Result<Self> {
+        Ok(Self {
+            inner: Arc::new(RwLock::new(ResourceManagerInner::new(
+                sid,
+                agent,
+                hypervisor,
+                toml_config,
+            )?)),
+        })
+    }
+
+    pub async fn config(&self) -> Arc<TomlConfig> {
+        let
inner = self.inner.read().await; + inner.config() + } + + pub async fn prepare_before_start_vm(&self, device_configs: Vec) -> Result<()> { + let mut inner = self.inner.write().await; + inner.prepare_before_start_vm(device_configs).await + } + + pub async fn setup_after_start_vm(&self) -> Result<()> { + let mut inner = self.inner.write().await; + inner.setup_after_start_vm().await + } + + pub async fn get_storage_for_sandbox(&self) -> Result> { + let inner = self.inner.read().await; + inner.get_storage_for_sandbox().await + } + + pub async fn handler_rootfs( + &self, + cid: &str, + bundle_path: &str, + rootfs_mounts: &[Mount], + ) -> Result> { + let inner = self.inner.read().await; + inner.handler_rootfs(cid, bundle_path, rootfs_mounts).await + } + + pub async fn handler_volumes( + &self, + cid: &str, + oci_mounts: &[oci::Mount], + ) -> Result>> { + let inner = self.inner.read().await; + inner.handler_volumes(cid, oci_mounts).await + } + + pub async fn dump(&self) { + let inner = self.inner.read().await; + inner.dump().await + } + + pub async fn update_cgroups( + &self, + cid: &str, + linux_resources: Option<&LinuxResources>, + ) -> Result<()> { + let inner = self.inner.read().await; + inner.update_cgroups(cid, linux_resources).await + } + + pub async fn delete_cgroups(&self) -> Result<()> { + let inner = self.inner.read().await; + inner.delete_cgroups().await + } +} diff --git a/src/runtime-rs/crates/resource/src/manager_inner.rs b/src/runtime-rs/crates/resource/src/manager_inner.rs new file mode 100644 index 0000000000..fb90a25a60 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/manager_inner.rs @@ -0,0 +1,200 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::sync::Arc; + +use agent::{Agent, Storage}; +use anyhow::{Context, Result}; +use hypervisor::Hypervisor; +use kata_types::config::TomlConfig; +use kata_types::mount::Mount; +use oci::LinuxResources; + +use 
crate::{ + cgroups::CgroupsResource, + network::{self, Network}, + rootfs::{RootFsResource, Rootfs}, + share_fs::{self, ShareFs}, + volume::{Volume, VolumeResource}, + ResourceConfig, +}; + +pub(crate) struct ResourceManagerInner { + sid: String, + toml_config: Arc, + agent: Arc, + hypervisor: Arc, + network: Option>, + share_fs: Option>, + + pub rootfs_resource: RootFsResource, + pub volume_resource: VolumeResource, + pub cgroups_resource: CgroupsResource, +} + +impl ResourceManagerInner { + pub(crate) fn new( + sid: &str, + agent: Arc, + hypervisor: Arc, + toml_config: Arc, + ) -> Result { + let cgroups_resource = CgroupsResource::new(sid, &toml_config)?; + Ok(Self { + sid: sid.to_string(), + toml_config, + agent, + hypervisor, + network: None, + share_fs: None, + rootfs_resource: RootFsResource::new(), + volume_resource: VolumeResource::new(), + cgroups_resource, + }) + } + + pub fn config(&self) -> Arc { + self.toml_config.clone() + } + + pub async fn prepare_before_start_vm( + &mut self, + device_configs: Vec, + ) -> Result<()> { + for dc in device_configs { + match dc { + ResourceConfig::ShareFs(c) => { + let share_fs = share_fs::new(&self.sid, &c).context("new share fs")?; + share_fs + .setup_device_before_start_vm(self.hypervisor.as_ref()) + .await + .context("setup share fs device before start vm")?; + self.share_fs = Some(share_fs); + } + ResourceConfig::Network(c) => { + let d = network::new(&c).await.context("new network")?; + d.setup(self.hypervisor.as_ref()) + .await + .context("setup network")?; + self.network = Some(d) + } + }; + } + + Ok(()) + } + + async fn handle_interfaces(&self, network: &dyn Network) -> Result<()> { + for i in network.interfaces().await.context("get interfaces")? 
{ + // update interface + info!(sl!(), "update interface {:?}", i); + self.agent + .update_interface(agent::UpdateInterfaceRequest { interface: Some(i) }) + .await + .context("update interface")?; + } + + Ok(()) + } + + async fn handle_neighbours(&self, network: &dyn Network) -> Result<()> { + let neighbors = network.neighs().await.context("neighs")?; + if !neighbors.is_empty() { + info!(sl!(), "update neighbors {:?}", neighbors); + self.agent + .add_arp_neighbors(agent::AddArpNeighborRequest { + neighbors: Some(agent::ARPNeighbors { neighbors }), + }) + .await + .context("update neighbors")?; + } + Ok(()) + } + + async fn handle_routes(&self, network: &dyn Network) -> Result<()> { + let routes = network.routes().await.context("routes")?; + if !routes.is_empty() { + info!(sl!(), "update routes {:?}", routes); + self.agent + .update_routes(agent::UpdateRoutesRequest { + route: Some(agent::Routes { routes }), + }) + .await + .context("update routes")?; + } + Ok(()) + } + + pub async fn setup_after_start_vm(&mut self) -> Result<()> { + if let Some(share_fs) = self.share_fs.as_ref() { + share_fs + .setup_device_after_start_vm(self.hypervisor.as_ref()) + .await + .context("setup share fs device after start vm")?; + } + + if let Some(network) = self.network.as_ref() { + let network = network.as_ref(); + self.handle_interfaces(network) + .await + .context("handle interfaces")?; + self.handle_neighbours(network) + .await + .context("handle neighbors")?; + self.handle_routes(network).await.context("handle routes")?; + } + Ok(()) + } + + pub async fn get_storage_for_sandbox(&self) -> Result> { + let mut storages = vec![]; + if let Some(d) = self.share_fs.as_ref() { + let mut s = d.get_storages().await.context("get storage")?; + storages.append(&mut s); + } + Ok(storages) + } + + pub async fn handler_rootfs( + &self, + cid: &str, + bundle_path: &str, + rootfs_mounts: &[Mount], + ) -> Result> { + self.rootfs_resource + .handler_rootfs(&self.share_fs, cid, bundle_path, 
rootfs_mounts) + .await + } + + pub async fn handler_volumes( + &self, + cid: &str, + oci_mounts: &[oci::Mount], + ) -> Result>> { + self.volume_resource + .handler_volumes(&self.share_fs, cid, oci_mounts) + .await + } + + pub async fn update_cgroups( + &self, + cid: &str, + linux_resources: Option<&LinuxResources>, + ) -> Result<()> { + self.cgroups_resource + .update_cgroups(cid, linux_resources, self.hypervisor.as_ref()) + .await + } + + pub async fn delete_cgroups(&self) -> Result<()> { + self.cgroups_resource.delete().await + } + + pub async fn dump(&self) { + self.rootfs_resource.dump().await; + self.volume_resource.dump().await; + } +} diff --git a/src/runtime-rs/crates/resource/src/network/endpoint/endpoints_test.rs b/src/runtime-rs/crates/resource/src/network/endpoint/endpoints_test.rs new file mode 100644 index 0000000000..90623d59cb --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/endpoint/endpoints_test.rs @@ -0,0 +1,369 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +#[cfg(test)] +mod tests { + use anyhow::Context; + use netlink_packet_route::MACVLAN_MODE_PRIVATE; + use scopeguard::defer; + + use std::sync::Arc; + + use crate::network::{ + endpoint::{IPVlanEndpoint, MacVlanEndpoint, VlanEndpoint}, + network_model::{ + self, + tc_filter_model::{fetch_index, TcFilterModel}, + NetworkModelType, TC_FILTER_NET_MODEL_STR, + }, + network_pair::{NetworkInterface, NetworkPair, TapInterface}, + }; + + // this unit test tests the integrity of MacVlanEndpoint::new() + #[actix_rt::test] + async fn test_vlan_construction() { + let idx = 8193; + let mac_addr = String::from("02:78:CA:FE:00:04"); + let manual_vlan_iface_name = format!("eth{}", idx); + let tap_iface_name = format!("tap{}_kata", idx); // create by NetworkPair::new() + let dummy_name = format!("dummy{}", idx); + let vlanid = 123; + + if let Ok((conn, handle, _)) = + 
rtnetlink::new_connection().context("failed to create netlink connection") + { + let thread_handler = tokio::spawn(conn); + defer!({ + thread_handler.abort(); + }); + + if let Ok(()) = handle + .link() + .add() + .dummy(dummy_name.clone()) + .execute() + .await + .context("failed to create dummy link") + { + let dummy_index = fetch_index(&handle, dummy_name.clone().as_str()) + .await + .expect("failed to get the index of dummy link"); + + // since IPVlanEndpoint::new() needs an EXISTING virt_iface (which is created + // by containerd normally), we have to manually create a virt_iface. + if let Ok(()) = handle + .link() + .add() + .vlan(manual_vlan_iface_name.clone(), dummy_index, vlanid) + .execute() + .await + .context("failed to create manual veth pair") + { + if let Ok(mut result) = VlanEndpoint::new(&handle, "", idx, 5) + .await + .context("failed to create new ipvlan endpoint") + { + let manual = VlanEndpoint { + net_pair: NetworkPair { + tap: TapInterface { + id: String::from("uniqueTestID_kata"), + name: format!("br{}_kata", idx), + tap_iface: NetworkInterface { + name: tap_iface_name.clone(), + ..Default::default() + }, + }, + virt_iface: NetworkInterface { + name: manual_vlan_iface_name.clone(), + hard_addr: mac_addr.clone(), + ..Default::default() + }, + model: Arc::new(TcFilterModel::new().unwrap()), // impossible to panic + network_qos: false, + }, + }; + + result.net_pair.tap.id = String::from("uniqueTestID_kata"); + result.net_pair.tap.tap_iface.hard_addr = String::from(""); + result.net_pair.virt_iface.hard_addr = mac_addr.clone(); + + // check the integrity by compare all variables + assert_eq!(manual.net_pair.tap.id, result.net_pair.tap.id); + assert_eq!(manual.net_pair.tap.name, result.net_pair.tap.name); + assert_eq!( + manual.net_pair.tap.tap_iface.name, + result.net_pair.tap.tap_iface.name + ); + assert_eq!( + manual.net_pair.tap.tap_iface.hard_addr, + result.net_pair.tap.tap_iface.hard_addr + ); + assert_eq!( + 
manual.net_pair.tap.tap_iface.addrs, + result.net_pair.tap.tap_iface.addrs + ); + assert_eq!( + manual.net_pair.virt_iface.name, + result.net_pair.virt_iface.name + ); + assert_eq!( + manual.net_pair.virt_iface.hard_addr, + result.net_pair.virt_iface.hard_addr + ); + // using match branch to avoid deriving PartialEq trait + match manual.net_pair.model.model_type() { + NetworkModelType::TcFilter => {} // ok + _ => unreachable!(), + } + match result.net_pair.model.model_type() { + NetworkModelType::TcFilter => {} + _ => unreachable!(), + } + assert_eq!(manual.net_pair.network_qos, result.net_pair.network_qos); + } + let link_index = fetch_index(&handle, manual_vlan_iface_name.as_str()) + .await + .expect("failed to fetch index"); + assert!(handle.link().del(link_index).execute().await.is_ok()); + let link_index = fetch_index(&handle, tap_iface_name.as_str()) + .await + .expect("failed to fetch index"); + assert!(handle.link().del(link_index).execute().await.is_ok()); + assert!(handle.link().del(dummy_index).execute().await.is_ok()); + } + } + } + } + + // this unit test tests the integrity of VlanEndpoint::new() + #[actix_rt::test] + async fn test_macvlan_construction() { + let idx = 8194; + let mac_addr = String::from("02:25:CA:FE:00:04"); + let manual_macvlan_iface_name = format!("eth{}", idx); + let tap_iface_name = format!("tap{}_kata", idx); // create by NetworkPair::new() + let model_str = TC_FILTER_NET_MODEL_STR; + let dummy_name = format!("dummy{}", idx); + + if let Ok((conn, handle, _)) = + rtnetlink::new_connection().context("failed to create netlink connection") + { + let thread_handler = tokio::spawn(conn); + defer!({ + thread_handler.abort(); + }); + + if let Ok(()) = handle + .link() + .add() + .dummy(dummy_name.clone()) + .execute() + .await + .context("failed to create dummy link") + { + let dummy_index = fetch_index(&handle, dummy_name.clone().as_str()) + .await + .expect("failed to get the index of dummy link"); + + // the mode here does not matter, 
could be any of available modes + if let Ok(()) = handle + .link() + .add() + .macvlan( + manual_macvlan_iface_name.clone(), + dummy_index, + MACVLAN_MODE_PRIVATE, + ) + .execute() + .await + .context("failed to create manual macvlan pair") + { + // model here does not matter, could be any of supported models + if let Ok(mut result) = MacVlanEndpoint::new( + &handle, + manual_macvlan_iface_name.clone().as_str(), + idx, + model_str, + 5, + ) + .await + .context("failed to create new macvlan endpoint") + { + let manual = MacVlanEndpoint { + net_pair: NetworkPair { + tap: TapInterface { + id: String::from("uniqueTestID_kata"), + name: format!("br{}_kata", idx), + tap_iface: NetworkInterface { + name: tap_iface_name.clone(), + ..Default::default() + }, + }, + virt_iface: NetworkInterface { + name: manual_macvlan_iface_name.clone(), + hard_addr: mac_addr.clone(), + ..Default::default() + }, + model: network_model::new(model_str) + .expect("failed to create new network model"), + network_qos: false, + }, + }; + + result.net_pair.tap.id = String::from("uniqueTestID_kata"); + result.net_pair.tap.tap_iface.hard_addr = String::from(""); + result.net_pair.virt_iface.hard_addr = mac_addr.clone(); + + // check the integrity by compare all variables + assert_eq!(manual.net_pair.tap.id, result.net_pair.tap.id); + assert_eq!(manual.net_pair.tap.name, result.net_pair.tap.name); + assert_eq!( + manual.net_pair.tap.tap_iface.name, + result.net_pair.tap.tap_iface.name + ); + assert_eq!( + manual.net_pair.tap.tap_iface.hard_addr, + result.net_pair.tap.tap_iface.hard_addr + ); + assert_eq!( + manual.net_pair.tap.tap_iface.addrs, + result.net_pair.tap.tap_iface.addrs + ); + assert_eq!( + manual.net_pair.virt_iface.name, + result.net_pair.virt_iface.name + ); + assert_eq!( + manual.net_pair.virt_iface.hard_addr, + result.net_pair.virt_iface.hard_addr + ); + // using match branch to avoid deriving PartialEq trait + // TcFilter model is hard-coded "model_str" variable + match 
manual.net_pair.model.model_type() { + NetworkModelType::TcFilter => {} // ok + _ => unreachable!(), + } + match result.net_pair.model.model_type() { + NetworkModelType::TcFilter => {} + _ => unreachable!(), + } + assert_eq!(manual.net_pair.network_qos, result.net_pair.network_qos); + } + // delete the manually created links + let link_index = fetch_index(&handle, manual_macvlan_iface_name.as_str()) + .await + .expect("failed to fetch index"); + assert!(handle.link().del(link_index).execute().await.is_ok()); + let link_index = fetch_index(&handle, tap_iface_name.as_str()) + .await + .expect("failed to fetch index"); + assert!(handle.link().del(link_index).execute().await.is_ok()); + assert!(handle.link().del(dummy_index).execute().await.is_ok()); + } + } + } + } + + // this unit test tests the integrity of IPVlanEndpoint::new() + #[actix_rt::test] + async fn test_ipvlan_construction() { + let idx = 8192; + let mac_addr = String::from("02:00:CA:FE:00:04"); + let manual_virt_iface_name = format!("eth{}", idx); + let tap_iface_name = format!("tap{}_kata", idx); // create by kata + + if let Ok((conn, handle, _)) = + rtnetlink::new_connection().context("failed to create netlink connection") + { + let thread_handler = tokio::spawn(conn); + defer!({ + thread_handler.abort(); + }); + + // since IPVlanEndpoint::new() needs an EXISTING virt_iface (which is created + // by containerd normally), we have to manually create a virt_iface. 
+ if let Ok(()) = handle + .link() + .add() + .veth("foo".to_string(), manual_virt_iface_name.clone()) + .execute() + .await + .context("failed to create manual veth pair") + { + if let Ok(mut result) = IPVlanEndpoint::new(&handle, "", idx, 5) + .await + .context("failed to create new ipvlan endpoint") + { + let manual = IPVlanEndpoint { + net_pair: NetworkPair { + tap: TapInterface { + id: String::from("uniqueTestID_kata"), + name: format!("br{}_kata", idx), + tap_iface: NetworkInterface { + name: tap_iface_name.clone(), + ..Default::default() + }, + }, + virt_iface: NetworkInterface { + name: manual_virt_iface_name.clone(), + hard_addr: mac_addr.clone(), + ..Default::default() + }, + model: Arc::new(TcFilterModel::new().unwrap()), // impossible to panic + network_qos: false, + }, + }; + + result.net_pair.tap.id = String::from("uniqueTestID_kata"); + result.net_pair.tap.tap_iface.hard_addr = String::from(""); + result.net_pair.virt_iface.hard_addr = mac_addr.clone(); + + // check the integrity by compare all variables + assert_eq!(manual.net_pair.tap.id, result.net_pair.tap.id); + assert_eq!(manual.net_pair.tap.name, result.net_pair.tap.name); + assert_eq!( + manual.net_pair.tap.tap_iface.name, + result.net_pair.tap.tap_iface.name + ); + assert_eq!( + manual.net_pair.tap.tap_iface.hard_addr, + result.net_pair.tap.tap_iface.hard_addr + ); + assert_eq!( + manual.net_pair.tap.tap_iface.addrs, + result.net_pair.tap.tap_iface.addrs + ); + assert_eq!( + manual.net_pair.virt_iface.name, + result.net_pair.virt_iface.name + ); + assert_eq!( + manual.net_pair.virt_iface.hard_addr, + result.net_pair.virt_iface.hard_addr + ); + // using match branch to avoid deriving PartialEq trait + match manual.net_pair.model.model_type() { + NetworkModelType::TcFilter => {} // ok + _ => unreachable!(), + } + match result.net_pair.model.model_type() { + NetworkModelType::TcFilter => {} + _ => unreachable!(), + } + assert_eq!(manual.net_pair.network_qos, result.net_pair.network_qos); + } + 
let link_index = fetch_index(&handle, manual_virt_iface_name.as_str()) + .await + .expect("failed to fetch index"); + assert!(handle.link().del(link_index).execute().await.is_ok()); + let link_index = fetch_index(&handle, tap_iface_name.as_str()) + .await + .expect("failed to fetch index"); + assert!(handle.link().del(link_index).execute().await.is_ok()); + } + } + } +} diff --git a/src/runtime-rs/crates/resource/src/network/endpoint/ipvlan_endpoint.rs b/src/runtime-rs/crates/resource/src/network/endpoint/ipvlan_endpoint.rs new file mode 100644 index 0000000000..5f31002786 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/endpoint/ipvlan_endpoint.rs @@ -0,0 +1,90 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::io::{self, Error}; + +use anyhow::{Context, Result}; +use async_trait::async_trait; + +use super::Endpoint; +use crate::network::network_model::TC_FILTER_NET_MODEL_STR; +use crate::network::{utils, NetworkPair}; +use hypervisor::{device::NetworkConfig, Device, Hypervisor}; + +// IPVlanEndpoint is the endpoint bridged to VM +#[derive(Debug)] +pub struct IPVlanEndpoint { + pub(crate) net_pair: NetworkPair, +} + +impl IPVlanEndpoint { + pub async fn new( + handle: &rtnetlink::Handle, + name: &str, + idx: u32, + queues: usize, + ) -> Result { + // tc filter network model is the only one works for ipvlan + let net_pair = NetworkPair::new(handle, idx, name, TC_FILTER_NET_MODEL_STR, queues) + .await + .context("error creating new NetworkPair")?; + Ok(IPVlanEndpoint { net_pair }) + } + + fn get_network_config(&self) -> Result { + let iface = &self.net_pair.tap.tap_iface; + let guest_mac = utils::parse_mac(&iface.hard_addr).ok_or_else(|| { + Error::new( + io::ErrorKind::InvalidData, + format!("hard_addr {}", &iface.hard_addr), + ) + })?; + Ok(NetworkConfig { + id: self.net_pair.virt_iface.name.clone(), + host_dev_name: iface.name.clone(), + guest_mac: 
Some(guest_mac), + }) + } +} + +#[async_trait] +impl Endpoint for IPVlanEndpoint { + async fn name(&self) -> String { + self.net_pair.virt_iface.name.clone() + } + + async fn hardware_addr(&self) -> String { + self.net_pair.tap.tap_iface.hard_addr.clone() + } + + async fn attach(&self, h: &dyn Hypervisor) -> Result<()> { + self.net_pair + .add_network_model() + .await + .context("error adding network model")?; + let config = self.get_network_config().context("get network config")?; + h.add_device(Device::Network(config)) + .await + .context("error adding device by hypervisor")?; + + Ok(()) + } + + async fn detach(&self, h: &dyn Hypervisor) -> Result<()> { + self.net_pair + .del_network_model() + .await + .context("error deleting network model")?; + let config = self + .get_network_config() + .context("error getting network config")?; + h.remove_device(Device::Network(config)) + .await + .context("error removing device by hypervisor")?; + + Ok(()) + } +} diff --git a/src/runtime-rs/crates/resource/src/network/endpoint/macvlan_endpoint.rs b/src/runtime-rs/crates/resource/src/network/endpoint/macvlan_endpoint.rs new file mode 100644 index 0000000000..21f22345d8 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/endpoint/macvlan_endpoint.rs @@ -0,0 +1,84 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::io::{self, Error}; + +use anyhow::{Context, Result}; +use async_trait::async_trait; +use hypervisor::{device::NetworkConfig, Device, Hypervisor}; + +use super::Endpoint; +use crate::network::{utils, NetworkPair}; + +#[derive(Debug)] +pub struct MacVlanEndpoint { + pub(crate) net_pair: NetworkPair, +} + +impl MacVlanEndpoint { + pub async fn new( + handle: &rtnetlink::Handle, + name: &str, + idx: u32, + model: &str, + queues: usize, + ) -> Result { + let net_pair = NetworkPair::new(handle, idx, name, model, queues) + .await + .context("error creating new 
networkInterfacePair")?; + Ok(MacVlanEndpoint { net_pair }) + } + + fn get_network_config(&self) -> Result { + let iface = &self.net_pair.tap.tap_iface; + let guest_mac = utils::parse_mac(&iface.hard_addr).ok_or_else(|| { + Error::new( + io::ErrorKind::InvalidData, + format!("hard_addr {}", &iface.hard_addr), + ) + })?; + Ok(NetworkConfig { + id: self.net_pair.virt_iface.name.clone(), + host_dev_name: iface.name.clone(), + guest_mac: Some(guest_mac), + }) + } +} + +#[async_trait] +impl Endpoint for MacVlanEndpoint { + async fn name(&self) -> String { + self.net_pair.virt_iface.name.clone() + } + + async fn hardware_addr(&self) -> String { + self.net_pair.tap.tap_iface.hard_addr.clone() + } + + async fn attach(&self, h: &dyn Hypervisor) -> Result<()> { + self.net_pair + .add_network_model() + .await + .context("add network model")?; + let config = self.get_network_config().context("get network config")?; + h.add_device(Device::Network(config)) + .await + .context("Error add device")?; + Ok(()) + } + + async fn detach(&self, h: &dyn Hypervisor) -> Result<()> { + self.net_pair + .del_network_model() + .await + .context("del network model")?; + let config = self.get_network_config().context("get network config")?; + h.remove_device(Device::Network(config)) + .await + .context("remove device")?; + Ok(()) + } +} diff --git a/src/runtime-rs/crates/resource/src/network/endpoint/mod.rs b/src/runtime-rs/crates/resource/src/network/endpoint/mod.rs new file mode 100644 index 0000000000..9e5e841c82 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/endpoint/mod.rs @@ -0,0 +1,29 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +mod physical_endpoint; +pub use physical_endpoint::PhysicalEndpoint; +mod veth_endpoint; +pub use veth_endpoint::VethEndpoint; +mod ipvlan_endpoint; +pub use ipvlan_endpoint::IPVlanEndpoint; +mod vlan_endpoint; +pub use vlan_endpoint::VlanEndpoint; +mod 
macvlan_endpoint; +pub use macvlan_endpoint::MacVlanEndpoint; +mod endpoints_test; + +use anyhow::Result; +use async_trait::async_trait; +use hypervisor::Hypervisor; + +#[async_trait] +pub trait Endpoint: std::fmt::Debug + Send + Sync { + async fn name(&self) -> String; + async fn hardware_addr(&self) -> String; + async fn attach(&self, hypervisor: &dyn Hypervisor) -> Result<()>; + async fn detach(&self, hypervisor: &dyn Hypervisor) -> Result<()>; +} diff --git a/src/runtime-rs/crates/resource/src/network/endpoint/physical_endpoint.rs b/src/runtime-rs/crates/resource/src/network/endpoint/physical_endpoint.rs new file mode 100644 index 0000000000..ffdfb5848b --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/endpoint/physical_endpoint.rs @@ -0,0 +1,144 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::path::Path; + +use anyhow::{anyhow, Context, Result}; +use async_trait::async_trait; +use hypervisor::{device, Hypervisor}; + +use super::Endpoint; +use crate::network::utils::{self, link}; + +pub const SYS_PCI_DEVICES_PATH: &str = "/sys/bus/pci/devices"; + +#[derive(Debug)] +pub struct VendorDevice { + vendor_id: String, + device_id: String, +} + +impl VendorDevice { + pub fn new(vendor_id: &str, device_id: &str) -> Result { + if vendor_id.is_empty() || device_id.is_empty() { + return Err(anyhow!( + "invalid parameters vendor_id {} device_id {}", + vendor_id, + device_id + )); + } + Ok(Self { + vendor_id: vendor_id.to_string(), + device_id: device_id.to_string(), + }) + } + + pub fn vendor_device_id(&self) -> String { + format!("{}_{}", &self.vendor_id, &self.device_id) + } +} + +#[derive(Debug)] +pub struct PhysicalEndpoint { + iface_name: String, + hard_addr: String, + bdf: String, + driver: String, + vendor_device_id: VendorDevice, +} + +impl PhysicalEndpoint { + pub fn new(name: &str, hardware_addr: &[u8]) -> Result { + let driver_info = 
link::get_driver_info(name).context("get driver info")?; + let bdf = driver_info.bus_info; + let sys_pci_devices_path = Path::new(SYS_PCI_DEVICES_PATH); + // get driver by following symlink /sys/bus/pci/devices/$bdf/driver + let driver_path = sys_pci_devices_path.join(&bdf).join("driver"); + let link = driver_path.read_link().context("read link")?; + let driver = link + .file_name() + .map_or(String::new(), |v| v.to_str().unwrap().to_owned()); + + // get vendor and device id from pci space (sys/bus/pci/devices/$bdf) + let iface_device_path = sys_pci_devices_path.join(&bdf).join("device"); + let device_id = std::fs::read_to_string(&iface_device_path) + .with_context(|| format!("read device path {:?}", &iface_device_path))?; + + let iface_vendor_path = sys_pci_devices_path.join(&bdf).join("vendor"); + let vendor_id = std::fs::read_to_string(&iface_vendor_path) + .with_context(|| format!("read vendor path {:?}", &iface_vendor_path))?; + + Ok(Self { + iface_name: name.to_string(), + hard_addr: utils::get_mac_addr(hardware_addr).context("get mac addr")?, + vendor_device_id: VendorDevice::new(&vendor_id, &device_id) + .context("new vendor device")?, + driver, + bdf, + }) + } +} + +#[async_trait] +impl Endpoint for PhysicalEndpoint { + async fn name(&self) -> String { + self.iface_name.clone() + } + + async fn hardware_addr(&self) -> String { + self.hard_addr.clone() + } + + async fn attach(&self, hypervisor: &dyn Hypervisor) -> Result<()> { + // bind physical interface from host driver and bind to vfio + device::bind_device_to_vfio( + &self.bdf, + &self.driver, + &self.vendor_device_id.vendor_device_id(), + ) + .with_context(|| format!("bind physical endpoint from {} to vfio", &self.driver))?; + + // set vfio's bus type, pci or mmio. Mostly use pci by default. 
+ let mode = match self.driver.as_str() { + "virtio-pci" => "mmio", + _ => "pci", + }; + + // add vfio device + let d = device::Device::Vfio(device::VfioConfig { + id: format!("physical_nic_{}", self.name().await), + sysfs_path: "".to_string(), + bus_slot_func: self.bdf.clone(), + mode: device::VfioBusMode::new(mode) + .with_context(|| format!("new vfio bus mode {:?}", mode))?, + }); + hypervisor.add_device(d).await.context("add device")?; + Ok(()) + } + + // detach for physical endpoint unbinds the physical network interface from vfio-pci + // and binds it back to the saved host driver. + async fn detach(&self, _hypervisor: &dyn Hypervisor) -> Result<()> { + // bind back the physical network interface to host. + // we need to do this even if a new network namespace has not + // been created by virt-containers. + + // we do not need to enter the network namespace to bind back the + // physical interface to host driver. + device::bind_device_to_host( + &self.bdf, + &self.driver, + &self.vendor_device_id.vendor_device_id(), + ) + .with_context(|| { + format!( + "bind physical endpoint device from vfio to {}", + &self.driver + ) + })?; + Ok(()) + } +} diff --git a/src/runtime-rs/crates/resource/src/network/endpoint/veth_endpoint.rs b/src/runtime-rs/crates/resource/src/network/endpoint/veth_endpoint.rs new file mode 100644 index 0000000000..c1bfb4c464 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/endpoint/veth_endpoint.rs @@ -0,0 +1,84 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::io::{self, Error}; + +use anyhow::{Context, Result}; +use async_trait::async_trait; +use hypervisor::{device::NetworkConfig, Device, Hypervisor}; + +use super::Endpoint; +use crate::network::{utils, NetworkPair}; + +#[derive(Debug)] +pub struct VethEndpoint { + net_pair: NetworkPair, +} + +impl VethEndpoint { + pub async fn new( + handle: &rtnetlink::Handle, + name: &str, + idx: 
u32, + model: &str, + queues: usize, + ) -> Result { + let net_pair = NetworkPair::new(handle, idx, name, model, queues) + .await + .context("new networkInterfacePair")?; + Ok(VethEndpoint { net_pair }) + } + + fn get_network_config(&self) -> Result { + let iface = &self.net_pair.tap.tap_iface; + let guest_mac = utils::parse_mac(&iface.hard_addr).ok_or_else(|| { + Error::new( + io::ErrorKind::InvalidData, + format!("hard_addr {}", &iface.hard_addr), + ) + })?; + Ok(NetworkConfig { + id: self.net_pair.virt_iface.name.clone(), + host_dev_name: iface.name.clone(), + guest_mac: Some(guest_mac), + }) + } +} + +#[async_trait] +impl Endpoint for VethEndpoint { + async fn name(&self) -> String { + self.net_pair.virt_iface.name.clone() + } + + async fn hardware_addr(&self) -> String { + self.net_pair.tap.tap_iface.hard_addr.clone() + } + + async fn attach(&self, h: &dyn Hypervisor) -> Result<()> { + self.net_pair + .add_network_model() + .await + .context("add network model")?; + let config = self.get_network_config().context("get network config")?; + h.add_device(Device::Network(config)) + .await + .context("Error add device")?; + Ok(()) + } + + async fn detach(&self, h: &dyn Hypervisor) -> Result<()> { + self.net_pair + .del_network_model() + .await + .context("del network model")?; + let config = self.get_network_config().context("get network config")?; + h.remove_device(Device::Network(config)) + .await + .context("remove device")?; + Ok(()) + } +} diff --git a/src/runtime-rs/crates/resource/src/network/endpoint/vlan_endpoint.rs b/src/runtime-rs/crates/resource/src/network/endpoint/vlan_endpoint.rs new file mode 100644 index 0000000000..14626318cf --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/endpoint/vlan_endpoint.rs @@ -0,0 +1,88 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::io::{self, Error}; + +use anyhow::{Context, Result}; +use 
async_trait::async_trait; + +use super::Endpoint; +use crate::network::network_model::TC_FILTER_NET_MODEL_STR; +use crate::network::{utils, NetworkPair}; +use hypervisor::{device::NetworkConfig, Device, Hypervisor}; + +#[derive(Debug)] +pub struct VlanEndpoint { + pub(crate) net_pair: NetworkPair, +} + +impl VlanEndpoint { + pub async fn new( + handle: &rtnetlink::Handle, + name: &str, + idx: u32, + queues: usize, + ) -> Result { + let net_pair = NetworkPair::new(handle, idx, name, TC_FILTER_NET_MODEL_STR, queues) + .await + .context("error creating networkInterfacePair")?; + Ok(VlanEndpoint { net_pair }) + } + + fn get_network_config(&self) -> Result { + let iface = &self.net_pair.tap.tap_iface; + let guest_mac = utils::parse_mac(&iface.hard_addr).ok_or_else(|| { + Error::new( + io::ErrorKind::InvalidData, + format!("hard_addr {}", &iface.hard_addr), + ) + })?; + Ok(NetworkConfig { + id: self.net_pair.virt_iface.name.clone(), + host_dev_name: iface.name.clone(), + guest_mac: Some(guest_mac), + }) + } +} + +#[async_trait] +impl Endpoint for VlanEndpoint { + async fn name(&self) -> String { + self.net_pair.virt_iface.name.clone() + } + + async fn hardware_addr(&self) -> String { + self.net_pair.tap.tap_iface.hard_addr.clone() + } + + async fn attach(&self, h: &dyn Hypervisor) -> Result<()> { + self.net_pair + .add_network_model() + .await + .context("error adding network model")?; + let config = self.get_network_config().context("get network config")?; + h.add_device(Device::Network(config)) + .await + .context("error adding device by hypervisor")?; + + Ok(()) + } + + async fn detach(&self, h: &dyn Hypervisor) -> Result<()> { + self.net_pair + .del_network_model() + .await + .context("error deleting network model")?; + let config = self + .get_network_config() + .context("error getting network config")?; + h.remove_device(Device::Network(config)) + .await + .context("error removing device by hypervisor")?; + + Ok(()) + } +} diff --git 
a/src/runtime-rs/crates/resource/src/network/mod.rs b/src/runtime-rs/crates/resource/src/network/mod.rs new file mode 100644 index 0000000000..7193a6d921 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/mod.rs @@ -0,0 +1,48 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +mod endpoint; +pub use endpoint::Endpoint; +mod network_entity; +mod network_info; +pub use network_info::NetworkInfo; +mod network_model; +pub use network_model::NetworkModel; +mod network_with_netns; +pub use network_with_netns::NetworkWithNetNsConfig; +use network_with_netns::NetworkWithNetns; +mod network_pair; +use network_pair::NetworkPair; +mod utils; + +use std::sync::Arc; + +use anyhow::{Context, Result}; +use async_trait::async_trait; +use hypervisor::Hypervisor; + +#[derive(Debug)] +pub enum NetworkConfig { + NetworkResourceWithNetNs(NetworkWithNetNsConfig), +} + +#[async_trait] +pub trait Network: Send + Sync { + async fn setup(&self, h: &dyn Hypervisor) -> Result<()>; + async fn interfaces(&self) -> Result>; + async fn routes(&self) -> Result>; + async fn neighs(&self) -> Result>; +} + +pub async fn new(config: &NetworkConfig) -> Result> { + match config { + NetworkConfig::NetworkResourceWithNetNs(c) => Ok(Arc::new( + NetworkWithNetns::new(c) + .await + .context("new network with netns")?, + )), + } +} diff --git a/src/runtime-rs/crates/resource/src/network/network_entity.rs b/src/runtime-rs/crates/resource/src/network/network_entity.rs new file mode 100644 index 0000000000..5182dfe4b0 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/network_entity.rs @@ -0,0 +1,24 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::sync::Arc; + +use super::{Endpoint, NetworkInfo}; + +#[derive(Debug)] +pub(crate) struct NetworkEntity { + pub(crate) endpoint: Arc, + pub(crate) network_info: Arc, +} + 
+impl NetworkEntity { + pub fn new(endpoint: Arc, network_info: Arc) -> Self { + Self { + endpoint, + network_info, + } + } +} diff --git a/src/runtime-rs/crates/resource/src/network/network_info/mod.rs b/src/runtime-rs/crates/resource/src/network/network_info/mod.rs new file mode 100644 index 0000000000..1500d5179e --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/network_info/mod.rs @@ -0,0 +1,18 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +pub(crate) mod network_info_from_link; + +use agent::{ARPNeighbor, Interface, Route}; +use anyhow::Result; +use async_trait::async_trait; + +#[async_trait] +pub trait NetworkInfo: std::fmt::Debug + Send + Sync { + async fn interface(&self) -> Result; + async fn routes(&self) -> Result>; + async fn neighs(&self) -> Result>; +} diff --git a/src/runtime-rs/crates/resource/src/network/network_info/network_info_from_link.rs b/src/runtime-rs/crates/resource/src/network/network_info/network_info_from_link.rs new file mode 100644 index 0000000000..e8d5494915 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/network_info/network_info_from_link.rs @@ -0,0 +1,228 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::convert::TryFrom; + +use agent::{ARPNeighbor, IPAddress, IPFamily, Interface, Route}; +use anyhow::{Context, Result}; +use async_trait::async_trait; +use futures::stream::TryStreamExt; +use netlink_packet_route::{ + self, neighbour::NeighbourMessage, nlas::neighbour::Nla, route::RouteMessage, +}; + +use super::NetworkInfo; +use crate::network::utils::{ + address::{parse_ip, Address}, + link::{self, LinkAttrs}, +}; + +#[derive(Debug)] +pub(crate) struct NetworkInfoFromLink { + interface: Interface, + neighs: Vec, + routes: Vec, +} + +impl NetworkInfoFromLink { + pub async fn new( + handle: &rtnetlink::Handle, + link: &dyn 
link::Link, + hw_addr: &str, + ) -> Result { + let attrs = link.attrs(); + let name = &attrs.name; + + Ok(Self { + interface: Interface { + device: name.clone(), + name: name.clone(), + ip_addresses: handle_addresses(handle, attrs) + .await + .context("handle addresses")?, + mtu: attrs.mtu as u64, + hw_addr: hw_addr.to_string(), + pci_addr: Default::default(), + field_type: link.r#type().to_string(), + raw_flags: attrs.flags & libc::IFF_NOARP as u32, + }, + neighs: handle_neighbors(handle, attrs) + .await + .context("handle neighbours")?, + routes: handle_routes(handle, attrs) + .await + .context("handle routes")?, + }) + } +} + +async fn handle_addresses(handle: &rtnetlink::Handle, attrs: &LinkAttrs) -> Result> { + let mut addr_msg_list = handle + .address() + .get() + .set_link_index_filter(attrs.index) + .execute(); + + let mut addresses = vec![]; + while let Some(addr_msg) = addr_msg_list + .try_next() + .await + .context("try next address msg")? + { + let family = addr_msg.header.family as i32; + if family != libc::AF_INET && family != libc::AF_INET6 { + warn!(sl!(), "unsupported ip family {}", family); + continue; + } + let a = Address::try_from(addr_msg).context("get addr from msg")?; + if a.addr.is_loopback() { + continue; + } + + addresses.push(IPAddress { + family: if a.addr.is_ipv4() { + IPFamily::V4 + } else { + IPFamily::V6 + }, + address: a.addr.to_string(), + mask: a.perfix_len.to_string(), + }); + } + Ok(addresses) +} + +fn generate_neigh(name: &str, n: &NeighbourMessage) -> Result { + let mut neigh = ARPNeighbor { + device: name.to_string(), + state: n.header.state as i32, + ..Default::default() + }; + for nla in &n.nlas { + match nla { + Nla::Destination(addr) => { + let dest = parse_ip(addr, n.header.family).context("parse ip")?; + let addr = Some(IPAddress { + family: if dest.is_ipv4() { + IPFamily::V4 + } else { + IPFamily::V6 + }, + address: dest.to_string(), + mask: "".to_string(), + }); + neigh.to_ip_address = addr; + } + 
Nla::LinkLocalAddress(addr) => { + if addr.len() < 6 { + continue; + } + let lladdr = format!( + "{:<02x}:{:<02x}:{:<02x}:{:<02x}:{:<02x}:{:<02x}", + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5] + ); + neigh.ll_addr = lladdr; + } + _ => { + // skip the unused Nla + } + } + } + + Ok(neigh) +} + +async fn handle_neighbors( + handle: &rtnetlink::Handle, + attrs: &LinkAttrs, +) -> Result> { + let name = &attrs.name; + let mut neighs = vec![]; + let mut neigh_msg_list = handle.neighbours().get().execute(); + while let Some(neigh) = neigh_msg_list + .try_next() + .await + .context("try next neigh msg")? + { + // get neigh filter with index + if neigh.header.ifindex == attrs.index { + neighs.push(generate_neigh(name, &neigh).context("generate neigh")?) + } + } + Ok(neighs) +} + +fn generate_route(name: &str, route: &RouteMessage) -> Result> { + if route.header.protocol == libc::RTPROT_KERNEL { + return Ok(None); + } + + Ok(Some(Route { + dest: route + .destination_prefix() + .map(|(addr, prefix)| format!("{}/{}", addr, prefix)) + .unwrap_or_default(), + gateway: route.gateway().map(|v| v.to_string()).unwrap_or_default(), + device: name.to_string(), + source: route + .source_prefix() + .map(|(addr, _)| addr.to_string()) + .unwrap_or_default(), + scope: route.header.scope as u32, + family: if route.header.address_family == libc::AF_INET as u8 { + IPFamily::V4 + } else { + IPFamily::V6 + }, + })) +} + +async fn get_route_from_msg( + routes: &mut Vec, + handle: &rtnetlink::Handle, + attrs: &LinkAttrs, + ip_version: rtnetlink::IpVersion, +) -> Result<()> { + let name = &attrs.name; + let mut route_msg_list = handle.route().get(ip_version).execute(); + while let Some(route) = route_msg_list.try_next().await? { + // get route filter with index + if let Some(index) = route.output_interface() { + if index == attrs.index { + if let Some(route) = generate_route(name, &route).context("generate route")? 
{ + routes.push(route); + } + } + } + } + Ok(()) +} + +async fn handle_routes(handle: &rtnetlink::Handle, attrs: &LinkAttrs) -> Result> { + let mut routes = vec![]; + get_route_from_msg(&mut routes, handle, attrs, rtnetlink::IpVersion::V4) + .await + .context("get ip v4 route")?; + get_route_from_msg(&mut routes, handle, attrs, rtnetlink::IpVersion::V6) + .await + .context("get ip v6 route")?; + Ok(routes) +} + +#[async_trait] +impl NetworkInfo for NetworkInfoFromLink { + async fn interface(&self) -> Result { + Ok(self.interface.clone()) + } + + async fn routes(&self) -> Result> { + Ok(self.routes.clone()) + } + + async fn neighs(&self) -> Result> { + Ok(self.neighs.clone()) + } +} diff --git a/src/runtime-rs/crates/resource/src/network/network_model/mod.rs b/src/runtime-rs/crates/resource/src/network/network_model/mod.rs new file mode 100644 index 0000000000..16457f1e4f --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/network_model/mod.rs @@ -0,0 +1,46 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +pub mod none_model; +pub mod route_model; +pub mod tc_filter_model; +pub mod test_network_model; +use std::sync::Arc; + +use anyhow::{Context, Result}; +use async_trait::async_trait; + +use super::NetworkPair; + +pub(crate) const TC_FILTER_NET_MODEL_STR: &str = "tcfilter"; +pub(crate) const ROUTE_NET_MODEL_STR: &str = "route"; + +pub enum NetworkModelType { + NoneModel, + TcFilter, + Route, +} + +#[async_trait] +pub trait NetworkModel: std::fmt::Debug + Send + Sync { + fn model_type(&self) -> NetworkModelType; + async fn add(&self, net_pair: &NetworkPair) -> Result<()>; + async fn del(&self, net_pair: &NetworkPair) -> Result<()>; +} + +pub fn new(model: &str) -> Result> { + match model { + TC_FILTER_NET_MODEL_STR => Ok(Arc::new( + tc_filter_model::TcFilterModel::new().context("new tc filter model")?, + )), + ROUTE_NET_MODEL_STR => Ok(Arc::new( + 
route_model::RouteModel::new().context("new route model")?, + )), + _ => Ok(Arc::new( + none_model::NoneModel::new().context("new none model")?, + )), + } +} diff --git a/src/runtime-rs/crates/resource/src/network/network_model/none_model.rs b/src/runtime-rs/crates/resource/src/network/network_model/none_model.rs new file mode 100644 index 0000000000..f68b4d3e22 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/network_model/none_model.rs @@ -0,0 +1,35 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::Result; +use async_trait::async_trait; + +use super::{NetworkModel, NetworkModelType}; +use crate::network::NetworkPair; + +#[derive(Debug)] +pub(crate) struct NoneModel {} + +impl NoneModel { + pub fn new() -> Result { + Ok(Self {}) + } +} + +#[async_trait] +impl NetworkModel for NoneModel { + fn model_type(&self) -> NetworkModelType { + NetworkModelType::NoneModel + } + + async fn add(&self, _pair: &NetworkPair) -> Result<()> { + Ok(()) + } + + async fn del(&self, _pair: &NetworkPair) -> Result<()> { + Ok(()) + } +} diff --git a/src/runtime-rs/crates/resource/src/network/network_model/route_model.rs b/src/runtime-rs/crates/resource/src/network/network_model/route_model.rs new file mode 100644 index 0000000000..cb47bdad21 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/network_model/route_model.rs @@ -0,0 +1,88 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::{anyhow, Context, Result}; +use async_trait::async_trait; +use tokio::process::Command; + +use super::{NetworkModel, NetworkModelType}; +use crate::network::NetworkPair; + +#[derive(Debug)] +pub(crate) struct RouteModel {} + +impl RouteModel { + pub fn new() -> Result { + Ok(Self {}) + } +} + +#[async_trait] +impl NetworkModel for RouteModel { + fn model_type(&self) -> NetworkModelType { + 
NetworkModelType::Route + } + + async fn add(&self, pair: &NetworkPair) -> Result<()> { + let tap_name = &pair.tap.tap_iface.name; + let virt_name = &pair.virt_iface.name; + let virt_iface_addr = pair.virt_iface.addrs[0].addr.to_string(); + + let commands_args = vec![ + vec![ + "rule", "add", "pref", "10", "from", "all", "lookup", "local", + ], + vec!["rule", "del", "pref", "0", "from", "all"], + vec!["rule", "add", "pref", "5", "iif", virt_name, "table", "10"], + vec![ + "route", "replace", "default", "dev", tap_name, "table", "10", + ], + vec![ + "neigh", + "replace", + &virt_iface_addr, + "lladdr", + &pair.virt_iface.hard_addr, + "dev", + tap_name, + ], + ]; + + for ca in commands_args { + let output = Command::new("/sbin/ip") + .args(&ca) + .output() + .await + .with_context(|| format!("run command ip args {:?}", &ca))?; + if !output.status.success() { + return Err(anyhow!( + "run command ip args {:?} error {}", + &ca, + String::from_utf8(output.stderr)? + )); + } + } + + // TODO: support ipv6 + // change sysctl for tap0_kata + // echo 1 > /proc/sys/net/ipv4/conf/tap0_kata/accept_local + let accept_local_path = format!("/proc/sys/net/ipv4/conf/{}/accept_local", &tap_name); + std::fs::write(&accept_local_path, "1") + .with_context(|| format!("Failed to echo 1 > {}", &accept_local_path))?; + + // echo 1 > /proc/sys/net/ipv4/conf/eth0/proxy_arp + // This enabled ARP reply on peer eth0 to prevent without any reply on VPC + let proxy_arp_path = format!("/proc/sys/net/ipv4/conf/{}/proxy_arp", &virt_name); + std::fs::write(&proxy_arp_path, "1") + .with_context(|| format!("Failed to echo 1 > {}", &proxy_arp_path))?; + + Ok(()) + } + + async fn del(&self, _pair: &NetworkPair) -> Result<()> { + todo!() + } +} diff --git a/src/runtime-rs/crates/resource/src/network/network_model/tc_filter_model.rs b/src/runtime-rs/crates/resource/src/network/network_model/tc_filter_model.rs new file mode 100644 index 0000000000..ff689b9b84 --- /dev/null +++ 
b/src/runtime-rs/crates/resource/src/network/network_model/tc_filter_model.rs @@ -0,0 +1,104 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::{Context, Result}; +use async_trait::async_trait; +use rtnetlink::Handle; +use scopeguard::defer; + +use super::{NetworkModel, NetworkModelType}; +use crate::network::NetworkPair; + +#[derive(Debug)] +pub(crate) struct TcFilterModel {} + +impl TcFilterModel { + pub fn new() -> Result { + Ok(Self {}) + } +} + +#[async_trait] +impl NetworkModel for TcFilterModel { + fn model_type(&self) -> NetworkModelType { + NetworkModelType::TcFilter + } + + async fn add(&self, pair: &NetworkPair) -> Result<()> { + let (connection, handle, _) = rtnetlink::new_connection().context("new connection")?; + let thread_handler = tokio::spawn(connection); + + defer!({ + thread_handler.abort(); + }); + + let tap_index = fetch_index(&handle, pair.tap.tap_iface.name.as_str()) + .await + .context("fetch tap by index")?; + let virt_index = fetch_index(&handle, pair.virt_iface.name.as_str()) + .await + .context("fetch virt by index")?; + + handle + .qdisc() + .add(tap_index as i32) + .ingress() + .execute() + .await + .context("add tap ingress")?; + + handle + .qdisc() + .add(virt_index as i32) + .ingress() + .execute() + .await + .context("add virt ingress")?; + + handle + .traffic_filter(tap_index as i32) + .add() + .parent(0xffff0000) + // get protocol with network byte order + .protocol(0x0003_u16.to_be()) + .redirect(virt_index) + .execute() + .await + .context("add redirect for tap")?; + + handle + .traffic_filter(virt_index as i32) + .add() + .parent(0xffff0000) + // get protocol with network byte order + .protocol(0x0003_u16.to_be()) + .redirect(tap_index) + .execute() + .await + .context("add redirect for virt")?; + + Ok(()) + } + + async fn del(&self, pair: &NetworkPair) -> Result<()> { + let (connection, handle, _) = 
rtnetlink::new_connection().context("new connection")?; + let thread_handler = tokio::spawn(connection); + defer!({ + thread_handler.abort(); + }); + let virt_index = fetch_index(&handle, &pair.virt_iface.name).await?; + handle.qdisc().del(virt_index as i32).execute().await?; + Ok(()) + } +} + +pub async fn fetch_index(handle: &Handle, name: &str) -> Result { + let link = crate::network::network_pair::get_link_by_name(handle, name) + .await + .context("get link by name")?; + let base = link.attrs(); + Ok(base.index) +} diff --git a/src/runtime-rs/crates/resource/src/network/network_model/test_network_model.rs b/src/runtime-rs/crates/resource/src/network/network_model/test_network_model.rs new file mode 100644 index 0000000000..bd1bb628f2 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/network_model/test_network_model.rs @@ -0,0 +1,39 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +#[cfg(test)] +mod tests { + use crate::network::{ + network_model::{tc_filter_model::fetch_index, TC_FILTER_NET_MODEL_STR}, + network_pair::NetworkPair, + }; + use anyhow::Context; + use scopeguard::defer; + #[actix_rt::test] + async fn test_tc_redirect_network() { + if let Ok((connection, handle, _)) = rtnetlink::new_connection().context("new connection") { + let thread_handler = tokio::spawn(connection); + defer!({ + thread_handler.abort(); + }); + + handle + .link() + .add() + .veth("foo".to_string(), "bar".to_string()); + + if let Ok(net_pair) = + NetworkPair::new(&handle, 1, "bar", TC_FILTER_NET_MODEL_STR, 2).await + { + if let Ok(index) = fetch_index(&handle, "bar").await { + assert!(net_pair.add_network_model().await.is_ok()); + assert!(net_pair.del_network_model().await.is_ok()); + assert!(handle.link().del(index).execute().await.is_ok()); + } + } + } + } +} diff --git a/src/runtime-rs/crates/resource/src/network/network_pair.rs 
b/src/runtime-rs/crates/resource/src/network/network_pair.rs new file mode 100644 index 0000000000..c96898619b --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/network_pair.rs @@ -0,0 +1,179 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{convert::TryFrom, sync::Arc, usize}; + +use anyhow::{anyhow, Context, Result}; +use futures::stream::TryStreamExt; + +use super::{ + network_model, + utils::{self, address::Address, link}, +}; + +const TAP_SUFFIX: &str = "_kata"; + +#[derive(Default, Copy, Clone, Debug, PartialEq, Eq)] +pub struct NetInterworkingModel(u32); + +#[derive(Default, Debug, Clone)] +pub struct NetworkInterface { + pub name: String, + pub hard_addr: String, + pub addrs: Vec
, +} + +#[derive(Default, Debug)] +pub struct TapInterface { + pub id: String, + pub name: String, + pub tap_iface: NetworkInterface, +} +#[derive(Debug)] +pub struct NetworkPair { + pub tap: TapInterface, + pub virt_iface: NetworkInterface, + pub model: Arc, + pub network_qos: bool, +} +impl NetworkPair { + pub(crate) async fn new( + handle: &rtnetlink::Handle, + idx: u32, + name: &str, + model: &str, + queues: usize, + ) -> Result { + let unique_id = kata_sys_util::rand::UUID::new(); + let model = network_model::new(model).context("new network model")?; + let tap_iface_name = format!("tap{}{}", idx, TAP_SUFFIX); + let virt_iface_name = format!("eth{}", idx); + let tap_link = create_link(handle, &tap_iface_name, queues) + .await + .context("create link")?; + + let virt_link = get_link_by_name(handle, virt_iface_name.clone().as_str()) + .await + .context("get link by name")?; + + let mut virt_addr_msg_list = handle + .address() + .get() + .set_link_index_filter(virt_link.attrs().index) + .execute(); + + let mut virt_address = vec![]; + while let Some(addr_msg) = virt_addr_msg_list.try_next().await? { + let addr = Address::try_from(addr_msg).context("get address from msg")?; + virt_address.push(addr); + } + + // Save the veth MAC address to the TAP so that it can later be used + // to build the hypervisor command line. This MAC address has to be + // the one inside the VM in order to avoid any firewall issues. The + // bridge created by the network plugin on the host actually expects + // to see traffic from this MAC address and not another one. + let tap_hard_addr = + utils::get_mac_addr(&virt_link.attrs().hardware_addr).context("get mac addr")?; + + // Save the TAP Mac address to the virt_iface so that it can later updated + // the guest's gateway IP's mac as this TAP device. This MAC address has + // to be inside the VM in order to the network reach to the gateway. 
+ let virt_hard_addr = + utils::get_mac_addr(&tap_link.attrs().hardware_addr).context("get mac addr")?; + + handle + .link() + .set(tap_link.attrs().index) + .mtu(virt_link.attrs().mtu) + .execute() + .await + .context("set link mtu")?; + + handle + .link() + .set(tap_link.attrs().index) + .up() + .execute() + .await + .context("set link up")?; + + let mut net_pair = NetworkPair { + tap: TapInterface { + id: String::from(&unique_id), + name: format!("br{}{}", idx, TAP_SUFFIX), + tap_iface: NetworkInterface { + name: tap_iface_name, + hard_addr: tap_hard_addr, + ..Default::default() + }, + }, + virt_iface: NetworkInterface { + name: virt_iface_name, + hard_addr: virt_hard_addr, + addrs: virt_address, + }, + model, + network_qos: false, + }; + + if !name.is_empty() { + net_pair.virt_iface.name = String::from(name); + } + + Ok(net_pair) + } + + pub(crate) async fn add_network_model(&self) -> Result<()> { + let model = self.model.clone(); + model.add(self).await.context("add")?; + Ok(()) + } + + pub(crate) async fn del_network_model(&self) -> Result<()> { + let model = self.model.clone(); + model.del(self).await.context("del")?; + Ok(()) + } +} + +pub async fn create_link( + handle: &rtnetlink::Handle, + name: &str, + queues: usize, +) -> Result> { + link::create_link(name, link::LinkType::Tap, queues)?; + + let link = get_link_by_name(handle, name) + .await + .context("get link by name")?; + + let base = link.attrs(); + if base.master_index != 0 { + handle + .link() + .set(base.index) + .master(base.master_index) + .execute() + .await + .context("set index")?; + } + Ok(link) +} + +pub async fn get_link_by_name( + handle: &rtnetlink::Handle, + name: &str, +) -> Result> { + let mut link_msg_list = handle.link().get().match_name(name.to_string()).execute(); + let msg = if let Some(msg) = link_msg_list.try_next().await? 
{ + msg + } else { + return Err(anyhow!("failed to find link by name {}", name)); + }; + + Ok(link::get_link_from_message(msg)) +} diff --git a/src/runtime-rs/crates/resource/src/network/network_with_netns.rs b/src/runtime-rs/crates/resource/src/network/network_with_netns.rs new file mode 100644 index 0000000000..d89e8ab7e6 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/network_with_netns.rs @@ -0,0 +1,239 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::sync::{ + atomic::{AtomicU32, Ordering}, + Arc, +}; + +use anyhow::{anyhow, Context, Result}; +use async_trait::async_trait; +use futures::stream::TryStreamExt; +use hypervisor::Hypervisor; +use scopeguard::defer; +use tokio::sync::RwLock; + +use super::{ + endpoint::{ + Endpoint, IPVlanEndpoint, MacVlanEndpoint, PhysicalEndpoint, VethEndpoint, VlanEndpoint, + }, + network_entity::NetworkEntity, + network_info::network_info_from_link::NetworkInfoFromLink, + utils::{link, netns}, + Network, +}; +use crate::network::NetworkInfo; + +#[derive(Debug)] +pub struct NetworkWithNetNsConfig { + pub network_model: String, + pub netns_path: String, + pub queues: usize, +} + +struct NetworkWithNetnsInner { + netns_path: String, + entity_list: Vec, +} + +impl NetworkWithNetnsInner { + async fn new(config: &NetworkWithNetNsConfig) -> Result { + let entity_list = if config.netns_path.is_empty() { + warn!(sl!(), "skip to scan for empty netns"); + vec![] + } else { + // get endpoint + get_entity_from_netns(config) + .await + .context("get entity from netns")? 
+ }; + Ok(Self { + netns_path: config.netns_path.to_string(), + entity_list, + }) + } +} + +pub(crate) struct NetworkWithNetns { + inner: Arc>, +} + +impl NetworkWithNetns { + pub(crate) async fn new(config: &NetworkWithNetNsConfig) -> Result { + Ok(Self { + inner: Arc::new(RwLock::new(NetworkWithNetnsInner::new(config).await?)), + }) + } +} + +#[async_trait] +impl Network for NetworkWithNetns { + async fn setup(&self, h: &dyn Hypervisor) -> Result<()> { + let inner = self.inner.read().await; + let _netns_guard = netns::NetnsGuard::new(&inner.netns_path).context("net netns guard")?; + for e in &inner.entity_list { + e.endpoint.attach(h).await.context("attach")?; + } + Ok(()) + } + + async fn interfaces(&self) -> Result> { + let inner = self.inner.read().await; + let mut interfaces = vec![]; + for e in &inner.entity_list { + interfaces.push(e.network_info.interface().await.context("interface")?); + } + Ok(interfaces) + } + + async fn routes(&self) -> Result> { + let inner = self.inner.read().await; + let mut routes = vec![]; + for e in &inner.entity_list { + let mut list = e.network_info.routes().await.context("routes")?; + routes.append(&mut list); + } + Ok(routes) + } + + async fn neighs(&self) -> Result> { + let inner = self.inner.read().await; + let mut neighs = vec![]; + for e in &inner.entity_list { + let mut list = e.network_info.neighs().await.context("neighs")?; + neighs.append(&mut list); + } + Ok(neighs) + } +} + +async fn get_entity_from_netns(config: &NetworkWithNetNsConfig) -> Result> { + info!( + sl!(), + "get network entity for config {:?} tid {:?}", + config, + nix::unistd::gettid() + ); + let mut entity_list = vec![]; + let _netns_guard = netns::NetnsGuard::new(&config.netns_path) + .context("net netns guard") + .unwrap(); + let (connection, handle, _) = rtnetlink::new_connection().context("new connection")?; + let thread_handler = tokio::spawn(connection); + defer!({ + thread_handler.abort(); + }); + + let mut links = 
handle.link().get().execute(); + + let idx = AtomicU32::new(0); + while let Some(link) = links.try_next().await? { + let link = link::get_link_from_message(link); + let attrs = link.attrs(); + + if (attrs.flags & libc::IFF_LOOPBACK as u32) != 0 { + continue; + } + + let idx = idx.fetch_add(1, Ordering::Relaxed); + let (endpoint, network_info) = create_endpoint(&handle, link.as_ref(), idx, config) + .await + .context("create endpoint")?; + + entity_list.push(NetworkEntity::new(endpoint, network_info)); + } + + Ok(entity_list) +} + +async fn create_endpoint( + handle: &rtnetlink::Handle, + link: &dyn link::Link, + idx: u32, + config: &NetworkWithNetNsConfig, +) -> Result<(Arc, Arc)> { + let _netns_guard = netns::NetnsGuard::new(&config.netns_path) + .context("net netns guard") + .unwrap(); + let attrs = link.attrs(); + let link_type = link.r#type(); + let endpoint: Arc = if is_physical_iface(&attrs.name)? { + info!( + sl!(), + "physical network interface found: {} {:?}", + &attrs.name, + nix::unistd::gettid() + ); + let t = PhysicalEndpoint::new(&attrs.name, &attrs.hardware_addr) + .context("new physical endpoint")?; + Arc::new(t) + } else { + info!( + sl!(), + "{} network interface found: {}", &link_type, &attrs.name + ); + match link_type { + "veth" => { + let ret = VethEndpoint::new( + handle, + &attrs.name, + idx, + &config.network_model, + config.queues, + ) + .await + .context("veth endpoint")?; + Arc::new(ret) + } + "vlan" => { + let ret = VlanEndpoint::new(handle, &attrs.name, idx, config.queues) + .await + .context("vlan endpoint")?; + Arc::new(ret) + } + "ipvlan" => { + let ret = IPVlanEndpoint::new(handle, &attrs.name, idx, config.queues) + .await + .context("ipvlan endpoint")?; + Arc::new(ret) + } + "macvlan" => { + let ret = MacVlanEndpoint::new( + handle, + &attrs.name, + idx, + &config.network_model, + config.queues, + ) + .await + .context("macvlan endpoint")?; + Arc::new(ret) + } + _ => return Err(anyhow!("unsupported link type: {}", link_type)), + } 
+ }; + + let network_info = Arc::new( + NetworkInfoFromLink::new(handle, link, &endpoint.hardware_addr().await) + .await + .context("network info from link")?, + ); + + info!(sl!(), "network info {:?}", network_info); + + Ok((endpoint, network_info)) +} + +fn is_physical_iface(name: &str) -> Result { + if name == "lo" { + return Ok(false); + } + let driver_info = link::get_driver_info(name)?; + if driver_info.bus_info.split(':').count() != 3 { + return Ok(false); + } + Ok(true) +} diff --git a/src/runtime-rs/crates/resource/src/network/utils/address.rs b/src/runtime-rs/crates/resource/src/network/utils/address.rs new file mode 100644 index 0000000000..0484c9f364 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/utils/address.rs @@ -0,0 +1,87 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{ + convert::TryFrom, + net::{IpAddr, Ipv4Addr, Ipv6Addr}, +}; + +use anyhow::{anyhow, Result}; +use netlink_packet_route::{nlas::address::Nla, AddressMessage, AF_INET, AF_INET6}; + +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct Address { + pub addr: IpAddr, + pub label: String, + pub flags: u32, + pub scope: u8, + pub perfix_len: u8, + pub peer: IpAddr, + pub broadcast: IpAddr, + pub prefered_lft: u32, + pub valid_ltf: u32, +} + +impl TryFrom for Address { + type Error = anyhow::Error; + fn try_from(msg: AddressMessage) -> Result { + let AddressMessage { header, nlas } = msg; + let mut addr = Address { + addr: IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), + peer: IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), + broadcast: IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), + label: String::default(), + flags: 0, + scope: header.scope, + perfix_len: header.prefix_len, + prefered_lft: 0, + valid_ltf: 0, + }; + + for nla in nlas.into_iter() { + match nla { + Nla::Address(a) => { + addr.addr = parse_ip(&a, header.family)?; + } + Nla::Broadcast(b) => { + addr.broadcast = parse_ip(&b, header.family)?; 
+ } + Nla::Label(l) => { + addr.label = l; + } + Nla::Flags(f) => { + addr.flags = f; + } + Nla::CacheInfo(_c) => {} + _ => {} + } + } + + Ok(addr) + } +} + +pub(crate) fn parse_ip(ip: &[u8], family: u8) -> Result { + let support_len = if family as u16 == AF_INET { 4 } else { 16 }; + if ip.len() != support_len { + return Err(anyhow!( + "invalid ip addresses {:?} support {}", + &ip, + support_len + )); + } + match family as u16 { + AF_INET => Ok(IpAddr::V4(Ipv4Addr::new(ip[0], ip[1], ip[2], ip[3]))), + AF_INET6 => { + let mut octets = [0u8; 16]; + octets.copy_from_slice(&ip[..16]); + Ok(IpAddr::V6(Ipv6Addr::from(octets))) + } + _ => { + return Err(anyhow!("unknown IP network family {}", family)); + } + } +} diff --git a/src/runtime-rs/crates/resource/src/network/utils/link/create.rs b/src/runtime-rs/crates/resource/src/network/utils/link/create.rs new file mode 100644 index 0000000000..06bedf79b9 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/utils/link/create.rs @@ -0,0 +1,129 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{ + fs::{File, OpenOptions}, + os::unix::io::AsRawFd, + path::Path, + {io, mem}, +}; + +use anyhow::{Context, Result}; +use nix::ioctl_write_ptr; + +use super::macros::{get_name, set_name}; + +type IfName = [u8; libc::IFNAMSIZ]; + +#[derive(Copy, Clone, Debug)] +#[repr(C)] +struct CreateLinkMap { + pub mem_start: libc::c_ulong, + pub mem_end: libc::c_ulong, + pub base_addr: libc::c_ushort, + pub irq: libc::c_uchar, + pub dma: libc::c_uchar, + pub port: libc::c_uchar, +} + +#[repr(C)] +union CreateLinkIfru { + pub ifr_addr: libc::sockaddr, + pub ifr_dst_addr: libc::sockaddr, + pub ifr_broad_addr: libc::sockaddr, + pub ifr_netmask: libc::sockaddr, + pub ifr_hw_addr: libc::sockaddr, + pub ifr_flags: libc::c_short, + pub ifr_if_index: libc::c_int, + pub ifr_metric: libc::c_int, + pub ifr_mtu: libc::c_int, + pub ifr_map: CreateLinkMap, + pub 
ifr_slave: IfName, + pub ifr_new_name: IfName, + pub ifr_data: *mut libc::c_char, +} + +#[repr(C)] +struct CreateLinkReq { + pub ifr_name: IfName, + pub ifr_ifru: CreateLinkIfru, +} + +impl CreateLinkReq { + pub fn from_name(name: &str) -> io::Result { + let mut req: CreateLinkReq = unsafe { mem::zeroed() }; + req.set_name(name)?; + Ok(req) + } + + pub fn set_name(&mut self, name: &str) -> io::Result<()> { + set_name!(self.ifr_name, name) + } + + pub fn get_name(&self) -> io::Result { + get_name!(self.ifr_name) + } + + pub unsafe fn set_raw_flags(&mut self, raw_flags: libc::c_short) { + self.ifr_ifru.ifr_flags = raw_flags; + } +} + +const DEVICE_PATH: &str = "/dev/net/tun"; + +ioctl_write_ptr!(tun_set_iff, b'T', 202, libc::c_int); +ioctl_write_ptr!(tun_set_persist, b'T', 203, libc::c_int); + +#[derive(Clone, Copy, Debug)] +pub enum LinkType { + #[allow(dead_code)] + Tun, + Tap, +} + +pub fn create_link(name: &str, link_type: LinkType, queues: usize) -> Result<()> { + let mut flags = libc::IFF_VNET_HDR; + flags |= match link_type { + LinkType::Tun => libc::IFF_TUN, + LinkType::Tap => libc::IFF_TAP, + }; + + let queues = if queues == 0 { 1 } else { queues }; + if queues > 1 { + flags |= libc::IFF_MULTI_QUEUE | libc::IFF_NO_PI; + } else { + flags |= libc::IFF_ONE_QUEUE; + }; + + // create first queue + let mut files = vec![]; + let (file, result_name) = create_queue(name, flags)?; + unsafe { + tun_set_persist(file.as_raw_fd(), &1).context("tun set persist")?; + } + files.push(file); + + // create other queues + if queues > 1 { + for _ in 0..queues - 1 { + files.push(create_queue(&result_name, flags)?.0); + } + } + + info!(sl!(), "create link with fds {:?}", files); + Ok(()) +} + +fn create_queue(name: &str, flags: libc::c_int) -> Result<(File, String)> { + let path = Path::new(DEVICE_PATH); + let file = OpenOptions::new().read(true).write(true).open(&path)?; + let mut req = CreateLinkReq::from_name(name)?; + unsafe { + req.set_raw_flags(flags as libc::c_short); + 
tun_set_iff(file.as_raw_fd(), &mut req as *mut _ as *mut _).context("tun set iff")?; + }; + Ok((file, req.get_name()?)) +} diff --git a/src/runtime-rs/crates/resource/src/network/utils/link/driver_info.rs b/src/runtime-rs/crates/resource/src/network/utils/link/driver_info.rs new file mode 100644 index 0000000000..a7269d013a --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/utils/link/driver_info.rs @@ -0,0 +1,102 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{io, mem}; + +use anyhow::{Context, Result}; +use nix::sys::socket::{socket, AddressFamily, SockFlag, SockType}; +use scopeguard::defer; + +use super::macros::{get_name, set_name}; + +/// FW version length +const ETHTOOL_FW_VERSION_LEN: usize = 32; + +/// bus info length +const ETHTOOL_BUS_INFO_LEN: usize = 32; + +/// erom version length +const ETHTOOL_EROM_VERSION_LEN: usize = 32; + +/// driver info +const ETHTOOL_DRIVER_INFO: u32 = 0x00000003; + +/// Ethtool interface define 0x8946 +const IOCTL_ETHTOOL_INTERFACE: u32 = 0x8946; + +nix::ioctl_readwrite_bad!(ioctl_ethtool, IOCTL_ETHTOOL_INTERFACE, DeviceInfoReq); + +#[repr(C)] +pub union DeviceInfoIfru { + pub ifr_addr: libc::sockaddr, + pub ifr_data: *mut libc::c_char, +} + +type IfName = [u8; libc::IFNAMSIZ]; + +#[repr(C)] +pub struct DeviceInfoReq { + pub ifr_name: IfName, + pub ifr_ifru: DeviceInfoIfru, +} + +impl DeviceInfoReq { + pub fn from_name(name: &str) -> io::Result { + let mut req: DeviceInfoReq = unsafe { mem::zeroed() }; + req.set_name(name)?; + Ok(req) + } + + pub fn set_name(&mut self, name: &str) -> io::Result<()> { + set_name!(self.ifr_name, name) + } +} + +#[repr(C)] +#[derive(Debug, Clone)] +struct Driver { + pub cmd: u32, + pub driver: [u8; 32], + pub version: [u8; 32], + pub fw_version: [u8; ETHTOOL_FW_VERSION_LEN], + pub bus_info: [u8; ETHTOOL_BUS_INFO_LEN], + pub erom_version: [u8; ETHTOOL_EROM_VERSION_LEN], + pub reserved2: [u8; 
12], + pub n_priv_flags: u32, + pub n_stats: u32, + pub test_info_len: u32, + pub eedump_len: u32, + pub regdump_len: u32, +} + +#[derive(Debug, Clone)] +pub struct DriverInfo { + pub driver: String, + pub bus_info: String, +} + +pub fn get_driver_info(name: &str) -> Result { + let mut req = DeviceInfoReq::from_name(name).context(format!("ifreq from name {}", name))?; + let mut ereq: Driver = unsafe { mem::zeroed() }; + ereq.cmd = ETHTOOL_DRIVER_INFO; + req.ifr_ifru.ifr_data = &mut ereq as *mut _ as *mut _; + + let fd = socket( + AddressFamily::Inet, + SockType::Datagram, + SockFlag::empty(), + None, + ) + .context("new socket")?; + defer!({ + let _ = nix::unistd::close(fd); + }); + unsafe { ioctl_ethtool(fd, &mut req).context("ioctl ethtool")? }; + Ok(DriverInfo { + driver: get_name!(ereq.driver).context("get driver name")?, + bus_info: get_name!(ereq.bus_info).context("get bus info name")?, + }) +} diff --git a/src/runtime-rs/crates/resource/src/network/utils/link/macros.rs b/src/runtime-rs/crates/resource/src/network/utils/link/macros.rs new file mode 100644 index 0000000000..128a76bb29 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/utils/link/macros.rs @@ -0,0 +1,48 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +macro_rules! set_name { + ($name_field:expr, $name_str:expr) => {{ + let name_c = &::std::ffi::CString::new($name_str.to_owned()).map_err(|_| { + ::std::io::Error::new( + ::std::io::ErrorKind::InvalidInput, + "malformed interface name", + ) + })?; + let name_slice = name_c.as_bytes_with_nul(); + if name_slice.len() > libc::IFNAMSIZ { + return Err(io::Error::new(::std::io::ErrorKind::InvalidInput, "").into()); + } + $name_field[..name_slice.len()].clone_from_slice(name_slice); + + Ok(()) + }}; +} + +macro_rules! 
get_name { + ($name_field:expr) => {{ + let nul_pos = match $name_field.iter().position(|x| *x == 0) { + Some(p) => p, + None => { + return Err(::std::io::Error::new( + ::std::io::ErrorKind::InvalidData, + "malformed interface name", + ) + .into()) + } + }; + + std::ffi::CString::new(&$name_field[..nul_pos]) + .unwrap() + .into_string() + .map_err(|_| { + std::io::Error::new(std::io::ErrorKind::InvalidData, "malformed interface name") + }) + }}; +} + +pub(crate) use get_name; +pub(crate) use set_name; diff --git a/src/runtime-rs/crates/resource/src/network/utils/link/manager.rs b/src/runtime-rs/crates/resource/src/network/utils/link/manager.rs new file mode 100644 index 0000000000..efc43bb704 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/utils/link/manager.rs @@ -0,0 +1,316 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use netlink_packet_route::{ + link::nlas::{Info, InfoBridge, InfoData, InfoKind, Nla}, + LinkMessage, +}; + +use super::{Link, LinkAttrs}; + +pub fn get_link_from_message(mut msg: LinkMessage) -> Box { + let mut base = LinkAttrs { + index: msg.header.index, + flags: msg.header.flags, + link_layer_type: msg.header.link_layer_type, + ..Default::default() + }; + if msg.header.flags & libc::IFF_PROMISC as u32 != 0 { + base.promisc = 1; + } + let mut link: Option> = None; + while let Some(attr) = msg.nlas.pop() { + match attr { + Nla::Info(infos) => { + link = Some(link_info(infos)); + } + Nla::Address(a) => { + base.hardware_addr = a; + } + Nla::IfName(i) => { + base.name = i; + } + Nla::Mtu(m) => { + base.mtu = m; + } + Nla::Link(l) => { + base.parent_index = l; + } + Nla::Master(m) => { + base.master_index = m; + } + Nla::TxQueueLen(t) => { + base.txq_len = t; + } + Nla::IfAlias(a) => { + base.alias = a; + } + Nla::Stats(_s) => {} + Nla::Stats64(_s) => {} + Nla::Xdp(_x) => {} + Nla::ProtoInfo(_) => {} + Nla::OperState(_) => {} + Nla::NetnsId(n) => { 
+ base.net_ns_id = n; + } + Nla::GsoMaxSize(i) => { + base.gso_max_size = i; + } + Nla::GsoMaxSegs(e) => { + base.gso_max_seqs = e; + } + Nla::VfInfoList(_) => {} + Nla::NumTxQueues(t) => { + base.num_tx_queues = t; + } + Nla::NumRxQueues(r) => { + base.num_rx_queues = r; + } + Nla::Group(g) => { + base.group = g; + } + _ => { + // skip unused attr + } + } + } + + let mut ret = link.unwrap_or_else(|| Box::new(Device::default())); + ret.set_attrs(base); + ret +} + +fn link_info(mut infos: Vec) -> Box { + let mut link: Option> = None; + while let Some(info) = infos.pop() { + match info { + Info::Kind(kind) => match kind { + InfoKind::Tun => { + if link.is_none() { + link = Some(Box::new(Tuntap::default())); + } + } + InfoKind::Veth => { + if link.is_none() { + link = Some(Box::new(Veth::default())); + } + } + InfoKind::IpVlan => { + if link.is_none() { + link = Some(Box::new(IpVlan::default())); + } + } + InfoKind::MacVlan => { + if link.is_none() { + link = Some(Box::new(MacVlan::default())); + } + } + InfoKind::Vlan => { + if link.is_none() { + link = Some(Box::new(Vlan::default())); + } + } + InfoKind::Bridge => { + if link.is_none() { + link = Some(Box::new(Bridge::default())); + } + } + _ => { + if link.is_none() { + link = Some(Box::new(Device::default())); + } + } + }, + Info::Data(data) => match data { + InfoData::Tun(_) => { + link = Some(Box::new(Tuntap::default())); + } + InfoData::Veth(_) => { + link = Some(Box::new(Veth::default())); + } + InfoData::IpVlan(_) => { + link = Some(Box::new(IpVlan::default())); + } + InfoData::MacVlan(_) => { + link = Some(Box::new(MacVlan::default())); + } + InfoData::Vlan(_) => { + link = Some(Box::new(Vlan::default())); + } + InfoData::Bridge(ibs) => { + link = Some(Box::new(parse_bridge(ibs))); + } + _ => { + link = Some(Box::new(Device::default())); + } + }, + Info::SlaveKind(_sk) => { + if link.is_none() { + link = Some(Box::new(Device::default())); + } + } + Info::SlaveData(_sd) => { + link = 
Some(Box::new(Device::default())); + } + _ => { + link = Some(Box::new(Device::default())); + } + } + } + link.unwrap() +} + +fn parse_bridge(mut ibs: Vec) -> Bridge { + let mut bridge = Bridge::default(); + while let Some(ib) = ibs.pop() { + match ib { + InfoBridge::HelloTime(ht) => { + bridge.hello_time = ht; + } + InfoBridge::MulticastSnooping(m) => { + bridge.multicast_snooping = m == 1; + } + InfoBridge::VlanFiltering(v) => { + bridge.vlan_filtering = v == 1; + } + _ => {} + } + } + bridge +} +#[derive(Debug, PartialEq, Eq, Clone, Default)] +pub struct Device { + attrs: Option, +} + +impl Link for Device { + fn attrs(&self) -> &LinkAttrs { + self.attrs.as_ref().unwrap() + } + fn set_attrs(&mut self, attr: LinkAttrs) { + self.attrs = Some(attr); + } + fn r#type(&self) -> &'static str { + "device" + } +} + +#[derive(Debug, PartialEq, Eq, Clone, Default)] +pub struct Tuntap { + pub attrs: Option, +} + +impl Link for Tuntap { + fn attrs(&self) -> &LinkAttrs { + self.attrs.as_ref().unwrap() + } + fn set_attrs(&mut self, attr: LinkAttrs) { + self.attrs = Some(attr); + } + fn r#type(&self) -> &'static str { + "tuntap" + } +} + +#[derive(Debug, PartialEq, Eq, Clone, Default)] +pub struct Veth { + attrs: Option, + + /// on create only + pub peer_name: String, +} + +impl Link for Veth { + fn attrs(&self) -> &LinkAttrs { + self.attrs.as_ref().unwrap() + } + fn set_attrs(&mut self, attr: LinkAttrs) { + self.attrs = Some(attr); + } + fn r#type(&self) -> &'static str { + "veth" + } +} + +#[derive(Debug, PartialEq, Eq, Clone, Default)] +pub struct IpVlan { + attrs: Option, + + /// on create only + pub peer_name: String, +} + +impl Link for IpVlan { + fn attrs(&self) -> &LinkAttrs { + self.attrs.as_ref().unwrap() + } + fn set_attrs(&mut self, attr: LinkAttrs) { + self.attrs = Some(attr); + } + fn r#type(&self) -> &'static str { + "ipvlan" + } +} + +#[derive(Debug, PartialEq, Eq, Clone, Default)] +pub struct MacVlan { + attrs: Option, + + /// on create only + pub peer_name: 
String, +} + +impl Link for MacVlan { + fn attrs(&self) -> &LinkAttrs { + self.attrs.as_ref().unwrap() + } + fn set_attrs(&mut self, attr: LinkAttrs) { + self.attrs = Some(attr) + } + fn r#type(&self) -> &'static str { + "macvlan" + } +} + +#[derive(Debug, PartialEq, Eq, Clone, Default)] +pub struct Vlan { + attrs: Option, + + /// on create only + pub peer_name: String, +} + +impl Link for Vlan { + fn attrs(&self) -> &LinkAttrs { + self.attrs.as_ref().unwrap() + } + fn set_attrs(&mut self, attr: LinkAttrs) { + self.attrs = Some(attr); + } + fn r#type(&self) -> &'static str { + "vlan" + } +} + +#[derive(Debug, PartialEq, Eq, Clone, Default)] +pub struct Bridge { + attrs: Option, + pub multicast_snooping: bool, + pub hello_time: u32, + pub vlan_filtering: bool, +} + +impl Link for Bridge { + fn attrs(&self) -> &LinkAttrs { + self.attrs.as_ref().unwrap() + } + fn set_attrs(&mut self, attr: LinkAttrs) { + self.attrs = Some(attr); + } + fn r#type(&self) -> &'static str { + "bridge" + } +} diff --git a/src/runtime-rs/crates/resource/src/network/utils/link/mod.rs b/src/runtime-rs/crates/resource/src/network/utils/link/mod.rs new file mode 100644 index 0000000000..9fcc2b6405 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/utils/link/mod.rs @@ -0,0 +1,145 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +mod create; +pub use create::{create_link, LinkType}; +mod driver_info; +pub use driver_info::{get_driver_info, DriverInfo}; +mod macros; +mod manager; +pub use manager::get_link_from_message; + +use std::os::unix::io::RawFd; + +use netlink_packet_route::link::nlas::State; + +#[derive(Debug, PartialEq, Eq, Clone)] +pub enum Namespace { + NetNsPid(u32), + #[allow(dead_code)] + NetNsFd(RawFd), +} +impl Default for Namespace { + fn default() -> Self { + Self::NetNsPid(0) + } +} + +#[derive(Debug, PartialEq, Eq, Clone)] +pub enum LinkStatistics { + #[allow(dead_code)] + 
Stats(LinkStatistics32), + Stats64(LinkStatistics64), +} +impl Default for LinkStatistics { + fn default() -> Self { + Self::Stats64(LinkStatistics64::default()) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, Default)] +pub struct LinkStatistics32 { + pub rx_packets: u32, + pub tx_packets: u32, + pub rx_bytes: u32, + pub tx_bytes: u32, + pub rx_errors: u32, + pub tx_errors: u32, + pub rx_dropped: u32, + pub tx_dropped: u32, + pub multicast: u32, + pub collisions: u32, + pub rx_length_errors: u32, + pub rx_over_errors: u32, + pub rx_crc_errors: u32, + pub rx_frame_errors: u32, + pub rx_fifo_errors: u32, + pub rx_missed_errors: u32, + pub tx_aborted_errors: u32, + pub tx_carrier_errors: u32, + pub tx_fifo_errors: u32, + pub tx_heartbeat_errors: u32, + pub tx_window_errors: u32, + pub rx_compressed: u32, + pub tx_compressed: u32, +} + +#[derive(Debug, PartialEq, Eq, Clone, Default)] +pub struct LinkStatistics64 { + pub rx_packets: u64, + pub tx_packets: u64, + pub rx_bytes: u64, + pub tx_bytes: u64, + pub rx_errors: u64, + pub tx_errors: u64, + pub rx_dropped: u64, + pub tx_dropped: u64, + pub multicast: u64, + pub collisions: u64, + pub rx_length_errors: u64, + pub rx_over_errors: u64, + pub rx_crc_errors: u64, + pub rx_frame_errors: u64, + pub rx_fifo_errors: u64, + pub rx_missed_errors: u64, + pub tx_aborted_errors: u64, + pub tx_carrier_errors: u64, + pub tx_fifo_errors: u64, + pub tx_heartbeat_errors: u64, + pub tx_window_errors: u64, + pub rx_compressed: u64, + pub tx_compressed: u64, +} + +#[derive(Debug, PartialEq, Eq, Clone, Default)] +pub struct LinkXdp { + pub fd: RawFd, + pub attached: bool, + pub flags: u32, + pub prog_id: u32, +} + +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct OperState(State); +impl Default for OperState { + fn default() -> Self { + Self(State::Unknown) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, Default)] +pub struct LinkAttrs { + pub index: u32, + pub mtu: u32, + pub txq_len: u32, + + pub name: String, + pub hardware_addr: 
Vec, + pub flags: u32, + pub parent_index: u32, + pub master_index: u32, + pub namespace: Namespace, + pub alias: String, + pub statistics: LinkStatistics, + pub promisc: u32, + pub xdp: LinkXdp, + pub link_layer_type: u16, + pub proto_info: Vec, + pub oper_state: OperState, + pub net_ns_id: i32, + pub num_tx_queues: u32, + pub num_rx_queues: u32, + pub gso_max_size: u32, + pub gso_max_seqs: u32, + pub vfs: Vec, + pub group: u32, +} + +pub trait Link: Send + Sync { + fn attrs(&self) -> &LinkAttrs; + fn set_attrs(&mut self, attr: LinkAttrs); + fn r#type(&self) -> &str; +} diff --git a/src/runtime-rs/crates/resource/src/network/utils/mod.rs b/src/runtime-rs/crates/resource/src/network/utils/mod.rs new file mode 100644 index 0000000000..574178c3de --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/utils/mod.rs @@ -0,0 +1,35 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +pub(crate) mod address; +pub(crate) mod link; +pub(crate) mod netns; + +use anyhow::{anyhow, Result}; + +pub(crate) fn parse_mac(s: &str) -> Option { + let v: Vec<_> = s.split(':').collect(); + if v.len() != 6 { + return None; + } + let mut bytes = [0u8; 6]; + for i in 0..6 { + bytes[i] = u8::from_str_radix(v[i], 16).ok()?; + } + + Some(hypervisor::Address(bytes)) +} + +pub(crate) fn get_mac_addr(b: &[u8]) -> Result { + if b.len() != 6 { + return Err(anyhow!("invalid mac address {:?}", b)); + } else { + Ok(format!( + "{:02x}:{:02x}:{:02x}:{:02x}:{:02x}:{:02x}", + b[0], b[1], b[2], b[3], b[4], b[5] + )) + } +} diff --git a/src/runtime-rs/crates/resource/src/network/utils/netns.rs b/src/runtime-rs/crates/resource/src/network/utils/netns.rs new file mode 100644 index 0000000000..a2a29dc971 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/network/utils/netns.rs @@ -0,0 +1,51 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// 
+ +use std::{fs::File, os::unix::io::AsRawFd}; + +use anyhow::{Context, Result}; +use nix::sched::{setns, CloneFlags}; +use nix::unistd::{getpid, gettid}; + +pub(crate) struct NetnsGuard { + old_netns: Option, +} + +impl NetnsGuard { + pub(crate) fn new(new_netns_path: &str) -> Result { + let old_netns = if !new_netns_path.is_empty() { + let current_netns_path = format!("/proc/{}/task/{}/ns/{}", getpid(), gettid(), "net"); + let old_netns = File::open(¤t_netns_path) + .with_context(|| format!("open current netns path {}", ¤t_netns_path))?; + let new_netns = File::open(&new_netns_path) + .with_context(|| format!("open new netns path {}", &new_netns_path))?; + setns(new_netns.as_raw_fd(), CloneFlags::CLONE_NEWNET) + .with_context(|| "set netns to new netns")?; + info!( + sl!(), + "set netns from old {:?} to new {:?} tid {}", + old_netns, + new_netns, + gettid().to_string() + ); + Some(old_netns) + } else { + warn!(sl!(), "skip to set netns for empty netns path"); + None + }; + Ok(Self { old_netns }) + } +} + +impl Drop for NetnsGuard { + fn drop(&mut self) { + if let Some(old_netns) = self.old_netns.as_ref() { + let old_netns_fd = old_netns.as_raw_fd(); + setns(old_netns_fd, CloneFlags::CLONE_NEWNET).unwrap(); + info!(sl!(), "set netns to old {:?}", old_netns_fd); + } + } +} diff --git a/src/runtime-rs/crates/resource/src/rootfs/mod.rs b/src/runtime-rs/crates/resource/src/rootfs/mod.rs new file mode 100644 index 0000000000..fcf796e550 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/rootfs/mod.rs @@ -0,0 +1,121 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +mod share_fs_rootfs; + +use std::{sync::Arc, vec::Vec}; + +use anyhow::{anyhow, Context, Result}; +use async_trait::async_trait; +use kata_types::mount::Mount; +use nix::sys::stat::{self, SFlag}; +use tokio::sync::RwLock; + +use crate::share_fs::ShareFs; + +const ROOTFS: &str = "rootfs"; + +#[async_trait] +pub trait 
Rootfs: Send + Sync { + async fn get_guest_rootfs_path(&self) -> Result; + async fn get_rootfs_mount(&self) -> Result>; +} + +#[derive(Default)] +struct RootFsResourceInner { + rootfs: Vec>, +} + +pub struct RootFsResource { + inner: Arc>, +} + +impl Default for RootFsResource { + fn default() -> Self { + Self::new() + } +} + +impl RootFsResource { + pub fn new() -> Self { + Self { + inner: Arc::new(RwLock::new(RootFsResourceInner::default())), + } + } + + pub async fn handler_rootfs( + &self, + share_fs: &Option>, + cid: &str, + bundle_path: &str, + rootfs_mounts: &[Mount], + ) -> Result> { + match rootfs_mounts { + mounts_vec if is_single_layer_rootfs(mounts_vec) => { + // Safe as single_layer_rootfs must have one layer + let layer = &mounts_vec[0]; + + let rootfs = if let Some(share_fs) = share_fs { + // share fs rootfs + let share_fs_mount = share_fs.get_share_fs_mount(); + share_fs_rootfs::ShareFsRootfs::new(&share_fs_mount, cid, bundle_path, layer) + .await + .context("new share fs rootfs")? 
+ } else { + return Err(anyhow!("unsupported rootfs {:?}", &layer)); + }; + + let mut inner = self.inner.write().await; + let r = Arc::new(rootfs); + inner.rootfs.push(r.clone()); + Ok(r) + } + _ => { + return Err(anyhow!( + "unsupported rootfs mounts count {}", + rootfs_mounts.len() + )) + } + } + } + + pub async fn dump(&self) { + let inner = self.inner.read().await; + for r in &inner.rootfs { + info!( + sl!(), + "rootfs {:?}: count {}", + r.get_guest_rootfs_path().await, + Arc::strong_count(r) + ); + } + } +} + +fn is_single_layer_rootfs(rootfs_mounts: &[Mount]) -> bool { + rootfs_mounts.len() == 1 +} + +#[allow(dead_code)] +fn get_block_device(file_path: &str) -> Option { + if file_path.is_empty() { + return None; + } + + match stat::stat(file_path) { + Ok(fstat) => { + if SFlag::from_bits_truncate(fstat.st_mode) == SFlag::S_IFBLK { + return Some(fstat.st_rdev); + } + } + Err(err) => { + error!(sl!(), "failed to stat for {} {:?}", file_path, err); + return None; + } + }; + + None +} diff --git a/src/runtime-rs/crates/resource/src/rootfs/share_fs_rootfs.rs b/src/runtime-rs/crates/resource/src/rootfs/share_fs_rootfs.rs new file mode 100644 index 0000000000..643af13fed --- /dev/null +++ b/src/runtime-rs/crates/resource/src/rootfs/share_fs_rootfs.rs @@ -0,0 +1,59 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::sync::Arc; + +use anyhow::{Context, Result}; +use async_trait::async_trait; +use kata_sys_util::mount::Mounter; +use kata_types::mount::Mount; + +use super::{Rootfs, ROOTFS}; +use crate::share_fs::{ShareFsMount, ShareFsRootfsConfig}; + +pub(crate) struct ShareFsRootfs { + guest_path: String, +} + +impl ShareFsRootfs { + pub async fn new( + share_fs_mount: &Arc, + cid: &str, + bundle_path: &str, + rootfs: &Mount, + ) -> Result { + let bundle_rootfs = format!("{}/{}", bundle_path, ROOTFS); + rootfs.mount(&bundle_rootfs).context(format!( + "mount rootfs from {:?} to 
{}", + &rootfs, &bundle_rootfs + ))?; + + let mount_result = share_fs_mount + .share_rootfs(ShareFsRootfsConfig { + cid: cid.to_string(), + source: bundle_rootfs.to_string(), + target: ROOTFS.to_string(), + readonly: false, + }) + .await + .context("share rootfs")?; + + Ok(ShareFsRootfs { + guest_path: mount_result.guest_path, + }) + } +} + +#[async_trait] +impl Rootfs for ShareFsRootfs { + async fn get_guest_rootfs_path(&self) -> Result { + Ok(self.guest_path.clone()) + } + + async fn get_rootfs_mount(&self) -> Result> { + todo!() + } +} diff --git a/src/runtime-rs/crates/resource/src/share_fs/mod.rs b/src/runtime-rs/crates/resource/src/share_fs/mod.rs new file mode 100644 index 0000000000..36f4f1ec26 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/share_fs/mod.rs @@ -0,0 +1,78 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +mod share_virtio_fs; +mod share_virtio_fs_inline; +use share_virtio_fs_inline::ShareVirtioFsInline; +mod share_virtio_fs_standalone; +use share_virtio_fs_standalone::ShareVirtioFsStandalone; +mod utils; +mod virtio_fs_share_mount; +use virtio_fs_share_mount::VirtiofsShareMount; + +use std::sync::Arc; + +use agent::Storage; +use anyhow::{anyhow, Context, Result}; +use async_trait::async_trait; +use hypervisor::Hypervisor; +use kata_types::config::hypervisor::SharedFsInfo; + +const VIRTIO_FS: &str = "virtio-fs"; +const INLINE_VIRTIO_FS: &str = "inline-virtio-fs"; + +const KATA_HOST_SHARED_DIR: &str = "/run/kata-containers/shared/sandboxes/"; +const KATA_GUEST_SHARE_DIR: &str = "/run/kata-containers/shared/containers/"; +pub(crate) const DEFAULT_KATA_GUEST_SANDBOX_DIR: &str = "/run/kata-containers/sandbox/"; + +const PASSTHROUGH_FS_DIR: &str = "passthrough"; + +#[async_trait] +pub trait ShareFs: Send + Sync { + fn get_share_fs_mount(&self) -> Arc; + async fn setup_device_before_start_vm(&self, h: &dyn Hypervisor) -> Result<()>; + async fn 
setup_device_after_start_vm(&self, h: &dyn Hypervisor) -> Result<()>; + async fn get_storages(&self) -> Result>; +} + +pub struct ShareFsRootfsConfig { + // TODO: for nydus v5/v6 need to update ShareFsMount + pub cid: String, + pub source: String, + pub target: String, + pub readonly: bool, +} + +pub struct ShareFsVolumeConfig { + pub cid: String, + pub source: String, + pub target: String, + pub readonly: bool, +} + +pub struct ShareFsMountResult { + pub guest_path: String, +} + +#[async_trait] +pub trait ShareFsMount: Send + Sync { + async fn share_rootfs(&self, config: ShareFsRootfsConfig) -> Result; + async fn share_volume(&self, config: ShareFsVolumeConfig) -> Result; +} + +pub fn new(id: &str, config: &SharedFsInfo) -> Result> { + let shared_fs = config.shared_fs.clone(); + let shared_fs = shared_fs.unwrap_or_default(); + match shared_fs.as_str() { + INLINE_VIRTIO_FS => Ok(Arc::new( + ShareVirtioFsInline::new(id, config).context("new inline virtio fs")?, + )), + VIRTIO_FS => Ok(Arc::new( + ShareVirtioFsStandalone::new(id, config).context("new standalone virtio fs")?, + )), + _ => Err(anyhow!("unsupported shred fs {:?}", &shared_fs)), + } +} diff --git a/src/runtime-rs/crates/resource/src/share_fs/share_virtio_fs.rs b/src/runtime-rs/crates/resource/src/share_fs/share_virtio_fs.rs new file mode 100644 index 0000000000..f1a5bc5fe2 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/share_fs/share_virtio_fs.rs @@ -0,0 +1,53 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::path::Path; + +use anyhow::{Context, Result}; +use hypervisor::{device, Hypervisor}; +use kata_sys_util::mount; + +use super::utils; + +pub(crate) const MOUNT_GUEST_TAG: &str = "kataShared"; +pub(crate) const PASSTHROUGH_FS_DIR: &str = "passthrough"; + +pub(crate) const FS_TYPE_VIRTIO_FS: &str = "virtiofs"; +pub(crate) const KATA_VIRTIO_FS_DEV_TYPE: &str = "virtio-fs"; + +const VIRTIO_FS_SOCKET: &str 
= "virtiofsd.sock"; + +pub(crate) fn generate_sock_path(root: &str) -> String { + let socket_path = Path::new(root).join(VIRTIO_FS_SOCKET); + socket_path.to_str().unwrap().to_string() +} + +pub(crate) async fn prepare_virtiofs( + h: &dyn Hypervisor, + fs_type: &str, + id: &str, + root: &str, +) -> Result<()> { + let host_ro_dest = utils::get_host_ro_shared_path(id); + utils::ensure_dir_exist(&host_ro_dest)?; + + let host_rw_dest = utils::get_host_rw_shared_path(id); + utils::ensure_dir_exist(&host_rw_dest)?; + + mount::bind_mount_unchecked(&host_rw_dest, &host_ro_dest, true) + .context("bind mount shared_fs directory")?; + + let share_fs_device = device::Device::ShareFsDevice(device::ShareFsDeviceConfig { + sock_path: generate_sock_path(root), + mount_tag: String::from(MOUNT_GUEST_TAG), + host_path: String::from(host_ro_dest.to_str().unwrap()), + fs_type: fs_type.to_string(), + queue_size: 0, + queue_num: 0, + }); + h.add_device(share_fs_device).await.context("add device")?; + Ok(()) +} diff --git a/src/runtime-rs/crates/resource/src/share_fs/share_virtio_fs_inline.rs b/src/runtime-rs/crates/resource/src/share_fs/share_virtio_fs_inline.rs new file mode 100644 index 0000000000..e3967b8ce3 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/share_fs/share_virtio_fs_inline.rs @@ -0,0 +1,109 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use agent::Storage; +use anyhow::{Context, Result}; +use async_trait::async_trait; +use hypervisor::{ + device::{Device as HypervisorDevice, ShareFsMountConfig, ShareFsMountType, ShareFsOperation}, + Hypervisor, +}; +use kata_types::config::hypervisor::SharedFsInfo; + +use super::{ + share_virtio_fs::{ + prepare_virtiofs, FS_TYPE_VIRTIO_FS, KATA_VIRTIO_FS_DEV_TYPE, MOUNT_GUEST_TAG, + PASSTHROUGH_FS_DIR, + }, + utils, ShareFs, *, +}; + +lazy_static! 
{ + pub(crate) static ref SHARED_DIR_VIRTIO_FS_OPTIONS: Vec:: = vec![String::from("nodev")]; +} + +#[derive(Debug, Clone)] +pub struct ShareVirtioFsInlineConfig { + pub id: String, +} + +pub struct ShareVirtioFsInline { + config: ShareVirtioFsInlineConfig, + share_fs_mount: Arc, +} + +impl ShareVirtioFsInline { + pub(crate) fn new(id: &str, _config: &SharedFsInfo) -> Result { + Ok(Self { + config: ShareVirtioFsInlineConfig { id: id.to_string() }, + share_fs_mount: Arc::new(VirtiofsShareMount::new(id)), + }) + } +} + +#[async_trait] +impl ShareFs for ShareVirtioFsInline { + fn get_share_fs_mount(&self) -> Arc { + self.share_fs_mount.clone() + } + + async fn setup_device_before_start_vm(&self, h: &dyn Hypervisor) -> Result<()> { + prepare_virtiofs(h, INLINE_VIRTIO_FS, &self.config.id, "") + .await + .context("prepare virtiofs")?; + Ok(()) + } + + async fn setup_device_after_start_vm(&self, h: &dyn Hypervisor) -> Result<()> { + setup_inline_virtiofs(&self.config.id, h) + .await + .context("setup inline virtiofs")?; + Ok(()) + } + async fn get_storages(&self) -> Result> { + // setup storage + let mut storages: Vec = Vec::new(); + + let shared_volume: Storage = Storage { + driver: String::from(KATA_VIRTIO_FS_DEV_TYPE), + driver_options: Vec::new(), + source: String::from(MOUNT_GUEST_TAG), + fs_type: String::from(FS_TYPE_VIRTIO_FS), + fs_group: None, + options: SHARED_DIR_VIRTIO_FS_OPTIONS.clone(), + mount_point: String::from(KATA_GUEST_SHARE_DIR), + }; + + storages.push(shared_volume); + Ok(storages) + } +} + +async fn setup_inline_virtiofs(id: &str, h: &dyn Hypervisor) -> Result<()> { + // - source is the absolute path of PASSTHROUGH_FS_DIR on host, e.g. 
+ // /run/kata-containers/shared/sandboxes//passthrough + // - mount point is the path relative to KATA_GUEST_SHARE_DIR in guest + let mnt = format!("/{}", PASSTHROUGH_FS_DIR); + + let rw_source = utils::get_host_rw_shared_path(id).join(PASSTHROUGH_FS_DIR); + utils::ensure_dir_exist(&rw_source)?; + + let ro_source = utils::get_host_ro_shared_path(id).join(PASSTHROUGH_FS_DIR); + let source = String::from(ro_source.to_str().unwrap()); + + let virtio_fs = HypervisorDevice::ShareFsMount(ShareFsMountConfig { + source: source.clone(), + fstype: ShareFsMountType::PASSTHROUGH, + mount_point: mnt, + config: None, + tag: String::from(MOUNT_GUEST_TAG), + op: ShareFsOperation::Mount, + prefetch_list_path: None, + }); + h.add_device(virtio_fs) + .await + .context(format!("fail to attach passthrough fs {:?}", source)) +} diff --git a/src/runtime-rs/crates/resource/src/share_fs/share_virtio_fs_standalone.rs b/src/runtime-rs/crates/resource/src/share_fs/share_virtio_fs_standalone.rs new file mode 100644 index 0000000000..9c798d7467 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/share_fs/share_virtio_fs_standalone.rs @@ -0,0 +1,179 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{process::Stdio, sync::Arc}; + +use agent::Storage; +use anyhow::{anyhow, Context, Result}; +use async_trait::async_trait; +use hypervisor::Hypervisor; +use kata_types::config::hypervisor::SharedFsInfo; +use tokio::{ + io::{AsyncBufReadExt, BufReader}, + process::{Child, Command}, + sync::{ + mpsc::{channel, Receiver, Sender}, + RwLock, + }, +}; + +use super::{ + share_virtio_fs::generate_sock_path, utils::get_host_ro_shared_path, + virtio_fs_share_mount::VirtiofsShareMount, ShareFs, ShareFsMount, +}; + +#[derive(Debug, Clone)] +pub struct ShareVirtioFsStandaloneConfig { + id: String, + jail_root: String, + + // virtio_fs_daemon is the virtio-fs vhost-user daemon path + pub virtio_fs_daemon: String, + // 
virtio_fs_cache cache mode for fs version cache or "none" + pub virtio_fs_cache: String, + // virtio_fs_extra_args passes options to virtiofsd daemon + pub virtio_fs_extra_args: Vec, +} + +#[derive(Default)] +struct ShareVirtioFsStandaloneInner { + pid: Option, +} +pub(crate) struct ShareVirtioFsStandalone { + inner: Arc>, + config: ShareVirtioFsStandaloneConfig, + share_fs_mount: Arc, +} + +impl ShareVirtioFsStandalone { + pub(crate) fn new(id: &str, _config: &SharedFsInfo) -> Result { + Ok(Self { + inner: Arc::new(RwLock::new(ShareVirtioFsStandaloneInner::default())), + // TODO: update with config + config: ShareVirtioFsStandaloneConfig { + id: id.to_string(), + jail_root: "".to_string(), + virtio_fs_daemon: "".to_string(), + virtio_fs_cache: "".to_string(), + virtio_fs_extra_args: vec![], + }, + share_fs_mount: Arc::new(VirtiofsShareMount::new(id)), + }) + } + + fn virtiofsd_args(&self, sock_path: &str) -> Result> { + let source_path = get_host_ro_shared_path(&self.config.id); + if !source_path.exists() { + return Err(anyhow!("The virtiofs shared path didn't exist")); + } + + let mut args: Vec = vec![ + String::from("-f"), + String::from("-o"), + format!("vhost_user_socket={}", sock_path), + String::from("-o"), + format!("source={}", source_path.to_str().unwrap()), + String::from("-o"), + format!("cache={}", self.config.virtio_fs_cache), + ]; + + if !self.config.virtio_fs_extra_args.is_empty() { + let mut extra_args: Vec = self.config.virtio_fs_extra_args.clone(); + args.append(&mut extra_args); + } + + Ok(args) + } + + async fn setup_virtiofsd(&self) -> Result<()> { + let sock_path = generate_sock_path(&self.config.jail_root); + let args = self.virtiofsd_args(&sock_path).context("virtiofsd args")?; + + let mut cmd = Command::new(&self.config.virtio_fs_daemon); + let child_cmd = cmd.args(&args).stderr(Stdio::piped()); + let child = child_cmd.spawn().context("spawn virtiofsd")?; + + // update virtiofsd pid{ + { + let mut inner = self.inner.write().await; + 
inner.pid = child.id(); + } + + let (tx, mut rx): (Sender>, Receiver>) = channel(100); + tokio::spawn(run_virtiofsd(child, tx)); + + // TODO: support timeout + match rx.recv().await.unwrap() { + Ok(_) => { + info!(sl!(), "start virtiofsd successfully"); + Ok(()) + } + Err(e) => { + error!(sl!(), "failed to start virtiofsd {}", e); + self.shutdown_virtiofsd() + .await + .context("shutdown_virtiofsd")?; + Err(anyhow!("failed to start virtiofsd")) + } + } + } + + async fn shutdown_virtiofsd(&self) -> Result<()> { + let mut inner = self.inner.write().await; + + if let Some(pid) = inner.pid.take() { + info!(sl!(), "shutdown virtiofsd pid {}", pid); + let pid = ::nix::unistd::Pid::from_raw(pid as i32); + if let Err(err) = ::nix::sys::signal::kill(pid, nix::sys::signal::SIGKILL) { + if err != ::nix::Error::ESRCH { + return Err(anyhow!("failed to kill virtiofsd pid {} {}", pid, err)); + } + } + } + inner.pid = None; + + Ok(()) + } +} + +async fn run_virtiofsd(mut child: Child, tx: Sender>) -> Result<()> { + let stderr = child.stderr.as_mut().unwrap(); + let stderr_reader = BufReader::new(stderr); + let mut lines = stderr_reader.lines(); + + while let Some(buffer) = lines.next_line().await.context("read next line")? 
{ + let trim_buffer = buffer.trim_end(); + if !trim_buffer.is_empty() { + info!(sl!(), "source: virtiofsd {}", trim_buffer); + } + if buffer.contains("Waiting for vhost-user socket connection") { + tx.send(Ok(())).await.unwrap(); + } + } + + info!(sl!(), "wait virtiofsd {:?}", child.wait().await); + Ok(()) +} + +#[async_trait] +impl ShareFs for ShareVirtioFsStandalone { + fn get_share_fs_mount(&self) -> Arc { + self.share_fs_mount.clone() + } + + async fn setup_device_before_start_vm(&self, _h: &dyn Hypervisor) -> Result<()> { + self.setup_virtiofsd().await.context("setup virtiofsd")?; + Ok(()) + } + + async fn setup_device_after_start_vm(&self, _h: &dyn Hypervisor) -> Result<()> { + Ok(()) + } + + async fn get_storages(&self) -> Result> { + Ok(vec![]) + } +} diff --git a/src/runtime-rs/crates/resource/src/share_fs/utils.rs b/src/runtime-rs/crates/resource/src/share_fs/utils.rs new file mode 100644 index 0000000000..fbdf93f78e --- /dev/null +++ b/src/runtime-rs/crates/resource/src/share_fs/utils.rs @@ -0,0 +1,94 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::path::{Path, PathBuf}; + +use anyhow::Result; +use kata_sys_util::mount; + +use super::*; + +pub(crate) fn ensure_dir_exist(path: &Path) -> Result<()> { + if !path.exists() { + std::fs::create_dir_all(path).context(format!("failed to create directory {:?}", path))?; + } + Ok(()) +} + +pub(crate) fn share_to_guest( + // absolute path for source + source: &str, + // relative path for target + target: &str, + sid: &str, + cid: &str, + readonly: bool, + is_volume: bool, +) -> Result { + let host_dest = do_get_host_path(target, sid, cid, is_volume, false); + mount::bind_mount_unchecked(source, &host_dest, readonly) + .with_context(|| format!("failed to bind mount {} to {}", source, &host_dest))?; + + // bind mount remount event is not propagated to mount subtrees, so we have + // to remount the read only dir mount point 
directly. + if readonly { + let dst = do_get_host_path(target, sid, cid, is_volume, true); + mount::bind_remount_read_only(&dst).context("bind remount readonly")?; + } + + Ok(do_get_guest_path(target, cid, is_volume)) +} + +pub(crate) fn get_host_ro_shared_path(id: &str) -> PathBuf { + Path::new(KATA_HOST_SHARED_DIR).join(id).join("ro") +} + +pub(crate) fn get_host_rw_shared_path(id: &str) -> PathBuf { + Path::new(KATA_HOST_SHARED_DIR).join(id).join("rw") +} + +fn do_get_guest_any_path(target: &str, cid: &str, is_volume: bool, is_virtiofs: bool) -> String { + let dir = PASSTHROUGH_FS_DIR; + let guest_share_dir = if is_virtiofs { + Path::new("/").to_path_buf() + } else { + Path::new(KATA_GUEST_SHARE_DIR).to_path_buf() + }; + + let path = if is_volume && !is_virtiofs { + guest_share_dir.join(dir).join(target) + } else { + guest_share_dir.join(dir).join(cid).join(target) + }; + path.to_str().unwrap().to_string() +} + +fn do_get_guest_path(target: &str, cid: &str, is_volume: bool) -> String { + do_get_guest_any_path(target, cid, is_volume, false) +} + +fn do_get_host_path( + target: &str, + sid: &str, + cid: &str, + is_volume: bool, + read_only: bool, +) -> String { + let dir = PASSTHROUGH_FS_DIR; + + let get_host_path = if read_only { + get_host_ro_shared_path + } else { + get_host_rw_shared_path + }; + + let path = if is_volume { + get_host_path(sid).join(dir).join(target) + } else { + get_host_path(sid).join(dir).join(cid).join(target) + }; + path.to_str().unwrap().to_string() +} diff --git a/src/runtime-rs/crates/resource/src/share_fs/virtio_fs_share_mount.rs b/src/runtime-rs/crates/resource/src/share_fs/virtio_fs_share_mount.rs new file mode 100644 index 0000000000..1f1abdb1cb --- /dev/null +++ b/src/runtime-rs/crates/resource/src/share_fs/virtio_fs_share_mount.rs @@ -0,0 +1,50 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::{Context, Result}; +use 
async_trait::async_trait; + +use super::{utils, ShareFsMount, ShareFsMountResult, ShareFsRootfsConfig, ShareFsVolumeConfig}; + +pub struct VirtiofsShareMount { + id: String, +} + +impl VirtiofsShareMount { + pub fn new(id: &str) -> Self { + Self { id: id.to_string() } + } +} + +#[async_trait] +impl ShareFsMount for VirtiofsShareMount { + async fn share_rootfs(&self, config: ShareFsRootfsConfig) -> Result { + // TODO: select virtiofs or support nydus + let guest_path = utils::share_to_guest( + &config.source, + &config.target, + &self.id, + &config.cid, + config.readonly, + false, + ) + .context("share to guest")?; + Ok(ShareFsMountResult { guest_path }) + } + + async fn share_volume(&self, config: ShareFsVolumeConfig) -> Result { + let guest_path = utils::share_to_guest( + &config.source, + &config.target, + &self.id, + &config.cid, + config.readonly, + true, + ) + .context("share to guest")?; + Ok(ShareFsMountResult { guest_path }) + } +} diff --git a/src/runtime-rs/crates/resource/src/volume/block_volume.rs b/src/runtime-rs/crates/resource/src/volume/block_volume.rs new file mode 100644 index 0000000000..f015c92785 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/volume/block_volume.rs @@ -0,0 +1,37 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::Result; + +use super::Volume; + +pub(crate) struct BlockVolume {} + +/// BlockVolume: block device volume +impl BlockVolume { + pub(crate) fn new(_m: &oci::Mount) -> Result { + Ok(Self {}) + } +} + +impl Volume for BlockVolume { + fn get_volume_mount(&self) -> anyhow::Result> { + todo!() + } + + fn get_storage(&self) -> Result> { + todo!() + } + + fn cleanup(&self) -> Result<()> { + todo!() + } +} + +pub(crate) fn is_block_volume(_m: &oci::Mount) -> bool { + // attach block device + false +} diff --git a/src/runtime-rs/crates/resource/src/volume/default_volume.rs 
b/src/runtime-rs/crates/resource/src/volume/default_volume.rs new file mode 100644 index 0000000000..3b7752a4e7 --- /dev/null +++ b/src/runtime-rs/crates/resource/src/volume/default_volume.rs @@ -0,0 +1,36 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::Result; + +use super::Volume; + +pub(crate) struct DefaultVolume { + mount: oci::Mount, +} + +/// DefaultVolume: passthrough the mount to guest +impl DefaultVolume { + pub fn new(mount: &oci::Mount) -> Result { + Ok(Self { + mount: mount.clone(), + }) + } +} + +impl Volume for DefaultVolume { + fn get_volume_mount(&self) -> anyhow::Result> { + Ok(vec![self.mount.clone()]) + } + + fn get_storage(&self) -> Result> { + Ok(vec![]) + } + + fn cleanup(&self) -> Result<()> { + todo!() + } +} diff --git a/src/runtime-rs/crates/resource/src/volume/mod.rs b/src/runtime-rs/crates/resource/src/volume/mod.rs new file mode 100644 index 0000000000..53c737c79c --- /dev/null +++ b/src/runtime-rs/crates/resource/src/volume/mod.rs @@ -0,0 +1,99 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +mod block_volume; +mod default_volume; +mod share_fs_volume; +mod shm_volume; + +use std::{sync::Arc, vec::Vec}; + +use anyhow::{Context, Result}; +use tokio::sync::RwLock; + +use crate::share_fs::ShareFs; + +pub trait Volume: Send + Sync { + fn get_volume_mount(&self) -> Result>; + fn get_storage(&self) -> Result>; + fn cleanup(&self) -> Result<()>; +} + +#[derive(Default)] +pub struct VolumeResourceInner { + volumes: Vec>, +} + +#[derive(Default)] +pub struct VolumeResource { + inner: Arc>, +} + +impl VolumeResource { + pub fn new() -> Self { + Self::default() + } + + pub async fn handler_volumes( + &self, + share_fs: &Option>, + cid: &str, + oci_mounts: &[oci::Mount], + ) -> Result>> { + let mut volumes: Vec> = vec![]; + for m in oci_mounts { + let volume: Arc = 
if shm_volume::is_shim_volume(m) { + let shm_size = shm_volume::DEFAULT_SHM_SIZE; + Arc::new( + shm_volume::ShmVolume::new(m, shm_size) + .with_context(|| format!("new shm volume {:?}", m))?, + ) + } else if share_fs_volume::is_share_fs_volume(m) { + Arc::new( + share_fs_volume::ShareFsVolume::new(share_fs, m, cid) + .await + .with_context(|| format!("new share fs volume {:?}", m))?, + ) + } else if block_volume::is_block_volume(m) { + Arc::new( + block_volume::BlockVolume::new(m) + .with_context(|| format!("new block volume {:?}", m))?, + ) + } else if is_skip_volume(m) { + info!(sl!(), "skip volume {:?}", m); + continue; + } else { + Arc::new( + default_volume::DefaultVolume::new(m) + .with_context(|| format!("new default volume {:?}", m))?, + ) + }; + + volumes.push(volume.clone()); + let mut inner = self.inner.write().await; + inner.volumes.push(volume); + } + + Ok(volumes) + } + + pub async fn dump(&self) { + let inner = self.inner.read().await; + for v in &inner.volumes { + info!( + sl!(), + "volume mount {:?}: count {}", + v.get_volume_mount(), + Arc::strong_count(v) + ); + } + } +} + +fn is_skip_volume(_m: &oci::Mount) -> bool { + // TODO: support volume check + false +} diff --git a/src/runtime-rs/crates/resource/src/volume/share_fs_volume.rs b/src/runtime-rs/crates/resource/src/volume/share_fs_volume.rs new file mode 100644 index 0000000000..9bf02ddc4f --- /dev/null +++ b/src/runtime-rs/crates/resource/src/volume/share_fs_volume.rs @@ -0,0 +1,153 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{path::Path, sync::Arc}; + +use anyhow::{anyhow, Context, Result}; +use nix::sys::stat::{stat, SFlag}; + +use super::Volume; +use crate::share_fs::{ShareFs, ShareFsVolumeConfig}; + +// copy file to container's rootfs if filesystem sharing is not supported, otherwise +// bind mount it in the shared directory. +// Ignore /dev, directories and all other device files. 
We handle +// only regular files in /dev. It does not make sense to pass the host +// device nodes to the guest. +// skip the volumes whose source had already set to guest share dir. +pub(crate) struct ShareFsVolume { + mounts: Vec, +} + +impl ShareFsVolume { + pub(crate) async fn new( + share_fs: &Option>, + m: &oci::Mount, + cid: &str, + ) -> Result { + let file_name = Path::new(&m.source).file_name().unwrap().to_str().unwrap(); + let file_name = generate_mount_path(cid, file_name); + + let mut volume = Self { mounts: vec![] }; + match share_fs { + None => { + let mut need_copy = false; + match stat(Path::new(&m.source)) { + Ok(stat) => { + // Ignore the mount if this is not a regular file (excludes + // directory, socket, device, ...) as it cannot be handled by + // a simple copy. But this should not be treated as an error, + // only as a limitation. + // golang implement: + // ModeType = ModeDir | ModeSymlink | ModeNamedPipe | ModeSocket | + // ModeDevice | ModeCharDevice | ModeIrregular + let file_type = SFlag::S_IFDIR + | SFlag::S_IFLNK + | SFlag::S_IFIFO + | SFlag::S_IFSOCK + | SFlag::S_IFCHR + | SFlag::S_IFREG; + if !file_type.contains(SFlag::from_bits_truncate(stat.st_mode)) { + debug!( + sl!(), + "Ignoring non-regular file as FS sharing not supported. 
mount: {:?}", + m + ); + return Ok(volume); + } + if SFlag::from_bits_truncate(stat.st_mode) != SFlag::S_IFDIR { + need_copy = true; + } + } + Err(err) => { + return Err(anyhow!(format!( + "failed to stat file {} {:?}", + &m.source, err + ))); + } + }; + + if need_copy { + // TODO: copy file + } + } + Some(share_fs) => { + let share_fs_mount = share_fs.get_share_fs_mount(); + let mount_result = share_fs_mount + .share_volume(ShareFsVolumeConfig { + cid: cid.to_string(), + source: m.source.clone(), + target: file_name, + readonly: false, + }) + .await + .context("share fs volume")?; + + volume.mounts.push(oci::Mount { + destination: m.destination.clone(), + r#type: "bind".to_string(), + source: mount_result.guest_path, + options: m.options.clone(), + }); + } + } + Ok(volume) + } +} + +impl Volume for ShareFsVolume { + fn get_volume_mount(&self) -> anyhow::Result> { + Ok(self.mounts.clone()) + } + + fn get_storage(&self) -> Result> { + Ok(vec![]) + } + + fn cleanup(&self) -> Result<()> { + todo!() + } +} + +pub(crate) fn is_share_fs_volume(m: &oci::Mount) -> bool { + m.r#type == "bind" && !is_host_device(&m.destination) +} + +fn is_host_device(dest: &str) -> bool { + if dest == "/dev" { + return true; + } + + if dest.starts_with("/dev") { + let src = match std::fs::canonicalize(dest) { + Err(_) => return false, + Ok(src) => src, + }; + + if src.is_file() { + return false; + } + + return true; + } + + false +} + +// Note, don't generate random name, attaching rafs depends on the predictable name. 
+// If template_mnt is passed, just use existed name in it +pub fn generate_mount_path(id: &str, file_name: &str) -> String { + let mut nid = String::from(id); + if nid.len() > 10 { + nid = nid.chars().take(10).collect(); + } + + let mut uid = uuid::Uuid::new_v4().to_string(); + let uid_vec: Vec<&str> = uid.splitn(2, '-').collect(); + uid = String::from(uid_vec[0]); + + format!("{}-{}-{}", nid, uid, file_name) +} diff --git a/src/runtime-rs/crates/resource/src/volume/shm_volume.rs b/src/runtime-rs/crates/resource/src/volume/shm_volume.rs new file mode 100644 index 0000000000..c1c9df993f --- /dev/null +++ b/src/runtime-rs/crates/resource/src/volume/shm_volume.rs @@ -0,0 +1,106 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::path::Path; + +use anyhow::Result; + +use super::Volume; +use crate::share_fs::DEFAULT_KATA_GUEST_SANDBOX_DIR; + +pub const SHM_DIR: &str = "shm"; +// DEFAULT_SHM_SIZE is the default shm size to be used in case host +// IPC is used. +pub const DEFAULT_SHM_SIZE: u64 = 65536 * 1024; + +// KATA_EPHEMERAL_DEV_TYPE creates a tmpfs backed volume for sharing files between containers. 
+pub const KATA_EPHEMERAL_DEV_TYPE: &str = "ephemeral"; + +pub(crate) struct ShmVolume { + mount: oci::Mount, + storage: Option, +} + +impl ShmVolume { + pub(crate) fn new(m: &oci::Mount, shm_size: u64) -> Result { + let (storage, mount) = if shm_size > 0 { + // storage + let mount_path = Path::new(DEFAULT_KATA_GUEST_SANDBOX_DIR).join(SHM_DIR); + let mount_path = mount_path.to_str().unwrap(); + let option = format!("size={}", shm_size); + + let options = vec![ + String::from("noexec"), + String::from("nosuid"), + String::from("nodev"), + String::from("mode=1777"), + option, + ]; + + let storage = agent::Storage { + driver: String::from(KATA_EPHEMERAL_DEV_TYPE), + driver_options: Vec::new(), + source: String::from("shm"), + fs_type: String::from("tmpfs"), + fs_group: None, + options, + mount_point: mount_path.to_string(), + }; + + // mount + let mount = oci::Mount { + r#type: "bind".to_string(), + destination: m.destination.clone(), + source: mount_path.to_string(), + options: vec!["rbind".to_string()], + }; + + (Some(storage), mount) + } else { + let mount = oci::Mount { + r#type: "tmpfs".to_string(), + destination: m.destination.clone(), + source: "shm".to_string(), + options: vec![ + "noexec", + "nosuid", + "nodev", + "mode=1777", + &format!("size={}", DEFAULT_SHM_SIZE), + ] + .iter() + .map(|s| s.to_string()) + .collect(), + }; + (None, mount) + }; + + Ok(Self { storage, mount }) + } +} + +impl Volume for ShmVolume { + fn get_volume_mount(&self) -> anyhow::Result> { + Ok(vec![self.mount.clone()]) + } + + fn get_storage(&self) -> Result> { + let s = if let Some(s) = self.storage.as_ref() { + vec![s.clone()] + } else { + vec![] + }; + Ok(s) + } + + fn cleanup(&self) -> Result<()> { + todo!() + } +} + +pub(crate) fn is_shim_volume(m: &oci::Mount) -> bool { + m.destination == "/dev/shm" && m.r#type != KATA_EPHEMERAL_DEV_TYPE +} diff --git a/src/runtime-rs/crates/runtimes/Cargo.toml b/src/runtime-rs/crates/runtimes/Cargo.toml new file mode 100644 index 
0000000000..304a7639bc --- /dev/null +++ b/src/runtime-rs/crates/runtimes/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "runtimes" +version = "0.1.0" +authors = ["The Kata Containers community "] +edition = "2018" + +[dependencies] +anyhow = "^1.0" +lazy_static = "1.4.0" +slog = "2.5.2" +slog-scope = "4.4.0" +tokio = { version = "1.8.0", features = ["rt-multi-thread"] } + +common = { path = "./common" } +kata-types = { path = "../../../libs/kata-types" } +logging = { path = "../../../libs/logging"} +oci = { path = "../../../libs/oci" } + +# runtime handler +linux_container = { path = "./linux_container", optional = true } +virt_container = { path = "./virt_container", optional = true } +wasm_container = { path = "./wasm_container", optional = true } + +[features] +default = ["virt"] +linux = ["linux_container"] +virt = ["virt_container"] +wasm = ["wasm_container"] diff --git a/src/runtime-rs/crates/runtimes/common/Cargo.toml b/src/runtime-rs/crates/runtimes/common/Cargo.toml new file mode 100644 index 0000000000..56cec862c2 --- /dev/null +++ b/src/runtime-rs/crates/runtimes/common/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "common" +version = "0.1.0" +authors = ["The Kata Containers community "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow = "^1.0" +async-trait = "0.1.48" +containerd-shim-protos = { version = "0.2.0", features = ["async"]} +lazy_static = "1.4.0" +nix = "0.24.1" +protobuf = "2.27.0" +serde_json = "1.0.39" +slog = "2.5.2" +slog-scope = "4.4.0" +strum = { version = "0.24.0", features = ["derive"] } +thiserror = "^1.0" +tokio = { version = "1.8.0", features = ["rt-multi-thread", "process", "fs"] } +ttrpc = { version = "0.6.1" } + +agent = { path = "../../agent" } +kata-sys-util = { path = "../../../../libs/kata-sys-util" } +kata-types = { path = "../../../../libs/kata-types" } +oci = { path = "../../../../libs/oci" } diff --git 
a/src/runtime-rs/crates/runtimes/common/src/container_manager.rs b/src/runtime-rs/crates/runtimes/common/src/container_manager.rs new file mode 100644 index 0000000000..040b557ee6 --- /dev/null +++ b/src/runtime-rs/crates/runtimes/common/src/container_manager.rs @@ -0,0 +1,40 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::Result; +use async_trait::async_trait; + +use crate::types::{ + ContainerConfig, ContainerID, ContainerProcess, ExecProcessRequest, KillRequest, + ProcessExitStatus, ProcessStateInfo, ResizePTYRequest, ShutdownRequest, StatsInfo, + UpdateRequest, PID, +}; + +#[async_trait] +pub trait ContainerManager: Send + Sync { + // container lifecycle + async fn create_container(&self, config: ContainerConfig, spec: oci::Spec) -> Result; + async fn pause_container(&self, container_id: &ContainerID) -> Result<()>; + async fn resume_container(&self, container_id: &ContainerID) -> Result<()>; + async fn stats_container(&self, container_id: &ContainerID) -> Result; + async fn update_container(&self, req: UpdateRequest) -> Result<()>; + async fn connect_container(&self, container_id: &ContainerID) -> Result; + + // process lifecycle + async fn close_process_io(&self, process_id: &ContainerProcess) -> Result<()>; + async fn delete_process(&self, process_id: &ContainerProcess) -> Result; + async fn exec_process(&self, req: ExecProcessRequest) -> Result<()>; + async fn kill_process(&self, req: &KillRequest) -> Result<()>; + async fn resize_process_pty(&self, req: &ResizePTYRequest) -> Result<()>; + async fn start_process(&self, process_id: &ContainerProcess) -> Result; + async fn state_process(&self, process_id: &ContainerProcess) -> Result; + async fn wait_process(&self, process_id: &ContainerProcess) -> Result; + + // utility + async fn pid(&self) -> Result; + async fn need_shutdown_sandbox(&self, req: &ShutdownRequest) -> bool; + async fn is_sandbox_container(&self, 
process_id: &ContainerProcess) -> bool; +} diff --git a/src/runtime-rs/crates/runtimes/common/src/error.rs b/src/runtime-rs/crates/runtimes/common/src/error.rs new file mode 100644 index 0000000000..2ec03c4c6c --- /dev/null +++ b/src/runtime-rs/crates/runtimes/common/src/error.rs @@ -0,0 +1,17 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use crate::types::{ContainerProcess, Response}; + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("failed to find container {0}")] + ContainerNotFound(String), + #[error("failed to find process {0}")] + ProcessNotFound(ContainerProcess), + #[error("unexpected response {0} to shim {1}")] + UnexpectedResponse(Response, String), +} diff --git a/src/runtime-rs/crates/runtimes/common/src/lib.rs b/src/runtime-rs/crates/runtimes/common/src/lib.rs new file mode 100644 index 0000000000..36977964ad --- /dev/null +++ b/src/runtime-rs/crates/runtimes/common/src/lib.rs @@ -0,0 +1,15 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +mod container_manager; +pub use container_manager::ContainerManager; +pub mod error; +pub mod message; +mod runtime_handler; +pub use runtime_handler::{RuntimeHandler, RuntimeInstance}; +mod sandbox; +pub use sandbox::Sandbox; +pub mod types; diff --git a/src/runtime-rs/crates/runtimes/common/src/message.rs b/src/runtime-rs/crates/runtimes/common/src/message.rs new file mode 100644 index 0000000000..856a6e5990 --- /dev/null +++ b/src/runtime-rs/crates/runtimes/common/src/message.rs @@ -0,0 +1,72 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// +use std::sync::Arc; + +use anyhow::{Context, Result}; +use containerd_shim_protos::{events::task::TaskOOM, protobuf::Message as ProtobufMessage}; +use tokio::sync::mpsc::{channel, Receiver, Sender}; + +/// 
message receiver buffer size +const MESSAGE_RECEIVER_BUFFER_SIZE: usize = 1; + +#[derive(Debug)] +pub enum Action { + Start, + Stop, + Shutdown, + Event(Arc), +} + +unsafe impl Send for Message {} +unsafe impl Sync for Message {} + +#[derive(Debug)] +pub struct Message { + pub action: Action, + pub resp_sender: Option>>, +} + +impl Message { + pub fn new(action: Action) -> Self { + Message { + action, + resp_sender: None, + } + } + + pub fn new_with_receiver(action: Action) -> (Receiver>, Self) { + let (resp_sender, receiver) = channel(MESSAGE_RECEIVER_BUFFER_SIZE); + ( + receiver, + Message { + action, + resp_sender: Some(resp_sender), + }, + ) + } +} + +const TASK_OOM_EVENT_TOPIC: &str = "/tasks/oom"; + +pub trait Event: std::fmt::Debug + Send { + fn r#type(&self) -> String; + fn type_url(&self) -> String; + fn value(&self) -> Result>; +} + +impl Event for TaskOOM { + fn r#type(&self) -> String { + TASK_OOM_EVENT_TOPIC.to_string() + } + + fn type_url(&self) -> String { + "containerd.events.TaskOOM".to_string() + } + + fn value(&self) -> Result> { + self.write_to_bytes().context("get oom value") + } +} diff --git a/src/runtime-rs/crates/runtimes/common/src/runtime_handler.rs b/src/runtime-rs/crates/runtimes/common/src/runtime_handler.rs new file mode 100644 index 0000000000..c12df38b12 --- /dev/null +++ b/src/runtime-rs/crates/runtimes/common/src/runtime_handler.rs @@ -0,0 +1,43 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// +use std::sync::Arc; + +use anyhow::Result; +use async_trait::async_trait; +use kata_types::config::TomlConfig; +use tokio::sync::mpsc::Sender; + +use crate::{message::Message, ContainerManager, Sandbox}; + +#[derive(Clone)] +pub struct RuntimeInstance { + pub sandbox: Arc, + pub container_manager: Arc, +} + +#[async_trait] +pub trait RuntimeHandler: Send + Sync { + fn init() -> Result<()> + where + Self: Sized; + + fn name() -> String + where + Self: Sized; + 
+ fn new_handler() -> Arc + where + Self: Sized; + + async fn new_instance( + &self, + sid: &str, + msg_sender: Sender, + config: Arc, + ) -> Result; + + fn cleanup(&self, id: &str) -> Result<()>; +} diff --git a/src/runtime-rs/crates/runtimes/common/src/sandbox.rs b/src/runtime-rs/crates/runtimes/common/src/sandbox.rs new file mode 100644 index 0000000000..fbb5db53bc --- /dev/null +++ b/src/runtime-rs/crates/runtimes/common/src/sandbox.rs @@ -0,0 +1,16 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::Result; +use async_trait::async_trait; + +#[async_trait] +pub trait Sandbox: Send + Sync { + async fn start(&self, netns: Option) -> Result<()>; + async fn stop(&self) -> Result<()>; + async fn cleanup(&self, container_id: &str) -> Result<()>; + async fn shutdown(&self) -> Result<()>; +} diff --git a/src/runtime-rs/crates/runtimes/common/src/types/mod.rs b/src/runtime-rs/crates/runtimes/common/src/types/mod.rs new file mode 100644 index 0000000000..14f188d7d3 --- /dev/null +++ b/src/runtime-rs/crates/runtimes/common/src/types/mod.rs @@ -0,0 +1,235 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +mod trans_from_agent; +mod trans_from_shim; +mod trans_into_agent; +mod trans_into_shim; + +use std::fmt; + +use anyhow::{Context, Result}; +use kata_sys_util::validate; +use kata_types::mount::Mount; +use strum::Display; + +/// Request: request from shim +/// Request and Response messages need to be paired +#[derive(Debug, Clone, Display)] +pub enum Request { + CreateContainer(ContainerConfig), + CloseProcessIO(ContainerProcess), + DeleteProcess(ContainerProcess), + ExecProcess(ExecProcessRequest), + KillProcess(KillRequest), + WaitProcess(ContainerProcess), + StartProcess(ContainerProcess), + StateProcess(ContainerProcess), + ShutdownContainer(ShutdownRequest), + PauseContainer(ContainerID), + 
ResumeContainer(ContainerID), + ResizeProcessPTY(ResizePTYRequest), + StatsContainer(ContainerID), + UpdateContainer(UpdateRequest), + Pid, + ConnectContainer(ContainerID), +} + +/// Response: response to shim +/// Request and Response messages need to be paired +#[derive(Debug, Clone, Display)] +pub enum Response { + CreateContainer(PID), + CloseProcessIO, + DeleteProcess(ProcessStateInfo), + ExecProcess, + KillProcess, + WaitProcess(ProcessExitStatus), + StartProcess(PID), + StateProcess(ProcessStateInfo), + ShutdownContainer, + PauseContainer, + ResumeContainer, + ResizeProcessPTY, + StatsContainer(StatsInfo), + UpdateContainer, + Pid(PID), + ConnectContainer(PID), +} + +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum ProcessType { + Container, + Exec, +} + +#[derive(Clone, Debug)] +pub struct ContainerID { + pub container_id: String, +} + +impl ToString for ContainerID { + fn to_string(&self) -> String { + self.container_id.clone() + } +} + +impl ContainerID { + pub fn new(container_id: &str) -> Result { + validate::verify_id(container_id).context("verify container id")?; + Ok(Self { + container_id: container_id.to_string(), + }) + } +} + +#[derive(Clone, Debug)] +pub struct ContainerProcess { + pub container_id: ContainerID, + pub exec_id: String, + pub process_type: ProcessType, +} + +impl fmt::Display for ContainerProcess { + fn fmt(&self, f: &mut std::fmt::Formatter) -> fmt::Result { + write!(f, "{:?}", &self) + } +} + +impl ContainerProcess { + pub fn new(container_id: &str, exec_id: &str) -> Result { + let (exec_id, process_type) = if exec_id.is_empty() || container_id == exec_id { + ("".to_string(), ProcessType::Container) + } else { + validate::verify_id(exec_id).context("verify exec id")?; + (exec_id.to_string(), ProcessType::Exec) + }; + Ok(Self { + container_id: ContainerID::new(container_id)?, + exec_id, + process_type, + }) + } + + pub fn container_id(&self) -> &str { + &self.container_id.container_id + } + + pub fn exec_id(&self) -> &str { + 
&self.exec_id + } +} +#[derive(Debug, Clone)] +pub struct ContainerConfig { + pub container_id: String, + pub bundle: String, + pub rootfs_mounts: Vec, + pub terminal: bool, + pub stdin: Option, + pub stdout: Option, + pub stderr: Option, +} + +#[derive(Debug, Clone)] +pub struct PID { + pub pid: u32, +} + +impl PID { + pub fn new(pid: u32) -> Self { + Self { pid } + } +} + +#[derive(Debug, Clone)] +pub struct KillRequest { + pub process: ContainerProcess, + pub signal: u32, + pub all: bool, +} + +#[derive(Debug, Clone)] +pub struct ShutdownRequest { + pub container_id: String, + pub is_now: bool, +} + +#[derive(Debug, Clone)] +pub struct ResizePTYRequest { + pub process: ContainerProcess, + pub width: u32, + pub height: u32, +} + +#[derive(Debug, Clone)] +pub struct ExecProcessRequest { + pub process: ContainerProcess, + pub terminal: bool, + pub stdin: Option, + pub stdout: Option, + pub stderr: Option, + pub spec_type_url: String, + pub spec_value: Vec, +} + +#[derive(Clone, Copy, PartialEq, Debug)] +pub enum ProcessStatus { + Unknown = 0, + Created = 1, + Running = 2, + Stopped = 3, + Paused = 4, + Pausing = 5, +} + +#[derive(Debug, Clone)] +pub struct ProcessStateInfo { + pub container_id: String, + pub exec_id: String, + pub pid: PID, + pub bundle: String, + pub stdin: Option, + pub stdout: Option, + pub stderr: Option, + pub terminal: bool, + pub status: ProcessStatus, + pub exit_status: i32, + pub exited_at: Option, +} + +#[derive(Debug, Clone, Default)] +pub struct ProcessExitStatus { + pub exit_code: i32, + pub exit_time: Option, +} + +impl ProcessExitStatus { + pub fn new() -> Self { + Self::default() + } + + pub fn update_exit_code(&mut self, exit_code: i32) { + self.exit_code = exit_code; + self.exit_time = Some(std::time::SystemTime::now()); + } +} + +#[derive(Debug, Clone)] +pub struct StatsInfoValue { + pub type_url: String, + pub value: Vec, +} + +#[derive(Debug, Clone)] +pub struct StatsInfo { + pub value: Option, +} + +#[derive(Debug, Clone)] 
+pub struct UpdateRequest { + pub container_id: String, + pub value: Vec, +} diff --git a/src/runtime-rs/crates/runtimes/common/src/types/trans_from_agent.rs b/src/runtime-rs/crates/runtimes/common/src/types/trans_from_agent.rs new file mode 100644 index 0000000000..8877771223 --- /dev/null +++ b/src/runtime-rs/crates/runtimes/common/src/types/trans_from_agent.rs @@ -0,0 +1,214 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::convert::From; + +use containerd_shim_protos::cgroups::metrics; +use protobuf::Message; + +use super::{StatsInfo, StatsInfoValue}; + +// TODO: trans from agent proto? +impl From> for StatsInfo { + fn from(c_stats: Option) -> Self { + let mut metric = metrics::Metrics::new(); + let stats = match c_stats { + None => { + return StatsInfo { value: None }; + } + Some(stats) => stats, + }; + + if let Some(cg_stats) = stats.cgroup_stats { + if let Some(cpu) = cg_stats.cpu_stats { + // set protobuf cpu stat + let mut p_cpu = metrics::CPUStat::new(); + if let Some(usage) = cpu.cpu_usage { + let mut p_usage = metrics::CPUUsage::new(); + p_usage.set_total(usage.total_usage); + p_usage.set_per_cpu(usage.percpu_usage); + p_usage.set_kernel(usage.usage_in_kernelmode); + p_usage.set_user(usage.usage_in_usermode); + + // set protobuf cpu usage + p_cpu.set_usage(p_usage); + } + + if let Some(throttle) = cpu.throttling_data { + let mut p_throttle = metrics::Throttle::new(); + p_throttle.set_periods(throttle.periods); + p_throttle.set_throttled_time(throttle.throttled_time); + p_throttle.set_throttled_periods(throttle.throttled_periods); + + // set protobuf cpu usage + p_cpu.set_throttling(p_throttle); + } + + metric.set_cpu(p_cpu); + } + + if let Some(m_stats) = cg_stats.memory_stats { + let mut p_m = metrics::MemoryStat::new(); + p_m.set_cache(m_stats.cache); + // memory usage + if let Some(m_data) = m_stats.usage { + let mut p_m_entry = metrics::MemoryEntry::new(); + 
p_m_entry.set_usage(m_data.usage); + p_m_entry.set_limit(m_data.limit); + p_m_entry.set_failcnt(m_data.failcnt); + p_m_entry.set_max(m_data.max_usage); + + p_m.set_usage(p_m_entry); + } + // memory swap_usage + if let Some(m_data) = m_stats.swap_usage { + let mut p_m_entry = metrics::MemoryEntry::new(); + p_m_entry.set_usage(m_data.usage); + p_m_entry.set_limit(m_data.limit); + p_m_entry.set_failcnt(m_data.failcnt); + p_m_entry.set_max(m_data.max_usage); + + p_m.set_swap(p_m_entry); + } + // memory kernel_usage + if let Some(m_data) = m_stats.kernel_usage { + let mut p_m_entry = metrics::MemoryEntry::new(); + p_m_entry.set_usage(m_data.usage); + p_m_entry.set_limit(m_data.limit); + p_m_entry.set_failcnt(m_data.failcnt); + p_m_entry.set_max(m_data.max_usage); + + p_m.set_kernel(p_m_entry); + } + + for (k, v) in m_stats.stats { + match k.as_str() { + "dirty" => p_m.set_dirty(v), + "rss" => p_m.set_rss(v), + "rss_huge" => p_m.set_rss_huge(v), + "mapped_file" => p_m.set_mapped_file(v), + "writeback" => p_m.set_writeback(v), + "pg_pg_in" => p_m.set_pg_pg_in(v), + "pg_pg_out" => p_m.set_pg_pg_out(v), + "pg_fault" => p_m.set_pg_fault(v), + "pg_maj_fault" => p_m.set_pg_maj_fault(v), + "inactive_file" => p_m.set_inactive_file(v), + "inactive_anon" => p_m.set_inactive_anon(v), + "active_file" => p_m.set_active_file(v), + "unevictable" => p_m.set_unevictable(v), + "hierarchical_memory_limit" => p_m.set_hierarchical_memory_limit(v), + "hierarchical_swap_limit" => p_m.set_hierarchical_swap_limit(v), + "total_cache" => p_m.set_total_cache(v), + "total_rss" => p_m.set_total_rss(v), + "total_mapped_file" => p_m.set_total_mapped_file(v), + "total_dirty" => p_m.set_total_dirty(v), + + "total_pg_pg_in" => p_m.set_total_pg_pg_in(v), + "total_pg_pg_out" => p_m.set_total_pg_pg_out(v), + "total_pg_fault" => p_m.set_total_pg_fault(v), + "total_pg_maj_fault" => p_m.set_total_pg_maj_fault(v), + "total_inactive_file" => p_m.set_total_inactive_file(v), + "total_inactive_anon" => 
p_m.set_total_inactive_anon(v), + "total_active_file" => p_m.set_total_active_file(v), + "total_unevictable" => p_m.set_total_unevictable(v), + _ => (), + } + } + metric.set_memory(p_m); + } + + if let Some(pid_stats) = cg_stats.pids_stats { + let mut p_pid = metrics::PidsStat::new(); + p_pid.set_limit(pid_stats.limit); + p_pid.set_current(pid_stats.current); + metric.set_pids(p_pid); + } + + if let Some(blk_stats) = cg_stats.blkio_stats { + let mut p_blk_stats = metrics::BlkIOStat::new(); + p_blk_stats + .set_io_serviced_recursive(copy_blkio_entry(&blk_stats.io_serviced_recursive)); + p_blk_stats.set_io_service_bytes_recursive(copy_blkio_entry( + &blk_stats.io_service_bytes_recursive, + )); + p_blk_stats + .set_io_queued_recursive(copy_blkio_entry(&blk_stats.io_queued_recursive)); + p_blk_stats.set_io_service_time_recursive(copy_blkio_entry( + &blk_stats.io_service_time_recursive, + )); + p_blk_stats.set_io_wait_time_recursive(copy_blkio_entry( + &blk_stats.io_wait_time_recursive, + )); + p_blk_stats + .set_io_merged_recursive(copy_blkio_entry(&blk_stats.io_merged_recursive)); + p_blk_stats.set_io_time_recursive(copy_blkio_entry(&blk_stats.io_time_recursive)); + p_blk_stats.set_sectors_recursive(copy_blkio_entry(&blk_stats.sectors_recursive)); + + metric.set_blkio(p_blk_stats); + } + + if !cg_stats.hugetlb_stats.is_empty() { + let mut p_huge = ::protobuf::RepeatedField::new(); + for (k, v) in cg_stats.hugetlb_stats { + let mut h = metrics::HugetlbStat::new(); + h.set_pagesize(k); + h.set_max(v.max_usage); + h.set_usage(v.usage); + h.set_failcnt(v.failcnt); + p_huge.push(h); + } + metric.set_hugetlb(p_huge); + } + } + + let net_stats = stats.network_stats; + if !net_stats.is_empty() { + let mut p_net = ::protobuf::RepeatedField::new(); + for v in net_stats.iter() { + let mut h = metrics::NetworkStat::new(); + h.set_name(v.name.clone()); + + h.set_tx_bytes(v.tx_bytes); + h.set_tx_packets(v.tx_packets); + h.set_tx_errors(v.tx_errors); + 
h.set_tx_dropped(v.tx_dropped); + + h.set_rx_bytes(v.rx_bytes); + h.set_rx_packets(v.rx_packets); + h.set_rx_errors(v.rx_errors); + h.set_rx_dropped(v.rx_dropped); + + p_net.push(h); + } + metric.set_network(p_net); + } + + StatsInfo { + value: Some(StatsInfoValue { + type_url: "io.containerd.cgroups.v1.Metrics".to_string(), + value: metric.write_to_bytes().unwrap(), + }), + } + } +} + +fn copy_blkio_entry( + entry: &[agent::BlkioStatsEntry], +) -> ::protobuf::RepeatedField { + let mut p_entry = ::protobuf::RepeatedField::new(); + + for e in entry.iter() { + let mut blk = metrics::BlkIOEntry::new(); + blk.set_op(e.op.clone()); + blk.set_value(e.value); + blk.set_major(e.major); + blk.set_minor(e.minor); + + p_entry.push(blk); + } + + p_entry +} diff --git a/src/runtime-rs/crates/runtimes/common/src/types/trans_from_shim.rs b/src/runtime-rs/crates/runtimes/common/src/types/trans_from_shim.rs new file mode 100644 index 0000000000..07f1f8d79e --- /dev/null +++ b/src/runtime-rs/crates/runtimes/common/src/types/trans_from_shim.rs @@ -0,0 +1,198 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{ + convert::{From, TryFrom}, + path::PathBuf, +}; + +use anyhow::{Context, Result}; +use containerd_shim_protos::api; +use kata_types::mount::Mount; + +use super::{ + ContainerConfig, ContainerID, ContainerProcess, ExecProcessRequest, KillRequest, Request, + ResizePTYRequest, ShutdownRequest, UpdateRequest, +}; + +fn trans_from_shim_mount(from: api::Mount) -> Mount { + let options = from.options.to_vec(); + let mut read_only = false; + for o in &options { + if o == "ro" { + read_only = true; + break; + } + } + + Mount { + source: from.source.clone(), + destination: PathBuf::from(&from.target), + fs_type: from.field_type, + options, + device_id: None, + host_shared_fs_path: None, + read_only, + } +} + +impl TryFrom for Request { + type Error = anyhow::Error; + fn try_from(from: 
api::CreateTaskRequest) -> Result { + Ok(Request::CreateContainer(ContainerConfig { + container_id: from.id.clone(), + bundle: from.bundle.clone(), + rootfs_mounts: from + .rootfs + .to_vec() + .into_iter() + .map(trans_from_shim_mount) + .collect(), + terminal: from.terminal, + stdin: (!from.stdin.is_empty()).then(|| from.stdin.clone()), + stdout: (!from.stdout.is_empty()).then(|| from.stdout.clone()), + stderr: (!from.stderr.is_empty()).then(|| from.stderr.clone()), + })) + } +} + +impl TryFrom for Request { + type Error = anyhow::Error; + fn try_from(from: api::CloseIORequest) -> Result { + Ok(Request::CloseProcessIO( + ContainerProcess::new(&from.id, &from.exec_id).context("new process id")?, + )) + } +} + +impl TryFrom for Request { + type Error = anyhow::Error; + fn try_from(from: api::DeleteRequest) -> Result { + Ok(Request::DeleteProcess( + ContainerProcess::new(&from.id, &from.exec_id).context("new process id")?, + )) + } +} + +impl TryFrom for Request { + type Error = anyhow::Error; + fn try_from(from: api::ExecProcessRequest) -> Result { + let spec = from.get_spec(); + Ok(Request::ExecProcess(ExecProcessRequest { + process: ContainerProcess::new(&from.id, &from.exec_id).context("new process id")?, + terminal: from.terminal, + stdin: (!from.stdin.is_empty()).then(|| from.stdin.clone()), + stdout: (!from.stdout.is_empty()).then(|| from.stdout.clone()), + stderr: (!from.stderr.is_empty()).then(|| from.stderr.clone()), + spec_type_url: spec.get_type_url().to_string(), + spec_value: spec.get_value().to_vec(), + })) + } +} + +impl TryFrom for Request { + type Error = anyhow::Error; + fn try_from(from: api::KillRequest) -> Result { + Ok(Request::KillProcess(KillRequest { + process: ContainerProcess::new(&from.id, &from.exec_id).context("new process id")?, + signal: from.signal, + all: from.all, + })) + } +} + +impl TryFrom for Request { + type Error = anyhow::Error; + fn try_from(from: api::WaitRequest) -> Result { + Ok(Request::WaitProcess( + 
ContainerProcess::new(&from.id, &from.exec_id).context("new process id")?, + )) + } +} + +impl TryFrom for Request { + type Error = anyhow::Error; + fn try_from(from: api::StartRequest) -> Result { + Ok(Request::StartProcess( + ContainerProcess::new(&from.id, &from.exec_id).context("new process id")?, + )) + } +} + +impl TryFrom for Request { + type Error = anyhow::Error; + fn try_from(from: api::StateRequest) -> Result { + Ok(Request::StateProcess( + ContainerProcess::new(&from.id, &from.exec_id).context("new process id")?, + )) + } +} + +impl TryFrom for Request { + type Error = anyhow::Error; + fn try_from(from: api::ShutdownRequest) -> Result { + Ok(Request::ShutdownContainer(ShutdownRequest { + container_id: from.id.to_string(), + is_now: from.now, + })) + } +} + +impl TryFrom for Request { + type Error = anyhow::Error; + fn try_from(from: api::ResizePtyRequest) -> Result { + Ok(Request::ResizeProcessPTY(ResizePTYRequest { + process: ContainerProcess::new(&from.id, &from.exec_id).context("new process id")?, + width: from.width, + height: from.height, + })) + } +} + +impl TryFrom for Request { + type Error = anyhow::Error; + fn try_from(from: api::PauseRequest) -> Result { + Ok(Request::PauseContainer(ContainerID::new(&from.id)?)) + } +} + +impl TryFrom for Request { + type Error = anyhow::Error; + fn try_from(from: api::ResumeRequest) -> Result { + Ok(Request::ResumeContainer(ContainerID::new(&from.id)?)) + } +} + +impl TryFrom for Request { + type Error = anyhow::Error; + fn try_from(from: api::StatsRequest) -> Result { + Ok(Request::StatsContainer(ContainerID::new(&from.id)?)) + } +} + +impl TryFrom for Request { + type Error = anyhow::Error; + fn try_from(from: api::UpdateTaskRequest) -> Result { + Ok(Request::UpdateContainer(UpdateRequest { + container_id: from.id.to_string(), + value: from.get_resources().get_value().to_vec(), + })) + } +} + +impl TryFrom for Request { + type Error = anyhow::Error; + fn try_from(_from: api::PidsRequest) -> Result { + 
Ok(Request::Pid) + } +} + +impl TryFrom for Request { + type Error = anyhow::Error; + fn try_from(from: api::ConnectRequest) -> Result { + Ok(Request::ConnectContainer(ContainerID::new(&from.id)?)) + } +} diff --git a/src/runtime-rs/crates/runtimes/common/src/types/trans_into_agent.rs b/src/runtime-rs/crates/runtimes/common/src/types/trans_into_agent.rs new file mode 100644 index 0000000000..f032fd70bc --- /dev/null +++ b/src/runtime-rs/crates/runtimes/common/src/types/trans_into_agent.rs @@ -0,0 +1,28 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::convert::From; + +use agent; + +use super::{ContainerID, ContainerProcess}; + +impl From for agent::ContainerID { + fn from(from: ContainerID) -> Self { + Self { + container_id: from.container_id, + } + } +} + +impl From for agent::ContainerProcessID { + fn from(from: ContainerProcess) -> Self { + Self { + container_id: from.container_id.into(), + exec_id: from.exec_id, + } + } +} diff --git a/src/runtime-rs/crates/runtimes/common/src/types/trans_into_shim.rs b/src/runtime-rs/crates/runtimes/common/src/types/trans_into_shim.rs new file mode 100644 index 0000000000..3c3134e8fd --- /dev/null +++ b/src/runtime-rs/crates/runtimes/common/src/types/trans_into_shim.rs @@ -0,0 +1,242 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{ + any::type_name, + convert::{Into, TryFrom, TryInto}, + time, +}; + +use anyhow::{anyhow, Result}; +use containerd_shim_protos::api; + +use super::{ProcessExitStatus, ProcessStateInfo, ProcessStatus, Response}; +use crate::error::Error; + +fn system_time_into(time: time::SystemTime) -> ::protobuf::well_known_types::Timestamp { + let mut proto_time = ::protobuf::well_known_types::Timestamp::new(); + proto_time.set_seconds( + time.duration_since(time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + .try_into() + 
.unwrap_or_default(), + ); + proto_time +} + +fn option_system_time_into( + time: Option, +) -> ::protobuf::SingularPtrField<::protobuf::well_known_types::Timestamp> { + match time { + Some(v) => ::protobuf::SingularPtrField::some(system_time_into(v)), + None => ::protobuf::SingularPtrField::none(), + } +} + +impl From for api::WaitResponse { + fn from(from: ProcessExitStatus) -> Self { + Self { + exit_status: from.exit_code as u32, + exited_at: option_system_time_into(from.exit_time), + ..Default::default() + } + } +} + +impl From for api::Status { + fn from(from: ProcessStatus) -> Self { + match from { + ProcessStatus::Unknown => api::Status::UNKNOWN, + ProcessStatus::Created => api::Status::CREATED, + ProcessStatus::Running => api::Status::RUNNING, + ProcessStatus::Stopped => api::Status::STOPPED, + ProcessStatus::Paused => api::Status::PAUSED, + ProcessStatus::Pausing => api::Status::PAUSING, + } + } +} +impl From for api::StateResponse { + fn from(from: ProcessStateInfo) -> Self { + Self { + id: from.container_id.clone(), + bundle: from.bundle.clone(), + pid: from.pid.pid, + status: from.status.into(), + stdin: from.stdin.unwrap_or_default(), + stdout: from.stdout.unwrap_or_default(), + stderr: from.stderr.unwrap_or_default(), + terminal: from.terminal, + exit_status: from.exit_status as u32, + exited_at: option_system_time_into(from.exited_at), + exec_id: from.exec_id, + ..Default::default() + } + } +} + +impl From for api::DeleteResponse { + fn from(from: ProcessStateInfo) -> Self { + Self { + pid: from.pid.pid, + exit_status: from.exit_status as u32, + exited_at: option_system_time_into(from.exited_at), + ..Default::default() + } + } +} + +impl TryFrom for api::CreateTaskResponse { + type Error = anyhow::Error; + fn try_from(from: Response) -> Result { + match from { + Response::CreateContainer(resp) => Ok(Self { + pid: resp.pid, + ..Default::default() + }), + _ => Err(anyhow!(Error::UnexpectedResponse( + from, + type_name::().to_string() + ))), + } + } +} 
+ +impl TryFrom for api::DeleteResponse { + type Error = anyhow::Error; + fn try_from(from: Response) -> Result { + match from { + Response::DeleteProcess(resp) => Ok(resp.into()), + _ => Err(anyhow!(Error::UnexpectedResponse( + from, + type_name::().to_string() + ))), + } + } +} + +impl TryFrom for api::WaitResponse { + type Error = anyhow::Error; + fn try_from(from: Response) -> Result { + match from { + Response::WaitProcess(resp) => Ok(resp.into()), + _ => Err(anyhow!(Error::UnexpectedResponse( + from, + type_name::().to_string() + ))), + } + } +} + +impl TryFrom for api::StartResponse { + type Error = anyhow::Error; + fn try_from(from: Response) -> Result { + match from { + Response::StartProcess(resp) => Ok(api::StartResponse { + pid: resp.pid, + ..Default::default() + }), + _ => Err(anyhow!(Error::UnexpectedResponse( + from, + type_name::().to_string() + ))), + } + } +} + +impl TryFrom for api::StateResponse { + type Error = anyhow::Error; + fn try_from(from: Response) -> Result { + match from { + Response::StateProcess(resp) => Ok(resp.into()), + _ => Err(anyhow!(Error::UnexpectedResponse( + from, + type_name::().to_string() + ))), + } + } +} + +impl TryFrom for api::StatsResponse { + type Error = anyhow::Error; + fn try_from(from: Response) -> Result { + let mut any = ::protobuf::well_known_types::Any::new(); + let mut response = api::StatsResponse::new(); + match from { + Response::StatsContainer(resp) => { + if let Some(value) = resp.value { + any.set_type_url(value.type_url); + any.set_value(value.value); + response.set_stats(any); + } + Ok(response) + } + _ => Err(anyhow!(Error::UnexpectedResponse( + from, + type_name::().to_string() + ))), + } + } +} + +impl TryFrom for api::PidsResponse { + type Error = anyhow::Error; + fn try_from(from: Response) -> Result { + match from { + Response::Pid(resp) => { + let mut processes: Vec = vec![]; + let mut p_info = api::ProcessInfo::new(); + let mut res = api::PidsResponse::new(); + p_info.set_pid(resp.pid); + 
processes.push(p_info); + let v = protobuf::RepeatedField::::from_vec(processes); + res.set_processes(v); + Ok(res) + } + _ => Err(anyhow!(Error::UnexpectedResponse( + from, + type_name::().to_string() + ))), + } + } +} + +impl TryFrom for api::ConnectResponse { + type Error = anyhow::Error; + fn try_from(from: Response) -> Result { + match from { + Response::ConnectContainer(resp) => { + let mut res = api::ConnectResponse::new(); + res.set_shim_pid(resp.pid); + Ok(res) + } + _ => Err(anyhow!(Error::UnexpectedResponse( + from, + type_name::().to_string() + ))), + } + } +} + +impl TryFrom for api::Empty { + type Error = anyhow::Error; + fn try_from(from: Response) -> Result { + match from { + Response::CloseProcessIO => Ok(api::Empty::new()), + Response::ExecProcess => Ok(api::Empty::new()), + Response::KillProcess => Ok(api::Empty::new()), + Response::ShutdownContainer => Ok(api::Empty::new()), + Response::PauseContainer => Ok(api::Empty::new()), + Response::ResumeContainer => Ok(api::Empty::new()), + Response::ResizeProcessPTY => Ok(api::Empty::new()), + Response::UpdateContainer => Ok(api::Empty::new()), + _ => Err(anyhow!(Error::UnexpectedResponse( + from, + type_name::().to_string() + ))), + } + } +} diff --git a/src/runtime-rs/crates/runtimes/linux_container/Cargo.toml b/src/runtime-rs/crates/runtimes/linux_container/Cargo.toml new file mode 100644 index 0000000000..58e6f6012c --- /dev/null +++ b/src/runtime-rs/crates/runtimes/linux_container/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "linux_container" +version = "0.1.0" +authors = ["The Kata Containers community "] +edition = "2018" + +[dependencies] +anyhow = "^1.0" +async-trait = "0.1.48" +tokio = { version = "1.8.0" } + +common = { path = "../common" } +kata-types = { path = "../../../../libs/kata-types" } \ No newline at end of file diff --git a/src/runtime-rs/crates/runtimes/linux_container/src/lib.rs b/src/runtime-rs/crates/runtimes/linux_container/src/lib.rs new file mode 100644 index 
0000000000..582b4e961f --- /dev/null +++ b/src/runtime-rs/crates/runtimes/linux_container/src/lib.rs @@ -0,0 +1,44 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// +use std::sync::Arc; + +use anyhow::Result; +use async_trait::async_trait; +use common::{message::Message, RuntimeHandler, RuntimeInstance}; +use kata_types::config::TomlConfig; +use tokio::sync::mpsc::Sender; + +unsafe impl Send for LinuxContainer {} +unsafe impl Sync for LinuxContainer {} +pub struct LinuxContainer {} + +#[async_trait] +impl RuntimeHandler for LinuxContainer { + fn init() -> Result<()> { + Ok(()) + } + + fn name() -> String { + "linux_container".to_string() + } + + fn new_handler() -> Arc { + Arc::new(LinuxContainer {}) + } + + async fn new_instance( + &self, + _sid: &str, + _msg_sender: Sender, + _config: Arc, + ) -> Result { + todo!() + } + + fn cleanup(&self, _id: &str) -> Result<()> { + todo!() + } +} diff --git a/src/runtime-rs/crates/runtimes/src/lib.rs b/src/runtime-rs/crates/runtimes/src/lib.rs new file mode 100644 index 0000000000..64c57feeae --- /dev/null +++ b/src/runtime-rs/crates/runtimes/src/lib.rs @@ -0,0 +1,13 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +#[macro_use] +extern crate slog; + +logging::logger_with_subsystem!(sl, "runtimes"); + +mod manager; +pub use manager::RuntimeHandlerManager; diff --git a/src/runtime-rs/crates/runtimes/src/manager.rs b/src/runtime-rs/crates/runtimes/src/manager.rs new file mode 100644 index 0000000000..10a4a427bb --- /dev/null +++ b/src/runtime-rs/crates/runtimes/src/manager.rs @@ -0,0 +1,284 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::sync::Arc; + +use anyhow::{anyhow, Context, Result}; +use common::{ + message::Message, + types::{Request, Response}, + RuntimeHandler, 
RuntimeInstance, +}; +use kata_types::{annotations::Annotation, config::TomlConfig}; +use tokio::sync::{mpsc::Sender, RwLock}; + +#[cfg(feature = "linux")] +use linux_container::LinuxContainer; +#[cfg(feature = "virt")] +use virt_container::VirtContainer; +#[cfg(feature = "wasm")] +use wasm_container::WasmContainer; + +struct RuntimeHandlerManagerInner { + id: String, + msg_sender: Sender, + runtime_instance: Option>, +} + +impl RuntimeHandlerManagerInner { + fn new(id: &str, msg_sender: Sender) -> Result { + Ok(Self { + id: id.to_string(), + msg_sender, + runtime_instance: None, + }) + } + + async fn init_runtime_handler( + &mut self, + netns: Option, + config: Arc, + ) -> Result<()> { + info!(sl!(), "new runtime handler {}", &config.runtime.name); + let runtime_handler = match config.runtime.name.as_str() { + #[cfg(feature = "linux")] + name if name == LinuxContainer::name() => LinuxContainer::new_handler(), + #[cfg(feature = "wasm")] + name if name == WasmContainer::name() => WasmContainer::new_handler(), + #[cfg(feature = "virt")] + name if name == VirtContainer::name() || name.is_empty() => { + VirtContainer::new_handler() + } + _ => return Err(anyhow!("Unsupported runtime: {}", &config.runtime.name)), + }; + let runtime_instance = runtime_handler + .new_instance(&self.id, self.msg_sender.clone(), config) + .await + .context("new runtime instance")?; + + // start sandbox + runtime_instance + .sandbox + .start(netns) + .await + .context("start sandbox")?; + self.runtime_instance = Some(Arc::new(runtime_instance)); + Ok(()) + } + + async fn try_init(&mut self, spec: &oci::Spec) -> Result<()> { + // return if runtime instance has init + if self.runtime_instance.is_some() { + return Ok(()); + } + + #[cfg(feature = "linux")] + LinuxContainer::init().context("init linux container")?; + #[cfg(feature = "wasm")] + WasmContainer::init().context("init wasm container")?; + #[cfg(feature = "virt")] + VirtContainer::init().context("init virt container")?; + + let netns = 
if let Some(linux) = &spec.linux { + let mut netns = None; + for ns in &linux.namespaces { + if ns.r#type.as_str() != oci::NETWORKNAMESPACE { + continue; + } + + if !ns.path.is_empty() { + netns = Some(ns.path.clone()); + break; + } + } + netns + } else { + None + }; + + let config = load_config(spec).context("load config")?; + self.init_runtime_handler(netns, Arc::new(config)) + .await + .context("init runtime handler")?; + + Ok(()) + } + + fn get_runtime_instance(&self) -> Option> { + self.runtime_instance.clone() + } +} + +unsafe impl Send for RuntimeHandlerManager {} +unsafe impl Sync for RuntimeHandlerManager {} +pub struct RuntimeHandlerManager { + inner: Arc>, +} + +impl RuntimeHandlerManager { + pub async fn new(id: &str, msg_sender: Sender) -> Result { + Ok(Self { + inner: Arc::new(RwLock::new(RuntimeHandlerManagerInner::new( + id, msg_sender, + )?)), + }) + } + + pub fn cleanup(_id: &str) -> Result<()> { + // TODO: load runtime from persist and cleanup + Ok(()) + } + + async fn get_runtime_instance(&self) -> Result> { + let inner = self.inner.read().await; + inner + .get_runtime_instance() + .ok_or_else(|| anyhow!("runtime not ready")) + } + + async fn try_init_runtime_instance(&self, spec: &oci::Spec) -> Result<()> { + let mut inner = self.inner.write().await; + inner.try_init(spec).await + } + + pub async fn handler_message(&self, req: Request) -> Result { + if let Request::CreateContainer(req) = req { + // get oci spec + let bundler_path = format!("{}/{}", req.bundle, oci::OCI_SPEC_CONFIG_FILE_NAME); + let spec = oci::Spec::load(&bundler_path).context("load spec")?; + + self.try_init_runtime_instance(&spec) + .await + .context("try init runtime instance")?; + let instance = self + .get_runtime_instance() + .await + .context("get runtime instance")?; + + let shim_pid = instance + .container_manager + .create_container(req, spec) + .await + .context("create container")?; + Ok(Response::CreateContainer(shim_pid)) + } else { + 
self.handler_request(req).await.context("handler request") + } + } + + pub async fn handler_request(&self, req: Request) -> Result { + let instance = self + .get_runtime_instance() + .await + .context("get runtime instance")?; + let sandbox = instance.sandbox.clone(); + let cm = instance.container_manager.clone(); + + match req { + Request::CreateContainer(req) => Err(anyhow!("Unreachable request {:?}", req)), + Request::CloseProcessIO(process_id) => { + cm.close_process_io(&process_id).await.context("close io")?; + Ok(Response::CloseProcessIO) + } + Request::DeleteProcess(process_id) => { + let resp = cm.delete_process(&process_id).await.context("do delete")?; + Ok(Response::DeleteProcess(resp)) + } + Request::ExecProcess(req) => { + cm.exec_process(req).await.context("exec")?; + Ok(Response::ExecProcess) + } + Request::KillProcess(req) => { + cm.kill_process(&req).await.context("kill process")?; + Ok(Response::KillProcess) + } + Request::ShutdownContainer(req) => { + if cm.need_shutdown_sandbox(&req).await { + sandbox.shutdown().await.context("do shutdown")?; + } + Ok(Response::ShutdownContainer) + } + Request::WaitProcess(process_id) => { + let exit_status = cm.wait_process(&process_id).await.context("wait process")?; + if cm.is_sandbox_container(&process_id).await { + sandbox.stop().await.context("stop sandbox")?; + } + Ok(Response::WaitProcess(exit_status)) + } + Request::StartProcess(process_id) => { + let shim_pid = cm + .start_process(&process_id) + .await + .context("start process")?; + Ok(Response::StartProcess(shim_pid)) + } + + Request::StateProcess(process_id) => { + let state = cm + .state_process(&process_id) + .await + .context("state process")?; + Ok(Response::StateProcess(state)) + } + Request::PauseContainer(container_id) => { + cm.pause_container(&container_id) + .await + .context("pause container")?; + Ok(Response::PauseContainer) + } + Request::ResumeContainer(container_id) => { + cm.resume_container(&container_id) + .await + .context("resume 
container")?; + Ok(Response::ResumeContainer) + } + Request::ResizeProcessPTY(req) => { + cm.resize_process_pty(&req).await.context("resize pty")?; + Ok(Response::ResizeProcessPTY) + } + Request::StatsContainer(container_id) => { + let stats = cm + .stats_container(&container_id) + .await + .context("stats container")?; + Ok(Response::StatsContainer(stats)) + } + Request::UpdateContainer(req) => { + cm.update_container(req).await.context("update container")?; + Ok(Response::UpdateContainer) + } + Request::Pid => Ok(Response::Pid(cm.pid().await.context("pid")?)), + Request::ConnectContainer(container_id) => Ok(Response::ConnectContainer( + cm.connect_container(&container_id) + .await + .context("connect")?, + )), + } + } +} + +/// Config override ordering(high to low): +/// 1. podsandbox annotation +/// 2. shimv2 create task option +/// TODO: https://github.com/kata-containers/kata-containers/issues/3961 +/// 3. environment +fn load_config(spec: &oci::Spec) -> Result { + const KATA_CONF_FILE: &str = "KATA_CONF_FILE"; + let annotation = Annotation::new(spec.annotations.clone()); + let config_path = if let Some(path) = annotation.get_sandbox_config_path() { + path + } else if let Ok(path) = std::env::var(KATA_CONF_FILE) { + path + } else { + String::from("") + }; + info!(sl!(), "get config path {:?}", &config_path); + let (mut toml_config, _) = + TomlConfig::load_from_file(&config_path).context("load toml config")?; + annotation.update_config_by_annotation(&mut toml_config)?; + info!(sl!(), "get config content {:?}", &toml_config); + Ok(toml_config) +} diff --git a/src/runtime-rs/crates/runtimes/virt_container/Cargo.toml b/src/runtime-rs/crates/runtimes/virt_container/Cargo.toml new file mode 100644 index 0000000000..8ce387f2af --- /dev/null +++ b/src/runtime-rs/crates/runtimes/virt_container/Cargo.toml @@ -0,0 +1,33 @@ +[package] +name = "virt_container" +version = "0.1.0" +authors = ["The Kata Containers community "] +edition = "2018" + +[dependencies] +anyhow = 
"^1.0" +async-trait = "0.1.48" +awaitgroup = "0.6.0" +containerd-shim-protos = { version = "0.2.0", features = ["async"]} +futures = "0.3.19" +lazy_static = "1.4.0" +libc = ">=0.2.39" +nix = "0.16.0" +protobuf = "2.27.0" +serde = { version = "1.0.100", features = ["derive"] } +serde_derive = "1.0.27" +serde_json = "1.0.39" +slog = "2.5.2" +slog-scope = "4.4.0" +tokio = { version = "1.8.0" } +toml = "0.4.2" +url = "2.1.1" + +agent = { path = "../../agent" } +common = { path = "../common" } +hypervisor = { path = "../../hypervisor" } +kata-sys-util = { path = "../../../../libs/kata-sys-util" } +kata-types = { path = "../../../../libs/kata-types" } +logging = { path = "../../../../libs/logging"} +oci = { path = "../../../../libs/oci" } +resource = { path = "../../resource" } diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/container.rs b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/container.rs new file mode 100644 index 0000000000..2d414318bd --- /dev/null +++ b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/container.rs @@ -0,0 +1,485 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::sync::Arc; + +use agent::Agent; +use anyhow::{anyhow, Context, Result}; +use common::{ + error::Error, + types::{ + ContainerConfig, ContainerID, ContainerProcess, ProcessStateInfo, ProcessStatus, + ProcessType, + }, +}; +use oci::{LinuxResources, Process as OCIProcess}; +use resource::ResourceManager; +use tokio::sync::RwLock; + +use super::{ + process::{Process, ProcessWatcher}, + ContainerInner, +}; +use crate::container_manager::logger_with_process; + +pub struct Exec { + pub(crate) process: Process, + pub(crate) oci_process: OCIProcess, +} + +pub struct Container { + pid: u32, + pub container_id: ContainerID, + config: ContainerConfig, + inner: Arc>, + agent: Arc, + resource_manager: Arc, + logger: slog::Logger, +} 
+ +impl Container { + pub fn new( + pid: u32, + config: ContainerConfig, + agent: Arc, + resource_manager: Arc, + ) -> Result { + let container_id = ContainerID::new(&config.container_id).context("new container id")?; + let logger = sl!().new(o!("container_id" => config.container_id.clone())); + let process = ContainerProcess::new(&config.container_id, "")?; + let init_process = Process::new( + &process, + pid, + &config.bundle, + config.stdin.clone(), + config.stdout.clone(), + config.stderr.clone(), + config.terminal, + ); + + Ok(Self { + pid, + container_id, + config, + inner: Arc::new(RwLock::new(ContainerInner::new( + agent.clone(), + init_process, + logger.clone(), + ))), + agent, + resource_manager, + logger, + }) + } + + pub async fn create(&self, mut spec: oci::Spec) -> Result<()> { + // process oci spec + let mut inner = self.inner.write().await; + let toml_config = self.resource_manager.config().await; + let config = &self.config; + let sandbox_pidns = amend_spec(&mut spec, toml_config.runtime.disable_guest_seccomp) + .context("load spec")?; + // handler rootfs + let rootfs = self + .resource_manager + .handler_rootfs(&config.container_id, &config.bundle, &config.rootfs_mounts) + .await + .context("handler rootfs")?; + + // update rootfs + match spec.root.as_mut() { + Some(spec) => { + spec.path = rootfs + .get_guest_rootfs_path() + .await + .context("get guest rootfs path")? 
+ } + None => return Err(anyhow!("spec miss root field")), + }; + inner.rootfs.push(rootfs); + + // handler volumes + let volumes = self + .resource_manager + .handler_volumes(&config.container_id, &spec.mounts) + .await + .context("handler volumes")?; + let mut oci_mounts = vec![]; + let mut storages = vec![]; + for v in volumes { + let mut volume_mounts = v.get_volume_mount().context("get volume mount")?; + if !volume_mounts.is_empty() { + oci_mounts.append(&mut volume_mounts); + } + + let mut s = v.get_storage().context("get storage")?; + if !s.is_empty() { + storages.append(&mut s); + } + inner.volumes.push(v); + } + spec.mounts = oci_mounts; + + // TODO: handler device + + // update cgroups + self.resource_manager + .update_cgroups( + &config.container_id, + spec.linux + .as_ref() + .and_then(|linux| linux.resources.as_ref()), + ) + .await?; + + // create container + let r = agent::CreateContainerRequest { + process_id: agent::ContainerProcessID::new(&config.container_id, ""), + string_user: None, + devices: vec![], + storages, + oci: Some(spec), + guest_hooks: None, + sandbox_pidns, + rootfs_mounts: vec![], + }; + + self.agent + .create_container(r) + .await + .context("agent create container")?; + self.resource_manager.dump().await; + Ok(()) + } + + pub async fn start(&self, process: &ContainerProcess) -> Result<()> { + let mut inner = self.inner.write().await; + match process.process_type { + ProcessType::Container => { + if let Err(err) = inner.start_container(&process.container_id).await { + let _ = inner.stop_process(process, true).await; + return Err(err); + } + + let container_io = inner.new_container_io(process).await?; + inner + .init_process + .start_io_and_wait(self.agent.clone(), container_io) + .await?; + } + ProcessType::Exec => { + if let Err(e) = inner.start_exec_process(process).await { + let _ = inner.stop_process(process, true).await; + return Err(e).context("enter process"); + } + + let container_io = 
inner.new_container_io(process).await.context("io stream")?; + + { + let exec = inner + .exec_processes + .get(&process.exec_id) + .ok_or_else(|| Error::ProcessNotFound(process.clone()))?; + if exec.process.height != 0 && exec.process.width != 0 { + inner + .win_resize_process(process, exec.process.height, exec.process.width) + .await + .context("win resize")?; + } + } + + // start io and wait + { + let exec = inner + .exec_processes + .get_mut(&process.exec_id) + .ok_or_else(|| Error::ProcessNotFound(process.clone()))?; + + exec.process + .start_io_and_wait(self.agent.clone(), container_io) + .await + .context("start io and wait")?; + } + } + } + + Ok(()) + } + + pub async fn delete_exec_process(&self, container_process: &ContainerProcess) -> Result<()> { + let mut inner = self.inner.write().await; + inner + .delete_exec_process(&container_process.exec_id) + .await + .context("delete process") + } + + pub async fn state_process( + &self, + container_process: &ContainerProcess, + ) -> Result { + let inner = self.inner.read().await; + match container_process.process_type { + ProcessType::Container => inner.init_process.state().await, + ProcessType::Exec => { + let exec = inner + .exec_processes + .get(&container_process.exec_id) + .ok_or_else(|| Error::ProcessNotFound(container_process.clone()))?; + exec.process.state().await + } + } + } + + pub async fn wait_process( + &self, + container_process: &ContainerProcess, + ) -> Result { + let logger = logger_with_process(container_process); + info!(logger, "start wait process"); + + let inner = self.inner.read().await; + inner + .fetch_exit_watcher(container_process) + .context("fetch exit watcher") + } + + pub async fn kill_process( + &self, + container_process: &ContainerProcess, + signal: u32, + all: bool, + ) -> Result<()> { + let inner = self.inner.read().await; + inner.signal_process(container_process, signal, all).await + } + + pub async fn exec_process( + &self, + container_process: &ContainerProcess, + stdin: 
Option, + stdout: Option, + stderr: Option, + terminal: bool, + oci_process: OCIProcess, + ) -> Result<()> { + let process = Process::new( + container_process, + self.pid, + &self.config.bundle, + stdin, + stdout, + stderr, + terminal, + ); + let exec = Exec { + process, + oci_process, + }; + let mut inner = self.inner.write().await; + inner.add_exec_process(&container_process.exec_id, exec); + Ok(()) + } + + pub async fn close_io(&self, container_process: &ContainerProcess) -> Result<()> { + let mut inner = self.inner.write().await; + inner.close_io(container_process).await + } + + pub async fn stop_process(&self, container_process: &ContainerProcess) -> Result<()> { + let mut inner = self.inner.write().await; + inner + .stop_process(container_process, true) + .await + .context("stop process") + } + + pub async fn pause(&self) -> Result<()> { + let inner = self.inner.read().await; + if inner.init_process.get_status().await == ProcessStatus::Paused { + warn!(self.logger, "container is paused no need to pause"); + return Ok(()); + } + self.agent + .pause_container(self.container_id.clone().into()) + .await + .context("agent pause container")?; + Ok(()) + } + + pub async fn resume(&self) -> Result<()> { + let inner = self.inner.read().await; + if inner.init_process.get_status().await == ProcessStatus::Running { + warn!(self.logger, "container is running no need to resume"); + return Ok(()); + } + self.agent + .resume_container(self.container_id.clone().into()) + .await + .context("agent pause container")?; + Ok(()) + } + + pub async fn resize_pty( + &self, + process: &ContainerProcess, + width: u32, + height: u32, + ) -> Result<()> { + let logger = logger_with_process(process); + let inner = self.inner.read().await; + if inner.init_process.get_status().await != ProcessStatus::Running { + warn!(logger, "container is not running"); + return Ok(()); + } + self.agent + .tty_win_resize(agent::TtyWinResizeRequest { + process_id: process.clone().into(), + row: height, + 
column: width, + }) + .await + .context("resize pty")?; + Ok(()) + } + + pub async fn stats(&self) -> Result> { + let stats_resp = self + .agent + .stats_container(self.container_id.clone().into()) + .await + .context("agent stats container")?; + Ok(Some(stats_resp)) + } + + pub async fn update(&self, resources: &LinuxResources) -> Result<()> { + self.resource_manager + .update_cgroups(&self.config.container_id, Some(resources)) + .await?; + + let req = agent::UpdateContainerRequest { + container_id: self.container_id.container_id.clone(), + resources: resources.clone(), + mounts: Vec::new(), + }; + self.agent + .update_container(req) + .await + .context("agent update container")?; + Ok(()) + } +} + +fn amend_spec(spec: &mut oci::Spec, disable_guest_seccomp: bool) -> Result { + // hook should be done on host + spec.hooks = None; + + if let Some(linux) = spec.linux.as_mut() { + if disable_guest_seccomp { + linux.seccomp = None; + } + + if let Some(resource) = linux.resources.as_mut() { + resource.devices = Vec::new(); + resource.pids = None; + resource.block_io = None; + resource.hugepage_limits = Vec::new(); + resource.network = None; + } + + // Host pidns path does not make sense in kata. Let's just align it with + // sandbox namespace whenever it is set. + let mut ns: Vec = Vec::new(); + for n in linux.namespaces.iter() { + match n.r#type.as_str() { + oci::PIDNAMESPACE | oci::NETWORKNAMESPACE => continue, + _ => ns.push(n.clone()), + } + } + + linux.namespaces = ns; + + return Ok(handle_pid_namespace(&linux.namespaces)); + } + + Ok(false) +} + +// handle_pid_namespace checks if Pid namespace for a container needs to be shared with its sandbox +// pid namespace. 
+fn handle_pid_namespace(namespaces: &[oci::LinuxNamespace]) -> bool { + for n in namespaces.iter() { + match n.r#type.as_str() { + oci::PIDNAMESPACE => { + if !n.path.is_empty() { + return true; + } + } + _ => continue, + } + } + false +} + +#[cfg(test)] +mod tests { + use super::amend_spec; + use crate::container_manager::container::handle_pid_namespace; + #[test] + fn test_amend_spec_disable_guest_seccomp() { + let mut spec = oci::Spec { + linux: Some(oci::Linux { + seccomp: Some(oci::LinuxSeccomp::default()), + ..Default::default() + }), + ..Default::default() + }; + + assert!(spec.linux.as_ref().unwrap().seccomp.is_some()); + + // disable_guest_seccomp = false + amend_spec(&mut spec, false).unwrap(); + assert!(spec.linux.as_ref().unwrap().seccomp.is_some()); + + // disable_guest_seccomp = true + amend_spec(&mut spec, true).unwrap(); + assert!(spec.linux.as_ref().unwrap().seccomp.is_none()); + } + #[test] + fn test_handle_pid_namespace() { + let namespaces = vec![ + oci::LinuxNamespace { + r#type: "pid".to_string(), + path: "".to_string(), + }, + oci::LinuxNamespace { + r#type: "network".to_string(), + path: "".to_string(), + }, + oci::LinuxNamespace { + r#type: "ipc".to_string(), + path: "".to_string(), + }, + oci::LinuxNamespace { + r#type: "uts".to_string(), + path: "".to_string(), + }, + oci::LinuxNamespace { + r#type: "mount".to_string(), + path: "".to_string(), + }, + oci::LinuxNamespace { + r#type: "user".to_string(), + path: "".to_string(), + }, + oci::LinuxNamespace { + r#type: "cgroup".to_string(), + path: "".to_string(), + }, + ]; + assert!(!handle_pid_namespace(&namespaces)); + } +} diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/container_inner.rs b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/container_inner.rs new file mode 100644 index 0000000000..6cfaef7ff3 --- /dev/null +++ b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/container_inner.rs @@ -0,0 +1,271 @@ +// 
Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{collections::HashMap, sync::Arc}; + +use agent::Agent; +use anyhow::{anyhow, Context, Result}; +use common::{ + error::Error, + types::{ContainerID, ContainerProcess, ProcessExitStatus, ProcessStatus, ProcessType}, +}; +use nix::sys::signal::Signal; +use resource::{rootfs::Rootfs, volume::Volume}; +use tokio::sync::RwLock; + +use crate::container_manager::logger_with_process; + +use super::{ + io::ContainerIo, + process::{Process, ProcessWatcher}, + Exec, +}; + +pub struct ContainerInner { + agent: Arc, + logger: slog::Logger, + pub(crate) init_process: Process, + pub(crate) exec_processes: HashMap, + pub(crate) rootfs: Vec>, + pub(crate) volumes: Vec>, +} + +impl ContainerInner { + pub(crate) fn new(agent: Arc, init_process: Process, logger: slog::Logger) -> Self { + Self { + agent, + logger, + init_process, + exec_processes: HashMap::new(), + rootfs: vec![], + volumes: vec![], + } + } + + fn container_id(&self) -> &str { + self.init_process.process.container_id() + } + + pub(crate) async fn check_state(&self, states: Vec) -> Result<()> { + let state = self.init_process.get_status().await; + if states.contains(&state) { + return Ok(()); + } + + Err(anyhow!( + "failed to check state {:?} for {:?}", + state, + states + )) + } + + pub(crate) async fn set_state(&mut self, state: ProcessStatus) { + let mut status = self.init_process.status.write().await; + *status = state; + } + + pub(crate) async fn start_exec_process(&mut self, process: &ContainerProcess) -> Result<()> { + let exec = self + .exec_processes + .get_mut(&process.exec_id) + .ok_or_else(|| Error::ProcessNotFound(process.clone()))?; + + self.agent + .exec_process(agent::ExecProcessRequest { + process_id: process.clone().into(), + string_user: None, + process: Some(exec.oci_process.clone()), + }) + .await + .context("exec process")?; + 
exec.process.set_status(ProcessStatus::Running).await; + Ok(()) + } + + pub(crate) async fn win_resize_process( + &self, + process: &ContainerProcess, + height: u32, + width: u32, + ) -> Result<()> { + self.check_state(vec![ProcessStatus::Created, ProcessStatus::Running]) + .await + .context("check state")?; + + self.agent + .tty_win_resize(agent::TtyWinResizeRequest { + process_id: process.clone().into(), + row: height, + column: width, + }) + .await?; + Ok(()) + } + + pub fn fetch_exit_watcher(&self, process: &ContainerProcess) -> Result { + match process.process_type { + ProcessType::Container => self.init_process.fetch_exit_watcher(), + ProcessType::Exec => { + let exec = self + .exec_processes + .get(&process.exec_id) + .ok_or_else(|| Error::ProcessNotFound(process.clone()))?; + exec.process.fetch_exit_watcher() + } + } + } + + pub(crate) async fn start_container(&mut self, cid: &ContainerID) -> Result<()> { + self.check_state(vec![ProcessStatus::Created, ProcessStatus::Stopped]) + .await + .context("check state")?; + + self.agent + .start_container(agent::ContainerID { + container_id: cid.container_id.clone(), + }) + .await + .context("start container")?; + + self.set_state(ProcessStatus::Running).await; + + Ok(()) + } + + async fn get_exit_status(&self) -> Arc> { + self.init_process.exit_status.clone() + } + + pub(crate) fn add_exec_process(&mut self, id: &str, exec: Exec) -> Option { + self.exec_processes.insert(id.to_string(), exec) + } + + pub(crate) async fn delete_exec_process(&mut self, eid: &str) -> Result<()> { + match self.exec_processes.remove(eid) { + Some(_) => { + debug!(self.logger, " delete process eid {}", eid); + Ok(()) + } + None => Err(anyhow!( + "failed to find cid {} eid {}", + self.container_id(), + eid + )), + } + } + + async fn cleanup_container(&mut self, cid: &str, force: bool) -> Result<()> { + // wait until the container process + // terminated and the status write lock released. 
+ info!(self.logger, "wait on container terminated"); + let exit_status = self.get_exit_status().await; + let _locked_exit_status = exit_status.read().await; + info!(self.logger, "container terminated"); + let timeout: u32 = 10; + self.agent + .remove_container(agent::RemoveContainerRequest::new(cid, timeout)) + .await + .or_else(|e| { + if force { + warn!( + self.logger, + "stop container: agent remove container failed: {}", e + ); + Ok(agent::Empty::new()) + } else { + Err(e) + } + })?; + + // close the exit channel to wakeup wait service + // send to notify watchers who are waiting for the process exit + self.init_process.stop().await; + Ok(()) + } + + pub(crate) async fn stop_process( + &mut self, + process: &ContainerProcess, + force: bool, + ) -> Result<()> { + let logger = logger_with_process(process); + info!(logger, "begin to stop process"); + + // do not stop again when state stopped, may cause multi cleanup resource + let state = self.init_process.get_status().await; + if state == ProcessStatus::Stopped { + return Ok(()); + } + + self.check_state(vec![ProcessStatus::Running]) + .await + .context("check state")?; + + // if use force mode to stop container, stop always successful + // send kill signal to container + // ignore the error of sending signal, since the process would + // have been killed and exited yet. + self.signal_process(process, Signal::SIGKILL as u32, false) + .await + .map_err(|e| { + warn!(logger, "failed to signal kill. 
{:?}", e); + }) + .ok(); + + match process.process_type { + ProcessType::Container => self + .cleanup_container(&process.container_id.container_id, force) + .await + .context("stop container")?, + ProcessType::Exec => { + let exec = self + .exec_processes + .get_mut(&process.exec_id) + .ok_or_else(|| anyhow!("failed to find exec"))?; + exec.process.stop().await; + } + } + + Ok(()) + } + + pub(crate) async fn signal_process( + &self, + process: &ContainerProcess, + signal: u32, + all: bool, + ) -> Result<()> { + let mut process_id: agent::ContainerProcessID = process.clone().into(); + if all { + // force signal init process + process_id.exec_id.clear(); + }; + + self.agent + .signal_process(agent::SignalProcessRequest { process_id, signal }) + .await?; + Ok(()) + } + + pub async fn new_container_io(&self, process: &ContainerProcess) -> Result { + Ok(ContainerIo::new(self.agent.clone(), process.clone())) + } + + pub async fn close_io(&mut self, process: &ContainerProcess) -> Result<()> { + match process.process_type { + ProcessType::Container => self.init_process.close_io().await, + ProcessType::Exec => { + let exec = self + .exec_processes + .get_mut(&process.exec_id) + .ok_or_else(|| Error::ProcessNotFound(process.clone()))?; + exec.process.close_io().await; + } + }; + + Ok(()) + } +} diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/io/container_io.rs b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/io/container_io.rs new file mode 100644 index 0000000000..c211e8bca4 --- /dev/null +++ b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/io/container_io.rs @@ -0,0 +1,171 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{ + future::Future, + io, + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use agent::Agent; +use anyhow::Result; +use common::types::ContainerProcess; +use tokio::io::{AsyncRead, 
AsyncWrite, ReadBuf}; + +struct ContainerIoInfo { + pub agent: Arc, + pub process: ContainerProcess, +} + +pub struct ContainerIo { + pub stdin: Box, + pub stdout: Box, + pub stderr: Box, +} + +impl ContainerIo { + pub fn new(agent: Arc, process: ContainerProcess) -> Self { + let info = Arc::new(ContainerIoInfo { agent, process }); + + Self { + stdin: Box::new(ContainerIoWrite::new(info.clone())), + stdout: Box::new(ContainerIoRead::new(info.clone(), true)), + stderr: Box::new(ContainerIoRead::new(info, false)), + } + } +} + +struct ContainerIoWrite<'inner> { + pub info: Arc, + write_future: + Option> + Send + 'inner>>>, +} + +impl<'inner> ContainerIoWrite<'inner> { + pub fn new(info: Arc) -> Self { + Self { + info, + write_future: Default::default(), + } + } + + fn poll_write_inner( + &'inner mut self, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + let mut write_future = self.write_future.take(); + if write_future.is_none() { + let req = agent::WriteStreamRequest { + process_id: self.info.process.clone().into(), + data: buf.to_vec(), + }; + write_future = Some(Box::pin(self.info.agent.write_stdin(req))); + } + + let mut write_future = write_future.unwrap(); + match write_future.as_mut().poll(cx) { + Poll::Ready(v) => match v { + Ok(resp) => Poll::Ready(Ok(resp.length as usize)), + Err(err) => Poll::Ready(Err(std::io::Error::new(std::io::ErrorKind::Other, err))), + }, + Poll::Pending => { + self.write_future = Some(write_future); + Poll::Pending + } + } + } +} + +impl<'inner> AsyncWrite for ContainerIoWrite<'inner> { + fn poll_write( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + let me = unsafe { + std::mem::transmute::<&mut ContainerIoWrite<'_>, &mut ContainerIoWrite<'inner>>( + &mut *self, + ) + }; + me.poll_write_inner(cx, buf) + } + + fn poll_flush(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn poll_shutdown(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { + 
Poll::Ready(Ok(())) + } +} + +type ResultBuffer = Result; +struct ContainerIoRead<'inner> { + pub info: Arc, + is_stdout: bool, + read_future: Option + Send + 'inner>>>, +} + +impl<'inner> ContainerIoRead<'inner> { + pub fn new(info: Arc, is_stdout: bool) -> Self { + Self { + info, + is_stdout, + read_future: Default::default(), + } + } + fn poll_read_inner( + &'inner mut self, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + let mut read_future = self.read_future.take(); + if read_future.is_none() { + let req = agent::ReadStreamRequest { + process_id: self.info.process.clone().into(), + len: buf.remaining() as u32, + }; + read_future = if self.is_stdout { + Some(Box::pin(self.info.agent.read_stdout(req))) + } else { + Some(Box::pin(self.info.agent.read_stderr(req))) + }; + } + + let mut read_future = read_future.unwrap(); + match read_future.as_mut().poll(cx) { + Poll::Ready(v) => match v { + Ok(resp) => { + buf.put_slice(&resp.data); + Poll::Ready(Ok(())) + } + Err(err) => Poll::Ready(Err(std::io::Error::new(std::io::ErrorKind::Other, err))), + }, + Poll::Pending => { + self.read_future = Some(read_future); + Poll::Pending + } + } + } +} + +impl<'inner> AsyncRead for ContainerIoRead<'inner> { + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + let me = unsafe { + std::mem::transmute::<&mut ContainerIoRead<'_>, &mut ContainerIoRead<'inner>>( + &mut *self, + ) + }; + me.poll_read_inner(cx, buf) + } +} diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/io/mod.rs b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/io/mod.rs new file mode 100644 index 0000000000..3c6ca719bc --- /dev/null +++ b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/io/mod.rs @@ -0,0 +1,10 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +mod container_io; +pub use 
container_io::ContainerIo; +mod shim_io; +pub use shim_io::ShimIo; diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/io/shim_io.rs b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/io/shim_io.rs new file mode 100644 index 0000000000..7559fe2ee4 --- /dev/null +++ b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/io/shim_io.rs @@ -0,0 +1,150 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{ + io, + os::unix::{io::FromRawFd, net::UnixStream as StdUnixStream}, + pin::Pin, + task::Context as TaskContext, + task::Poll, +}; + +use anyhow::{anyhow, Context, Result}; +use nix::{ + fcntl::{self, OFlag}, + sys::stat::Mode, +}; +use tokio::{ + fs::OpenOptions, + io::{AsyncRead, AsyncWrite}, + net::UnixStream as AsyncUnixStream, +}; +use url::Url; + +fn open_fifo(path: &str) -> Result { + let fd = fcntl::open(path, OFlag::O_RDWR, Mode::from_bits(0).unwrap())?; + + let std_stream = unsafe { StdUnixStream::from_raw_fd(fd) }; + std_stream + .set_nonblocking(true) + .context("set nonblocking")?; + + AsyncUnixStream::from_std(std_stream).map_err(|e| anyhow!(e)) +} + +pub struct ShimIo { + pub stdin: Option>, + pub stdout: Option>, + pub stderr: Option>, +} + +impl ShimIo { + pub async fn new( + stdin: &Option, + stdout: &Option, + stderr: &Option, + ) -> Result { + info!( + sl!(), + "new shim io stdin {:?} stdout {:?} stderr {:?}", stdin, stdout, stderr + ); + + let stdin_fd: Option> = if let Some(stdin) = stdin { + info!(sl!(), "open stdin {:?}", &stdin); + match OpenOptions::new() + .read(true) + .write(false) + .custom_flags(libc::O_NONBLOCK) + .open(&stdin) + .await + { + Ok(file) => Some(Box::new(file)), + Err(err) => { + error!(sl!(), "failed to open {} error {:?}", &stdin, err); + None + } + } + } else { + None + }; + + let get_url = |url: &Option| -> Option { + info!(sl!(), "get url for {:?}", url); + + match 
url { + None => None, + Some(out) => match Url::parse(out.as_str()) { + Err(url::ParseError::RelativeUrlWithoutBase) => { + let out = "fifo://".to_owned() + out.as_str(); + let u = Url::parse(out.as_str()).unwrap(); + Some(u) + } + Err(err) => { + warn!(sl!(), "unable to parse stdout uri: {}", err); + None + } + Ok(u) => Some(u), + }, + } + }; + + let stdout_url = get_url(stdout); + let get_fd = |url: &Option| -> Option> { + info!(sl!(), "get fd for {:?}", &url); + if let Some(url) = url { + if url.scheme() == "fifo" { + let path = url.path(); + match open_fifo(path) { + Ok(s) => { + return Some(Box::new(ShimIoWrite::Stream(s))); + } + Err(err) => { + error!(sl!(), "failed to open file {} error {:?}", url.path(), err); + } + } + } + } + None + }; + + let stderr_url = get_url(stderr); + Ok(Self { + stdin: stdin_fd, + stdout: get_fd(&stdout_url), + stderr: get_fd(&stderr_url), + }) + } +} + +#[derive(Debug)] +enum ShimIoWrite { + Stream(AsyncUnixStream), + // TODO: support other type +} + +impl AsyncWrite for ShimIoWrite { + fn poll_write( + mut self: Pin<&mut Self>, + cx: &mut TaskContext<'_>, + buf: &[u8], + ) -> Poll> { + match *self { + ShimIoWrite::Stream(ref mut s) => Pin::new(s).poll_write(cx, buf), + } + } + + fn poll_flush(mut self: Pin<&mut Self>, cx: &mut TaskContext<'_>) -> Poll> { + match *self { + ShimIoWrite::Stream(ref mut s) => Pin::new(s).poll_flush(cx), + } + } + + fn poll_shutdown(mut self: Pin<&mut Self>, cx: &mut TaskContext<'_>) -> Poll> { + match *self { + ShimIoWrite::Stream(ref mut s) => Pin::new(s).poll_shutdown(cx), + } + } +} diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/manager.rs b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/manager.rs new file mode 100644 index 0000000000..b4b20bbf36 --- /dev/null +++ b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/manager.rs @@ -0,0 +1,275 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group 
+// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::{anyhow, Context, Result}; + +use std::{collections::HashMap, sync::Arc}; + +use agent::Agent; +use async_trait::async_trait; +use common::{ + error::Error, + types::{ + ContainerConfig, ContainerID, ContainerProcess, ExecProcessRequest, KillRequest, + ProcessExitStatus, ProcessStateInfo, ProcessType, ResizePTYRequest, ShutdownRequest, + StatsInfo, UpdateRequest, PID, + }, + ContainerManager, +}; +use oci::Process as OCIProcess; +use resource::ResourceManager; +use tokio::sync::RwLock; + +use super::{logger_with_process, Container}; + +unsafe impl Send for VirtContainerManager {} +unsafe impl Sync for VirtContainerManager {} +pub struct VirtContainerManager { + sid: String, + pid: u32, + containers: Arc>>, + resource_manager: Arc, + agent: Arc, +} + +impl VirtContainerManager { + pub fn new( + sid: &str, + pid: u32, + agent: Arc, + resource_manager: Arc, + ) -> Self { + Self { + sid: sid.to_string(), + pid, + containers: Default::default(), + resource_manager, + agent, + } + } +} + +#[async_trait] +impl ContainerManager for VirtContainerManager { + async fn create_container(&self, config: ContainerConfig, spec: oci::Spec) -> Result { + let container = Container::new( + self.pid, + config, + self.agent.clone(), + self.resource_manager.clone(), + ) + .context("new container")?; + + let mut containers = self.containers.write().await; + container.create(spec).await.context("create")?; + containers.insert(container.container_id.to_string(), container); + + Ok(PID { pid: self.pid }) + } + + async fn close_process_io(&self, process: &ContainerProcess) -> Result<()> { + let containers = self.containers.read().await; + let container_id = &process.container_id.to_string(); + let c = containers + .get(container_id) + .ok_or_else(|| Error::ContainerNotFound(container_id.clone()))?; + + c.close_io(process).await.context("close io")?; + Ok(()) + } + + async fn delete_process(&self, process: &ContainerProcess) -> 
Result { + let container_id = &process.container_id.container_id; + match process.process_type { + ProcessType::Container => { + let mut containers = self.containers.write().await; + let c = containers + .remove(container_id) + .ok_or_else(|| Error::ContainerNotFound(container_id.to_string()))?; + c.state_process(process).await.context("state process") + } + ProcessType::Exec => { + let containers = self.containers.read().await; + let c = containers + .get(container_id) + .ok_or_else(|| Error::ContainerNotFound(container_id.to_string()))?; + let state = c.state_process(process).await.context("state process"); + c.delete_exec_process(process) + .await + .context("delete process")?; + return state; + } + } + } + + async fn exec_process(&self, req: ExecProcessRequest) -> Result<()> { + if req.spec_type_url.is_empty() { + return Err(anyhow!("invalid type url")); + } + let oci_process: OCIProcess = + serde_json::from_slice(&req.spec_value).context("serde from slice")?; + + let containers = self.containers.read().await; + let container_id = &req.process.container_id.container_id; + let c = containers + .get(container_id) + .ok_or_else(|| Error::ContainerNotFound(container_id.clone()))?; + c.exec_process( + &req.process, + req.stdin, + req.stdout, + req.stderr, + req.terminal, + oci_process, + ) + .await + .context("exec")?; + Ok(()) + } + + async fn kill_process(&self, req: &KillRequest) -> Result<()> { + let containers = self.containers.read().await; + let container_id = &req.process.container_id.container_id; + let c = containers + .get(container_id) + .ok_or_else(|| Error::ContainerNotFound(container_id.clone()))?; + c.kill_process(&req.process, req.signal, req.all) + .await + .map_err(|err| { + warn!( + sl!(), + "failed to signal process {:?} {:?}", &req.process, err + ); + err + }) + .ok(); + Ok(()) + } + + async fn wait_process(&self, process: &ContainerProcess) -> Result { + let logger = logger_with_process(process); + + let containers = 
self.containers.read().await; + let container_id = &process.container_id.container_id; + let c = containers + .get(container_id) + .ok_or_else(|| Error::ContainerNotFound(container_id.clone()))?; + let (watcher, status) = c.wait_process(process).await.context("wait")?; + drop(containers); + + match watcher { + Some(mut watcher) => { + info!(logger, "begin wait exit"); + while watcher.changed().await.is_ok() {} + info!(logger, "end wait exited"); + } + None => { + warn!(logger, "failed to find watcher for wait process"); + } + } + + let status = status.read().await; + + info!(logger, "wait process exit status {:?}", status); + + // stop process + let containers = self.containers.read().await; + let container_id = &process.container_id.container_id; + let c = containers + .get(container_id) + .ok_or_else(|| Error::ContainerNotFound(container_id.clone()))?; + c.stop_process(process).await.context("stop container")?; + Ok(status.clone()) + } + + async fn start_process(&self, process: &ContainerProcess) -> Result { + let containers = self.containers.read().await; + let container_id = &process.container_id.container_id; + let c = containers + .get(container_id) + .ok_or_else(|| Error::ContainerNotFound(container_id.clone()))?; + c.start(process).await.context("start")?; + Ok(PID { pid: self.pid }) + } + + async fn state_process(&self, process: &ContainerProcess) -> Result { + let containers = self.containers.read().await; + let container_id = &process.container_id.container_id; + let c = containers + .get(container_id) + .ok_or_else(|| Error::ContainerNotFound(container_id.clone()))?; + let state = c.state_process(process).await.context("state process")?; + Ok(state) + } + + async fn pause_container(&self, id: &ContainerID) -> Result<()> { + let containers = self.containers.read().await; + let c = containers + .get(&id.container_id) + .ok_or_else(|| Error::ContainerNotFound(id.container_id.clone()))?; + c.pause().await.context("pause")?; + Ok(()) + } + + async fn 
resume_container(&self, id: &ContainerID) -> Result<()> { + let containers = self.containers.read().await; + let c = containers + .get(&id.container_id) + .ok_or_else(|| Error::ContainerNotFound(id.container_id.clone()))?; + c.resume().await.context("resume")?; + Ok(()) + } + + async fn resize_process_pty(&self, req: &ResizePTYRequest) -> Result<()> { + let containers = self.containers.read().await; + let c = containers + .get(&req.process.container_id.container_id) + .ok_or_else(|| { + Error::ContainerNotFound(req.process.container_id.container_id.clone()) + })?; + c.resize_pty(&req.process, req.width, req.height) + .await + .context("resize pty")?; + Ok(()) + } + + async fn stats_container(&self, id: &ContainerID) -> Result { + let containers = self.containers.read().await; + let c = containers + .get(&id.container_id) + .ok_or_else(|| Error::ContainerNotFound(id.container_id.clone()))?; + let stats = c.stats().await.context("stats")?; + Ok(StatsInfo::from(stats)) + } + + async fn update_container(&self, req: UpdateRequest) -> Result<()> { + let resource = serde_json::from_slice::(&req.value) + .context("deserialize LinuxResource")?; + let containers = self.containers.read().await; + let container_id = &req.container_id; + let c = containers + .get(container_id) + .ok_or_else(|| Error::ContainerNotFound(container_id.to_string()))?; + c.update(&resource).await.context("stats") + } + + async fn pid(&self) -> Result { + Ok(PID { pid: self.pid }) + } + + async fn connect_container(&self, _id: &ContainerID) -> Result { + Ok(PID { pid: self.pid }) + } + + async fn need_shutdown_sandbox(&self, req: &ShutdownRequest) -> bool { + req.is_now || self.containers.read().await.is_empty() || self.sid == req.container_id + } + + async fn is_sandbox_container(&self, process: &ContainerProcess) -> bool { + process.process_type == ProcessType::Container + && process.container_id.container_id == self.sid + } +} diff --git 
a/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/mod.rs b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/mod.rs new file mode 100644 index 0000000000..3c615517fd --- /dev/null +++ b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/mod.rs @@ -0,0 +1,20 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +mod container; +use container::{Container, Exec}; +mod container_inner; +mod io; +use container_inner::ContainerInner; +mod manager; +pub use manager::VirtContainerManager; +mod process; + +use common::types::ContainerProcess; + +fn logger_with_process(container_process: &ContainerProcess) -> slog::Logger { + sl!().new(o!("container_id" => container_process.container_id.container_id.clone(), "exec_id" => container_process.exec_id.clone())) +} diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/process.rs b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/process.rs new file mode 100644 index 0000000000..334488453a --- /dev/null +++ b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/process.rs @@ -0,0 +1,239 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::sync::Arc; + +use agent::Agent; +use anyhow::{Context, Result}; +use awaitgroup::{WaitGroup, Worker as WaitGroupWorker}; +use common::types::{ContainerProcess, ProcessExitStatus, ProcessStateInfo, ProcessStatus, PID}; +use tokio::{ + io::{AsyncRead, AsyncWrite}, + sync::{watch, RwLock}, +}; + +use super::{ + io::{ContainerIo, ShimIo}, + logger_with_process, +}; + +pub type ProcessWatcher = ( + Option>, + Arc>, +); + +#[derive(Debug)] +pub struct Process { + pub process: ContainerProcess, + pub pid: u32, + logger: slog::Logger, + pub bundle: String, + + pub stdin: Option, + pub stdout: Option, + pub stderr: Option, + 
pub terminal: bool, + + pub height: u32, + pub width: u32, + pub status: Arc>, + + pub exit_status: Arc>, + pub exit_watcher_rx: Option>, + pub exit_watcher_tx: Option>, + // used to sync between stdin io copy thread(tokio) and the close it call. + // close io call should wait until the stdin io copy finished to + // prevent stdin data lost. + pub wg_stdin: WaitGroup, +} + +impl Process { + pub fn new( + process: &ContainerProcess, + pid: u32, + bundle: &str, + stdin: Option, + stdout: Option, + stderr: Option, + terminal: bool, + ) -> Process { + let (sender, receiver) = watch::channel(false); + + Process { + process: process.clone(), + pid, + logger: logger_with_process(process), + bundle: bundle.to_string(), + stdin, + stdout, + stderr, + terminal, + height: 0, + width: 0, + status: Arc::new(RwLock::new(ProcessStatus::Created)), + exit_status: Arc::new(RwLock::new(ProcessExitStatus::new())), + exit_watcher_rx: Some(receiver), + exit_watcher_tx: Some(sender), + wg_stdin: WaitGroup::new(), + } + } + + pub async fn start_io_and_wait( + &mut self, + agent: Arc, + container_io: ContainerIo, + ) -> Result<()> { + info!(self.logger, "start io and wait"); + + // new shim io + let shim_io = ShimIo::new(&self.stdin, &self.stdout, &self.stderr) + .await + .context("new shim io")?; + + // start io copy for stdin + let wgw_stdin = self.wg_stdin.worker(); + if let Some(stdin) = shim_io.stdin { + self.run_io_copy("stdin", wgw_stdin, stdin, container_io.stdin) + .await?; + } + + // prepare for wait group for stdout, stderr + let wg = WaitGroup::new(); + let wgw = wg.worker(); + + // start io copy for stdout + if let Some(stdout) = shim_io.stdout { + self.run_io_copy("stdout", wgw.clone(), container_io.stdout, stdout) + .await?; + } + + // start io copy for stderr + if !self.terminal { + if let Some(stderr) = shim_io.stderr { + self.run_io_copy("stderr", wgw, container_io.stderr, stderr) + .await?; + } + } + + self.run_io_wait(agent, wg).await.context("run io thread")?; + Ok(()) 
+ } + + async fn run_io_copy<'a>( + &'a self, + io_name: &'a str, + wgw: WaitGroupWorker, + mut reader: Box, + mut writer: Box, + ) -> Result<()> { + info!(self.logger, "run io copy for {}", io_name); + let io_name = io_name.to_string(); + let logger = self.logger.new(o!("io name" => io_name)); + let _ = tokio::spawn(async move { + loop { + match tokio::io::copy(&mut reader, &mut writer).await { + Err(e) => { + if let Some(error_code) = e.raw_os_error() { + if error_code == libc::EAGAIN { + continue; + } + } + warn!(logger, "io: failed to copy stream {}", e); + } + Ok(length) => warn!(logger, "io: stop to copy stream length {}", length), + }; + break; + } + + wgw.done(); + }); + + Ok(()) + } + + async fn run_io_wait(&mut self, agent: Arc, mut wg: WaitGroup) -> Result<()> { + let logger = self.logger.clone(); + info!(logger, "start run io wait"); + let process = self.process.clone(); + let exit_status = self.exit_status.clone(); + let exit_notifier = self.exit_watcher_tx.take(); + let status = self.status.clone(); + + let _ = tokio::spawn(async move { + //wait on all of the container's io stream terminated + info!(logger, "begin wait group io",); + wg.wait().await; + info!(logger, "end wait group for io"); + + let req = agent::WaitProcessRequest { + process_id: process.clone().into(), + }; + + info!(logger, "begin wait process"); + let resp = match agent.wait_process(req).await { + Ok(ret) => ret, + Err(e) => { + error!(logger, "failed to wait process {:?}", e); + return; + } + }; + + info!(logger, "end wait process exit code {}", resp.status); + + let mut exit_status = exit_status.write().await; + exit_status.update_exit_code(resp.status); + drop(exit_status); + + let mut status = status.write().await; + *status = ProcessStatus::Stopped; + drop(status); + + drop(exit_notifier); + info!(logger, "end io wait thread"); + }); + Ok(()) + } + + pub fn fetch_exit_watcher(&self) -> Result { + Ok((self.exit_watcher_rx.clone(), self.exit_status.clone())) + } + + pub async fn 
state(&self) -> Result { + let exit_status = self.exit_status.read().await; + Ok(ProcessStateInfo { + container_id: self.process.container_id.container_id.clone(), + exec_id: self.process.exec_id.clone(), + pid: PID { pid: self.pid }, + bundle: self.bundle.clone(), + stdin: self.stdin.clone(), + stdout: self.stdout.clone(), + stderr: self.stderr.clone(), + terminal: self.terminal, + status: self.get_status().await, + exit_status: exit_status.exit_code, + exited_at: exit_status.exit_time, + }) + } + + pub async fn stop(&mut self) { + let mut status = self.status.write().await; + *status = ProcessStatus::Stopped; + } + + pub async fn close_io(&mut self) { + self.wg_stdin.wait().await; + } + + pub async fn get_status(&self) -> ProcessStatus { + let status = self.status.read().await; + *status + } + + pub async fn set_status(&self, new_status: ProcessStatus) { + let mut status = self.status.write().await; + *status = new_status; + } +} diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/health_check.rs b/src/runtime-rs/crates/runtimes/virt_container/src/health_check.rs new file mode 100644 index 0000000000..3a7703ac3b --- /dev/null +++ b/src/runtime-rs/crates/runtimes/virt_container/src/health_check.rs @@ -0,0 +1,123 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::sync::Arc; + +use agent::Agent; +use anyhow::Context; +use tokio::sync::{mpsc, Mutex}; + +/// monitor check interval 30s +const HEALTH_CHECK_TIMER_INTERVAL: u64 = 30; + +/// version check threshold 5min +const VERSION_CHECK_THRESHOLD: u64 = 5 * 60 / HEALTH_CHECK_TIMER_INTERVAL; + +/// health check stop channel buffer size +const HEALTH_CHECK_STOP_CHANNEL_BUFFER_SIZE: usize = 1; + +pub struct HealthCheck { + pub keep_alive: bool, + keep_vm: bool, + stop_tx: mpsc::Sender<()>, + stop_rx: Arc>>, +} + +impl HealthCheck { + pub fn new(keep_alive: bool, keep_vm: bool) -> HealthCheck { + let (tx, rx) = 
mpsc::channel(HEALTH_CHECK_STOP_CHANNEL_BUFFER_SIZE); + HealthCheck { + keep_alive, + keep_vm, + stop_tx: tx, + stop_rx: Arc::new(Mutex::new(rx)), + } + } + + pub fn start(&self, id: &str, agent: Arc) { + if !self.keep_alive { + return; + } + let id = id.to_string(); + + info!(sl!(), "start runtime keep alive"); + + let stop_rx = self.stop_rx.clone(); + let keep_vm = self.keep_vm; + let _ = tokio::spawn(async move { + let mut version_check_threshold_count = 0; + + loop { + tokio::time::sleep(std::time::Duration::from_secs(HEALTH_CHECK_TIMER_INTERVAL)) + .await; + let mut stop_rx = stop_rx.lock().await; + match stop_rx.try_recv() { + Ok(_) => { + info!(sl!(), "revive stop {} monitor signal", id); + break; + } + + Err(mpsc::error::TryRecvError::Empty) => { + // check agent + match agent + .check(agent::CheckRequest::new("")) + .await + .context("check health") + { + Ok(_) => { + debug!(sl!(), "check {} agent health successfully", id); + version_check_threshold_count += 1; + if version_check_threshold_count >= VERSION_CHECK_THRESHOLD { + // need to check version + version_check_threshold_count = 0; + if let Ok(v) = agent + .version(agent::CheckRequest::new("")) + .await + .context("check version") + { + info!(sl!(), "agent {}", v.agent_version) + } + } + continue; + } + Err(e) => { + error!(sl!(), "failed to do {} agent health check: {}", id, e); + if let Err(mpsc::error::TryRecvError::Empty) = stop_rx.try_recv() { + error!(sl!(), "failed to receive stop monitor signal"); + if !keep_vm { + ::std::process::exit(1); + } + } else { + info!(sl!(), "wait to exit exit {}", id); + break; + } + } + } + } + + Err(mpsc::error::TryRecvError::Disconnected) => { + warn!(sl!(), "{} monitor channel has broken", id); + break; + } + } + } + }); + } + + pub async fn stop(&self) { + if !self.keep_alive { + return; + } + info!(sl!(), "stop runtime keep alive"); + self.stop_tx + .send(()) + .await + .map_err(|e| { + warn!(sl!(), "failed send monitor channel. 
{:?}", e); + }) + .ok(); + } +} diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/lib.rs b/src/runtime-rs/crates/runtimes/virt_container/src/lib.rs new file mode 100644 index 0000000000..8869c03c07 --- /dev/null +++ b/src/runtime-rs/crates/runtimes/virt_container/src/lib.rs @@ -0,0 +1,124 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +#[macro_use] +extern crate slog; + +logging::logger_with_subsystem!(sl, "virt-container"); + +mod container_manager; +pub mod health_check; +pub mod sandbox; + +use std::sync::Arc; + +use agent::kata::KataAgent; +use anyhow::{anyhow, Context, Result}; +use async_trait::async_trait; +use common::{message::Message, RuntimeHandler, RuntimeInstance}; +use hypervisor::{dragonball::Dragonball, Hypervisor}; +use kata_types::config::{hypervisor::register_hypervisor_plugin, DragonballConfig, TomlConfig}; +use resource::ResourceManager; +use tokio::sync::mpsc::Sender; + +const HYPERVISOR_DRAGONBALL: &str = "dragonball"; + +unsafe impl Send for VirtContainer {} +unsafe impl Sync for VirtContainer {} +pub struct VirtContainer {} + +#[async_trait] +impl RuntimeHandler for VirtContainer { + fn init() -> Result<()> { + // register + let dragonball_config = Arc::new(DragonballConfig::new()); + register_hypervisor_plugin("dragonball", dragonball_config); + Ok(()) + } + + fn name() -> String { + "virt_container".to_string() + } + + fn new_handler() -> Arc { + Arc::new(VirtContainer {}) + } + + async fn new_instance( + &self, + sid: &str, + msg_sender: Sender, + config: Arc, + ) -> Result { + let hypervisor = new_hypervisor(&config).await.context("new hypervisor")?; + + // get uds from hypervisor and get config from toml_config + let agent = Arc::new(KataAgent::new(kata_types::config::Agent { + debug: true, + enable_tracing: false, + server_port: 1024, + log_port: 1025, + dial_timeout_ms: 10, + reconnect_timeout_ms: 3_000, + request_timeout_ms: 30_000, + 
health_check_request_timeout_ms: 90_000, + kernel_modules: Default::default(), + container_pipe_size: 0, + debug_console_enabled: false, + })); + + let resource_manager = Arc::new(ResourceManager::new( + sid, + agent.clone(), + hypervisor.clone(), + config, + )?); + let pid = std::process::id(); + + let sandbox = sandbox::VirtSandbox::new( + sid, + msg_sender, + agent.clone(), + hypervisor, + resource_manager.clone(), + ) + .await + .context("new virt sandbox")?; + let container_manager = + container_manager::VirtContainerManager::new(sid, pid, agent, resource_manager); + Ok(RuntimeInstance { + sandbox: Arc::new(sandbox), + container_manager: Arc::new(container_manager), + }) + } + + fn cleanup(&self, _id: &str) -> Result<()> { + // TODO + Ok(()) + } +} + +async fn new_hypervisor(toml_config: &TomlConfig) -> Result> { + let hypervisor_name = &toml_config.runtime.hypervisor_name; + let hypervisor_config = toml_config + .hypervisor + .get(hypervisor_name) + .ok_or_else(|| anyhow!("failed to get hypervisor for {}", &hypervisor_name)) + .context("get hypervisor")?; + + // TODO: support other hypervisor + // issue: https://github.com/kata-containers/kata-containers/issues/4634 + match hypervisor_name.as_str() { + HYPERVISOR_DRAGONBALL => { + let mut hypervisor = Dragonball::new(); + hypervisor + .set_hypervisor_config(hypervisor_config.clone()) + .await; + Ok(Arc::new(hypervisor)) + } + _ => Err(anyhow!("Unsupported hypervisor {}", &hypervisor_name)), + } +} diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs new file mode 100644 index 0000000000..5f9625ad75 --- /dev/null +++ b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs @@ -0,0 +1,247 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::sync::Arc; + +use agent::{self, Agent}; +use anyhow::{Context, Result}; +use 
async_trait::async_trait; +use common::{ + message::{Action, Message}, + Sandbox, +}; +use containerd_shim_protos::events::task::TaskOOM; +use hypervisor::Hypervisor; +use resource::{ + network::{NetworkConfig, NetworkWithNetNsConfig}, + ResourceConfig, ResourceManager, +}; +use tokio::sync::{mpsc::Sender, Mutex, RwLock}; + +use crate::health_check::HealthCheck; + +#[derive(Clone, Copy, PartialEq, Debug)] +pub enum SandboxState { + Init, + Running, + Stopped, +} + +struct SandboxInner { + state: SandboxState, +} + +impl SandboxInner { + pub fn new() -> Self { + Self { + state: SandboxState::Init, + } + } +} + +unsafe impl Send for VirtSandbox {} +unsafe impl Sync for VirtSandbox {} +#[derive(Clone)] +pub struct VirtSandbox { + sid: String, + msg_sender: Arc>>, + inner: Arc>, + resource_manager: Arc, + agent: Arc, + hypervisor: Arc, + monitor: Arc, +} + +impl VirtSandbox { + pub async fn new( + sid: &str, + msg_sender: Sender, + agent: Arc, + hypervisor: Arc, + resource_manager: Arc, + ) -> Result { + Ok(Self { + sid: sid.to_string(), + msg_sender: Arc::new(Mutex::new(msg_sender)), + inner: Arc::new(RwLock::new(SandboxInner::new())), + agent, + hypervisor, + resource_manager, + monitor: Arc::new(HealthCheck::new(true, false)), + }) + } + + async fn prepare_for_start_sandbox( + &self, + _id: &str, + netns: Option, + ) -> Result> { + let mut resource_configs = vec![]; + + let config = self.resource_manager.config().await; + if let Some(netns_path) = netns { + let network_config = ResourceConfig::Network(NetworkConfig::NetworkResourceWithNetNs( + NetworkWithNetNsConfig { + network_model: config.runtime.internetworking_model.clone(), + netns_path, + queues: self + .hypervisor + .hypervisor_config() + .await + .network_info + .network_queues as usize, + }, + )); + resource_configs.push(network_config); + } + + let hypervisor_config = self.hypervisor.hypervisor_config().await; + let virtio_fs_config = ResourceConfig::ShareFs(hypervisor_config.shared_fs); + 
resource_configs.push(virtio_fs_config); + + Ok(resource_configs) + } +} + +#[async_trait] +impl Sandbox for VirtSandbox { + async fn start(&self, netns: Option) -> Result<()> { + let id = &self.sid; + + // if sandbox running, return + // if sandbox not running try to start sandbox + let mut inner = self.inner.write().await; + if inner.state == SandboxState::Running { + warn!(sl!(), "sandbox is running, no need to start"); + return Ok(()); + } + + self.hypervisor + .prepare_vm(id, netns.clone()) + .await + .context("prepare vm")?; + + // generate device and setup before start vm + // should after hypervisor.prepare_vm + let resources = self.prepare_for_start_sandbox(id, netns).await?; + self.resource_manager + .prepare_before_start_vm(resources) + .await + .context("set up device before start vm")?; + + // start vm + self.hypervisor.start_vm(10_000).await.context("start vm")?; + info!(sl!(), "start vm"); + + // connect agent + // set agent socket + let address = self + .hypervisor + .get_agent_socket() + .await + .context("get agent socket")?; + self.agent.start(&address).await.context("connect")?; + + self.resource_manager + .setup_after_start_vm() + .await + .context("setup device after start vm")?; + + // create sandbox in vm + let req = agent::CreateSandboxRequest { + hostname: "".to_string(), + dns: vec![], + storages: self + .resource_manager + .get_storage_for_sandbox() + .await + .context("get storages for sandbox")?, + sandbox_pidns: false, + sandbox_id: id.to_string(), + guest_hook_path: "".to_string(), + kernel_modules: vec![], + }; + + self.agent + .create_sandbox(req) + .await + .context("create sandbox")?; + + inner.state = SandboxState::Running; + let agent = self.agent.clone(); + let sender = self.msg_sender.clone(); + info!(sl!(), "oom watcher start"); + let _ = tokio::spawn(async move { + loop { + match agent + .get_oom_event(agent::Empty::new()) + .await + .context("get oom event") + { + Ok(resp) => { + let cid = &resp.container_id; + 
warn!(sl!(), "send oom event for container {}", &cid); + let event = TaskOOM { + container_id: cid.to_string(), + ..Default::default() + }; + let msg = Message::new(Action::Event(Arc::new(event))); + let lock_sender = sender.lock().await; + if let Err(err) = lock_sender.send(msg).await.context("send event") { + error!( + sl!(), + "failed to send oom event for {} error {:?}", cid, err + ); + } + } + Err(err) => { + warn!(sl!(), "failed to get oom event error {:?}", err); + break; + } + } + } + }); + self.monitor.start(id, self.agent.clone()); + Ok(()) + } + + async fn stop(&self) -> Result<()> { + info!(sl!(), "begin stop sandbox"); + self.hypervisor.stop_vm().await.context("stop vm")?; + Ok(()) + } + + async fn shutdown(&self) -> Result<()> { + info!(sl!(), "shutdown"); + + self.stop().await.context("stop")?; + + info!(sl!(), "delete cgroup"); + self.resource_manager + .delete_cgroups() + .await + .context("delete cgroups")?; + + info!(sl!(), "stop monitor"); + self.monitor.stop().await; + + info!(sl!(), "stop agent"); + self.agent.stop().await; + + // stop server + info!(sl!(), "send shutdown message"); + let msg = Message::new(Action::Shutdown); + let sender = self.msg_sender.clone(); + let sender = sender.lock().await; + sender.send(msg).await.context("send shutdown msg")?; + Ok(()) + } + + async fn cleanup(&self, _id: &str) -> Result<()> { + // TODO: cleanup + Ok(()) + } +} diff --git a/src/runtime-rs/crates/runtimes/wasm_container/Cargo.toml b/src/runtime-rs/crates/runtimes/wasm_container/Cargo.toml new file mode 100644 index 0000000000..b8174ee822 --- /dev/null +++ b/src/runtime-rs/crates/runtimes/wasm_container/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "wasm_container" +version = "0.1.0" +authors = ["The Kata Containers community "] +edition = "2018" + +[dependencies] +anyhow = "^1.0" +async-trait = "0.1.48" +tokio = { version = "1.8.0" } + +common = { path = "../common" } +kata-types = { path = "../../../../libs/kata-types" } \ No newline at end of 
file diff --git a/src/runtime-rs/crates/runtimes/wasm_container/src/lib.rs b/src/runtime-rs/crates/runtimes/wasm_container/src/lib.rs new file mode 100644 index 0000000000..c687274670 --- /dev/null +++ b/src/runtime-rs/crates/runtimes/wasm_container/src/lib.rs @@ -0,0 +1,43 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// +use std::sync::Arc; + +use anyhow::Result; +use async_trait::async_trait; +use common::{message::Message, RuntimeHandler, RuntimeInstance}; +use kata_types::config::TomlConfig; +use tokio::sync::mpsc::Sender; +unsafe impl Send for WasmContainer {} +unsafe impl Sync for WasmContainer {} +pub struct WasmContainer {} + +#[async_trait] +impl RuntimeHandler for WasmContainer { + fn init() -> Result<()> { + Ok(()) + } + + fn name() -> String { + "wasm_container".to_string() + } + + fn new_handler() -> Arc { + Arc::new(WasmContainer {}) + } + + async fn new_instance( + &self, + _sid: &str, + _msg_sender: Sender, + _config: Arc, + ) -> Result { + todo!() + } + + fn cleanup(&self, _id: &str) -> Result<()> { + todo!() + } +} diff --git a/src/runtime-rs/crates/service/Cargo.toml b/src/runtime-rs/crates/service/Cargo.toml new file mode 100644 index 0000000000..6d7f64ff5d --- /dev/null +++ b/src/runtime-rs/crates/service/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "service" +version = "0.1.0" +authors = ["The Kata Containers community "] +edition = "2018" + +[dependencies] +anyhow = "^1.0" +async-trait = "0.1.48" +slog = "2.5.2" +slog-scope = "4.4.0" +tokio = { version = "1.8.0", features = ["rt-multi-thread"] } +ttrpc = { version = "0.6.1" } + +common = { path = "../runtimes/common" } +containerd-shim-protos = { version = "0.2.0", features = ["async"]} +logging = { path = "../../../libs/logging"} +runtimes = { path = "../runtimes" } diff --git a/src/runtime-rs/crates/service/src/lib.rs b/src/runtime-rs/crates/service/src/lib.rs new file mode 100644 index 
0000000000..1f28a8009c --- /dev/null +++ b/src/runtime-rs/crates/service/src/lib.rs @@ -0,0 +1,14 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +#[macro_use] +extern crate slog; + +logging::logger_with_subsystem!(sl, "service"); + +mod manager; +pub use manager::ServiceManager; +mod task_service; diff --git a/src/runtime-rs/crates/service/src/manager.rs b/src/runtime-rs/crates/service/src/manager.rs new file mode 100644 index 0000000000..d22cdf86f1 --- /dev/null +++ b/src/runtime-rs/crates/service/src/manager.rs @@ -0,0 +1,197 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{ + fs, + os::unix::io::{FromRawFd, RawFd}, + process::Stdio, + sync::Arc, +}; + +use anyhow::{Context, Result}; +use common::message::{Action, Event, Message}; +use containerd_shim_protos::{ + protobuf::{well_known_types::Any, Message as ProtobufMessage}, + shim_async, +}; +use runtimes::RuntimeHandlerManager; +use tokio::{ + io::AsyncWriteExt, + process::Command, + sync::mpsc::{channel, Receiver}, +}; +use ttrpc::asynchronous::Server; + +use crate::task_service::TaskService; + +/// message buffer size +const MESSAGE_BUFFER_SIZE: usize = 8; + +pub const KATA_PATH: &str = "/run/kata"; + +pub struct ServiceManager { + receiver: Option>, + handler: Arc, + task_server: Option, + binary: String, + address: String, + namespace: String, +} + +async fn send_event( + containerd_binary: String, + address: String, + namespace: String, + event: Arc, +) -> Result<()> { + let any = Any { + type_url: event.type_url(), + value: event.value().context("get event value")?, + ..Default::default() + }; + let data = any.write_to_bytes().context("write to any")?; + let mut child = Command::new(containerd_binary) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .args(&[ + "--address", + &address, + "publish", 
+ "--topic", + &event.r#type(), + "--namespace", + &namespace, + ]) + .spawn() + .context("sawn cmd")?; + + let stdin = child.stdin.as_mut().context("failed to open stdin")?; + stdin + .write_all(&data) + .await + .context("failed to write to stdin")?; + let output = child + .wait_with_output() + .await + .context("failed to read stdout")?; + info!(sl!(), "get output: {:?}", output); + Ok(()) +} + +impl ServiceManager { + pub async fn new( + id: &str, + containerd_binary: &str, + address: &str, + namespace: &str, + task_server_fd: RawFd, + ) -> Result { + let (sender, receiver) = channel::(MESSAGE_BUFFER_SIZE); + let handler = Arc::new( + RuntimeHandlerManager::new(id, sender) + .await + .context("new runtime handler")?, + ); + let mut task_server = unsafe { Server::from_raw_fd(task_server_fd) }; + task_server = task_server.set_domain_unix(); + Ok(Self { + receiver: Some(receiver), + handler, + task_server: Some(task_server), + binary: containerd_binary.to_string(), + address: address.to_string(), + namespace: namespace.to_string(), + }) + } + + pub async fn run(&mut self) -> Result<()> { + info!(sl!(), "begin to run service"); + self.start().await.context("start")?; + + info!(sl!(), "wait server message"); + let mut rx = self.receiver.take(); + if let Some(rx) = rx.as_mut() { + while let Some(r) = rx.recv().await { + info!(sl!(), "receive action {:?}", &r.action); + let result = match r.action { + Action::Start => self.start().await.context("start listen"), + Action::Stop => self.stop_listen().await.context("stop listen"), + Action::Shutdown => { + self.stop_listen().await.context("stop listen")?; + break; + } + Action::Event(event) => { + info!(sl!(), "get event {:?}", &event); + send_event( + self.binary.clone(), + self.address.clone(), + self.namespace.clone(), + event, + ) + .await + .context("send event")?; + Ok(()) + } + }; + + if let Some(ref sender) = r.resp_sender { + if let Err(err) = result.as_ref() { + error!(sl!(), "failed to process action {:?}", 
err); + } + sender.send(result).await.context("send response")?; + } + } + } + + info!(sl!(), "end to run service"); + + Ok(()) + } + + pub fn cleanup(sid: &str) -> Result<()> { + let temp_dir = [KATA_PATH, sid].join("/"); + if std::fs::metadata(temp_dir.as_str()).is_ok() { + // try to remove dir and skip the result + fs::remove_dir_all(temp_dir) + .map_err(|err| { + warn!(sl!(), "failed to clean up sandbox tmp dir"); + err + }) + .ok(); + } + Ok(()) + } + + async fn start(&mut self) -> Result<()> { + let task_service = Arc::new(Box::new(TaskService::new(self.handler.clone())) + as Box); + let task_server = self.task_server.take(); + let task_server = match task_server { + Some(t) => { + let mut t = t.register_service(shim_async::create_task(task_service)); + t.start().await.context("task server start")?; + Some(t) + } + None => None, + }; + self.task_server = task_server; + Ok(()) + } + + async fn stop_listen(&mut self) -> Result<()> { + let task_server = self.task_server.take(); + let task_server = match task_server { + Some(mut t) => { + t.stop_listen().await; + Some(t) + } + None => None, + }; + self.task_server = task_server; + Ok(()) + } +} diff --git a/src/runtime-rs/crates/service/src/task_service.rs b/src/runtime-rs/crates/service/src/task_service.rs new file mode 100644 index 0000000000..447207a851 --- /dev/null +++ b/src/runtime-rs/crates/service/src/task_service.rs @@ -0,0 +1,81 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{ + convert::{TryFrom, TryInto}, + sync::Arc, +}; + +use async_trait::async_trait; +use common::types::{Request, Response}; +use containerd_shim_protos::{api, shim_async}; +use ttrpc::{self, r#async::TtrpcContext}; + +use runtimes::RuntimeHandlerManager; + +pub(crate) struct TaskService { + handler: Arc, +} + +impl TaskService { + pub(crate) fn new(handler: Arc) -> Self { + Self { handler } + } +} + +async fn handler_message( + s: 
&RuntimeHandlerManager, + ctx: &TtrpcContext, + req: TtrpcReq, +) -> ttrpc::Result +where + Request: TryFrom, + >::Error: std::fmt::Debug, + TtrpcResp: TryFrom, + >::Error: std::fmt::Debug, +{ + let r = req + .try_into() + .map_err(|err| ttrpc::Error::Others(format!("failed to translate from shim {:?}", err)))?; + let logger = sl!().new(o!("steam id" => ctx.mh.stream_id)); + debug!(logger, "====> task service {:?}", &r); + let resp = s + .handler_message(r) + .await + .map_err(|err| ttrpc::Error::Others(format!("failed to handler message {:?}", err)))?; + debug!(logger, "<==== task service {:?}", &resp); + resp.try_into() + .map_err(|err| ttrpc::Error::Others(format!("failed to translate to shim {:?}", err))) +} + +macro_rules! impl_service { + ($($name: tt | $req: ty | $resp: ty),*) => { + #[async_trait] + impl shim_async::Task for TaskService { + $(async fn $name(&self, ctx: &TtrpcContext, req: $req) -> ttrpc::Result<$resp> { + handler_message(&self.handler, ctx, req).await + })* + } + }; +} + +impl_service!( + state | api::StateRequest | api::StateResponse, + create | api::CreateTaskRequest | api::CreateTaskResponse, + start | api::StartRequest | api::StartResponse, + delete | api::DeleteRequest | api::DeleteResponse, + pids | api::PidsRequest | api::PidsResponse, + pause | api::PauseRequest | api::Empty, + resume | api::ResumeRequest | api::Empty, + kill | api::KillRequest | api::Empty, + exec | api::ExecProcessRequest | api::Empty, + resize_pty | api::ResizePtyRequest | api::Empty, + update | api::UpdateTaskRequest | api::Empty, + wait | api::WaitRequest | api::WaitResponse, + stats | api::StatsRequest | api::StatsResponse, + connect | api::ConnectRequest | api::ConnectResponse, + shutdown | api::ShutdownRequest | api::Empty +); diff --git a/src/runtime-rs/crates/shim/Cargo.toml b/src/runtime-rs/crates/shim/Cargo.toml new file mode 100644 index 0000000000..ecc0e4751d --- /dev/null +++ b/src/runtime-rs/crates/shim/Cargo.toml @@ -0,0 +1,46 @@ +[package] +name = 
"shim" +version = "0.1.0" +authors = ["The Kata Containers community "] +description = "Containerd shim runtime for Kata Containers" +keywords = ["kata-containers", "shim"] +repository = "https://github.com/kata-containers/kata-containers.git" +license = "Apache-2.0" +edition = "2018" + +[[bin]] +name = "containerd-shim-kata-v2" +path = "src/bin/main.rs" + +[dependencies] +anyhow = "^1.0" +backtrace = {version = ">=0.3.35", features = ["libunwind", "libbacktrace", "std"], default-features = false} +containerd-shim-protos = { version = "0.2.0", features = ["async"]} +go-flag = "0.1.0" +libc = "0.2.108" +log = "0.4.14" +nix = "0.24.1" +protobuf = "2.27.0" +sha2 = "=0.9.3" +slog = {version = "2.5.2", features = ["std", "release_max_level_trace", "max_level_trace"]} +slog-async = "2.5.2" +slog-scope = "4.4.0" +slog-stdlog = "4.1.0" +thiserror = "1.0.30" +tokio = { version = "1.8.0", features = [ "rt", "rt-multi-thread" ] } +unix_socket2 = "0.5.4" + +kata-types = { path = "../../../libs/kata-types"} +kata-sys-util = { path = "../../../libs/kata-sys-util"} +logging = { path = "../../../libs/logging"} +oci = { path = "../../../libs/oci" } +service = { path = "../service" } + +[build-dependencies] +vergen = { version = "6", default-features = false, features = ["build", "git", "rustc"] } + +[dev-dependencies] +tempfile = "3.2.0" +rand = "0.8.4" +serial_test = "0.5.1" +tests_utils = { path = "../../tests/utils"} diff --git a/src/runtime-rs/crates/shim/build.rs b/src/runtime-rs/crates/shim/build.rs new file mode 100644 index 0000000000..6fd7ff9a99 --- /dev/null +++ b/src/runtime-rs/crates/shim/build.rs @@ -0,0 +1,12 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use vergen::{vergen, Config}; + +fn main() { + // Generate the default 'cargo:' instruction output + vergen(Config::default()).unwrap(); +} diff --git a/src/runtime-rs/crates/shim/src/args.rs 
b/src/runtime-rs/crates/shim/src/args.rs new file mode 100644 index 0000000000..0db33cbf81 --- /dev/null +++ b/src/runtime-rs/crates/shim/src/args.rs @@ -0,0 +1,325 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{os::unix::fs::FileTypeExt, path::PathBuf}; + +use anyhow::{anyhow, Context, Result}; +use kata_sys_util::validate; + +use crate::Error; + +/// Received command-line arguments or environment arguments +/// from a shimv2 container manager such as containerd. +/// +/// For detailed information, please refer to the +/// [shim spec](https://github.com/containerd/containerd/blob/main/runtime/v2/README.md). +#[derive(Debug, Default, Clone)] +pub struct Args { + /// the id of the container + pub id: String, + /// the namespace for the container + pub namespace: String, + /// the address of the containerd's main socket + pub address: String, + /// the binary path to publish events back to containerd + pub publish_binary: String, + /// Abstract socket path to serve. + pub socket: String, + /// the path to the bundle to delete + pub bundle: String, + /// Whether or not to enable debug + pub debug: bool, +} + +impl Args { + /// Check the shim argument object is vaild or not. + /// + /// The id, namespace, address and publish_binary are mandatory for START, RUN and DELETE. + /// And bundle is mandatory for DELETE. + pub fn validate(&mut self, should_check_bundle: bool) -> Result<()> { + if self.id.is_empty() + || self.namespace.is_empty() + || self.address.is_empty() + || self.publish_binary.is_empty() + { + return Err(anyhow!(Error::ArgumentIsEmpty(format!( + "id: {} namespace: {} address: {} publish_binary: {}", + &self.id, &self.namespace, &self.address, &self.publish_binary + )))); + } + + validate::verify_id(&self.id).context("verify container id")?; + validate::verify_id(&self.namespace).context("verify namespace")?; + + // Ensure `address` is a valid path. 
+ let path = PathBuf::from(self.address.clone()) + .canonicalize() + .context(Error::InvalidPath(self.address.clone()))?; + let md = path + .metadata() + .context(Error::FileGetMetadata(format!("{:?}", path)))?; + if !md.file_type().is_socket() { + return Err(Error::InvalidArgument).context("address is not socket"); + } + self.address = path + .to_str() + .map(|v| v.to_owned()) + .ok_or(Error::InvalidArgument)?; + + // Ensure `bundle` is a valid path. + if should_check_bundle { + if self.bundle.is_empty() { + return Err(anyhow!(Error::ArgumentIsEmpty("bundle".to_string()))); + } + + let path = PathBuf::from(self.bundle.clone()) + .canonicalize() + .map_err(|_| Error::InvalidArgument)?; + let md = path + .metadata() + .map_err(|_| Error::InvalidArgument) + .context("get address metadata")?; + if !md.is_dir() { + return Err(Error::InvalidArgument).context("medata is dir"); + } + self.bundle = path + .to_str() + .map(|v| v.to_owned()) + .ok_or(Error::InvalidArgument) + .context("path to string")?; + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::os::unix::net::UnixListener; + + use anyhow::anyhow; + use kata_sys_util::validate; + + #[test] + fn test_args_is_valid() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().to_path_buf(); + let path = path.to_str().unwrap(); + let bind_address = &format!("{}/socket1", path); + UnixListener::bind(bind_address).unwrap(); + + #[derive(Debug)] + struct TestData { + arg: Args, + should_check_bundle: bool, + result: Result<()>, + } + + let default_id = "1dfc0567".to_string(); + let default_namespace = "ns1".to_string(); + let default_address = bind_address.to_string(); + let default_publish_binary = "containerd".to_string(); + let default_socket = "socket".to_string(); + let default_bundle = path.to_string(); + let default_debug = false; + + let mut arg = Args { + id: default_id.clone(), + namespace: default_namespace.clone(), + address: default_address.clone(), + publish_binary: 
default_publish_binary.clone(), + socket: default_socket, + bundle: default_bundle.clone(), + debug: default_debug, + }; + + let tests = &[ + TestData { + arg: arg.clone(), + should_check_bundle: false, + result: Ok(()), + }, + TestData { + arg: { + arg.namespace = "".to_string(); + arg.clone() + }, + should_check_bundle: false, + result: Err(anyhow!(Error::ArgumentIsEmpty(format!( + "id: {} namespace: {} address: {} publish_binary: {}", + &arg.id, &arg.namespace, &arg.address, &arg.publish_binary + )))), + }, + TestData { + arg: { + arg.namespace = default_namespace.clone(); + arg.clone() + }, + should_check_bundle: false, + result: Ok(()), + }, + TestData { + arg: { + arg.id = "".to_string(); + arg.clone() + }, + should_check_bundle: false, + result: Err(anyhow!(Error::ArgumentIsEmpty(format!( + "id: {} namespace: {} address: {} publish_binary: {}", + &arg.id, &arg.namespace, &arg.address, &arg.publish_binary + )))), + }, + TestData { + arg: { + arg.id = default_id; + arg.clone() + }, + should_check_bundle: false, + result: Ok(()), + }, + TestData { + arg: { + arg.address = "".to_string(); + arg.clone() + }, + should_check_bundle: false, + result: Err(anyhow!(Error::ArgumentIsEmpty(format!( + "id: {} namespace: {} address: {} publish_binary: {}", + &arg.id, &arg.namespace, &arg.address, &arg.publish_binary + )))), + }, + TestData { + arg: { + arg.address = default_address.clone(); + arg.clone() + }, + should_check_bundle: false, + result: Ok(()), + }, + TestData { + arg: { + arg.publish_binary = "".to_string(); + arg.clone() + }, + should_check_bundle: false, + result: Err(anyhow!(Error::ArgumentIsEmpty(format!( + "id: {} namespace: {} address: {} publish_binary: {}", + &arg.id, &arg.namespace, &arg.address, &arg.publish_binary + )))), + }, + TestData { + arg: { + arg.publish_binary = default_publish_binary; + arg.clone() + }, + should_check_bundle: false, + result: Ok(()), + }, + TestData { + arg: { + arg.bundle = "".to_string(); + arg.clone() + }, + 
should_check_bundle: false, + result: Ok(()), + }, + TestData { + arg: arg.clone(), + should_check_bundle: true, + result: Err(anyhow!(Error::ArgumentIsEmpty("bundle".to_string()))), + }, + TestData { + arg: { + arg.bundle = default_bundle; + arg.clone() + }, + should_check_bundle: true, + result: Ok(()), + }, + TestData { + arg: { + arg.namespace = "id1/id2".to_string(); + arg.clone() + }, + should_check_bundle: true, + result: Err( + anyhow!(validate::Error::InvalidContainerID("id/id2".to_string())) + .context("verify namespace"), + ), + }, + TestData { + arg: { + arg.namespace = default_namespace.clone() + "id1 id2"; + arg.clone() + }, + should_check_bundle: true, + result: Err(anyhow!(validate::Error::InvalidContainerID( + default_namespace.clone() + "id1 id2", + )) + .context("verify namespace")), + }, + TestData { + arg: { + arg.namespace = default_namespace.clone() + "id2\tid2"; + arg.clone() + }, + should_check_bundle: true, + result: Err(anyhow!(validate::Error::InvalidContainerID( + default_namespace.clone() + "id1\tid2", + )) + .context("verify namespace")), + }, + TestData { + arg: { + arg.namespace = default_namespace; + arg.clone() + }, + should_check_bundle: true, + result: Ok(()), + }, + TestData { + arg: { + arg.address = default_address.clone() + "/.."; + arg.clone() + }, + should_check_bundle: true, + result: Err(anyhow!(Error::InvalidPath(arg.address.clone()))), + }, + TestData { + arg: { + arg.address = default_address.clone() + "/.."; + arg.clone() + }, + should_check_bundle: true, + result: Err(anyhow!(Error::InvalidPath(arg.address.clone()))), + }, + TestData { + arg: { + arg.address = default_address; + arg + }, + should_check_bundle: true, + result: Ok(()), + }, + ]; + + for (i, t) in tests.iter().enumerate() { + let msg = format!("test[{}]: {:?}", i, t); + let should_check_bundle = t.should_check_bundle; + let result = t.arg.clone().validate(should_check_bundle); + let msg = format!("{}, result: {:?}", msg, result); + + if 
t.result.is_ok() { + assert!(result.is_ok(), "{}", msg); + } else { + let expected_error = format!("{}", t.result.as_ref().unwrap_err()); + let actual_error = format!("{}", result.unwrap_err()); + assert!(actual_error == expected_error, "{}", msg); + } + } + } +} diff --git a/src/runtime-rs/crates/shim/src/bin/main.rs b/src/runtime-rs/crates/shim/src/bin/main.rs new file mode 100644 index 0000000000..262f9e2385 --- /dev/null +++ b/src/runtime-rs/crates/shim/src/bin/main.rs @@ -0,0 +1,194 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{ + ffi::{OsStr, OsString}, + path::PathBuf, +}; + +use anyhow::{anyhow, Context, Result}; +use nix::{ + mount::{mount, MsFlags}, + sched::{self, CloneFlags}, +}; +use shim::{config, Args, Error, ShimExecutor}; + +// default tokio runtime worker threads +const DEFAULT_TOKIO_RUNTIME_WORKER_THREADS: usize = 2; +// env to config tokio runtime worker threads +const ENV_TOKIO_RUNTIME_WORKER_THREADS: &str = "TOKIO_RUNTIME_WORKER_THREADS"; + +#[derive(Debug)] +enum Action { + Run(Args), + Start(Args), + Delete(Args), + Help, + Version, +} + +fn parse_args(args: &[OsString]) -> Result { + let mut help = false; + let mut version = false; + let mut shim_args = Args::default(); + + // Crate `go_flag` is used to keep compatible with go/flag package. 
+ let rest_args = go_flag::parse_args_with_warnings::(&args[1..], None, |flags| { + flags.add_flag("address", &mut shim_args.address); + flags.add_flag("bundle", &mut shim_args.bundle); + flags.add_flag("debug", &mut shim_args.debug); + flags.add_flag("id", &mut shim_args.id); + flags.add_flag("namespace", &mut shim_args.namespace); + flags.add_flag("publish-binary", &mut shim_args.publish_binary); + flags.add_flag("socket", &mut shim_args.socket); + flags.add_flag("help", &mut help); + flags.add_flag("version", &mut version); + }) + .context(Error::ParseArgument(format!("{:?}", args)))?; + + if help { + Ok(Action::Help) + } else if version { + Ok(Action::Version) + } else if rest_args.is_empty() { + Ok(Action::Run(shim_args)) + } else if rest_args[0] == "start" { + Ok(Action::Start(shim_args)) + } else if rest_args[0] == "delete" { + Ok(Action::Delete(shim_args)) + } else { + Err(anyhow!(Error::InvalidArgument)) + } +} + +fn show_help(cmd: &OsStr) { + let path = PathBuf::from(cmd); + let name = match path.file_name() { + Some(v) => v.to_str(), + None => None, + }; + + let name = name.unwrap_or(config::RUNTIME_NAME); + + println!( + r#"Usage of {}: + -address string + grpc address back to main containerd + -bundle string + path to the bundle if not workdir + -debug + enable debug output in logs + -id string + id of the task + -namespace string + namespace that owns the shim + -publish-binary string + path to publish binary (used for publishing events) (default "containerd") + -socket string + socket path to serve + --version + show the runtime version detail and exit +"#, + name + ); +} + +fn show_version(err: Option) { + let data = format!( + r#"{} containerd shim: id: {}, version: {}, commit: {}"#, + config::PROJECT_NAME, + config::CONTAINERD_RUNTIME_NAME, + config::RUNTIME_VERSION, + config::RUNTIME_VERSION_COMMIT, + ); + + if let Some(err) = err { + eprintln!( + "{}\r\nERROR: {} failed: {:?}", + data, + config::RUNTIME_NAME, + err + ); + } else { + 
println!("{}", data) + } +} + +fn get_tokio_runtime() -> Result { + let worker_threads = std::env::var(ENV_TOKIO_RUNTIME_WORKER_THREADS) + .unwrap_or_default() + .parse() + .unwrap_or(DEFAULT_TOKIO_RUNTIME_WORKER_THREADS); + + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(worker_threads) + .enable_all() + .build() + .context("prepare tokio runtime")?; + Ok(rt) +} + +fn real_main() -> Result<()> { + let args = std::env::args_os().collect::>(); + if args.is_empty() { + return Err(anyhow!(Error::ArgumentIsEmpty( + "command-line arguments".to_string() + ))); + } + + let action = parse_args(&args).context("parse args")?; + match action { + Action::Start(args) => ShimExecutor::new(args).start().context("shim start")?, + Action::Delete(args) => ShimExecutor::new(args).delete().context("shim delete")?, + Action::Run(args) => { + // set mnt namespace + // need setup before other async call + setup_mnt().context("setup mnt")?; + + let mut shim = ShimExecutor::new(args); + let rt = get_tokio_runtime().context("get tokio runtime")?; + rt.block_on(shim.run())? + } + Action::Help => show_help(&args[0]), + Action::Version => show_version(None), + } + Ok(()) +} +fn main() { + if let Err(err) = real_main() { + show_version(Some(err)); + } +} + +fn setup_mnt() -> Result<()> { + // Unshare the mount namespace, so that the calling process has a private copy of its namespace + // which is not shared with any other process. + sched::unshare(CloneFlags::CLONE_NEWNS).context("unshare clone newns")?; + + // Mount and unmount events propagate into this mount from the (master) shared peer group of + // which it was formerly a member. Mount and unmount events under this mount do not propagate + // to any peer. 
+ mount( + Some("none"), + "/", + Some(""), + MsFlags::MS_REC | MsFlags::MS_SLAVE, + Some(""), + ) + .context("mount with slave")?; + + // Mount and unmount events immediately under this mount will propagate to the other mounts + // that are members of this mount's peer group. + mount( + Some("none"), + "/", + Some(""), + MsFlags::MS_REC | MsFlags::MS_SHARED, + Some(""), + ) + .context("mount with shared")?; + Ok(()) +} diff --git a/src/runtime-rs/crates/shim/src/config.rs.in b/src/runtime-rs/crates/shim/src/config.rs.in new file mode 100644 index 0000000000..e1a181ec4a --- /dev/null +++ b/src/runtime-rs/crates/shim/src/config.rs.in @@ -0,0 +1,19 @@ +// Copyright (c) 2020 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +// +// WARNING: This file is auto-generated - DO NOT EDIT! +// + +#![allow(dead_code)] + +pub const PROJECT_NAME: &str = "@PROJECT_NAME@"; +pub const RUNTIME_VERSION: &str = "@RUNTIME_VERSION@"; +pub const RUNTIME_VERSION_COMMIT: &str = "@VERSION_COMMIT@"; +pub const RUNTIME_GIT_COMMIT: &str = "@COMMIT@"; +pub const RUNTIME_NAME: &str = "@RUNTIME_NAME@"; +pub const CONTAINERD_RUNTIME_NAME: &str = "@CONTAINERD_RUNTIME_NAME@"; +pub const RUNTIME_DIR: &str = "@BINDIR@"; +pub const RUNTIME_PATH: &str = "@BINDIR@/@RUNTIME_NAME@"; diff --git a/src/runtime-rs/crates/shim/src/error.rs b/src/runtime-rs/crates/shim/src/error.rs new file mode 100644 index 0000000000..3867963fbc --- /dev/null +++ b/src/runtime-rs/crates/shim/src/error.rs @@ -0,0 +1,52 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::path::PathBuf; + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("failed to parse argument {0}")] + ParseArgument(String), + #[error("failed to get bundle path")] + GetBundlePath, + #[error("invalid argument")] + InvalidArgument, + #[error("argument is empty {0}")] + ArgumentIsEmpty(String), + #[error("invalid path {0}")] + 
InvalidPath(String), + + // File + #[error("failed to open file {0}")] + FileOpen(String), + #[error("failed to get file metadata {0}")] + FileGetMetadata(String), + #[error("failed to read file {0}")] + FileRead(String), + #[error("failed to write file {0}")] + FileWrite(String), + + #[error("empty sandbox id")] + EmptySandboxId, + #[error("failed to get self exec: {0}")] + SelfExec(#[source] std::io::Error), + #[error("failed to bind socket at {1} with error: {0}")] + BindSocket(#[source] std::io::Error, PathBuf), + #[error("failed to spawn child: {0}")] + SpawnChild(#[source] std::io::Error), + #[error("failed to clean container {0}")] + CleanUpContainer(String), + #[error("failed to get env variable: {0}")] + EnvVar(#[source] std::env::VarError), + #[error("failed to parse server fd environment variable {0}")] + ServerFd(String), + #[error("failed to wait ttrpc server when {0}")] + WaitServer(String), + #[error("failed to get system time: {0}")] + SystemTime(#[source] std::time::SystemTimeError), + #[error("failed to parse pid")] + ParsePid, +} diff --git a/src/runtime-rs/crates/shim/src/lib.rs b/src/runtime-rs/crates/shim/src/lib.rs new file mode 100644 index 0000000000..000c5620a2 --- /dev/null +++ b/src/runtime-rs/crates/shim/src/lib.rs @@ -0,0 +1,24 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +#[macro_use] +extern crate slog; + +logging::logger_with_subsystem!(sl, "shim"); + +mod args; +pub use args::Args; +mod error; +pub use error::Error; +mod logger; +mod panic_hook; +mod shim; +pub use shim::ShimExecutor; +#[rustfmt::skip] +pub mod config; +mod shim_delete; +mod shim_run; +mod shim_start; diff --git a/src/runtime-rs/crates/shim/src/logger.rs b/src/runtime-rs/crates/shim/src/logger.rs new file mode 100644 index 0000000000..50ba891fb3 --- /dev/null +++ b/src/runtime-rs/crates/shim/src/logger.rs @@ -0,0 +1,41 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright 
(c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::os::unix::fs::OpenOptionsExt; + +use anyhow::{Context, Result}; + +use crate::Error; + +pub(crate) fn set_logger(path: &str, sid: &str, is_debug: bool) -> Result { + let fifo = std::fs::OpenOptions::new() + .custom_flags(libc::O_NONBLOCK) + .create(true) + .write(true) + .append(true) + .open(path) + .context(Error::FileOpen(path.to_string()))?; + + let level = if is_debug { + slog::Level::Debug + } else { + slog::Level::Info + }; + + let (logger, async_guard) = logging::create_logger("kata-runtime", sid, level, fifo); + + // not reset global logger when drop + slog_scope::set_global_logger(logger).cancel_reset(); + + let level = if is_debug { + log::Level::Debug + } else { + log::Level::Info + }; + slog_stdlog::init_with_level(level).context(format!("init with level {}", level))?; + + Ok(async_guard) +} diff --git a/src/runtime-rs/crates/shim/src/panic_hook.rs b/src/runtime-rs/crates/shim/src/panic_hook.rs new file mode 100644 index 0000000000..88dbf305a6 --- /dev/null +++ b/src/runtime-rs/crates/shim/src/panic_hook.rs @@ -0,0 +1,57 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{boxed::Box, fs::OpenOptions, io::Write, ops::Deref}; + +use backtrace::Backtrace; + +const KMESG_DEVICE: &str = "/dev/kmsg"; + +// TODO: the Kata 1.x runtime had a SIGUSR1 handler that would log a formatted backtrace on +// receiving that signal. It could be useful to re-add that feature. 
+pub(crate) fn set_panic_hook() { + std::panic::set_hook(Box::new(move |panic_info| { + let (filename, line) = panic_info + .location() + .map(|loc| (loc.file(), loc.line())) + .unwrap_or(("", 0)); + + let cause = panic_info + .payload() + .downcast_ref::() + .map(std::string::String::deref); + + let cause = cause.unwrap_or_else(|| { + panic_info + .payload() + .downcast_ref::<&str>() + .copied() + .unwrap_or("") + }); + let bt = Backtrace::new(); + let bt_data = format!("{:?}", bt); + error!( + sl!(), + "A panic occurred at {}:{}: {}\r\n{:?}", filename, line, cause, bt_data + ); + + // print panic log to dmesg + // The panic log size is too large to /dev/kmsg, so write by line. + if let Ok(mut file) = OpenOptions::new().write(true).open(KMESG_DEVICE) { + file.write_all( + format!("A panic occurred at {}:{}: {}", filename, line, cause).as_bytes(), + ) + .ok(); + let lines: Vec<&str> = bt_data.split('\n').collect(); + for line in lines { + file.write_all(line.as_bytes()).ok(); + } + + file.flush().ok(); + } + std::process::abort(); + })); +} diff --git a/src/runtime-rs/crates/shim/src/shim.rs b/src/runtime-rs/crates/shim/src/shim.rs new file mode 100644 index 0000000000..83060f2b22 --- /dev/null +++ b/src/runtime-rs/crates/shim/src/shim.rs @@ -0,0 +1,119 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{ + os::unix::ffi::OsStrExt, + path::{Path, PathBuf}, +}; + +use anyhow::{anyhow, Context, Result}; +use sha2::Digest; + +use crate::{Args, Error}; + +const SOCKET_ROOT: &str = "/run/containerd"; +const SHIM_PID_FILE: &str = "shim.pid"; + +pub(crate) const ENV_KATA_RUNTIME_BIND_FD: &str = "KATA_RUNTIME_BIND_FD"; + +/// Command executor for shim. +pub struct ShimExecutor { + pub(crate) args: Args, +} + +impl ShimExecutor { + /// Create a new instance of [`Shim`]. 
+ pub fn new(args: Args) -> Self { + ShimExecutor { args } + } + + pub(crate) fn load_oci_spec(&self, path: &Path) -> Result { + let spec_file = path.join(oci::OCI_SPEC_CONFIG_FILE_NAME); + oci::Spec::load(spec_file.to_str().unwrap_or_default()).context("load spec") + } + + pub(crate) fn write_address(&self, path: &Path, address: &Path) -> Result<()> { + let file_path = &path.join("address"); + std::fs::write(file_path, address.as_os_str().as_bytes()) + .context(Error::FileWrite(format!("{:?}", &file_path))) + } + + pub(crate) fn write_pid_file(&self, path: &Path, pid: u32) -> Result<()> { + let file_path = &path.join(SHIM_PID_FILE); + std::fs::write(file_path, format!("{}", pid)) + .context(Error::FileWrite(format!("{:?}", &file_path))) + } + + // There may be a multi-container for a Pod, each container has a bundle path, we need to write + // the PID to the file for each container in their own bundle path, so we can directly get the + // `bundle_path()` and write the PID. + // While the real runtime process's PID is stored in the file in the sandbox container's bundle + // path, so needs to read from the sandbox container's bundle path. + pub(crate) fn read_pid_file(&self, path: &Path) -> Result { + let file_path = path.join(SHIM_PID_FILE); + let data = std::fs::read_to_string(&file_path) + .context(Error::FileOpen(format!("{:?}", file_path)))?; + + data.parse::().context(Error::ParsePid) + } + + pub(crate) fn socket_address(&self, id: &str) -> Result { + if id.is_empty() { + return Err(anyhow!(Error::EmptySandboxId)); + } + + let data = [&self.args.address, &self.args.namespace, id].join("/"); + let mut hasher = sha2::Sha256::new(); + hasher.update(data); + // https://github.com/containerd/containerd/blob/main/runtime/v2/shim/util_unix.go#L68 to + // generate a shim socket path. 
+ Ok(PathBuf::from(format!( + "unix://{}/s/{:X}", + SOCKET_ROOT, + hasher.finalize() + ))) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serial_test::serial; + + use kata_sys_util::spec::get_bundle_path; + + #[test] + #[serial] + fn test_shim_executor() { + let dir = tempfile::tempdir().unwrap(); + let bundle_path = dir.path(); + std::env::set_current_dir(bundle_path).unwrap(); + + let args = Args { + id: "1dfc0567".to_string(), + namespace: "test_namespace".into(), + address: "containerd_socket".into(), + publish_binary: "containerd".into(), + socket: "socket".into(), + bundle: bundle_path.to_str().unwrap().into(), + debug: false, + }; + + let executor = ShimExecutor::new(args); + + executor + .write_address(bundle_path, Path::new("12345")) + .unwrap(); + let dir = get_bundle_path().unwrap(); + let file_path = &dir.join("address"); + let buf = std::fs::read_to_string(file_path).unwrap(); + assert_eq!(&buf, "12345"); + + executor.write_pid_file(&dir, 1267).unwrap(); + let read_pid = executor.read_pid_file(&dir).unwrap(); + assert_eq!(read_pid, 1267); + } +} diff --git a/src/runtime-rs/crates/shim/src/shim_delete.rs b/src/runtime-rs/crates/shim/src/shim_delete.rs new file mode 100644 index 0000000000..fd90775662 --- /dev/null +++ b/src/runtime-rs/crates/shim/src/shim_delete.rs @@ -0,0 +1,47 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::{Context, Result}; +use containerd_shim_protos::api; +use protobuf::Message; +use std::{fs, path::Path}; + +use crate::{shim::ShimExecutor, Error}; + +impl ShimExecutor { + pub fn delete(&mut self) -> Result<()> { + self.args.validate(true).context("validate")?; + let rsp = self.do_cleanup().context("do cleanup")?; + rsp.write_to_writer(&mut std::io::stdout()) + .context(Error::FileWrite(format!("write {:?} to stdout", rsp)))?; + Ok(()) + } + + fn do_cleanup(&self) -> Result { + let mut rsp = api::DeleteResponse::new(); + 
rsp.set_exit_status(128 + libc::SIGKILL as u32); + let mut exited_time = protobuf::well_known_types::Timestamp::new(); + let seconds = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map_err(Error::SystemTime)? + .as_secs() as i64; + exited_time.set_seconds(seconds); + rsp.set_exited_at(exited_time); + + let address = self + .socket_address(&self.args.id) + .context("socket address")?; + let trim_path = address.strip_prefix("unix://").context("trim path")?; + let file_path = Path::new("/").join(trim_path); + let file_path = file_path.as_path(); + if std::fs::metadata(&file_path).is_ok() { + info!(sl!(), "remote socket path: {:?}", &file_path); + fs::remove_file(file_path).ok(); + } + service::ServiceManager::cleanup(&self.args.id).context("cleanup")?; + Ok(rsp) + } +} diff --git a/src/runtime-rs/crates/shim/src/shim_run.rs b/src/runtime-rs/crates/shim/src/shim_run.rs new file mode 100644 index 0000000000..cde365780e --- /dev/null +++ b/src/runtime-rs/crates/shim/src/shim_run.rs @@ -0,0 +1,64 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::os::unix::io::RawFd; + +use anyhow::{Context, Result}; +use kata_sys_util::spec::get_bundle_path; + +use crate::{ + logger, + shim::{ShimExecutor, ENV_KATA_RUNTIME_BIND_FD}, + Error, +}; + +impl ShimExecutor { + pub async fn run(&mut self) -> Result<()> { + crate::panic_hook::set_panic_hook(); + let sid = self.args.id.clone(); + let bundle_path = get_bundle_path().context("get bundle")?; + let path = bundle_path.join("log"); + let _logger_guard = + logger::set_logger(path.to_str().unwrap(), &sid, self.args.debug).context("set logger"); + + self.do_run() + .await + .map_err(|err| { + error!(sl!(), "failed run shim {:?}", err); + err + }) + .context("run shim")?; + + Ok(()) + } + + async fn do_run(&mut self) -> Result<()> { + info!(sl!(), "start to run"); + self.args.validate(false).context("validate")?; + + let 
server_fd = get_server_fd().context("get server fd")?; + let mut service_manager = service::ServiceManager::new( + &self.args.id, + &self.args.publish_binary, + &self.args.address, + &self.args.namespace, + server_fd, + ) + .await + .context("new shim server")?; + service_manager.run().await.context("run")?; + + Ok(()) + } +} + +fn get_server_fd() -> Result { + let env_fd = std::env::var(ENV_KATA_RUNTIME_BIND_FD).map_err(Error::EnvVar)?; + let fd = env_fd + .parse::() + .map_err(|_| Error::ServerFd(env_fd))?; + Ok(fd) +} diff --git a/src/runtime-rs/crates/shim/src/shim_start.rs b/src/runtime-rs/crates/shim/src/shim_start.rs new file mode 100644 index 0000000000..414e0ccf3f --- /dev/null +++ b/src/runtime-rs/crates/shim/src/shim_start.rs @@ -0,0 +1,234 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::{ + fs, + io::Write, + os::unix::{io::IntoRawFd, prelude::OsStrExt}, + path::{Path, PathBuf}, +}; + +use anyhow::{anyhow, Context, Result}; +use kata_sys_util::spec::get_bundle_path; +use kata_types::{container::ContainerType, k8s}; +use unix_socket::UnixListener; + +use crate::{ + shim::{ShimExecutor, ENV_KATA_RUNTIME_BIND_FD}, + Error, +}; + +impl ShimExecutor { + pub fn start(&mut self) -> Result<()> { + self.args.validate(false).context("validate")?; + + let address = self.do_start().context("do start")?; + std::io::stdout() + .write_all(address.as_os_str().as_bytes()) + .context("failed to write stdout")?; + Ok(()) + } + + fn do_start(&mut self) -> Result { + let bundle_path = get_bundle_path().context("get bundle path")?; + let spec = self.load_oci_spec(&bundle_path)?; + let (container_type, id) = k8s::container_type_with_id(&spec); + + match container_type { + ContainerType::PodSandbox => { + let address = self.socket_address(&self.args.id)?; + let socket = new_listener(&address)?; + let child_pid = self.create_shim_process(socket)?; + self.write_pid_file(&bundle_path, 
child_pid)?; + self.write_address(&bundle_path, &address)?; + Ok(address) + } + ContainerType::PodContainer => { + let sid = id + .ok_or(Error::InvalidArgument) + .context("get sid for container")?; + let (address, pid) = self.get_shim_info_from_sandbox(&sid)?; + self.write_pid_file(&bundle_path, pid)?; + self.write_address(&bundle_path, &address)?; + Ok(address) + } + } + } + + fn new_command(&self) -> Result { + if self.args.id.is_empty() + || self.args.namespace.is_empty() + || self.args.address.is_empty() + || self.args.publish_binary.is_empty() + { + return Err(anyhow!("invalid param")); + } + + let bundle_path = get_bundle_path().context("get bundle path")?; + let self_exec = std::env::current_exe().map_err(Error::SelfExec)?; + let mut command = std::process::Command::new(self_exec); + + command + .current_dir(bundle_path) + .stdin(std::process::Stdio::null()) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .arg("-id") + .arg(&self.args.id) + .arg("-namespace") + .arg(&self.args.namespace) + .arg("-address") + .arg(&self.args.address) + .arg("-publish-binary") + .arg(&self.args.publish_binary) + .env("RUST_BACKTRACE", "1"); + + if self.args.debug { + command.arg("-debug"); + } + + Ok(command) + } + + fn create_shim_process(&self, socket: T) -> Result { + let mut cmd = self.new_command().context("new command")?; + cmd.env( + ENV_KATA_RUNTIME_BIND_FD, + format!("{}", socket.into_raw_fd()), + ); + let child = cmd + .spawn() + .map_err(Error::SpawnChild) + .context("spawn child")?; + + Ok(child.id()) + } + + fn get_shim_info_from_sandbox(&self, sandbox_id: &str) -> Result<(PathBuf, u32)> { + // All containers of a pod share the same pod socket address. 
+ let address = self.socket_address(sandbox_id).context("socket address")?; + let bundle_path = get_bundle_path().context("get bundle path")?; + let parent_bundle_path = Path::new(&bundle_path) + .parent() + .unwrap_or_else(|| Path::new("")); + let sandbox_bundle_path = parent_bundle_path + .join(sandbox_id) + .canonicalize() + .context(Error::GetBundlePath)?; + let pid = self.read_pid_file(&sandbox_bundle_path)?; + + Ok((address, pid)) + } +} + +fn new_listener(address: &Path) -> Result { + let trim_path = address.strip_prefix("unix:").context("trim path")?; + let file_path = Path::new("/").join(trim_path); + let file_path = file_path.as_path(); + if let Some(parent_dir) = file_path.parent() { + fs::create_dir_all(parent_dir).context("create parent dir")?; + } + + UnixListener::bind(file_path).context("bind address") +} + +#[cfg(test)] +mod tests { + use std::path::Path; + + use serial_test::serial; + use tests_utils::gen_id; + + use super::*; + use crate::Args; + + #[test] + #[serial] + fn test_new_command() { + let dir = tempfile::tempdir().unwrap(); + let bundle_path = dir.path(); + std::env::set_current_dir(bundle_path).unwrap(); + + let args = Args { + id: "sandbox1".into(), + namespace: "ns".into(), + address: "address".into(), + publish_binary: "containerd".into(), + socket: "socket".into(), + bundle: bundle_path.to_str().unwrap().into(), + debug: false, + }; + let mut executor = ShimExecutor::new(args); + + let cmd = executor.new_command().unwrap(); + assert_eq!(cmd.get_args().len(), 8); + assert_eq!(cmd.get_envs().len(), 1); + assert_eq!(cmd.get_current_dir().unwrap(), get_bundle_path().unwrap()); + + executor.args.debug = true; + let cmd = executor.new_command().unwrap(); + assert_eq!(cmd.get_args().len(), 9); + assert_eq!(cmd.get_envs().len(), 1); + assert_eq!(cmd.get_current_dir().unwrap(), get_bundle_path().unwrap()); + } + + #[test] + #[serial] + fn test_get_info_from_sandbox() { + let dir = tempfile::tempdir().unwrap(); + let sandbox_id = 
gen_id(16); + let bundle_path = &dir.path().join(&sandbox_id); + std::fs::create_dir(bundle_path).unwrap(); + std::env::set_current_dir(bundle_path).unwrap(); + + let args = Args { + id: sandbox_id.to_owned(), + namespace: "ns1".into(), + address: "containerd_socket".into(), + publish_binary: "containerd".into(), + socket: "socket".into(), + bundle: bundle_path.to_str().unwrap().into(), + debug: false, + }; + let executor = ShimExecutor::new(args); + + let addr = executor.socket_address(&executor.args.id).unwrap(); + executor.write_address(bundle_path, &addr).unwrap(); + executor.write_pid_file(bundle_path, 1267).unwrap(); + + let container_id = gen_id(16); + let bundle_path2 = &dir.path().join(&container_id); + std::fs::create_dir(bundle_path2).unwrap(); + std::env::set_current_dir(bundle_path2).unwrap(); + + let args = Args { + id: container_id, + namespace: "ns1".into(), + address: "containerd_socket".into(), + publish_binary: "containerd".into(), + socket: "socket".into(), + bundle: bundle_path2.to_str().unwrap().into(), + debug: false, + }; + let executor2 = ShimExecutor::new(args); + + let (address, pid) = executor2.get_shim_info_from_sandbox(&sandbox_id).unwrap(); + + assert_eq!(pid, 1267); + assert_eq!(&address, &addr); + } + + #[test] + #[serial] + fn test_new_listener() { + let path = "/tmp/aaabbbccc"; + let uds_path = format!("unix://{}", path); + std::fs::remove_file(path).ok(); + + let _ = new_listener(Path::new(&uds_path)).unwrap(); + std::fs::remove_file(path).ok(); + } +} diff --git a/src/runtime-rs/tests/texture/image-bundle/config.json b/src/runtime-rs/tests/texture/image-bundle/config.json new file mode 100644 index 0000000000..0b6665a2eb --- /dev/null +++ b/src/runtime-rs/tests/texture/image-bundle/config.json @@ -0,0 +1,395 @@ +{ + "ociVersion": "0.5.0-dev", + "process": { + "terminal": true, + "user": { + "uid": 1, + "gid": 1, + "additionalGids": [ + 5, + 6 + ] + }, + "args": [ + "sh" + ], + "env": [ + 
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm" + ], + "cwd": "/", + "capabilities": { + "bounding": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "permitted": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "inheritable": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "effective": [ + "CAP_AUDIT_WRITE", + "CAP_KILL" + ], + "ambient": [ + "CAP_NET_BIND_SERVICE" + ] + }, + "rlimits": [ + { + "type": "RLIMIT_CORE", + "hard": 1024, + "soft": 1024 + }, + { + "type": "RLIMIT_NOFILE", + "hard": 1024, + "soft": 1024 + } + ], + "apparmorProfile": "acme_secure_profile", + "selinuxLabel": "system_u:system_r:svirt_lxc_net_t:s0:c124,c675", + "noNewPrivileges": true + }, + "root": { + "path": "rootfs", + "readonly": true + }, + "hostname": "slartibartfast", + "mounts": [ + { + "destination": "/proc", + "type": "proc", + "source": "proc" + }, + { + "destination": "/dev", + "type": "tmpfs", + "source": "tmpfs", + "options": [ + "nosuid", + "strictatime", + "mode=755", + "size=65536k" + ] + }, + { + "destination": "/dev/pts", + "type": "devpts", + "source": "devpts", + "options": [ + "nosuid", + "noexec", + "newinstance", + "ptmxmode=0666", + "mode=0620", + "gid=5" + ] + }, + { + "destination": "/dev/shm", + "type": "tmpfs", + "source": "shm", + "options": [ + "nosuid", + "noexec", + "nodev", + "mode=1777", + "size=65536k" + ] + }, + { + "destination": "/dev/mqueue", + "type": "mqueue", + "source": "mqueue", + "options": [ + "nosuid", + "noexec", + "nodev" + ] + }, + { + "destination": "/sys", + "type": "sysfs", + "source": "sysfs", + "options": [ + "nosuid", + "noexec", + "nodev" + ] + }, + { + "destination": "/sys/fs/cgroup", + "type": "cgroup", + "source": "cgroup", + "options": [ + "nosuid", + "noexec", + "nodev", + "relatime", + "ro" + ] + } + ], + "hooks": { + "prestart": [ + { + "path": "/usr/bin/fix-mounts", + "args": [ + "fix-mounts", + "arg1", + "arg2" + ], + "env": [ + 
"key1=value1" + ] + }, + { + "path": "/usr/bin/setup-network" + } + ], + "createRuntime": [ + { + "path": "/usr/bin/fix-mounts", + "args": ["fix-mounts", "arg1", "arg2"], + "env": [ "key1=value1"] + }, + { + "path": "/usr/bin/setup-network" + } + ], + "createContainer": [ + { + "path": "/usr/bin/mount-hook", + "args": ["-mount", "arg1", "arg2"], + "env": [ "key1=value1"] + } + ], + "startContainer": [ + { + "path": "/usr/bin/refresh-ldcache" + } + ], + "poststart": [ + { + "path": "/usr/bin/notify-start", + "timeout": 5 + } + ], + "poststop": [ + { + "path": "/usr/sbin/cleanup.sh", + "args": [ + "cleanup.sh", + "-f" + ] + } + ] + }, + "linux": { + "devices": [ + { + "path": "/dev/fuse", + "type": "c", + "major": 10, + "minor": 229, + "fileMode": 438, + "uid": 0, + "gid": 0 + }, + { + "path": "/dev/sda", + "type": "b", + "major": 8, + "minor": 0, + "fileMode": 432, + "uid": 0, + "gid": 0 + } + ], + "uidMappings": [ + { + "containerID": 0, + "hostID": 1000, + "size": 32000 + } + ], + "gidMappings": [ + { + "containerID": 0, + "hostID": 1000, + "size": 32000 + } + ], + "sysctl": { + "net.ipv4.ip_forward": "1", + "net.core.somaxconn": "256" + }, + "cgroupsPath": "/myRuntime/myContainer", + "resources": { + "network": { + "classID": 1048577, + "priorities": [ + { + "name": "eth0", + "priority": 500 + }, + { + "name": "eth1", + "priority": 1000 + } + ] + }, + "pids": { + "limit": 32771 + }, + "hugepageLimits": [ + { + "pageSize": "2MB", + "limit": 9223372036854772000 + }, + { + "pageSize": "64KB", + "limit": 1000000 + } + ], + "oomScoreAdj": 100, + "memory": { + "limit": 536870912, + "reservation": 536870912, + "swap": 536870912, + "kernel": -1, + "kernelTCP": -1, + "swappiness": 0, + "disableOOMKiller": false, + "useHierarchy": false + }, + "cpu": { + "shares": 1024, + "quota": 1000000, + "period": 500000, + "realtimeRuntime": 950000, + "realtimePeriod": 1000000, + "cpus": "2-3", + "mems": "0-7" + }, + "devices": [ + { + "allow": false, + "access": "rwm" + }, + { + 
"allow": true, + "type": "c", + "major": 10, + "minor": 229, + "access": "rw" + }, + { + "allow": true, + "type": "b", + "major": 8, + "minor": 0, + "access": "r" + } + ], + "blockIO": { + "weight": 10, + "leafWeight": 10, + "weightDevice": [ + { + "major": 8, + "minor": 0, + "weight": 500, + "leafWeight": 300 + }, + { + "major": 8, + "minor": 16, + "weight": 500 + } + ], + "throttleReadBpsDevice": [ + { + "major": 8, + "minor": 0, + "rate": 600 + } + ], + "throttleWriteIOPSDevice": [ + { + "major": 8, + "minor": 16, + "rate": 300 + } + ] + } + }, + "rootfsPropagation": "slave", + "seccomp": { + "defaultAction": "SCMP_ACT_ALLOW", + "architectures": [ + "SCMP_ARCH_X86", + "SCMP_ARCH_X32" + ], + "syscalls": [ + { + "names": [ + "getcwd", + "chmod" + ], + "action": "SCMP_ACT_ERRNO" + } + ] + }, + "namespaces": [ + { + "type": "pid" + }, + { + "type": "network" + }, + { + "type": "ipc" + }, + { + "type": "uts" + }, + { + "type": "mount" + }, + { + "type": "user" + }, + { + "type": "cgroup" + } + ], + "maskedPaths": [ + "/proc/kcore", + "/proc/latency_stats", + "/proc/timer_stats", + "/proc/sched_debug" + ], + "readonlyPaths": [ + "/proc/asound", + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger" + ], + "mountLabel": "system_u:object_r:svirt_sandbox_file_t:s0:c715,c811" + }, + "annotations": { + "com.example.key1": "value1", + "com.example.key2": "value2" + } +} \ No newline at end of file diff --git a/src/runtime-rs/tests/texture/kata-containers-configuration.toml b/src/runtime-rs/tests/texture/kata-containers-configuration.toml new file mode 100644 index 0000000000..9116080ddf --- /dev/null +++ b/src/runtime-rs/tests/texture/kata-containers-configuration.toml @@ -0,0 +1,11 @@ +[runtime] +enable_debug = true + +[hypervisor] + +[hypervisor.dragonball] +default_vcpus = 2 + +[hypervisor.qemu] +default_vcpus = 4 + diff --git a/src/runtime-rs/tests/utils/Cargo.toml b/src/runtime-rs/tests/utils/Cargo.toml new file mode 100644 index 
0000000000..7317b7f0ff --- /dev/null +++ b/src/runtime-rs/tests/utils/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "tests_utils" +version = "0.1.0" +edition = "2018" +description = "This crate is used to share code among tests" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +rand = "0.8.4" diff --git a/src/runtime-rs/tests/utils/src/lib.rs b/src/runtime-rs/tests/utils/src/lib.rs new file mode 100644 index 0000000000..b3a4b35174 --- /dev/null +++ b/src/runtime-rs/tests/utils/src/lib.rs @@ -0,0 +1,35 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +// This crate is used to share code among tests + +use std::path::PathBuf; + +use rand::{ + distributions::Alphanumeric, + {thread_rng, Rng}, +}; + +pub fn get_kata_config_file() -> PathBuf { + let target = format!( + "{}/../texture/kata-containers-configuration.toml", + env!("CARGO_MANIFEST_DIR") + ); + std::fs::canonicalize(target).unwrap() +} + +pub fn get_image_bundle_path() -> PathBuf { + let target = format!("{}/../texture/image-bundle", env!("CARGO_MANIFEST_DIR")); + std::fs::canonicalize(target).unwrap() +} + +pub fn gen_id(len: usize) -> String { + thread_rng() + .sample_iter(&Alphanumeric) + .take(len) + .map(char::from) + .collect() +} diff --git a/src/runtime/cmd/kata-runtime/kata-check_amd64.go b/src/runtime/cmd/kata-runtime/kata-check_amd64.go index 46b3a29165..09f5bfe179 100644 --- a/src/runtime/cmd/kata-runtime/kata-check_amd64.go +++ b/src/runtime/cmd/kata-runtime/kata-check_amd64.go @@ -107,6 +107,8 @@ func setCPUtype(hypervisorType vc.HypervisorType) error { fallthrough case "clh": fallthrough + case "dragonball": + fallthrough case "qemu": archRequiredCPUFlags = map[string]string{ cpuFlagVMX: "Virtualization support", diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 0903c8ea9e..fe93a84128 100644 --- 
a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -51,6 +51,7 @@ const ( clhHypervisorTableType = "clh" qemuHypervisorTableType = "qemu" acrnHypervisorTableType = "acrn" + dragonballHypervisorTableType = "dragonball" // the maximum amount of PCI bridges that can be cold plugged in a VM maxPCIBridges uint32 = 5 @@ -989,6 +990,30 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { }, nil } +func newDragonballHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { + kernel, err := h.kernel() + if err != nil { + return vc.HypervisorConfig{}, err + } + image, err := h.image() + if err != nil { + return vc.HypervisorConfig{}, err + } + kernelParams := h.kernelParams() + + return vc.HypervisorConfig{ + KernelPath: kernel, + ImagePath: image, + KernelParams: vc.DeserializeParams(strings.Fields(kernelParams)), + NumVCPUs: h.defaultVCPUs(), + DefaultMaxVCPUs: h.defaultMaxVCPUs(), + MemorySize: h.defaultMemSz(), + MemSlots: h.defaultMemSlots(), + EntropySource: h.GetEntropySource(), + Debug: h.Debug, + }, nil +} + func newFactoryConfig(f factory) (oci.FactoryConfig, error) { if f.TemplatePath == "" { f.TemplatePath = defaultTemplatePath @@ -1022,6 +1047,9 @@ func updateRuntimeConfigHypervisor(configPath string, tomlConf tomlConfig, confi case clhHypervisorTableType: config.HypervisorType = vc.ClhHypervisor hConfig, err = newClhHypervisorConfig(hypervisor) + case dragonballHypervisorTableType: + config.HypervisorType = vc.DragonballHypervisor + hConfig, err = newDragonballHypervisorConfig(hypervisor) } if err != nil { diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index 48eda09778..b82700a0e1 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -46,6 +46,9 @@ const ( // ClhHypervisor is the ICH hypervisor. ClhHypervisor HypervisorType = "clh" + // DragonballHypervisor is the Dragonball hypervisor. 
+ DragonballHypervisor HypervisorType = "dragonball" + // MockHypervisor is a mock hypervisor for testing purposes MockHypervisor HypervisorType = "mock" @@ -169,6 +172,9 @@ func (hType *HypervisorType) Set(value string) error { case "clh": *hType = ClhHypervisor return nil + case "dragonball": + *hType = DragonballHypervisor + return nil case "mock": *hType = MockHypervisor return nil diff --git a/src/runtime/virtcontainers/hypervisor_linux.go b/src/runtime/virtcontainers/hypervisor_linux.go index 1fd0375b81..3d81c1ada0 100644 --- a/src/runtime/virtcontainers/hypervisor_linux.go +++ b/src/runtime/virtcontainers/hypervisor_linux.go @@ -37,6 +37,8 @@ func NewHypervisor(hType HypervisorType) (Hypervisor, error) { return &Acrn{}, nil case ClhHypervisor: return &cloudHypervisor{}, nil + case DragonballHypervisor: + return &mockHypervisor{}, nil case MockHypervisor: return &mockHypervisor{}, nil default: diff --git a/src/runtime/virtcontainers/ipvlan_endpoint.go b/src/runtime/virtcontainers/ipvlan_endpoint.go index a257e6ecb4..53a57e740d 100644 --- a/src/runtime/virtcontainers/ipvlan_endpoint.go +++ b/src/runtime/virtcontainers/ipvlan_endpoint.go @@ -96,7 +96,6 @@ func (endpoint *IPVlanEndpoint) NetworkPair() *NetworkInterfacePair { // Attach for ipvlan endpoint bridges the network pair and adds the // tap interface of the network pair to the hypervisor. -// tap interface of the network pair to the Hypervisor. 
func (endpoint *IPVlanEndpoint) Attach(ctx context.Context, s *Sandbox) error { span, ctx := ipvlanTrace(ctx, "Attach", endpoint) defer span.End() diff --git a/src/tools/agent-ctl/Cargo.lock b/src/tools/agent-ctl/Cargo.lock index 6634ed6599..a7629599e7 100644 --- a/src/tools/agent-ctl/Cargo.lock +++ b/src/tools/agent-ctl/Cargo.lock @@ -72,16 +72,6 @@ version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" -[[package]] -name = "bytes" -version = "0.4.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "206fdffcfa2df7cbe15601ef46c813fce0965eb3286db6b56c583b814b51c81c" -dependencies = [ - "byteorder", - "iovec", -] - [[package]] name = "bytes" version = "1.1.0" @@ -135,7 +125,7 @@ checksum = "1b827f9d9f6c2fff719d25f5d44cbc8d2ef6df1ef00d055c5c14d5dc25529579" dependencies = [ "libc", "log", - "nix 0.23.1", + "nix", "regex", ] @@ -404,15 +394,6 @@ dependencies = [ "libc", ] -[[package]] -name = "iovec" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" -dependencies = [ - "libc", -] - [[package]] name = "itertools" version = "0.10.3" @@ -440,7 +421,7 @@ dependencies = [ "lazy_static", "libc", "logging", - "nix 0.23.1", + "nix", "oci", "protobuf", "protocols", @@ -528,19 +509,6 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" -[[package]] -name = "nix" -version = "0.20.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5e06129fb611568ef4e868c14b326274959aa70ff7776e9d55323531c374945" -dependencies = [ - "bitflags", - "cc", - "cfg-if 1.0.0", - "libc", - "memoffset", -] - [[package]] name = "nix" version = "0.23.1" @@ -672,7 +640,7 @@ version = "0.8.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "de5e2533f59d08fcf364fd374ebda0692a70bd6d7e66ef97f306f45c6c5d8020" dependencies = [ - "bytes 1.1.0", + "bytes", "prost-derive", ] @@ -682,7 +650,7 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "355f634b43cdd80724ee7848f95770e7e70eefa6dcf14fea676216573b8fd603" dependencies = [ - "bytes 1.1.0", + "bytes", "heck", "itertools", "log", @@ -713,15 +681,15 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "603bbd6394701d13f3f25aada59c7de9d35a6a5887cfc156181234a44002771b" dependencies = [ - "bytes 1.1.0", + "bytes", "prost", ] [[package]] name = "protobuf" -version = "2.14.0" +version = "2.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e86d370532557ae7573551a1ec8235a0f8d6cb276c7c9e6aa490b511c447485" +checksum = "cf7e6d18738ecd0902d30d1ad232c9125985a3422929b16c65517b38adc14f96" dependencies = [ "serde", "serde_derive", @@ -729,18 +697,18 @@ dependencies = [ [[package]] name = "protobuf-codegen" -version = "2.14.0" +version = "2.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de113bba758ccf2c1ef816b127c958001b7831136c9bc3f8e9ec695ac4e82b0c" +checksum = "aec1632b7c8f2e620343439a7dfd1f3c47b18906c4be58982079911482b5d707" dependencies = [ "protobuf", ] [[package]] name = "protobuf-codegen-pure" -version = "2.14.0" +version = "2.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d1a4febc73bf0cada1d77c459a0c8e5973179f1cfd5b0f1ab789d45b17b6440" +checksum = "9f8122fdb18e55190c796b088a16bdb70cd7acdcd48f7a8b796b58c62e532cc6" dependencies = [ "protobuf", "protobuf-codegen", @@ -750,7 +718,7 @@ dependencies = [ name = "protocols" version = "0.1.0" dependencies = [ - "async-trait", + "oci", "protobuf", "serde", "serde_json", @@ -865,7 +833,7 @@ dependencies = [ "inotify", "lazy_static", "libc", - "nix 0.23.1", + "nix", 
"oci", "path-absolutize", "protobuf", @@ -1087,7 +1055,7 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fbbf1c778ec206785635ce8ad57fe52b3009ae9e0c9f574a728f3049d3e55838" dependencies = [ - "bytes 1.1.0", + "bytes", "libc", "memchr", "mio", @@ -1109,36 +1077,19 @@ dependencies = [ "syn", ] -[[package]] -name = "tokio-vsock" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e0723fc001950a3b018947b05eeb45014fd2b7c6e8f292502193ab74486bdb6" -dependencies = [ - "bytes 0.4.12", - "futures", - "libc", - "tokio", - "vsock", -] - [[package]] name = "ttrpc" -version = "0.5.2" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66a973ce6d5eaa20c173635b29ffb660dafbc7ef109172c0015ba44e47a23711" +checksum = "0c7d6c992964a013c17814c08d31708d577b0aae44ebadb58755659dd824c2d1" dependencies = [ - "async-trait", "byteorder", - "futures", "libc", "log", - "nix 0.20.2", + "nix", "protobuf", "protobuf-codegen-pure", "thiserror", - "tokio", - "tokio-vsock", ] [[package]] @@ -1192,16 +1143,6 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" -[[package]] -name = "vsock" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e32675ee2b3ce5df274c0ab52d19b28789632406277ca26bffee79a8e27dc133" -dependencies = [ - "libc", - "nix 0.23.1", -] - [[package]] name = "wasi" version = "0.10.2+wasi-snapshot-preview1" diff --git a/src/tools/agent-ctl/Cargo.toml b/src/tools/agent-ctl/Cargo.toml index fba7dd41f5..8d2f93b847 100644 --- a/src/tools/agent-ctl/Cargo.toml +++ b/src/tools/agent-ctl/Cargo.toml @@ -26,12 +26,12 @@ logging = { path = "../../libs/logging" } slog = "2.7.0" slog-scope = "4.4.0" rand = "0.8.4" -protobuf = "2.14.0" +protobuf = "2.27.0" nix = "0.23.0" libc = "0.2.112" # XXX: Must be the same 
as the version used by the agent -ttrpc = { version = "0.5.2" } +ttrpc = { version = "0.6.0" } # For parsing timeouts humantime = "2.1.0" diff --git a/src/tools/agent-ctl/Makefile b/src/tools/agent-ctl/Makefile index df3eacf243..99713f6c8d 100644 --- a/src/tools/agent-ctl/Makefile +++ b/src/tools/agent-ctl/Makefile @@ -8,12 +8,9 @@ include ../../../utils.mk .DEFAULT_GOAL := default default: build -build: logging-crate-tests +build: @RUSTFLAGS="$(EXTRA_RUSTFLAGS) --deny warnings" cargo build --target $(TRIPLE) --$(BUILD_TYPE) -logging-crate-tests: - make -C $(CWD)/../../libs/logging - clean: cargo clean @@ -32,6 +29,5 @@ check: standard_rust_check check \ clean \ install \ - logging-crate-tests \ test \ vendor diff --git a/src/tools/agent-ctl/src/client.rs b/src/tools/agent-ctl/src/client.rs index 315969b87b..802bc79669 100644 --- a/src/tools/agent-ctl/src/client.rs +++ b/src/tools/agent-ctl/src/client.rs @@ -561,7 +561,7 @@ fn create_ttrpc_client( } }; - Ok(ttrpc::client::Client::new(fd)) + Ok(ttrpc::Client::new(fd)) } fn kata_service_agent( diff --git a/src/tools/agent-ctl/src/main.rs b/src/tools/agent-ctl/src/main.rs index 909b8a269e..f08c1a884b 100644 --- a/src/tools/agent-ctl/src/main.rs +++ b/src/tools/agent-ctl/src/main.rs @@ -181,11 +181,11 @@ fn connect(name: &str, global_args: clap::ArgMatches) -> Result<()> { let cfg = Config { server_address, bundle_dir, - interactive, - ignore_errors, timeout_nano, hybrid_vsock_port, + interactive, hybrid_vsock, + ignore_errors, no_auto_values, }; diff --git a/src/tools/agent-ctl/src/utils.rs b/src/tools/agent-ctl/src/utils.rs index 4c15dbb062..b24c277534 100644 --- a/src/tools/agent-ctl/src/utils.rs +++ b/src/tools/agent-ctl/src/utils.rs @@ -203,11 +203,7 @@ pub fn get_option(name: &str, options: &mut Options, args: &str) -> Result { msg = "derived"; - match options.get("cid") { - Some(value) => value, - None => "", - } - .into() + options.get("cid").unwrap_or(&"".to_string()).into() } _ => "".into(), }; diff --git 
a/src/tools/runk/Cargo.lock b/src/tools/runk/Cargo.lock index a0762d49ce..9a9ed7bc4b 100644 --- a/src/tools/runk/Cargo.lock +++ b/src/tools/runk/Cargo.lock @@ -63,16 +63,6 @@ version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" -[[package]] -name = "bytes" -version = "0.4.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "206fdffcfa2df7cbe15601ef46c813fce0965eb3286db6b56c583b814b51c81c" -dependencies = [ - "byteorder", - "iovec", -] - [[package]] name = "bytes" version = "1.1.0" @@ -127,7 +117,7 @@ checksum = "cdae996d9638ba03253ffa1c93345a585974a97abbdeab9176c77922f3efc1e8" dependencies = [ "libc", "log", - "nix 0.23.1", + "nix", "regex", ] @@ -495,15 +485,6 @@ dependencies = [ "cfg-if 1.0.0", ] -[[package]] -name = "iovec" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" -dependencies = [ - "libc", -] - [[package]] name = "itertools" version = "0.10.3" @@ -540,7 +521,7 @@ dependencies = [ "derive_builder", "libc", "logging", - "nix 0.23.1", + "nix", "oci", "rustjail", "scopeguard", @@ -633,19 +614,6 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" -[[package]] -name = "nix" -version = "0.16.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd0eaf8df8bab402257e0a5c17a254e4cc1f72a93588a1ddfb5d356c801aa7cb" -dependencies = [ - "bitflags", - "cc", - "cfg-if 0.1.10", - "libc", - "void", -] - [[package]] name = "nix" version = "0.23.1" @@ -831,7 +799,7 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "de5e2533f59d08fcf364fd374ebda0692a70bd6d7e66ef97f306f45c6c5d8020" dependencies = [ - "bytes 1.1.0", + "bytes", "prost-derive", ] 
@@ -841,7 +809,7 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "355f634b43cdd80724ee7848f95770e7e70eefa6dcf14fea676216573b8fd603" dependencies = [ - "bytes 1.1.0", + "bytes", "heck 0.3.3", "itertools", "log", @@ -872,15 +840,15 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "603bbd6394701d13f3f25aada59c7de9d35a6a5887cfc156181234a44002771b" dependencies = [ - "bytes 1.1.0", + "bytes", "prost", ] [[package]] name = "protobuf" -version = "2.14.0" +version = "2.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e86d370532557ae7573551a1ec8235a0f8d6cb276c7c9e6aa490b511c447485" +checksum = "cf7e6d18738ecd0902d30d1ad232c9125985a3422929b16c65517b38adc14f96" dependencies = [ "serde", "serde_derive", @@ -888,18 +856,18 @@ dependencies = [ [[package]] name = "protobuf-codegen" -version = "2.14.0" +version = "2.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de113bba758ccf2c1ef816b127c958001b7831136c9bc3f8e9ec695ac4e82b0c" +checksum = "aec1632b7c8f2e620343439a7dfd1f3c47b18906c4be58982079911482b5d707" dependencies = [ "protobuf", ] [[package]] name = "protobuf-codegen-pure" -version = "2.14.0" +version = "2.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d1a4febc73bf0cada1d77c459a0c8e5973179f1cfd5b0f1ab789d45b17b6440" +checksum = "9f8122fdb18e55190c796b088a16bdb70cd7acdcd48f7a8b796b58c62e532cc6" dependencies = [ "protobuf", "protobuf-codegen", @@ -909,7 +877,7 @@ dependencies = [ name = "protocols" version = "0.1.0" dependencies = [ - "async-trait", + "oci", "protobuf", "ttrpc", "ttrpc-codegen", @@ -979,7 +947,7 @@ dependencies = [ "libcontainer", "liboci-cli", "logging", - "nix 0.23.1", + "nix", "oci", "rustjail", "serde", @@ -1006,7 +974,7 @@ dependencies = [ "inotify", "lazy_static", "libc", - "nix 0.23.1", + "nix", "oci", "path-absolutize", "protobuf", @@ -1271,7 +1239,7 
@@ version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2af73ac49756f3f7c01172e34a23e5d0216f6c32333757c2c61feb2bbff5a5ee" dependencies = [ - "bytes 1.1.0", + "bytes", "libc", "memchr", "mio", @@ -1296,36 +1264,19 @@ dependencies = [ "syn", ] -[[package]] -name = "tokio-vsock" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e0723fc001950a3b018947b05eeb45014fd2b7c6e8f292502193ab74486bdb6" -dependencies = [ - "bytes 0.4.12", - "futures", - "libc", - "tokio", - "vsock", -] - [[package]] name = "ttrpc" -version = "0.5.1" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "004604e91de38bc16cb9c7898187343075388ea414ad24896a21fc4e91a7c861" +checksum = "2ecfff459a859c6ba6668ff72b34c2f1d94d9d58f7088414c2674ad0f31cc7d8" dependencies = [ - "async-trait", "byteorder", - "futures", "libc", "log", - "nix 0.16.1", + "nix", "protobuf", "protobuf-codegen-pure", "thiserror", - "tokio", - "tokio-vsock", ] [[package]] @@ -1389,22 +1340,6 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" -[[package]] -name = "void" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" - -[[package]] -name = "vsock" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e32675ee2b3ce5df274c0ab52d19b28789632406277ca26bffee79a8e27dc133" -dependencies = [ - "libc", - "nix 0.23.1", -] - [[package]] name = "wasi" version = "0.10.0+wasi-snapshot-preview1" diff --git a/src/tools/trace-forwarder/Cargo.lock b/src/tools/trace-forwarder/Cargo.lock index ab87c9db71..5357d1ec94 100644 --- a/src/tools/trace-forwarder/Cargo.lock +++ b/src/tools/trace-forwarder/Cargo.lock @@ -528,9 +528,9 @@ dependencies = [ [[package]] 
name = "protobuf" -version = "2.14.0" +version = "2.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e86d370532557ae7573551a1ec8235a0f8d6cb276c7c9e6aa490b511c447485" +checksum = "cf7e6d18738ecd0902d30d1ad232c9125985a3422929b16c65517b38adc14f96" [[package]] name = "quote" diff --git a/src/tools/trace-forwarder/Cargo.toml b/src/tools/trace-forwarder/Cargo.toml index 8a520a26a0..2579ae5b6c 100644 --- a/src/tools/trace-forwarder/Cargo.toml +++ b/src/tools/trace-forwarder/Cargo.toml @@ -22,7 +22,7 @@ serde_json = "1.0.44" anyhow = "1.0.31" opentelemetry = { version = "0.14.0", features=["serialize"] } opentelemetry-jaeger = "0.13.0" -protobuf = "=2.14.0" +protobuf = "2.27.0" tracing-opentelemetry = "0.16.0" tracing = "0.1.29" tracing-subscriber = "0.3.3" diff --git a/src/tools/trace-forwarder/Makefile b/src/tools/trace-forwarder/Makefile index 5b1c53849b..b6b223c001 100644 --- a/src/tools/trace-forwarder/Makefile +++ b/src/tools/trace-forwarder/Makefile @@ -8,12 +8,9 @@ include ../../../utils.mk .DEFAULT_GOAL := default default: build -build: logging-crate-tests +build: @RUSTFLAGS="$(EXTRA_RUSTFLAGS) --deny warnings" cargo build --target $(TRIPLE) --$(BUILD_TYPE) -logging-crate-tests: - make -C $(CWD)/../../libs/logging - clean: cargo clean @@ -32,6 +29,5 @@ check: standard_rust_check check \ clean \ install \ - logging-crate-tests \ test \ vendor diff --git a/utils.mk b/utils.mk index c87da0c06b..a52bb7362d 100644 --- a/utils.mk +++ b/utils.mk @@ -3,6 +3,23 @@ # SPDX-License-Identifier: Apache-2.0 # +# Note: +# +# Since this file defines rules, it should be included +# in other makefiles *after* their default rule has been defined. + +# Owner for installed files +export KATA_INSTALL_OWNER ?= root + +# Group for installed files +export KATA_INSTALL_GROUP ?= adm + +# Permissions for installed configuration files. +# +# XXX: Note that the permissions MUST be zero for "other" +# XXX: in case the configuration file contains secrets. 
+export KATA_INSTALL_CFG_PERMS ?= 0640 + # Create a set of standard rules for a project such that: # # - The component depends on its Makefile. @@ -160,3 +177,28 @@ standard_rust_check: cargo clippy --all-targets --all-features --release \ -- \ -D warnings + +# Install a file (full version). +# +# params: +# +# $1 : File to install. +# $2 : Directory path where file will be installed. +# $3 : Permissions to apply to the installed file. +define INSTALL_FILE_FULL + sudo install \ + --mode $3 \ + --owner $(KATA_INSTALL_OWNER) \ + --group $(KATA_INSTALL_GROUP) \ + -D $1 $2/$(notdir $1) || exit 1; +endef + +# Install a configuration file. +# +# params: +# +# $1 : File to install. +# $2 : Directory path where file will be installed. +define INSTALL_CONFIG + $(call INSTALL_FILE_FULL,$1,$2,$(KATA_INSTALL_CFG_PERMS)) +endef diff --git a/versions.yaml b/versions.yaml index 53fbe769d4..d597412a9f 100644 --- a/versions.yaml +++ b/versions.yaml @@ -313,6 +313,16 @@ languages: building Kata newest-version: "1.62.0" + golangci-lint: + description: "golangci-lint" + notes: "'version' is the default minimum version used by this project." + version: "1.41.1" + meta: + description: | + 'newest-version' is the latest version known to work when + building Kata + newest-version: "1.46.2" + specs: description: "Details of important specifications"