Mirror of https://github.com/kata-containers/kata-containers.git (synced 2026-03-02 19:06:13 +00:00)

Compare commits: 2.4.0-alph ... 2.3.1 (91 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 365e358115 |  |
|  | a2e524f356 |  |
|  | 3d4dedefda |  |
|  | 919fc56daa |  |
|  | dfbe74c489 |  |
|  | 9e7eed7c4b |  |
|  | a0bb8c5599 |  |
|  | 53cf1dd042 |  |
|  | a4dee6a591 |  |
|  | fd87b60c7a |  |
|  | 2cb4f7ba70 |  |
|  | 993dcc94ff |  |
|  | bbd7cc2f93 |  |
|  | 9837ec728c |  |
|  | 8785106f6c |  |
|  | a915f08266 |  |
|  | ec3faab892 |  |
|  | 1f61be842d |  |
|  | d2d8f9ac65 |  |
|  | ca30eee3e2 |  |
|  | 0217abce24 |  |
|  | 572b25dd35 |  |
|  | 84e69ecb22 |  |
|  | 57a6d46376 |  |
|  | 77b6cfbd15 |  |
|  | 0e1cb124b7 |  |
|  | 24085c9553 |  |
|  | 514bf74f8f |  |
|  | 77a2502a0f |  |
|  | 6413ecf459 |  |
|  | a31b5b9ee8 |  |
|  | a0bed72d49 |  |
|  | d61bcb8a44 |  |
|  | d03e05e803 |  |
|  | 0f7db91c0f |  |
|  | 25ee73ceb3 |  |
|  | 64ae76e967 |  |
|  | 271d67a831 |  |
|  | f42c7d5125 |  |
|  | 7c15335dc9 |  |
|  | 15080f20e7 |  |
|  | c2b8eb3c2c |  |
|  | fe0fbab574 |  |
|  | 89f9672f56 |  |
|  | 0a32a1793d |  |
|  | be5468fda7 |  |
|  | 18bb9a5d9b |  |
|  | f068057073 |  |
|  | 3458073d09 |  |
|  | f9c09ad5bc |  |
|  | 0e91503cd4 |  |
|  | 185f96d170 |  |
|  | 9bc543f5db |  |
|  | 198e0d1666 |  |
|  | bf183c5f7f |  |
|  | df34e91978 |  |
|  | 5995efc0a6 |  |
|  | 000f878417 |  |
|  | a6a76bb092 |  |
|  | f61e31cd84 |  |
|  | cb7891e0b4 |  |
|  | 2667e0286a |  |
|  | 3542cba8f3 |  |
|  | 117b920230 |  |
|  | 5694749ce5 |  |
|  | db9cd1078f |  |
|  | a51a1f6d06 |  |
|  | 5bc1c209b2 |  |
|  | b2851ffc9c |  |
|  | 45eafafdf3 |  |
|  | 34a1b5396a |  |
|  | f1cd3b6300 |  |
|  | e0b74bb413 |  |
|  | 8a705f74b5 |  |
|  | ac5ab86ebd |  |
|  | d22ec59920 |  |
|  | 440657b36d |  |
|  | 0c00a9d463 |  |
|  | f9bde321e9 |  |
|  | b821511992 |  |
|  | a9d5377bd9 |  |
|  | ea83ff1fc3 |  |
|  | 03f7a5e49b |  |
|  | 91003c2751 |  |
|  | 57ffe14940 |  |
|  | 5e9b807ba0 |  |
|  | de6fe98ec0 |  |
|  | de0eea5f44 |  |
|  | 73d7929c10 |  |
|  | 96b66d2cb4 |  |
|  | 62a51d51a2 |  |
1 .github/workflows/PR-wip-checks.yaml (vendored)
@@ -15,7 +15,6 @@ jobs:
    name: WIP Check
    steps:
    - name: WIP Check
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
      uses: tim-actions/wip-check@1c2a1ca6c110026b3e2297bb2ef39e1747b5a755
      with:
        labels: '["do-not-merge", "wip", "rfc"]'
14 .github/workflows/commit-message-check.yaml (vendored)
@@ -5,8 +5,6 @@ on:
      - opened
      - reopened
      - synchronize
      - labeled
      - unlabeled

env:
  error_msg: |+
@@ -20,26 +18,24 @@ jobs:
    name: Commit Message Check
    steps:
    - name: Get PR Commits
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
      id: 'get-pr-commits'
      uses: tim-actions/get-pr-commits@v1.0.0
      with:
        token: ${{ secrets.GITHUB_TOKEN }}

    - name: DCO Check
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
      uses: tim-actions/dco@2fd0504dc0d27b33f542867c300c60840c6dcb20
      with:
        commits: ${{ steps.get-pr-commits.outputs.commits }}

    - name: Commit Body Missing Check
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') && ( success() || failure() ) }}
      if: ${{ success() || failure() }}
      uses: tim-actions/commit-body-check@v1.0.2
      with:
        commits: ${{ steps.get-pr-commits.outputs.commits }}

    - name: Check Subject Line Length
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') && ( success() || failure() ) }}
      if: ${{ success() || failure() }}
      uses: tim-actions/commit-message-checker-with-regex@v0.3.1
      with:
        commits: ${{ steps.get-pr-commits.outputs.commits }}
@@ -48,7 +44,7 @@ jobs:
        post_error: ${{ env.error_msg }}

    - name: Check Body Line Length
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') && ( success() || failure() ) }}
      if: ${{ success() || failure() }}
      uses: tim-actions/commit-message-checker-with-regex@v0.3.1
      with:
        commits: ${{ steps.get-pr-commits.outputs.commits }}
@@ -75,7 +71,7 @@ jobs:
        post_error: ${{ env.error_msg }}

    - name: Check Fixes
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') && ( success() || failure() ) }}
      if: ${{ success() || failure() }}
      uses: tim-actions/commit-message-checker-with-regex@v0.3.1
      with:
        commits: ${{ steps.get-pr-commits.outputs.commits }}
@@ -86,7 +82,7 @@ jobs:
        one_pass_all_pass: 'true'

    - name: Check Subsystem
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') && ( success() || failure() ) }}
      if: ${{ success() || failure() }}
      uses: tim-actions/commit-message-checker-with-regex@v0.3.1
      with:
        commits: ${{ steps.get-pr-commits.outputs.commits }}
18 .github/workflows/kata-deploy-push.yaml (vendored)
@@ -1,15 +1,6 @@
name: kata deploy build

on:
  pull_request:
    types:
      - opened
      - edited
      - reopened
      - synchronize
      - labeled
      - unlabeled
  push:
on: [push, pull_request]

jobs:
  build-asset:
@@ -28,13 +19,11 @@ jobs:
    steps:
    - uses: actions/checkout@v2
    - name: Install docker
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
      run: |
        curl -fsSL https://test.docker.com -o test-docker.sh
        sh test-docker.sh

    - name: Build ${{ matrix.asset }}
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
      run: |
        make "${KATA_ASSET}-tarball"
        build_dir=$(readlink -f build)
@@ -44,7 +33,6 @@ jobs:
        KATA_ASSET: ${{ matrix.asset }}

    - name: store-artifact ${{ matrix.asset }}
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
      uses: actions/upload-artifact@v2
      with:
        name: kata-artifacts
@@ -57,17 +45,14 @@ jobs:
    steps:
    - uses: actions/checkout@v2
    - name: get-artifacts
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
      uses: actions/download-artifact@v2
      with:
        name: kata-artifacts
        path: build
    - name: merge-artifacts
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
      run: |
        make merge-builds
    - name: store-artifacts
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
      uses: actions/upload-artifact@v2
      with:
        name: kata-static-tarball
@@ -78,7 +63,6 @@ jobs:
    steps:
    - uses: actions/checkout@v2
    - name: make kata-tarball
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
      run: |
        make kata-tarball
        sudo make install-tarball
31 .github/workflows/kata-deploy-test.yaml (vendored)
@@ -48,18 +48,7 @@ jobs:
      - rootfs-initrd
      - shim-v2
    steps:
      # As Github action event `issue_comment` does not provide the right ref
      # (commit/branch) to be tested, let's use this third part action to work
      # this limitation around.
      - name: resolve pr refs
        id: refs
        uses: kata-containers/resolve-pr-refs@v0.0.3
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - uses: actions/checkout@v2
        with:
          ref: ${{ steps.refs.outputs.base_ref }}
      - name: Install docker
        run: |
          curl -fsSL https://test.docker.com -o test-docker.sh
@@ -86,17 +75,7 @@ jobs:
    runs-on: ubuntu-latest
    needs: build-asset
    steps:
      # As Github action event `issue_comment` does not provide the right ref
      # (commit/branch) to be tested, let's use this third part action to work
      # this limitation around.
      - name: resolve pr refs
        id: refs
        uses: kata-containers/resolve-pr-refs@v0.0.3
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
      - uses: actions/checkout@v2
        with:
          ref: ${{ steps.refs.outputs.base_ref }}
      - name: get-artifacts
        uses: actions/download-artifact@v2
        with:
@@ -115,17 +94,7 @@ jobs:
    needs: create-kata-tarball
    runs-on: ubuntu-latest
    steps:
      # As Github action event `issue_comment` does not provide the right ref
      # (commit/branch) to be tested, let's use this third part action to work
      # this limitation around.
      - name: resolve pr refs
        id: refs
        uses: kata-containers/resolve-pr-refs@v0.0.3
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
      - uses: actions/checkout@v2
        with:
          ref: ${{ steps.refs.outputs.base_ref }}
      - name: get-kata-tarball
        uses: actions/download-artifact@v2
        with:
@@ -10,15 +10,12 @@ on:
    types:
      - opened
      - reopened
      - labeled
      - unlabeled

jobs:
  move-linked-issues-to-in-progress:
    runs-on: ubuntu-latest
    steps:
    - name: Install hub
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
      run: |
        HUB_ARCH="amd64"
        HUB_VER=$(curl -sL "https://api.github.com/repos/github/hub/releases/latest" |\
@@ -29,7 +26,6 @@ jobs:
        sudo install hub /usr/local/bin

    - name: Install hub extension script
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
      run: |
        # Clone into a temporary directory to avoid overwriting
        # any existing github directory.
@@ -39,11 +35,9 @@ jobs:
        popd &>/dev/null

    - name: Checkout code to allow hub to communicate with the project
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
      uses: actions/checkout@v2

    - name: Move issue to "In progress"
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
      env:
        GITHUB_TOKEN: ${{ secrets.KATA_GITHUB_ACTIONS_TOKEN }}
      run: |
@@ -20,7 +20,6 @@ jobs:
    runs-on: ubuntu-latest
    steps:
    - name: Install hub
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
      run: |
        HUB_ARCH="amd64"
        HUB_VER=$(curl -sL "https://api.github.com/repos/github/hub/releases/latest" |\
@@ -31,7 +30,6 @@ jobs:
        sudo install hub /usr/local/bin

    - name: Checkout code to allow hub to communicate with the project
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
      uses: actions/checkout@v2

    - name: Install porting checker script
@@ -44,7 +42,6 @@ jobs:
        popd &>/dev/null

    - name: Stop PR being merged unless it has a correct set of porting labels
      if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
      env:
        GITHUB_TOKEN: ${{ secrets.KATA_GITHUB_ACTIONS_TOKEN }}
      run: |
14 .github/workflows/snap.yaml (vendored)
@@ -1,29 +1,17 @@
name: snap CI
on:
  pull_request:
    types:
      - opened
      - synchronize
      - reopened
      - edited
      - labeled
      - unlabeled

on: ["pull_request"]
jobs:
  test:
    runs-on: ubuntu-20.04
    steps:
      - name: Check out
        if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
        uses: actions/checkout@v2
        with:
          fetch-depth: 0

      - name: Install Snapcraft
        if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
        uses: samuelmeuli/action-snapcraft@v1

      - name: Build snap
        if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
        run: |
          snapcraft -d snap --destructive-mode
93 Glossary.md
@@ -1,3 +1,94 @@
# Glossary

See the [project glossary hosted in the wiki](https://github.com/kata-containers/kata-containers/wiki/Glossary).
[A](#a), [B](#b), [C](#c), [D](#d), [E](#e), [F](#f), [G](#g), [H](#h), [I](#i), [J](#j), [K](#k), [L](#l), [M](#m), [N](#n), [O](#o), [P](#p), [Q](#q), [R](#r), [S](#s), [T](#t), [U](#u), [V](#v), [W](#w), [X](#x), [Y](#y), [Z](#z)

## A

### Auto Scaling

A method used in cloud computing whereby the amount of computational resources in a server farm, typically measured in terms of the number of active servers, varies automatically based on the load on the farm.

## B

## C

### Container Security Solutions

The process of implementing security tools and policies that will give you the assurance that everything in your container is running as intended, and only as intended.

### Container Software

A standard unit of software that packages up code and all its dependencies so the application runs quickly and reliably from one computing environment to another.

### Container Runtime Interface

A plugin interface which enables Kubelet to use a wide variety of container runtimes, without the need to recompile.

### Container Virtualization

A container is a virtual runtime environment that runs on top of a single operating system (OS) kernel and emulates an operating system rather than the underlying hardware.

## D

## E

## F

## G

## H

## I

### Infrastructure Architecture

A structured and modern approach for supporting an organization and facilitating innovation within an enterprise.

## J

## K

### Kata Containers

Kata Containers is an open source project delivering increased container security and workload isolation through an implementation of lightweight virtual machines.

## L

## M

## N

## O

## P

### Pod Containers

A group of one or more containers, with shared storage/network, and a specification for how to run the containers.

### Private Cloud

A computing model that offers a proprietary environment dedicated to a single business entity.

### Public Cloud

Computing services offered by third-party providers over the public Internet, making them available to anyone who wants to use or purchase them.

## Q

## R

## S

### Serverless Containers

An architecture in which code is executed on-demand. Serverless workloads are typically in the cloud, but on-premises serverless platforms exist, too.

## T

## U

## V

### Virtual Machine Monitor

Computer software, firmware or hardware that creates and runs virtual machines.

### Virtual Machine Software

A software program or operating system that not only exhibits the behavior of a separate computer, but is also capable of performing tasks such as running applications and programs like a separate computer.

## W

## X

## Y

## Z
19 Makefile
@@ -8,25 +8,20 @@ COMPONENTS =

COMPONENTS += agent
COMPONENTS += runtime
COMPONENTS += trace-forwarder

# List of available tools
TOOLS =

TOOLS += agent-ctl
TOOLS += trace-forwarder

STANDARD_TARGETS = build check clean install test vendor

default: all

all: logging-crate-tests build

logging-crate-tests:
	make -C src/libs/logging

include utils.mk
include ./tools/packaging/kata-deploy/local-build/Makefile

all: build

# Create the rules
$(eval $(call create_all_rules,$(COMPONENTS),$(TOOLS),$(STANDARD_TARGETS)))

@@ -39,10 +34,4 @@ generate-protocols:
static-checks: build
	bash ci/static-checks.sh

.PHONY: \
	all \
	binary-tarball \
	default \
	install-binary-tarball \
	logging-crate-tests \
	static-checks
.PHONY: all default static-checks binary-tarball install-binary-tarball
@@ -70,8 +70,8 @@ The table below lists the remaining parts of the project:
| [packaging](tools/packaging) | infrastructure | Scripts and metadata for producing packaged binaries<br/>(components, hypervisors, kernel and rootfs). |
| [kernel](https://www.kernel.org) | kernel | Linux kernel used by the hypervisor to boot the guest image. Patches are stored [here](tools/packaging/kernel). |
| [osbuilder](tools/osbuilder) | infrastructure | Tool to create "mini O/S" rootfs and initrd images and kernel for the hypervisor. |
| [`agent-ctl`](src/tools/agent-ctl) | utility | Tool that provides low-level access for testing the agent. |
| [`trace-forwarder`](src/tools/trace-forwarder) | utility | Agent tracing helper. |
| [`agent-ctl`](tools/agent-ctl) | utility | Tool that provides low-level access for testing the agent. |
| [`trace-forwarder`](src/trace-forwarder) | utility | Agent tracing helper. |
| [`ci`](https://github.com/kata-containers/ci) | CI | Continuous Integration configuration files and scripts. |
| [`katacontainers.io`](https://github.com/kata-containers/www.katacontainers.io) | Source for the [`katacontainers.io`](https://www.katacontainers.io) site. |
@@ -1,4 +1,3 @@
#!/usr/bin/env bash
#
# Copyright (c) 2020 Intel Corporation
#

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
#
# Copyright (c) 2019 Intel Corporation
#

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
#
# Copyright 2021 Sony Group Corporation
#

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
# Copyright (c) 2020 Ant Group
#
# SPDX-License-Identifier: Apache-2.0

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
# Copyright (c) 2019 Ant Financial
#
# SPDX-License-Identifier: Apache-2.0

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
#
# Copyright (c) 2018 Intel Corporation
#

@@ -36,7 +36,7 @@ run_static_checks()
	# Make sure we have the targeting branch
	git remote set-branches --add origin "${branch}"
	git fetch -a
	bash "$tests_repo_dir/.ci/static-checks.sh" "$@"
	bash "$tests_repo_dir/.ci/static-checks.sh" "github.com/kata-containers/kata-containers"
}

run_go_test()

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
#
# Copyright (c) 2019 Ant Financial
#

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
#
# Copyright (c) 2018 Intel Corporation
#

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
#
# Copyright (c) 2017-2018 Intel Corporation
#
@@ -9,4 +9,4 @@ set -e
cidir=$(dirname "$0")
source "${cidir}/lib.sh"

run_static_checks "${@:-github.com/kata-containers/kata-containers}"
run_static_checks
@@ -86,6 +86,21 @@ All other configurations are supported and are working properly.

## Networking

### Docker swarm and compose support

The newest version of Docker supported is specified by the
`externals.docker.version` variable in the
[versions database](https://github.com/kata-containers/runtime/blob/master/versions.yaml).

Basic Docker swarm support works. However, if you want to use custom networks
with Docker's swarm, an older version of Docker is required. This is specified
by the `externals.docker.meta.swarm-version` variable in the
[versions database](https://github.com/kata-containers/runtime/blob/master/versions.yaml).

See issue https://github.com/kata-containers/runtime/issues/175 for more information.

Docker compose normally uses custom networks, so it also has the same limitations.

## Resource management

Due to the way VMs differ in their CPU and memory allocation, and sharing
@@ -104,6 +119,31 @@ set the size of the `/dev/shm tmpfs` within the container. It is possible to pas

See issue https://github.com/kata-containers/kata-containers/issues/21 for more information.

### docker run and sysctl

The `docker run --sysctl` feature is not implemented. At the runtime
level, this equates to the `linux.sysctl` OCI configuration. Docker
allows configuring the sysctl settings that support namespacing. From a
security and isolation point of view, it might make sense to set them in
the VM, which isolates sysctl settings. Also, given that each Kata
Container has its own kernel, we can support setting of sysctl settings
that are not namespaced. In some cases, we might need to support
configuring some of the settings on both the host side Kata Container
namespace and the Kata Containers kernel.

See issue https://github.com/kata-containers/runtime/issues/185 for more information.

## Docker daemon features

Some features enabled or implemented via the
[`dockerd` daemon](https://docs.docker.com/config/daemon/) configuration are not yet
implemented.

### SELinux support

The `dockerd` configuration option `"selinux-enabled": true` is not presently implemented
in Kata Containers. Enabling this option causes an OCI runtime error.

See issue https://github.com/kata-containers/runtime/issues/784 for more information.

The consequence of this is that the [Docker --security-opt is only partially supported](#docker---security-opt-option-partially-supported).

Kubernetes [SELinux labels](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#assign-selinux-labels-to-a-container) will also not be applied.

# Architectural limitations

This section lists items that might not be fixed due to fundamental
@@ -41,7 +41,7 @@ Documents that help to understand and contribute to Kata Containers.

### Design and Implementations

* [Kata Containers Architecture](design/architecture): Architectural overview of Kata Containers
* [Kata Containers Architecture](design/architecture.md): Architectural overview of Kata Containers
* [Kata Containers E2E Flow](design/end-to-end-flow.md): The entire end-to-end flow of Kata Containers
* [Kata Containers design](./design/README.md): More Kata Containers design documents
* [Kata Containers threat model](./threat-model/threat-model.md): Kata Containers threat model
@@ -52,18 +52,6 @@ Documents that help to understand and contribute to Kata Containers.
* [How to contribute to Kata Containers](https://github.com/kata-containers/community/blob/master/CONTRIBUTING.md)
* [Code of Conduct](../CODE_OF_CONDUCT.md)

## Help Writing a Code PR

* [Code PR advice](code-pr-advice.md).

## Help Writing Unit Tests

* [Unit Test Advice](Unit-Test-Advice.md)

## Help Improving the Documents

* [Documentation Requirements](Documentation-Requirements.md)

### Code Licensing

* [Licensing](Licensing-strategy.md): About the licensing strategy of Kata Containers.
@@ -73,6 +61,10 @@ Documents that help to understand and contribute to Kata Containers.
* [Release strategy](Stable-Branch-Strategy.md)
* [Release Process](Release-Process.md)

## Help Improving the Documents

* [Documentation Requirements](Documentation-Requirements.md)

## Website Changes

If you have a suggestion for how we can improve the
@@ -120,7 +120,7 @@ stable and main. While this is not in place currently, it should be considered i

### Patch releases

Releases are made every four weeks, which include a GitHub release as
Releases are made every three weeks, which include a GitHub release as
well as binary packages. These patch releases are made for both stable branches, and a "release candidate"
for the next `MAJOR` or `MINOR` is created from main. If there are no changes across all the repositories, no
release is created and an announcement is made on the developer mailing list to highlight this.
@@ -136,7 +136,8 @@ The process followed for making a release can be found at [Release Process](Rele

### Frequency
Minor releases are less frequent in order to provide a more stable baseline for users. They are currently
running on a sixteen weeks cadence. The release schedule can be seen on the
running on a twelve week cadence. As the Kata Containers code base has reached a certain level of
maturity, we have increased the cadence from six weeks to twelve weeks. The release schedule can be seen on the
[release rotation wiki page](https://github.com/kata-containers/community/wiki/Release-Team-Rota).

### Compatibility
@@ -1,379 +0,0 @@
# Unit Test Advice

## Overview

This document offers advice on writing a Unit Test (UT) in
[Golang](https://golang.org) and [Rust](https://www.rust-lang.org).

## General advice

### Unit test strategies

#### Positive and negative tests

Always add positive tests (where success is expected) *and* negative
tests (where failure is expected).

#### Boundary condition tests

Try to add unit tests that exercise boundary conditions such as:

- Missing values (`null` or `None`).
- Empty strings and huge strings.
- Empty (or uninitialised) complex data structures
  (such as lists, vectors and hash tables).
- Common numeric values (such as `-1`, `0`, `1` and the minimum and
  maximum values).

#### Test unusual values

Also always consider "unusual" input values such as:

- String values containing spaces, Unicode characters, special
  characters, escaped characters or null bytes.

  > **Note:** Consider these unusual values in prefix, infix and
  > suffix position.

- String values that cannot be converted into numeric values or which
  contain invalid structured data (such as invalid JSON).

#### Other types of tests

If the code requires other forms of testing (such as stress testing,
fuzz testing and integration testing), raise a GitHub issue and
reference it on the issue you are using for the main work. This
ensures the test team are aware that a new test is required.

### Test environment

#### Create unique files and directories

Ensure your tests do not write to a fixed file or directory. This can
cause problems when running multiple tests simultaneously and also
when running tests after a previous test run failure.

#### Assume parallel testing

Always assume your tests will be run *in parallel*. If this is
problematic for a test, force it to run in isolation using the
`serial_test` crate for Rust code for example.
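For Go code, the standard `testing` package offers equivalent building blocks. The following is a minimal sketch (standard library only, nothing Kata-specific): `t.Parallel()` marks a test as safe to run alongside others, and `t.TempDir()` provides a unique, automatically-removed directory, which also satisfies the advice above about unique files and directories.

```go
package example

import (
	"os"
	"path/filepath"
	"testing"
)

func TestWritesConfig(t *testing.T) {
	// Opt in to running alongside other parallel tests.
	t.Parallel()

	// t.TempDir() returns a unique directory per test and
	// removes it automatically when the test completes.
	tmpdir := t.TempDir()

	cfg := filepath.Join(tmpdir, "config.json")
	if err := os.WriteFile(cfg, []byte(`{}`), 0o600); err != nil {
		t.Fatalf("failed to write config: %v", err)
	}
}
```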
### Running

Ensure you run the unit tests and they all pass before raising a PR.
Ideally do this on different distributions on different architectures
to maximise coverage (and so minimise surprises when your code runs in
the CI).

## Assertions

### Golang assertions

Use the `testify` assertions package to create a new assertion object as this
keeps the test code free from distracting `if` tests:

```go
func TestSomething(t *testing.T) {
	assert := assert.New(t)

	err := doSomething()
	assert.NoError(err)
}
```

### Rust assertions

Use the standard set of `assert!()` macros.

## Table driven tests

Try to write tests using a table-based approach. This allows you to distill
the logic into a compact table (rather than spreading the tests across
multiple test functions). It also makes it easy to cover all the
interesting boundary conditions:

### Golang table driven tests

Assume the following function:

```go
// The function under test.
//
// Accepts a string and an integer and returns the
// result of sticking them together separated by a dash as a string.
func joinParamsWithDash(str string, num int) (string, error) {
	if str == "" {
		return "", errors.New("string cannot be blank")
	}

	if num <= 0 {
		return "", errors.New("number must be positive")
	}

	return fmt.Sprintf("%s-%d", str, num), nil
}
```

A table driven approach to testing it:

```go
import (
	"fmt"
	"testing"

	"github.com/stretchr/testify/assert"
)

func TestJoinParamsWithDash(t *testing.T) {
	assert := assert.New(t)

	// Type used to hold function parameters and expected results.
	type testData struct {
		param1         string
		param2         int
		expectedResult string
		expectError    bool
	}

	// List of tests to run including the expected results
	data := []testData{
		// Failure scenarios
		{"", -1, "", true},
		{"", 0, "", true},
		{"", 1, "", true},
		{"foo", 0, "", true},
		{"foo", -1, "", true},

		// Success scenarios
		{"foo", 1, "foo-1", false},
		{"bar", 42, "bar-42", false},
	}

	// Run the tests
	for i, d := range data {
		// Create a test-specific string that is added to each assert
		// call. It will be displayed if any assert test fails.
		msg := fmt.Sprintf("test[%d]: %+v", i, d)

		// Call the function under test
		result, err := joinParamsWithDash(d.param1, d.param2)

		// Update the message for more information on failure
		msg = fmt.Sprintf("%s, result: %q, err: %v", msg, result, err)

		if d.expectError {
			assert.Error(err, msg)

			// If an error is expected, there is no point
			// performing additional checks.
			continue
		}

		assert.NoError(err, msg)
		assert.Equal(d.expectedResult, result, msg)
	}
}
```

### Rust table driven tests

Assume the following function:

```rust
// Convenience type to allow Result return types to only specify the type
// for the true case; failures are specified as static strings.
// XXX: This is an example. In real code use the "anyhow" and
// XXX: "thiserror" crates.
pub type Result<T> = std::result::Result<T, &'static str>;

// The function under test.
//
// Accepts a string and an integer and returns the
// result of sticking them together separated by a dash as a string.
fn join_params_with_dash(str: &str, num: i32) -> Result<String> {
    if str.is_empty() {
        return Err("string cannot be blank");
    }

    if num <= 0 {
        return Err("number must be positive");
    }

    let result = format!("{}-{}", str, num);

    Ok(result)
}
```

A table driven approach to testing it:

```rust
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_join_params_with_dash() {
        // This is a type used to record all details of the inputs
        // and outputs of the function under test.
        #[derive(Debug)]
        struct TestData<'a> {
            str: &'a str,
            num: i32,
            result: Result<String>,
        }

        // The tests can now be specified as a set of inputs and outputs
        let tests = &[
            // Failure scenarios
            TestData {
                str: "",
                num: 0,
                result: Err("string cannot be blank"),
            },
            TestData {
                str: "foo",
                num: -1,
                result: Err("number must be positive"),
            },

            // Success scenarios
            TestData {
                str: "foo",
                num: 42,
                result: Ok("foo-42".to_string()),
            },
            TestData {
                str: "-",
                num: 1,
                result: Ok("--1".to_string()),
            },
        ];

        // Run the tests
        for (i, d) in tests.iter().enumerate() {
            // Create a string containing details of the test
            let msg = format!("test[{}]: {:?}", i, d);

            // Call the function under test
            let result = join_params_with_dash(d.str, d.num);

            // Update the test details string with the results of the call
            let msg = format!("{}, result: {:?}", msg, result);

            // Perform the checks
            if d.result.is_ok() {
                assert!(result == d.result, "{}", msg);
                continue;
            }

            let expected_error = format!("{}", d.result.as_ref().unwrap_err());
            let actual_error = format!("{}", result.unwrap_err());
            assert!(actual_error == expected_error, "{}", msg);
        }
    }
}
```

## Temporary files

Always delete temporary files on success.

### Golang temporary files

```go
func TestSomething(t *testing.T) {
	assert := assert.New(t)

	// Create a temporary directory
	tmpdir, err := os.MkdirTemp("", "")
	assert.NoError(err)

	// Delete it at the end of the test
	defer os.RemoveAll(tmpdir)

	// Add test logic that will use the tmpdir here...
}
```

### Rust temporary files

Use the `tempfile` crate which allows files and directories to be deleted
automatically:

```rust
#[cfg(test)]
mod tests {
    use tempfile::tempdir;

    #[test]
    fn test_something() {
        // Create a temporary directory (which will be deleted automatically)
        let dir = tempdir().expect("failed to create tmpdir");

        let filename = dir.path().join("file.txt");

        // create filename ...
    }
}
```

## Test user

[Unit tests are run *twice*](https://github.com/kata-containers/tests/blob/main/.ci/go-test.sh):

- as the current user
- as the `root` user (if different to the current user)

When writing a test consider which user should run it; even if the code the
test is exercising runs as `root`, it may be necessary to *only* run the test
as a non-`root` user for the test to be meaningful. Add appropriate skip
guards around code that requires `root` and non-`root` so that the test
will run if the correct type of user is detected and be skipped if not.

### Run Golang tests as a different user

The main repository has the most comprehensive set of skip abilities. See:

- https://github.com/kata-containers/kata-containers/tree/main/src/runtime/pkg/katatestutils
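If pulling in that package is not an option, a minimal hand-rolled guard can be written with the standard library alone, as in this sketch:

```go
package example

import (
	"os"
	"testing"
)

// skipIfNotRoot skips the calling test unless it is running as root.
func skipIfNotRoot(t *testing.T) {
	t.Helper()

	if os.Geteuid() != 0 {
		t.Skip("test needs root")
	}
}

func TestThatMustBeRunAsRoot(t *testing.T) {
	// Not running as the superuser, so skip.
	skipIfNotRoot(t)

	// Test logic that genuinely requires root goes here...
}
```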
### Run Rust tests as a different user

One method is to use the `nix` crate along with some custom macros:

```rust
#[cfg(test)]
mod tests {
    #[allow(unused_macros)]
    macro_rules! skip_if_root {
        () => {
            if nix::unistd::Uid::effective().is_root() {
                println!("INFO: skipping {} which needs non-root", module_path!());
                return;
            }
        };
    }

    #[allow(unused_macros)]
    macro_rules! skip_if_not_root {
        () => {
            if !nix::unistd::Uid::effective().is_root() {
                println!("INFO: skipping {} which needs root", module_path!());
                return;
            }
        };
    }

    #[test]
    fn test_that_must_be_run_as_root() {
        // Not running as the superuser, so skip.
        skip_if_not_root!();

        // Run test *iff* the user running the test is root

        // ...
    }
}
```
@@ -102,7 +102,7 @@ first
[install the latest release](#determine-latest-version).

See the
[manual installation documentation](install/README.md#manual-installation)
[manual installation installation documentation](install/README.md#manual-installation)
for details on how to automatically install and configure a static release
with containerd.

@@ -114,7 +114,7 @@ with containerd.
> kernel or image.

If you are using custom
[guest assets](design/architecture/README.md#guest-assets),
[guest assets](design/architecture.md#guest-assets),
you must upgrade them to work with Kata Containers 2.x since Kata
Containers 1.x assets will **not** work.
@@ -1,247 +0,0 @@
# Code PR Advice

Before raising a PR containing code changes, we suggest you consider
the following to ensure a smooth and fast process.

> **Note:**
>
> - All the advice in this document is optional. However, if the
>   advice provided is not followed, there is no guarantee your PR
>   will be merged.
>
> - All the check tools will be run automatically on your PR by the CI.
>   However, if you run them locally first, there is a much better
>   chance of a successful initial CI run.

## Assumptions

This document assumes you have already read (and in the case of the
code of conduct agreed to):

- The [Kata Containers code of conduct](https://github.com/kata-containers/community/blob/main/CODE_OF_CONDUCT.md).
- The [Kata Containers contributing guide](https://github.com/kata-containers/community/blob/main/CONTRIBUTING.md).

## Code

### Architectures

Do not write architecture-specific code if it is possible to write the
code generically.

### General advice

- Do not write code to impress: instead write code that is easy to read and understand.

- Always consider which user will run the code. Try to minimise
  the privileges the code requires.

### Comments

Always add comments if the intent of the code is not obvious. However,
try to avoid comments if the code could be made clearer (for example
by using more meaningful variable names).

### Constants

Don't embed magic numbers and strings in functions, particularly if
they are used repeatedly.

Create constants at the top of the file instead.
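For example, a small Go sketch of the idea (the names and values here are illustrative only):

```go
package example

import "time"

// Named constants, declared once at the top of the file, instead of
// magic values scattered through the function bodies.
const (
	// defaultTimeout is how long to wait for a response.
	defaultTimeout = 30 * time.Second

	// stateFileMode is the permission mode for state files.
	stateFileMode = 0o600
)
```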
### Copyright and license

Ensure all new files contain a copyright statement and an SPDX license
identifier in the comments at the top of the file.

### FIXME and TODO

If the code contains areas that are not fully implemented, make this
clear with a comment which provides a link to a GitHub issue that provides
further information.

Do not just rely on comments in this case though: if possible, return
a "`BUG: feature X not implemented see {bug-url}`" type error.
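A Go sketch of that pattern (the feature name and issue URL are placeholders, not references to real issues):

```go
package example

import "fmt"

// featureXIssue is a placeholder for the GitHub issue tracking the work.
const featureXIssue = "https://github.com/example/repo/issues/NNN"

// doFeatureX is not fully implemented yet.
//
// FIXME: see featureXIssue for details.
func doFeatureX() error {
	// Return an explicit error rather than silently doing nothing.
	return fmt.Errorf("BUG: feature X not implemented, see %s", featureXIssue)
}
```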
### Functions

- Keep functions relatively short (less than 100 lines is a good "rule of thumb").

- Document functions if the parameters, return value or general intent
  of the function is not obvious.

- Always return errors where possible.

  Do not discard error return values from the functions this function
  calls.
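A minimal Go illustration of the last point (the helper name and path are hypothetical):

```go
package example

import (
	"fmt"
	"os"
)

// writeState persists state, propagating every error it sees rather
// than discarding any of them.
func writeState(path string, data []byte) error {
	// Bad: "_ = os.WriteFile(path, data, 0o600)" silently drops the error.
	if err := os.WriteFile(path, data, 0o600); err != nil {
		return fmt.Errorf("failed to write state file %q: %w", path, err)
	}

	return nil
}
```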
### Logging

- Don't use multiple log calls when a single log call could be used.

- Use structured logging where possible to allow
  [standard tooling](https://github.com/kata-containers/tests/tree/main/cmd/log-parser)
  to extract the log fields.
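For instance, a sketch of a single structured log call using the `logrus` package (assumed here purely for illustration; the field names are made up):

```go
package example

import "github.com/sirupsen/logrus"

func logContainerStart(sandboxID, containerID string) {
	// One structured call carrying all the fields, rather than
	// several calls each logging a fragment of the message.
	logrus.WithFields(logrus.Fields{
		"sandbox":   sandboxID,
		"container": containerID,
	}).Info("container started")
}
```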
### Names

Give functions, macros and variables clear and meaningful names.

### Structures

#### Golang structures

Unlike Rust, Go does not enforce that all structure members be set.
This has led to numerous bugs in the past where code like the
following is used:

```go
type Foo struct {
	Key   string
	Value string
}

// BUG: Key not set, but nobody noticed! ;(
foo1 := Foo{
	Value: "foo",
}
```

A much safer approach is to create a constructor function to enforce
integrity:

```go
type Foo struct {
	Key   string
	Value string
}

func NewFoo(key, value string) (*Foo, error) {
	if key == "" {
		return nil, errors.New("Foo needs a key")
	}

	if value == "" {
		return nil, errors.New("Foo needs a value")
	}

	return &Foo{
		Key:   key,
		Value: value,
	}, nil
}

func testFoo() error {
	// BUG: Key not set, but nobody noticed! ;(
	badFoo := Foo{Value: "value"}
	_ = badFoo

	// Ok - the constructor performs the needed validation
	goodFoo, err := NewFoo("name", "value")
	if err != nil {
		return err
	}
	_ = goodFoo

	return nil
}
```

> **Note:**
>
> The above is just an example. The *safest* approach would be to move
> `NewFoo()` into a separate package and make `Foo` and its elements
> private. The compiler would then enforce the use of the constructor
> to guarantee correctly defined objects.

### Tracing

Consider if the code needs to create a new
[trace span](https://github.com/kata-containers/kata-containers/blob/main/docs/tracing.md).

Ensure any new trace spans added to the code are completed.

## Tests

### Unit tests

Where possible, code changes should be accompanied by unit tests.

Consider using the standard
[table-based approach](Unit-Test-Advice.md)
as it encourages you to make functions small and simple, and also
allows you to think about what types of value to test.

### Other categories of test

Raise a GitHub issue in the
[`tests`](https://github.com/kata-containers/tests) repository that
explains what sort of test is required along with as much detail as
possible. Ensure the original issue is referenced on the `tests` issue.

### Unsafe code

#### Rust language specifics

Minimise the use of `unsafe` blocks in Rust code and, since it is
potentially dangerous, always write [unit tests](#unit-tests)
for this code where possible.

`expect()` and `unwrap()` will cause the code to panic on error.
Prefer to return a `Result` on error rather than using these calls to
allow the caller to deal with the error condition.

The table below lists the small number of cases where use of
`expect()` and `unwrap()` are permitted:

| Area | Rationale for permitting |
|-|-|
| In test code (the `tests` module) | Panics will cause the test to fail, which is desirable. |
| `lazy_static!()` | This magic macro cannot "return" a value as it runs before `main()`. |
| `defer!()` | Similar to golang's `defer()` but doesn't allow the use of `?`. |
| `tokio::spawn(async move {})` | Cannot currently return a `Result` from an `async move` closure. |
| If an explicit test is performed before the `unwrap()` / `expect()` | *"Just about acceptable"*, but not ideal `[*]` |
| `Mutex.lock()` | Almost unrecoverable if it fails in the lock acquisition |

`[*]` - This can lead to bad *future* code: consider what would
happen if the explicit test gets dropped in the future. This is more
likely to happen if the test and the extraction of the value are two
separate operations. In summary, this strategy can introduce an
insidious maintenance issue.

## Documentation

### General requirements

- All new features should be accompanied by documentation explaining:

  - What the new feature does

  - Why it is useful

  - How to use the feature

  - Any known issues or limitations

    Links should be provided to GitHub issues tracking the issues

- The [documentation requirements document](Documentation-Requirements.md)
  explains how the project formats documentation.

### Markdown syntax

Run the
[markdown checker](https://github.com/kata-containers/tests/tree/main/cmd/check-markdown)
on your documentation changes.

### Spell check

Run the
[spell checker](https://github.com/kata-containers/tests/tree/main/cmd/check-spelling)
on your documentation changes.

## Finally

You may wish to read the documentation that the
[Kata Review Team](https://github.com/kata-containers/community/blob/main/Rota-Process.md) use to help review PRs:

- [PR review guide](https://github.com/kata-containers/community/blob/main/PR-Review-Guide.md).
- [documentation review process](https://github.com/kata-containers/community/blob/main/Documentation-Review-Process.md).
@@ -2,7 +2,7 @@

Kata Containers design documents:

- [Kata Containers architecture](architecture)
- [Kata Containers architecture](architecture.md)
- [API Design of Kata Containers](kata-api-design.md)
- [Design requirements for Kata Containers](kata-design-requirements.md)
- [VSocks](VSocks.md)
@@ -1 +1 @@
(draw.io diagram source file changed: before and after `<mxfile>` XML for the architecture diagram — compressed diagram content omitted)
Binary file not shown. (Image changed: 90 KiB before, 93 KiB after.)
290 docs/design/architecture.md (new file)
@@ -0,0 +1,290 @@
# Kata Containers Architecture

## Overview

This is an architectural overview of Kata Containers, based on the 2.0 release.

The primary deliverable of the Kata Containers project is a CRI friendly shim. There is also a CRI friendly library API behind it.

The [Kata Containers runtime](../../src/runtime)
is compatible with the [OCI](https://github.com/opencontainers) [runtime specification](https://github.com/opencontainers/runtime-spec)
and therefore works seamlessly with the [Kubernetes\* Container Runtime Interface (CRI)](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-node/container-runtime-interface.md)
through the [CRI-O\*](https://github.com/kubernetes-incubator/cri-o) and
[Containerd\*](https://github.com/containerd/containerd) implementations.

Kata Containers creates a QEMU\*/KVM virtual machine for each pod that `kubelet` (Kubernetes) creates.

The [`containerd-shim-kata-v2` (shown as `shimv2` from this point onwards)](../../src/runtime/cmd/containerd-shim-kata-v2/)
is the Kata Containers entrypoint, which
implements the [Containerd Runtime V2 (Shim API)](https://github.com/containerd/containerd/tree/master/runtime/v2) for Kata.

Before `shimv2` (as done in [Kata Containers 1.x releases](https://github.com/kata-containers/runtime/releases)), we needed to create a `containerd-shim` and a [`kata-shim`](https://github.com/kata-containers/shim) for each container and the Pod sandbox itself, plus an optional [`kata-proxy`](https://github.com/kata-containers/proxy) when VSOCK was not available. With `shimv2`, Kubernetes can launch Pod and OCI compatible containers with one shim (the `shimv2`) per Pod instead of `2N+1` shims, and no standalone `kata-proxy` process even if no VSOCK is available.
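For a Pod running three containers, for instance, that meant `2*3+1 = 7` shim processes, whereas `shimv2` needs only a single process for the whole Pod.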

|
||||
|
||||
The container process is then spawned by
|
||||
[`kata-agent`](../../src/agent), an agent process running
|
||||
as a daemon inside the virtual machine. `kata-agent` runs a [`ttRPC`](https://github.com/containerd/ttrpc-rust) server in
|
||||
the guest using a VIRTIO serial or VSOCK interface which QEMU exposes as a socket
|
||||
file on the host. `shimv2` uses a `ttRPC` protocol to communicate with
|
||||
the agent. This protocol allows the runtime to send container management
|
||||
commands to the agent. The protocol is also used to carry the I/O streams (stdout,
|
||||
stderr, stdin) between the containers and the manage engines (e.g. CRI-O or containerd).
|
||||
|
||||
For any given container, both the init process and all potentially executed
|
||||
commands within that container, together with their related I/O streams, need
|
||||
to go through the VSOCK interface exported by QEMU.
|
||||
|
||||
The container workload, that is, the actual OCI bundle rootfs, is exported from the
|
||||
host to the virtual machine. In the case where a block-based graph driver is
|
||||
configured, `virtio-scsi` will be used. In all other cases a `virtio-fs` VIRTIO mount point
|
||||
will be used. `kata-agent` uses this mount point as the root filesystem for the
|
||||
container processes.
|
||||
|
||||
## Virtualization

How Kata Containers maps container concepts to virtual machine technologies, and how this is realized in the multiple
hypervisors and VMMs that Kata supports, is described within the [virtualization documentation](./virtualization.md).

## Guest assets

The hypervisor will launch a virtual machine which includes a minimal guest kernel
and a guest image.

### Guest kernel

The guest kernel is passed to the hypervisor and used to boot the virtual
machine. The default kernel provided in Kata Containers is highly optimized for
kernel boot time and minimal memory footprint, providing only those services
required by a container workload. This is based on a very current upstream Linux
kernel.

### Guest image

Kata Containers supports both an `initrd` and `rootfs` based minimal guest image.

#### Root filesystem image

The default packaged root filesystem image, sometimes referred to as the "mini O/S", is a
highly optimized container bootstrap system based on [Clear Linux](https://clearlinux.org/). It provides an extremely minimal environment and
has a highly optimized boot path.

The only services running in the context of the mini O/S are the init daemon
(`systemd`) and the [Agent](#agent). The real workload the user wishes to run
is created using libcontainer, creating a container in the same manner that is done
by `runc`.

For example, when `ctr run -ti ubuntu date` is run:

- The hypervisor will boot the mini-OS image using the guest kernel.
- `systemd`, running inside the mini-OS context, will launch the `kata-agent` in
  the same context.
- The agent will create a new confined context to run the specified command in
  (`date` in this example).
- The agent will then execute the command (`date` in this example) inside this
  new context, first setting the root filesystem to the expected Ubuntu\* root
  filesystem.

#### Initrd image

The initrd image is a compressed `cpio(1)` archive, created from a rootfs, which is loaded into memory and used as part of the Linux startup process. During startup, the kernel unpacks it into a special instance of a `tmpfs` that becomes the initial root filesystem.

The only service running in the context of the initrd is the [Agent](#agent), which acts as the init daemon. The real workload the user wishes to run is created using libcontainer, creating a container in the same manner that is done by `runc`.

## Agent

[`kata-agent`](../../src/agent) is a process running in the guest as a supervisor for managing containers and processes running within those containers.

For the 2.0 release, the `kata-agent` was rewritten in the [Rust programming language](https://www.rust-lang.org/) to minimize its memory footprint while preserving the memory safety of the original Go version of [`kata-agent` used in Kata Containers 1.x](https://github.com/kata-containers/agent). The memory footprint reduction is substantial: from tens of megabytes down to less than 100 kilobytes, enabling Kata Containers in more use cases such as serverless and edge computing.

The `kata-agent` execution unit is the sandbox. A `kata-agent` sandbox is a container sandbox defined by a set of namespaces (NS, UTS, IPC and PID). `shimv2` can
run several containers per VM to support container engines that require multiple
containers running inside a pod.

`kata-agent` communicates with the other Kata components over `ttRPC`.

## Runtime

`containerd-shim-kata-v2` is a [containerd runtime shimv2](https://github.com/containerd/containerd/blob/v1.4.1/runtime/v2/README.md) implementation and is responsible for handling the `runtime v2 shim APIs`, which are similar to [the OCI runtime specification](https://github.com/opencontainers/runtime-spec) but simplify the architecture by loading the runtime once and making RPC calls to handle the various container lifecycle commands. This refinement is an improvement on the OCI specification, which requires the container manager to call the runtime binary multiple times, at least once for each lifecycle command.

`containerd-shim-kata-v2` heavily utilizes the
[virtcontainers package](../../src/runtime/virtcontainers/), which provides a generic, runtime-specification agnostic, hardware-virtualized containers library.

### Configuration

The runtime uses a TOML format configuration file called `configuration.toml`. By default this file is installed in the `/usr/share/defaults/kata-containers` directory and contains various settings such as the paths to the hypervisor, the guest kernel and the mini-OS image.

The actual configuration file paths can be determined by running:
```
$ kata-runtime --show-default-config-paths
```
Most users will not need to modify the configuration file.

The file is well commented and provides a few "knobs" that can be used to modify the behavior of the runtime and your chosen hypervisor.

The configuration file is also used to enable runtime [debug output](../Developer-Guide.md#enable-full-debug).

## Networking

Containers will typically live in their own, possibly shared, networking namespace.
At some point in a container lifecycle, container engines will set up that namespace
to add the container to a network which is isolated from the host network, but
which is shared between containers.

In order to do so, container engines will usually add one end of a virtual
ethernet (`veth`) pair into the container networking namespace. The other end of
the `veth` pair is added to the host networking namespace.

This is a very namespace-centric approach as many hypervisors/VMMs cannot handle `veth`
interfaces. Typically, `TAP` interfaces are created for VM connectivity.

To overcome incompatibility between typical container engines' expectations
and virtual machines, Kata Containers networking transparently connects `veth`
interfaces with `TAP` ones using Traffic Control:

![Kata Containers networking](arch-images/network.png)

With a TC filter in place, a redirection is created between the container network and the
virtual machine. As an example, the CNI may create a device, `eth0`, in the container's network
namespace, which is a VETH device. Kata Containers will create a tap device for the VM, `tap0_kata`,
and set up a TC redirection filter to mirror traffic from `eth0`'s ingress to `tap0_kata`'s egress,
and a second to mirror traffic from `tap0_kata`'s ingress to `eth0`'s egress.
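
The following sketch shows roughly equivalent manual commands, assuming an existing `eth0` in the current network namespace and a hypothetical `tap0_kata` device (the runtime performs the equivalent operations programmatically, not via the CLI):

```bash
# Create and bring up the tap device that will be handed to the VM.
$ sudo ip tuntap add dev tap0_kata mode tap
$ sudo ip link set dev tap0_kata up

# Attach ingress qdiscs so redirection filters can be installed.
$ sudo tc qdisc add dev eth0 handle ffff: ingress
$ sudo tc qdisc add dev tap0_kata handle ffff: ingress

# Mirror eth0 ingress to tap0_kata egress, and the reverse.
$ sudo tc filter add dev eth0 parent ffff: protocol all u32 match u8 0 0 action mirred egress redirect dev tap0_kata
$ sudo tc filter add dev tap0_kata parent ffff: protocol all u32 match u8 0 0 action mirred egress redirect dev eth0
```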

Kata Containers maintains support for MACVTAP, which was an earlier implementation used in Kata. TC-filter
is the default because it allows for simpler configuration, better CNI plugin compatibility, and performance
on par with MACVTAP.

Kata Containers has deprecated support for the bridge model due to its poor performance relative to TC-filter and MACVTAP.

Kata Containers supports both
[CNM](https://github.com/docker/libnetwork/blob/master/docs/design.md#the-container-network-model)
and [CNI](https://github.com/containernetworking/cni) for networking management.

### Network Hotplug

Kata Containers has developed a set of network sub-commands and APIs to add, list and
remove a guest network endpoint and to manipulate the guest route table.

The following diagram illustrates the Kata Containers network hotplug workflow.

![Network Hotplug](arch-images/kata-containers-network-hotplug.png)

## Storage

Container workloads are shared with the virtualized environment through [virtio-fs](https://virtio-fs.gitlab.io/).

The [devicemapper `snapshotter`](https://github.com/containerd/containerd/tree/master/snapshots/devmapper) is a special case. The `snapshotter` uses dedicated block devices rather than formatted filesystems, and operates at the block level rather than the file level. This knowledge is used to directly use the underlying block device instead of the overlay file system for the container root file system. The block device maps to the top read-write layer for the overlay. This approach gives much better I/O performance compared to using `virtio-fs` to share the container file system.

Kata Containers has the ability to hotplug and remove block devices, which makes it possible to use block devices for containers started after the VM has been launched.

Users can check to see if the container uses the devicemapper block device as its rootfs by calling `mount(8)` within the container. If the devicemapper block device
is used, `/` will be mounted on `/dev/vda`. Users can disable direct mounting of the underlying block device through the runtime configuration.
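
For example, running `mount(8)` inside such a container might show something like the following (illustrative output; the filesystem type depends on how the block device was formatted):

```
$ mount | grep " / "
/dev/vda on / type ext4 (rw,relatime)
```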

## Kubernetes support

[Kubernetes\*](https://github.com/kubernetes/kubernetes/) is a popular open source
container orchestration engine. In Kubernetes, a set of containers sharing resources
such as networking, storage, mount, PID, etc. is called a
[Pod](https://kubernetes.io/docs/user-guide/pods/).
A node can have multiple pods, but at a minimum, a node within a Kubernetes cluster
only needs to run a container runtime and a container agent (called a
[Kubelet](https://kubernetes.io/docs/admin/kubelet/)).

A Kubernetes cluster runs a control plane where a scheduler (typically running on a
dedicated master node) calls into a compute Kubelet. This Kubelet instance is
responsible for managing the lifecycle of pods within the nodes and eventually relies
on a container runtime to handle execution. The Kubelet architecture decouples
lifecycle management from container execution through the dedicated
`gRPC` based [Container Runtime Interface (CRI)](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node/container-runtime-interface-v1.md).

In other words, a Kubelet is a CRI client and expects a CRI implementation to
handle the server side of the interface.
[CRI-O\*](https://github.com/kubernetes-incubator/cri-o) and [Containerd\*](https://github.com/containerd/containerd/) are CRI implementations that rely on [OCI](https://github.com/opencontainers/runtime-spec)
compatible runtimes for managing container instances.

Kata Containers is an officially supported CRI-O and Containerd runtime. Refer to the following guides on how to set up Kata Containers with Kubernetes:

- [How to use Kata Containers and Containerd](../how-to/containerd-kata.md)
- [Run Kata Containers with Kubernetes](../how-to/run-kata-with-k8s.md)

#### OCI annotations

In order for the Kata Containers runtime (or any virtual machine based OCI compatible
runtime) to be able to understand if it needs to create a full virtual machine or if it
has to create a new container inside an existing pod's virtual machine, CRI-O adds
specific annotations to the OCI configuration file (`config.json`) which is passed to
the OCI compatible runtime.

Before calling its runtime, CRI-O will always add an `io.kubernetes.cri-o.ContainerType`
annotation to the `config.json` configuration file it produces from the Kubelet CRI
request. The `io.kubernetes.cri-o.ContainerType` annotation can either be set to `sandbox`
or `container`. Kata Containers will then use this annotation to decide if it needs to
respectively create a virtual machine or a container inside a virtual machine associated
with a Kubernetes pod:

```Go
// Read the CRI container type annotation from the OCI specification.
containerType, err := ociSpec.ContainerType()
if err != nil {
    return err
}

handleFactory(ctx, runtimeConfig)

disableOutput := noNeedForOutput(detach, ociSpec.Process.Terminal)

var process vc.Process
switch containerType {
case vc.PodSandbox:
    // "sandbox": create a new virtual machine to host the pod.
    process, err = createSandbox(ctx, ociSpec, runtimeConfig, containerID, bundlePath, console, disableOutput, systemdCgroup)
    if err != nil {
        return err
    }
case vc.PodContainer:
    // "container": create a container inside the pod's existing virtual machine.
    process, err = createContainer(ctx, ociSpec, containerID, bundlePath, console, disableOutput)
    if err != nil {
        return err
    }
}
```

#### Mixing VM based and namespace based runtimes

> **Note:** Since Kubernetes 1.12, the [`Kubernetes RuntimeClass`](https://kubernetes.io/docs/concepts/containers/runtime-class/)
> has been supported, allowing users to specify a runtime without relying on non-standardized annotations.

With `RuntimeClass`, users can define Kata Containers as a `RuntimeClass` and then explicitly specify that a pod must be created as a Kata Containers pod. For details, please refer to [How to use Kata Containers and Containerd](../../docs/how-to/containerd-kata.md).
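
As a sketch, a `RuntimeClass` can be registered and selected from a pod specification as follows (assuming a cluster whose CRI implementation is configured with a `kata` handler; recent Kubernetes releases serve this API as `node.k8s.io/v1`):

```bash
$ cat <<EOF | kubectl apply -f -
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: kata
handler: kata
---
apiVersion: v1
kind: Pod
metadata:
  name: example-kata-pod
spec:
  runtimeClassName: kata
  containers:
  - name: example
    image: nginx
EOF
```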

# Appendices

## DAX

Kata Containers utilizes the Linux kernel DAX [(Direct Access filesystem)](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/filesystems/dax.rst?h=v5.14)
feature to efficiently map some host-side files into the guest VM space.
In particular, Kata Containers uses the QEMU NVDIMM feature to provide a
memory-mapped virtual device that can be used to DAX map the virtual machine's
root filesystem into the guest memory address space.

Mapping files using DAX provides a number of benefits over more traditional VM
file and device mapping mechanisms:

- Mapping as a direct access device allows the guest to directly access
  the host memory pages (such as via Execute In Place (XIP)), bypassing the guest
  page cache. This provides both time and space optimizations.
- Mapping as a direct access device inside the VM allows pages from the
  host to be demand loaded using page faults, rather than having to make requests
  via a virtualized device (causing expensive VM exits/hypercalls), thus providing
  a speed optimization.
- Utilizing `MAP_SHARED` shared memory on the host allows the host to efficiently
  share pages.

Kata Containers uses the following steps to set up the DAX mappings:
1. QEMU is configured with an NVDIMM memory device, with a memory file
   backend to map in the host-side file into the virtual NVDIMM space.
2. The guest kernel command line mounts this NVDIMM device with the DAX
   feature enabled, allowing direct page mapping and access, thus bypassing the
   guest page cache.
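
The resulting QEMU invocation includes options along the following lines (an illustrative sketch based on QEMU's NVDIMM documentation; the exact device names, paths and sizes are generated by the runtime):

```bash
$ qemu-system-x86_64 \
    -machine pc,nvdimm=on \
    -m 2048M,slots=2,maxmem=129G \
    -object memory-backend-file,id=mem0,share=on,mem-path=/usr/share/kata-containers/kata-containers.img,size=128M \
    -device nvdimm,id=nv0,memdev=mem0 \
    -kernel /usr/share/kata-containers/vmlinuz.container
```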

![DAX](arch-images/DAX.png)

Information on the use of NVDIMM via QEMU is available in the [QEMU source code](http://git.qemu-project.org/?p=qemu.git;a=blob;f=docs/nvdimm.txt;hb=HEAD).
@@ -1,477 +0,0 @@
# Kata Containers Architecture

## Overview

Kata Containers is an open source community working to build a secure
container [runtime](#runtime) with lightweight virtual machines (VMs)
that feel and perform like standard Linux containers, but provide
stronger [workload](#workload) isolation using hardware
[virtualization](#virtualization) technology as a second layer of
defence.

Kata Containers runs on [multiple architectures](../../../src/runtime/README.md#platform-support)
and supports [multiple hypervisors](../../hypervisors.md).

This document is a summary of the Kata Containers architecture.

## Background knowledge

This document assumes the reader understands a number of concepts
related to containers and file systems. The
[background](background.md) document explains these concepts.

## Example command

This document makes use of a particular [example
command](example-command.md) throughout the text to illustrate certain
concepts.

## Virtualization

For details on how Kata Containers maps container concepts to VM
technologies, and how this is realized in the multiple hypervisors and
VMMs that Kata supports, see the
[virtualization documentation](../virtualization.md).

## Compatibility

The [Kata Containers runtime](../../../src/runtime) is compatible with
the [OCI](https://github.com/opencontainers)
[runtime specification](https://github.com/opencontainers/runtime-spec)
and therefore works seamlessly with the
[Kubernetes Container Runtime Interface (CRI)](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-node/container-runtime-interface.md)
through the [CRI-O](https://github.com/kubernetes-incubator/cri-o)
and [containerd](https://github.com/containerd/containerd)
implementations.

Kata Containers provides a ["shimv2"](#shim-v2-architecture) compatible runtime.

## Shim v2 architecture

The Kata Containers runtime is shim v2 ("shimv2") compatible. This
section explains what this means.

> **Note:**
>
> For a comparison with the Kata 1.x architecture, see
> [the architectural history document](history.md).

The
[containerd runtime shimv2 architecture](https://github.com/containerd/containerd/tree/main/runtime/v2)
or _shim API_ architecture resolves the issues with the old
architecture by defining a set of shimv2 APIs that a compatible
runtime implementation must supply. Rather than calling the runtime
binary multiple times for each new container, the shimv2 architecture
runs a single instance of the runtime binary (for any number of
containers). This improves performance and resolves the state handling
issue.

The shimv2 API is similar to the
[OCI runtime](https://github.com/opencontainers/runtime-spec)
API in terms of the way the container lifecycle is split into
different verbs. Rather than calling the runtime multiple times, the
container manager creates a socket and passes it to the shimv2
runtime. The socket is a bi-directional communication channel that
uses a gRPC based protocol to allow the container manager to send API
calls to the runtime, which returns the result to the container
manager using the same channel.

The shimv2 architecture allows running several containers per VM to
support container engines that require multiple containers running
inside a pod.

With the new architecture, [Kubernetes](kubernetes.md) can
launch both Pod and OCI compatible containers with a single
[runtime](#runtime) shim per Pod, rather than `2N+1` shims. No
standalone `kata-proxy` process is required, even if VSOCK is not
available.

## Workload

The workload is the command the user requested to run in the
container and is specified in the [OCI bundle](background.md#oci-bundle)'s
configuration file.

In our [example](example-command.md), the workload is the `sh(1)` command.

### Workload root filesystem

For details of how the [runtime](#runtime) makes the
[container image](background.md#container-image) chosen by the user available to
the workload process, see the
[Container creation](#container-creation) and [storage](#storage) sections.

Note that the workload is isolated from the [guest VM](#environments) environment by its
surrounding [container environment](#environments). The guest VM
environment in which the container runs is also isolated from the _outer_
[host environment](#environments) where the container manager runs.

## System overview

### Environments

The following terminology is used to describe the different
environments (or contexts) various processes run in. It is necessary
to study this table closely to make sense of what follows:

| Type | Name | Virtualized | Containerized | rootfs | Rootfs device type | Mount type | Description |
|-|-|-|-|-|-|-|-|
| Host | Host | no `[1]` | no | Host specific | Host specific | Host specific | The environment provided by a standard, physical non virtualized system. |
| VM root | Guest VM | yes | no | rootfs inside the [guest image](guest-assets.md#guest-image) | Hypervisor specific `[2]` | `ext4` | The first (or top) level VM environment created on a host system. |
| VM container root | Container | yes | yes | rootfs type requested by user ([`ubuntu` in the example](example-command.md)) | `kataShared` | [virtio FS](storage.md#virtio-fs) | The first (or top) level container environment created inside the VM. Based on the [OCI bundle](background.md#oci-bundle). |

**Key:**

- `[1]`: For simplicity, this document assumes the host environment
  runs on physical hardware.

- `[2]`: See the [DAX](#dax) section.

> **Notes:**
>
> - The word "root" is used to mean _top level_ here in a similar
>   manner to the term [rootfs](background.md#root-filesystem).
>
> - The "first level" prefix used above is important since it implies
>   that it is possible to create multi level systems. However, they do
>   not form part of a standard Kata Containers environment so will not
>   be considered in this document.

The reasons for containerizing the [workload](#workload) inside the VM
are:

- Isolates the workload entirely from the VM environment.
- Provides better isolation between containers in a [pod](kubernetes.md).
- Allows the workload to be managed and monitored through its cgroup
  confinement.

### Container creation

The steps below show at a high level how a Kata Containers container is
created using the containerd container manager:

1. The user requests the creation of a container by running a command
   like the [example command](example-command.md).
1. The container manager daemon runs a single instance of the Kata
   [runtime](#runtime).
1. The Kata runtime loads its [configuration file](#configuration).
1. The container manager calls a set of shimv2 API functions on the runtime.
1. The Kata runtime launches the configured [hypervisor](#hypervisor).
1. The hypervisor creates and starts (_boots_) a VM using the
   [guest assets](guest-assets.md#guest-assets):

   - The hypervisor [DAX](#dax) shares the
     [guest image](guest-assets.md#guest-image)
     into the VM to become the VM [rootfs](background.md#root-filesystem) (mounted on a `/dev/pmem*` device),
     which is known as the [VM root environment](#environments).
   - The hypervisor mounts the [OCI bundle](background.md#oci-bundle), using [virtio FS](storage.md#virtio-fs),
     into a container specific directory inside the VM's rootfs.

     This container specific directory will become the
     [container rootfs](#environments), known as the
     [container environment](#environments).

1. The [agent](#agent) is started as part of the VM boot.

1. The runtime calls the agent's `CreateSandbox` API to request the
   agent create a container:

   1. The agent creates a [container environment](#environments)
      in the container specific directory that contains the [container rootfs](#environments).

      The container environment hosts the [workload](#workload) in the
      [container rootfs](#environments) directory.

   1. The agent spawns the workload inside the container environment.

   > **Notes:**
   >
   > - The container environment created by the agent is equivalent to
   >   a container environment created by the
   >   [`runc`](https://github.com/opencontainers/runc) OCI runtime;
   >   Linux cgroups and namespaces are created inside the VM by the
   >   [guest kernel](guest-assets.md#guest-kernel) to isolate the
   >   workload from the VM environment the container is created in.
   >   See the [Environments](#environments) section for an
   >   explanation of why this is done.
   >
   > - See the [guest image](guest-assets.md#guest-image) section for
   >   details of exactly how the agent is started.

1. The container manager returns control of the container to the
   user running the `ctr` command.

   > **Note:**
   >
   > At this point, the container is running and:
   >
   > - The [workload](#workload) process ([`sh(1)` in the example](example-command.md))
   >   is running in the [container environment](#environments).
   > - The user is now able to interact with the workload
   >   (using the [`ctr` command in the example](example-command.md)).
   > - The [agent](#agent), running inside the VM, is monitoring the
   >   [workload](#workload) process.
   > - The [runtime](#runtime) is waiting for the agent's `WaitProcess` API
   >   call to complete.

Further details of these steps are provided in the sections below.

### Container shutdown

There are two possible ways for the container environment to be
terminated:

- When the [workload](#workload) exits.

  This is the standard, or _graceful_, shutdown method.

- When the container manager forces the container to be deleted.

#### Workload exit

The [agent](#agent) will detect when the [workload](#workload) process
exits, capture its exit status (see `wait(2)`) and return that value
to the [runtime](#runtime) by specifying it as the response to the
`WaitProcess` agent API call made by the [runtime](#runtime).

The runtime then passes the value back to the container manager via the
`Wait` [shimv2 API](#shim-v2-architecture) call.

Once the workload has fully exited, the VM is no longer needed and the
runtime cleans up the environment (which includes terminating the
[hypervisor](#hypervisor) process).

> **Note:**
>
> When [agent tracing is enabled](../../tracing.md#agent-shutdown-behaviour),
> the shutdown behaviour is different.

#### Container manager requested shutdown

If the container manager requests the container be deleted, the
[runtime](#runtime) will signal the agent by sending it a
`DestroySandbox` [ttRPC API](../../../src/libs/protocols/protos/agent.proto) request.

## Guest assets

The guest assets comprise a guest image and a guest kernel that are
used by the [hypervisor](#hypervisor).

See the [guest assets](guest-assets.md) document for further
information.

## Hypervisor

The [hypervisor](../../hypervisors.md) specified in the
[configuration file](#configuration) creates a VM to host the
[agent](#agent) and the [workload](#workload) inside the
[container environment](#environments).

> **Note:**
>
> The hypervisor process runs inside an environment slightly different
> to the host environment:
>
> - It is run in a different cgroup environment to the host.
> - It is given a separate network namespace from the host.
> - If the [OCI configuration specifies a SELinux label](https://github.com/opencontainers/runtime-spec/blob/main/config.md#linux-process),
>   the hypervisor process will run with that label (*not* the workload running inside the hypervisor's VM).

## Agent

The Kata Containers agent ([`kata-agent`](../../../src/agent)), written
in the [Rust programming language](https://www.rust-lang.org), is a
long running process that runs inside the VM. It acts as the
supervisor for managing the containers and the [workload](#workload)
running within those containers. Only a single agent process is run
for each VM created.

### Agent communications protocol

The agent communicates with the other Kata components (primarily the
[runtime](#runtime)) using a
[`ttRPC`](https://github.com/containerd/ttrpc-rust) based
[protocol](../../../src/libs/protocols/protos).

> **Note:**
>
> If you wish to learn more about this protocol, a practical way to do
> so is to experiment with the
> [agent control tool](#agent-control-tool) on a test system.
> This tool is for test and development purposes only and can send
> arbitrary ttRPC agent API commands to the [agent](#agent).

## Runtime

The Kata Containers runtime (the
[`containerd-shim-kata-v2`](../../../src/runtime/cmd/containerd-shim-kata-v2)
binary) is a [shimv2](#shim-v2-architecture) compatible runtime.

> **Note:**
>
> The Kata Containers runtime is sometimes referred to as the Kata
> _shim_. Both terms are correct since the `containerd-shim-kata-v2`
> is a container runtime, and that runtime implements the containerd
> shim v2 API.

The runtime makes heavy use of the [`virtcontainers`
package](../../../src/runtime/virtcontainers), which provides a generic,
runtime-specification agnostic, hardware-virtualized containers
library.

The runtime is responsible for starting the [hypervisor](#hypervisor)
and its VM, and communicating with the [agent](#agent) using a
[ttRPC based protocol](#agent-communications-protocol) over a VSOCK
socket that provides a communications link between the VM and the
host.

This protocol allows the runtime to send container management commands
to the agent. The protocol is also used to carry the standard I/O
streams (`stdout`, `stderr`, `stdin`) between the containers and
container managers (such as CRI-O or containerd).

## Utility program

The `kata-runtime` binary is a utility program that provides
administrative commands to manipulate and query a Kata Containers
installation.
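
For instance (a couple of illustrative invocations; run `kata-runtime --help` for the authoritative list of sub-commands):

```bash
$ kata-runtime check    # check if the host is capable of running Kata Containers
$ kata-runtime env      # display the effective runtime configuration
```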

> **Note:**
>
> In Kata 1.x, this program also acted as the main
> [runtime](#runtime), but this is no longer required due to the
> improved shimv2 architecture.

### exec command

The `exec` command allows an administrator or developer to enter the
[VM root environment](#environments) which is not accessible by the container
[workload](#workload).

See [the developer guide](../../Developer-Guide.md#connect-to-debug-console) for further details.
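
For example, given the sandbox ID of a running pod, something like the following opens a shell in the VM root environment (assuming the debug console has been enabled in the [configuration](#configuration)):

```bash
$ kata-runtime exec <sandbox-id>
```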

### Configuration

See the [configuration file details](../../../src/runtime/README.md#configuration).

The configuration file is also used to enable runtime [debug output](../../Developer-Guide.md#enable-full-debug).

## Process overview

The table below shows an example of the main processes running in the
different [environments](#environments) when a Kata Container is
created with containerd using our [example command](example-command.md):

| Description | Host | VM root environment | VM container environment |
|-|-|-|-|
| Container manager | `containerd` | | |
| Kata Containers | [runtime](#runtime), [`virtiofsd`](storage.md#virtio-fs), [hypervisor](#hypervisor) | [agent](#agent) | |
| User [workload](#workload) | | | [`ubuntu sh`](example-command.md) |

## Networking

See the [networking document](networking.md).

## Storage

See the [storage document](storage.md).

## Kubernetes support

See the [Kubernetes document](kubernetes.md).

#### OCI annotations

In order for the Kata Containers [runtime](#runtime) (or any VM based OCI compatible
runtime) to be able to understand if it needs to create a full VM or if it
has to create a new container inside an existing pod's VM, CRI-O adds
specific annotations to the OCI configuration file (`config.json`) which is passed to
the OCI compatible runtime.

Before calling its runtime, CRI-O will always add an `io.kubernetes.cri-o.ContainerType`
annotation to the `config.json` configuration file it produces from the Kubelet CRI
request. The `io.kubernetes.cri-o.ContainerType` annotation can either be set to `sandbox`
or `container`. Kata Containers will then use this annotation to decide if it needs to
respectively create a virtual machine or a container inside a virtual machine associated
with a Kubernetes pod:

| Annotation value | Kata VM created? | Kata container created? |
|-|-|-|
| `sandbox` | yes | yes (inside new VM) |
| `container` | no | yes (in existing VM) |

#### Mixing VM based and namespace based runtimes

> **Note:** Since Kubernetes 1.12, the [`Kubernetes RuntimeClass`](https://kubernetes.io/docs/concepts/containers/runtime-class/)
> has been supported, allowing users to specify a runtime without relying on non-standardized annotations.

With `RuntimeClass`, users can define Kata Containers as a
`RuntimeClass` and then explicitly specify that a pod must be created
as a Kata Containers pod. For details, please refer to [How to use
Kata Containers and containerd](../../../docs/how-to/containerd-kata.md).

## Tracing

The [tracing document](../../tracing.md) provides details on the tracing
architecture.

# Appendices

## DAX

Kata Containers utilizes the Linux kernel DAX
[(Direct Access filesystem)](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/filesystems/dax.rst?h=v5.14)
feature to efficiently map the [guest image](guest-assets.md#guest-image) in the
[host environment](#environments) into the
[guest VM environment](#environments) to become the VM's
[rootfs](background.md#root-filesystem).

If the [configured](#configuration) [hypervisor](#hypervisor) is set
to either QEMU or Cloud Hypervisor, DAX is used with the feature shown
in the table below:

| Hypervisor | Feature used | rootfs device type |
|-|-|-|
| Cloud Hypervisor (CH) | `dax` `FsConfig` configuration option | PMEM (emulated Persistent Memory device) |
| QEMU | NVDIMM memory device with a memory file backend | NVDIMM (emulated Non-Volatile Dual In-line Memory Module device) |

The features in the table above are equivalent in that they provide a memory-mapped
virtual device which is used to DAX map the VM's
[rootfs](background.md#root-filesystem) into the [VM guest](#environments) memory
address space.

The VM is then booted, specifying the `root=` kernel parameter to make
the [guest kernel](guest-assets.md#guest-kernel) use the appropriate emulated device
as its rootfs.
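
For instance, with QEMU's NVDIMM device the guest kernel command line contains parameters along these lines (an illustrative sketch; the runtime generates the exact values):

```
root=/dev/pmem0p1 rootflags=dax,data=ordered,errors=remount-ro ro rootfstype=ext4
```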

### DAX advantages

Mapping files using [DAX](#dax) provides a number of benefits over
more traditional VM file and device mapping mechanisms:

- Mapping as a direct access device allows the guest to directly
  access the host memory pages (such as via Execute In Place (XIP)),
  bypassing the [guest kernel](guest-assets.md#guest-kernel)'s page cache. This
  zero copy provides both time and space optimizations.

- Mapping as a direct access device inside the VM allows pages from the
  host to be demand loaded using page faults, rather than having to make requests
  via a virtualized device (causing expensive VM exits/hypercalls), thus providing
  a speed optimization.

- Utilizing `mmap(2)`'s `MAP_SHARED` shared memory option on the host
  allows the host to efficiently share pages.

![DAX](../arch-images/DAX.png)

For further details of the use of NVDIMM with QEMU, see the [QEMU
project documentation](https://www.qemu.org).

## Agent control tool

The [agent control tool](../../../src/tools/agent-ctl) is a test and
development tool that can be used to learn more about a Kata
Containers system.

## Terminology

See the [project glossary](../../../Glossary.md).
@@ -1,81 +0,0 @@
# Kata Containers architecture background knowledge

The following sections explain some of the background concepts
required to understand the [architecture document](README.md).

## Root filesystem

This document uses the term _rootfs_ to refer to a root filesystem
which is mounted as the top-level directory ("`/`") and often referred
to as _slash_.

It is important to understand this term since the overall system uses
multiple different rootfs's (as explained in the
[Environments](README.md#environments) section).

## Container image

In the [example command](example-command.md) the user has specified the
type of container they wish to run via the container image name:
`ubuntu`. This image name corresponds to a _container image_ that can
be used to create a container with an Ubuntu Linux environment. Hence,
in our [example](example-command.md), the `sh(1)` command will be run
inside a container which has an Ubuntu rootfs.

> **Note:**
>
> The term _container image_ is confusing since the image in question
> is **not** a container: it is simply a set of files (_an image_)
> that can be used to _create_ a container. The term _container
> template_ would be more accurate but the term _container image_ is
> commonly used so this document uses the standard term.

For the purposes of this document, the most important part of the
[example command line](example-command.md) is the container image the
user has requested. Normally, the container manager will _pull_
(download) a container image from a remote site and store a copy
locally. This local container image is used by the container manager
to create an [OCI bundle](#oci-bundle) which will form the environment
the container will run in. After creating the OCI bundle, the
container manager launches a [runtime](README.md#runtime) which will create the
container using the provided OCI bundle.

## OCI bundle

To understand what follows, it is important to know at a high level
how an OCI ([Open Containers Initiative](https://opencontainers.org)) compatible container is created.

An OCI compatible container is created by taking a
[container image](#container-image) and converting the embedded rootfs
into an
[OCI rootfs bundle](https://github.com/opencontainers/runtime-spec/blob/main/bundle.md),
or more simply, an _OCI bundle_.

An OCI bundle is a `tar(1)` archive normally created by a container
manager which is passed to an OCI [runtime](README.md#runtime) which converts
it into a full container rootfs. The bundle contains two assets:

- A container image [rootfs](#root-filesystem)

  This is simply a directory of files that will be used to represent
  the rootfs for the container.

  For the [example command](example-command.md), the directory will
  contain the files necessary to create a minimal Ubuntu root
  filesystem.

- An [OCI configuration file](https://github.com/opencontainers/runtime-spec/blob/main/config.md)

  This is a JSON file called `config.json`.

  The container manager will create this file so that:

  - The `root.path` value is set to the full path of the specified
    container rootfs.

    In [the example](example-command.md) this value will be `ubuntu`.

  - The `process.args` array specifies the list of commands the user
    wishes to run. This is known as the [workload](README.md#workload).

    In [the example](example-command.md) the workload is `sh(1)`.
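
For instance, the two values could be inspected in a bundle's `config.json` like this (a sketch assuming the bundle layout above and the `jq(1)` tool):

```bash
$ jq '.root.path, .process.args' config.json
"ubuntu"
[
  "sh"
]
```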
@@ -1,30 +0,0 @@
# Example command

The following containerd command creates a container. It is referred
to throughout the architecture document to help explain various points:

```bash
$ sudo ctr run --runtime "io.containerd.kata.v2" --rm -t "quay.io/libpod/ubuntu:latest" foo sh
```

This command requests that containerd:

- Create a container (`ctr run`).
- Use the Kata [shimv2](README.md#shim-v2-architecture) runtime (`--runtime "io.containerd.kata.v2"`).
- Delete the container when it [exits](README.md#workload-exit) (`--rm`).
- Attach the container to the user's terminal (`-t`).
- Use the Ubuntu Linux [container image](background.md#container-image)
  to create the container [rootfs](background.md#root-filesystem) that will become
  the [container environment](README.md#environments)
  (`quay.io/libpod/ubuntu:latest`).
- Create the container with the name "`foo`".
- Run the `sh(1)` command in the Ubuntu rootfs based container
  environment.

The command specified here is referred to as the [workload](README.md#workload).

> **Note:**
>
> For the purposes of this document and to keep explanations
> simpler, we assume the user is running this command in the
> [host environment](README.md#environments).
@@ -1,152 +0,0 @@
# Guest assets

Kata Containers creates a VM in which to run one or more containers.
It does this by launching a [hypervisor](README.md#hypervisor) to
create the VM. The hypervisor needs two assets for this task: a Linux
kernel and a small root filesystem image to boot the VM.

## Guest kernel

The [guest kernel](../../../tools/packaging/kernel)
is passed to the hypervisor and used to boot the VM.
The default kernel provided in Kata Containers is highly optimized for
kernel boot time and minimal memory footprint, providing only those
services required by a container workload. It is based on the latest
Linux LTS (Long Term Support) [kernel](https://www.kernel.org).

## Guest image

The hypervisor uses an image file which provides a minimal root
filesystem used by the guest kernel to boot the VM and host the Kata
Container. Kata Containers supports both initrd and rootfs based
minimal guest images. The [default packages](../../install/) provide both
an image and an initrd, both of which are created using the
[`osbuilder`](../../../tools/osbuilder) tool.
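
As a sketch, building a guest image with `osbuilder` looks something like the following (run from the `tools/osbuilder` directory; the output directory is illustrative and the tool's README documents the supported options and distributions):

```bash
$ export ROOTFS_DIR="${PWD}/rootfs"
$ sudo -E ./rootfs-builder/rootfs.sh ubuntu                   # create a guest rootfs directory
$ sudo -E ./image-builder/image_builder.sh "${ROOTFS_DIR}"    # build a rootfs image from it
$ sudo -E ./initrd-builder/initrd_builder.sh "${ROOTFS_DIR}"  # or build an initrd instead
```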

> **Notes:**
>
> - Although initrd and rootfs based images are supported, not all
>   [hypervisors](README.md#hypervisor) support both types of image.
>
> - The guest image is *unrelated* to the image used in a container
>   workload.
>
>   For example, if a user creates a container that runs a shell in a
>   BusyBox image, they will run that shell in a BusyBox environment.
>   However, the guest image running inside the VM that is used to
>   *host* that BusyBox image could potentially be running Clear Linux,
>   Ubuntu, Fedora or any other distribution.
>
>   The `osbuilder` tool provides
>   [configurations for various common Linux distributions](../../../tools/osbuilder/rootfs-builder)
>   which can be built into either initrd or rootfs guest images.
>
> - If you are using a [packaged version of Kata
>   Containers](../../install), you can see image details by running the
>   [`kata-collect-data.sh`](../../../src/runtime/data/kata-collect-data.sh.in)
>   script as `root` and looking at the "Image details" section of the
>   output.

#### Root filesystem image

The default packaged rootfs image, sometimes referred to as the _mini
O/S_, is a highly optimized container bootstrap system.

If this image type is [configured](README.md#configuration), when the
user runs the [example command](example-command.md):

- The [runtime](README.md#runtime) will launch the configured [hypervisor](README.md#hypervisor).
- The hypervisor will boot the mini-OS image using the [guest kernel](#guest-kernel).
- The kernel will start the init daemon as PID 1 (`systemd`) inside the VM root environment.
- `systemd`, running inside the mini-OS context, will launch the [agent](README.md#agent)
  in the root context of the VM.
- The agent will create a new container environment, setting its root
  filesystem to that requested by the user (Ubuntu in [the example](example-command.md)).
- The agent will then execute the command (`sh(1)` in [the example](example-command.md))
  inside the new container.

The table below summarises the default mini O/S showing the
environments that are created, the services running in those
environments (for all platforms) and the root filesystem used by
each service:

| Process | Environment | systemd service? | rootfs | User accessible | Notes |
|-|-|-|-|-|-|
| systemd | VM root | n/a | [VM guest image](#guest-image) | [debug console][debug-console] | The init daemon, running as PID 1 |
| [Agent](README.md#agent) | VM root | yes | [VM guest image](#guest-image) | [debug console][debug-console] | Runs as a systemd service |
| `chronyd` | VM root | yes | [VM guest image](#guest-image) | [debug console][debug-console] | Used to synchronise the time with the host |
| container workload (`sh(1)` in [the example](example-command.md)) | VM container | no | User specified (Ubuntu in [the example](example-command.md)) | [exec command](README.md#exec-command) | Managed by the agent |

See also the [process overview](README.md#process-overview).

> **Notes:**
>
> - The "User accessible" column shows how an administrator can access
>   the environment.
>
> - The container workload is running inside a full container
>   environment which itself is running within a VM environment.
>
> - See the [configuration files for the `osbuilder` tool](../../../tools/osbuilder/rootfs-builder)
>   for details of the default distribution for platforms other than
>   Intel x86_64.

#### Initrd image

The initrd image is a compressed `cpio(1)` archive, created from a
rootfs which is loaded into memory and used as part of the Linux
startup process. During startup, the kernel unpacks it into a special
instance of a `tmpfs` mount that becomes the initial root filesystem.

If this image type is [configured](README.md#configuration), when the user runs
the [example command](example-command.md):

- The [runtime](README.md#runtime) will launch the configured [hypervisor](README.md#hypervisor).
- The hypervisor will boot the mini-OS image using the [guest kernel](#guest-kernel).
- The kernel will start the init daemon as PID 1 (the
  [agent](README.md#agent))
  inside the VM root environment.
- The [agent](README.md#agent) will create a new container environment, setting its root
  filesystem to that requested by the user (`ubuntu` in
  [the example](example-command.md)).
- The agent will then execute the command (`sh(1)` in [the example](example-command.md))
  inside the new container.

The table below summarises the default mini O/S showing the environments that are created,
the processes running in those environments (for all platforms) and
the root filesystem used by each service:

| Process | Environment | rootfs | User accessible | Notes |
|-|-|-|-|-|
| [Agent](README.md#agent) | VM root | [VM guest image](#guest-image) | [debug console][debug-console] | Runs as the init daemon (PID 1) |
| container workload | VM container | User specified (Ubuntu in this example) | [exec command](README.md#exec-command) | Managed by the agent |

> **Notes:**
>
> - The "User accessible" column shows how an administrator can access
>   the environment.
>
> - It is possible to use a standard init daemon such as systemd with
>   an initrd image if this is desirable.

See also the [process overview](README.md#process-overview).

#### Image summary

| Image type | Default distro | Init daemon | Reason | Notes |
|-|-|-|-|-|
| [image](#root-filesystem-image) | [Clear Linux](https://clearlinux.org) (for x86_64 systems) | systemd | Minimal and highly optimized | systemd offers flexibility |
| [initrd](#initrd-image) | [Alpine Linux](https://alpinelinux.org) | Kata [agent](README.md#agent) (as no systemd support) | Security hardened and tiny C library | |

See also:

- The [osbuilder](../../../tools/osbuilder) tool

  This is used to build all default image types.

- The [versions database](../../../versions.yaml)

  The `default-image-name` and `default-initrd-name` options specify
  the default distributions for each image type.

[debug-console]: ../../Developer-Guide.md#connect-to-debug-console
@@ -1,41 +0,0 @@
# History

## Kata 1.x architecture

In the old [Kata 1.x architecture](https://github.com/kata-containers/documentation/blob/master/design/architecture.md),
the Kata [runtime](README.md#runtime) was an executable called `kata-runtime`.
The container manager called this executable multiple times when
creating each container. Each time the runtime was called, a different
OCI command-line verb was provided. This architecture was simple, but
not well suited to creating VM based containers due to the issue of
handling state between calls. Additionally, the architecture suffered
from performance issues related to continually having to spawn new
instances of the runtime binary, and
[Kata shim](https://github.com/kata-containers/shim) and
[Kata proxy](https://github.com/kata-containers/proxy) processes for systems
that did not provide VSOCK.

## Kata 2.x architecture

See the ["shimv2"](README.md#shim-v2-architecture) section of the
architecture document.

## Architectural comparison

| Kata version | Kata Runtime process calls | Kata shim processes | Kata proxy processes (if no VSOCK) |
|-|-|-|-|
| 1.x | multiple per container | 1 per container connection | 1 |
| 2.x | 1 per VM (hosting any number of containers) | 0 | 0 |

> **Notes:**
>
> - A single VM can host one or more containers.
>
> - The "Kata shim processes" column refers to the old
>   [Kata shim](https://github.com/kata-containers/shim) (`kata-shim` binary),
>   *not* the new shimv2 runtime instance (`containerd-shim-kata-v2` binary).

The diagram below shows how the original architecture was simplified
with the advent of shimv2.

![Kata 1.x and 2.x architectures](arch-images/shimv2.svg)
@@ -1,35 +0,0 @@
# Kubernetes support

[Kubernetes](https://github.com/kubernetes/kubernetes/), or K8s, is a popular open source
container orchestration engine. In Kubernetes, a set of containers sharing resources
such as networking, storage, mount, PID, etc. is called a
[pod](https://kubernetes.io/docs/user-guide/pods/).

A node can have multiple pods, but at a minimum, a node within a Kubernetes cluster
only needs to run a container runtime and a container agent (called a
[Kubelet](https://kubernetes.io/docs/admin/kubelet/)).

Kata Containers represents a Kubelet pod as a VM.

A Kubernetes cluster runs a control plane where a scheduler (typically
running on a dedicated master node) calls into a compute Kubelet. This
Kubelet instance is responsible for managing the lifecycle of pods
within the nodes and eventually relies on a container runtime to
handle execution. The Kubelet architecture decouples lifecycle
management from container execution through a dedicated gRPC based
[Container Runtime Interface (CRI)](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node/container-runtime-interface-v1.md).

In other words, a Kubelet is a CRI client and expects a CRI
implementation to handle the server side of the interface.
[CRI-O](https://github.com/kubernetes-incubator/cri-o) and
[containerd](https://github.com/containerd/containerd/) are CRI
implementations that rely on
[OCI](https://github.com/opencontainers/runtime-spec) compatible
runtimes for managing container instances.

Kata Containers is an officially supported CRI-O and containerd
runtime. Refer to the following guides on how to set up Kata
Containers with Kubernetes:

- [How to use Kata Containers and containerd](../../how-to/containerd-kata.md)
- [Run Kata Containers with Kubernetes](../../how-to/run-kata-with-k8s.md)
@@ -1,49 +0,0 @@
# Networking

Containers typically live in their own, possibly shared, networking namespace.
At some point in a container lifecycle, container engines will set up that namespace
to add the container to a network which is isolated from the host network.

In order to set up the network for a container, container engines call into a
networking plugin. The network plugin will usually create a virtual
ethernet (`veth`) pair, adding one end of the `veth` pair into the container
networking namespace, while the other end of the `veth` pair is added to the
host networking namespace.

This is a very namespace-centric approach as many hypervisors or VM
Managers (VMMs) such as `virt-manager` cannot handle `veth`
interfaces. Typically, [`TAP`](https://www.kernel.org/doc/Documentation/networking/tuntap.txt)
interfaces are created for VM connectivity.

To overcome incompatibility between typical container engines' expectations
and virtual machines, Kata Containers networking transparently connects `veth`
interfaces with `TAP` ones using [Traffic Control](https://man7.org/linux/man-pages/man8/tc.8.html):

![Kata Containers networking](../arch-images/network.png)

With TC filter rules in place, a redirection is created between the container network
and the virtual machine. As an example, the network plugin may place a device,
`eth0`, in the container's network namespace, which is one end of a VETH device.
Kata Containers will create a tap device for the VM, `tap0_kata`,
and set up a TC redirection filter to redirect traffic from `eth0`'s ingress to `tap0_kata`'s egress,
and a second TC filter to redirect traffic from `tap0_kata`'s ingress to `eth0`'s egress.

Kata Containers maintains support for MACVTAP, which was an earlier implementation used in Kata.
With this method, Kata created a MACVTAP device to connect directly to the `eth0` device.
TC-filter is the default because it allows for simpler configuration, better CNI plugin
compatibility, and performance on par with MACVTAP.

Kata Containers has deprecated support for the bridge model due to its poor performance relative to TC-filter and MACVTAP.

Kata Containers supports both
[CNM](https://github.com/docker/libnetwork/blob/master/docs/design.md#the-container-network-model)
and [CNI](https://github.com/containernetworking/cni) for networking management.

## Network Hotplug

Kata Containers has developed a set of network sub-commands and APIs to add, list and
remove a guest network endpoint and to manipulate the guest route table.

The following diagram illustrates the Kata Containers network hotplug workflow.

![Network Hotplug](../arch-images/kata-containers-network-hotplug.png)
@@ -1,44 +0,0 @@
|
||||
# Storage

## virtio SCSI

If a block-based graph driver is [configured](README.md#configuration),
`virtio-scsi` is used to _share_ the workload image (such as
`busybox:latest`) into the container's environment inside the VM.

## virtio FS

If a block-based graph driver is _not_ [configured](README.md#configuration), a
[`virtio-fs`](https://virtio-fs.gitlab.io) (`VIRTIO`) overlay
filesystem mount point is used to _share_ the workload image instead. The
[agent](README.md#agent) uses this mount point as the root filesystem for the
container processes.

For virtio-fs, the [runtime](README.md#runtime) starts one `virtiofsd` daemon
(that runs in the host context) for each VM created.
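
A quick way to confirm this on a host with running Kata pods is to list the daemons; a sketch (the process name `virtiofsd` is an assumption and may vary with the hypervisor configuration):

```bash
# List host-side virtiofsd daemons; expect one per running Kata VM.
ps -C virtiofsd -o pid,ppid,args
```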

## Devicemapper

The
[devicemapper `snapshotter`](https://github.com/containerd/containerd/tree/master/snapshots/devmapper)
is a special case. The `snapshotter` uses dedicated block devices
rather than formatted filesystems, and operates at the block level
rather than the file level. This knowledge is exploited to use the
underlying block device directly, instead of the overlay file system, for the
container root file system. The block device maps to the top
read-write layer for the overlay. This approach gives much better I/O
performance compared to using `virtio-fs` to share the container file
system.

### Hot plug and unplug

Kata Containers has the ability to hot plug add and hot plug remove
block devices. This makes it possible to use block devices for
containers started after the VM has been launched.

Users can check whether the container uses the `devicemapper` block
device as its rootfs by calling `mount(8)` within the container. If
the `devicemapper` block device is used, the root filesystem (`/`)
will be mounted from `/dev/vda`. Users can disable direct mounting of
the underlying block device through the runtime
[configuration](README.md#configuration).
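
For example, run inside the container (output is illustrative):

```bash
# The source of the root mount reveals the backing store.
mount | grep ' on / '
# When the devicemapper block device is used (illustrative):
#   /dev/vda on / type ext4 (rw,relatime)
```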

@@ -1825,8 +1825,12 @@ components:
  desc: ""
- value: grpc.StartContainerRequest
  desc: ""
- value: grpc.StartTracingRequest
  desc: ""
- value: grpc.StatsContainerRequest
  desc: ""
- value: grpc.StopTracingRequest
  desc: ""
- value: grpc.TtyWinResizeRequest
  desc: ""
- value: grpc.UpdateContainerRequest

@@ -19,7 +19,7 @@ Cgroups are hierarchical, and this can be seen with the following pod example:
- Container 2: `cgroupsPath=/kubepods/pod1/container2`

- Pod 2: `cgroupsPath=/kubepods/pod2`
- Container 1: `cgroupsPath=/kubepods/pod2/container1`
- Container 1: `cgroupsPath=/kubepods/pod2/container2`
- Container 2: `cgroupsPath=/kubepods/pod2/container2`

Depending on the upper-level orchestration layers, the cgroup under which the pod is placed is

@@ -1,21 +1,21 @@

# Kata 2.0 Metrics Design

Kata implements CRI's API and supports [`ContainerStats`](https://github.com/kubernetes/kubernetes/blob/release-1.18/staging/src/k8s.io/cri-api/pkg/apis/runtime/v1alpha2/api.proto#L101) and [`ListContainerStats`](https://github.com/kubernetes/kubernetes/blob/release-1.18/staging/src/k8s.io/cri-api/pkg/apis/runtime/v1alpha2/api.proto#L103) interfaces to expose containers metrics. User can use these interfaces to get basic metrics about containers.
Kata implement CRI's API and support [`ContainerStats`](https://github.com/kubernetes/kubernetes/blob/release-1.18/staging/src/k8s.io/cri-api/pkg/apis/runtime/v1alpha2/api.proto#L101) and [`ListContainerStats`](https://github.com/kubernetes/kubernetes/blob/release-1.18/staging/src/k8s.io/cri-api/pkg/apis/runtime/v1alpha2/api.proto#L103) interfaces to expose containers metrics. User can use these interface to get basic metrics about container.

Unlike `runc`, Kata is a VM-based runtime and has a different architecture.
But unlike `runc`, Kata is a VM-based runtime and has a different architecture.

## Limitations of Kata 1.x and target of Kata 2.0
## Limitations of Kata 1.x and the target of Kata 2.0

Kata 1.x has a number of limitations related to observability that may be obstacles to running Kata Containers at scale.

In Kata 2.0, the following components will be able to provide more details about the system:
In Kata 2.0, the following components will be able to provide more details about the system.

- containerd shim v2 (effectively `kata-runtime`)
- Hypervisor statistics
- Agent process
- Guest OS statistics

> **Note**: In Kata 1.x, the main user-facing component was the runtime (`kata-runtime`). From 1.5, Kata introduced the Kata containerd shim v2 (`containerd-shim-kata-v2`) which is essentially a modified runtime that is loaded by containerd to simplify and improve the way VM-based containers are created and managed.
> **Note**: In Kata 1.x, the main user-facing component was the runtime (`kata-runtime`). From 1.5, Kata then introduced the Kata containerd shim v2 (`containerd-shim-kata-v2`) which is essentially a modified runtime that is loaded by containerd to simplify and improve the way VM-based containers are created and managed.
>
> For Kata 2.0, the main component is the Kata containerd shim v2, although the deprecated `kata-runtime` binary will be maintained for a period of time.
>
@@ -25,15 +25,14 @@ In Kata 2.0, the following components will be able to provide more details about

Kata 2.0 metrics strongly depend on [Prometheus](https://prometheus.io/), a graduated project from CNCF.

Kata Containers 2.0 introduces a new Kata component called `kata-monitor` which is used to monitor the Kata components on the host. It's shipped with the Kata runtime to provide an interface to:
Kata Containers 2.0 introduces a new Kata component called `kata-monitor` which is used to monitor the other Kata components on the host. It's the monitor interface with Kata runtime, and we can do something like these:

- Get metrics
- Get events

At present, `kata-monitor` supports retrieval of metrics only: this is what will be covered in this document.
In this document we will cover metrics only. And until now it only supports metrics function.

This is the architecture overview of metrics in Kata Containers 2.0:
This is the architecture overview metrics in Kata Containers 2.0.



@@ -46,38 +45,38 @@ For a quick evaluation, you can check out [this how to](../how-to/how-to-set-pro

### Kata monitor

The `kata-monitor` management agent should be started on each node where the Kata containers runtime is installed. `kata-monitor` will:
`kata-monitor` is a management agent on one node, where many Kata containers are running. `kata-monitor`'s work include:

> **Note**: a *node* running Kata containers will be either a single host system or a worker node belonging to a K8s cluster capable of running Kata pods.
> **Note**: node is a single host system or a node in K8s clusters.

- Aggregate sandbox metrics running on the node, adding the `sandbox_id` label to them.
- Expose a new Prometheus target, allowing all node metrics coming from the Kata shim to be collected by Prometheus indirectly. This simplifies the targets count in Prometheus and avoids exposing shim's metrics by `ip:port`.
- Aggregate sandbox metrics running on this node, and add `sandbox_id` label
- As a Prometheus target, all metrics from Kata shim on this node will be collected by Prometheus indirectly. This can easy the targets count in Prometheus, and also need not to expose shim's metrics by `ip:port`

Only one `kata-monitor` process runs in each node.
Only one `kata-monitor` process are running on one node.

`kata-monitor` uses a different communication channel than the one used by the container engine (`containerd`/`CRI-O`) to communicate with the Kata shim. The Kata shim exposes a dedicated socket address reserved to `kata-monitor`.
`kata-monitor` is using a different communication channel other than that `conatinerd` communicating with Kata shim, and Kata shim listen on a new socket address for communicating with `kata-monitor`.

The shim's metrics socket file is created under the virtcontainers sandboxes directory, i.e. `vc/sbs/${PODID}/shim-monitor.sock`.
The way `kata-monitor` get shim's metrics socket file(`monitor_address`) like that `containerd` get shim address. The socket is an abstract socket and saved as file `abstract` with the same directory of `address` for `containerd`.

> **Note**: If there is no Prometheus server configured, i.e., there are no scrape operations, `kata-monitor` will not collect any metrics.
> **Note**: If there is no Prometheus server is configured, i.e., there is no scrape operations, `kata-monitor` will do nothing initiative.
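
As a quick manual check, the `kata-monitor` endpoint can be scraped by hand; a sketch assuming the default listen address `127.0.0.1:8090` (adjust if `kata-monitor` was started with a different listen address):

```bash
# Fetch the aggregated node metrics exactly as Prometheus would.
curl -s http://127.0.0.1:8090/metrics | head -n 20
```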

### Kata runtime

Kata runtime is responsible for:
Runtime is responsible for:

- Gather metrics about shim process
- Gather metrics about hypervisor process
- Gather metrics about running sandbox
- Get metrics from Kata agent (through `ttrpc`)
- Get metrics from Kata agent(through `ttrpc`)

### Kata agent

Kata agent is responsible for:
Agent is responsible for:

- Gather agent process metrics
- Gather guest OS metrics

In Kata 2.0, the agent adds a new interface:
And in Kata 2.0, agent will add a new interface:

```protobuf
rpc GetMetrics(GetMetricsRequest) returns (Metrics);
```

@@ -94,49 +93,33 @@ The `metrics` field is Prometheus encoded content. This can avoid defining a fix

### Performance and overhead

Metrics should not become a bottleneck for the system or downgrade the performance: they should run with minimal overhead.
Metrics should not become the bottleneck of system, downgrade the performance, and run with minimal overhead.

Requirements:

* Metrics **MUST** be quick to collect
* Metrics **MUST** be small
* Metrics **MUST** be small.
* Metrics **MUST** be generated only if there are subscribers to the Kata metrics service
* Metrics **MUST** be stateless

In Kata 2.0, metrics are collected only when needed (pull mode), mainly from the `/proc` filesystem, and consumed by Prometheus. This means that if the Prometheus collector is not running (so no one cares about the metrics) the overhead will be zero.
In Kata 2.0, metrics are collected mainly from `/proc` filesystem, and consumed by Prometheus, based on a pull mode, that is mean if there is no Prometheus collector is running, so there will be zero overhead if nobody cares the metrics.

The metrics service also doesn't hold any metrics in memory.

#### Metrics size ####
Metrics service also doesn't hold any metrics in memory.

|\*|No Sandbox | 1 Sandbox | 2 Sandboxes |
|---|---|---|---|
|Metrics count| 39 | 106 | 173 |
|Metrics size (bytes)| 9K | 144K | 283K |
|Metrics size (`gzipped`, bytes)| 2K | 10K | 17K |
|Metrics size(bytes)| 9K | 144K | 283K |
|Metrics size(`gzipped`, bytes)| 2K | 10K | 17K |

*Metrics size*: response size of one Prometheus scrape request.
*Metrics size*: Response size of one Prometheus scrape request.

It's easy to estimate the size of one metrics fetch request issued by Prometheus.
The formula to calculate the expected size when no gzip compression is in place is:
9 + (144 - 9) * `number of kata sandboxes`

Prometheus supports `gzip compression`. When enabled, the response size of each request will be smaller:
2 + (10 - 2) * `number of kata sandboxes`

**Example**
We have 10 sandboxes running on a node. The expected size of one metrics fetch request issued by Prometheus against the kata-monitor agent running on that node will be:
9 + (144 - 9) * 10 = **1.35M**

If `gzip compression` is enabled:
2 + (10 - 2) * 10 = **82K**
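
Since the units in the table above are KiB, the arithmetic is easy to sanity-check in a shell; a trivial sketch:

```bash
SANDBOXES=10
# Sizes in KiB, taken from the measurement table above.
echo "$((9 + (144 - 9) * SANDBOXES)) KiB plain"    # 1359 KiB, i.e. the ~1.35M quoted above
echo "$((2 + (10 - 2) * SANDBOXES)) KiB gzipped"   # 82 KiB
```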

#### Metrics delay ####
It's easy to estimated that if there are 10 sandboxes running in the host, the size of one metrics fetch request issued by Prometheus will be about to 9 + (144 - 9) * 10 = 1.35M (not `gzipped`) or 2 + (10 - 2) * 10 = 82K (`gzipped`). Of course Prometheus support `gzip` compression, that can reduce the response size of every request.

And here is some test data:

- End-to-end (from Prometheus server to `kata-monitor` and `kata-monitor` write response back): **20ms**(avg)
- Agent (RPC all from shim to agent): **3ms**(avg)
- End-to-end (from Prometheus server to `kata-monitor` and `kata-monitor` write response back): 20ms(avg)
- Agent(RPC all from shim to agent): 3ms(avg)

Test infrastructure:

@@ -145,13 +128,13 @@ Test infrastructure:

**Scrape interval**

Prometheus default `scrape_interval` is 1 minute, but it is usually set to 15 seconds. A smaller `scrape_interval` causes more overhead, so users should set it depending on their monitoring needs.
Prometheus default `scrape_interval` is 1 minute, and usually it is set to 15s. Small `scrape_interval` will cause more overhead, so user should set it on monitor demand.

## Metrics list

Here are listed all the metrics supported by Kata 2.0. Some metrics are dependent on the VM guest kernel, so the available ones may differ based on the environment.
Here listed is all supported metrics by Kata 2.0. Some metrics is dependent on guest kernels in the VM, so there may be some different by your environment.

Metrics are categorized by the component from/for which the metrics are collected.
Metrics is categorized by component where metrics are collected from and for.

* [Metric types](#metric-types)
* [Kata agent metrics](#kata-agent-metrics)

@@ -162,15 +145,15 @@ Metrics are categorized by the component from/for which the metrics are collecte

* [Kata containerd shim v2 metrics](#kata-containerd-shim-v2-metrics)

> **Note**:
> * Labels here do not include the `instance` and `job` labels added by Prometheus.
> * Labels here are not include `instance` and `job` labels that added by Prometheus.
> * Notes about metrics unit
> * `Kibibytes`, abbreviated `KiB`. 1 `KiB` equals 1024 B.
> * For some metrics (like network devices statistics from file `/proc/net/dev`), unit depends on label( for example `recv_bytes` and `recv_packets` have different units).
> * Most of these metrics are collected from the `/proc` filesystem, so the unit of each metric matches the unit of the relevant `/proc` entry. See the `proc(5)` manual page for further details.
> * For some metrics (like network devices statistics from file `/proc/net/dev`), unit is depend on label( for example `recv_bytes` and `recv_packets` are having different units).
> * Most of these metrics is collected from `/proc` filesystem, so the unit of metrics are keeping the same unit as `/proc`. See the `proc(5)` manual page for further details.

### Metric types

Prometheus offers four core metric types.
Prometheus offer four core metric types.

- Counter: A counter is a cumulative metric that represents a single monotonically increasing counter whose value can only increase.

@@ -305,7 +288,7 @@ Metrics about Kata containerd shim v2 process.

| Metric name | Type | Units | Labels | Introduced in Kata version |
|---|---|---|---|---|
| `kata_shim_agent_rpc_durations_histogram_milliseconds`: <br> RPC latency distributions. | `HISTOGRAM` | `milliseconds` | <ul><li>`action` (RPC actions of Kata agent)<ul><li>`grpc.CheckRequest`</li><li>`grpc.CloseStdinRequest`</li><li>`grpc.CopyFileRequest`</li><li>`grpc.CreateContainerRequest`</li><li>`grpc.CreateSandboxRequest`</li><li>`grpc.DestroySandboxRequest`</li><li>`grpc.ExecProcessRequest`</li><li>`grpc.GetMetricsRequest`</li><li>`grpc.GuestDetailsRequest`</li><li>`grpc.ListInterfacesRequest`</li><li>`grpc.ListProcessesRequest`</li><li>`grpc.ListRoutesRequest`</li><li>`grpc.MemHotplugByProbeRequest`</li><li>`grpc.OnlineCPUMemRequest`</li><li>`grpc.PauseContainerRequest`</li><li>`grpc.RemoveContainerRequest`</li><li>`grpc.ReseedRandomDevRequest`</li><li>`grpc.ResumeContainerRequest`</li><li>`grpc.SetGuestDateTimeRequest`</li><li>`grpc.SignalProcessRequest`</li><li>`grpc.StartContainerRequest`</li><li>`grpc.StatsContainerRequest`</li><li>`grpc.TtyWinResizeRequest`</li><li>`grpc.UpdateContainerRequest`</li><li>`grpc.UpdateInterfaceRequest`</li><li>`grpc.UpdateRoutesRequest`</li><li>`grpc.WaitProcessRequest`</li><li>`grpc.WriteStreamRequest`</li></ul></li><li>`sandbox_id`</li></ul> | 2.0.0 |
| `kata_shim_agent_rpc_durations_histogram_milliseconds`: <br> RPC latency distributions. | `HISTOGRAM` | `milliseconds` | <ul><li>`action` (RPC actions of Kata agent)<ul><li>`grpc.CheckRequest`</li><li>`grpc.CloseStdinRequest`</li><li>`grpc.CopyFileRequest`</li><li>`grpc.CreateContainerRequest`</li><li>`grpc.CreateSandboxRequest`</li><li>`grpc.DestroySandboxRequest`</li><li>`grpc.ExecProcessRequest`</li><li>`grpc.GetMetricsRequest`</li><li>`grpc.GuestDetailsRequest`</li><li>`grpc.ListInterfacesRequest`</li><li>`grpc.ListProcessesRequest`</li><li>`grpc.ListRoutesRequest`</li><li>`grpc.MemHotplugByProbeRequest`</li><li>`grpc.OnlineCPUMemRequest`</li><li>`grpc.PauseContainerRequest`</li><li>`grpc.RemoveContainerRequest`</li><li>`grpc.ReseedRandomDevRequest`</li><li>`grpc.ResumeContainerRequest`</li><li>`grpc.SetGuestDateTimeRequest`</li><li>`grpc.SignalProcessRequest`</li><li>`grpc.StartContainerRequest`</li><li>`grpc.StartTracingRequest`</li><li>`grpc.StatsContainerRequest`</li><li>`grpc.StopTracingRequest`</li><li>`grpc.TtyWinResizeRequest`</li><li>`grpc.UpdateContainerRequest`</li><li>`grpc.UpdateInterfaceRequest`</li><li>`grpc.UpdateRoutesRequest`</li><li>`grpc.WaitProcessRequest`</li><li>`grpc.WriteStreamRequest`</li></ul></li><li>`sandbox_id`</li></ul> | 2.0.0 |
| `kata_shim_fds`: <br> Kata containerd shim v2 open FDs. | `GAUGE` | | <ul><li>`sandbox_id`</li></ul> | 2.0.0 |
| `kata_shim_go_gc_duration_seconds`: <br> A summary of the pause duration of garbage collection cycles. | `SUMMARY` | `seconds` | <ul><li>`sandbox_id`</li></ul> | 2.0.0 |
| `kata_shim_go_goroutines`: <br> Number of goroutines that currently exist. | `GAUGE` | | <ul><li>`sandbox_id`</li></ul> | 2.0.0 |

@@ -209,5 +209,5 @@ network accessible to the collector.
- The trace collection proposals are still being considered.

[kata-1x-tracing]: https://github.com/kata-containers/agent/blob/master/TRACING.md
[trace-forwarder]: /src/tools/trace-forwarder
[trace-forwarder]: /src/trace-forwarder
[tracing-doc-pr]: https://github.com/kata-containers/kata-containers/pull/1937

@@ -157,32 +157,6 @@ docker run --cpus 4 -ti debian bash -c "nproc; cat /sys/fs/cgroup/cpu,cpuacct/cp
400000 # cfs quota
```

## Virtual CPU handling without hotplug

In some cases, the hardware and/or software architecture being utilized does not support
hotplug. For example, Firecracker VMM does not support CPU or memory hotplug. Similarly,
the current Linux Kernel for aarch64 does not support CPU or memory hotplug. To appropriately
size the virtual machine for the workload within the container or pod, we provide a `static_sandbox_resource_mgmt`
flag within the Kata Containers configuration. When this is set, the runtime will:
- Size the VM based on the workload requirements as well as the `default_vcpus` option specified in the configuration.
- Not resize the virtual machine after it has been launched.

VM size determination varies depending on the type of container being run, and may not always
be available. If workload sizing information is not available, the virtual machine will be started with the
`default_vcpus` value.

In the case of a pod, the initial sandbox container (pause container) typically doesn't contain any resource
information in its runtime `spec`. It is possible that the upper layer runtime
(i.e. containerd or CRI-O) may pass sandbox sizing annotations within the pause container's
`spec`. If these are provided, we will use this to appropriately size the VM. In particular,
we'll calculate the number of CPUs required for the workload, augment this by the `default_vcpus`
configuration option, and use this for the virtual machine size.

In the case of a single container (i.e., not a pod), if the container specifies resource requirements,
the container's `spec` will provide the sizing information directly. If these are set, we will
calculate the number of CPUs required for the workload, augment this by the `default_vcpus`
configuration option, and use this for the virtual machine size.
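
As a sketch, the flag can be enabled by editing the runtime configuration file (the path below assumes a default packaged install and may differ on your system):

```bash
# Enable static sandbox sizing (no resizing after launch).
sudo sed -i \
  's/^#\?static_sandbox_resource_mgmt.*/static_sandbox_resource_mgmt = true/' \
  /etc/kata-containers/configuration.toml
```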

[1]: https://docs.docker.com/config/containers/resource_constraints/#cpu
[2]: https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource

@@ -40,8 +40,8 @@ Kata Containers with QEMU has complete compatibility with Kubernetes.

Depending on the host architecture, Kata Containers supports various machine types,
for example `pc` and `q35` on x86 systems, `virt` on ARM systems and `pseries` on IBM Power systems. The default Kata Containers
machine type is `q35`. The machine type and its [`Machine accelerators`](#machine-accelerators) can
be changed by editing the runtime [`configuration`](architecture/README.md#configuration) file.
machine type is `pc`. The machine type and its [`Machine accelerators`](#machine-accelerators) can
be changed by editing the runtime [`configuration`](./architecture.md/#configuration) file.
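
For instance, a sketch of switching the machine type by editing the configuration (the path below is an assumption based on a default packaged install; adjust to your setup):

```bash
# Set the QEMU machine type used for new Kata VMs.
sudo sed -i 's/^machine_type.*/machine_type = "q35"/' \
  /etc/kata-containers/configuration.toml
```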

Devices and features used:
- virtio VSOCK or virtio serial

@@ -36,4 +36,3 @@
- [How to use hotplug memory on arm64 in Kata Containers](how-to-hotplug-memory-arm64.md)
- [How to setup swap devices in guest kernel](how-to-setup-swap-devices-in-guest-kernel.md)
- [How to run rootless vmm](how-to-run-rootless-vmm.md)
- [How to run Docker with Kata Containers](how-to-run-docker-with-kata.md)

@@ -188,7 +188,7 @@ If you use Containerd older than v1.2.4 or a version of Kata older than v1.6.0
shell script with the following:

```bash
#!/usr/bin/env bash
#!/bin/bash
KATA_CONF_FILE=/etc/kata-containers/firecracker.toml containerd-shim-kata-v2 $@
```

@@ -264,7 +264,7 @@ At the same time, we will add the `--log=/var/log/kata-runtime.log` argument to
own file (rather than into the system journal).

```bash
#!/usr/bin/env bash
#!/bin/bash
/opt/kata/bin/kata-runtime --config "/opt/kata/share/defaults/kata-containers/configuration-qemu.toml" --log-format=json --log=/var/log/kata-runtime.log $@
```

@@ -1,141 +0,0 @@

# How to run Docker in Docker with Kata Containers

This document describes the why and how behind running Docker in a Kata Container.

> **Note:** While in other environments this might be described as "Docker in Docker", the new architecture of Kata 2.x means [Docker can no longer be used to create containers using a Kata Containers runtime](https://github.com/kata-containers/kata-containers/issues/722).

## Requirements

- A working Kata Containers installation

## Install and configure Kata Containers

Follow the [Kata Containers installation guide](../install/README.md) to install Kata Containers on your Kubernetes cluster.

## Background

Docker in Docker ("DinD") is the colloquial name for the ability to run `docker` from inside a container.

You can learn more about Docker-in-Docker at the following links:

- [The original announcement of DinD](https://www.docker.com/blog/docker-can-now-run-within-docker/)
- [`docker` image Docker Hub page](https://hub.docker.com/_/docker/) (this page lists the `-dind` releases)

While DinD normally refers to running `docker` from inside a Docker container,
Kata Containers 2.x allows only supported runtimes (such as [`containerd`](../install/container-manager/containerd/containerd-install.md)).

Running `docker` in a Kata Container implies creating Docker containers from inside a container managed by `containerd` (or another supported container manager), as illustrated below:

```
container manager -> Kata Containers shim -> Docker Daemon -> Docker container
(containerd) (containerd-shim-kata-v2) (dockerd) (busybox sh)
```

[OverlayFS][OverlayFS] is the preferred storage driver for most container runtimes on Linux ([including Docker](https://docs.docker.com/storage/storagedriver/select-storage-driver)).

> **Note:** While in the past Kata Containers did not contain the [`overlay` kernel module (aka OverlayFS)][OverlayFS], the kernel modules have been included since the [Kata Containers v2.0.0 release][v2.0.0].

[OverlayFS]: https://www.kernel.org/doc/html/latest/filesystems/overlayfs.html
[v2.0.0]: https://github.com/kata-containers/kata-containers/releases/tag/2.0.0
[kata-2.x-supported-runtimes]: https://github.com/kata-containers/kata-containers/blob/5737b36a3513f4da11a9dc7301b0c97ea22a51cf/docs/install/container-manager/containerd/containerd-install.md

## Why Docker in Kata Containers 2.x requires special measures

Running Docker containers in Kata Containers requires care because `VOLUME`s specified in `Dockerfile`s run by Kata Containers are given the `kataShared` mount type by default, which applies to the root directory `/`:

```console
/ # mount
kataShared on / type virtiofs (rw,relatime,dax)
```

`kataShared` mount types are powered by [`virtio-fs`][virtio-fs], a marked improvement over `virtio-9p`, thanks to [PR #1016](https://github.com/kata-containers/runtime/pull/1016). While `virtio-fs` is normally an excellent choice, in the case of DinD workloads `virtio-fs` causes an issue -- [it *cannot* be used as an "upper layer" of `overlayfs` without a custom patch](http://lists.katacontainers.io/pipermail/kata-dev/2020-January/001216.html).

As `/var/lib/docker` is a `VOLUME` specified by DinD (i.e. the `docker` images tagged `*-dind`/`*-dind-rootless`), `docker` will fail to start (or even worse, silently pick a worse storage driver like `vfs`) when started in a Kata Container. Special measures must be taken when running DinD-powered workloads in Kata Containers.
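
The limitation is easy to reproduce by hand inside a Kata container: an `overlay` mount whose `upperdir` lives on the `virtiofs` root is rejected on an unpatched kernel. A sketch (directory names are illustrative; requires a privileged container):

```bash
# All of these directories sit on the virtiofs-backed root filesystem.
mkdir -p /tmp/lower /tmp/upper /tmp/work /tmp/merged
# Expected to fail on an unpatched kernel, since virtiofs cannot host
# an overlayfs upper layer.
mount -t overlay overlay \
  -o lowerdir=/tmp/lower,upperdir=/tmp/upper,workdir=/tmp/work \
  /tmp/merged
```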

## Workarounds/Solutions

Thanks to various community contributions (see [issue references below](#references)) the following options, with various trade-offs, have been uncovered:

### Use a memory backed volume

For small workloads (small container images, without much generated filesystem load), a memory-backed volume is sufficient. Kubernetes supports a variant of [the `EmptyDir` volume][k8s-emptydir], which allows for memdisk-backed storage -- the [`medium: Memory`][k8s-memory-volume-type] option. An example of a `Pod` using such a setup [was contributed](https://github.com/kata-containers/runtime/issues/1429#issuecomment-477385283), and is reproduced below:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: dind
spec:
  runtimeClassName: kata
  containers:
    - name: dind
      securityContext:
        privileged: true
      image: docker:20.10-dind
      args: ["--storage-driver=overlay2"]
      resources:
        limits:
          memory: "3G"
      volumeMounts:
        - mountPath: /var/run/
          name: dockersock
        - mountPath: /var/lib/docker
          name: docker
  volumes:
    - name: dockersock
      emptyDir: {}
    - name: docker
      emptyDir:
        medium: Memory
```

Inside the container you can view the mount:

```console
/ # mount | grep lib\/docker
tmpfs on /var/lib/docker type tmpfs (rw,relatime)
```

As is mentioned in the comment encapsulating this code, using volatile memory for container storage backing is risky and could be wasteful on machines that do not have a lot of RAM.

### Use a loop mounted disk

Using a loop mounted disk that is provisioned shortly before starting the container workload is another approach that yields good performance.

Contributors provided [an example in issue #1888](https://github.com/kata-containers/runtime/issues/1888#issuecomment-739057384), which is reproduced in part below:

```yaml
spec:
  containers:
    - name: docker
      image: docker:20.10-dind
      command: ["sh", "-c"]
      args:
        - if [[ $(df -PT /var/lib/docker | awk 'NR==2 {print $2}') == virtiofs ]]; then
          apk add e2fsprogs &&
          truncate -s 20G /tmp/disk.img &&
          mkfs.ext4 /tmp/disk.img &&
          mount /tmp/disk.img /var/lib/docker; fi &&
          dockerd-entrypoint.sh;
      securityContext:
        privileged: true
```

Note that loop mounted disks are often sparse, which means they *do not* take up the full amount of space that has been provisioned. This solution seems to produce the best performance and flexibility, at the expense of increased complexity and additional required setup.
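
Sparseness is easy to observe by comparing the apparent and the actual size of such an image file; a small sketch:

```bash
truncate -s 20G /tmp/disk.img
# Apparent size is 20G, actual allocation is (near) zero.
du -h --apparent-size /tmp/disk.img
du -h /tmp/disk.img
```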

### Build a custom kernel

It's possible to [modify the kernel](https://github.com/kata-containers/runtime/issues/1888#issuecomment-616872558) (in addition to applying the earlier mentioned mailing list patch) to support using `virtio-fs` as an upper. Note that if you modify your kernel and use `virtio-fs` you may require [additional changes](https://github.com/kata-containers/runtime/issues/1888#issuecomment-739057384) for decent performance and to address other issues.

> **NOTE:** A future kernel release may rectify the usability and performance issues of using `virtio-fs` as an OverlayFS upper layer.

## References

The solutions proposed in this document are an amalgamation of thoughtful contributions from the Kata Containers community.

Find links to issues & related discussion and the fruits therein below:

- [How to run Docker in Docker with Kata Containers (#2474)](https://github.com/kata-containers/kata-containers/issues/2474)
- [Does Kata-container support AUFS/OverlayFS? (#2493)](https://github.com/kata-containers/runtime/issues/2493)
- [Unable to start docker in docker with virtio-fs (#1888)](https://github.com/kata-containers/runtime/issues/1888)
- [Not using native diff for overlay2 (#1429)](https://github.com/kata-containers/runtime/issues/1429)
@@ -56,6 +56,7 @@ There are several kinds of Kata configurations and they are listed below.
| `io.katacontainers.config.hypervisor.enable_iommu` | `boolean` | enable `iommu` on Q35 (QEMU x86_64) |
| `io.katacontainers.config.hypervisor.enable_iothreads` | `boolean`| enable IO to be processed in a separate thread. Supported currently for virtio-`scsi` driver |
| `io.katacontainers.config.hypervisor.enable_mem_prealloc` | `boolean` | the memory space used for `nvdimm` device by the hypervisor |
| `io.katacontainers.config.hypervisor.enable_swap` | `boolean` | enable swap of VM memory |
| `io.katacontainers.config.hypervisor.enable_vhost_user_store` | `boolean` | enable vhost-user storage device (QEMU) |
| `io.katacontainers.config.hypervisor.enable_virtio_mem` | `boolean` | enable virtio-mem (QEMU) |
| `io.katacontainers.config.hypervisor.entropy_source` (R) | string| the path to a host source of entropy (`/dev/random`, `/dev/urandom` or real hardware RNG device) |

@@ -154,7 +154,7 @@ From Kubernetes v1.12, users can use [`RuntimeClass`](https://kubernetes.io/docs

```bash
$ cat > runtime.yaml <<EOF
apiVersion: node.k8s.io/v1
apiVersion: node.k8s.io/v1beta1
kind: RuntimeClass
metadata:
  name: kata

@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash
# Copyright (c) 2019 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0

@@ -22,7 +22,7 @@ An equivalent shim implementation for CRI-O is planned.
### CRI-O
For CRI-O installation instructions, refer to the [CRI-O Tutorial](https://github.com/cri-o/cri-o/blob/main/tutorial.md) page.

The following sections show how to set up the CRI-O snippet configuration file (default path: `/etc/crio/crio.conf`) for Kata.
The following sections show how to set up the CRI-O configuration file (default path: `/etc/crio/crio.conf`) for Kata.

Unless otherwise stated, all the following settings are specific to the `crio.runtime` table:
```toml

@@ -40,16 +40,74 @@ A comprehensive documentation of the configuration file can be found [here](http
#### Kubernetes Runtime Class (CRI-O v1.12+)
The [Kubernetes Runtime Class](https://kubernetes.io/docs/concepts/containers/runtime-class/)
is the preferred way of specifying the container runtime configuration to run a Pod's containers.
To use this feature, Kata must be added as a runtime handler. This can be done by
dropping a `50-kata` snippet file into `/etc/crio/crio.conf.d`, with the
content shown below:
To use this feature, Kata must be added as a runtime handler with:

```toml
[crio.runtime.runtimes.kata]
runtime_path = "/usr/bin/containerd-shim-kata-v2"
runtime_type = "vm"
runtime_root = "/run/vc"
privileged_without_host_devices = true
[crio.runtime.runtimes.kata-runtime]
runtime_path = "/usr/bin/kata-runtime"
runtime_type = "oci"
```
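
A sketch of creating the drop-in snippet described above and restarting CRI-O (assuming a systemd-managed `crio` service):

```bash
sudo mkdir -p /etc/crio/crio.conf.d
sudo tee /etc/crio/crio.conf.d/50-kata > /dev/null <<'EOF'
[crio.runtime.runtimes.kata]
runtime_path = "/usr/bin/containerd-shim-kata-v2"
runtime_type = "vm"
runtime_root = "/run/vc"
privileged_without_host_devices = true
EOF
sudo systemctl restart crio
```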

You can also add multiple entries to specify alternative hypervisors, e.g.:
```toml
[crio.runtime.runtimes.kata-qemu]
runtime_path = "/usr/bin/kata-runtime"
runtime_type = "oci"

[crio.runtime.runtimes.kata-fc]
runtime_path = "/usr/bin/kata-runtime"
runtime_type = "oci"
```

#### Untrusted annotation (until CRI-O v1.12)
The untrusted annotation is used to specify a runtime for __untrusted__ workloads, i.e.
a runtime to be used when the workload cannot be trusted and a higher level of security
is required. An additional flag can be used to let CRI-O know if a workload
should be considered _trusted_ or _untrusted_ by default.
For further details, see the documentation
[here](../design/architecture.md#mixing-vm-based-and-namespace-based-runtimes).

```toml
# runtime is the OCI compatible runtime used for trusted container workloads.
# This is a mandatory setting as this runtime will be the default one
# and will also be used for untrusted container workloads if
# runtime_untrusted_workload is not set.
runtime = "/usr/bin/runc"

# runtime_untrusted_workload is the OCI compatible runtime used for untrusted
# container workloads. This is an optional setting, except if
# default_container_trust is set to "untrusted".
runtime_untrusted_workload = "/usr/bin/kata-runtime"

# default_workload_trust is the default level of trust crio puts in container
# workloads. It can either be "trusted" or "untrusted", and the default
# is "trusted".
# Containers can be run through different container runtimes, depending on
# the trust hints we receive from kubelet:
# - If kubelet tags a container workload as untrusted, crio will try first to
# run it through the untrusted container workload runtime. If it is not set,
# crio will use the trusted runtime.
# - If kubelet does not provide any information about the container workload trust
# level, the selected runtime will depend on the default_container_trust setting.
# If it is set to "untrusted", then all containers except for the host privileged
# ones, will be run by the runtime_untrusted_workload runtime. Host privileged
# containers are by definition trusted and will always use the trusted container
# runtime. If default_container_trust is set to "trusted", crio will use the trusted
# container runtime for all containers.
default_workload_trust = "untrusted"
```

#### Network namespace management
To enable networking for the workloads run by Kata, CRI-O needs to be configured to
manage network namespaces, by setting the following key to `true`.

In CRI-O v1.16:
```toml
manage_network_ns_lifecycle = true
```
In CRI-O v1.17+:
```toml
manage_ns_lifecycle = true
```

@@ -12,26 +12,16 @@ Containers.

Packaged installation methods use your distribution's native package format (such as RPM or DEB).

> **Note:** We encourage installation methods that provide automatic updates, as they ensure security updates and bug fixes are
> easily applied.
*Note:* We encourage installation methods that provide automatic updates, as they ensure security updates and bug fixes are
easily applied.

| Installation method | Description | Automatic updates | Use case |
|------------------------------------------------------|----------------------------------------------------------------------------------------------|-------------------|-----------------------------------------------------------------------------------------------|
| [Using kata-deploy](#kata-deploy-installation) | The preferred way to deploy the Kata Containers distributed binaries on a Kubernetes cluster | **No!** | Best way to give it a try on kata-containers on an already up and running Kubernetes cluster. |
| [Using official distro packages](#official-packages) | Kata packages provided by Linux distributions official repositories | yes | Recommended for most users. |
| [Using snap](#snap-installation) | Easy to install | yes | Good alternative to official distro packages. |
| [Automatic](#automatic-installation) | Run a single command to install a full system | **No!** | For those wanting the latest release quickly. |
| [Manual](#manual-installation) | Follow a guide step-by-step to install a working system | **No!** | For those who want the latest release with more control. |
| [Build from source](#build-from-source-installation) | Build the software components manually | **No!** | Power users and developers only. |

### Kata Deploy Installation

Kata Deploy provides a Dockerfile, which contains all of the binaries and
artifacts required to run Kata Containers, as well as reference DaemonSets,
which can be utilized to install Kata Containers on a running Kubernetes
cluster.

[Use Kata Deploy](/tools/packaging/kata-deploy/README.md) to install Kata Containers on a Kubernetes Cluster.
| Installation method | Description | Automatic updates | Use case |
|------------------------------------------------------|---------------------------------------------------------------------|-------------------|----------------------------------------------------------|
| [Using official distro packages](#official-packages) | Kata packages provided by Linux distributions official repositories | yes | Recommended for most users. |
| [Using snap](#snap-installation) | Easy to install | yes | Good alternative to official distro packages. |
| [Automatic](#automatic-installation) | Run a single command to install a full system | **No!** | For those wanting the latest release quickly. |
| [Manual](#manual-installation) | Follow a guide step-by-step to install a working system | **No!** | For those who want the latest release with more control. |
| [Build from source](#build-from-source-installation) | Build the software components manually | **No!** | Power users and developers only. |

### Official packages

@@ -58,9 +48,9 @@ Follow the [containerd installation guide](container-manager/containerd/containe

## Build from source installation

> **Note:** Power users who decide to build from sources should be aware of the
> implications of using an unpackaged system which will not be automatically
> updated as new [releases](../Stable-Branch-Strategy.md) are made available.
*Note:* Power users who decide to build from sources should be aware of the
implications of using an unpackaged system which will not be automatically
updated as new [releases](../Stable-Branch-Strategy.md) are made available.

[Building from sources](../Developer-Guide.md#initial-setup) allows power users
who are comfortable building software from source to use the latest component

@@ -209,5 +209,5 @@ to allow you to access the VM environment.
[opentelemetry]: https://opentelemetry.io
[osbuilder]: https://github.com/kata-containers/kata-containers/blob/main/tools/osbuilder
[setup-debug-console]: https://github.com/kata-containers/kata-containers/blob/main/docs/Developer-Guide.md#set-up-a-debug-console
[trace-forwarder]: /src/tools/trace-forwarder
[trace-forwarder]: /src/trace-forwarder
[vsock]: https://wiki.qemu.org/Features/VirtioVsock

@@ -235,7 +235,7 @@ then [Kata-deploy](https://github.com/kata-containers/kata-containers/tree/main/
is used to install Kata. This will make sure that the correct `agent` version
is installed into the rootfs in the steps below.

The following instructions use Ubuntu as the root filesystem with systemd as
The following instructions use Debian as the root filesystem with systemd as
the init and will add in the `kmod` binary, which is not a standard binary in
a Kata rootfs image. The `kmod` binary is necessary to load the Intel® QAT
kernel modules when the virtual machine rootfs boots.
@@ -257,7 +257,7 @@ $ cd $GOPATH
$ export AGENT_VERSION=$(kata-runtime version | head -n 1 | grep -o "[0-9.]\+")
$ cd ${OSBUILDER}/rootfs-builder
$ sudo rm -rf ${ROOTFS_DIR}
$ script -fec 'sudo -E GOPATH=$GOPATH USE_DOCKER=true SECCOMP=no ./rootfs.sh ubuntu'
$ script -fec 'sudo -E GOPATH=$GOPATH USE_DOCKER=true SECCOMP=no ./rootfs.sh debian'
```

### Compile Intel® QAT drivers for Kata Containers kernel and add to Kata Containers rootfs

@@ -355,10 +355,10 @@ this small script so that it redirects to be able to use either QEMU or
Cloud Hypervisor with Kata.

```bash
$ echo '#!/usr/bin/env bash' | sudo tee /usr/local/bin/containerd-shim-kata-qemu-v2
$ echo '#!/bin/bash' | sudo tee /usr/local/bin/containerd-shim-kata-qemu-v2
$ echo 'KATA_CONF_FILE=/opt/kata/share/defaults/kata-containers/configuration-qemu.toml /opt/kata/bin/containerd-shim-kata-v2 $@' | sudo tee -a /usr/local/bin/containerd-shim-kata-qemu-v2
$ sudo chmod +x /usr/local/bin/containerd-shim-kata-qemu-v2
$ echo '#!/usr/bin/env bash' | sudo tee /usr/local/bin/containerd-shim-kata-clh-v2
$ echo '#!/bin/bash' | sudo tee /usr/local/bin/containerd-shim-kata-clh-v2
$ echo 'KATA_CONF_FILE=/opt/kata/share/defaults/kata-containers/configuration-clh.toml /opt/kata/bin/containerd-shim-kata-v2 $@' | sudo tee -a /usr/local/bin/containerd-shim-kata-clh-v2
$ sudo chmod +x /usr/local/bin/containerd-shim-kata-clh-v2
```

@@ -1,4 +1,4 @@
# Setup to run SPDK vhost-user devices with Kata Containers
# Setup to run SPDK vhost-user devices with Kata Containers and Docker*

> **Note:** This guide only applies to QEMU, since the vhost-user storage
> device is only available for QEMU now. The enablement work on other

@@ -104,7 +104,7 @@ devices:

- `vhost-user-blk`
- `vhost-user-scsi`
- `vhost-user-nvme` (deprecated from SPDK 21.07 release)
- `vhost-user-nvme`

For more information, visit [SPDK](https://spdk.io) and [SPDK vhost-user target](https://spdk.io/doc/vhost.html).

@@ -222,43 +222,26 @@ minor `0` should be created for it, in order to be recognized by Kata runtime:
$ sudo mknod /var/run/kata-containers/vhost-user/block/devices/vhostblk0 b 241 0
```
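
To verify the node was created as intended (output is illustrative; the major/minor numbers follow the `mknod` invocation above):

```bash
ls -l /var/run/kata-containers/vhost-user/block/devices/
# brw-r--r-- 1 root root 241, 0 ... vhostblk0
```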

> **Note:** The enablement of vhost-user block device in Kata containers
> is supported by Kata Containers `1.11.0-alpha1` or newer.
> Make sure you have updated your Kata containers before evaluation.

## Launch a Kata container with SPDK vhost-user block device

To use a `vhost-user-blk` device, use `ctr` to pass a host `vhost-user-blk`
device to the container. In your `config.json`, you should use `devices`
To use a `vhost-user-blk` device, use Docker to pass a host `vhost-user-blk`
device to the container. In docker, `--device=HOST-DIR:CONTAINER-DIR` is used
to pass a host device to the container.

For example (only `vhost-user-blk` listed):

```json
{
    "linux": {
        "devices": [
            {
                "path": "/dev/vda",
                "type": "b",
                "major": 241,
                "minor": 0,
                "fileMode": 420,
                "uid": 0,
                "gid": 0
            }
        ]
    }
}
```

With `rootfs` provisioned under `bundle` directory, you can run your SPDK container:
For example:

```bash
$ sudo ctr run -d --runtime io.containerd.run.kata.v2 --config bundle/config.json spdk_container
$ sudo docker run --runtime kata-runtime --device=/var/run/kata-containers/vhost-user/block/devices/vhostblk0:/dev/vda -it busybox sh
```

Example of performing I/O operations on the `vhost-user-blk` device inside
the container:

```
$ sudo ctr t exec --exec-id 1 -t spdk_container sh
/ # ls -l /dev/vda
brw-r--r-- 1 root root 254, 0 Jan 20 03:54 /dev/vda
/ # dd if=/dev/vda of=/tmp/ddtest bs=4k count=20

@@ -20,8 +20,6 @@ const LOG_LEVELS: &[(&str, slog::Level)] = &[
    ("critical", slog::Level::Critical),
];

const DEFAULT_SUBSYSTEM: &str = "root";

// XXX: 'writer' param used to make testing possible.
pub fn create_logger<W>(
    name: &str,
@@ -52,7 +50,7 @@ where
    let logger = slog::Logger::root(
        async_drain.fuse(),
        o!("version" => env!("CARGO_PKG_VERSION"),
        "subsystem" => DEFAULT_SUBSYSTEM,
        "subsystem" => "root",
        "pid" => process::id().to_string(),
        "name" => name.to_string(),
        "source" => source.to_string()),
@@ -218,8 +216,8 @@ where
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::{json, Value};
    use slog::{crit, debug, error, info, warn, Logger};
    use serde_json::Value;
    use slog::info;
    use std::io::prelude::*;
    use tempfile::NamedTempFile;

@@ -297,15 +295,15 @@ mod tests {
            let result_level = result.unwrap();
            let expected_level = d.result.unwrap();

            assert!(result_level == expected_level, "{}", msg);
            assert!(result_level == expected_level, msg);
            continue;
        } else {
            assert!(result.is_err(), "{}", msg);
            assert!(result.is_err(), msg);
        }

        let expected_error = d.result.as_ref().unwrap_err();
        let actual_error = result.unwrap_err();
        assert!(&actual_error == expected_error, "{}", msg);
        let expected_error = format!("{}", d.result.as_ref().unwrap_err());
        let actual_error = format!("{}", result.unwrap_err());
        assert!(actual_error == expected_error, msg);
    }
}

@@ -352,13 +350,13 @@ mod tests {
        let msg = format!("{}, result: {:?}", msg, result);

        if d.result.is_ok() {
            assert!(result == d.result, "{}", msg);
            assert!(result == d.result, msg);
            continue;
        }

        let expected_error = d.result.as_ref().unwrap_err();
        let actual_error = result.unwrap_err();
        assert!(&actual_error == expected_error, "{}", msg);
        let expected_error = format!("{}", d.result.as_ref().unwrap_err());
        let actual_error = format!("{}", result.unwrap_err());
        assert!(actual_error == expected_error, msg);
    }
}

@@ -378,17 +376,14 @@ mod tests {
        let record_key = "record-key-1";
        let record_value = "record-key-2";

        let (logger, guard) = create_logger(name, source, level, writer);
        let logger = create_logger(name, source, level, writer);

        let msg = "foo, bar, baz";

        // Call the logger (which calls the drain)
        // Note: This "mid level" log level should be available in debug or
        // release builds.
        info!(&logger, "{}", msg; "subsystem" => record_subsystem, record_key => record_value);
        info!(logger, "{}", msg; "subsystem" => record_subsystem, record_key => record_value);

        // Force temp file to be flushed
        drop(guard);
        drop(logger);

        let mut contents = String::new();
@@ -435,168 +430,4 @@ mod tests {
|
||||
.expect("failed to find record key field");
|
||||
assert_eq!(field_record_value, record_value);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_logger_levels() {
|
||||
let name = "name";
|
||||
let source = "source";
|
||||
|
||||
let debug_msg = "a debug log level message";
|
||||
let info_msg = "an info log level message";
|
||||
let warn_msg = "a warn log level message";
|
||||
let error_msg = "an error log level message";
|
||||
let critical_msg = "a critical log level message";
|
||||
|
||||
// The slog crate will *remove* macro calls for log levels "above" the
|
||||
// configured log level.lock
|
||||
//
|
||||
// At the time of writing, the default slog log
|
||||
// level is "info", but this crate overrides that using the magic
|
||||
// "*max_level*" features in the "Cargo.toml" manifest.
|
||||
|
||||
// However, there are two log levels:
|
||||
//
|
||||
// - max_level_${level}
|
||||
//
|
||||
// This is the log level for normal "cargo build" (development/debug)
|
||||
// builds.
|
||||
//
|
||||
// - release_max_level_${level}
|
||||
//
|
||||
// This is the log level for "cargo install" and
|
||||
// "cargo build --release" (release) builds.
|
||||
//
|
||||
// This crate sets them to different values, which is sensible and
|
||||
// standard practice. However, that causes a problem: there is
|
||||
// currently no clean way for this test code to detect _which_
|
||||
// profile the test is being built for (development or release),
|
||||
// meaning we cannot know which macros are expected to produce output
|
||||
// and which aren't ;(
|
||||
//
|
||||
// The best we can do is test the following log levels which
|
||||
// are expected to work in all build profiles.
|
||||
|
||||
let debug_closure = |logger: &Logger, msg: String| debug!(logger, "{}", msg);
|
||||
let info_closure = |logger: &Logger, msg: String| info!(logger, "{}", msg);
|
||||
let warn_closure = |logger: &Logger, msg: String| warn!(logger, "{}", msg);
|
||||
let error_closure = |logger: &Logger, msg: String| error!(logger, "{}", msg);
|
||||
        let critical_closure = |logger: &Logger, msg: String| crit!(logger, "{}", msg);

        struct TestData<'a> {
            slog_level: slog::Level,
            slog_level_tag: &'a str,
            msg: String,
            closure: Box<dyn Fn(&Logger, String)>,
        }

        let tests = &[
            TestData {
                slog_level: slog::Level::Debug,
                // Looks like a typo but tragically it isn't! ;(
                slog_level_tag: "DEBG",
                msg: debug_msg.into(),
                closure: Box::new(debug_closure),
            },
            TestData {
                slog_level: slog::Level::Info,
                slog_level_tag: "INFO",
                msg: info_msg.into(),
                closure: Box::new(info_closure),
            },
            TestData {
                slog_level: slog::Level::Warning,
                slog_level_tag: "WARN",
                msg: warn_msg.into(),
                closure: Box::new(warn_closure),
            },
            TestData {
                slog_level: slog::Level::Error,
                // Another language tragedy
                slog_level_tag: "ERRO",
                msg: error_msg.into(),
                closure: Box::new(error_closure),
            },
            TestData {
                slog_level: slog::Level::Critical,
                slog_level_tag: "CRIT",
                msg: critical_msg.into(),
                closure: Box::new(critical_closure),
            },
        ];

        for (i, d) in tests.iter().enumerate() {
            let msg = format!("test[{}]", i);

            // Create a writer for the logger drain to use
            let writer =
                NamedTempFile::new().expect(&format!("{:}: failed to create tempfile", msg));

            // Used to check file contents before the temp file is unlinked
            let mut writer_ref = writer
                .reopen()
                .expect(&format!("{:?}: failed to clone tempfile", msg));

            let (logger, logger_guard) = create_logger(name, source, d.slog_level, writer);

            // Call the logger (which calls the drain)
            (d.closure)(&logger, d.msg.to_owned());

            // Force temp file to be flushed
            drop(logger_guard);
            drop(logger);

            let mut contents = String::new();
            writer_ref
                .read_to_string(&mut contents)
                .expect(&format!("{:?}: failed to read tempfile contents", msg));

            // Convert file to JSON
            let fields: Value = serde_json::from_str(&contents)
                .expect(&format!("{:?}: failed to convert logfile to json", msg));

            // Check the expected JSON fields

            let field_ts = fields
                .get("ts")
                .expect(&format!("{:?}: failed to find timestamp field", msg));
            assert_ne!(field_ts, "", "{}", msg);

            let field_version = fields
                .get("version")
                .expect(&format!("{:?}: failed to find version field", msg));
            assert_eq!(field_version, env!("CARGO_PKG_VERSION"), "{}", msg);

            let field_pid = fields
                .get("pid")
                .expect(&format!("{:?}: failed to find pid field", msg));
            assert_ne!(field_pid, "", "{}", msg);

            let field_level = fields
                .get("level")
                .expect(&format!("{:?}: failed to find level field", msg));
            assert_eq!(field_level, d.slog_level_tag, "{}", msg);

            let field_msg = fields
                .get("msg")
                .expect(&format!("{:?}: failed to find msg field", msg));
            assert_eq!(field_msg, &json!(d.msg), "{}", msg);

            let field_name = fields
                .get("name")
                .expect(&format!("{:?}: failed to find name field", msg));
            assert_eq!(field_name, name, "{}", msg);

            let field_source = fields
                .get("source")
                .expect(&format!("{:?}: failed to find source field", msg));
            assert_eq!(field_source, source, "{}", msg);

            let field_subsystem = fields
                .get("subsystem")
                .expect(&format!("{:?}: failed to find subsystem field", msg));

            // No explicit subsystem, so should be the default
            assert_eq!(field_subsystem, &json!(DEFAULT_SUBSYSTEM), "{}", msg);
        }
    }
}
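For reference, the shape of a single record this test loop parses looks roughly like the sketch below. All field values are illustrative, and the default subsystem value is an assumption, not taken from the code above:

```rust
// Illustrative only: the record layout the test loop above asserts on.
fn expected_record_shape() -> serde_json::Value {
    serde_json::json!({
        "ts": "2021-12-16T10:24:33.888617449+00:00", // any non-empty timestamp
        "version": env!("CARGO_PKG_VERSION"),        // crate version
        "pid": "12345",      // any non-empty pid
        "level": "INFO",     // slog tag: "DEBG", "INFO", "WARN", "ERRO" or "CRIT"
        "msg": "an example message",
        "name": "name",      // the `name` argument given to create_logger()
        "source": "source",  // the `source` argument given to create_logger()
        "subsystem": "root"  // DEFAULT_SUBSYSTEM (assumed value)
    })
}
```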
@@ -76,7 +76,7 @@ then a new configuration file can be [created](#configure-kata-containers)
and [configured][7].

[1]: https://docs.snapcraft.io/snaps/intro
[2]: ../docs/design/architecture/README.md#root-filesystem-image
[2]: ../docs/design/architecture.md#root-filesystem-image
[3]: https://docs.snapcraft.io/reference/confinement#classic
[4]: https://github.com/kata-containers/runtime#configuration
[5]: https://docs.docker.com/engine/reference/commandline/dockerd

@@ -118,19 +118,18 @@ parts:
      export AGENT_INIT=yes
      export USE_DOCKER=1
      export DEBUG=1
      arch="$(uname -m)"
      initrd_distro=$(${yq} r -X ${kata_dir}/versions.yaml assets.initrd.architecture.${arch}.name)
      image_distro=$(${yq} r -X ${kata_dir}/versions.yaml assets.image.architecture.${arch}.name)
      case "$arch" in
      case "$(uname -m)" in
        aarch64)
          sudo -E PATH=$PATH make initrd DISTRO=alpine
          ;;
        ppc64le|s390x)
          # Cannot use alpine on ppc64le/s390x because it would require a musl agent
          sudo -E PATH=$PATH make initrd DISTRO=ubuntu
          ;;
        x86_64)
          # In some build systems it's impossible to build a rootfs image, try with the initrd image
          sudo -E PATH=$PATH make image DISTRO=${image_distro} || sudo -E PATH=$PATH make initrd DISTRO=${initrd_distro}
          sudo -E PATH=$PATH make image DISTRO=clearlinux || sudo -E PATH=$PATH make initrd DISTRO=alpine
          ;;

        aarch64|ppc64le|s390x)
          sudo -E PATH=$PATH make initrd DISTRO=${initrd_distro}
          ;;

        *) echo "unsupported architecture: $(uname -m)"; exit 1;;
      esac
223 src/agent/Cargo.lock generated
@@ -28,9 +28,9 @@ dependencies = [

[[package]]
name = "anyhow"
version = "1.0.51"
version = "1.0.52"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b26702f315f53b6071259e15dd9d64528213b44d61de1ec926eca7715d62203"
checksum = "84450d0b4a8bd1ba4144ce8ce718fbc5d071358b1e5384bace6536b3d1f2d5b3"

[[package]]
name = "arc-swap"

@@ -60,17 +60,6 @@ dependencies = [
"syn",
]

[[package]]
name = "atty"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
dependencies = [
"hermit-abi",
"libc",
"winapi",
]

[[package]]
name = "autocfg"
version = "1.0.1"

@@ -184,36 +173,6 @@ dependencies = [
"winapi",
]

[[package]]
name = "clap"
version = "3.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1121e32687f7f90b905d4775273305baa4f32cd418923e9b0fa726533221857"
dependencies = [
"atty",
"bitflags",
"clap_derive",
"indexmap",
"lazy_static",
"os_str_bytes",
"strsim",
"termcolor",
"textwrap",
]

[[package]]
name = "clap_derive"
version = "3.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7cbcf660a32ad0eda4b11996d8761432f499034f6e685bc6072337db662c85f8"
dependencies = [
"heck 0.4.0",
"proc-macro-error",
"proc-macro2",
"quote",
"syn",
]

[[package]]
name = "crc32fast"
version = "1.3.0"

@@ -307,9 +266,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"

[[package]]
name = "futures"
version = "0.3.17"
version = "0.3.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a12aa0eb539080d55c3f2d45a67c3b58b6b0773c1a3ca2dfec66d58c97fd66ca"
checksum = "28560757fe2bb34e79f907794bb6b22ae8b0e5c669b638a1132f2592b19035b4"
dependencies = [
"futures-channel",
"futures-core",

@@ -322,9 +281,9 @@ dependencies = [

[[package]]
name = "futures-channel"
version = "0.3.17"
version = "0.3.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5da6ba8c3bb3c165d3c7319fc1cc8304facf1fb8db99c5de877183c08a273888"
checksum = "ba3dda0b6588335f360afc675d0564c17a77a2bda81ca178a4b6081bd86c7f0b"
dependencies = [
"futures-core",
"futures-sink",

@@ -332,15 +291,15 @@ dependencies = [

[[package]]
name = "futures-core"
version = "0.3.17"
version = "0.3.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88d1c26957f23603395cd326b0ffe64124b818f4449552f960d815cfba83a53d"
checksum = "d0c8ff0461b82559810cdccfde3215c3f373807f5e5232b71479bff7bb2583d7"

[[package]]
name = "futures-executor"
version = "0.3.17"
version = "0.3.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "45025be030969d763025784f7f355043dc6bc74093e4ecc5000ca4dc50d8745c"
checksum = "29d6d2ff5bb10fb95c85b8ce46538a2e5f5e7fdc755623a7d4529ab8a4ed9d2a"
dependencies = [
"futures-core",
"futures-task",

@@ -349,18 +308,16 @@ dependencies = [

[[package]]
name = "futures-io"
version = "0.3.17"
version = "0.3.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "522de2a0fe3e380f1bc577ba0474108faf3f6b18321dbf60b3b9c39a75073377"
checksum = "b1f9d34af5a1aac6fb380f735fe510746c38067c5bf16c7fd250280503c971b2"

[[package]]
name = "futures-macro"
version = "0.3.17"
version = "0.3.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18e4a4b95cea4b4ccbcf1c5675ca7c4ee4e9e75eb79944d07defde18068f79bb"
checksum = "6dbd947adfffb0efc70599b3ddcf7b5597bb5fa9e245eb99f62b3a5f7bb8bd3c"
dependencies = [
"autocfg",
"proc-macro-hack",
"proc-macro2",
"quote",
"syn",

@@ -368,23 +325,22 @@ dependencies = [

[[package]]
name = "futures-sink"
version = "0.3.17"
version = "0.3.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "36ea153c13024fe480590b3e3d4cad89a0cfacecc24577b68f86c6ced9c2bc11"
checksum = "e3055baccb68d74ff6480350f8d6eb8fcfa3aa11bdc1a1ae3afdd0514617d508"

[[package]]
name = "futures-task"
version = "0.3.17"
version = "0.3.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d3d00f4eddb73e498a54394f228cd55853bdf059259e8e7bc6e69d408892e99"
checksum = "6ee7c6485c30167ce4dfb83ac568a849fe53274c831081476ee13e0dce1aad72"

[[package]]
name = "futures-util"
version = "0.3.17"
version = "0.3.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "36568465210a3a6ee45e1f165136d68671471a501e632e9a98d96872222b5481"
checksum = "d9b5cf40b47a271f77a8b1bec03ca09044d99d2372c0de244e66430761127164"
dependencies = [
"autocfg",
"futures-channel",
"futures-core",
"futures-io",

@@ -394,8 +350,6 @@ dependencies = [
"memchr",
"pin-project-lite",
"pin-utils",
"proc-macro-hack",
"proc-macro-nested",
"slab",
]

@@ -425,12 +379,6 @@ dependencies = [
"unicode-segmentation",
]

[[package]]
name = "heck"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9"

[[package]]
name = "hermit-abi"
version = "0.1.19"

@@ -538,7 +486,6 @@ dependencies = [
"async-trait",
"capctl",
"cgroups-rs",
"clap",
"futures",
"ipnetwork",
"lazy_static",

@@ -547,7 +494,7 @@ dependencies = [
"logging",
"netlink-packet-utils",
"netlink-sys",
"nix 0.23.1",
"nix 0.21.2",
"oci",
"opentelemetry",
"procfs 0.12.0",

@@ -789,6 +736,19 @@ dependencies = [
"memoffset",
]

[[package]]
name = "nix"
version = "0.21.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77d9f3521ea8e0641a153b3cddaf008dcbf26acd4ed739a2517295e0760d12c7"
dependencies = [
"bitflags",
"cc",
"cfg-if 1.0.0",
"libc",
"memoffset",
]

[[package]]
name = "nix"
version = "0.22.2"

@@ -845,9 +805,9 @@ dependencies = [

[[package]]
name = "num_cpus"
version = "1.13.0"
version = "1.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3"
checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1"
dependencies = [
"hermit-abi",
"libc",

@@ -889,15 +849,6 @@ dependencies = [
"tokio-stream",
]

[[package]]
name = "os_str_bytes"
version = "6.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64"
dependencies = [
"memchr",
]

[[package]]
name = "parking_lot"
version = "0.11.2"

@@ -966,18 +917,18 @@ dependencies = [

[[package]]
name = "pin-project"
version = "1.0.8"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "576bc800220cc65dac09e99e97b08b358cfab6e17078de8dc5fee223bd2d0c08"
checksum = "1622113ce508488160cff04e6abc60960e676d330e1ca0f77c0b8df17c81438f"
dependencies = [
"pin-project-internal",
]

[[package]]
name = "pin-project-internal"
version = "1.0.8"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e8fe8163d14ce7f0cdac2e040116f22eac817edabff0be91e8aff7e9accf389"
checksum = "b95af56fee93df76d721d356ac1ca41fccf168bc448eb14049234df764ba3e76"
dependencies = [
"proc-macro2",
"quote",

@@ -1008,47 +959,11 @@ version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed0cfbc8191465bed66e1718596ee0b0b35d5ee1f41c5df2189d0fe8bde535ba"

[[package]]
name = "proc-macro-error"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
dependencies = [
"proc-macro-error-attr",
"proc-macro2",
"quote",
"syn",
"version_check",
]

[[package]]
name = "proc-macro-error-attr"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
dependencies = [
"proc-macro2",
"quote",
"version_check",
]

[[package]]
name = "proc-macro-hack"
version = "0.5.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5"

[[package]]
name = "proc-macro-nested"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc881b2c22681370c6a780e47af9840ef841837bc98118431d4e1868bd0c1086"

[[package]]
name = "proc-macro2"
version = "1.0.34"
version = "1.0.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f84e92c0f7c9d58328b85a78557813e4bd845130db68d7184635344399423b1"
checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029"
dependencies = [
"unicode-xid",
]

@@ -1116,7 +1031,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "355f634b43cdd80724ee7848f95770e7e70eefa6dcf14fea676216573b8fd603"
dependencies = [
"bytes 1.1.0",
"heck 0.3.3",
"heck",
"itertools",
"log",
"multimap",

@@ -1155,10 +1070,6 @@ name = "protobuf"
version = "2.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e86d370532557ae7573551a1ec8235a0f8d6cb276c7c9e6aa490b511c447485"
dependencies = [
"serde",
"serde_derive",
]

[[package]]
name = "protobuf-codegen"

@@ -1191,9 +1102,9 @@ dependencies = [

[[package]]
name = "quote"
version = "1.0.10"
version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38bc8cc6a5f2e3655e0899c1b848643b2562f853f114bfec7be120678e3ace05"
checksum = "47aa80447ce4daf1717500037052af176af5d38cc3e571d9ec1c7353fc10c87d"
dependencies = [
"proc-macro2",
]

@@ -1320,7 +1231,7 @@ dependencies = [
"lazy_static",
"libc",
"libseccomp",
"nix 0.23.1",
"nix 0.21.2",
"oci",
"path-absolutize",
"protobuf",

@@ -1501,17 +1412,11 @@ version = "1.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ecab6c735a6bb4139c0caafd0cc3635748bbb3acf4550e8138122099251f309"

[[package]]
name = "strsim"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"

[[package]]
name = "syn"
version = "1.0.82"
version = "1.0.84"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8daf5dd0bb60cbd4137b1b587d2fc0ae729bc07cf01cd70b36a1ed5ade3b9d59"
checksum = "ecb2e6da8ee5eb9a61068762a32fa9619cc591ceb055b3687f4cd4051ec2e06b"
dependencies = [
"proc-macro2",
"quote",

@@ -1538,21 +1443,6 @@ dependencies = [
"winapi",
]

[[package]]
name = "termcolor"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4"
dependencies = [
"winapi-util",
]

[[package]]
name = "textwrap"
version = "0.14.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0066c8d12af8b5acd21e00547c3797fde4e8677254a7ee429176ccebbe93dd80"

[[package]]
name = "thiserror"
version = "1.0.30"

@@ -1815,12 +1705,6 @@ version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"

[[package]]
name = "version_check"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe"

[[package]]
name = "void"
version = "1.0.2"

@@ -1845,7 +1729,7 @@ dependencies = [
"bincode",
"byteorder",
"libc",
"nix 0.23.1",
"nix 0.21.2",
"opentelemetry",
"serde",
"slog",

@@ -1941,15 +1825,6 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"

[[package]]
name = "winapi-util"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
dependencies = [
"winapi",
]

[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
@@ -5,20 +5,20 @@ authors = ["The Kata Containers community <kata-dev@lists.katacontainers.io>"]
edition = "2018"

[dependencies]
oci = { path = "../libs/oci" }
oci = { path = "oci" }
rustjail = { path = "rustjail" }
protocols = { path = "../libs/protocols" }
protocols = { path = "protocols" }
lazy_static = "1.3.0"
ttrpc = { version = "0.5.0", features = ["async", "protobuf-codec"], default-features = false }
protobuf = "=2.14.0"
libc = "0.2.58"
nix = "0.23.0"
nix = "0.21.0"
capctl = "0.2.0"
serde_json = "1.0.39"
scan_fmt = "0.2.3"
scopeguard = "1.0.0"
thiserror = "1.0.26"
regex = "1.5.4"
regex = "1"
serial_test = "0.5.1"

# Async helpers

@@ -27,7 +27,7 @@ async-recursion = "0.3.2"
futures = "0.3.17"

# Async runtime
tokio = { version = "1.14.0", features = ["full"] }
tokio = { version = "1", features = ["full"] }
tokio-vsock = "0.3.1"

netlink-sys = { version = "0.7.0", features = ["tokio_socket",]}

@@ -37,7 +37,7 @@ ipnetwork = "0.17.0"

# Note: this crate sets the slog 'max_*' features which allows the log level
# to be modified at runtime.
logging = { path = "../libs/logging" }
logging = { path = "../../pkg/logging" }
slog = "2.5.2"
slog-scope = "4.1.2"

@@ -60,13 +60,14 @@ vsock-exporter = { path = "vsock-exporter" }
# Configuration
serde = { version = "1.0.129", features = ["derive"] }
toml = "0.5.8"
clap = { version = "3.0.1", features = ["derive"] }

[dev-dependencies]
tempfile = "3.1.0"

[workspace]
members = [
    "oci",
    "protocols",
    "rustjail",
]
@@ -101,10 +101,7 @@ endef
##TARGET default: build code
default: $(TARGET) show-header

$(TARGET): $(GENERATED_CODE) logging-crate-tests $(TARGET_PATH)

logging-crate-tests:
	make -C $(CWD)/../libs/logging
$(TARGET): $(GENERATED_CODE) $(TARGET_PATH)

$(TARGET_PATH): $(SOURCES) | show-summary
	@RUSTFLAGS="$(EXTRA_RUSTFLAGS) --deny warnings" cargo build --target $(TRIPLE) --$(BUILD_TYPE) $(EXTRA_RUSTFEATURES)

@@ -114,7 +111,7 @@ $(GENERATED_FILES): %: %.in

##TARGET optimize: optimized build
optimize: $(SOURCES) | show-summary show-header
	@RUSTFLAGS="-C link-arg=-s $(EXTRA_RUSTFLAGS) --deny warnings" cargo build --target $(TRIPLE) --$(BUILD_TYPE) $(EXTRA_RUSTFEATURES)
	@RUSTFLAGS="-C link-arg=-s $(EXTRA_RUSTFLAGS) --deny-warnings" cargo build --target $(TRIPLE) --$(BUILD_TYPE) $(EXTRA_RUSTFEATURES)

##TARGET clippy: run clippy linter
clippy: $(GENERATED_CODE)

@@ -208,12 +205,11 @@ codecov-html: check_tarpaulin

.PHONY: \
	help \
	logging-crate-tests \
	optimize \
	show-header \
	show-summary \
	optimize \
	vendor

##TARGET generate-protocols: generate/update grpc agent protocols
generate-protocols:
	../libs/protocols/hack/update-generated-proto.sh all
	protocols/hack/update-generated-proto.sh all
@@ -1,38 +1,48 @@
# Kata Agent
# Kata Agent in Rust

## Overview
This is a Rust version of the [`kata-agent`](https://github.com/kata-containers/agent).

The Kata agent is a long running process that runs inside the Virtual Machine
(VM) (also known as the "pod" or "sandbox").
At the Denver PTG, [we discussed re-writing the agent in Rust](https://etherpad.openstack.org/p/katacontainers-2019-ptg-denver-agenda):

The agent is packaged inside the Kata Containers
[guest image](../../docs/design/architecture/README.md#guest-image)
which is used to boot the VM. Once the runtime has launched the configured
[hypervisor](../../docs/hypervisors.md) to create a new VM, the agent is
started. From this point on, the agent is responsible for creating and
managing the life cycle of the containers inside the VM.
> In general, we all think re-writing the agent in Rust is worth doing to reduce the agent's footprint. Moreover, Eric mentioned the possibility of no longer using gRPC, which may have some impact on footprint. We may begin with a POC to show how much we could save by re-writing the agent in Rust.

For further details, see the
[architecture document](../../docs/design/architecture).
After that, we drafted the initial code here, and any contributions are welcome.

## Audience
## Features

If you simply wish to use Kata Containers, it is not necessary to understand
the details of how the agent operates. Please see the
[installation documentation](../../docs/install) for details of how to deploy
Kata Containers (which will include the Kata agent).
| Feature | Status |
| :--|:--:|
| **OCI Behaviors** |
| create/start containers | :white_check_mark: |
| signal/wait process | :white_check_mark: |
| exec/list process | :white_check_mark: |
| I/O stream | :white_check_mark: |
| Cgroups | :white_check_mark: |
| Capabilities, `rlimit`, readonly path, masked path, users | :white_check_mark: |
| Seccomp | :white_check_mark: |
| container stats (`stats_container`) | :white_check_mark: |
| Hooks | :white_check_mark: |
| **Agent Features & APIs** |
| run agent as `init` (mount fs, udev, setup `lo`) | :white_check_mark: |
| block device as root device | :white_check_mark: |
| Health API | :white_check_mark: |
| network, interface/routes (`update_container`) | :white_check_mark: |
| File transfer API (`copy_file`) | :white_check_mark: |
| Device APIs (`reseed_random_device`, `online_cpu_memory`, `mem_hotplug_probe`, `set_guest_date_time`) | :white_check_mark: |
| VSOCK support | :white_check_mark: |
| virtio-serial support | :heavy_multiplication_x: |
| OCI Spec validator | :white_check_mark: |
| **Infrastructures**|
| Debug Console | :white_check_mark: |
| Command line | :white_check_mark: |
| Tracing | :heavy_multiplication_x: |

The remainder of this document is only useful for developers and testers.
## Getting Started

## Build from Source
### Build from Source
The rust-agent needs to be built statically and linked with `musl`.

Since the agent is written in Rust, this section assumes the toolchain
has been installed using the standard `rustup` tool.

### Build with musl

If you wish to build the agent with the `musl` C library, you need to run the
following commands:
> **Note:** skip this step on ppc64le; the build scripts explicitly use the gnu toolchain there.

```bash
$ arch=$(uname -m)
@@ -40,15 +50,12 @@ $ rustup target add "${arch}-unknown-linux-musl"
$ sudo ln -s /usr/bin/g++ /bin/musl-g++
```

> **Note:**
>
> It is not currently possible to build using `musl` on ppc64le and s390x
> since both platforms lack the `musl` target.

### Build the agent binary

The following steps download the Kata Containers source files and build the agent:
ppc64le-only: Manually install `protoc`, e.g.
```bash
$ sudo dnf install protobuf-compiler
```

Download the source files in the Kata Containers repository and build the agent:
```bash
$ GOPATH="${GOPATH:-$HOME/go}"
$ dir="$GOPATH/src/github.com/kata-containers"
@@ -56,60 +63,17 @@ $ git -C ${dir} clone --depth 1 https://github.com/kata-containers/kata-containe
$ make -C ${dir}/kata-containers/src/agent
```

## Change the agent API

The Kata runtime communicates with the Kata agent using a ttRPC based API protocol.

This ttRPC API is defined by a set of [protocol buffers files](../libs/protocols/protos).
The protocol files are used to generate the bindings for the following components:

| Component | Language | Generation method `[*]` | Tooling required |
|-|-|-|-|
| runtime | Golang | Run, `make generate-protocols` | `protoc` |
| agent | Rust | Run, `make` | |

> **Key:**
>
> `[*]` - All commands must be run in the agent repository.

If you wish to change the API, these files must be regenerated. Although the
Rust code will be automatically generated by the
[build script](../libs/protocols/build.rs),
the Golang code generation requires the external `protoc` command to be
available in `$PATH`.

To install the `protoc` command on a Fedora/CentOS/RHEL system:
## Run Kata CI with rust-agent
* First, install Kata as described in ["how to install Kata"](../../docs/install/README.md)
* Second, build your own Kata initrd/image following the steps in ["how to build your own initrd/image"](../../docs/Developer-Guide.md#create-and-install-rootfs-and-initrd-image).
  Note: use the Rust agent instead of the Go agent when building the initrd/image.
* Clone the Kata CI test cases from https://github.com/kata-containers/tests.git, and then run the CRI test with:

```bash
$ sudo dnf install -y protobuf-compiler
$ sudo -E PATH=$PATH -E GOPATH=$GOPATH integration/containerd/shimv2/shimv2-tests.sh
```

## Custom guest image and kernel assets

If you wish to develop or test changes to the agent, you will need to create a
custom guest image using the [osbuilder tool](../../tools/osbuilder). You
may also wish to create a custom [guest kernel](../../tools/packaging/kernel).

Once created, [configure](../runtime/README.md#configuration) Kata Containers to use
these custom assets to allow you to test your changes.

> **Note:**
>
> To simplify development and testing, you may wish to run the agent
> [stand alone](#run-the-agent-stand-alone) initially.

## Tracing

For details of tracing the operation of the agent, see the
[tracing documentation](/docs/tracing.md).

## Run the agent stand alone

Although the agent is designed to run in a VM environment, for development and
testing purposes it is possible to run it as a normal application.

When run in this way, the agent can be controlled using the low-level Kata
agent control tool, rather than the Kata runtime.

For further details, see the
[agent control tool documentation](../tools/agent-ctl/README.md#run-the-tool-and-the-agent-in-the-same-environment).
## Mini Benchmark
The `RssAnon` memory consumed by the go-agent and the rust-agent is as follows:
go-agent: about 11M
rust-agent: about 1.1M
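As a rough way to reproduce this measurement, `RssAnon` can be read from `/proc/<pid>/status`; a minimal sketch (the field name is real, the parsing is illustrative):

```rust
// Sketch: read the RssAnon value (in kB) from a process's status file.
use std::fs;

fn rss_anon_kib(pid: u32) -> Option<u64> {
    let status = fs::read_to_string(format!("/proc/{}/status", pid)).ok()?;
    status
        .lines()
        .find(|l| l.starts_with("RssAnon:")) // e.g. "RssAnon:   1124 kB"
        .and_then(|l| l.split_whitespace().nth(1))
        .and_then(|v| v.parse().ok())
}
```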
@@ -4,16 +4,10 @@ version = "0.1.0"
authors = ["The Kata Containers community <kata-dev@lists.katacontainers.io>"]
edition = "2018"

[features]
default = []
with-serde = [ "serde", "serde_json" ]

[dependencies]
ttrpc = { version = "0.5.0", features = ["async"] }
async-trait = "0.1.42"
protobuf = { version = "=2.14.0", features = ["with-serde"] }
serde = { version = "1.0.130", features = ["derive"], optional = true }
serde_json = { version = "1.0.68", optional = true }
protobuf = "=2.14.0"

[build-dependencies]
ttrpc-codegen = "0.2.0"
44 src/agent/protocols/build.rs Normal file
@@ -0,0 +1,44 @@
// Copyright (c) 2020 Ant Group
//
// SPDX-License-Identifier: Apache-2.0
//

use std::fs;
use ttrpc_codegen::{Codegen, Customize};

fn main() {
    let protos = vec![
        "protos/types.proto",
        "protos/agent.proto",
        "protos/health.proto",
        "protos/google/protobuf/empty.proto",
        "protos/oci.proto",
    ];

    Codegen::new()
        .out_dir("src")
        .inputs(&protos)
        .include("protos")
        .rust_protobuf()
        .customize(Customize {
            async_server: true,
            ..Default::default()
        })
        .run()
        .expect("Gen codes failed.");

    // There is a message named 'Box' in oci.proto, so the generated code
    // contains a struct named 'Box'; replace `Box<Self>` with
    // `::std::boxed::Box<Self>` to avoid the name conflict.
    replace_text_in_file(
        "src/oci.rs",
        "self: Box<Self>",
        "self: ::std::boxed::Box<Self>",
    )
    .unwrap();
}

fn replace_text_in_file(file_name: &str, from: &str, to: &str) -> Result<(), std::io::Error> {
    let new_contents = fs::read_to_string(file_name)?.replace(from, to);
    fs::write(&file_name, new_contents.as_bytes())
}
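For context on the `Box` rename above, a standalone sketch of the name clash (hypothetical types, not taken from the generated file):

```rust
// Hypothetical sketch of the conflict the build script works around.
struct Box; // shadows std's Box, like the `Box` message from oci.proto

trait Handler {
    // With the local `Box` in scope, `self: Box<Self>` no longer names a
    // valid receiver type, so the generated code must spell it out:
    fn handle(self: ::std::boxed::Box<Self>);
}
```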
@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash

# //
# // Copyright (c) 2020 Ant Group

@@ -47,17 +47,17 @@ show_usage() {
}

generate_go_sources() {
    local cmd="protoc -I$GOPATH/src:$GOPATH/src/github.com/kata-containers/kata-containers/src/libs/protocols/protos \
    local cmd="protoc -I$GOPATH/src:$GOPATH/src/github.com/kata-containers/kata-containers/src/agent/protocols/protos \
--gogottrpc_out=plugins=ttrpc+fieldpath,\
import_path=github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc,\
\
Mgithub.com/kata-containers/kata-containers/src/libs/protocols/protos/types.proto=github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols,\
Mgithub.com/kata-containers/kata-containers/src/agent/protocols/protos/types.proto=github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols,\
\
Mgithub.com/kata-containers/kata-containers/src/libs/protocols/protos/oci.proto=github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc,\
Mgithub.com/kata-containers/kata-containers/src/agent/protocols/protos/oci.proto=github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc,\
\
Mgogoproto/gogo.proto=github.com/gogo/protobuf/gogoproto,Mgoogle/protobuf/any.proto=github.com/gogo/protobuf/types,Mgoogle/protobuf/descriptor.proto=github.com/gogo/protobuf/protoc-gen-gogo/descriptor,Mgoogle/protobuf/duration.proto=github.com/gogo/protobuf/types,Mgoogle/protobuf/empty.proto=github.com/gogo/protobuf/types,Mgoogle/protobuf/field_mask.proto=github.com/gogo/protobuf/types,Mgoogle/protobuf/timestamp.proto=github.com/gogo/protobuf/types,Mgoogle/protobuf/wrappers.proto=github.com/gogo/protobuf/types,Mgoogle/rpc/status.proto=github.com/gogo/googleapis/google/rpc\
:$GOPATH/src \
$GOPATH/src/github.com/kata-containers/kata-containers/src/libs/protocols/protos/$1"
$GOPATH/src/github.com/kata-containers/kata-containers/src/agent/protocols/protos/$1"

    echo $cmd
    $cmd
@@ -52,6 +52,8 @@ service AgentService {
    rpc AddARPNeighbors(AddARPNeighborsRequest) returns (google.protobuf.Empty);

    // observability
    rpc StartTracing(StartTracingRequest) returns (google.protobuf.Empty);
    rpc StopTracing(StopTracingRequest) returns (google.protobuf.Empty);
    rpc GetMetrics(GetMetricsRequest) returns (Metrics);

    // misc (TODO: some rpcs can be replaced by hyperstart-exec)

@@ -490,6 +492,12 @@ message CopyFileRequest {
    bytes data = 8;
}

message StartTracingRequest {
}

message StopTracingRequest {
}

message GetOOMEventRequest {}

message OOMEvent {
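The new tracing RPCs are plain request/empty-reply calls; a hypothetical client-side sketch, assuming the method and type names produced by the usual ttrpc code generation for this service (not verified against the generated file):

```rust
// Hypothetical: invoking the new tracing RPCs via a generated ttrpc client.
use protocols::agent::{StartTracingRequest, StopTracingRequest};
use protocols::agent_ttrpc::AgentServiceClient;

fn toggle_tracing(client: &AgentServiceClient) -> ttrpc::Result<()> {
    let ctx = ttrpc::context::with_timeout(0); // 0 = no deadline
    client.start_tracing(ctx.clone(), &StartTracingRequest::new())?;
    // ... traced agent operations happen here ...
    client.stop_tracing(ctx, &StopTracingRequest::new())?;
    Ok(())
}
```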
@@ -8,10 +8,10 @@ edition = "2018"
serde = "1.0.91"
serde_json = "1.0.39"
serde_derive = "1.0.91"
oci = { path = "../../libs/oci" }
protocols = { path ="../../libs/protocols" }
oci = { path = "../oci" }
protocols = { path ="../protocols" }
caps = "0.5.0"
nix = "0.23.0"
nix = "0.21.0"
scopeguard = "1.0.0"
capctl = "0.2.0"
lazy_static = "1.3.0"

@@ -19,15 +19,15 @@ libc = "0.2.58"
protobuf = "=2.14.0"
slog = "2.5.2"
slog-scope = "4.1.2"
scan_fmt = "0.2.6"
regex = "1.5.4"
scan_fmt = "0.2"
regex = "1.1"
path-absolutize = "1.2.0"
anyhow = "1.0.32"
cgroups = { package = "cgroups-rs", version = "0.2.8" }
rlimit = "0.5.3"

tokio = { version = "1.2.0", features = ["sync", "io-util", "process", "time", "macros"] }
futures = "0.3.17"
futures = "0.3"
async-trait = "0.1.31"
inotify = "0.9.2"
libseccomp = { version = "0.1.3", optional = true }
@@ -22,6 +22,7 @@ use crate::cgroups::Manager as CgroupManager;
use crate::container::DEFAULT_DEVICES;
use anyhow::{anyhow, Context, Result};
use libc::{self, pid_t};
use nix::errno::Errno;
use oci::{
    LinuxBlockIo, LinuxCpu, LinuxDevice, LinuxDeviceCgroup, LinuxHugepageLimit, LinuxMemory,
    LinuxNetwork, LinuxPids, LinuxResources,

@@ -174,7 +175,7 @@ impl CgroupManager for Manager {
            freezer_controller.freeze()?;
        }
        _ => {
            return Err(anyhow!(nix::Error::EINVAL));
            return Err(nix::Error::Sys(Errno::EINVAL).into());
        }
    }
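Most of the error-handling churn in this hunk and the ones that follow comes from the nix downgrade recorded in `Cargo.lock` above; a minimal sketch of the two styles, assuming nix 0.23 (where `nix::Error` is effectively the `Errno` enum) versus nix 0.21 (where errnos are wrapped in an `Error::Sys` variant):

```rust
// Sketch: constructing an EINVAL error under the two nix APIs.
use anyhow::anyhow;

// nix >= 0.23: Error is the Errno enum itself.
fn einval_new() -> anyhow::Error {
    anyhow!(nix::Error::EINVAL)
}

// nix <= 0.21 (illustrative; does not compile against nix 0.23):
// fn einval_old() -> anyhow::Error {
//     nix::Error::Sys(nix::errno::Errno::EINVAL).into()
// }
```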
@@ -419,7 +419,7 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
            ns.r#type.clone(),
            ns.path.clone()
        );
        log_child!(cfd_log, "error is : {:?}", e);
        log_child!(cfd_log, "error is : {:?}", e.as_errno());
        e
    })?;

@@ -496,7 +496,7 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
    log_child!(cfd_log, "join namespace {:?}", s);
    sched::setns(fd, s).or_else(|e| {
        if s == CloneFlags::CLONE_NEWUSER {
            if e != Errno::EINVAL {
            if e.as_errno().unwrap() != Errno::EINVAL {
                let _ = write_sync(cwfd, SYNC_FAILED, format!("{:?}", e).as_str());
                return Err(e);
            }

@@ -600,14 +600,6 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
        capctl::prctl::set_no_new_privs().map_err(|_| anyhow!("cannot set no new privileges"))?;
    }

    // Log unknown seccomp system calls in advance before the log file descriptor closes.
    #[cfg(feature = "seccomp")]
    if let Some(ref scmp) = linux.seccomp {
        if let Some(syscalls) = seccomp::get_unknown_syscalls(scmp) {
            log_child!(cfd_log, "unknown seccomp system calls: {:?}", syscalls);
        }
    }

    // Without NoNewPrivileges, we need to set seccomp
    // before dropping capabilities because the calling thread
    // must have CAP_SYS_ADMIN.

@@ -644,10 +636,11 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {

    // setup the envs
    for e in env.iter() {
        match valid_env(e) {
            Some((key, value)) => env::set_var(key, value),
            None => log_child!(cfd_log, "invalid env key-value: {:?}", e),
        let v: Vec<&str> = e.splitn(2, '=').collect();
        if v.len() != 2 {
            continue;
        }
        env::set_var(v[0], v[1]);
    }

    // set the "HOME" env getting from "/etc/passwd", if

@@ -671,8 +664,8 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
    let _ = unistd::close(crfd);
    let _ = unistd::close(cwfd);

    unistd::setsid().context("create a new session")?;
    if oci_process.terminal {
        unistd::setsid()?;
        unsafe {
            libc::ioctl(0, libc::TIOCSCTTY);
        }

@@ -685,8 +678,8 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
        Mode::from_bits_truncate(0),
    )?;
    unistd::close(fifofd)?;
    let buf: &mut [u8] = &mut [0];
    unistd::read(fd, buf)?;
    let mut buf: &mut [u8] = &mut [0];
    unistd::read(fd, &mut buf)?;
}

// With NoNewPrivileges, we should set seccomp as close to

@@ -1002,6 +995,8 @@ impl BaseContainer for LinuxContainer {

    info!(logger, "entered namespaces!");

    self.created = SystemTime::now();

    if p.init {
        let spec = self.config.spec.as_mut().unwrap();
        update_namespaces(&self.logger, spec, p.pid)?;

@@ -1116,8 +1111,10 @@ fn do_exec(args: &[String]) -> ! {
    .collect();

    let _ = unistd::execvp(p.as_c_str(), &sa).map_err(|e| match e {
        nix::Error::UnknownErrno => std::process::exit(-2),
        _ => std::process::exit(e as i32),
        nix::Error::Sys(errno) => {
            std::process::exit(errno as i32);
        }
        _ => std::process::exit(-2),
    });

    unreachable!()

@@ -1163,7 +1160,7 @@ fn get_pid_namespace(logger: &Logger, linux: &Linux) -> Result<Option<RawFd>> {
            ns.r#type.clone(),
            ns.path.clone()
        );
        error!(logger, "error is : {:?}", e);
        error!(logger, "error is : {:?}", e.as_errno());

        e
    })?;

@@ -1396,13 +1393,13 @@ impl LinuxContainer {
        .context(format!("cannot change owner of container {} root", id))?;

    if config.spec.is_none() {
        return Err(anyhow!(nix::Error::EINVAL));
        return Err(nix::Error::Sys(Errno::EINVAL).into());
    }

    let spec = config.spec.as_ref().unwrap();

    if spec.linux.is_none() {
        return Err(anyhow!(nix::Error::EINVAL));
        return Err(nix::Error::Sys(Errno::EINVAL).into());
    }

    let linux = spec.linux.as_ref().unwrap();

@@ -1479,7 +1476,7 @@ async fn execute_hook(logger: &Logger, h: &Hook, st: &OCIState) -> Result<()> {
    let binary = PathBuf::from(h.path.as_str());
    let path = binary.canonicalize()?;
    if !path.exists() {
        return Err(anyhow!(nix::Error::EINVAL));
        return Err(anyhow!(nix::Error::from_errno(Errno::EINVAL)));
    }

    let args = h.args.clone();

@@ -1548,7 +1545,7 @@ async fn execute_hook(logger: &Logger, h: &Hook, st: &OCIState) -> Result<()> {

    if code != 0 {
        error!(logger, "hook {} exit status is {}", &path, code);
        return Err(anyhow!(nix::Error::UnknownErrno));
        return Err(anyhow!(nix::Error::from_errno(Errno::UnknownErrno)));
    }

    debug!(logger, "hook {} exit status is 0", &path);

@@ -1564,34 +1561,10 @@ async fn execute_hook(logger: &Logger, h: &Hook, st: &OCIState) -> Result<()> {

    match tokio::time::timeout(Duration::new(timeout, 0), join_handle).await {
        Ok(r) => r.unwrap(),
        Err(_) => Err(anyhow!(nix::Error::ETIMEDOUT)),
        Err(_) => Err(anyhow!(nix::Error::from_errno(Errno::ETIMEDOUT))),
    }
}

// valid environment variables according to https://doc.rust-lang.org/std/env/fn.set_var.html#panics
fn valid_env(e: &str) -> Option<(&str, &str)> {
    // whether key or value contains a NUL character
    if e.as_bytes().contains(&b'\0') {
        return None;
    }

    let v: Vec<&str> = e.splitn(2, '=').collect();

    // key can't hold an `equal` sign, but value can
    if v.len() != 2 {
        return None;
    }

    let (key, value) = (v[0].trim(), v[1].trim());

    // key can't be empty
    if key.is_empty() {
        return None;
    }

    Some((key, value))
}

#[cfg(test)]
mod tests {
    use super::*;

@@ -1670,7 +1643,7 @@ mod tests {
    )
    .await;

    let expected_err = nix::Error::ETIMEDOUT;
    let expected_err = nix::Error::from_errno(Errno::ETIMEDOUT);
    assert_eq!(
        res.unwrap_err().downcast::<nix::Error>().unwrap(),
        expected_err

@@ -2015,49 +1988,4 @@ mod tests {
    let ret = do_init_child(std::io::stdin().as_raw_fd());
    assert!(ret.is_err(), "Expecting Err, Got {:?}", ret);
}

#[test]
fn test_valid_env() {
    let env = valid_env("a=b=c");
    assert_eq!(Some(("a", "b=c")), env);

    let env = valid_env("a=b");
    assert_eq!(Some(("a", "b")), env);
    let env = valid_env("a =b");
    assert_eq!(Some(("a", "b")), env);

    let env = valid_env(" a =b");
    assert_eq!(Some(("a", "b")), env);

    let env = valid_env("a= b");
    assert_eq!(Some(("a", "b")), env);

    let env = valid_env("a=b ");
    assert_eq!(Some(("a", "b")), env);
    let env = valid_env("a=b c ");
    assert_eq!(Some(("a", "b c")), env);

    let env = valid_env("=b");
    assert_eq!(None, env);

    let env = valid_env("a=");
    assert_eq!(Some(("a", "")), env);

    let env = valid_env("a==");
    assert_eq!(Some(("a", "=")), env);

    let env = valid_env("a");
    assert_eq!(None, env);

    let invalid_str = vec![97, b'\0', 98];
    let invalid_string = std::str::from_utf8(&invalid_str).unwrap();

    let invalid_env = format!("{}=value", invalid_string);
    let env = valid_env(&invalid_env);
    assert_eq!(None, env);

    let invalid_env = format!("key={}", invalid_string);
    let env = valid_env(&invalid_env);
    assert_eq!(None, env);
}
}
@@ -5,6 +5,7 @@

use anyhow::{anyhow, Context, Result};
use libc::uid_t;
use nix::errno::Errno;
use nix::fcntl::{self, OFlag};
#[cfg(not(test))]
use nix::mount;

@@ -34,9 +35,17 @@ use crate::log_child;
// struct is populated from the content in the /proc/<pid>/mountinfo file.
#[derive(std::fmt::Debug)]
pub struct Info {
    id: i32,
    parent: i32,
    major: i32,
    minor: i32,
    root: String,
    mount_point: String,
    opts: String,
    optional: String,
    fstype: String,
    source: String,
    vfs_opts: String,
}

const MOUNTINFOFORMAT: &str = "{d} {d} {d}:{d} {} {} {} {}";

@@ -103,7 +112,6 @@ lazy_static! {
}

#[inline(always)]
#[cfg(not(test))]
pub fn mount<
    P1: ?Sized + NixPath,
    P2: ?Sized + NixPath,

@@ -116,42 +124,21 @@ pub fn mount<
    flags: MsFlags,
    data: Option<&P4>,
) -> std::result::Result<(), nix::Error> {
    mount::mount(source, target, fstype, flags, data)
    #[cfg(not(test))]
    return mount::mount(source, target, fstype, flags, data);
    #[cfg(test)]
    return Ok(());
}

#[inline(always)]
#[cfg(test)]
pub fn mount<
    P1: ?Sized + NixPath,
    P2: ?Sized + NixPath,
    P3: ?Sized + NixPath,
    P4: ?Sized + NixPath,
>(
    _source: Option<&P1>,
    _target: &P2,
    _fstype: Option<&P3>,
    _flags: MsFlags,
    _data: Option<&P4>,
) -> std::result::Result<(), nix::Error> {
    Ok(())
}

#[inline(always)]
#[cfg(not(test))]
pub fn umount2<P: ?Sized + NixPath>(
    target: &P,
    flags: MntFlags,
) -> std::result::Result<(), nix::Error> {
    mount::umount2(target, flags)
}

#[inline(always)]
#[cfg(test)]
pub fn umount2<P: ?Sized + NixPath>(
    _target: &P,
    _flags: MntFlags,
) -> std::result::Result<(), nix::Error> {
    Ok(())
    #[cfg(not(test))]
    return mount::umount2(target, flags);
    #[cfg(test)]
    return Ok(());
}

pub fn init_rootfs(

@@ -463,20 +450,14 @@ fn mount_cgroups(
    Ok(())
}

#[cfg(not(test))]
fn pivot_root<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
    new_root: &P1,
    put_old: &P2,
) -> anyhow::Result<(), nix::Error> {
    unistd::pivot_root(new_root, put_old)
}

#[cfg(test)]
fn pivot_root<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
    _new_root: &P1,
    _put_old: &P2,
) -> anyhow::Result<(), nix::Error> {
    Ok(())
    #[cfg(not(test))]
    return unistd::pivot_root(new_root, put_old);
    #[cfg(test)]
    return Ok(());
}

pub fn pivot_rootfs<P: ?Sized + NixPath + std::fmt::Debug>(path: &P) -> Result<()> {
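Several hunks above (and the `chroot` one below) replace two separately `#[cfg]`-gated function definitions with a single body containing gated `return` statements; a minimal standalone sketch of that pattern:

```rust
// Sketch of the cfg-gated-return pattern adopted by the mount helpers above.
fn do_privileged_op() -> Result<(), ()> {
    // In normal builds, perform the real (privileged) operation.
    #[cfg(not(test))]
    return Err(()); // stand-in for the real mount(2)/pivot_root(2) call
    // In unit tests, skip the privileged call entirely.
    #[cfg(test)]
    return Ok(());
}
```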
@@ -554,20 +535,7 @@ fn parse_mount_table() -> Result<Vec<Info>> {
    for (_index, line) in reader.lines().enumerate() {
        let line = line?;

        // Example mountinfo format:
        // id
        // |  / parent
        // |  |  / major:minor
        // |  |  |    / root
        // |  |  |    |  / mount_point
        // |  |  |    |  |    / opts
        // |  |  |    |  |    |    / optional
        // |  |  |    |  |    |    |    / fstype
        // |  |  |    |  |    |    |    |  / source
        // |  |  |    |  |    |    |    |  |  / vfs_opts
        // 22 96 0:21 / /sys rw,nosuid,nodev,noexec,relatime shared:2 - sysfs sysfs rw,seclabel

        let (_id, _parent, _major, _minor, _root, mount_point, _opts, optional) = scan_fmt!(
        let (id, parent, major, minor, root, mount_point, opts, optional) = scan_fmt!(
            &line,
            MOUNTINFOFORMAT,
            i32,

@@ -582,7 +550,7 @@ fn parse_mount_table() -> Result<Vec<Info>> {

        let fields: Vec<&str> = line.split(" - ").collect();
        if fields.len() == 2 {
            let (fstype, _source, _vfs_opts) =
            let (fstype, source, vfs_opts) =
                scan_fmt!(fields[1], "{} {} {}", String, String, String)?;

            let mut optional_new = String::new();

@@ -591,9 +559,17 @@ fn parse_mount_table() -> Result<Vec<Info>> {
            }

            let info = Info {
                id,
                parent,
                major,
                minor,
                root,
                mount_point,
                opts,
                optional: optional_new,
                fstype,
                source,
                vfs_opts,
            };

            infos.push(info);

@@ -606,15 +582,11 @@ fn parse_mount_table() -> Result<Vec<Info>> {
}

#[inline(always)]
#[cfg(not(test))]
fn chroot<P: ?Sized + NixPath>(path: &P) -> Result<(), nix::Error> {
    unistd::chroot(path)
}

#[inline(always)]
#[cfg(test)]
fn chroot<P: ?Sized + NixPath>(_path: &P) -> Result<(), nix::Error> {
    Ok(())
    #[cfg(not(test))]
    return unistd::chroot(path);
    #[cfg(test)]
    return Ok(());
}

pub fn ms_move_root(rootfs: &str) -> Result<bool> {

@@ -651,7 +623,7 @@ pub fn ms_move_root(rootfs: &str) -> Result<bool> {
        None::<&str>,
    )?;
    umount2(abs_mount_point, MntFlags::MNT_DETACH).or_else(|e| {
        if e.ne(&nix::Error::EINVAL) && e.ne(&nix::Error::EPERM) {
        if e.ne(&nix::Error::from(Errno::EINVAL)) && e.ne(&nix::Error::from(Errno::EPERM)) {
            return Err(anyhow!(e));
        }

@@ -728,7 +700,7 @@ fn secure_join(rootfs: &str, unsafe_path: &str) -> String {
        path.push(it);
        if let Ok(v) = path.read_link() {
            if v.is_absolute() {
                path = PathBuf::from(format!("{}{}", rootfs, v.to_str().unwrap()));
                path = PathBuf::from(format!("{}{}", rootfs, v.to_str().unwrap().to_string()));
            } else {
                path.pop();
                for it in v.iter() {
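The line changed in this `secure_join` hunk re-anchors absolute symlink targets under the container rootfs; a standalone sketch of why that matters (paths are made up):

```rust
// Sketch: an absolute symlink target must be re-rooted under the rootfs,
// so a link such as etc/foo -> /outside resolves inside the guest rootfs
// rather than escaping to the host's /outside.
use std::path::PathBuf;

fn reanchor(rootfs: &str, absolute_target: &str) -> PathBuf {
    // e.g. reanchor("/run/kata/rootfs", "/outside") -> "/run/kata/rootfs/outside"
    PathBuf::from(format!("{}{}", rootfs, absolute_target))
}
```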
@@ -794,8 +766,14 @@ fn mount_from(
        }
    };

    let _ = stat::stat(dest.as_str())
        .map_err(|e| log_child!(cfd_log, "dest stat error. {}: {:?}", dest.as_str(), e));
    let _ = stat::stat(dest.as_str()).map_err(|e| {
        log_child!(
            cfd_log,
            "dest stat error. {}: {:?}",
            dest.as_str(),
            e.as_errno()
        )
    });

    mount(
        Some(src.as_str()),

@@ -805,7 +783,7 @@ fn mount_from(
        Some(d.as_str()),
    )
    .map_err(|e| {
        log_child!(cfd_log, "mount error: {:?}", e);
        log_child!(cfd_log, "mount error: {:?}", e.as_errno());
        e
    })?;

@@ -827,7 +805,7 @@ fn mount_from(
        None::<&str>,
    )
    .map_err(|e| {
        log_child!(cfd_log, "remount {}: {:?}", dest.as_str(), e);
        log_child!(cfd_log, "remount {}: {:?}", dest.as_str(), e.as_errno());
        e
    })?;
}

@@ -996,7 +974,7 @@ pub fn finish_rootfs(cfd_log: RawFd, spec: &Spec, process: &Process) -> Result<(

fn mask_path(path: &str) -> Result<()> {
    if !path.starts_with('/') || path.contains("..") {
        return Err(anyhow!(nix::Error::EINVAL));
        return Err(nix::Error::Sys(Errno::EINVAL).into());
    }

    match mount(

@@ -1006,30 +984,49 @@ fn mask_path(path: &str) -> Result<()> {
        MsFlags::MS_BIND,
        None::<&str>,
    ) {
        Err(e) => match e {
            nix::Error::ENOENT | nix::Error::ENOTDIR => Ok(()),
            _ => Err(e.into()),
        },
        Ok(_) => Ok(()),
        Err(nix::Error::Sys(e)) => {
            if e != Errno::ENOENT && e != Errno::ENOTDIR {
                //info!("{}: {}", path, e.desc());
                return Err(nix::Error::Sys(e).into());
            }
        }

        Err(e) => {
            return Err(e.into());
        }

        Ok(_) => {}
    }

    Ok(())
}

fn readonly_path(path: &str) -> Result<()> {
    if !path.starts_with('/') || path.contains("..") {
        return Err(anyhow!(nix::Error::EINVAL));
        return Err(nix::Error::Sys(Errno::EINVAL).into());
    }

    if let Err(e) = mount(
    match mount(
        Some(&path[1..]),
        path,
        None::<&str>,
        MsFlags::MS_BIND | MsFlags::MS_REC,
        None::<&str>,
    ) {
        match e {
            nix::Error::ENOENT => return Ok(()),
            _ => return Err(e.into()),
        };
        Err(nix::Error::Sys(e)) => {
            if e == Errno::ENOENT {
                return Ok(());
            } else {
                //info!("{}: {}", path, e.desc());
                return Err(nix::Error::Sys(e).into());
            }
        }

        Err(e) => {
            return Err(e.into());
        }

        Ok(_) => {}
    }

    mount(

@@ -1385,7 +1382,7 @@ mod tests {

    for (i, t) in tests.iter().enumerate() {
        // Create a string containing details of the test
        let msg = format!("test[{}]: {:?}", i, t.name);
        let msg = format!("test[{}]: {:?}", i, t);

        // if is_symlink, the symlink environment needs to be prepared first
        if t.symlink_path != "" {
@@ -30,7 +30,7 @@ impl io::Read for &StreamFd {
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        match unistd::read(self.0, buf) {
            Ok(l) => Ok(l),
            Err(e) => Err(e.into()),
            Err(e) => Err(e.as_errno().unwrap().into()),
        }
    }
}

@@ -39,7 +39,7 @@ impl io::Write for &StreamFd {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        match unistd::write(self.0, buf) {
            Ok(l) => Ok(l),
            Err(e) => Err(e.into()),
            Err(e) => Err(e.as_errno().unwrap().into()),
        }
    }

@@ -52,7 +52,7 @@ impl StreamFd {
    fn close(&mut self) -> io::Result<()> {
        match unistd::close(self.0) {
            Ok(()) => Ok(()),
            Err(e) => Err(e.into()),
            Err(e) => Err(e.as_errno().unwrap().into()),
        }
    }
}
@@ -39,24 +39,6 @@ fn get_rule_conditions(args: &[LinuxSeccompArg]) -> Result<Vec<ScmpArgCompare>>
    Ok(conditions)
}

pub fn get_unknown_syscalls(scmp: &LinuxSeccomp) -> Option<Vec<String>> {
    let mut unknown_syscalls: Vec<String> = Vec::new();

    for syscall in &scmp.syscalls {
        for name in &syscall.names {
            if get_syscall_from_name(name, None).is_err() {
                unknown_syscalls.push(name.to_string());
            }
        }
    }

    if unknown_syscalls.is_empty() {
        None
    } else {
        Some(unknown_syscalls)
    }
}

// init_seccomp creates a seccomp filter and loads it for the current process
// including all the child processes.
pub fn init_seccomp(scmp: &LinuxSeccomp) -> Result<()> {
@@ -86,14 +68,7 @@ pub fn init_seccomp(scmp: &LinuxSeccomp) -> Result<()> {
|
||||
}
|
||||
|
||||
for name in &syscall.names {
|
||||
let syscall_num = match get_syscall_from_name(name, None) {
|
||||
Ok(num) => num,
|
||||
Err(_) => {
|
||||
// If we cannot resolve the given system call, we assume it is not supported
|
||||
// by the kernel. Hence, we skip it without generating an error.
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let syscall_num = get_syscall_from_name(name, None)?;
|
||||
|
||||
if syscall.args.is_empty() {
|
||||
filter.add_rule(action, syscall_num, None)?;
|
||||
@@ -134,72 +109,6 @@ mod tests {
|
||||
};
|
||||
}
|
||||
|
||||
const TEST_DATA: &str = r#"{
|
||||
"defaultAction": "SCMP_ACT_ALLOW",
|
||||
"architectures": [
|
||||
],
|
||||
"flags": [
|
||||
"SECCOMP_FILTER_FLAG_LOG"
|
||||
],
|
||||
"syscalls": [
|
||||
{
|
||||
"names": [
|
||||
"dup3",
|
||||
"invalid_syscall1",
|
||||
"invalid_syscall2"
|
||||
],
|
||||
"action": "SCMP_ACT_ERRNO"
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"process_vm_readv"
|
||||
],
|
||||
"action": "SCMP_ACT_ERRNO",
|
||||
"errnoRet": 111,
|
||||
"args": [
|
||||
{
|
||||
"index": 0,
|
||||
"value": 10,
|
||||
"op": "SCMP_CMP_EQ"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"process_vm_readv"
|
||||
],
|
||||
"action": "SCMP_ACT_ERRNO",
|
||||
"errnoRet": 111,
|
||||
"args": [
|
||||
{
|
||||
"index": 0,
|
||||
"value": 20,
|
||||
"op": "SCMP_CMP_EQ"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"process_vm_readv"
|
||||
],
|
||||
"action": "SCMP_ACT_ERRNO",
|
||||
"errnoRet": 222,
|
||||
"args": [
|
||||
{
|
||||
"index": 0,
|
||||
"value": 30,
|
||||
"op": "SCMP_CMP_EQ"
|
||||
},
|
||||
{
|
||||
"index": 2,
|
||||
"value": 40,
|
||||
"op": "SCMP_CMP_EQ"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}"#;
|
||||
|
||||
#[test]
|
||||
fn test_get_filter_attr_from_flag() {
|
||||
skip_if_not_root!();
|
||||
@@ -212,19 +121,75 @@ mod tests {
|
||||
assert_eq!(get_filter_attr_from_flag("ERROR").is_err(), true);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_get_unknown_syscalls() {
|
||||
let scmp: oci::LinuxSeccomp = serde_json::from_str(TEST_DATA).unwrap();
|
||||
let syscalls = get_unknown_syscalls(&scmp).unwrap();
|
||||
|
||||
assert_eq!(syscalls, vec!["invalid_syscall1", "invalid_syscall2"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_init_seccomp() {
|
||||
skip_if_not_root!();
|
||||
|
||||
let mut scmp: oci::LinuxSeccomp = serde_json::from_str(TEST_DATA).unwrap();
|
||||
let data = r#"{
|
||||
"defaultAction": "SCMP_ACT_ALLOW",
|
||||
"architectures": [
|
||||
],
|
||||
"flags": [
|
||||
"SECCOMP_FILTER_FLAG_LOG"
|
||||
],
|
||||
"syscalls": [
|
||||
{
|
||||
"names": [
|
||||
"dup3"
|
||||
],
|
||||
"action": "SCMP_ACT_ERRNO"
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"process_vm_readv"
|
||||
],
|
||||
"action": "SCMP_ACT_ERRNO",
|
||||
"errnoRet": 111,
|
||||
"args": [
|
||||
{
|
||||
"index": 0,
|
||||
"value": 10,
|
||||
"op": "SCMP_CMP_EQ"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"process_vm_readv"
|
||||
],
|
||||
"action": "SCMP_ACT_ERRNO",
|
||||
"errnoRet": 111,
|
||||
"args": [
|
||||
{
|
||||
"index": 0,
|
||||
"value": 20,
|
||||
"op": "SCMP_CMP_EQ"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"process_vm_readv"
|
||||
],
|
||||
"action": "SCMP_ACT_ERRNO",
|
||||
"errnoRet": 222,
|
||||
"args": [
|
||||
{
|
||||
"index": 0,
|
||||
"value": 30,
|
||||
"op": "SCMP_CMP_EQ"
|
||||
},
|
||||
{
|
||||
"index": 2,
|
||||
"value": 40,
|
||||
"op": "SCMP_CMP_EQ"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}"#;
|
||||
|
||||
let mut scmp: oci::LinuxSeccomp = serde_json::from_str(data).unwrap();
|
||||
let mut arch: Vec<oci::Arch>;
|
||||
|
||||
if cfg!(target_endian = "little") {
|
||||
|
||||
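
The seccomp hunks above trade a hard `?` failure for a skip-and-record pass when a syscall name cannot be resolved. A minimal, dependency-free sketch of that partitioning pattern, with a stand-in resolver closure where the real code calls libseccomp's get_syscall_from_name():

```rust
// Partition syscall names into resolved numbers and unknown names instead of
// failing outright; the resolver here is a toy stand-in, not the libseccomp call.
fn filter_known_syscalls<F>(names: &[&str], resolve: F) -> (Vec<i32>, Vec<String>)
where
    F: Fn(&str) -> Result<i32, ()>,
{
    let mut known = Vec::new();
    let mut unknown = Vec::new();
    for &name in names {
        match resolve(name) {
            // Resolved: keep the number for rule generation.
            Ok(num) => known.push(num),
            // Unresolved: assume the kernel doesn't support it and record it
            // for logging rather than aborting the whole filter.
            Err(_) => unknown.push(name.to_string()),
        }
    }
    (known, unknown)
}

fn main() {
    // Toy resolver: only "dup3" is known in this sketch (292 is its x86_64 number).
    let resolve = |name: &str| if name == "dup3" { Ok(292) } else { Err(()) };
    let (known, unknown) = filter_known_syscalls(&["dup3", "invalid_syscall1"], resolve);
    assert_eq!(known, vec![292]);
    assert_eq!(unknown, vec!["invalid_syscall1"]);
}
```
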
@@ -3,6 +3,7 @@
// SPDX-License-Identifier: Apache-2.0
//

use nix::errno::Errno;
use nix::unistd;
use std::mem;
use std::os::unix::io::RawFd;
@@ -40,7 +41,7 @@ pub fn write_count(fd: RawFd, buf: &[u8], count: usize) -> Result<usize> {
            }

            Err(e) => {
                if e != nix::Error::EINTR {
                if e != nix::Error::from_errno(Errno::EINTR) {
                    return Err(e.into());
                }
            }
@@ -64,7 +65,7 @@ fn read_count(fd: RawFd, count: usize) -> Result<Vec<u8>> {
            }

            Err(e) => {
                if e != nix::Error::EINTR {
                if e != nix::Error::from_errno(Errno::EINTR) {
                    return Err(e.into());
                }
            }
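
Both hunks only change how the error is compared against EINTR, so an interrupted read or write is retried rather than surfaced. The same retry shape in portable std::io terms, as a sketch:

```rust
use std::io::{self, Read};

// Keep retrying a read that was interrupted by a signal; any other error is
// propagated to the caller, mirroring the EINTR check in write_count/read_count.
fn read_retrying<R: Read>(r: &mut R, buf: &mut [u8]) -> io::Result<usize> {
    loop {
        match r.read(buf) {
            Ok(n) => return Ok(n),
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
}

fn main() -> io::Result<()> {
    let data = b"hello";
    let mut cursor = io::Cursor::new(&data[..]);
    let mut buf = [0u8; 5];
    let n = read_retrying(&mut cursor, &mut buf)?;
    assert_eq!(n, 5);
    Ok(())
}
```
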
@@ -5,12 +5,13 @@

use crate::container::Config;
use anyhow::{anyhow, Context, Error, Result};
use nix::errno::Errno;
use oci::{Linux, LinuxIdMapping, LinuxNamespace, Spec};
use std::collections::HashMap;
use std::path::{Component, PathBuf};

fn einval() -> Error {
    anyhow!(nix::Error::EINVAL)
    anyhow!(nix::Error::from_errno(Errno::EINVAL))
}

fn get_linux(oci: &Spec) -> Result<&Linux> {
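
The einval() helper only changes how the EINVAL value is wrapped between nix versions. A dependency-free sketch of building the equivalent error from the raw errno:

```rust
use std::io;

// Build the EINVAL-style error the einval() helper above returns, using
// std::io::Error here instead of nix/anyhow to stay dependency-free.
fn einval() -> io::Error {
    io::Error::from_raw_os_error(22) // 22 == EINVAL on Linux
}

fn main() {
    let e = einval();
    assert_eq!(e.raw_os_error(), Some(22));
}
```
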
@@ -29,7 +29,9 @@ allowed = [
    "SetGuestDateTimeRequest",
    "SignalProcessRequest",
    "StartContainerRequest",
    "StartTracingRequest",
    "StatsContainerRequest",
    "StopTracingRequest",
    "TtyWinResizeRequest",
    "UpdateContainerRequest",
    "UpdateInterfaceRequest",
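
These entries feed the agent's endpoint allow-list. A sketch of the membership check such a list implies, with names copied from the entries above:

```rust
// Check a request name against an allow-list; a blocked endpoint is rejected
// before the handler runs (see the is_allowed! macro later in this diff).
fn is_allowed_endpoint(allowed: &[&str], endpoint: &str) -> bool {
    allowed.contains(&endpoint)
}

fn main() {
    let allowed = ["StartContainerRequest", "StatsContainerRequest"];
    assert!(is_allowed_endpoint(&allowed, "StartContainerRequest"));
    assert!(!is_allowed_endpoint(&allowed, "ExecProcessRequest"));
}
```
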
@@ -194,17 +194,7 @@ impl FromStr for AgentConfig {

impl AgentConfig {
    #[instrument]
    pub fn from_cmdline(file: &str, args: Vec<String>) -> Result<AgentConfig> {
        // If config file specified in the args, generate our config from it
        let config_position = args.iter().position(|a| a == "--config" || a == "-c");
        if let Some(config_position) = config_position {
            if let Some(config_file) = args.get(config_position + 1) {
                return AgentConfig::from_config_file(config_file);
            } else {
                panic!("The config argument wasn't formed properly: {:?}", args);
            }
        }

    pub fn from_cmdline(file: &str) -> Result<AgentConfig> {
        let mut config: AgentConfig = Default::default();
        let cmdline = fs::read_to_string(file)?;
        let params: Vec<&str> = cmdline.split_ascii_whitespace().collect();
@@ -906,8 +896,7 @@ mod tests {
            vars_to_unset.push(name);
        }

        let config =
            AgentConfig::from_cmdline(filename, vec![]).expect("Failed to parse command line");
        let config = AgentConfig::from_cmdline(filename).expect("Failed to parse command line");

        assert_eq!(d.debug_console, config.debug_console, "{}", msg);
        assert_eq!(d.dev_mode, config.dev_mode, "{}", msg);
@@ -928,40 +917,6 @@ mod tests {
        }
    }

    #[test]
    fn test_from_cmdline_with_args_overwrites() {
        let expected = AgentConfig {
            dev_mode: true,
            server_addr: "unix://@/tmp/foo.socket".to_string(),
            ..Default::default()
        };

        let example_config_file_contents =
            "dev_mode = true\nserver_addr = 'unix://@/tmp/foo.socket'";
        let dir = tempdir().expect("failed to create tmpdir");
        let file_path = dir.path().join("config.toml");
        let filename = file_path.to_str().expect("failed to create filename");
        let mut file = File::create(filename).unwrap_or_else(|_| panic!("failed to create file"));
        file.write_all(example_config_file_contents.as_bytes())
            .unwrap_or_else(|_| panic!("failed to write file contents"));

        let config =
            AgentConfig::from_cmdline("", vec!["--config".to_string(), filename.to_string()])
                .expect("Failed to parse command line");

        assert_eq!(expected.debug_console, config.debug_console);
        assert_eq!(expected.dev_mode, config.dev_mode);
        assert_eq!(
            expected.unified_cgroup_hierarchy,
            config.unified_cgroup_hierarchy,
        );
        assert_eq!(expected.log_level, config.log_level);
        assert_eq!(expected.hotplug_timeout, config.hotplug_timeout);
        assert_eq!(expected.container_pipe_size, config.container_pipe_size);
        assert_eq!(expected.server_addr, config.server_addr);
        assert_eq!(expected.tracing, config.tracing);
    }

    #[test]
    fn test_logrus_to_slog_level() {
        #[derive(Debug)]
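
The newer from_cmdline() accepts the process arguments and looks for a --config/-c pair before falling back to the kernel command line. A sketch of just that positional lookup:

```rust
// Scan an argument vector for "--config <file>" or "-c <file>" and return the
// file name, the same positional lookup the from_cmdline() change above adds.
fn config_file_from_args(args: &[String]) -> Option<&str> {
    let pos = args.iter().position(|a| a == "--config" || a == "-c")?;
    args.get(pos + 1).map(String::as_str)
}

fn main() {
    let args: Vec<String> = ["agent", "--config", "/etc/agent.toml"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    assert_eq!(config_file_from_args(&args), Some("/etc/agent.toml"));
    // A flag with no following value yields None.
    assert_eq!(config_file_from_args(&args[..1]), None);
}
```
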
@@ -149,8 +149,10 @@ fn run_in_child(slave_fd: libc::c_int, shell: String) -> Result<()> {

    // run shell
    let _ = unistd::execvp(cmd.as_c_str(), &args).map_err(|e| match e {
        nix::Error::UnknownErrno => std::process::exit(-2),
        _ => std::process::exit(e as i32),
        nix::Error::Sys(errno) => {
            std::process::exit(errno as i32);
        }
        _ => std::process::exit(-2),
    });

    Ok(())

@@ -3,6 +3,7 @@
// SPDX-License-Identifier: Apache-2.0
//

use libc::{c_uint, major, minor};
use nix::sys::stat;
use regex::Regex;
use std::collections::HashMap;
@@ -11,7 +12,7 @@ use std::fmt;
use std::fs;
use std::os::unix::ffi::OsStrExt;
use std::os::unix::fs::MetadataExt;
use std::path::{Path, PathBuf};
use std::path::Path;
use std::str::FromStr;
use std::sync::Arc;
use tokio::sync::Mutex;
@@ -22,7 +23,7 @@ use crate::linux_abi::*;
use crate::pci;
use crate::sandbox::Sandbox;
use crate::uevent::{wait_for_uevent, Uevent, UeventMatcher};
use anyhow::{anyhow, Context, Result};
use anyhow::{anyhow, Result};
use oci::{LinuxDeviceCgroup, LinuxResources, Spec};
use protocols::agent::Device;
use tracing::instrument;
@@ -52,6 +53,15 @@ pub const DRIVER_VFIO_GK_TYPE: &str = "vfio-gk";
// container as a VFIO device node
pub const DRIVER_VFIO_TYPE: &str = "vfio";

#[derive(Debug)]
struct DevIndexEntry {
    idx: usize,
    residx: Vec<usize>,
}

#[derive(Debug)]
struct DevIndex(HashMap<String, DevIndexEntry>);

#[instrument]
pub fn online_device(path: &str) -> Result<()> {
    fs::write(path, "1")?;
@@ -157,22 +167,20 @@ pub fn pcipath_to_sysfs(root_bus_sysfs: &str, pcipath: &pci::Path) -> Result<Str
        let bridgebuspath = format!("{}{}/pci_bus", root_bus_sysfs, relpath);
        let mut files: Vec<_> = fs::read_dir(&bridgebuspath)?.collect();

        match files.pop() {
            Some(busfile) if files.is_empty() => {
                bus = busfile?
                    .file_name()
                    .into_string()
                    .map_err(|e| anyhow!("Bad filename under {}: {:?}", &bridgebuspath, e))?;
            }
            _ => {
                return Err(anyhow!(
                    "Expected exactly one PCI bus in {}, got {} instead",
                    bridgebuspath,
                    // Adjust to original value as we've already popped
                    files.len() + 1
                ));
            }
        };
        if files.len() != 1 {
            return Err(anyhow!(
                "Expected exactly one PCI bus in {}, got {} instead",
                bridgebuspath,
                files.len()
            ));
        }

        // unwrap is safe, because of the length test above
        let busfile = files.pop().unwrap()?;
        bus = busfile
            .file_name()
            .into_string()
            .map_err(|e| anyhow!("Bad filename under {}: {:?}", &bridgebuspath, e))?;
    }

    Ok(relpath)
@@ -220,9 +228,8 @@ impl VirtioBlkPciMatcher {
    fn new(relpath: &str) -> VirtioBlkPciMatcher {
        let root_bus = create_pci_root_bus_path();
        let re = format!(r"^{}{}/virtio[0-9]+/block/", root_bus, relpath);

        VirtioBlkPciMatcher {
            rex: Regex::new(&re).expect("BUG: failed to compile VirtioBlkPciMatcher regex"),
            rex: Regex::new(&re).unwrap(),
        }
    }
}
@@ -260,7 +267,7 @@ impl VirtioBlkCCWMatcher {
            root_bus_path, device
        );
        VirtioBlkCCWMatcher {
            rex: Regex::new(&re).expect("BUG: failed to compile VirtioBlkCCWMatcher regex"),
            rex: Regex::new(&re).unwrap(),
        }
    }
}
@@ -416,15 +423,12 @@ fn scan_scsi_bus(scsi_addr: &str) -> Result<()> {

    for entry in fs::read_dir(SYSFS_SCSI_HOST_PATH)? {
        let host = entry?.file_name();

        let host_str = host.to_str().ok_or_else(|| {
            anyhow!(
                "failed to convert directory entry to unicode for file {:?}",
                host
            )
        })?;

        let scan_path = PathBuf::from(&format!("{}/{}/{}", SYSFS_SCSI_HOST_PATH, host_str, "scan"));
        let scan_path = format!(
            "{}/{}/{}",
            SYSFS_SCSI_HOST_PATH,
            host.to_str().unwrap(),
            "scan"
        );

        fs::write(scan_path, &scan_data)?;
    }
@@ -432,201 +436,91 @@ fn scan_scsi_bus(scsi_addr: &str) -> Result<()> {
    Ok(())
}

#[derive(Debug, Clone)]
struct DevNumUpdate {
    // the major and minor numbers for the device within the guest
    guest_major: i64,
    guest_minor: i64,
}

impl DevNumUpdate {
    fn from_vm_path<T: AsRef<Path>>(vm_path: T) -> Result<Self> {
        let vm_path = vm_path.as_ref();

        if !vm_path.exists() {
            return Err(anyhow!("VM device path {:?} doesn't exist", vm_path));
        }

        let devid = fs::metadata(vm_path)?.rdev();
        let guest_major = stat::major(devid) as i64;
        let guest_minor = stat::minor(devid) as i64;

        Ok(DevNumUpdate {
            guest_major,
            guest_minor,
        })
    }
}

// Represents the device-node and resource related updates to the OCI
// spec needed for a particular device
#[derive(Debug, Clone)]
struct DevUpdate {
    num: DevNumUpdate,
    // an optional new path to update the device to in the "inner" container
    // specification
    final_path: Option<String>,
}

impl DevUpdate {
    fn from_vm_path<T: AsRef<Path>>(vm_path: T, final_path: String) -> Result<Self> {
        Ok(DevUpdate {
            final_path: Some(final_path),
            ..DevNumUpdate::from_vm_path(vm_path)?.into()
        })
    }
}

impl From<DevNumUpdate> for DevUpdate {
    fn from(num: DevNumUpdate) -> Self {
        DevUpdate {
            num,
            final_path: None,
        }
    }
}

// Represents the updates to the OCI spec needed for a particular device
#[derive(Debug, Clone, Default)]
struct SpecUpdate {
    dev: Option<DevUpdate>,
    // optional corrections for PCI addresses
    pci: Vec<(pci::Address, pci::Address)>,
}

impl<T: Into<DevUpdate>> From<T> for SpecUpdate {
    fn from(dev: T) -> Self {
        SpecUpdate {
            dev: Some(dev.into()),
            pci: Vec::new(),
        }
    }
}

// update_spec_devices updates the device list in the OCI spec to make
// update_spec_device updates the device list in the OCI spec to make
// it include details appropriate for the VM, instead of the host. It
// is given a map of (container_path => update) where:
//     container_path: the path to the device in the original OCI spec
//     update: information on changes to make to the device
// is given the host path to the device (to locate the device in the
// original OCI spec) and the VM path which it uses to determine the
// VM major/minor numbers, and the final path with which to present
// the device in the (inner) container
#[instrument]
fn update_spec_devices(spec: &mut Spec, mut updates: HashMap<&str, DevUpdate>) -> Result<()> {
fn update_spec_device(
    spec: &mut Spec,
    devidx: &DevIndex,
    host_path: &str,
    vm_path: &str,
    final_path: &str,
) -> Result<()> {
    let major_id: c_uint;
    let minor_id: c_uint;

    // If no container_path is provided, we won't be able to match and
    // update the device in the OCI spec device list. This is an error.
    if host_path.is_empty() {
        return Err(anyhow!("Host path cannot empty for device"));
    }

    let linux = spec
        .linux
        .as_mut()
        .ok_or_else(|| anyhow!("Spec didn't contain linux field"))?;
    let mut res_updates = HashMap::<(&str, i64, i64), DevNumUpdate>::with_capacity(updates.len());
        .ok_or_else(|| anyhow!("Spec didn't container linux field"))?;

    for specdev in &mut linux.devices {
        if let Some(update) = updates.remove(specdev.path.as_str()) {
            let host_major = specdev.major;
            let host_minor = specdev.minor;
    if !Path::new(vm_path).exists() {
        return Err(anyhow!("vm_path:{} doesn't exist", vm_path));
    }

    let meta = fs::metadata(vm_path)?;
    let dev_id = meta.rdev();
    unsafe {
        major_id = major(dev_id);
        minor_id = minor(dev_id);
    }

    info!(
        sl!(),
        "update_spec_device(): vm_path={}, major: {}, minor: {}\n", vm_path, major_id, minor_id
    );

    if let Some(idxdata) = devidx.0.get(host_path) {
        let dev = &mut linux.devices[idxdata.idx];
        let host_major = dev.major;
        let host_minor = dev.minor;

        dev.major = major_id as i64;
        dev.minor = minor_id as i64;
        dev.path = final_path.to_string();

        info!(
            sl!(),
            "change the device from path: {} major: {} minor: {} to vm device path: {} major: {} minor: {}",
            host_path,
            host_major,
            host_minor,
            dev.path,
            dev.major,
            dev.minor,
        );

        // Resources must be updated since they are used to identify
        // the device in the devices cgroup.
        for ridx in &idxdata.residx {
            // unwrap is safe, because residx would be empty if there
            // were no resources
            let res = &mut linux.resources.as_mut().unwrap().devices[*ridx];
            res.major = Some(major_id as i64);
            res.minor = Some(minor_id as i64);

            info!(
                sl!(),
                "update_spec_devices() updating device";
                "container_path" => &specdev.path,
                "type" => &specdev.r#type,
                "host_major" => host_major,
                "host_minor" => host_minor,
                "guest_major" => update.num.guest_major,
                "guest_minor" => update.num.guest_minor,
                "final_path" => update.final_path.as_ref(),
                "set resources for device major: {} minor: {}\n", major_id, minor_id
            );

            specdev.major = update.num.guest_major;
            specdev.minor = update.num.guest_minor;
            if let Some(final_path) = update.final_path {
                specdev.path = final_path;
            }

            if res_updates
                .insert(
                    (specdev.r#type.as_str(), host_major, host_minor),
                    update.num,
                )
                .is_some()
            {
                return Err(anyhow!(
                    "Conflicting resource updates for host_major={} host_minor={}",
                    host_major,
                    host_minor
                ));
            }
        }
        Ok(())
    } else {
        Err(anyhow!(
            "Should have found a matching device {} in the spec",
            vm_path
        ))
    }

    // Make sure we applied all of our updates
    if !updates.is_empty() {
        return Err(anyhow!(
            "Missing devices in OCI spec: {:?}",
            updates
                .keys()
                .map(|d| format!("{:?}", d))
                .collect::<Vec<_>>()
                .join(" ")
        ));
    }

    if let Some(resources) = linux.resources.as_mut() {
        for r in &mut resources.devices {
            if let (Some(host_major), Some(host_minor)) = (r.major, r.minor) {
                if let Some(update) = res_updates.get(&(r.r#type.as_str(), host_major, host_minor))
                {
                    info!(
                        sl!(),
                        "update_spec_devices() updating resource";
                        "type" => &r.r#type,
                        "host_major" => host_major,
                        "host_minor" => host_minor,
                        "guest_major" => update.guest_major,
                        "guest_minor" => update.guest_minor,
                    );

                    r.major = Some(update.guest_major);
                    r.minor = Some(update.guest_minor);
                }
            }
        }
    }

    Ok(())
}

// update_spec_pci PCI addresses in the OCI spec to be guest addresses
// instead of host addresses. It is given a map of (host address =>
// guest address)
#[instrument]
fn update_spec_pci(spec: &mut Spec, updates: HashMap<pci::Address, pci::Address>) -> Result<()> {
    // Correct PCI addresses in the environment
    if let Some(process) = spec.process.as_mut() {
        for envvar in process.env.iter_mut() {
            let eqpos = envvar
                .find('=')
                .ok_or_else(|| anyhow!("Malformed OCI env entry {:?}", envvar))?;

            let (name, eqval) = envvar.split_at(eqpos);
            let val = &eqval[1..];

            if !name.starts_with("PCIDEVICE_") {
                continue;
            }

            let mut guest_addrs = Vec::<String>::new();

            for host_addr in val.split(',') {
                let host_addr = pci::Address::from_str(host_addr)
                    .with_context(|| format!("Can't parse {} environment variable", name))?;
                let guest_addr = updates
                    .get(&host_addr)
                    .ok_or_else(|| anyhow!("Unable to translate host PCI address {}", host_addr))?;
                guest_addrs.push(format!("{}", guest_addr));
            }

            envvar.replace_range(eqpos + 1.., guest_addrs.join(",").as_str());
        }
    }

    Ok(())
}

// device.Id should be the predicted device name (vda, vdb, ...)
@@ -634,25 +528,43 @@ fn update_spec_pci(spec: &mut Spec, updates: HashMap<pci::Address, pci::Address>
#[instrument]
async fn virtiommio_blk_device_handler(
    device: &Device,
    spec: &mut Spec,
    _sandbox: &Arc<Mutex<Sandbox>>,
) -> Result<SpecUpdate> {
    devidx: &DevIndex,
) -> Result<()> {
    if device.vm_path.is_empty() {
        return Err(anyhow!("Invalid path for virtio mmio blk device"));
    }

    Ok(DevNumUpdate::from_vm_path(&device.vm_path)?.into())
    update_spec_device(
        spec,
        devidx,
        &device.container_path,
        &device.vm_path,
        &device.container_path,
    )
}

// device.Id should be a PCI path string
#[instrument]
async fn virtio_blk_device_handler(
    device: &Device,
    spec: &mut Spec,
    sandbox: &Arc<Mutex<Sandbox>>,
) -> Result<SpecUpdate> {
    devidx: &DevIndex,
) -> Result<()> {
    let mut dev = device.clone();
    let pcipath = pci::Path::from_str(&device.id)?;
    let vm_path = get_virtio_blk_pci_device_name(sandbox, &pcipath).await?;

    Ok(DevNumUpdate::from_vm_path(vm_path)?.into())
    dev.vm_path = get_virtio_blk_pci_device_name(sandbox, &pcipath).await?;

    update_spec_device(
        spec,
        devidx,
        &dev.container_path,
        &dev.vm_path,
        &dev.container_path,
    )
}

// device.id should be a CCW path string
@@ -660,17 +572,30 @@ async fn virtio_blk_device_handler(
#[instrument]
async fn virtio_blk_ccw_device_handler(
    device: &Device,
    spec: &mut Spec,
    sandbox: &Arc<Mutex<Sandbox>>,
) -> Result<SpecUpdate> {
    devidx: &DevIndex,
) -> Result<()> {
    let mut dev = device.clone();
    let ccw_device = ccw::Device::from_str(&device.id)?;
    let vm_path = get_virtio_blk_ccw_device_name(sandbox, &ccw_device).await?;

    Ok(DevNumUpdate::from_vm_path(vm_path)?.into())
    dev.vm_path = get_virtio_blk_ccw_device_name(sandbox, &ccw_device).await?;
    update_spec_device(
        spec,
        devidx,
        &dev.container_path,
        &dev.vm_path,
        &dev.container_path,
    )
}

#[cfg(not(target_arch = "s390x"))]
#[instrument]
async fn virtio_blk_ccw_device_handler(_: &Device, _: &Arc<Mutex<Sandbox>>) -> Result<SpecUpdate> {
async fn virtio_blk_ccw_device_handler(
    _: &Device,
    _: &mut Spec,
    _: &Arc<Mutex<Sandbox>>,
    _: &DevIndex,
) -> Result<()> {
    Err(anyhow!("CCW is only supported on s390x"))
}

@@ -678,23 +603,39 @@ async fn virtio_blk_ccw_device_handler(_: &Device, _: &Arc<Mutex<Sandbox>>) -> R
#[instrument]
async fn virtio_scsi_device_handler(
    device: &Device,
    spec: &mut Spec,
    sandbox: &Arc<Mutex<Sandbox>>,
) -> Result<SpecUpdate> {
    let vm_path = get_scsi_device_name(sandbox, &device.id).await?;

    Ok(DevNumUpdate::from_vm_path(vm_path)?.into())
    devidx: &DevIndex,
) -> Result<()> {
    let mut dev = device.clone();
    dev.vm_path = get_scsi_device_name(sandbox, &device.id).await?;
    update_spec_device(
        spec,
        devidx,
        &dev.container_path,
        &dev.vm_path,
        &dev.container_path,
    )
}

#[instrument]
async fn virtio_nvdimm_device_handler(
    device: &Device,
    spec: &mut Spec,
    _sandbox: &Arc<Mutex<Sandbox>>,
) -> Result<SpecUpdate> {
    devidx: &DevIndex,
) -> Result<()> {
    if device.vm_path.is_empty() {
        return Err(anyhow!("Invalid path for nvdimm device"));
    }

    Ok(DevNumUpdate::from_vm_path(&device.vm_path)?.into())
    update_spec_device(
        spec,
        devidx,
        &device.container_path,
        &device.vm_path,
        &device.container_path,
    )
}

fn split_vfio_option(opt: &str) -> Option<(&str, &str)> {
@@ -712,53 +653,80 @@ fn split_vfio_option(opt: &str) -> Option<(&str, &str)> {
// Each option should have the form "DDDD:BB:DD.F=<pcipath>"
//     DDDD:BB:DD.F is the device's PCI address in the host
//     <pcipath> is a PCI path to the device in the guest (see pci.rs)
async fn vfio_device_handler(device: &Device, sandbox: &Arc<Mutex<Sandbox>>) -> Result<SpecUpdate> {
async fn vfio_device_handler(
    device: &Device,
    spec: &mut Spec,
    sandbox: &Arc<Mutex<Sandbox>>,
    devidx: &DevIndex,
) -> Result<()> {
    let vfio_in_guest = device.field_type != DRIVER_VFIO_GK_TYPE;
    let mut pci_fixups = Vec::<(pci::Address, pci::Address)>::new();
    let mut group = None;

    for opt in device.options.iter() {
        let (host, pcipath) =
        let (_, pcipath) =
            split_vfio_option(opt).ok_or_else(|| anyhow!("Malformed VFIO option {:?}", opt))?;
        let host =
            pci::Address::from_str(host).context("Bad host PCI address in VFIO option {:?}")?;
        let pcipath = pci::Path::from_str(pcipath)?;

        let guestdev = wait_for_pci_device(sandbox, &pcipath).await?;
        if vfio_in_guest {
            pci_driver_override(SYSFS_BUS_PCI_PATH, guestdev, "vfio-pci")?;

            // Devices must have an IOMMU group to be usable via VFIO
            let devgroup = pci_iommu_group(SYSFS_BUS_PCI_PATH, guestdev)?
                .ok_or_else(|| anyhow!("{} has no IOMMU group", guestdev))?;

            if let Some(g) = group {
                if g != devgroup {
                    return Err(anyhow!("{} is not in guest IOMMU group {}", guestdev, g));
                }
            let devgroup = pci_iommu_group(SYSFS_BUS_PCI_PATH, guestdev)?;
            if devgroup.is_none() {
                // Devices must have an IOMMU group to be usable via VFIO
                return Err(anyhow!("{} has no IOMMU group", guestdev));
            }

            group = Some(devgroup);
            if group.is_some() && group != devgroup {
                // If PCI devices associated with the same VFIO device
                // (and therefore group) in the host don't end up in
                // the same group in the guest, something has gone
                // horribly wrong
                return Err(anyhow!(
                    "{} is not in guest IOMMU group {}",
                    guestdev,
                    group.unwrap()
                ));
            }

            pci_fixups.push((host, guestdev));
            group = devgroup;
        }
    }

    let dev_update = if vfio_in_guest {
    if vfio_in_guest {
        // If there are any devices at all, logic above ensures that group is not None
        let group = group.ok_or_else(|| anyhow!("failed to get VFIO group"))?;
        let group = group.unwrap();
        let vmpath = get_vfio_device_name(sandbox, group).await?;

        let vm_path = get_vfio_device_name(sandbox, group).await?;
        update_spec_device(spec, devidx, &device.container_path, &vmpath, &vmpath)?;
    }

        Some(DevUpdate::from_vm_path(&vm_path, vm_path.clone())?)
    } else {
        None
    };
    Ok(())
}

    Ok(SpecUpdate {
        dev: dev_update,
        pci: pci_fixups,
    })
impl DevIndex {
    fn new(spec: &Spec) -> DevIndex {
        let mut map = HashMap::new();

        if let Some(linux) = spec.linux.as_ref() {
            for (i, d) in linux.devices.iter().enumerate() {
                let mut residx = Vec::new();

                if let Some(linuxres) = linux.resources.as_ref() {
                    for (j, r) in linuxres.devices.iter().enumerate() {
                        if r.r#type == d.r#type
                            && r.major == Some(d.major)
                            && r.minor == Some(d.minor)
                        {
                            residx.push(j);
                        }
                    }
                }
                map.insert(d.path.clone(), DevIndexEntry { idx: i, residx });
            }
        }
        DevIndex(map)
    }
}

#[instrument]
@@ -767,40 +735,22 @@ pub async fn add_devices(
    spec: &mut Spec,
    sandbox: &Arc<Mutex<Sandbox>>,
) -> Result<()> {
    let mut dev_updates = HashMap::<&str, DevUpdate>::with_capacity(devices.len());
    let mut pci_updates = HashMap::<pci::Address, pci::Address>::new();
    let devidx = DevIndex::new(spec);

    for device in devices.iter() {
        let update = add_device(device, sandbox).await?;
        if let Some(dev_update) = update.dev {
            if dev_updates
                .insert(&device.container_path, dev_update)
                .is_some()
            {
                return Err(anyhow!(
                    "Conflicting device updates for {}",
                    &device.container_path
                ));
            }

            for (host, guest) in update.pci {
                if let Some(other_guest) = pci_updates.insert(host, guest) {
                    return Err(anyhow!(
                        "Conflicting guest address for host device {} ({} versus {})",
                        host,
                        guest,
                        other_guest
                    ));
                }
            }
        }
        add_device(device, spec, sandbox, &devidx).await?;
    }

    update_spec_devices(spec, dev_updates)
    Ok(())
}

#[instrument]
async fn add_device(device: &Device, sandbox: &Arc<Mutex<Sandbox>>) -> Result<SpecUpdate> {
async fn add_device(
    device: &Device,
    spec: &mut Spec,
    sandbox: &Arc<Mutex<Sandbox>>,
    devidx: &DevIndex,
) -> Result<()> {
    // log before validation to help with debugging gRPC protocol version differences.
    info!(sl!(), "device-id: {}, device-type: {}, device-vm-path: {}, device-container-path: {}, device-options: {:?}",
        device.id, device.field_type, device.vm_path, device.container_path, device.options);
@@ -818,12 +768,14 @@ async fn add_device(device: &Device, sandbox: &Arc<Mutex<Sandbox>>) -> Result<Sp
    }

    match device.field_type.as_str() {
        DRIVER_BLK_TYPE => virtio_blk_device_handler(device, sandbox).await,
        DRIVER_BLK_CCW_TYPE => virtio_blk_ccw_device_handler(device, sandbox).await,
        DRIVER_MMIO_BLK_TYPE => virtiommio_blk_device_handler(device, sandbox).await,
        DRIVER_NVDIMM_TYPE => virtio_nvdimm_device_handler(device, sandbox).await,
        DRIVER_SCSI_TYPE => virtio_scsi_device_handler(device, sandbox).await,
        DRIVER_VFIO_GK_TYPE | DRIVER_VFIO_TYPE => vfio_device_handler(device, sandbox).await,
        DRIVER_BLK_TYPE => virtio_blk_device_handler(device, spec, sandbox, devidx).await,
        DRIVER_BLK_CCW_TYPE => virtio_blk_ccw_device_handler(device, spec, sandbox, devidx).await,
        DRIVER_MMIO_BLK_TYPE => virtiommio_blk_device_handler(device, spec, sandbox, devidx).await,
        DRIVER_NVDIMM_TYPE => virtio_nvdimm_device_handler(device, spec, sandbox, devidx).await,
        DRIVER_SCSI_TYPE => virtio_scsi_device_handler(device, spec, sandbox, devidx).await,
        DRIVER_VFIO_GK_TYPE | DRIVER_VFIO_TYPE => {
            vfio_device_handler(device, spec, sandbox, devidx).await
        }
        _ => Err(anyhow!("Unknown device type {}", device.field_type)),
    }
}
@@ -843,8 +795,11 @@ pub fn update_device_cgroup(spec: &mut Spec) -> Result<()> {
        .as_mut()
        .ok_or_else(|| anyhow!("Spec didn't container linux field"))?;

    let resources = linux.resources.get_or_insert(LinuxResources::default());
    if linux.resources.is_none() {
        linux.resources = Some(LinuxResources::default());
    }

    let resources = linux.resources.as_mut().unwrap();
    resources.devices.push(LinuxDeviceCgroup {
        allow: false,
        major: Some(major),
@@ -860,8 +815,7 @@ pub fn update_device_cgroup(spec: &mut Spec) -> Result<()> {
mod tests {
    use super::*;
    use crate::uevent::spawn_test_watcher;
    use oci::{Linux, Process};
    use std::iter::FromIterator;
    use oci::Linux;
    use tempfile::tempdir;

    #[test]
@@ -886,36 +840,28 @@ mod tests {
    }

    #[test]
    fn test_update_spec_devices() {
    fn test_update_spec_device() {
        let (major, minor) = (7, 2);
        let mut spec = Spec::default();

        // vm_path empty
        let update = DevNumUpdate::from_vm_path("");
        assert!(update.is_err());
        // container_path empty
        let container_path = "";
        let vm_path = "";
        let devidx = DevIndex::new(&spec);
        let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path);
        assert!(res.is_err());

        // linux is empty
        let container_path = "/dev/null";
        let vm_path = "/dev/null";
        let res = update_spec_devices(
            &mut spec,
            HashMap::from_iter(vec![(
                container_path,
                DevNumUpdate::from_vm_path(vm_path).unwrap().into(),
            )]),
        );
        let devidx = DevIndex::new(&spec);
        let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path);
        assert!(res.is_err());

        spec.linux = Some(Linux::default());

        // linux.devices doesn't contain the updated device
        let res = update_spec_devices(
            &mut spec,
            HashMap::from_iter(vec![(
                container_path,
                DevNumUpdate::from_vm_path(vm_path).unwrap().into(),
            )]),
        );
        // linux.devices is empty
        let devidx = DevIndex::new(&spec);
        let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path);
        assert!(res.is_err());

        spec.linux.as_mut().unwrap().devices = vec![oci::LinuxDevice {
@@ -925,14 +871,16 @@ mod tests {
            ..oci::LinuxDevice::default()
        }];

        // vm_path empty
        let devidx = DevIndex::new(&spec);
        let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path);
        assert!(res.is_err());

        let vm_path = "/dev/null";

        // guest and host path are not the same
        let res = update_spec_devices(
            &mut spec,
            HashMap::from_iter(vec![(
                container_path,
                DevNumUpdate::from_vm_path(vm_path).unwrap().into(),
            )]),
        );
        let devidx = DevIndex::new(&spec);
        let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path);
        assert!(
            res.is_err(),
            "container_path={:?} vm_path={:?} spec={:?}",
@@ -944,13 +892,8 @@ mod tests {
        spec.linux.as_mut().unwrap().devices[0].path = container_path.to_string();

        // spec.linux.resources is empty
        let res = update_spec_devices(
            &mut spec,
            HashMap::from_iter(vec![(
                container_path,
                DevNumUpdate::from_vm_path(vm_path).unwrap().into(),
            )]),
        );
        let devidx = DevIndex::new(&spec);
        let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path);
        assert!(res.is_ok());

        // update both devices and cgroup lists
@@ -970,18 +913,13 @@ mod tests {
            ..oci::LinuxResources::default()
        });

        let res = update_spec_devices(
            &mut spec,
            HashMap::from_iter(vec![(
                container_path,
                DevNumUpdate::from_vm_path(vm_path).unwrap().into(),
            )]),
        );
        let devidx = DevIndex::new(&spec);
        let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path);
        assert!(res.is_ok());
    }

    #[test]
    fn test_update_spec_devices_guest_host_conflict() {
    fn test_update_spec_device_guest_host_conflict() {
        let null_rdev = fs::metadata("/dev/null").unwrap().rdev();
        let zero_rdev = fs::metadata("/dev/zero").unwrap().rdev();
        let full_rdev = fs::metadata("/dev/full").unwrap().rdev();
@@ -1030,6 +968,7 @@ mod tests {
            }),
            ..Spec::default()
        };
        let devidx = DevIndex::new(&spec);

        let container_path_a = "/dev/a";
        let vm_path_a = "/dev/zero";
@@ -1055,17 +994,34 @@ mod tests {
        assert_eq!(Some(host_major_b), specresources.devices[1].major);
        assert_eq!(Some(host_minor_b), specresources.devices[1].minor);

        let updates = HashMap::from_iter(vec![
            (
                container_path_a,
                DevNumUpdate::from_vm_path(vm_path_a).unwrap().into(),
            ),
            (
                container_path_b,
                DevNumUpdate::from_vm_path(vm_path_b).unwrap().into(),
            ),
        ]);
        let res = update_spec_devices(&mut spec, updates);
        let res = update_spec_device(
            &mut spec,
            &devidx,
            container_path_a,
            vm_path_a,
            container_path_a,
        );
        assert!(res.is_ok());

        let specdevices = &spec.linux.as_ref().unwrap().devices;
        assert_eq!(guest_major_a, specdevices[0].major);
        assert_eq!(guest_minor_a, specdevices[0].minor);
        assert_eq!(host_major_b, specdevices[1].major);
        assert_eq!(host_minor_b, specdevices[1].minor);

        let specresources = spec.linux.as_ref().unwrap().resources.as_ref().unwrap();
        assert_eq!(Some(guest_major_a), specresources.devices[0].major);
        assert_eq!(Some(guest_minor_a), specresources.devices[0].minor);
        assert_eq!(Some(host_major_b), specresources.devices[1].major);
        assert_eq!(Some(host_minor_b), specresources.devices[1].minor);

        let res = update_spec_device(
            &mut spec,
            &devidx,
            container_path_b,
            vm_path_b,
            container_path_b,
        );
        assert!(res.is_ok());

        let specdevices = &spec.linux.as_ref().unwrap().devices;
@@ -1082,7 +1038,7 @@ mod tests {
    }

    #[test]
    fn test_update_spec_devices_char_block_conflict() {
    fn test_update_spec_device_char_block_conflict() {
        let null_rdev = fs::metadata("/dev/null").unwrap().rdev();

        let guest_major = stat::major(null_rdev) as i64;
@@ -1129,6 +1085,7 @@ mod tests {
            }),
            ..Spec::default()
        };
        let devidx = DevIndex::new(&spec);

        let container_path = "/dev/char";
        let vm_path = "/dev/null";
@@ -1139,13 +1096,7 @@ mod tests {
        assert_eq!(Some(host_major), specresources.devices[1].major);
        assert_eq!(Some(host_minor), specresources.devices[1].minor);

        let res = update_spec_devices(
            &mut spec,
            HashMap::from_iter(vec![(
                container_path,
                DevNumUpdate::from_vm_path(vm_path).unwrap().into(),
            )]),
        );
        let res = update_spec_device(&mut spec, &devidx, container_path, vm_path, container_path);
        assert!(res.is_ok());

        // Only the char device, not the block device should be updated
@@ -1157,19 +1108,19 @@ mod tests {
    }

    #[test]
    fn test_update_spec_devices_final_path() {
    fn test_update_spec_device_final_path() {
        let null_rdev = fs::metadata("/dev/null").unwrap().rdev();
        let guest_major = stat::major(null_rdev) as i64;
        let guest_minor = stat::minor(null_rdev) as i64;

        let container_path = "/dev/original";
        let host_path = "/dev/host";
        let host_major: i64 = 99;
        let host_minor: i64 = 99;

        let mut spec = Spec {
            linux: Some(Linux {
                devices: vec![oci::LinuxDevice {
                    path: container_path.to_string(),
                    path: host_path.to_string(),
                    r#type: "c".to_string(),
                    major: host_major,
                    minor: host_minor,
@@ -1179,17 +1130,12 @@ mod tests {
            }),
            ..Spec::default()
        };
        let devidx = DevIndex::new(&spec);

        let vm_path = "/dev/null";
        let final_path = "/dev/new";
        let final_path = "/dev/final";

        let res = update_spec_devices(
            &mut spec,
            HashMap::from_iter(vec![(
                container_path,
                DevUpdate::from_vm_path(vm_path, final_path.to_string()).unwrap(),
            )]),
        );
        let res = update_spec_device(&mut spec, &devidx, host_path, vm_path, final_path);
        assert!(res.is_ok());

        let specdevices = &spec.linux.as_ref().unwrap().devices;
@@ -1198,48 +1144,6 @@ mod tests {
        assert_eq!(final_path, specdevices[0].path);
    }

    #[test]
    fn test_update_spec_pci() {
        let example_map = [
            // Each is a host,guest pair of pci addresses
            ("0000:1a:01.0", "0000:01:01.0"),
            ("0000:1b:02.0", "0000:01:02.0"),
            // This one has the same host address as guest address
            // above, to test that we're not double-translating
            ("0000:01:01.0", "ffff:02:1f.7"),
        ];

        let mut spec = Spec {
            process: Some(Process {
                env: vec![
                    "PCIDEVICE_x=0000:1a:01.0,0000:1b:02.0".to_string(),
                    "PCIDEVICE_y=0000:01:01.0".to_string(),
                    "NOTAPCIDEVICE_blah=abcd:ef:01.0".to_string(),
                ],
                ..Process::default()
            }),
            ..Spec::default()
        };

        let pci_fixups = example_map
            .iter()
            .map(|(h, g)| {
                (
                    pci::Address::from_str(h).unwrap(),
                    pci::Address::from_str(g).unwrap(),
                )
            })
            .collect();

        let res = update_spec_pci(&mut spec, pci_fixups);
        assert!(res.is_ok());

        let env = &spec.process.as_ref().unwrap().env;
        assert_eq!(env[0], "PCIDEVICE_x=0000:01:01.0,0000:01:02.0");
        assert_eq!(env[1], "PCIDEVICE_y=ffff:02:1f.7");
        assert_eq!(env[2], "NOTAPCIDEVICE_blah=abcd:ef:01.0");
    }

    #[test]
    fn test_pcipath_to_sysfs() {
        let testdir = tempdir().expect("failed to create tmpdir");
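
The DevIndex introduced above maps each device path to its position in the spec's device list plus the positions of matching cgroup resource entries, so later updates become single lookups. A self-contained sketch of that indexing with stand-in types:

```rust
use std::collections::HashMap;

// Minimal stand-ins for the OCI device and resource entries indexed above.
struct Dev { path: String, major: i64, minor: i64 }
struct ResDev { major: Option<i64>, minor: Option<i64> }

// Index device paths to their position in the devices list plus the positions
// of cgroup resource entries with matching major/minor numbers.
fn index_devices(devs: &[Dev], res: &[ResDev]) -> HashMap<String, (usize, Vec<usize>)> {
    let mut map = HashMap::new();
    for (i, d) in devs.iter().enumerate() {
        let residx: Vec<usize> = res
            .iter()
            .enumerate()
            .filter(|(_, r)| r.major == Some(d.major) && r.minor == Some(d.minor))
            .map(|(j, _)| j)
            .collect();
        map.insert(d.path.clone(), (i, residx));
    }
    map
}

fn main() {
    let devs = vec![Dev { path: "/dev/null".into(), major: 1, minor: 3 }];
    let res = vec![ResDev { major: Some(1), minor: Some(3) }];
    let idx = index_devices(&devs, &res);
    assert_eq!(idx["/dev/null"], (0, vec![0]));
}
```
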
@@ -20,7 +20,6 @@ extern crate scopeguard;
extern crate slog;

use anyhow::{anyhow, Context, Result};
use clap::{AppSettings, Parser};
use nix::fcntl::OFlag;
use nix::sys::socket::{self, AddressFamily, SockAddr, SockFlag, SockType};
use nix::unistd::{self, dup, Pid};
@@ -81,32 +80,10 @@ const NAME: &str = "kata-agent";

lazy_static! {
    static ref AGENT_CONFIG: Arc<RwLock<AgentConfig>> = Arc::new(RwLock::new(
        // Note: We can't do AgentOpts.parse() here to send through the processed arguments to AgentConfig
        // clap::Parser::parse() greedily process all command line input including cargo test parameters,
        // so should only be used inside main.
        AgentConfig::from_cmdline("/proc/cmdline", env::args().collect()).unwrap()
        AgentConfig::from_cmdline("/proc/cmdline").unwrap()
    ));
}

#[derive(Parser)]
// The default clap version info doesn't match our form, so we need to override it
#[clap(global_setting(AppSettings::DisableVersionFlag))]
struct AgentOpts {
    /// Print the version information
    #[clap(short, long)]
    version: bool,
    #[clap(subcommand)]
    subcmd: Option<SubCommand>,
    /// Specify a custom agent config file
    #[clap(short, long)]
    config: Option<String>,
}

#[derive(Parser)]
enum SubCommand {
    Init {},
}

#[instrument]
fn announce(logger: &Logger, config: &AgentConfig) {
    info!(logger, "announce";
@@ -136,10 +113,10 @@ async fn create_logger_task(rfd: RawFd, vsock_port: u32, shutdown: Receiver<bool
        )?;

        let addr = SockAddr::new_vsock(libc::VMADDR_CID_ANY, vsock_port);
        socket::bind(listenfd, &addr)?;
        socket::listen(listenfd, 1)?;
        socket::bind(listenfd, &addr).unwrap();
        socket::listen(listenfd, 1).unwrap();

        writer = Box::new(util::get_vsock_stream(listenfd).await?);
        writer = Box::new(util::get_vsock_stream(listenfd).await.unwrap());
    } else {
        writer = Box::new(tokio::io::stdout());
    }
@@ -278,9 +255,9 @@ async fn real_main() -> std::result::Result<(), Box<dyn std::error::Error>> {
}

fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
    let args = AgentOpts::parse();
    let args: Vec<String> = env::args().collect();

    if args.version {
    if args.len() == 2 && args[1] == "--version" {
        println!(
            "{} version {} (api version: {}, commit version: {}, type: rust)",
            NAME,
@@ -288,10 +265,11 @@ fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
            version::API_VERSION,
            version::VERSION_COMMIT,
        );

        exit(0);
    }

    if let Some(SubCommand::Init {}) = args.subcmd {
    if args.len() == 2 && args[1] == "init" {
        reset_sigpipe();
        rustjail::container::init_child();
        exit(0);
@@ -348,7 +326,7 @@ async fn start_sandbox(
    sandbox.lock().await.sender = Some(tx);

    // vsock:///dev/vsock, port
    let mut server = rpc::start(sandbox.clone(), config.server_addr.as_str())?;
    let mut server = rpc::start(sandbox.clone(), config.server_addr.as_str());
    server.start().await?;

    rx.await?;
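
One side of this diff drops the clap-derived AgentOpts in favour of inspecting the raw argument vector. A sketch of that dispatch shape, with placeholder actions:

```rust
use std::env;
use std::process::exit;

// Dispatch on the raw argument vector the way the non-clap main() does;
// the version string and actions here are placeholders.
fn main() {
    let args: Vec<String> = env::args().collect();

    if args.len() == 2 && args[1] == "--version" {
        println!("example-agent version 0.0.0 (type: rust)");
        exit(0);
    }

    if args.len() == 2 && args[1] == "init" {
        // guest init-mode setup would run here
        exit(0);
    }

    // fall through to normal startup
}
```
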
@@ -8,7 +8,6 @@ extern crate procfs;
use prometheus::{Encoder, Gauge, GaugeVec, IntCounter, TextEncoder};

use anyhow::Result;
use slog::warn;
use tracing::instrument;

const NAMESPACE_KATA_AGENT: &str = "kata_agent";
@@ -75,7 +74,7 @@ pub fn get_metrics(_: &protocols::agent::GetMetricsRequest) -> Result<String> {
    AGENT_SCRAPE_COUNT.inc();

    // update agent process metrics
    update_agent_metrics()?;
    update_agent_metrics();

    // update guest os metrics
    update_guest_metrics();
@@ -85,26 +84,23 @@ pub fn get_metrics(_: &protocols::agent::GetMetricsRequest) -> Result<String> {

    let mut buffer = Vec::new();
    let encoder = TextEncoder::new();
    encoder.encode(&metric_families, &mut buffer)?;
    encoder.encode(&metric_families, &mut buffer).unwrap();

    Ok(String::from_utf8(buffer)?)
    Ok(String::from_utf8(buffer).unwrap())
}

#[instrument]
fn update_agent_metrics() -> Result<()> {
fn update_agent_metrics() {
    let me = procfs::process::Process::myself();

    let me = match me {
        Ok(p) => p,
        Err(e) => {
            // FIXME: return Ok for all errors?
            warn!(sl!(), "failed to create process instance: {:?}", e);
    if let Err(err) = me {
        error!(sl!(), "failed to create process instance: {:?}", err);
        return;
    }

            return Ok(());
        }
    };
    let me = me.unwrap();

    let tps = procfs::ticks_per_second()?;
    let tps = procfs::ticks_per_second().unwrap();

    // process total time
    AGENT_TOTAL_TIME.set((me.stat.utime + me.stat.stime) as f64 / (tps as f64));
@@ -113,7 +109,7 @@ fn update_agent_metrics() -> Result<()> {
    AGENT_TOTAL_VM.set(me.stat.vsize as f64);

    // Total resident set
    let page_size = procfs::page_size()? as f64;
    let page_size = procfs::page_size().unwrap() as f64;
    AGENT_TOTAL_RSS.set(me.stat.rss as f64 * page_size);

    // io
@@ -136,11 +132,11 @@ fn update_agent_metrics() -> Result<()> {
    }

    match me.status() {
        Err(err) => error!(sl!(), "failed to get process status: {:?}", err),
        Err(err) => {
            info!(sl!(), "failed to get process status: {:?}", err);
        }
        Ok(status) => set_gauge_vec_proc_status(&AGENT_PROC_STATUS, &status),
    }

    Ok(())
}

#[instrument]
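
get_metrics() gathers every registered metric family and renders the Prometheus text exposition format. A minimal sketch of that flow, assuming the prometheus crate; the metric name here is illustrative, not the agent's registry:

```rust
use prometheus::{Encoder, IntCounter, TextEncoder};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Register a counter with the default registry and bump it once.
    let scrapes = IntCounter::new("example_scrape_count", "number of scrapes")?;
    prometheus::register(Box::new(scrapes.clone()))?;
    scrapes.inc();

    // Gather every registered metric family and encode the text format,
    // the same TextEncoder::encode() call the hunk above changes.
    let metric_families = prometheus::gather();
    let mut buffer = Vec::new();
    TextEncoder::new().encode(&metric_families, &mut buffer)?;

    println!("{}", String::from_utf8(buffer)?);
    Ok(())
}
```
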
@@ -139,8 +139,8 @@ pub const STORAGE_HANDLER_LIST: &[&str] = &[

#[instrument]
pub fn baremount(
    source: &Path,
    destination: &Path,
    source: &str,
    destination: &str,
    fs_type: &str,
    flags: MsFlags,
    options: &str,
@@ -148,11 +148,11 @@ pub fn baremount(
) -> Result<()> {
    let logger = logger.new(o!("subsystem" => "baremount"));

    if source.as_os_str().is_empty() {
    if source.is_empty() {
        return Err(anyhow!("need mount source"));
    }

    if destination.as_os_str().is_empty() {
    if destination.is_empty() {
        return Err(anyhow!("need mount destination"));
    }

@@ -448,18 +448,16 @@ fn mount_storage(logger: &Logger, storage: &Storage) -> Result<()> {
    let options_vec = options_vec.iter().map(String::as_str).collect();
    let (flags, options) = parse_mount_flags_and_options(options_vec);

    let source = Path::new(&storage.source);

    info!(logger, "mounting storage";
        "mount-source" => source.display(),
        "mount-destination" => mount_path.display(),
        "mount-source:" => storage.source.as_str(),
        "mount-destination" => storage.mount_point.as_str(),
        "mount-fstype" => storage.fstype.as_str(),
        "mount-options" => options.as_str(),
    );

    baremount(
        source,
        mount_path,
        storage.source.as_str(),
        storage.mount_point.as_str(),
        storage.fstype.as_str(),
        flags,
        options.as_str(),
@@ -587,10 +585,7 @@ fn mount_to_rootfs(logger: &Logger, m: &InitMount) -> Result<()> {

    fs::create_dir_all(Path::new(m.dest)).context("could not create directory")?;

    let source = Path::new(m.src);
    let dest = Path::new(m.dest);

    baremount(source, dest, m.fstype, flags, &options, logger).or_else(|e| {
    baremount(m.src, m.dest, m.fstype, flags, &options, logger).or_else(|e| {
        if m.src != "dev" {
            return Err(e);
        }
@@ -633,7 +628,8 @@ pub fn get_mount_fs_type_from_file(mount_file: &str, mount_point: &str) -> Resul
    let file = File::open(mount_file)?;
    let reader = BufReader::new(file);

    let re = Regex::new(format!("device .+ mounted on {} with fstype (.+)", mount_point).as_str())?;
    let re = Regex::new(format!("device .+ mounted on {} with fstype (.+)", mount_point).as_str())
        .unwrap();

    // Read the file line by line using the lines() iterator from std::io::BufRead.
    for (_index, line) in reader.lines().enumerate() {
@@ -711,21 +707,20 @@ pub fn get_cgroup_mounts(
        }
    }

    let subsystem_name = fields[0];

    if subsystem_name.is_empty() {
    if fields[0].is_empty() {
        continue;
    }

    if subsystem_name == "devices" {
    if fields[0] == "devices" {
        has_device_cgroup = true;
    }

    if let Some((key, value)) = CGROUPS.get_key_value(subsystem_name) {
    if let Some(value) = CGROUPS.get(&fields[0]) {
        let key = CGROUPS.keys().find(|&&f| f == fields[0]).unwrap();
        cg_mounts.push(InitMount {
            fstype: "cgroup",
            src: "cgroup",
            dest: value,
            dest: *value,
            options: vec!["nosuid", "nodev", "noexec", "relatime", key],
        });
    }
@@ -778,9 +773,10 @@ fn ensure_destination_file_exists(path: &Path) -> Result<()> {
        return Err(anyhow!("{:?} exists but is not a regular file", path));
    }

    let dir = path
        .parent()
        .ok_or_else(|| anyhow!("failed to find parent path for {:?}", path))?;
    // The only way parent() can return None is if the path is /,
    // which always exists, so the test above will already have caught
    // it, thus the unwrap() is safe
    let dir = path.parent().unwrap();

    fs::create_dir_all(dir).context(format!("create_dir_all {:?}", dir))?;

@@ -947,10 +943,14 @@ mod tests {
        std::fs::create_dir_all(d).expect("failed to created directory");
    }

    let src = Path::new(&src_filename);
    let dest = Path::new(&dest_filename);

    let result = baremount(src, dest, d.fs_type, d.flags, d.options, &logger);
    let result = baremount(
        &src_filename,
        &dest_filename,
        d.fs_type,
        d.flags,
        d.options,
        &logger,
    );

    let msg = format!("{}: result: {:?}", msg, result);

@@ -1027,11 +1027,15 @@ mod tests {
        .unwrap_or_else(|_| panic!("failed to create directory {}", d));
    }

    let src = Path::new(mnt_src_filename);
    let dest = Path::new(mnt_dest_filename);

    // Create an actual mount
    let result = baremount(src, dest, "bind", MsFlags::MS_BIND, "", &logger);
    let result = baremount(
        mnt_src_filename,
        mnt_dest_filename,
        "bind",
        MsFlags::MS_BIND,
        "",
        &logger,
    );
    assert!(result.is_ok(), "mount for test setup failed");

    let tests = &[
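
The get_cgroup_mounts() hunk swaps a get() plus a linear key scan for a single HashMap::get_key_value() lookup that returns the stored key and value together. A small sketch of the difference:

```rust
use std::collections::HashMap;

fn main() {
    // Stand-in for the CGROUPS map; paths are illustrative.
    let cgroups: HashMap<&str, &str> =
        [("cpu", "/sys/fs/cgroup/cpu"), ("devices", "/sys/fs/cgroup/devices")]
            .iter()
            .cloned()
            .collect();

    let subsystem_name = "devices";
    if let Some((key, value)) = cgroups.get_key_value(subsystem_name) {
        // key borrows from the map itself rather than from the lookup string,
        // so it can be stored past this scope; value is the mount destination.
        println!("mount cgroup {} at {}", key, value);
    }
}
```
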
@@ -23,7 +23,12 @@ pub const NSTYPEPID: &str = "pid";
|
||||
|
||||
#[instrument]
|
||||
pub fn get_current_thread_ns_path(ns_type: &str) -> String {
|
||||
format!("/proc/{}/task/{}/ns/{}", getpid(), gettid(), ns_type)
|
||||
format!(
|
||||
"/proc/{}/task/{}/ns/{}",
|
||||
getpid().to_string(),
|
||||
gettid().to_string(),
|
||||
ns_type
|
||||
)
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -95,14 +100,11 @@ impl Namespace {
|
||||
self.path = new_ns_path.clone().into_os_string().into_string().unwrap();
|
||||
let hostname = self.hostname.clone();
|
||||
|
||||
let new_thread = std::thread::spawn(move || {
|
||||
let new_thread = tokio::spawn(async move {
|
||||
if let Err(err) = || -> Result<()> {
|
||||
let origin_ns_path = get_current_thread_ns_path(ns_type.get());
|
||||
|
||||
let source = Path::new(&origin_ns_path);
|
||||
let destination = new_ns_path.as_path();
|
||||
|
||||
File::open(&source)?;
|
||||
File::open(Path::new(&origin_ns_path))?;
|
||||
|
||||
// Create a new netns on the current thread.
|
||||
let cf = ns_type.get_flags();
|
||||
@@ -113,6 +115,8 @@ impl Namespace {
|
||||
nix::unistd::sethostname(hostname.unwrap())?;
|
||||
}
|
||||
// Bind mount the new namespace from the current thread onto the mount point to persist it.
|
||||
let source: &str = origin_ns_path.as_str();
|
||||
let destination: &str = new_ns_path.as_path().to_str().unwrap_or("none");
|
||||
|
||||
let mut flags = MsFlags::empty();
|
||||
|
||||
@@ -127,7 +131,7 @@ impl Namespace {
|
||||
|
||||
baremount(source, destination, "none", flags, "", &logger).map_err(|e| {
|
||||
anyhow!(
|
||||
"Failed to mount {:?} to {:?} with err:{:?}",
|
||||
"Failed to mount {} to {} with err:{:?}",
|
||||
source,
|
||||
destination,
|
||||
e
|
||||
@@ -143,7 +147,7 @@ impl Namespace {
|
||||
});
|
||||
|
||||
new_thread
|
||||
.join()
|
||||
.await
|
||||
.map_err(|e| anyhow!("Failed to join thread {:?}!", e))??;
|
||||
|
||||
Ok(self)
|
||||
@@ -246,126 +250,4 @@ mod tests {
|
||||
assert_eq!("pid", pid.get());
|
||||
assert_eq!(CloneFlags::CLONE_NEWPID, pid.get_flags());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_new() {
|
||||
        // Create dummy logger and temp folder.
        let logger = slog::Logger::root(slog::Discard, o!());

        let ns_ipc = Namespace::new(&logger);
        assert_eq!(NamespaceType::Ipc, ns_ipc.ns_type);
    }

    #[test]
    fn test_get_ipc() {
        // Create dummy logger and temp folder.
        let logger = slog::Logger::root(slog::Discard, o!());

        let ns_ipc = Namespace::new(&logger).get_ipc();
        assert_eq!(NamespaceType::Ipc, ns_ipc.ns_type);
    }

    #[test]
    fn test_get_uts_with_hostname() {
        let hostname = String::from("a.test.com");
        // Create dummy logger and temp folder.
        let logger = slog::Logger::root(slog::Discard, o!());

        let ns_uts = Namespace::new(&logger).get_uts(hostname.as_str());
        assert_eq!(NamespaceType::Uts, ns_uts.ns_type);
        assert!(ns_uts.hostname.is_some());
    }

    #[test]
    fn test_get_uts() {
        let hostname = String::from("");
        // Create dummy logger and temp folder.
        let logger = slog::Logger::root(slog::Discard, o!());

        let ns_uts = Namespace::new(&logger).get_uts(hostname.as_str());
        assert_eq!(NamespaceType::Uts, ns_uts.ns_type);
        assert!(ns_uts.hostname.is_none());
    }

    #[test]
    fn test_get_pid() {
        // Create dummy logger and temp folder.
        let logger = slog::Logger::root(slog::Discard, o!());

        let ns_pid = Namespace::new(&logger).get_pid();
        assert_eq!(NamespaceType::Pid, ns_pid.ns_type);
    }

    #[test]
    fn test_set_root_dir() {
        // Create dummy logger and temp folder.
        let logger = slog::Logger::root(slog::Discard, o!());
        let tmpdir = Builder::new().prefix("pid").tempdir().unwrap();

        let ns_root = Namespace::new(&logger).set_root_dir(tmpdir.path().to_str().unwrap());
        assert_eq!(NamespaceType::Ipc, ns_root.ns_type);
        assert_eq!(ns_root.persistent_ns_dir, tmpdir.path().to_str().unwrap());
    }

    #[test]
    fn test_namespace_type_get() {
        #[derive(Debug)]
        struct TestData<'a> {
            ns_type: NamespaceType,
            str: &'a str,
        }

        let tests = &[
            TestData {
                ns_type: NamespaceType::Ipc,
                str: "ipc",
            },
            TestData {
                ns_type: NamespaceType::Uts,
                str: "uts",
            },
            TestData {
                ns_type: NamespaceType::Pid,
                str: "pid",
            },
        ];

        // Run the tests
        for (i, d) in tests.iter().enumerate() {
            // Create a string containing details of the test
            let msg = format!("test[{}]: {:?}", i, d);
            assert_eq!(d.str, d.ns_type.get(), "{}", msg)
        }
    }

    #[test]
    fn test_namespace_type_get_flags() {
        #[derive(Debug)]
        struct TestData {
            ns_type: NamespaceType,
            ns_flag: CloneFlags,
        }

        let tests = &[
            TestData {
                ns_type: NamespaceType::Ipc,
                ns_flag: CloneFlags::CLONE_NEWIPC,
            },
            TestData {
                ns_type: NamespaceType::Uts,
                ns_flag: CloneFlags::CLONE_NEWUTS,
            },
            TestData {
                ns_type: NamespaceType::Pid,
                ns_flag: CloneFlags::CLONE_NEWPID,
            },
        ];

        // Run the tests
        for (i, d) in tests.iter().enumerate() {
            // Create a string containing details of the test
            let msg = format!("test[{}]: {:?}", i, d);
            assert_eq!(d.ns_flag, d.ns_type.get_flags(), "{}", msg)
        }
    }
}
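The tests above double as documentation for the `Namespace` builder. A minimal usage sketch, assuming the methods the tests exercise (`get_pid`, `set_root_dir`, and friends) return `Self` and can therefore be chained — the chaining itself is an assumption, not something the tests show:

// Hypothetical composition of the builder calls seen in the tests above.
let logger = slog::Logger::root(slog::Discard, o!());
let ns = Namespace::new(&logger)
    .get_pid()                     // mark this as a PID namespace
    .set_root_dir("/tmp/kata-ns"); // directory for the persistent ns file

assert_eq!(NamespaceType::Pid, ns.ns_type);
assert_eq!(CloneFlags::CLONE_NEWPID, ns.ns_type.get_flags());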
@@ -523,7 +523,7 @@ impl Handle {
            .as_ref()
            .map(|to| to.address.as_str()) // Extract address field
            .and_then(|addr| if addr.is_empty() { None } else { Some(addr) }) // Make sure it's not empty
            .ok_or(anyhow!(nix::Error::EINVAL))?;
            .ok_or(nix::Error::Sys(nix::errno::Errno::EINVAL))?;

        let ip = IpAddr::from_str(ip_address)
            .map_err(|e| anyhow!("Failed to parse IP {}: {:?}", ip_address, e))?;

@@ -612,7 +612,12 @@ fn parse_mac_address(addr: &str) -> Result<[u8; 6]> {

    // Parse single Mac address block
    let mut parse_next = || -> Result<u8> {
        let v = u8::from_str_radix(split.next().ok_or(anyhow!(nix::Error::EINVAL))?, 16)?;
        let v = u8::from_str_radix(
            split
                .next()
                .ok_or(nix::Error::Sys(nix::errno::Errno::EINVAL))?,
            16,
        )?;
        Ok(v)
    };

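The `parse_mac_address` hunk above only changes how a missing block becomes an error. For context, here is a self-contained sketch of the same parsing idea — splitting a colon-separated MAC string into six hex-encoded bytes. Names (`parse_mac`) and error messages are illustrative, not taken from the diff:

use anyhow::{anyhow, Result};

// Illustrative stand-alone parser: "00:11:22:33:44:55" -> [0x00, 0x11, ...].
fn parse_mac(addr: &str) -> Result<[u8; 6]> {
    let mut split = addr.split(':');
    let mut bytes = [0u8; 6];
    for b in bytes.iter_mut() {
        let block = split
            .next()
            .ok_or_else(|| anyhow!("too few blocks in {}", addr))?;
        *b = u8::from_str_radix(block, 16)?; // each block is two hex digits
    }
    if split.next().is_some() {
        return Err(anyhow!("too many blocks in {}", addr));
    }
    Ok(bytes)
}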
@@ -5,22 +5,30 @@

use anyhow::{anyhow, Result};
use nix::mount::{self, MsFlags};
use protocols::types::{Interface, Route};
use slog::Logger;
use std::collections::HashMap;
use std::fs;

const KATA_GUEST_SANDBOX_DNS_FILE: &str = "/run/kata-containers/sandbox/resolv.conf";
const GUEST_DNS_FILE: &str = "/etc/resolv.conf";

// Network describes a sandbox network, includings its dns
// Network fully describes a sandbox network with its interfaces, routes and dns
// related information.
#[derive(Debug, Default)]
pub struct Network {
    ifaces: HashMap<String, Interface>,
    routes: Vec<Route>,
    dns: Vec<String>,
}

impl Network {
    pub fn new() -> Network {
        Network { dns: Vec::new() }
        Network {
            ifaces: HashMap::new(),
            routes: Vec::new(),
            dns: Vec::new(),
        }
    }

    pub fn set_dns(&mut self, dns: String) {
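The hunk ends at the `set_dns` signature, so its body is not shown. A short usage sketch, under the assumption that `set_dns` simply appends the server to the `dns` vector:

// Hypothetical caller: record the DNS servers from a create_sandbox request.
let mut network = Network::new();
network.set_dns("8.8.8.8".to_string());
network.set_dns("1.1.1.1".to_string());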
@@ -20,7 +20,7 @@ const FUNCTION_MAX: u8 = (1 << FUNCTION_BITS) - 1;

// Represents a PCI function's slot (a.k.a. device) and function
// numbers, giving its location on a single logical bus
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct SlotFn(u8);

impl SlotFn {
@@ -94,7 +94,7 @@ impl fmt::Display for SlotFn {
    }
}

#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct Address {
    domain: u16,
    bus: u8,
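For context on the `SlotFn(u8)` encoding: conventional PCI addressing packs a 5-bit slot (device) number and a 3-bit function number into one byte, which is consistent with the `FUNCTION_BITS`/`FUNCTION_MAX` constants in the hunk header. A hedged sketch of that packing — the shift width is the standard PCI layout, not something this diff states explicitly:

const FUNCTION_BITS: u8 = 3;
const FUNCTION_MAX: u8 = (1 << FUNCTION_BITS) - 1; // 7
const SLOT_MAX: u8 = (1 << (8 - FUNCTION_BITS)) - 1; // 31

// Pack a slot/function pair into a single byte, e.g. (0x1f, 0x7) -> 0xff.
fn pack(slot: u8, function: u8) -> Option<u8> {
    if slot > SLOT_MAX || function > FUNCTION_MAX {
        return None; // out of range for a single logical bus
    }
    Some((slot << FUNCTION_BITS) | function)
}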
@@ -14,7 +14,7 @@ use std::path::Path;
use std::sync::Arc;
use ttrpc::{
    self,
    error::get_rpc_status,
    error::get_rpc_status as ttrpc_error,
    r#async::{Server as TtrpcServer, TtrpcContext},
};

@@ -86,13 +86,6 @@ macro_rules! sl {
    };
}

// Convenience macro to wrap an error and response to ttrpc client
macro_rules! ttrpc_error {
    ($code:path, $err:expr $(,)?) => {
        get_rpc_status($code, format!("{:?}", $err))
    };
}

macro_rules! is_allowed {
    ($req:ident) => {
        if !AGENT_CONFIG
@@ -100,7 +93,7 @@ macro_rules! is_allowed {
            .await
            .is_allowed_endpoint($req.descriptor().name())
        {
            return Err(ttrpc_error!(
            return Err(ttrpc_error(
                ttrpc::Code::UNIMPLEMENTED,
                format!("{} is blocked", $req.descriptor().name()),
            ));
@@ -118,18 +111,11 @@ pub struct AgentService {
// ^[a-zA-Z0-9][a-zA-Z0-9_.-]+$
//
fn verify_cid(id: &str) -> Result<()> {
    let mut chars = id.chars();

    let valid = match chars.next() {
        Some(first)
            if first.is_alphanumeric()
                && id.len() > 1
                && chars.all(|c| c.is_alphanumeric() || ['.', '-', '_'].contains(&c)) =>
        {
            true
        }
        _ => false,
    };
    let valid = id.len() > 1
        && id.chars().next().unwrap().is_alphanumeric()
        && id
            .chars()
            .all(|c| (c.is_alphanumeric() || ['.', '-', '_'].contains(&c)));

    match valid {
        true => Ok(()),
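Both variants of `verify_cid` implement the rule in the comment above (`^[a-zA-Z0-9][a-zA-Z0-9_.-]+$`): at least two characters, an alphanumeric first character, and only alphanumerics, `.`, `-`, or `_` afterwards. A self-contained sketch with a hypothetical name, runnable on its own, to make the accepted and rejected shapes concrete:

// Illustrative re-statement of the same validation rule.
fn is_valid_cid(id: &str) -> bool {
    let mut chars = id.chars();
    match chars.next() {
        Some(first) => {
            first.is_alphanumeric()
                && id.len() > 1
                && chars.all(|c| c.is_alphanumeric() || ['.', '-', '_'].contains(&c))
        }
        None => false,
    }
}

fn main() {
    assert!(is_valid_cid("redis-1"));
    assert!(is_valid_cid("a.b_c"));
    assert!(!is_valid_cid("-redis")); // must start alphanumeric
    assert!(!is_valid_cid("r"));      // must be longer than one character
}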
@@ -157,7 +143,7 @@ impl AgentService {
            Some(spec) => rustjail::grpc_to_oci(spec),
            None => {
                error!(sl!(), "no oci spec in the create container request!");
                return Err(anyhow!(nix::Error::EINVAL));
                return Err(anyhow!(nix::Error::from_errno(nix::errno::Errno::EINVAL)));
            }
        };

@@ -200,7 +186,7 @@ impl AgentService {
        update_device_cgroup(&mut oci)?;

        // Append guest hooks
        append_guest_hooks(&s, &mut oci)?;
        append_guest_hooks(&s, &mut oci);

        // write spec to bundle path, hooks might
        // read ocispec
@@ -222,14 +208,21 @@ impl AgentService {
            LinuxContainer::new(cid.as_str(), CONTAINER_BASE, opts, &sl!())?;

        let pipe_size = AGENT_CONFIG.read().await.container_pipe_size;

        let p = if let Some(p) = oci.process {
            Process::new(&sl!(), &p, cid.as_str(), true, pipe_size)?
        let p = if oci.process.is_some() {
            Process::new(
                &sl!(),
                oci.process.as_ref().unwrap(),
                cid.as_str(),
                true,
                pipe_size,
            )?
        } else {
            info!(sl!(), "no process configurations!");
            return Err(anyhow!(nix::Error::EINVAL));
            return Err(anyhow!(nix::Error::from_errno(nix::errno::Errno::EINVAL)));
        };

        ctr.start(p).await?;

        s.update_shared_pidns(&ctr)?;
        s.add_container(ctr);
        info!(sl!(), "created container!");
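A recurring pattern in the hunks above and below is the swap between `if let Some(..)` / `ok_or_else` chains and explicit `is_some()` checks followed by `unwrap()`. Both behave the same as long as the check and the unwrap stay adjacent; a minimal, self-contained illustration (names are hypothetical, not from the diff):

fn pick(v: Option<i32>) -> Result<i32, String> {
    // Style A: pattern matching moves the value out without any unwrap.
    if let Some(x) = v {
        return Ok(x);
    }
    Err("no value".to_string())
}

fn pick_explicit(v: Option<i32>) -> Result<i32, String> {
    // Style B: check first, then unwrap; safe only because the check is adjacent.
    if v.is_some() {
        Ok(v.unwrap())
    } else {
        Err("no value".to_string())
    }
}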
@@ -251,17 +244,11 @@ impl AgentService {

        ctr.exec()?;

        if sid == cid {
            return Ok(());
        }

        // start oom event loop
        if let Some(ref ctr) = ctr.cgroup_manager {
            let cg_path = ctr.get_cg_path("memory");

            if let Some(cg_path) = cg_path {
                let rx = notifier::notify_oom(cid.as_str(), cg_path.to_string()).await?;

        if sid != cid && ctr.cgroup_manager.is_some() {
            let cg_path = ctr.cgroup_manager.as_ref().unwrap().get_cg_path("memory");
            if cg_path.is_some() {
                let rx = notifier::notify_oom(cid.as_str(), cg_path.unwrap()).await?;
                s.run_oom_event_monitor(rx, cid.clone()).await;
            }
        }
@@ -334,11 +321,13 @@ impl AgentService {
            .await
            .is_err()
        {
            return Err(anyhow!(nix::Error::ETIME));
            return Err(anyhow!(nix::Error::from_errno(nix::errno::Errno::ETIME)));
        }

        if handle.await.is_err() {
            return Err(anyhow!(nix::Error::UnknownErrno));
            return Err(anyhow!(nix::Error::from_errno(
                nix::errno::Errno::UnknownErrno
            )));
        }

        let s = self.sandbox.clone();
@@ -359,13 +348,14 @@ impl AgentService {
        let s = self.sandbox.clone();
        let mut sandbox = s.lock().await;

        let process = req
            .process
            .into_option()
            .ok_or_else(|| anyhow!(nix::Error::EINVAL))?;
        let process = if req.process.is_some() {
            req.process.as_ref().unwrap()
        } else {
            return Err(anyhow!(nix::Error::from_errno(nix::errno::Errno::EINVAL)));
        };

        let pipe_size = AGENT_CONFIG.read().await.container_pipe_size;
        let ocip = rustjail::process_grpc_to_oci(&process);
        let ocip = rustjail::process_grpc_to_oci(process);
        let p = Process::new(&sl!(), &ocip, exec_id.as_str(), false, pipe_size)?;

        let ctr = sandbox
@@ -383,6 +373,7 @@ impl AgentService {
        let eid = req.exec_id.clone();
        let s = self.sandbox.clone();
        let mut sandbox = s.lock().await;
        let mut init = false;

        info!(
            sl!(),
@@ -391,14 +382,13 @@ impl AgentService {
            "exec-id" => eid.clone(),
        );

        let p = sandbox.find_container_process(cid.as_str(), eid.as_str())?;
        if eid.is_empty() {
            init = true;
        }

        let mut signal = Signal::try_from(req.signal as i32).map_err(|e| {
            anyhow!(e).context(format!(
                "failed to convert {:?} to signal (container-id: {}, exec-id: {})",
                req.signal, cid, eid
            ))
        })?;
        let p = find_process(&mut sandbox, cid.as_str(), eid.as_str(), init)?;

        let mut signal = Signal::try_from(req.signal as i32).unwrap();

        // For container initProcess, if it hasn't installed handler for "SIGTERM" signal,
        // it will ignore the "SIGTERM" signal sent to it, thus send it "SIGKILL" signal
@@ -434,7 +424,7 @@ impl AgentService {

        let exit_rx = {
            let mut sandbox = s.lock().await;
            let p = sandbox.find_container_process(cid.as_str(), eid.as_str())?;
            let p = find_process(&mut sandbox, cid.as_str(), eid.as_str(), false)?;

            p.exit_watchers.push(exit_send);
            pid = p.pid;
@@ -457,11 +447,7 @@ impl AgentService {
            Some(p) => p,
            None => {
                // Lost race, pick up exit code from channel
                resp.status = exit_recv
                    .recv()
                    .await
                    .ok_or_else(|| anyhow!("Failed to receive exit code"))?;

                resp.status = exit_recv.recv().await.unwrap();
                return Ok(resp);
            }
        };
@@ -492,7 +478,7 @@ impl AgentService {
        let writer = {
            let s = self.sandbox.clone();
            let mut sandbox = s.lock().await;
            let p = sandbox.find_container_process(cid.as_str(), eid.as_str())?;
            let p = find_process(&mut sandbox, cid.as_str(), eid.as_str(), false)?;

            // use ptmx io
            if p.term_master.is_some() {
@@ -503,7 +489,7 @@ impl AgentService {
            }
        };

        let writer = writer.ok_or_else(|| anyhow!("cannot get writer"))?;
        let writer = writer.unwrap();
        writer.lock().await.write_all(req.data.as_slice()).await?;

        let mut resp = WriteStreamResponse::new();
@@ -525,7 +511,7 @@ impl AgentService {
        let s = self.sandbox.clone();
        let mut sandbox = s.lock().await;

        let p = sandbox.find_container_process(cid.as_str(), eid.as_str())?;
        let p = find_process(&mut sandbox, cid.as_str(), eid.as_str(), false)?;

        if p.term_master.is_some() {
            term_exit_notifier = p.term_exit_notifier.clone();
@@ -542,10 +528,10 @@ impl AgentService {
        };

        if reader.is_none() {
            return Err(anyhow!(nix::Error::EINVAL));
            return Err(anyhow!(nix::Error::from_errno(nix::errno::Errno::EINVAL)));
        }

        let reader = reader.ok_or_else(|| anyhow!("cannot get stream reader"))?;
        let reader = reader.unwrap();

        tokio::select! {
            _ = term_exit_notifier.notified() => {
@@ -572,7 +558,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        trace_rpc_call!(ctx, "create_container", req);
        is_allowed!(req);
        match self.do_create_container(req).await {
            Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)),
            Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e.to_string())),
            Ok(_) => Ok(Empty::new()),
        }
    }
@@ -585,7 +571,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        trace_rpc_call!(ctx, "start_container", req);
        is_allowed!(req);
        match self.do_start_container(req).await {
            Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)),
            Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e.to_string())),
            Ok(_) => Ok(Empty::new()),
        }
    }
@@ -599,7 +585,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        is_allowed!(req);

        match self.do_remove_container(req).await {
            Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)),
            Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e.to_string())),
            Ok(_) => Ok(Empty::new()),
        }
    }
@@ -612,7 +598,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        trace_rpc_call!(ctx, "exec_process", req);
        is_allowed!(req);
        match self.do_exec_process(req).await {
            Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)),
            Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e.to_string())),
            Ok(_) => Ok(Empty::new()),
        }
    }
@@ -625,7 +611,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        trace_rpc_call!(ctx, "signal_process", req);
        is_allowed!(req);
        match self.do_signal_process(req).await {
            Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)),
            Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e.to_string())),
            Ok(_) => Ok(Empty::new()),
        }
    }
@@ -639,7 +625,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        is_allowed!(req);
        self.do_wait_process(req)
            .await
            .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))
            .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e.to_string()))
    }

    async fn update_container(
@@ -656,7 +642,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        let mut sandbox = s.lock().await;

        let ctr = sandbox.get_container(&cid).ok_or_else(|| {
            ttrpc_error!(
            ttrpc_error(
                ttrpc::Code::INVALID_ARGUMENT,
                "invalid container id".to_string(),
            )
@@ -664,11 +650,11 @@ impl protocols::agent_ttrpc::AgentService for AgentService {

        let resp = Empty::new();

        if let Some(res) = res.as_ref() {
            let oci_res = rustjail::resources_grpc_to_oci(res);
        if res.is_some() {
            let oci_res = rustjail::resources_grpc_to_oci(&res.unwrap());
            match ctr.set(oci_res) {
                Err(e) => {
                    return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e));
                    return Err(ttrpc_error(ttrpc::Code::INTERNAL, e.to_string()));
                }

                Ok(_) => return Ok(resp),
@@ -690,14 +676,14 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        let mut sandbox = s.lock().await;

        let ctr = sandbox.get_container(&cid).ok_or_else(|| {
            ttrpc_error!(
            ttrpc_error(
                ttrpc::Code::INVALID_ARGUMENT,
                "invalid container id".to_string(),
            )
        })?;

        ctr.stats()
            .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))
            .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e.to_string()))
    }

    async fn pause_container(
@@ -712,14 +698,14 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        let mut sandbox = s.lock().await;

        let ctr = sandbox.get_container(cid).ok_or_else(|| {
            ttrpc_error!(
            ttrpc_error(
                ttrpc::Code::INVALID_ARGUMENT,
                "invalid container id".to_string(),
            )
        })?;

        ctr.pause()
            .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
            .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e.to_string()))?;

        Ok(Empty::new())
    }
@@ -736,14 +722,14 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        let mut sandbox = s.lock().await;

        let ctr = sandbox.get_container(cid).ok_or_else(|| {
            ttrpc_error!(
            ttrpc_error(
                ttrpc::Code::INVALID_ARGUMENT,
                "invalid container id".to_string(),
            )
        })?;

        ctr.resume()
            .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
            .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e.to_string()))?;

        Ok(Empty::new())
    }
@@ -756,7 +742,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        is_allowed!(req);
        self.do_write_stream(req)
            .await
            .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))
            .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e.to_string()))
    }

    async fn read_stdout(
@@ -767,7 +753,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        is_allowed!(req);
        self.do_read_stream(req, true)
            .await
            .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))
            .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e.to_string()))
    }

    async fn read_stderr(
@@ -778,7 +764,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        is_allowed!(req);
        self.do_read_stream(req, false)
            .await
            .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))
            .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e.to_string()))
    }

    async fn close_stdin(
@@ -794,14 +780,12 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        let s = Arc::clone(&self.sandbox);
        let mut sandbox = s.lock().await;

        let p = sandbox
            .find_container_process(cid.as_str(), eid.as_str())
            .map_err(|e| {
                ttrpc_error!(
                    ttrpc::Code::INVALID_ARGUMENT,
                    format!("invalid argument: {:?}", e),
                )
            })?;
        let p = find_process(&mut sandbox, cid.as_str(), eid.as_str(), false).map_err(|e| {
            ttrpc_error(
                ttrpc::Code::INVALID_ARGUMENT,
                format!("invalid argument: {:?}", e),
            )
        })?;

        p.close_stdin();

@@ -820,31 +804,30 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        let eid = req.exec_id.clone();
        let s = Arc::clone(&self.sandbox);
        let mut sandbox = s.lock().await;
        let p = sandbox
            .find_container_process(cid.as_str(), eid.as_str())
            .map_err(|e| {
                ttrpc_error!(
                    ttrpc::Code::UNAVAILABLE,
                    format!("invalid argument: {:?}", e),
                )
            })?;
        let p = find_process(&mut sandbox, cid.as_str(), eid.as_str(), false).map_err(|e| {
            ttrpc_error(
                ttrpc::Code::UNAVAILABLE,
                format!("invalid argument: {:?}", e),
            )
        })?;

        if let Some(fd) = p.term_master {
            unsafe {
                let win = winsize {
                    ws_row: req.row as c_ushort,
                    ws_col: req.column as c_ushort,
                    ws_xpixel: 0,
                    ws_ypixel: 0,
                };
        if p.term_master.is_none() {
            return Err(ttrpc_error(ttrpc::Code::UNAVAILABLE, "no tty".to_string()));
        }

                let err = libc::ioctl(fd, TIOCSWINSZ, &win);
                Errno::result(err).map(drop).map_err(|e| {
                    ttrpc_error!(ttrpc::Code::INTERNAL, format!("ioctl error: {:?}", e))
                })?;
            }
        } else {
            return Err(ttrpc_error!(ttrpc::Code::UNAVAILABLE, "no tty".to_string()));
        let fd = p.term_master.unwrap();
        unsafe {
            let win = winsize {
                ws_row: req.row as c_ushort,
                ws_col: req.column as c_ushort,
                ws_xpixel: 0,
                ws_ypixel: 0,
            };

            let err = libc::ioctl(fd, TIOCSWINSZ, &win);
            Errno::result(err)
                .map(drop)
                .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, format!("ioctl error: {:?}", e)))?;
        }

        Ok(Empty::new())
@@ -859,7 +842,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        is_allowed!(req);

        let interface = req.interface.into_option().ok_or_else(|| {
            ttrpc_error!(
            ttrpc_error(
                ttrpc::Code::INVALID_ARGUMENT,
                "empty update interface request".to_string(),
            )
@@ -872,7 +855,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
            .update_interface(&interface)
            .await
            .map_err(|e| {
                ttrpc_error!(ttrpc::Code::INTERNAL, format!("update interface: {:?}", e))
                ttrpc_error(ttrpc::Code::INTERNAL, format!("update interface: {:?}", e))
            })?;

        Ok(interface)
@@ -891,7 +874,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
            .into_option()
            .map(|r| r.Routes.into_vec())
            .ok_or_else(|| {
                ttrpc_error!(
                ttrpc_error(
                    ttrpc::Code::INVALID_ARGUMENT,
                    "empty update routes request".to_string(),
                )
@@ -900,14 +883,14 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        let mut sandbox = self.sandbox.lock().await;

        sandbox.rtnl.update_routes(new_routes).await.map_err(|e| {
            ttrpc_error!(
            ttrpc_error(
                ttrpc::Code::INTERNAL,
                format!("Failed to update routes: {:?}", e),
            )
        })?;

        let list = sandbox.rtnl.list_routes().await.map_err(|e| {
            ttrpc_error!(
            ttrpc_error(
                ttrpc::Code::INTERNAL,
                format!("Failed to list routes after update: {:?}", e),
            )
@@ -935,7 +918,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
            .list_interfaces()
            .await
            .map_err(|e| {
                ttrpc_error!(
                ttrpc_error(
                    ttrpc::Code::INTERNAL,
                    format!("Failed to list interfaces: {:?}", e),
                )
@@ -962,7 +945,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
            .rtnl
            .list_routes()
            .await
            .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, format!("list routes: {:?}", e)))?;
            .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, format!("list routes: {:?}", e)))?;

        Ok(protocols::agent::Routes {
            Routes: RepeatedField::from_vec(list),
@@ -970,6 +953,25 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        })
    }

    async fn start_tracing(
        &self,
        _ctx: &TtrpcContext,
        req: protocols::agent::StartTracingRequest,
    ) -> ttrpc::Result<Empty> {
        info!(sl!(), "start_tracing {:?}", req);
        is_allowed!(req);
        Ok(Empty::new())
    }

    async fn stop_tracing(
        &self,
        _ctx: &TtrpcContext,
        req: protocols::agent::StopTracingRequest,
    ) -> ttrpc::Result<Empty> {
        is_allowed!(req);
        Ok(Empty::new())
    }

    async fn create_sandbox(
        &self,
        ctx: &TtrpcContext,
@@ -1002,12 +1004,13 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        }

        for m in req.kernel_modules.iter() {
            load_kernel_module(m).map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
            load_kernel_module(m)
                .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e.to_string()))?;
        }

        s.setup_shared_namespaces()
            .await
            .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
            .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e.to_string()))?;
    }

        match add_storages(sl!(), req.storages.to_vec(), self.sandbox.clone(), None).await {
@@ -1016,7 +1019,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
                let mut s = sandbox.lock().await;
                s.mounts = m
            }
            Err(e) => return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)),
            Err(e) => return Err(ttrpc_error(ttrpc::Code::INTERNAL, e.to_string())),
        };

        match setup_guest_dns(sl!(), req.dns.to_vec()) {
@@ -1029,7 +1032,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
                    .iter()
                    .map(|dns| s.network.set_dns(dns.to_string()));
            }
            Err(e) => return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)),
            Err(e) => return Err(ttrpc_error(ttrpc::Code::INTERNAL, e.to_string())),
        };

        Ok(Empty::new())
@@ -1047,25 +1050,12 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        let mut sandbox = s.lock().await;
        // destroy all containers, clean up, notify agent to exit
        // etc.
        sandbox
            .destroy()
            .await
            .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
        sandbox.destroy().await.unwrap();
        // Close get_oom_event connection,
        // otherwise it will block the shutdown of ttrpc.
        sandbox.event_tx.take();

        sandbox
            .sender
            .take()
            .ok_or_else(|| {
                ttrpc_error!(
                    ttrpc::Code::INTERNAL,
                    "failed to get sandbox sender channel".to_string(),
                )
            })?
            .send(1)
            .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
        sandbox.sender.take().unwrap().send(1).unwrap();

        Ok(Empty::new())
    }
@@ -1083,7 +1073,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
            .into_option()
            .map(|n| n.ARPNeighbors.into_vec())
            .ok_or_else(|| {
                ttrpc_error!(
                ttrpc_error(
                    ttrpc::Code::INVALID_ARGUMENT,
                    "empty add arp neighbours request".to_string(),
                )
@@ -1096,7 +1086,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
            .add_arp_neighbors(neighs)
            .await
            .map_err(|e| {
                ttrpc_error!(
                ttrpc_error(
                    ttrpc::Code::INTERNAL,
                    format!("Failed to add ARP neighbours: {:?}", e),
                )
@@ -1117,7 +1107,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {

        sandbox
            .online_cpu_memory(&req)
            .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
            .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e.to_string()))?;

        Ok(Empty::new())
    }
@@ -1131,7 +1121,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        is_allowed!(req);

        random::reseed_rng(req.data.as_slice())
            .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
            .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e.to_string()))?;

        Ok(Empty::new())
    }
@@ -1154,7 +1144,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
            }
            Err(e) => {
                info!(sl!(), "fail to get memory info!");
                return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e));
                return Err(ttrpc_error(ttrpc::Code::INTERNAL, e.to_string()));
            }
        }

@@ -1174,7 +1164,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        is_allowed!(req);

        do_mem_hotplug_by_probe(&req.memHotplugProbeAddr)
            .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
            .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e.to_string()))?;

        Ok(Empty::new())
    }
@@ -1188,7 +1178,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        is_allowed!(req);

        do_set_guest_date_time(req.Sec, req.Usec)
            .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
            .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e.to_string()))?;

        Ok(Empty::new())
    }
@@ -1201,7 +1191,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        trace_rpc_call!(ctx, "copy_file", req);
        is_allowed!(req);

        do_copy_file(&req).map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
        do_copy_file(&req).map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e.to_string()))?;

        Ok(Empty::new())
    }
@@ -1215,7 +1205,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
        is_allowed!(req);

        match get_metrics(&req) {
            Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)),
            Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e.to_string())),
            Ok(s) => {
                let mut metrics = Metrics::new();
                metrics.set_metrics(s);
@@ -1246,7 +1236,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {
            return Ok(resp);
        }

        Err(ttrpc_error!(ttrpc::Code::INTERNAL, ""))
        Err(ttrpc_error(ttrpc::Code::INTERNAL, ""))
    }

    async fn add_swap(
@@ -1259,7 +1249,7 @@ impl protocols::agent_ttrpc::AgentService for AgentService {

        do_add_swap(&self.sandbox, &req)
            .await
            .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
            .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e.to_string()))?;

        Ok(Empty::new())
    }
@@ -1324,9 +1314,16 @@ fn get_memory_info(block_size: bool, hotplug: bool) -> Result<(u64, bool)> {
        match stat::stat(SYSFS_MEMORY_HOTPLUG_PROBE_PATH) {
            Ok(_) => plug = true,
            Err(e) => {
                info!(sl!(), "hotplug memory error: {:?}", e);
                info!(
                    sl!(),
                    "hotplug memory error: {}",
                    e.as_errno().unwrap().desc()
                );
                match e {
                    nix::Error::ENOENT => plug = false,
                    nix::Error::Sys(errno) => match errno {
                        Errno::ENOENT => plug = false,
                        _ => return Err(anyhow!(e)),
                    },
                    _ => return Err(anyhow!(e)),
                }
            }
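The churn between `nix::Error::EINVAL` and `nix::Error::Sys(Errno::EINVAL)` throughout this diff reflects the nix crate's error redesign: older releases modelled `nix::Error` as an enum with a `Sys(Errno)` variant, while newer ones (around nix 0.23 — the exact version boundary is from memory, treat it as hedged) flattened the type so `nix::Error` is just `Errno`. A minimal sketch of the `get_memory_info` pattern above under the newer API:

use nix::errno::Errno;
use nix::sys::stat;

// Sketch assuming the newer API, where nix::Error is an alias for Errno.
fn memory_probe_present(path: &str) -> nix::Result<bool> {
    match stat::stat(path) {
        Ok(_) => Ok(true),
        Err(Errno::ENOENT) => Ok(false), // a missing probe file is not fatal
        Err(e) => Err(e),                // anything else is a real error
    }
}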
@@ -1377,7 +1374,27 @@ async fn read_stream(reader: Arc<Mutex<ReadHalf<PipeStream>>>, l: usize) -> Resu
    Ok(content)
}

pub fn start(s: Arc<Mutex<Sandbox>>, server_address: &str) -> Result<TtrpcServer> {
fn find_process<'a>(
    sandbox: &'a mut Sandbox,
    cid: &'a str,
    eid: &'a str,
    init: bool,
) -> Result<&'a mut Process> {
    let ctr = sandbox
        .get_container(cid)
        .ok_or_else(|| anyhow!("Invalid container id"))?;

    if init || eid.is_empty() {
        return ctr
            .processes
            .get_mut(&ctr.init_process_pid)
            .ok_or_else(|| anyhow!("cannot find init process!"));
    }

    ctr.get_process(eid).map_err(|_| anyhow!("Invalid exec id"))
}

pub fn start(s: Arc<Mutex<Sandbox>>, server_address: &str) -> TtrpcServer {
    let agent_service = Box::new(AgentService { sandbox: s })
        as Box<dyn protocols::agent_ttrpc::AgentService + Send + Sync>;

@@ -1392,13 +1409,14 @@ pub fn start(s: Arc<Mutex<Sandbox>>, server_address: &str) -> Result<TtrpcServer
    let hservice = protocols::health_ttrpc::create_health(health_worker);

    let server = TtrpcServer::new()
        .bind(server_address)?
        .bind(server_address)
        .unwrap()
        .register_service(aservice)
        .register_service(hservice);

    info!(sl!(), "ttRPC server started"; "address" => server_address);

    Ok(server)
    server
}

// This function updates the container namespaces configuration based on the
@@ -1443,28 +1461,24 @@ fn update_container_namespaces(
    // the create_sandbox request or create_container request.
    // Else set this to empty string so that a new pid namespace is
    // created for the container.
    if sandbox_pidns {
        if let Some(ref pidns) = &sandbox.sandbox_pidns {
            pid_ns.path = String::from(pidns.path.as_str());
        } else {
            return Err(anyhow!("failed to get sandbox pidns"));
        }
    if sandbox_pidns && sandbox.sandbox_pidns.is_some() {
        pid_ns.path = String::from(sandbox.sandbox_pidns.as_ref().unwrap().path.as_str());
    }

    linux.namespaces.push(pid_ns);
    Ok(())
}

fn append_guest_hooks(s: &Sandbox, oci: &mut Spec) -> Result<()> {
    if let Some(ref guest_hooks) = s.hooks {
        let mut hooks = oci.hooks.take().unwrap_or_default();
        hooks.prestart.append(&mut guest_hooks.prestart.clone());
        hooks.poststart.append(&mut guest_hooks.poststart.clone());
        hooks.poststop.append(&mut guest_hooks.poststop.clone());
        oci.hooks = Some(hooks);
fn append_guest_hooks(s: &Sandbox, oci: &mut Spec) {
    if s.hooks.is_none() {
        return;
    }

    Ok(())
    let guest_hooks = s.hooks.as_ref().unwrap();
    let mut hooks = oci.hooks.take().unwrap_or_default();
    hooks.prestart.append(&mut guest_hooks.prestart.clone());
    hooks.poststart.append(&mut guest_hooks.poststart.clone());
    hooks.poststop.append(&mut guest_hooks.poststop.clone());
    oci.hooks = Some(hooks);
}

// Check is the container process installed the
@@ -1543,7 +1557,7 @@ fn do_copy_file(req: &CopyFileRequest) -> Result<()> {
    let path = PathBuf::from(req.path.as_str());

    if !path.starts_with(CONTAINER_BASE) {
        return Err(anyhow!(nix::Error::EINVAL));
        return Err(nix::Error::Sys(Errno::EINVAL).into());
    }

    let parent = path.parent();
@@ -1554,7 +1568,7 @@ fn do_copy_file(req: &CopyFileRequest) -> Result<()> {
        PathBuf::from("/")
    };

    fs::create_dir_all(&dir).or_else(|e| {
    fs::create_dir_all(dir.to_str().unwrap()).or_else(|e| {
        if e.kind() != std::io::ErrorKind::AlreadyExists {
            return Err(e);
        }
@@ -1562,7 +1576,10 @@ fn do_copy_file(req: &CopyFileRequest) -> Result<()> {
        Ok(())
    })?;

    std::fs::set_permissions(&dir, std::fs::Permissions::from_mode(req.dir_mode))?;
    std::fs::set_permissions(
        dir.to_str().unwrap(),
        std::fs::Permissions::from_mode(req.dir_mode),
    )?;

    let mut tmpfile = path.clone();
    tmpfile.set_extension("tmp");
@@ -1571,10 +1588,10 @@ fn do_copy_file(req: &CopyFileRequest) -> Result<()> {
        .write(true)
        .create(true)
        .truncate(false)
        .open(&tmpfile)?;
        .open(tmpfile.to_str().unwrap())?;

    file.write_all_at(req.data.as_slice(), req.offset as u64)?;
    let st = stat::stat(&tmpfile)?;
    let st = stat::stat(tmpfile.to_str().unwrap())?;

    if st.st_size != req.file_size {
        return Ok(());
@@ -1583,7 +1600,7 @@ fn do_copy_file(req: &CopyFileRequest) -> Result<()> {
    file.set_permissions(std::fs::Permissions::from_mode(req.file_mode))?;

    unistd::chown(
        &tmpfile,
        tmpfile.to_str().unwrap(),
        Some(Uid::from_raw(req.uid as u32)),
        Some(Gid::from_raw(req.gid as u32)),
    )?;
@@ -1620,13 +1637,10 @@ async fn do_add_swap(sandbox: &Arc<Mutex<Sandbox>>, req: &AddSwapRequest) -> Res
// - container rootfs bind mounted at /<CONTAINER_BASE>/<cid>/rootfs
// - modify container spec root to point to /<CONTAINER_BASE>/<cid>/rootfs
fn setup_bundle(cid: &str, spec: &mut Spec) -> Result<PathBuf> {
    let spec_root = if let Some(sr) = &spec.root {
        sr
    } else {
        return Err(anyhow!(nix::Error::EINVAL));
    };

    let spec_root_path = Path::new(&spec_root.path);
    if spec.root.is_none() {
        return Err(nix::Error::Sys(Errno::EINVAL).into());
    }
    let spec_root = spec.root.as_ref().unwrap();

    let bundle_path = Path::new(CONTAINER_BASE).join(cid);
    let config_path = bundle_path.join("config.json");
@@ -1634,36 +1648,22 @@ fn setup_bundle(cid: &str, spec: &mut Spec) -> Result<PathBuf> {

    fs::create_dir_all(&rootfs_path)?;
    baremount(
        spec_root_path,
        &rootfs_path,
        &spec_root.path,
        rootfs_path.to_str().unwrap(),
        "bind",
        MsFlags::MS_BIND,
        "",
        &sl!(),
    )?;

    let rootfs_path_name = rootfs_path
        .to_str()
        .ok_or_else(|| anyhow!("failed to convert rootfs to unicode"))?
        .to_string();

    spec.root = Some(Root {
        path: rootfs_path_name,
        path: rootfs_path.to_str().unwrap().to_owned(),
        readonly: spec_root.readonly,
    });

    let _ = spec.save(
        config_path
            .to_str()
            .ok_or_else(|| anyhow!("cannot convert path to unicode"))?,
    );
    let _ = spec.save(config_path.to_str().unwrap());

    let olddir = unistd::getcwd().context("cannot getcwd")?;
    unistd::chdir(
        bundle_path
            .to_str()
            .ok_or_else(|| anyhow!("cannot convert bundle path to unicode"))?,
    )?;
    unistd::chdir(bundle_path.to_str().unwrap())?;

    Ok(olddir)
}
@@ -1696,8 +1696,8 @@ fn load_kernel_module(module: &protocols::agent::KernelModule) -> Result<()> {

    match status.code() {
        Some(code) => {
            let std_out = String::from_utf8_lossy(&output.stdout);
            let std_err = String::from_utf8_lossy(&output.stderr);
            let std_out: String = String::from_utf8(output.stdout).unwrap();
            let std_err: String = String::from_utf8(output.stderr).unwrap();
            let msg = format!(
                "load_kernel_module return code: {} stdout:{} stderr:{}",
                code, std_out, std_err
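The `load_kernel_module` hunk above only shows the exit-code handling; the command being run is outside the hunk (presumably modprobe, but treat that as an assumption). A self-contained sketch of the same run-and-report pattern with a hypothetical helper name:

use std::process::Command;

// Hypothetical helper: run modprobe and surface its output on failure,
// mirroring how load_kernel_module formats a non-zero exit code.
fn modprobe(module: &str) -> Result<(), String> {
    let output = Command::new("modprobe")
        .arg(module)
        .output()
        .map_err(|e| format!("failed to spawn modprobe: {}", e))?;

    match output.status.code() {
        Some(0) => Ok(()),
        Some(code) => Err(format!(
            "modprobe exited with {}: stdout: {} stderr: {}",
            code,
            String::from_utf8_lossy(&output.stdout), // lossy: never panics on bad UTF-8
            String::from_utf8_lossy(&output.stderr),
        )),
        None => Err("modprobe terminated by signal".to_string()),
    }
}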
@@ -1761,7 +1761,7 @@ mod tests {
        let mut oci = Spec {
            ..Default::default()
        };
        append_guest_hooks(&s, &mut oci).unwrap();
        append_guest_hooks(&s, &mut oci);
        assert_eq!(s.hooks, oci.hooks);
    }

Some files were not shown because too many files have changed in this diff.