mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-03-02 02:32:06 +00:00
Compare commits
114 Commits
2.4.0-alph
...
2.2.3
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3a1804cd73 | ||
|
|
b7493fd5d5 | ||
|
|
63ecbcf14b | ||
|
|
4f73e58d73 | ||
|
|
45f65a73c8 | ||
|
|
06d3049349 | ||
|
|
0366f6e817 | ||
|
|
7cb650abcf | ||
|
|
e97cd23bd6 | ||
|
|
6b6d81cced | ||
|
|
a479eca7de | ||
|
|
ee3bf4a411 | ||
|
|
4443a982e6 | ||
|
|
b794a39401 | ||
|
|
39d95f486b | ||
|
|
aa40324c52 | ||
|
|
9053137592 | ||
|
|
c4e8e86acf | ||
|
|
eea2c0195f | ||
|
|
1e798b96fd | ||
|
|
53c4492fb3 | ||
|
|
893623dfbc | ||
|
|
503ce9c154 | ||
|
|
9932e76f27 | ||
|
|
3a035c1f43 | ||
|
|
4102a18aa1 | ||
|
|
0034f40b67 | ||
|
|
1f6b0f651e | ||
|
|
112e0f6381 | ||
|
|
18820e31d9 | ||
|
|
8fafced9ff | ||
|
|
9668095abd | ||
|
|
be51808a13 | ||
|
|
3e145ea94c | ||
|
|
3951834565 | ||
|
|
79e0754a7b | ||
|
|
afe6005785 | ||
|
|
b8fc1af363 | ||
|
|
97167ccddd | ||
|
|
af0fbb9460 | ||
|
|
bc48a58806 | ||
|
|
d581cdab4e | ||
|
|
52fdfc4fed | ||
|
|
8d98e01414 | ||
|
|
688cc8e2bd | ||
|
|
ebc23df752 | ||
|
|
b0aca51eac | ||
|
|
28873c4d75 | ||
|
|
3525a2ed03 | ||
|
|
30d07d4407 | ||
|
|
623b108227 | ||
|
|
fc1822f094 | ||
|
|
ba6ad1c804 | ||
|
|
22d3df9141 | ||
|
|
e58fabfc20 | ||
|
|
feb06dad8a | ||
|
|
d9b41fc583 | ||
|
|
7852b9f8e1 | ||
|
|
83f219577d | ||
|
|
97421afe17 | ||
|
|
2b6327ac37 | ||
|
|
5256e0852c | ||
|
|
02b46268f4 | ||
|
|
1b3058dd24 | ||
|
|
98e2e93552 | ||
|
|
8f25c7da11 | ||
|
|
84da2f8ddc | ||
|
|
de0e3915b7 | ||
|
|
5c76f1c65a | ||
|
|
522a53010c | ||
|
|
852fc53351 | ||
|
|
e0a27b5e90 | ||
|
|
ba6fc32804 | ||
|
|
d5f5da4323 | ||
|
|
017cd3c53c | ||
|
|
484af1a559 | ||
|
|
a572a6ebf8 | ||
|
|
2ca867da7b | ||
|
|
f4da502c4f | ||
|
|
16164241df | ||
|
|
25c7e1181a | ||
|
|
4c5bf0576b | ||
|
|
b3e620dbcf | ||
|
|
98c2ca13c1 | ||
|
|
a97c9063db | ||
|
|
0481c5070c | ||
|
|
64504061c8 | ||
|
|
56920bc943 | ||
|
|
a1874ccd62 | ||
|
|
c2c650500b | ||
|
|
7ee43f9468 | ||
|
|
eedf139076 | ||
|
|
54a6890c3c | ||
|
|
1792a9fe11 | ||
|
|
9bf95279be | ||
|
|
807cc8a3a5 | ||
|
|
5987f3b5e1 | ||
|
|
caafd0f952 | ||
|
|
800126b272 | ||
|
|
b1372b353f | ||
|
|
dca35c1730 | ||
|
|
0bdfdad236 | ||
|
|
60155756f3 | ||
|
|
669888c339 | ||
|
|
cde008f441 | ||
|
|
7c866073f9 | ||
|
|
ca9e6538e6 | ||
|
|
938b01aedc | ||
|
|
abd708e814 | ||
|
|
61babd45ed | ||
|
|
59c51f6201 | ||
|
|
c1f260cc40 | ||
|
|
4cd6909f18 | ||
|
|
efa2d54e85 |
20
.github/workflows/kata-deploy-push.yaml
vendored
20
.github/workflows/kata-deploy-push.yaml
vendored
@@ -1,6 +1,6 @@
|
||||
name: kata deploy build
|
||||
name: kata-deploy-build
|
||||
|
||||
on: [push, pull_request]
|
||||
on: push
|
||||
|
||||
jobs:
|
||||
build-asset:
|
||||
@@ -9,7 +9,6 @@ jobs:
|
||||
matrix:
|
||||
asset:
|
||||
- kernel
|
||||
- kernel-experimental
|
||||
- shim-v2
|
||||
- qemu
|
||||
- cloud-hypervisor
|
||||
@@ -25,7 +24,7 @@ jobs:
|
||||
|
||||
- name: Build ${{ matrix.asset }}
|
||||
run: |
|
||||
make "${KATA_ASSET}-tarball"
|
||||
./tools/packaging/kata-deploy/local-build/kata-deploy-binaries-in-docker.sh --build="${KATA_ASSET}"
|
||||
build_dir=$(readlink -f build)
|
||||
# store-artifact does not work with symlink
|
||||
sudo cp -r --preserve=all "${build_dir}" "kata-build"
|
||||
@@ -48,21 +47,12 @@ jobs:
|
||||
uses: actions/download-artifact@v2
|
||||
with:
|
||||
name: kata-artifacts
|
||||
path: build
|
||||
path: kata-artifacts
|
||||
- name: merge-artifacts
|
||||
run: |
|
||||
make merge-builds
|
||||
./tools/packaging/kata-deploy/local-build/kata-deploy-merge-builds.sh kata-artifacts
|
||||
- name: store-artifacts
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: kata-static-tarball
|
||||
path: kata-static.tar.xz
|
||||
|
||||
make-kata-tarball:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: make kata-tarball
|
||||
run: |
|
||||
make kata-tarball
|
||||
sudo make install-tarball
|
||||
|
||||
155
.github/workflows/kata-deploy-test.yaml
vendored
155
.github/workflows/kata-deploy-test.yaml
vendored
@@ -5,121 +5,60 @@ on:
|
||||
name: test-kata-deploy
|
||||
|
||||
jobs:
|
||||
check-comment-and-membership:
|
||||
runs-on: ubuntu-latest
|
||||
if: |
|
||||
github.event.issue.pull_request
|
||||
&& github.event_name == 'issue_comment'
|
||||
&& github.event.action == 'created'
|
||||
&& startsWith(github.event.comment.body, '/test_kata_deploy')
|
||||
steps:
|
||||
- name: Check membership
|
||||
uses: kata-containers/is-organization-member@1.0.1
|
||||
id: is_organization_member
|
||||
with:
|
||||
organization: kata-containers
|
||||
username: ${{ github.event.comment.user.login }}
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
- name: Fail if not member
|
||||
run: |
|
||||
result=${{ steps.is_organization_member.outputs.result }}
|
||||
if [ $result == false ]; then
|
||||
user=${{ github.event.comment.user.login }}
|
||||
echo Either ${user} is not part of the kata-containers organization
|
||||
echo or ${user} has its Organization Visibility set to Private at
|
||||
echo https://github.com/orgs/kata-containers/people?query=${user}
|
||||
echo
|
||||
echo Ensure you change your Organization Visibility to Public and
|
||||
echo trigger the test again.
|
||||
exit 1
|
||||
fi
|
||||
|
||||
build-asset:
|
||||
runs-on: ubuntu-latest
|
||||
needs: check-comment-and-membership
|
||||
strategy:
|
||||
matrix:
|
||||
asset:
|
||||
- cloud-hypervisor
|
||||
- firecracker
|
||||
- kernel
|
||||
- qemu
|
||||
- rootfs-image
|
||||
- rootfs-initrd
|
||||
- shim-v2
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Install docker
|
||||
run: |
|
||||
curl -fsSL https://test.docker.com -o test-docker.sh
|
||||
sh test-docker.sh
|
||||
|
||||
- name: Build ${{ matrix.asset }}
|
||||
run: |
|
||||
make "${KATA_ASSET}-tarball"
|
||||
build_dir=$(readlink -f build)
|
||||
# store-artifact does not work with symlink
|
||||
sudo cp -r "${build_dir}" "kata-build"
|
||||
env:
|
||||
KATA_ASSET: ${{ matrix.asset }}
|
||||
TAR_OUTPUT: ${{ matrix.asset }}.tar.gz
|
||||
|
||||
- name: store-artifact ${{ matrix.asset }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: kata-artifacts
|
||||
path: kata-build/kata-static-${{ matrix.asset }}.tar.xz
|
||||
if-no-files-found: error
|
||||
|
||||
create-kata-tarball:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build-asset
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: get-artifacts
|
||||
uses: actions/download-artifact@v2
|
||||
with:
|
||||
name: kata-artifacts
|
||||
path: kata-artifacts
|
||||
- name: merge-artifacts
|
||||
run: |
|
||||
./tools/packaging/kata-deploy/local-build/kata-deploy-merge-builds.sh kata-artifacts
|
||||
- name: store-artifacts
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: kata-static-tarball
|
||||
path: kata-static.tar.xz
|
||||
|
||||
kata-deploy:
|
||||
needs: create-kata-tarball
|
||||
check_comments:
|
||||
if: ${{ github.event.issue.pull_request }}
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: get-kata-tarball
|
||||
uses: actions/download-artifact@v2
|
||||
- name: Check for Command
|
||||
id: command
|
||||
uses: kata-containers/slash-command-action@v1
|
||||
with:
|
||||
name: kata-static-tarball
|
||||
- name: build-and-push-kata-deploy-ci
|
||||
id: build-and-push-kata-deploy-ci
|
||||
repo-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
command: "test_kata_deploy"
|
||||
reaction: "true"
|
||||
reaction-type: "eyes"
|
||||
allow-edits: "false"
|
||||
permission-level: admin
|
||||
- name: verify command arg is kata-deploy
|
||||
run: |
|
||||
tag=$(echo $GITHUB_REF | cut -d/ -f3-)
|
||||
pushd $GITHUB_WORKSPACE
|
||||
git checkout $tag
|
||||
pkg_sha=$(git rev-parse HEAD)
|
||||
popd
|
||||
mv kata-static.tar.xz $GITHUB_WORKSPACE/tools/packaging/kata-deploy/kata-static.tar.xz
|
||||
docker build --build-arg KATA_ARTIFACTS=kata-static.tar.xz -t quay.io/kata-containers/kata-deploy-ci:$pkg_sha $GITHUB_WORKSPACE/tools/packaging/kata-deploy
|
||||
docker login -u ${{ secrets.QUAY_DEPLOYER_USERNAME }} -p ${{ secrets.QUAY_DEPLOYER_PASSWORD }} quay.io
|
||||
docker push quay.io/kata-containers/kata-deploy-ci:$pkg_sha
|
||||
mkdir -p packaging/kata-deploy
|
||||
ln -s $GITHUB_WORKSPACE/tools/packaging/kata-deploy/action packaging/kata-deploy/action
|
||||
echo "::set-output name=PKG_SHA::${pkg_sha}"
|
||||
echo "The command was '${{ steps.command.outputs.command-name }}' with arguments '${{ steps.command.outputs.command-arguments }}'"
|
||||
|
||||
create-and-test-container:
|
||||
needs: check_comments
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: get-PR-ref
|
||||
id: get-PR-ref
|
||||
run: |
|
||||
ref=$(cat $GITHUB_EVENT_PATH | jq -r '.issue.pull_request.url' | sed 's#^.*\/pulls#refs\/pull#' | sed 's#$#\/merge#')
|
||||
echo "reference for PR: " ${ref}
|
||||
echo "##[set-output name=pr-ref;]${ref}"
|
||||
|
||||
- name: check out
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
ref: ${{ steps.get-PR-ref.outputs.pr-ref }}
|
||||
|
||||
- name: build-container-image
|
||||
id: build-container-image
|
||||
run: |
|
||||
PR_SHA=$(git log --format=format:%H -n1)
|
||||
VERSION="2.0.0"
|
||||
ARTIFACT_URL="https://github.com/kata-containers/kata-containers/releases/download/${VERSION}/kata-static-${VERSION}-x86_64.tar.xz"
|
||||
wget "${ARTIFACT_URL}" -O tools/packaging/kata-deploy/kata-static.tar.xz
|
||||
docker build --build-arg KATA_ARTIFACTS=kata-static.tar.xz -t katadocker/kata-deploy-ci:${PR_SHA} -t quay.io/kata-containers/kata-deploy-ci:${PR_SHA} ./tools/packaging/kata-deploy
|
||||
docker login -u ${{ secrets.DOCKER_USERNAME }} -p ${{ secrets.DOCKER_PASSWORD }}
|
||||
docker push katadocker/kata-deploy-ci:$PR_SHA
|
||||
docker login -u ${{ secrets.QUAY_DEPLOYER_USERNAME }} -p ${{ secrets.QUAY_DEPLOYER_PASSWORD }} quay.io
|
||||
docker push quay.io/kata-containers/kata-deploy-ci:$PR_SHA
|
||||
echo "##[set-output name=pr-sha;]${PR_SHA}"
|
||||
|
||||
- name: test-kata-deploy-ci-in-aks
|
||||
uses: ./packaging/kata-deploy/action
|
||||
uses: ./tools/packaging/kata-deploy/action
|
||||
with:
|
||||
packaging-sha: ${{steps.build-and-push-kata-deploy-ci.outputs.PKG_SHA}}
|
||||
packaging-sha: ${{ steps.build-container-image.outputs.pr-sha }}
|
||||
env:
|
||||
PKG_SHA: ${{steps.build-and-push-kata-deploy-ci.outputs.PKG_SHA}}
|
||||
PKG_SHA: ${{ steps.build-container-image.outputs.pr-sha }}
|
||||
AZ_APPID: ${{ secrets.AZ_APPID }}
|
||||
AZ_PASSWORD: ${{ secrets.AZ_PASSWORD }}
|
||||
AZ_SUBSCRIPTION_ID: ${{ secrets.AZ_SUBSCRIPTION_ID }}
|
||||
|
||||
295
.github/workflows/main.yaml
vendored
Normal file
295
.github/workflows/main.yaml
vendored
Normal file
@@ -0,0 +1,295 @@
|
||||
name: Publish release tarball
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- '1.*'
|
||||
|
||||
jobs:
|
||||
get-artifact-list:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: get the list
|
||||
run: |
|
||||
pushd $GITHUB_WORKSPACE
|
||||
tag=$(echo $GITHUB_REF | cut -d/ -f3-)
|
||||
git checkout $tag
|
||||
popd
|
||||
$GITHUB_WORKSPACE/tools/packaging/artifact-list.sh > artifact-list.txt
|
||||
- name: save-artifact-list
|
||||
uses: actions/upload-artifact@master
|
||||
with:
|
||||
name: artifact-list
|
||||
path: artifact-list.txt
|
||||
|
||||
build-kernel:
|
||||
runs-on: ubuntu-16.04
|
||||
needs: get-artifact-list
|
||||
env:
|
||||
buildstr: "install_kernel"
|
||||
steps:
|
||||
- uses: actions/checkout@v1
|
||||
- name: get-artifact-list
|
||||
uses: actions/download-artifact@master
|
||||
with:
|
||||
name: artifact-list
|
||||
- run: |
|
||||
sudo apt-get update && sudo apt install -y flex bison libelf-dev bc iptables
|
||||
- name: build-kernel
|
||||
run: |
|
||||
if grep -q $buildstr ./artifact-list/artifact-list.txt; then
|
||||
$GITHUB_WORKSPACE/.github/workflows/generate-artifact-tarball.sh $buildstr
|
||||
echo "artifact-built=true" >> $GITHUB_ENV
|
||||
else
|
||||
echo "artifact-built=false" >> $GITHUB_ENV
|
||||
fi
|
||||
- name: store-artifacts
|
||||
if: ${{ env.artifact-built }} == 'true'
|
||||
uses: actions/upload-artifact@master
|
||||
with:
|
||||
name: kata-artifacts
|
||||
path: kata-static-kernel.tar.gz
|
||||
|
||||
build-experimental-kernel:
|
||||
runs-on: ubuntu-16.04
|
||||
needs: get-artifact-list
|
||||
env:
|
||||
buildstr: "install_experimental_kernel"
|
||||
steps:
|
||||
- uses: actions/checkout@v1
|
||||
- name: get-artifact-list
|
||||
uses: actions/download-artifact@master
|
||||
with:
|
||||
name: artifact-list
|
||||
- run: |
|
||||
sudo apt-get update && sudo apt install -y flex bison libelf-dev bc iptables
|
||||
- name: build-experimental-kernel
|
||||
run: |
|
||||
if grep -q $buildstr ./artifact-list/artifact-list.txt; then
|
||||
$GITHUB_WORKSPACE/.github/workflows/generate-artifact-tarball.sh $buildstr
|
||||
echo "artifact-built=true" >> $GITHUB_ENV
|
||||
else
|
||||
echo "artifact-built=false" >> $GITHUB_ENV
|
||||
fi
|
||||
- name: store-artifacts
|
||||
if: ${{ env.artifact-built }} == 'true'
|
||||
uses: actions/upload-artifact@master
|
||||
with:
|
||||
name: kata-artifacts
|
||||
path: kata-static-experimental-kernel.tar.gz
|
||||
|
||||
build-qemu:
|
||||
runs-on: ubuntu-16.04
|
||||
needs: get-artifact-list
|
||||
env:
|
||||
buildstr: "install_qemu"
|
||||
steps:
|
||||
- uses: actions/checkout@v1
|
||||
- name: get-artifact-list
|
||||
uses: actions/download-artifact@master
|
||||
with:
|
||||
name: artifact-list
|
||||
- name: build-qemu
|
||||
run: |
|
||||
if grep -q $buildstr ./artifact-list/artifact-list.txt; then
|
||||
$GITHUB_WORKSPACE/.github/workflows/generate-artifact-tarball.sh $buildstr
|
||||
echo "artifact-built=true" >> $GITHUB_ENV
|
||||
else
|
||||
echo "artifact-built=false" >> $GITHUB_ENV
|
||||
fi
|
||||
- name: store-artifacts
|
||||
if: ${{ env.artifact-built }} == 'true'
|
||||
uses: actions/upload-artifact@master
|
||||
with:
|
||||
name: kata-artifacts
|
||||
path: kata-static-qemu.tar.gz
|
||||
|
||||
# Job for building the image
|
||||
build-image:
|
||||
runs-on: ubuntu-16.04
|
||||
needs: get-artifact-list
|
||||
env:
|
||||
buildstr: "install_image"
|
||||
steps:
|
||||
- uses: actions/checkout@v1
|
||||
- name: get-artifact-list
|
||||
uses: actions/download-artifact@master
|
||||
with:
|
||||
name: artifact-list
|
||||
- name: build-image
|
||||
run: |
|
||||
if grep -q $buildstr ./artifact-list/artifact-list.txt; then
|
||||
$GITHUB_WORKSPACE/.github/workflows/generate-artifact-tarball.sh $buildstr
|
||||
echo "artifact-built=true" >> $GITHUB_ENV
|
||||
else
|
||||
echo "artifact-built=false" >> $GITHUB_ENV
|
||||
fi
|
||||
- name: store-artifacts
|
||||
if: ${{ env.artifact-built }} == 'true'
|
||||
uses: actions/upload-artifact@master
|
||||
with:
|
||||
name: kata-artifacts
|
||||
path: kata-static-image.tar.gz
|
||||
|
||||
# Job for building firecracker hypervisor
|
||||
build-firecracker:
|
||||
runs-on: ubuntu-16.04
|
||||
needs: get-artifact-list
|
||||
env:
|
||||
buildstr: "install_firecracker"
|
||||
steps:
|
||||
- uses: actions/checkout@v1
|
||||
- name: get-artifact-list
|
||||
uses: actions/download-artifact@master
|
||||
with:
|
||||
name: artifact-list
|
||||
- name: build-firecracker
|
||||
run: |
|
||||
if grep -q $buildstr ./artifact-list/artifact-list.txt; then
|
||||
$GITHUB_WORKSPACE/.github/workflows/generate-artifact-tarball.sh $buildstr
|
||||
echo "artifact-built=true" >> $GITHUB_ENV
|
||||
else
|
||||
echo "artifact-built=false" >> $GITHUB_ENV
|
||||
fi
|
||||
- name: store-artifacts
|
||||
if: ${{ env.artifact-built }} == 'true'
|
||||
uses: actions/upload-artifact@master
|
||||
with:
|
||||
name: kata-artifacts
|
||||
path: kata-static-firecracker.tar.gz
|
||||
|
||||
# Job for building cloud-hypervisor
|
||||
build-clh:
|
||||
runs-on: ubuntu-16.04
|
||||
needs: get-artifact-list
|
||||
env:
|
||||
buildstr: "install_clh"
|
||||
steps:
|
||||
- uses: actions/checkout@v1
|
||||
- name: get-artifact-list
|
||||
uses: actions/download-artifact@master
|
||||
with:
|
||||
name: artifact-list
|
||||
- name: build-clh
|
||||
run: |
|
||||
if grep -q $buildstr ./artifact-list/artifact-list.txt; then
|
||||
$GITHUB_WORKSPACE/.github/workflows/generate-artifact-tarball.sh $buildstr
|
||||
echo "artifact-built=true" >> $GITHUB_ENV
|
||||
else
|
||||
echo "artifact-built=false" >> $GITHUB_ENV
|
||||
fi
|
||||
- name: store-artifacts
|
||||
if: ${{ env.artifact-built }} == 'true'
|
||||
uses: actions/upload-artifact@master
|
||||
with:
|
||||
name: kata-artifacts
|
||||
path: kata-static-clh.tar.gz
|
||||
|
||||
# Job for building kata components
|
||||
build-kata-components:
|
||||
runs-on: ubuntu-16.04
|
||||
needs: get-artifact-list
|
||||
env:
|
||||
buildstr: "install_kata_components"
|
||||
steps:
|
||||
- uses: actions/checkout@v1
|
||||
- name: get-artifact-list
|
||||
uses: actions/download-artifact@master
|
||||
with:
|
||||
name: artifact-list
|
||||
- name: build-kata-components
|
||||
run: |
|
||||
if grep -q $buildstr ./artifact-list/artifact-list.txt; then
|
||||
$GITHUB_WORKSPACE/.github/workflows/generate-artifact-tarball.sh $buildstr
|
||||
echo "artifact-built=true" >> $GITHUB_ENV
|
||||
else
|
||||
echo "artifact-built=false" >> $GITHUB_ENV
|
||||
fi
|
||||
- name: store-artifacts
|
||||
if: ${{ env.artifact-built }} == 'true'
|
||||
uses: actions/upload-artifact@master
|
||||
with:
|
||||
name: kata-artifacts
|
||||
path: kata-static-kata-components.tar.gz
|
||||
|
||||
gather-artifacts:
|
||||
runs-on: ubuntu-16.04
|
||||
needs: [build-experimental-kernel, build-kernel, build-qemu, build-image, build-firecracker, build-kata-components, build-clh]
|
||||
steps:
|
||||
- uses: actions/checkout@v1
|
||||
- name: get-artifacts
|
||||
uses: actions/download-artifact@master
|
||||
with:
|
||||
name: kata-artifacts
|
||||
- name: colate-artifacts
|
||||
run: |
|
||||
$GITHUB_WORKSPACE/.github/workflows/gather-artifacts.sh
|
||||
- name: store-artifacts
|
||||
uses: actions/upload-artifact@master
|
||||
with:
|
||||
name: release-candidate
|
||||
path: kata-static.tar.xz
|
||||
|
||||
kata-deploy:
|
||||
needs: gather-artifacts
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: get-artifacts
|
||||
uses: actions/download-artifact@master
|
||||
with:
|
||||
name: release-candidate
|
||||
- name: build-and-push-kata-deploy-ci
|
||||
id: build-and-push-kata-deploy-ci
|
||||
run: |
|
||||
tag=$(echo $GITHUB_REF | cut -d/ -f3-)
|
||||
git clone https://github.com/kata-containers/packaging
|
||||
pushd packaging
|
||||
git checkout $tag
|
||||
pkg_sha=$(git rev-parse HEAD)
|
||||
popd
|
||||
mv release-candidate/kata-static.tar.xz ./packaging/kata-deploy/kata-static.tar.xz
|
||||
docker build --build-arg KATA_ARTIFACTS=kata-static.tar.xz -t katadocker/kata-deploy-ci:$pkg_sha -t quay.io/kata-containers/kata-deploy-ci:$pkg_sha ./packaging/kata-deploy
|
||||
docker login -u ${{ secrets.DOCKER_USERNAME }} -p ${{ secrets.DOCKER_PASSWORD }}
|
||||
docker push katadocker/kata-deploy-ci:$pkg_sha
|
||||
docker login -u ${{ secrets.QUAY_DEPLOYER_USERNAME }} -p ${{ secrets.QUAY_DEPLOYER_PASSWORD }} quay.io
|
||||
docker push quay.io/kata-containers/kata-deploy-ci:$pkg_sha
|
||||
echo "::set-output name=PKG_SHA::${pkg_sha}"
|
||||
- name: test-kata-deploy-ci-in-aks
|
||||
uses: ./packaging/kata-deploy/action
|
||||
with:
|
||||
packaging-sha: ${{steps.build-and-push-kata-deploy-ci.outputs.PKG_SHA}}
|
||||
env:
|
||||
PKG_SHA: ${{steps.build-and-push-kata-deploy-ci.outputs.PKG_SHA}}
|
||||
AZ_APPID: ${{ secrets.AZ_APPID }}
|
||||
AZ_PASSWORD: ${{ secrets.AZ_PASSWORD }}
|
||||
AZ_SUBSCRIPTION_ID: ${{ secrets.AZ_SUBSCRIPTION_ID }}
|
||||
AZ_TENANT_ID: ${{ secrets.AZ_TENANT_ID }}
|
||||
- name: push-tarball
|
||||
run: |
|
||||
# tag the container image we created and push to DockerHub
|
||||
tag=$(echo $GITHUB_REF | cut -d/ -f3-)
|
||||
docker tag katadocker/kata-deploy-ci:${{steps.build-and-push-kata-deploy-ci.outputs.PKG_SHA}} katadocker/kata-deploy:${tag}
|
||||
docker push katadocker/kata-deploy:${tag}
|
||||
|
||||
upload-static-tarball:
|
||||
needs: kata-deploy
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: download-artifacts
|
||||
uses: actions/download-artifact@master
|
||||
with:
|
||||
name: release-candidate
|
||||
- name: install hub
|
||||
run: |
|
||||
HUB_VER=$(curl -s "https://api.github.com/repos/github/hub/releases/latest" | jq -r .tag_name | sed 's/^v//')
|
||||
wget -q -O- https://github.com/github/hub/releases/download/v$HUB_VER/hub-linux-amd64-$HUB_VER.tgz | \
|
||||
tar xz --strip-components=2 --wildcards '*/bin/hub' && sudo mv hub /usr/local/bin/hub
|
||||
- name: push static tarball to github
|
||||
run: |
|
||||
tag=$(echo $GITHUB_REF | cut -d/ -f3-)
|
||||
tarball="kata-static-$tag-x86_64.tar.xz"
|
||||
repo="https://github.com/kata-containers/runtime.git"
|
||||
mv release-candidate/kata-static.tar.xz "release-candidate/${tarball}"
|
||||
git clone "${repo}"
|
||||
cd runtime
|
||||
echo "uploading asset '${tarball}' to '${repo}' tag: ${tag}"
|
||||
GITHUB_TOKEN=${{ secrets.GIT_UPLOAD_TOKEN }} hub release edit -m "" -a "../release-candidate/${tarball}" "${tag}"
|
||||
28
.github/workflows/release.yaml
vendored
28
.github/workflows/release.yaml
vendored
@@ -149,31 +149,3 @@ jobs:
|
||||
tar -cvzf "${tarball}" src/agent/.cargo/config src/agent/vendor
|
||||
GITHUB_TOKEN=${{ secrets.GIT_UPLOAD_TOKEN }} hub release edit -m "" -a "${tarball}" "${tag}"
|
||||
popd
|
||||
|
||||
upload-libseccomp-tarball:
|
||||
needs: upload-cargo-vendored-tarball
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: download-and-upload-tarball
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GIT_UPLOAD_TOKEN }}
|
||||
GOPATH: ${HOME}/go
|
||||
run: |
|
||||
pushd $GITHUB_WORKSPACE
|
||||
./ci/install_yq.sh
|
||||
tag=$(echo $GITHUB_REF | cut -d/ -f3-)
|
||||
versions_yaml="versions.yaml"
|
||||
version=$(${GOPATH}/bin/yq read ${versions_yaml} "externals.libseccomp.version")
|
||||
repo_url=$(${GOPATH}/bin/yq read ${versions_yaml} "externals.libseccomp.url")
|
||||
download_url="${repo_url}/releases/download/v${version}"
|
||||
tarball="libseccomp-${version}.tar.gz"
|
||||
asc="${tarball}.asc"
|
||||
curl -sSLO "${download_url}/${tarball}"
|
||||
curl -sSLO "${download_url}/${asc}"
|
||||
# "-m" option should be empty to re-use the existing release title
|
||||
# without opening a text editor.
|
||||
# For the details, check https://hub.github.com/hub-release.1.html.
|
||||
hub release edit -m "" -a "${tarball}" "${tag}"
|
||||
hub release edit -m "" -a "${asc}" "${tag}"
|
||||
popd
|
||||
|
||||
@@ -12,7 +12,8 @@ on:
|
||||
- reopened
|
||||
- labeled
|
||||
- unlabeled
|
||||
branches:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
|
||||
jobs:
|
||||
@@ -31,6 +32,8 @@ jobs:
|
||||
|
||||
- name: Checkout code to allow hub to communicate with the project
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
token: ${{ secrets.KATA_GITHUB_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Install porting checker script
|
||||
run: |
|
||||
|
||||
16
.github/workflows/static-checks.yaml
vendored
16
.github/workflows/static-checks.yaml
vendored
@@ -13,7 +13,7 @@ jobs:
|
||||
test:
|
||||
strategy:
|
||||
matrix:
|
||||
go-version: [1.16.x, 1.17.x]
|
||||
go-version: [1.15.x, 1.16.x]
|
||||
os: [ubuntu-20.04]
|
||||
runs-on: ${{ matrix.os }}
|
||||
env:
|
||||
@@ -60,21 +60,13 @@ jobs:
|
||||
cd ${GOPATH}/src/github.com/${{ github.repository }} && ./ci/setup.sh
|
||||
env:
|
||||
GOPATH: ${{ runner.workspace }}/kata-containers
|
||||
- name: Installing rust
|
||||
- name: Building rust
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
|
||||
run: |
|
||||
cd ${GOPATH}/src/github.com/${{ github.repository }} && ./ci/install_rust.sh
|
||||
PATH=$PATH:"$HOME/.cargo/bin"
|
||||
rustup target add x86_64-unknown-linux-musl
|
||||
rustup component add rustfmt clippy
|
||||
- name: Setup seccomp
|
||||
run: |
|
||||
libseccomp_install_dir=$(mktemp -d -t libseccomp.XXXXXXXXXX)
|
||||
gperf_install_dir=$(mktemp -d -t gperf.XXXXXXXXXX)
|
||||
cd ${GOPATH}/src/github.com/${{ github.repository }} && ./ci/install_libseccomp.sh "${libseccomp_install_dir}" "${gperf_install_dir}"
|
||||
echo "Set environment variables for the libseccomp crate to link the libseccomp library statically"
|
||||
echo "LIBSECCOMP_LINK_TYPE=static" >> $GITHUB_ENV
|
||||
echo "LIBSECCOMP_LIB_PATH=${libseccomp_install_dir}/lib" >> $GITHUB_ENV
|
||||
# Check whether the vendored code is up-to-date & working as the first thing
|
||||
- name: Check vendored code
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
|
||||
@@ -92,7 +84,3 @@ jobs:
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
|
||||
run: |
|
||||
cd ${GOPATH}/src/github.com/${{ github.repository }} && make test
|
||||
- name: Run Unit Tests As Root User
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
|
||||
run: |
|
||||
cd ${GOPATH}/src/github.com/${{ github.repository }} && sudo -E PATH="$PATH" make test
|
||||
|
||||
26
Makefile
26
Makefile
@@ -8,24 +8,18 @@ COMPONENTS =
|
||||
|
||||
COMPONENTS += agent
|
||||
COMPONENTS += runtime
|
||||
COMPONENTS += trace-forwarder
|
||||
|
||||
# List of available tools
|
||||
TOOLS =
|
||||
|
||||
TOOLS += agent-ctl
|
||||
TOOLS += trace-forwarder
|
||||
|
||||
STANDARD_TARGETS = build check clean install test vendor
|
||||
|
||||
default: all
|
||||
|
||||
all: logging-crate-tests build
|
||||
|
||||
logging-crate-tests:
|
||||
make -C src/libs/logging
|
||||
|
||||
include utils.mk
|
||||
include ./tools/packaging/kata-deploy/local-build/Makefile
|
||||
|
||||
all: build
|
||||
|
||||
# Create the rules
|
||||
$(eval $(call create_all_rules,$(COMPONENTS),$(TOOLS),$(STANDARD_TARGETS)))
|
||||
@@ -39,10 +33,10 @@ generate-protocols:
|
||||
static-checks: build
|
||||
bash ci/static-checks.sh
|
||||
|
||||
.PHONY: \
|
||||
all \
|
||||
binary-tarball \
|
||||
default \
|
||||
install-binary-tarball \
|
||||
logging-crate-tests \
|
||||
static-checks
|
||||
binary-tarball:
|
||||
make -f ./tools/packaging/kata-deploy/local-build/Makefile
|
||||
|
||||
install-binary-tarball:
|
||||
make -f ./tools/packaging/kata-deploy/local-build/Makefile install
|
||||
|
||||
.PHONY: all default static-checks binary-tarball install-binary-tarball
|
||||
|
||||
@@ -70,8 +70,8 @@ The table below lists the remaining parts of the project:
|
||||
| [packaging](tools/packaging) | infrastructure | Scripts and metadata for producing packaged binaries<br/>(components, hypervisors, kernel and rootfs). |
|
||||
| [kernel](https://www.kernel.org) | kernel | Linux kernel used by the hypervisor to boot the guest image. Patches are stored [here](tools/packaging/kernel). |
|
||||
| [osbuilder](tools/osbuilder) | infrastructure | Tool to create "mini O/S" rootfs and initrd images and kernel for the hypervisor. |
|
||||
| [`agent-ctl`](src/tools/agent-ctl) | utility | Tool that provides low-level access for testing the agent. |
|
||||
| [`trace-forwarder`](src/tools/trace-forwarder) | utility | Agent tracing helper. |
|
||||
| [`agent-ctl`](tools/agent-ctl) | utility | Tool that provides low-level access for testing the agent. |
|
||||
| [`trace-forwarder`](src/trace-forwarder) | utility | Agent tracing helper. |
|
||||
| [`ci`](https://github.com/kata-containers/ci) | CI | Continuous Integration configuration files and scripts. |
|
||||
| [`katacontainers.io`](https://github.com/kata-containers/www.katacontainers.io) | Source for the [`katacontainers.io`](https://www.katacontainers.io) site. |
|
||||
|
||||
|
||||
30
ci/go-no-os-exit.sh
Executable file
30
ci/go-no-os-exit.sh
Executable file
@@ -0,0 +1,30 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2018 Intel Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Check there are no os.Exit() calls creeping into the code
|
||||
# We don't use that exit path in the Kata codebase.
|
||||
|
||||
# Allow the path to check to be over-ridden.
|
||||
# Default to the current directory.
|
||||
go_packages=${1:-.}
|
||||
|
||||
echo "Checking for no os.Exit() calls for package [${go_packages}]"
|
||||
|
||||
candidates=`go list -f '{{.Dir}}/*.go' $go_packages`
|
||||
for f in $candidates; do
|
||||
filename=`basename $f`
|
||||
# skip all go test files
|
||||
[[ $filename == *_test.go ]] && continue
|
||||
# skip exit.go where, the only file we should call os.Exit() from.
|
||||
[[ $filename == "exit.go" ]] && continue
|
||||
files="$f $files"
|
||||
done
|
||||
|
||||
[ -z "$files" ] && echo "No files to check, skipping" && exit 0
|
||||
|
||||
if egrep -n '\<os\.Exit\>' $files; then
|
||||
echo "Direct calls to os.Exit() are forbidden, please use exit() so atexit() works"
|
||||
exit 1
|
||||
fi
|
||||
@@ -1,4 +1,3 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Copyright (c) 2020 Intel Corporation
|
||||
#
|
||||
|
||||
@@ -1,110 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Copyright 2021 Sony Group Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
|
||||
set -o errexit
|
||||
|
||||
cidir=$(dirname "$0")
|
||||
source "${cidir}/lib.sh"
|
||||
|
||||
clone_tests_repo
|
||||
|
||||
source "${tests_repo_dir}/.ci/lib.sh"
|
||||
|
||||
# The following variables if set on the environment will change the behavior
|
||||
# of gperf and libseccomp configure scripts, that may lead this script to
|
||||
# fail. So let's ensure they are unset here.
|
||||
unset PREFIX DESTDIR
|
||||
|
||||
arch=$(uname -m)
|
||||
workdir="$(mktemp -d --tmpdir build-libseccomp.XXXXX)"
|
||||
|
||||
# Variables for libseccomp
|
||||
# Currently, specify the libseccomp version directly without using `versions.yaml`
|
||||
# because the current Snap workflow is incomplete.
|
||||
# After solving the issue, replace this code by using the `versions.yaml`.
|
||||
# libseccomp_version=$(get_version "externals.libseccomp.version")
|
||||
# libseccomp_url=$(get_version "externals.libseccomp.url")
|
||||
libseccomp_version="2.5.1"
|
||||
libseccomp_url="https://github.com/seccomp/libseccomp"
|
||||
libseccomp_tarball="libseccomp-${libseccomp_version}.tar.gz"
|
||||
libseccomp_tarball_url="${libseccomp_url}/releases/download/v${libseccomp_version}/${libseccomp_tarball}"
|
||||
cflags="-O2"
|
||||
|
||||
# Variables for gperf
|
||||
# Currently, specify the gperf version directly without using `versions.yaml`
|
||||
# because the current Snap workflow is incomplete.
|
||||
# After solving the issue, replace this code by using the `versions.yaml`.
|
||||
# gperf_version=$(get_version "externals.gperf.version")
|
||||
# gperf_url=$(get_version "externals.gperf.url")
|
||||
gperf_version="3.1"
|
||||
# XXX: gnu.org currently unavailable - see https://github.com/kata-containers/kata-containers/issues/3314
|
||||
gperf_url="https://www.mirrorservice.org/sites/ftp.gnu.org/gnu/gperf"
|
||||
gperf_tarball="gperf-${gperf_version}.tar.gz"
|
||||
gperf_tarball_url="${gperf_url}/${gperf_tarball}"
|
||||
|
||||
# We need to build the libseccomp library from sources to create a static library for the musl libc.
|
||||
# However, ppc64le and s390x have no musl targets in Rust. Hence, we do not set cflags for the musl libc.
|
||||
if ([ "${arch}" != "ppc64le" ] && [ "${arch}" != "s390x" ]); then
|
||||
# Set FORTIFY_SOURCE=1 because the musl-libc does not have some functions about FORTIFY_SOURCE=2
|
||||
cflags="-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=1 -O2"
|
||||
fi
|
||||
|
||||
die() {
|
||||
msg="$*"
|
||||
echo "[Error] ${msg}" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
finish() {
|
||||
rm -rf "${workdir}"
|
||||
}
|
||||
|
||||
trap finish EXIT
|
||||
|
||||
build_and_install_gperf() {
|
||||
echo "Build and install gperf version ${gperf_version}"
|
||||
mkdir -p "${gperf_install_dir}"
|
||||
curl -sLO "${gperf_tarball_url}"
|
||||
tar -xf "${gperf_tarball}"
|
||||
pushd "gperf-${gperf_version}"
|
||||
./configure --prefix="${gperf_install_dir}"
|
||||
make
|
||||
make install
|
||||
export PATH=$PATH:"${gperf_install_dir}"/bin
|
||||
popd
|
||||
echo "Gperf installed successfully"
|
||||
}
|
||||
|
||||
build_and_install_libseccomp() {
|
||||
echo "Build and install libseccomp version ${libseccomp_version}"
|
||||
mkdir -p "${libseccomp_install_dir}"
|
||||
curl -sLO "${libseccomp_tarball_url}"
|
||||
tar -xf "${libseccomp_tarball}"
|
||||
pushd "libseccomp-${libseccomp_version}"
|
||||
./configure --prefix="${libseccomp_install_dir}" CFLAGS="${cflags}" --enable-static
|
||||
make
|
||||
make install
|
||||
popd
|
||||
echo "Libseccomp installed successfully"
|
||||
}
|
||||
|
||||
main() {
|
||||
local libseccomp_install_dir="${1:-}"
|
||||
local gperf_install_dir="${2:-}"
|
||||
|
||||
if [ -z "${libseccomp_install_dir}" ] || [ -z "${gperf_install_dir}" ]; then
|
||||
die "Usage: ${0} <libseccomp-install-dir> <gperf-install-dir>"
|
||||
fi
|
||||
|
||||
pushd "$workdir"
|
||||
# gperf is required for building the libseccomp.
|
||||
build_and_install_gperf
|
||||
build_and_install_libseccomp
|
||||
popd
|
||||
}
|
||||
|
||||
main "$@"
|
||||
@@ -12,5 +12,5 @@ source "${cidir}/lib.sh"
|
||||
clone_tests_repo
|
||||
|
||||
pushd ${tests_repo_dir}
|
||||
.ci/install_rust.sh ${1:-}
|
||||
.ci/install_rust.sh
|
||||
popd
|
||||
|
||||
@@ -4,11 +4,6 @@
|
||||
#
|
||||
# This is the build root image for Kata Containers on OpenShift CI.
|
||||
#
|
||||
FROM registry.centos.org/centos:8
|
||||
FROM centos:8
|
||||
|
||||
RUN yum -y update && \
|
||||
yum -y install \
|
||||
git \
|
||||
sudo \
|
||||
wget && \
|
||||
yum clean all
|
||||
RUN yum -y update && yum -y install git sudo wget
|
||||
|
||||
@@ -86,16 +86,6 @@ One of the `initrd` and `image` options in Kata runtime config file **MUST** be
|
||||
The main difference between the options is that the size of `initrd`(10MB+) is significantly smaller than
|
||||
rootfs `image`(100MB+).
|
||||
|
||||
## Enable seccomp
|
||||
|
||||
Enable seccomp as follows:
|
||||
|
||||
```
|
||||
$ sudo sed -i '/^disable_guest_seccomp/ s/true/false/' /etc/kata-containers/configuration.toml
|
||||
```
|
||||
|
||||
This will pass container seccomp profiles to the kata agent.
|
||||
|
||||
## Enable full debug
|
||||
|
||||
Enable full debug as follows:
|
||||
@@ -226,18 +216,6 @@ $ go get -d -u github.com/kata-containers/kata-containers
|
||||
$ cd $GOPATH/src/github.com/kata-containers/kata-containers/src/agent && make
|
||||
```
|
||||
|
||||
The agent is built with seccomp capability by default.
|
||||
If you want to build the agent without the seccomp capability, you need to run `make` with `SECCOMP=no` as follows.
|
||||
|
||||
```
|
||||
$ make -C $GOPATH/src/github.com/kata-containers/kata-containers/src/agent SECCOMP=no
|
||||
```
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
> - If you enable seccomp in the main configuration file but build the agent without seccomp capability,
|
||||
> the runtime exits conservatively with an error message.
|
||||
|
||||
## Get the osbuilder
|
||||
|
||||
```
|
||||
@@ -256,21 +234,9 @@ the following example.
|
||||
$ export ROOTFS_DIR=${GOPATH}/src/github.com/kata-containers/kata-containers/tools/osbuilder/rootfs-builder/rootfs
|
||||
$ sudo rm -rf ${ROOTFS_DIR}
|
||||
$ cd $GOPATH/src/github.com/kata-containers/kata-containers/tools/osbuilder/rootfs-builder
|
||||
$ script -fec 'sudo -E GOPATH=$GOPATH USE_DOCKER=true ./rootfs.sh ${distro}'
|
||||
```
|
||||
|
||||
You MUST choose a distribution (e.g., `ubuntu`) for `${distro}`.
|
||||
You can get a supported distributions list in the Kata Containers by running the following.
|
||||
|
||||
```
|
||||
$ ./rootfs.sh -l
|
||||
```
|
||||
|
||||
If you want to build the agent without seccomp capability, you need to run the `rootfs.sh` script with `SECCOMP=no` as follows.
|
||||
|
||||
```
|
||||
$ script -fec 'sudo -E GOPATH=$GOPATH AGENT_INIT=yes USE_DOCKER=true SECCOMP=no ./rootfs.sh ${distro}'
|
||||
$ script -fec 'sudo -E GOPATH=$GOPATH USE_DOCKER=true SECCOMP=no ./rootfs.sh ${distro}'
|
||||
```
|
||||
You MUST choose one of `alpine`, `centos`, `clearlinux`, `debian`, `euleros`, `fedora`, `suse`, and `ubuntu` for `${distro}`. By default `seccomp` packages are not included in the rootfs image. Set `SECCOMP` to `yes` to include them.
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
@@ -306,7 +272,6 @@ $ script -fec 'sudo -E USE_DOCKER=true ./image_builder.sh ${ROOTFS_DIR}'
|
||||
> - If you do *not* wish to build under Docker, remove the `USE_DOCKER`
|
||||
> variable in the previous command and ensure the `qemu-img` command is
|
||||
> available on your system.
|
||||
> - If `qemu-img` is not installed, you will likely see errors such as `ERROR: File /dev/loop19p1 is not a block device` and `losetup: /tmp/tmp.bHz11oY851: Warning: file is smaller than 512 bytes; the loop device may be useless or invisible for system tools`. These can be mitigated by installing the `qemu-img` command (available in the `qemu-img` package on Fedora or the `qemu-utils` package on Debian).
|
||||
|
||||
|
||||
### Install the rootfs image
|
||||
@@ -325,23 +290,12 @@ $ (cd /usr/share/kata-containers && sudo ln -sf "$image" kata-containers.img)
|
||||
$ export ROOTFS_DIR="${GOPATH}/src/github.com/kata-containers/kata-containers/tools/osbuilder/rootfs-builder/rootfs"
|
||||
$ sudo rm -rf ${ROOTFS_DIR}
|
||||
$ cd $GOPATH/src/github.com/kata-containers/kata-containers/tools/osbuilder/rootfs-builder
|
||||
$ script -fec 'sudo -E GOPATH=$GOPATH AGENT_INIT=yes USE_DOCKER=true ./rootfs.sh ${distro}'
|
||||
```
|
||||
`AGENT_INIT` controls if the guest image uses the Kata agent as the guest `init` process. When you create an initrd image,
|
||||
always set `AGENT_INIT` to `yes`.
|
||||
|
||||
You MUST choose a distribution (e.g., `ubuntu`) for `${distro}`.
|
||||
You can get a supported distributions list in the Kata Containers by running the following.
|
||||
|
||||
```
|
||||
$ ./rootfs.sh -l
|
||||
```
|
||||
|
||||
If you want to build the agent without seccomp capability, you need to run the `rootfs.sh` script with `SECCOMP=no` as follows.
|
||||
|
||||
```
|
||||
$ script -fec 'sudo -E GOPATH=$GOPATH AGENT_INIT=yes USE_DOCKER=true SECCOMP=no ./rootfs.sh ${distro}'
|
||||
```
|
||||
`AGENT_INIT` controls if the guest image uses the Kata agent as the guest `init` process. When you create an initrd image,
|
||||
always set `AGENT_INIT` to `yes`. By default `seccomp` packages are not included in the initrd image. Set `SECCOMP` to `yes` to include them.
|
||||
|
||||
You MUST choose one of `alpine`, `centos`, `clearlinux`, `euleros`, and `fedora` for `${distro}`.
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
|
||||
@@ -86,6 +86,21 @@ All other configurations are supported and are working properly.
|
||||
|
||||
## Networking
|
||||
|
||||
### Docker swarm and compose support
|
||||
|
||||
The newest version of Docker supported is specified by the
|
||||
`externals.docker.version` variable in the
|
||||
[versions database](https://github.com/kata-containers/runtime/blob/master/versions.yaml).
|
||||
|
||||
Basic Docker swarm support works. However, if you want to use custom networks
|
||||
with Docker's swarm, an older version of Docker is required. This is specified
|
||||
by the `externals.docker.meta.swarm-version` variable in the
|
||||
[versions database](https://github.com/kata-containers/runtime/blob/master/versions.yaml).
|
||||
|
||||
See issue https://github.com/kata-containers/runtime/issues/175 for more information.
|
||||
|
||||
Docker compose normally uses custom networks, so also has the same limitations.
|
||||
|
||||
## Resource management
|
||||
|
||||
Due to the way VMs differ in their CPU and memory allocation, and sharing
|
||||
|
||||
@@ -11,10 +11,6 @@ For details of the other Kata Containers repositories, see the
|
||||
|
||||
* [Installation guides](./install/README.md): Install and run Kata Containers with Docker or Kubernetes
|
||||
|
||||
## Tracing
|
||||
|
||||
See the [tracing documentation](tracing.md).
|
||||
|
||||
## More User Guides
|
||||
|
||||
* [Upgrading](Upgrading.md): how to upgrade from [Clear Containers](https://github.com/clearcontainers) and [runV](https://github.com/hyperhq/runv) to [Kata Containers](https://github.com/kata-containers) and how to upgrade an existing Kata Containers system to the latest version.
|
||||
@@ -41,10 +37,9 @@ Documents that help to understand and contribute to Kata Containers.
|
||||
|
||||
### Design and Implementations
|
||||
|
||||
* [Kata Containers Architecture](design/architecture): Architectural overview of Kata Containers
|
||||
* [Kata Containers Architecture](design/architecture.md): Architectural overview of Kata Containers
|
||||
* [Kata Containers E2E Flow](design/end-to-end-flow.md): The entire end-to-end flow of Kata Containers
|
||||
* [Kata Containers design](./design/README.md): More Kata Containers design documents
|
||||
* [Kata Containers threat model](./threat-model/threat-model.md): Kata Containers threat model
|
||||
|
||||
### How to Contribute
|
||||
|
||||
@@ -52,18 +47,6 @@ Documents that help to understand and contribute to Kata Containers.
|
||||
* [How to contribute to Kata Containers](https://github.com/kata-containers/community/blob/master/CONTRIBUTING.md)
|
||||
* [Code of Conduct](../CODE_OF_CONDUCT.md)
|
||||
|
||||
## Help Writing a Code PR
|
||||
|
||||
* [Code PR advice](code-pr-advice.md).
|
||||
|
||||
## Help Writing Unit Tests
|
||||
|
||||
* [Unit Test Advice](Unit-Test-Advice.md)
|
||||
|
||||
## Help Improving the Documents
|
||||
|
||||
* [Documentation Requirements](Documentation-Requirements.md)
|
||||
|
||||
### Code Licensing
|
||||
|
||||
* [Licensing](Licensing-strategy.md): About the licensing strategy of Kata Containers.
|
||||
@@ -73,6 +56,10 @@ Documents that help to understand and contribute to Kata Containers.
|
||||
* [Release strategy](Stable-Branch-Strategy.md)
|
||||
* [Release Process](Release-Process.md)
|
||||
|
||||
## Help Improving the Documents
|
||||
|
||||
* [Documentation Requirements](Documentation-Requirements.md)
|
||||
|
||||
## Website Changes
|
||||
|
||||
If you have a suggestion for how we can improve the
|
||||
|
||||
@@ -64,7 +64,7 @@
|
||||
|
||||
### Check Git-hub Actions
|
||||
|
||||
We make use of [GitHub actions](https://github.com/features/actions) in this [file](https://github.com/kata-containers/kata-containers/blob/main/.github/workflows/release.yaml) in the `kata-containers/kata-containers` repository to build and upload release artifacts. This action is auto triggered with the above step when a new tag is pushed to the `kata-containers/kata-containers` repository.
|
||||
We make use of [GitHub actions](https://github.com/features/actions) in this [file](https://github.com/kata-containers/kata-containers/blob/main/.github/workflows/main.yaml) in the `kata-containers/kata-containers` repository to build and upload release artifacts. This action is auto triggered with the above step when a new tag is pushed to the `kata-containers/kata-containers` repository.
|
||||
|
||||
Check the [actions status page](https://github.com/kata-containers/kata-containers/actions) to verify all steps in the actions workflow have completed successfully. On success, a static tarball containing Kata release artifacts will be uploaded to the [Release page](https://github.com/kata-containers/kata-containers/releases).
|
||||
|
||||
|
||||
@@ -120,7 +120,7 @@ stable and main. While this is not in place currently, it should be considered i
|
||||
|
||||
### Patch releases
|
||||
|
||||
Releases are made every four weeks, which include a GitHub release as
|
||||
Releases are made every three weeks, which include a GitHub release as
|
||||
well as binary packages. These patch releases are made for both stable branches, and a "release candidate"
|
||||
for the next `MAJOR` or `MINOR` is created from main. If there are no changes across all the repositories, no
|
||||
release is created and an announcement is made on the developer mailing list to highlight this.
|
||||
@@ -136,7 +136,8 @@ The process followed for making a release can be found at [Release Process](Rele
|
||||
|
||||
### Frequency
|
||||
Minor releases are less frequent in order to provide a more stable baseline for users. They are currently
|
||||
running on a sixteen weeks cadence. The release schedule can be seen on the
|
||||
running on a twelve week cadence. As the Kata Containers code base has reached a certain level of
|
||||
maturity, we have increased the cadence from six weeks to twelve weeks. The release schedule can be seen on the
|
||||
[release rotation wiki page](https://github.com/kata-containers/community/wiki/Release-Team-Rota).
|
||||
|
||||
### Compatibility
|
||||
|
||||
@@ -1,379 +0,0 @@
|
||||
# Unit Test Advice
|
||||
|
||||
## Overview
|
||||
|
||||
This document offers advice on writing a Unit Test (UT) in
|
||||
[Golang](https://golang.org) and [Rust](https://www.rust-lang.org).
|
||||
|
||||
## General advice
|
||||
|
||||
### Unit test strategies
|
||||
|
||||
#### Positive and negative tests
|
||||
|
||||
Always add positive tests (where success is expected) *and* negative
|
||||
tests (where failure is expected).
|
||||
|
||||
#### Boundary condition tests
|
||||
|
||||
Try to add unit tests that exercise boundary conditions such as:
|
||||
|
||||
- Missing values (`null` or `None`).
|
||||
- Empty strings and huge strings.
|
||||
- Empty (or uninitialised) complex data structures
|
||||
(such as lists, vectors and hash tables).
|
||||
- Common numeric values (such as `-1`, `0`, `1` and the minimum and
|
||||
maximum values).
|
||||
|
||||
#### Test unusual values
|
||||
|
||||
Also always consider "unusual" input values such as:
|
||||
|
||||
- String values containing spaces, Unicode characters, special
|
||||
characters, escaped characters or null bytes.
|
||||
|
||||
> **Note:** Consider these unusual values in prefix, infix and
|
||||
> suffix position.
|
||||
|
||||
- String values that cannot be converted into numeric values or which
|
||||
contain invalid structured data (such as invalid JSON).
|
||||
|
||||
#### Other types of tests
|
||||
|
||||
If the code requires other forms of testing (such as stress testing,
|
||||
fuzz testing and integration testing), raise a GitHub issue and
|
||||
reference it on the issue you are using for the main work. This
|
||||
ensures the test team are aware that a new test is required.
|
||||
|
||||
### Test environment
|
||||
|
||||
#### Create unique files and directories
|
||||
|
||||
Ensure your tests do not write to a fixed file or directory. This can
|
||||
cause problems when running multiple tests simultaneously and also
|
||||
when running tests after a previous test run failure.
|
||||
|
||||
#### Assume parallel testing
|
||||
|
||||
Always assume your tests will be run *in parallel*. If this is
|
||||
problematic for a test, force it to run in isolation using the
|
||||
`serial_test` crate for Rust code for example.
|
||||
|
||||
### Running
|
||||
|
||||
Ensure you run the unit tests and they all pass before raising a PR.
|
||||
Ideally do this on different distributions on different architectures
|
||||
to maximise coverage (and so minimise surprises when your code runs in
|
||||
the CI).
|
||||
|
||||
## Assertions
|
||||
|
||||
### Golang assertions
|
||||
|
||||
Use the `testify` assertions package to create a new assertion object as this
|
||||
keeps the test code free from distracting `if` tests:
|
||||
|
||||
```go
|
||||
func TestSomething(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
err := doSomething()
|
||||
assert.NoError(err)
|
||||
}
|
||||
```
|
||||
|
||||
### Rust assertions
|
||||
|
||||
Use the standard set of `assert!()` macros.
|
||||
|
||||
## Table driven tests
|
||||
|
||||
Try to write tests using a table-based approach. This allows you to distill
|
||||
the logic into a compact table (rather than spreading the tests across
|
||||
multiple test functions). It also makes it easy to cover all the
|
||||
interesting boundary conditions:
|
||||
|
||||
### Golang table driven tests
|
||||
|
||||
Assume the following function:
|
||||
|
||||
```go
|
||||
// The function under test.
|
||||
//
|
||||
// Accepts a string and an integer and returns the
|
||||
// result of sticking them together separated by a dash as a string.
|
||||
func joinParamsWithDash(str string, num int) (string, error) {
|
||||
if str == "" {
|
||||
return "", errors.New("string cannot be blank")
|
||||
}
|
||||
|
||||
if num <= 0 {
|
||||
return "", errors.New("number must be positive")
|
||||
}
|
||||
|
||||
return fmt.Sprintf("%s-%d", str, num), nil
|
||||
}
|
||||
```
|
||||
|
||||
A table driven approach to testing it:
|
||||
|
||||
```go
|
||||
import (
|
||||
"testing"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestJoinParamsWithDash(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
// Type used to hold function parameters and expected results.
|
||||
type testData struct {
|
||||
param1 string
|
||||
param2 int
|
||||
expectedResult string
|
||||
expectError bool
|
||||
}
|
||||
|
||||
// List of tests to run including the expected results
|
||||
data := []testData{
|
||||
// Failure scenarios
|
||||
{"", -1, "", true},
|
||||
{"", 0, "", true},
|
||||
{"", 1, "", true},
|
||||
{"foo", 0, "", true},
|
||||
{"foo", -1, "", true},
|
||||
|
||||
// Success scenarios
|
||||
{"foo", 1, "foo-1", false},
|
||||
{"bar", 42, "bar-42", false},
|
||||
}
|
||||
|
||||
// Run the tests
|
||||
for i, d := range data {
|
||||
// Create a test-specific string that is added to each assert
|
||||
// call. It will be displayed if any assert test fails.
|
||||
msg := fmt.Sprintf("test[%d]: %+v", i, d)
|
||||
|
||||
// Call the function under test
|
||||
result, err := joinParamsWithDash(d.param1, d.param2)
|
||||
|
||||
// update the message for more information on failure
|
||||
msg = fmt.Sprintf("%s, result: %q, err: %v", msg, result, err)
|
||||
|
||||
if d.expectError {
|
||||
assert.Error(err, msg)
|
||||
|
||||
// If an error is expected, there is no point
|
||||
// performing additional checks.
|
||||
continue
|
||||
}
|
||||
|
||||
assert.NoError(err, msg)
|
||||
assert.Equal(d.expectedResult, result, msg)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Rust table driven tests
|
||||
|
||||
Assume the following function:
|
||||
|
||||
```rust
|
||||
// Convenience type to allow Result return types to only specify the type
|
||||
// for the true case; failures are specified as static strings.
|
||||
// XXX: This is an example. In real code use the "anyhow" and
|
||||
// XXX: "thiserror" crates.
|
||||
pub type Result<T> = std::result::Result<T, &'static str>;
|
||||
|
||||
// The function under test.
|
||||
//
|
||||
// Accepts a string and an integer and returns the
|
||||
// result of sticking them together separated by a dash as a string.
|
||||
fn join_params_with_dash(str: &str, num: i32) -> Result<String> {
|
||||
if str.is_empty() {
|
||||
return Err("string cannot be blank");
|
||||
}
|
||||
|
||||
if num <= 0 {
|
||||
return Err("number must be positive");
|
||||
}
|
||||
|
||||
let result = format!("{}-{}", str, num);
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
A table driven approach to testing it:
|
||||
|
||||
```rust
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_join_params_with_dash() {
|
||||
// This is a type used to record all details of the inputs
|
||||
// and outputs of the function under test.
|
||||
#[derive(Debug)]
|
||||
struct TestData<'a> {
|
||||
str: &'a str,
|
||||
num: i32,
|
||||
result: Result<String>,
|
||||
}
|
||||
|
||||
// The tests can now be specified as a set of inputs and outputs
|
||||
let tests = &[
|
||||
// Failure scenarios
|
||||
TestData {
|
||||
str: "",
|
||||
num: 0,
|
||||
result: Err("string cannot be blank"),
|
||||
},
|
||||
TestData {
|
||||
str: "foo",
|
||||
num: -1,
|
||||
result: Err("number must be positive"),
|
||||
},
|
||||
|
||||
// Success scenarios
|
||||
TestData {
|
||||
str: "foo",
|
||||
num: 42,
|
||||
result: Ok("foo-42".to_string()),
|
||||
},
|
||||
TestData {
|
||||
str: "-",
|
||||
num: 1,
|
||||
result: Ok("--1".to_string()),
|
||||
},
|
||||
];
|
||||
|
||||
// Run the tests
|
||||
for (i, d) in tests.iter().enumerate() {
|
||||
// Create a string containing details of the test
|
||||
let msg = format!("test[{}]: {:?}", i, d);
|
||||
|
||||
// Call the function under test
|
||||
let result = join_params_with_dash(d.str, d.num);
|
||||
|
||||
// Update the test details string with the results of the call
|
||||
let msg = format!("{}, result: {:?}", msg, result);
|
||||
|
||||
// Perform the checks
|
||||
if d.result.is_ok() {
|
||||
assert!(result == d.result, msg);
|
||||
continue;
|
||||
}
|
||||
|
||||
let expected_error = format!("{}", d.result.as_ref().unwrap_err());
|
||||
let actual_error = format!("{}", result.unwrap_err());
|
||||
assert!(actual_error == expected_error, msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Temporary files
|
||||
|
||||
Always delete temporary files on success.
|
||||
|
||||
### Golang temporary files
|
||||
|
||||
```go
|
||||
func TestSomething(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
// Create a temporary directory
|
||||
tmpdir, err := os.MkdirTemp("", "")
|
||||
assert.NoError(err)
|
||||
|
||||
// Delete it at the end of the test
|
||||
defer os.RemoveAll(tmpdir)
|
||||
|
||||
// Add test logic that will use the tmpdir here...
|
||||
}
|
||||
```
|
||||
|
||||
### Rust temporary files
|
||||
|
||||
Use the `tempfile` crate which allows files and directories to be deleted
|
||||
automatically:
|
||||
|
||||
```rust
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tempfile::tempdir;
|
||||
|
||||
#[test]
|
||||
fn test_something() {
|
||||
|
||||
// Create a temporary directory (which will be deleted automatically
|
||||
let dir = tempdir().expect("failed to create tmpdir");
|
||||
|
||||
let filename = dir.path().join("file.txt");
|
||||
|
||||
// create filename ...
|
||||
}
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
## Test user
|
||||
|
||||
[Unit tests are run *twice*](https://github.com/kata-containers/tests/blob/main/.ci/go-test.sh):
|
||||
|
||||
- as the current user
|
||||
- as the `root` user (if different to the current user)
|
||||
|
||||
When writing a test consider which user should run it; even if the code the
|
||||
test is exercising runs as `root`, it may be necessary to *only* run the test
|
||||
as a non-`root` for the test to be meaningful. Add appropriate skip
|
||||
guards around code that requires `root` and non-`root` so that the test
|
||||
will run if the correct type of user is detected and skipped if not.
|
||||
|
||||
### Run Golang tests as a different user
|
||||
|
||||
The main repository has the most comprehensive set of skip abilities. See:
|
||||
|
||||
- https://github.com/kata-containers/kata-containers/tree/main/src/runtime/pkg/katatestutils
|
||||
|
||||
### Run Rust tests as a different user
|
||||
|
||||
One method is to use the `nix` crate along with some custom macros:
|
||||
|
||||
```
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
#[allow(unused_macros)]
|
||||
macro_rules! skip_if_root {
|
||||
() => {
|
||||
if nix::unistd::Uid::effective().is_root() {
|
||||
println!("INFO: skipping {} which needs non-root", module_path!());
|
||||
return;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#[allow(unused_macros)]
|
||||
macro_rules! skip_if_not_root {
|
||||
() => {
|
||||
if !nix::unistd::Uid::effective().is_root() {
|
||||
println!("INFO: skipping {} which needs root", module_path!());
|
||||
return;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_that_must_be_run_as_root() {
|
||||
// Not running as the superuser, so skip.
|
||||
skip_if_not_root!();
|
||||
|
||||
// Run test *iff* the user running the test is root
|
||||
|
||||
// ...
|
||||
}
|
||||
}
|
||||
```
|
||||
@@ -102,7 +102,7 @@ first
|
||||
[install the latest release](#determine-latest-version).
|
||||
|
||||
See the
|
||||
[manual installation documentation](install/README.md#manual-installation)
|
||||
[manual installation installation documentation](install/README.md#manual-installation)
|
||||
for details on how to automatically install and configuration a static release
|
||||
with containerd.
|
||||
|
||||
@@ -114,7 +114,7 @@ with containerd.
|
||||
> kernel or image.
|
||||
|
||||
If you are using custom
|
||||
[guest assets](design/architecture/README.md#guest-assets),
|
||||
[guest assets](design/architecture.md#guest-assets),
|
||||
you must upgrade them to work with Kata Containers 2.x since Kata
|
||||
Containers 1.x assets will **not** work.
|
||||
|
||||
|
||||
@@ -1,247 +0,0 @@
|
||||
# Code PR Advice
|
||||
|
||||
Before raising a PR containing code changes, we suggest you consider
|
||||
the following to ensure a smooth and fast process.
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
> - All the advice in this document is optional. However, if the
|
||||
> advice provided is not followed, there is no guarantee your PR
|
||||
> will be merged.
|
||||
>
|
||||
> - All the check tools will be run automatically on your PR by the CI.
|
||||
> However, if you run them locally first, there is a much better
|
||||
> chance of a successful initial CI run.
|
||||
|
||||
## Assumptions
|
||||
|
||||
This document assumes you have already read (and in the case of the
|
||||
code of conduct agreed to):
|
||||
|
||||
- The [Kata Containers code of conduct](https://github.com/kata-containers/community/blob/main/CODE_OF_CONDUCT.md).
|
||||
- The [Kata Containers contributing guide](https://github.com/kata-containers/community/blob/main/CONTRIBUTING.md).
|
||||
|
||||
## Code
|
||||
|
||||
### Architectures
|
||||
|
||||
Do not write architecture-specific code if it is possible to write the
|
||||
code generically.
|
||||
|
||||
### General advice
|
||||
|
||||
- Do not write code to impress: instead write code that is easy to read and understand.
|
||||
|
||||
- Always consider which user will run the code. Try to minimise
|
||||
the privileges the code requires.
|
||||
|
||||
### Comments
|
||||
|
||||
Always add comments if the intent of the code is not obvious. However,
|
||||
try to avoid comments if the code could be made clearer (for example
|
||||
by using more meaningful variable names).
|
||||
|
||||
### Constants
|
||||
|
||||
Don't embed magic numbers and strings in functions, particularly if
|
||||
they are used repeatedly.
|
||||
|
||||
Create constants at the top of the file instead.
|
||||
|
||||
### Copyright and license
|
||||
|
||||
Ensure all new files contain a copyright statement and an SPDX license
|
||||
identifier in the comments at the top of the file.
|
||||
|
||||
### FIXME and TODO
|
||||
|
||||
If the code contains areas that are not fully implemented, make this
|
||||
clear a comment which provides a link to a GitHub issue that provides
|
||||
further information.
|
||||
|
||||
Do not just rely on comments in this case though: if possible, return
|
||||
a "`BUG: feature X not implemented see {bug-url}`" type error.
|
||||
|
||||
### Functions
|
||||
|
||||
- Keep functions relatively short (less than 100 lines is a good "rule of thumb").
|
||||
|
||||
- Document functions if the parameters, return value or general intent
|
||||
of the function is not obvious.
|
||||
|
||||
- Always return errors where possible.
|
||||
|
||||
Do not discard error return values from the functions this function
|
||||
calls.
|
||||
|
||||
### Logging
|
||||
|
||||
- Don't use multiple log calls when a single log call could be used.
|
||||
|
||||
- Use structured logging where possible to allow
|
||||
[standard tooling](https://github.com/kata-containers/tests/tree/main/cmd/log-parser)
|
||||
be able to extract the log fields.
|
||||
|
||||
### Names
|
||||
|
||||
Give functions, macros and variables clear and meaningful names.
|
||||
|
||||
### Structures
|
||||
|
||||
#### Golang structures
|
||||
|
||||
Unlike Rust, Go does not enforce that all structure members be set.
|
||||
This has lead to numerous bugs in the past where code like the
|
||||
following is used:
|
||||
|
||||
```go
|
||||
type Foo struct {
|
||||
Key string
|
||||
Value string
|
||||
}
|
||||
|
||||
// BUG: Key not set, but nobody noticed! ;(
|
||||
let foo1 = Foo {
|
||||
Value: "foo",
|
||||
}
|
||||
```
|
||||
|
||||
A much safer approach is to create a constructor function to enforce
|
||||
integrity:
|
||||
|
||||
```go
|
||||
type Foo struct {
|
||||
Key string
|
||||
Value string
|
||||
}
|
||||
|
||||
func NewFoo(key, value string) (*Foo, error) {
|
||||
if key == "" {
|
||||
return nil, errors.New("Foo needs a key")
|
||||
}
|
||||
|
||||
if value == "" {
|
||||
return nil, errors.New("Foo needs a value")
|
||||
}
|
||||
|
||||
return &Foo{
|
||||
Key: key,
|
||||
Value: value,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func testFoo() error {
|
||||
// BUG: Key not set, but nobody noticed! ;(
|
||||
badFoo := Foo{Value: "value"}
|
||||
|
||||
// Ok - the constructor performs needed validation
|
||||
goodFoo, err := NewFoo("name", "value")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
```
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
> The above is just an example. The *safest* approach would be to move
|
||||
> `NewFoo()` into a separate package and make `Foo` and it's elements
|
||||
> private. The compiler would then enforce the use of the constructor
|
||||
> to guarantee correctly defined objects.
|
||||
|
||||
|
||||
### Tracing
|
||||
|
||||
Consider if the code needs to create a new
|
||||
[trace span](https://github.com/kata-containers/kata-containers/blob/main/docs/tracing.md).
|
||||
|
||||
Ensure any new trace spans added to the code are completed.
|
||||
|
||||
## Tests
|
||||
|
||||
### Unit tests
|
||||
|
||||
Where possible, code changes should be accompanied by unit tests.
|
||||
|
||||
Consider using the standard
|
||||
[table-based approach](Unit-Test-Advice.md)
|
||||
as it encourages you to make functions small and simple, and also
|
||||
allows you to think about what types of value to test.
|
||||
|
||||
### Other categories of test
|
||||
|
||||
Raised a GitHub issue in the
|
||||
[`tests`](https://github.com/kata-containers/tests) repository that
|
||||
explains what sort of test is required along with as much detail as
|
||||
possible. Ensure the original issue is referenced on the `tests` issue.
|
||||
|
||||
### Unsafe code
|
||||
|
||||
#### Rust language specifics
|
||||
|
||||
Minimise the use of `unsafe` blocks in Rust code and since it is
|
||||
potentially dangerous always write [unit tests][#unit-tests]
|
||||
for this code where possible.
|
||||
|
||||
`expect()` and `unwrap()` will cause the code to panic on error.
|
||||
Prefer to return a `Result` on error rather than using these calls to
|
||||
allow the caller to deal with the error condition.
|
||||
|
||||
The table below lists the small number of cases where use of
|
||||
`expect()` and `unwrap()` are permitted:
|
||||
|
||||
| Area | Rationale for permitting |
|
||||
|-|-|
|
||||
| In test code (the `tests` module) | Panics will cause the test to fail, which is desirable. |
|
||||
| `lazy_static!()` | This magic macro cannot "return" a value as it runs before `main()`. |
|
||||
| `defer!()` | Similar to golang's `defer()` but doesn't allow the use of `?`. |
|
||||
| `tokio::spawn(async move {})` | Cannot currently return a `Result` from an `async move` closure. |
|
||||
| If an explicit test is performed before the `unwrap()` / `expect()` | *"Just about acceptable"*, but not ideal `[*]` |
|
||||
| `Mutex.lock()` | Almost unrecoverable if failed in the lock acquisition |
|
||||
|
||||
|
||||
`[*]` - There can lead to bad *future* code: consider what would
|
||||
happen if the explicit test gets dropped in the future. This is easier
|
||||
to happen if the test and the extraction of the value are two separate
|
||||
operations. In summary, this strategy can introduce an insidious
|
||||
maintenance issue.
|
||||
|
||||
## Documentation
|
||||
|
||||
### General requirements
|
||||
|
||||
- All new features should be accompanied by documentation explaining:
|
||||
|
||||
- What the new feature does
|
||||
|
||||
- Why it is useful
|
||||
|
||||
- How to use the feature
|
||||
|
||||
- Any known issues or limitations
|
||||
|
||||
Links should be provided to GitHub issues tracking the issues
|
||||
|
||||
- The [documentation requirements document](Documentation-Requirements.md)
|
||||
explains how the project formats documentation.
|
||||
|
||||
### Markdown syntax
|
||||
|
||||
Run the
|
||||
[markdown checker](https://github.com/kata-containers/tests/tree/main/cmd/check-markdown)
|
||||
on your documentation changes.
|
||||
|
||||
### Spell check
|
||||
|
||||
Run the
|
||||
[spell checker](https://github.com/kata-containers/tests/tree/main/cmd/check-spelling)
|
||||
on your documentation changes.
|
||||
|
||||
## Finally
|
||||
|
||||
You may wish to read the documentation that the
|
||||
[Kata Review Team](https://github.com/kata-containers/community/blob/main/Rota-Process.md) use to help review PRs:
|
||||
|
||||
- [PR review guide](https://github.com/kata-containers/community/blob/main/PR-Review-Guide.md).
|
||||
- [documentation review process](https://github.com/kata-containers/community/blob/main/Documentation-Review-Process.md).
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
Kata Containers design documents:
|
||||
|
||||
- [Kata Containers architecture](architecture)
|
||||
- [Kata Containers architecture](architecture.md)
|
||||
- [API Design of Kata Containers](kata-api-design.md)
|
||||
- [Design requirements for Kata Containers](kata-design-requirements.md)
|
||||
- [VSocks](VSocks.md)
|
||||
|
||||
@@ -1 +1 @@
|
||||
<mxfile host="app.diagrams.net" modified="2021-11-05T13:07:32.992Z" agent="5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36" etag="j5e7J3AOXxeQrt-Zz2uw" version="15.6.8" type="device"><diagram id="XNV8G0dePIPkhS_Khqr4" name="Page-1">7Vxdd9o4EP01nLP7QI5s+fORUNhNT7rNbnqaZl/2CCywG2OxQhDIr18Z29iyZD6CDZRuHho8toQ9986dGVlNC3Yny98omvqfiIfDlg68ZQt+aOm6AUyb/4otq8TiGnpiGNPAS0wgNzwGbzgxapl1Hnh4ltoSEyMkZMFUNA5JFOEhE2yIUvIqXjYioScYpmiMJcPjEIWy9SnwmJ9YdQjd/MTvOBj76VdDCNI7n6Ds6tQw85FHXgsm2GvBLiWEJZ8myy4OY++JjulXnN3cGcUR22fAn3fPzx+jj7e9HrIXA330feIZ7czPCxTO00dO75atMh+MKZlP08swZXip8jwaZJcD+ca0zeNyomAywYyu+CXpRG3NNM1kUMoSXU+PX3OfG9nEfsHdG56gFOfxZvbcE/xD6gy1Yz7/88nuPiLwcNcZDLvEvZ10Vm1zt1+4WyIPx5NoLXj76gcMP07RMD77yqOB23w2CdPTIxKxlN78nuHtmCIv4A7qkpDQ9XzQxsjC8blREIYFu4ewMxpy+4xR8oILZ6yhgwcjfkZ2+Xa0yzDK2JzP81ZzntcNtedHI8+1LNnzo9FIHyo971kDy7Qa8Xx61gJCSGhyRHCtkXHRLLMhXGwJF659/KFrCwtHBkAbIA3rKgAAsHqdfjqDCBn/aRIYTRORUWiVa8rAwKZwcSRcuCQzFESYcrNWLz6K4HFtD9i2QrZM7HiGCjtHH8Ak3ETs+n0A+v0msdNhCTv7RkZPM1TwNSV37lb4asw6VwifDc4MnqJ88ldTTBfBjHulDB1/SibiI/o2IhEuAZGaUBiMI3445B7lvIC3sc8CXqZ20hOTwPPir1ESIqcMUORDn9DgLaZcmF7QnHKWUhqQ4TNUpUZj6GkSeuM5nvGUBl4wjeJW5odAsDnAzBJihiLgrIYgUz6DrJYSRqdvVwzblol82qJZM3Y75sfrV9wKGC+pXdEa7BTP16/s7/lLbVc0uY+8hn7lYGCkdgVKyJy0XdHkPvJn6VcOxu4C2xVte7t5zf3K0fCdv12Ry6efpl05XDgvrFvR5V7zmruVw/E6Z7OiDjcoQYK9MX5MDwllPhmTCIW93FpyXn7NPSHTFMXvmLFV6lI0Z0TEGC8D9q3w+TmeiueN5OjDMp15fbCSMdJijDg0dPUtuzI+KMwSH+bTrI+yeWRss3xO5nSYsfb+y53jDp6+P1r+y+fXj+HLU97AMETHmG1xalo+xI7cygqKQ8SCBRZuQwXxemiHUrQqXDAlQcRmhZkfYoPQBNqOmJstu0gYxQjD1ksjnBLFkrvICbd5nCNUA0qqwRidDpXMvEcDLiMCm/ZXAopnwVvaV8dcSF3IJzdvW+YHBctktmwNo727+erWHdxormWIMpEcHUYXGV0osmFz09kUZDSaYSZJymEIqyNHLqirEb5A7Xmv1hTZZB+nPa6sPVtFaqf45HyDBrRFvtnHEa5WQqklQy7ir5g6aiE6hjpqEbuQtOUCUf52p63yCFOzS6w7Lm1t9WtB1LqFNrMXjfmnlm6FcYE74CZrHH/6ZdOLeq04F/T5v92/7tqff5UoXeeyj+U5tqXsPWHHNGA2w37LPtlLib0L37auOa4IKpQXpDXHkktfq4bSV4lf1tb+HBpyXOmr75t+4L7pZ28ROQpjXY7RBxrP7eP5LH5wTBdYXla4rsATyz4LqALPaCbw1Mn7nHGXx9pzMdR2xF0eas/ZfCfJ3VDRcqrFrPa4e2fytk1bSbfq5F0eYTpmrclbyUH5XaTP2FRJzMvsOPUKJTi44QQ3Um6uqd/MXm9l/aZuiFM01x7ICwqV6J5MdqCY8QGwTqI98aQPmAbcpTFTj1wC21+P4J56tGGhCQEUK8QjaVhNs8NVzbHFezNAaSP7TlUrjTha1dDX0f1d+292B77+8dRGX7/MfHCmzPpOplaycPcah9tIslM1lpq4jWZTPGWTJBGTjjuGYVXftKXpLY0wHbh9hA5ca9uIZtpkKKfaF8RQe0KigCle6V3sXofDa2/NNfWbCgIVqm/dVLxgba76lrcvXGjb+64yetvK1u4VMKtuYTkOKjl0frQXI5VR8446FZqmXlNlCsqvQkqyXktpqnxnvMdWvOZ3hzqGmAgMR7EmYG928hR1yW5s05XCMcnaqRcsBAdZ/87j/5C4pmR7tuZkh1+gGdPlmpjZ+WzFNV9wbc/8YNJep5/FZnp+t+tvSC6uLx8ZDeejrfQ6aDtqBdTNrbzKujYjw5f6XK/a8esAwIt4hes7JgAGOKXrs/2o2o1e2qUtb51T1QZ1bL5SPoL8mvYCxEn5pkDNWKcpxir3rv+vToeGiF1BnMtSJ3n16ArUaX/XV6qTKW9Wq0md+GH+RwaSSiv/Ww2w9x8=</diagram></mxfile>
|
||||
<mxfile host="Chrome" modified="2020-07-02T06:44:28.736Z" agent="5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36" etag="r7FpfnbGNK7jbg54Gu9x" version="13.3.5" type="device"><diagram id="XNV8G0dePIPkhS_Khqr4" name="Page-1">7VvZcuI4FP0aHqFky+sjkNCTqfR0qtLV6fTLlMDy0hiLscWWrx8Zy3iRQkhjQxbygnVlhK1zz9HRkg4cztZfYjT3vxIHhx0VOOsOvOqoKrRthX2kkU0WMTWYBbw4cLIQKAL3wRPOgkoeXQQOTngsC1FCQhrMq8EJiSI8oZUYimOyqt7mktCpBObIw0LgfoJCMfoQONTPoiqEdlHxFw48n/80hIA/+Qzld/NA4iOHrEoheN2Bw5gQml3N1kMcpr1X7ZjRM7W7J4txRA/5wrd/v5rDewTubvrjyZDYg1l/01W0rJklChf8lfnT0k3eBzFZRA5OW1E6cLDyA4rv52iS1q4Y6izm01nIq10SUQ4jwxAOvBg5AXvCIQlJvG0PmhgZOK1zgzAsxR2ELXfC4gmNyRSXaoyJhccuqxHfmXfDEscUr0sh3gdfMJlhGm/YLby2q+i6nn2J56QOzKy8KhDWctT8Eri7rEQ8q7xd60W/swve9a+BQW8PBlWTw+C6jm0YIgyu66oTKQyOMTZ0oykYNLsGQ/7OJRgYnUQYFENvCYYWUXgvZFDss5PBuHBBVc7OBQUKvY4dNjbyIompTzwSofC6iA4KXNKULu65JWTO0fiNKd1wONCCkipWeB3Qn6Xrx7Spns5LV2ve8raw4YUyy7R9iCRkEU/4u3i3328se/zw+97wp99Wf4fTh2I0pCj2MN3TOZwjaYfsBTjGIaLBsmomGodKhQJjKI3nEymAt2jMPFql01EYeBG7nrAOwyy/B2nqBswE9XnFLHCcDF+cBE9ovG0v7fo5CSK6fR190NGvDgJjb7YJpNlZO/6rFfMkJRPoKbahVUUtKx2MBm/8Ln27Ustqmonldrt2tQ3iugnLmzqeu4c8COK9mVkRRSNkXTpwgmUFZeO/RWopt0h0ky0UfXaDos3XWzzyenblpZ+sfykKIhw73cQPZt0poqi73DXPHnf7C9nNzY2Hmqi2yhgpWJWpLQDGdX/EW6jqM/trSoUts6bCUBwLFdPMk6Csw0YDg6EcePMV3H6D4szwiDc/y4XSt9Ji8bVtSSbq5nGibouivpdjL6o6TxjQA5ZuVjMGHKc0jQqJfKwQzdQHzpwj7YAkc+Sj17nswN7HLklGofHNCbglCrjhWKahyQQc9nUN5i20JeBMnMHLAi6bzLQm35r+megGjqKbeqhQw0OF+jR8U0W+3cVp2z5eJOmL45gl8ccmnm1XiGdIltQUS2uHePJx7py8K7j2WKbaC7wrqPaYt3eSYQ5KZr1yMTsb76QQi1Min9K5FPe3OelVnyHaq+e8oMfYVWXgkVPefIKrKL3aAlN71lRcxXgWz5PxWP2YRIYHEnmXXxBa1fSyj8uvRrMJ/XBvb7q/6A348c9DF/34nvjgzANA61lzgizJMX4jNguKer9dqpqRKKCkXX917pUpW1drS48yh6Xqp1yZ0kS9Tshk2hwOsl0xCwATynDo6wBooG0cLFibYFoiCjIQYGs2VxH6+43OL/9omNu32vLyqozxpuyqKurXe9uleZYyf+BYoa6rx3mInJWGUuFkVwOn86yKgOllW6Zp0TVr2zKaRHRPvS2jiWT+8IOfqcBeDQodqucd/8TdMeSl7/iRzaCm1bYpVREEWwZCW0dFLAGEnXilCtcsGJLTO7bpANOUEEbHliNdFbXUMczO+1SBGo0AGI2aAgqqdaC0XKTK0qWdkjB79oZYWJw0f1qsDMkgc1Kk8vN15fWwzRzHyyCRzHbZi9IqGNWOjEiEa73OQ4f7Shn61blE/aidT+LgKc2vsPPCssWrzizWBVB2ZlF2ZLE1qEQb6C1wkprUKY6j9Ez8u4CrmeEJVNGBsk1Y46TwiCdKP59L0HOhOpdLkJxkutgE2dCjw/PbBGW/p7v4hB1Y5tl9gmjpLj5B6hNka+Yn9QmqaOkuPmGHjtaeT2DF4h/tsrW/4v8V4fX/</diagram></mxfile>
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 90 KiB After Width: | Height: | Size: 93 KiB |
290
docs/design/architecture.md
Normal file
290
docs/design/architecture.md
Normal file
@@ -0,0 +1,290 @@
|
||||
# Kata Containers Architecture
|
||||
|
||||
## Overview
|
||||
|
||||
This is an architectural overview of Kata Containers, based on the 2.0 release.
|
||||
|
||||
The primary deliverable of the Kata Containers project is a CRI friendly shim. There is also a CRI friendly library API behind them.
|
||||
|
||||
The [Kata Containers runtime](../../src/runtime)
|
||||
is compatible with the [OCI](https://github.com/opencontainers) [runtime specification](https://github.com/opencontainers/runtime-spec)
|
||||
and therefore works seamlessly with the [Kubernetes\* Container Runtime Interface (CRI)](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-node/container-runtime-interface.md)
|
||||
through the [CRI-O\*](https://github.com/kubernetes-incubator/cri-o) and
|
||||
[Containerd\*](https://github.com/containerd/containerd) implementation.
|
||||
|
||||
Kata Containers creates a QEMU\*/KVM virtual machine for pod that `kubelet` (Kubernetes) creates respectively.
|
||||
|
||||
The [`containerd-shim-kata-v2` (shown as `shimv2` from this point onwards)](../../src/runtime/containerd-shim-v2)
|
||||
is the Kata Containers entrypoint, which
|
||||
implements the [Containerd Runtime V2 (Shim API)](https://github.com/containerd/containerd/tree/master/runtime/v2) for Kata.
|
||||
|
||||
Before `shimv2` (as done in [Kata Containers 1.x releases](https://github.com/kata-containers/runtime/releases)), we need to create a `containerd-shim` and a [`kata-shim`](https://github.com/kata-containers/shim) for each container and the Pod sandbox itself, plus an optional [`kata-proxy`](https://github.com/kata-containers/proxy) when VSOCK is not available. With `shimv2`, Kubernetes can launch Pod and OCI compatible containers with one shim (the `shimv2`) per Pod instead of `2N+1` shims, and no standalone `kata-proxy` process even if no VSOCK is available.
|
||||
|
||||

|
||||
|
||||
The container process is then spawned by
|
||||
[`kata-agent`](../../src/agent), an agent process running
|
||||
as a daemon inside the virtual machine. `kata-agent` runs a [`ttRPC`](https://github.com/containerd/ttrpc-rust) server in
|
||||
the guest using a VIRTIO serial or VSOCK interface which QEMU exposes as a socket
|
||||
file on the host. `shimv2` uses a `ttRPC` protocol to communicate with
|
||||
the agent. This protocol allows the runtime to send container management
|
||||
commands to the agent. The protocol is also used to carry the I/O streams (stdout,
|
||||
stderr, stdin) between the containers and the manage engines (e.g. CRI-O or containerd).
|
||||
|
||||
For any given container, both the init process and all potentially executed
|
||||
commands within that container, together with their related I/O streams, need
|
||||
to go through the VSOCK interface exported by QEMU.
|
||||
|
||||
The container workload, that is, the actual OCI bundle rootfs, is exported from the
|
||||
host to the virtual machine. In the case where a block-based graph driver is
|
||||
configured, `virtio-scsi` will be used. In all other cases a `virtio-fs` VIRTIO mount point
|
||||
will be used. `kata-agent` uses this mount point as the root filesystem for the
|
||||
container processes.
|
||||
|
||||
## Virtualization
|
||||
|
||||
How Kata Containers maps container concepts to virtual machine technologies, and how this is realized in the multiple
|
||||
hypervisors and VMMs that Kata supports is described within the [virtualization documentation](./virtualization.md)
|
||||
|
||||
## Guest assets
|
||||
|
||||
The hypervisor will launch a virtual machine which includes a minimal guest kernel
|
||||
and a guest image.
|
||||
|
||||
### Guest kernel
|
||||
|
||||
The guest kernel is passed to the hypervisor and used to boot the virtual
|
||||
machine. The default kernel provided in Kata Containers is highly optimized for
|
||||
kernel boot time and minimal memory footprint, providing only those services
|
||||
required by a container workload. This is based on a very current upstream Linux
|
||||
kernel.
|
||||
|
||||
### Guest image
|
||||
|
||||
Kata Containers supports both an `initrd` and `rootfs` based minimal guest image.
|
||||
|
||||
#### Root filesystem image
|
||||
|
||||
The default packaged root filesystem image, sometimes referred to as the "mini O/S", is a
|
||||
highly optimized container bootstrap system based on [Clear Linux](https://clearlinux.org/). It provides an extremely minimal environment and
|
||||
has a highly optimized boot path.
|
||||
|
||||
The only services running in the context of the mini O/S are the init daemon
|
||||
(`systemd`) and the [Agent](#agent). The real workload the user wishes to run
|
||||
is created using libcontainer, creating a container in the same manner that is done
|
||||
by `runc`.
|
||||
|
||||
For example, when `ctr run -ti ubuntu date` is run:
|
||||
|
||||
- The hypervisor will boot the mini-OS image using the guest kernel.
|
||||
- `systemd`, running inside the mini-OS context, will launch the `kata-agent` in
|
||||
the same context.
|
||||
- The agent will create a new confined context to run the specified command in
|
||||
(`date` in this example).
|
||||
- The agent will then execute the command (`date` in this example) inside this
|
||||
new context, first setting the root filesystem to the expected Ubuntu\* root
|
||||
filesystem.
|
||||
|
||||
#### Initrd image
|
||||
|
||||
A compressed `cpio(1)` archive, created from a rootfs which is loaded into memory and used as part of the Linux startup process. During startup, the kernel unpacks it into a special instance of a `tmpfs` that becomes the initial root filesystem.
|
||||
|
||||
The only service running in the context of the initrd is the [Agent](#agent) as the init daemon. The real workload the user wishes to run is created using libcontainer, creating a container in the same manner that is done by `runc`.
|
||||
|
||||
## Agent
|
||||
|
||||
[`kata-agent`](../../src/agent) is a process running in the guest as a supervisor for managing containers and processes running within those containers.
|
||||
|
||||
For the 2.0 release, the `kata-agent` is rewritten in the [RUST programming language](https://www.rust-lang.org/) so that we can minimize its memory footprint while keeping the memory safety of the original GO version of [`kata-agent` used in Kata Container 1.x](https://github.com/kata-containers/agent). This memory footprint reduction is pretty impressive, from tens of megabytes down to less than 100 kilobytes, enabling Kata Containers in more use cases like functional computing and edge computing.
|
||||
|
||||
The `kata-agent` execution unit is the sandbox. A `kata-agent` sandbox is a container sandbox defined by a set of namespaces (NS, UTS, IPC and PID). `shimv2` can
|
||||
run several containers per VM to support container engines that require multiple
|
||||
containers running inside a pod.
|
||||
|
||||
`kata-agent` communicates with the other Kata components over `ttRPC`.
|
||||
|
||||
## Runtime
|
||||
|
||||
`containerd-shim-kata-v2` is a [containerd runtime shimv2](https://github.com/containerd/containerd/blob/v1.4.1/runtime/v2/README.md) implementation and is responsible for handling the `runtime v2 shim APIs`, which is similar to [the OCI runtime specification](https://github.com/opencontainers/runtime-spec) but simplifies the architecture by loading the runtime once and making RPC calls to handle the various container lifecycle commands. This refinement is an improvement on the OCI specification which requires the container manager call the runtime binary multiple times, at least once for each lifecycle command.
|
||||
|
||||
`containerd-shim-kata-v2` heavily utilizes the
|
||||
[virtcontainers package](../../src/runtime/virtcontainers/), which provides a generic, runtime-specification agnostic, hardware-virtualized containers library.
|
||||
|
||||
### Configuration
|
||||
|
||||
The runtime uses a TOML format configuration file called `configuration.toml`. By default this file is installed in the `/usr/share/defaults/kata-containers` directory and contains various settings such as the paths to the hypervisor, the guest kernel and the mini-OS image.
|
||||
|
||||
The actual configuration file paths can be determined by running:
|
||||
```
|
||||
$ kata-runtime --show-default-config-paths
|
||||
```
|
||||
Most users will not need to modify the configuration file.
|
||||
|
||||
The file is well commented and provides a few "knobs" that can be used to modify the behavior of the runtime and your chosen hypervisor.
|
||||
|
||||
The configuration file is also used to enable runtime [debug output](../Developer-Guide.md#enable-full-debug).
|
||||
|
||||
## Networking
|
||||
|
||||
Containers will typically live in their own, possibly shared, networking namespace.
|
||||
At some point in a container lifecycle, container engines will set up that namespace
|
||||
to add the container to a network which is isolated from the host network, but
|
||||
which is shared between containers
|
||||
|
||||
In order to do so, container engines will usually add one end of a virtual
|
||||
ethernet (`veth`) pair into the container networking namespace. The other end of
|
||||
the `veth` pair is added to the host networking namespace.
|
||||
|
||||
This is a very namespace-centric approach as many hypervisors/VMMs cannot handle `veth`
|
||||
interfaces. Typically, `TAP` interfaces are created for VM connectivity.
|
||||
|
||||
To overcome incompatibility between typical container engines expectations
|
||||
and virtual machines, Kata Containers networking transparently connects `veth`
|
||||
interfaces with `TAP` ones using Traffic Control:
|
||||
|
||||

|
||||
|
||||
With a TC filter in place, a redirection is created between the container network and the
|
||||
virtual machine. As an example, the CNI may create a device, `eth0`, in the container's network
|
||||
namespace, which is a VETH device. Kata Containers will create a tap device for the VM, `tap0_kata`,
|
||||
and setup a TC redirection filter to mirror traffic from `eth0`'s ingress to `tap0_kata`'s egress,
|
||||
and a second to mirror traffic from `tap0_kata`'s ingress to `eth0`'s egress.
|
||||
|
||||
Kata Containers maintains support for MACVTAP, which was an earlier implementation used in Kata. TC-filter
|
||||
is the default because it allows for simpler configuration, better CNI plugin compatibility, and performance
|
||||
on par with MACVTAP.
|
||||
|
||||
Kata Containers has deprecated support for bridge due to lacking performance relative to TC-filter and MACVTAP.
|
||||
|
||||
Kata Containers supports both
|
||||
[CNM](https://github.com/docker/libnetwork/blob/master/docs/design.md#the-container-network-model)
|
||||
and [CNI](https://github.com/containernetworking/cni) for networking management.
|
||||
|
||||
### Network Hotplug
|
||||
|
||||
Kata Containers has developed a set of network sub-commands and APIs to add, list and
|
||||
remove a guest network endpoint and to manipulate the guest route table.
|
||||
|
||||
The following diagram illustrates the Kata Containers network hotplug workflow.
|
||||
|
||||

|
||||
|
||||
## Storage
|
||||
Container workloads are shared with the virtualized environment through [virtio-fs](https://virtio-fs.gitlab.io/).
|
||||
|
||||
The [devicemapper `snapshotter`](https://github.com/containerd/containerd/tree/master/snapshots/devmapper) is a special case. The `snapshotter` uses dedicated block devices rather than formatted filesystems, and operates at the block level rather than the file level. This knowledge is used to directly use the underlying block device instead of the overlay file system for the container root file system. The block device maps to the top read-write layer for the overlay. This approach gives much better I/O performance compared to using `virtio-fs` to share the container file system.
|
||||
|
||||
Kata Containers has the ability to hotplug and remove block devices, which makes it possible to use block devices for containers started after the VM has been launched.
|
||||
|
||||
Users can check to see if the container uses the devicemapper block device as its rootfs by calling `mount(8)` within the container. If the devicemapper block device
|
||||
is used, `/` will be mounted on `/dev/vda`. Users can disable direct mounting of the underlying block device through the runtime configuration.
|
||||
|
||||
## Kubernetes support
|
||||
|
||||
[Kubernetes\*](https://github.com/kubernetes/kubernetes/) is a popular open source
|
||||
container orchestration engine. In Kubernetes, a set of containers sharing resources
|
||||
such as networking, storage, mount, PID, etc. is called a
|
||||
[Pod](https://kubernetes.io/docs/user-guide/pods/).
|
||||
A node can have multiple pods, but at a minimum, a node within a Kubernetes cluster
|
||||
only needs to run a container runtime and a container agent (called a
|
||||
[Kubelet](https://kubernetes.io/docs/admin/kubelet/)).
|
||||
|
||||
A Kubernetes cluster runs a control plane where a scheduler (typically running on a
|
||||
dedicated master node) calls into a compute Kubelet. This Kubelet instance is
|
||||
responsible for managing the lifecycle of pods within the nodes and eventually relies
|
||||
on a container runtime to handle execution. The Kubelet architecture decouples
|
||||
lifecycle management from container execution through the dedicated
|
||||
`gRPC` based [Container Runtime Interface (CRI)](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node/container-runtime-interface-v1.md).
|
||||
|
||||
In other words, a Kubelet is a CRI client and expects a CRI implementation to
|
||||
handle the server side of the interface.
|
||||
[CRI-O\*](https://github.com/kubernetes-incubator/cri-o) and [Containerd\*](https://github.com/containerd/containerd/) are CRI implementations that rely on [OCI](https://github.com/opencontainers/runtime-spec)
|
||||
compatible runtimes for managing container instances.
|
||||
|
||||
Kata Containers is an officially supported CRI-O and Containerd runtime. Refer to the following guides on how to set up Kata Containers with Kubernetes:
|
||||
|
||||
- [How to use Kata Containers and Containerd](../how-to/containerd-kata.md)
|
||||
- [Run Kata Containers with Kubernetes](../how-to/run-kata-with-k8s.md)
|
||||
|
||||
#### OCI annotations
|
||||
|
||||
In order for the Kata Containers runtime (or any virtual machine based OCI compatible
|
||||
runtime) to be able to understand if it needs to create a full virtual machine or if it
|
||||
has to create a new container inside an existing pod's virtual machine, CRI-O adds
|
||||
specific annotations to the OCI configuration file (`config.json`) which is passed to
|
||||
the OCI compatible runtime.
|
||||
|
||||
Before calling its runtime, CRI-O will always add a `io.kubernetes.cri-o.ContainerType`
|
||||
annotation to the `config.json` configuration file it produces from the Kubelet CRI
|
||||
request. The `io.kubernetes.cri-o.ContainerType` annotation can either be set to `sandbox`
|
||||
or `container`. Kata Containers will then use this annotation to decide if it needs to
|
||||
respectively create a virtual machine or a container inside a virtual machine associated
|
||||
with a Kubernetes pod:
|
||||
|
||||
```Go
|
||||
containerType, err := ociSpec.ContainerType()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
handleFactory(ctx, runtimeConfig)
|
||||
|
||||
disableOutput := noNeedForOutput(detach, ociSpec.Process.Terminal)
|
||||
|
||||
var process vc.Process
|
||||
switch containerType {
|
||||
case vc.PodSandbox:
|
||||
process, err = createSandbox(ctx, ociSpec, runtimeConfig, containerID, bundlePath, console, disableOutput, systemdCgroup)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
case vc.PodContainer:
|
||||
process, err = createContainer(ctx, ociSpec, containerID, bundlePath, console, disableOutput)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
#### Mixing VM based and namespace based runtimes
|
||||
|
||||
> **Note:** Since Kubernetes 1.12, the [`Kubernetes RuntimeClass`](https://kubernetes.io/docs/concepts/containers/runtime-class/)
|
||||
> has been supported and the user can specify runtime without the non-standardized annotations.
|
||||
|
||||
With `RuntimeClass`, users can define Kata Containers as a `RuntimeClass` and then explicitly specify that a pod being created as a Kata Containers pod. For details, please refer to [How to use Kata Containers and Containerd](../../docs/how-to/containerd-kata.md).
|
||||
|
||||
|
||||
# Appendices
|
||||
|
||||
## DAX
|
||||
|
||||
Kata Containers utilizes the Linux kernel DAX [(Direct Access filesystem)](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/filesystems/dax.txt)
|
||||
feature to efficiently map some host-side files into the guest VM space.
|
||||
In particular, Kata Containers uses the QEMU NVDIMM feature to provide a
|
||||
memory-mapped virtual device that can be used to DAX map the virtual machine's
|
||||
root filesystem into the guest memory address space.
|
||||
|
||||
Mapping files using DAX provides a number of benefits over more traditional VM
|
||||
file and device mapping mechanisms:
|
||||
|
||||
- Mapping as a direct access devices allows the guest to directly access
|
||||
the host memory pages (such as via Execute In Place (XIP)), bypassing the guest
|
||||
page cache. This provides both time and space optimizations.
|
||||
- Mapping as a direct access device inside the VM allows pages from the
|
||||
host to be demand loaded using page faults, rather than having to make requests
|
||||
via a virtualized device (causing expensive VM exits/hypercalls), thus providing
|
||||
a speed optimization.
|
||||
- Utilizing `MAP_SHARED` shared memory on the host allows the host to efficiently
|
||||
share pages.
|
||||
|
||||
Kata Containers uses the following steps to set up the DAX mappings:
|
||||
1. QEMU is configured with an NVDIMM memory device, with a memory file
|
||||
backend to map in the host-side file into the virtual NVDIMM space.
|
||||
2. The guest kernel command line mounts this NVDIMM device with the DAX
|
||||
feature enabled, allowing direct page mapping and access, thus bypassing the
|
||||
guest page cache.
|
||||
|
||||

|
||||
|
||||
Information on the use of NVDIMM via QEMU is available in the [QEMU source code](http://git.qemu-project.org/?p=qemu.git;a=blob;f=docs/nvdimm.txt;hb=HEAD)
|
||||
@@ -1,477 +0,0 @@
|
||||
# Kata Containers Architecture
|
||||
|
||||
## Overview
|
||||
|
||||
Kata Containers is an open source community working to build a secure
|
||||
container [runtime](#runtime) with lightweight virtual machines (VM's)
|
||||
that feel and perform like standard Linux containers, but provide
|
||||
stronger [workload](#workload) isolation using hardware
|
||||
[virtualization](#virtualization) technology as a second layer of
|
||||
defence.
|
||||
|
||||
Kata Containers runs on [multiple architectures](../../../src/runtime/README.md#platform-support)
|
||||
and supports [multiple hypervisors](../../hypervisors.md).
|
||||
|
||||
This document is a summary of the Kata Containers architecture.
|
||||
|
||||
## Background knowledge
|
||||
|
||||
This document assumes the reader understands a number of concepts
|
||||
related to containers and file systems. The
|
||||
[background](background.md) document explains these concepts.
|
||||
|
||||
## Example command
|
||||
|
||||
This document makes use of a particular [example
|
||||
command](example-command.md) throughout the text to illustrate certain
|
||||
concepts.
|
||||
|
||||
## Virtualization
|
||||
|
||||
For details on how Kata Containers maps container concepts to VM
|
||||
technologies, and how this is realized in the multiple hypervisors and
|
||||
VMMs that Kata supports see the
|
||||
[virtualization documentation](../virtualization.md).
|
||||
|
||||
## Compatibility
|
||||
|
||||
The [Kata Containers runtime](../../../src/runtime) is compatible with
|
||||
the [OCI](https://github.com/opencontainers)
|
||||
[runtime specification](https://github.com/opencontainers/runtime-spec)
|
||||
and therefore works seamlessly with the
|
||||
[Kubernetes Container Runtime Interface (CRI)](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-node/container-runtime-interface.md)
|
||||
through the [CRI-O](https://github.com/kubernetes-incubator/cri-o)
|
||||
and [containerd](https://github.com/containerd/containerd)
|
||||
implementations.
|
||||
|
||||
Kata Containers provides a ["shimv2"](#shim-v2-architecture) compatible runtime.
|
||||
|
||||
## Shim v2 architecture
|
||||
|
||||
The Kata Containers runtime is shim v2 ("shimv2") compatible. This
|
||||
section explains what this means.
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
> For a comparison with the Kata 1.x architecture, see
|
||||
> [the architectural history document](history.md).
|
||||
|
||||
The
|
||||
[containerd runtime shimv2 architecture](https://github.com/containerd/containerd/tree/main/runtime/v2)
|
||||
or _shim API_ architecture resolves the issues with the old
|
||||
architecture by defining a set of shimv2 APIs that a compatible
|
||||
runtime implementation must supply. Rather than calling the runtime
|
||||
binary multiple times for each new container, the shimv2 architecture
|
||||
runs a single instance of the runtime binary (for any number of
|
||||
containers). This improves performance and resolves the state handling
|
||||
issue.
|
||||
|
||||
The shimv2 API is similar to the
|
||||
[OCI runtime](https://github.com/opencontainers/runtime-spec)
|
||||
API in terms of the way the container lifecycle is split into
|
||||
different verbs. Rather than calling the runtime multiple times, the
|
||||
container manager creates a socket and passes it to the shimv2
|
||||
runtime. The socket is a bi-directional communication channel that
|
||||
uses a gRPC based protocol to allow the container manager to send API
|
||||
calls to the runtime, which returns the result to the container
|
||||
manager using the same channel.
|
||||
|
||||
The shimv2 architecture allows running several containers per VM to
|
||||
support container engines that require multiple containers running
|
||||
inside a pod.
|
||||
|
||||
With the new architecture [Kubernetes](kubernetes.md) can
|
||||
launch both Pod and OCI compatible containers with a single
|
||||
[runtime](#runtime) shim per Pod, rather than `2N+1` shims. No stand
|
||||
alone `kata-proxy` process is required, even if VSOCK is not
|
||||
available.
|
||||
|
||||
## Workload
|
||||
|
||||
The workload is the command the user requested to run in the
|
||||
container and is specified in the [OCI bundle](background.md#oci-bundle)'s
|
||||
configuration file.
|
||||
|
||||
In our [example](example-command.md), the workload is the `sh(1)` command.
|
||||
|
||||
### Workload root filesystem
|
||||
|
||||
For details of how the [runtime](#runtime) makes the
|
||||
[container image](background.md#container-image) chosen by the user available to
|
||||
the workload process, see the
|
||||
[Container creation](#container-creation) and [storage](#storage) sections.
|
||||
|
||||
Note that the workload is isolated from the [guest VM](#environments) environment by its
|
||||
surrounding [container environment](#environments). The guest VM
|
||||
environment where the container runs in is also isolated from the _outer_
|
||||
[host environment](#environments) where the container manager runs.
|
||||
|
||||
## System overview
|
||||
|
||||
### Environments
|
||||
|
||||
The following terminology is used to describe the different or
|
||||
environments (or contexts) various processes run in. It is necessary
|
||||
to study this table closely to make sense of what follows:
|
||||
|
||||
| Type | Name | Virtualized | Containerized | rootfs | Rootfs device type | Mount type | Description |
|
||||
|-|-|-|-|-|-|-|-|
|
||||
| Host | Host | no `[1]` | no | Host specific | Host specific | Host specific | The environment provided by a standard, physical non virtualized system. |
|
||||
| VM root | Guest VM | yes | no | rootfs inside the [guest image](guest-assets.md#guest-image) | Hypervisor specific `[2]` | `ext4` | The first (or top) level VM environment created on a host system. |
|
||||
| VM container root | Container | yes | yes | rootfs type requested by user ([`ubuntu` in the example](example-command.md)) | `kataShared` | [virtio FS](storage.md#virtio-fs) | The first (or top) level container environment created inside the VM. Based on the [OCI bundle](background.md#oci-bundle). |
|
||||
|
||||
**Key:**
|
||||
|
||||
- `[1]`: For simplicity, this document assumes the host environment
|
||||
runs on physical hardware.
|
||||
|
||||
- `[2]`: See the [DAX](#dax) section.
|
||||
|
||||
> **Notes:**
|
||||
>
|
||||
> - The word "root" is used to mean _top level_ here in a similar
|
||||
> manner to the term [rootfs](background.md#root-filesystem).
|
||||
>
|
||||
> - The term "first level" prefix used above is important since it implies
|
||||
> that it is possible to create multi level systems. However, they do
|
||||
> not form part of a standard Kata Containers environment so will not
|
||||
> be considered in this document.
|
||||
|
||||
The reasons for containerizing the [workload](#workload) inside the VM
|
||||
are:
|
||||
|
||||
- Isolates the workload entirely from the VM environment.
|
||||
- Provides better isolation between containers in a [pod](kubernetes.md).
|
||||
- Allows the workload to be managed and monitored through its cgroup
|
||||
confinement.
|
||||
|
||||
### Container creation
|
||||
|
||||
The steps below show at a high level how a Kata Containers container is
|
||||
created using the containerd container manager:
|
||||
|
||||
1. The user requests the creation of a container by running a command
|
||||
like the [example command](example-command.md).
|
||||
1. The container manager daemon runs a single instance of the Kata
|
||||
[runtime](#runtime).
|
||||
1. The Kata runtime loads its [configuration file](#configuration).
|
||||
1. The container manager calls a set of shimv2 API functions on the runtime.
|
||||
1. The Kata runtime launches the configured [hypervisor](#hypervisor).
|
||||
1. The hypervisor creates and starts (_boots_) a VM using the
|
||||
[guest assets](guest-assets.md#guest-assets):
|
||||
|
||||
- The hypervisor [DAX](#dax) shares the
|
||||
[guest image](guest-assets.md#guest-image)
|
||||
into the VM to become the VM [rootfs](background.md#root-filesystem) (mounted on a `/dev/pmem*` device),
|
||||
which is known as the [VM root environment](#environments).
|
||||
- The hypervisor mounts the [OCI bundle](background.md#oci-bundle), using [virtio FS](storage.md#virtio-fs),
|
||||
into a container specific directory inside the VM's rootfs.
|
||||
|
||||
This container specific directory will become the
|
||||
[container rootfs](#environments), known as the
|
||||
[container environment](#environments).
|
||||
|
||||
1. The [agent](#agent) is started as part of the VM boot.
|
||||
|
||||
1. The runtime calls the agent's `CreateSandbox` API to request the
|
||||
agent create a container:
|
||||
|
||||
1. The agent creates a [container environment](#environments)
|
||||
in the container specific directory that contains the [container rootfs](#environments).
|
||||
|
||||
The container environment hosts the [workload](#workload) in the
|
||||
[container rootfs](#environments) directory.
|
||||
|
||||
1. The agent spawns the workload inside the container environment.
|
||||
|
||||
> **Notes:**
|
||||
>
|
||||
> - The container environment created by the agent is equivalent to
|
||||
> a container environment created by the
|
||||
> [`runc`](https://github.com/opencontainers/runc) OCI runtime;
|
||||
> Linux cgroups and namespaces are created inside the VM by the
|
||||
> [guest kernel](guest-assets.md#guest-kernel) to isolate the
|
||||
> workload from the VM environment the container is created in.
|
||||
> See the [Environments](#environments) section for an
|
||||
> explanation of why this is done.
|
||||
>
|
||||
> - See the [guest image](guest-assets.md#guest-image) section for
|
||||
> details of exactly how the agent is started.
|
||||
|
||||
1. The container manager returns control of the container to the
|
||||
user running the `ctr` command.
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
> At this point, the container is running and:
|
||||
>
|
||||
> - The [workload](#workload) process ([`sh(1)` in the example](example-command.md))
|
||||
> is running in the [container environment](#environments).
|
||||
> - The user is now able to interact with the workload
|
||||
> (using the [`ctr` command in the example](example-command.md)).
|
||||
> - The [agent](#agent), running inside the VM is monitoring the
|
||||
> [workload](#workload) process.
|
||||
> - The [runtime](#runtime) is waiting for the agent's `WaitProcess` API
|
||||
> call to complete.
|
||||
|
||||
Further details of these steps are provided in the sections below.
|
||||
|
||||
### Container shutdown
|
||||
|
||||
There are two possible ways for the container environment to be
|
||||
terminated:
|
||||
|
||||
- When the [workload](#workload) exits.
|
||||
|
||||
This is the standard, or _graceful_ shutdown method.
|
||||
|
||||
- When the container manager forces the container to be deleted.
|
||||
|
||||
#### Workload exit
|
||||
|
||||
The [agent](#agent) will detect when the [workload](#workload) process
|
||||
exits, capture its exit status (see `wait(2)`) and return that value
|
||||
to the [runtime](#runtime) by specifying it as the response to the
|
||||
`WaitProcess` agent API call made by the [runtime](#runtime).
|
||||
|
||||
The runtime then passes the value back to the container manager by the
|
||||
`Wait` [shimv2 API](#shim-v2-architecture) call.
|
||||
|
||||
Once the workload has fully exited, the VM is no longer needed and the
|
||||
runtime cleans up the environment (which includes terminating the
|
||||
[hypervisor](#hypervisor) process).
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
> When [agent tracing is enabled](../../tracing.md#agent-shutdown-behaviour),
|
||||
> the shutdown behaviour is different.
|
||||
|
||||
#### Container manager requested shutdown
|
||||
|
||||
If the container manager requests the container be deleted, the
|
||||
[runtime](#runtime) will signal the agent by sending it a
|
||||
`DestroySandbox` [ttRPC API](../../../src/agent/protocols/protos/agent.proto) request.
|
||||
|
||||
## Guest assets
|
||||
|
||||
The guest assets comprise a guest image and a guest kernel that are
|
||||
used by the [hypervisor](#hypervisor).
|
||||
|
||||
See the [guest assets](guest-assets.md) document for further
|
||||
information.
|
||||
|
||||
## Hypervisor
|
||||
|
||||
The [hypervisor](../../hypervisors.md) specified in the
|
||||
[configuration file](#configuration) creates a VM to host the
|
||||
[agent](#agent) and the [workload](#workload) inside the
|
||||
[container environment](#environments).
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
> The hypervisor process runs inside an environment slightly different
|
||||
> to the host environment:
|
||||
>
|
||||
> - It is run in a different cgroup environment to the host.
|
||||
> - It is given a separate network namespace from the host.
|
||||
> - If the [OCI configuration specifies a SELinux label](https://github.com/opencontainers/runtime-spec/blob/main/config.md#linux-process),
|
||||
> the hypervisor process will run with that label (*not* the workload running inside the hypervisor's VM).
|
||||
|
||||
## Agent
|
||||
|
||||
The Kata Containers agent ([`kata-agent`](../../../src/agent)), written
|
||||
in the [Rust programming language](https://www.rust-lang.org), is a
|
||||
long running process that runs inside the VM. It acts as the
|
||||
supervisor for managing the containers and the [workload](#workload)
|
||||
running within those containers. Only a single agent process is run
|
||||
for each VM created.
|
||||
|
||||
### Agent communications protocol
|
||||
|
||||
The agent communicates with the other Kata components (primarily the
|
||||
[runtime](#runtime)) using a
|
||||
[`ttRPC`](https://github.com/containerd/ttrpc-rust) based
|
||||
[protocol](../../../src/agent/protocols/protos).
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
> If you wish to learn more about this protocol, a practical way to do
|
||||
> so is to experiment with the
|
||||
> [agent control tool](#agent-control-tool) on a test system.
|
||||
> This tool is for test and development purposes only and can send
|
||||
> arbitrary ttRPC agent API commands to the [agent](#agent).
|
||||
|
||||
## Runtime
|
||||
|
||||
The Kata Containers runtime (the [`containerd-shim-kata-v2`](../../../src/runtime/cmd/containerd-shim-kata-v2
|
||||
) binary) is a [shimv2](#shim-v2-architecture) compatible runtime.
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
> The Kata Containers runtime is sometimes referred to as the Kata
|
||||
> _shim_. Both terms are correct since the `containerd-shim-kata-v2`
|
||||
> is a container runtime, and that runtime implements the containerd
|
||||
> shim v2 API.
|
||||
|
||||
The runtime makes heavy use of the [`virtcontainers`
|
||||
package](../../../src/runtime/virtcontainers), which provides a generic,
|
||||
runtime-specification agnostic, hardware-virtualized containers
|
||||
library.
|
||||
|
||||
The runtime is responsible for starting the [hypervisor](#hypervisor)
|
||||
and it's VM, and communicating with the [agent](#agent) using a
|
||||
[ttRPC based protocol](#agent-communications-protocol) over a VSOCK
|
||||
socket that provides a communications link between the VM and the
|
||||
host.
|
||||
|
||||
This protocol allows the runtime to send container management commands
|
||||
to the agent. The protocol is also used to carry the standard I/O
|
||||
streams (`stdout`, `stderr`, `stdin`) between the containers and
|
||||
container managers (such as CRI-O or containerd).
|
||||
|
||||
## Utility program
|
||||
|
||||
The `kata-runtime` binary is a utility program that provides
|
||||
administrative commands to manipulate and query a Kata Containers
|
||||
installation.
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
> In Kata 1.x, this program also acted as the main
|
||||
> [runtime](#runtime), but this is no longer required due to the
|
||||
> improved shimv2 architecture.
|
||||
|
||||
### exec command
|
||||
|
||||
The `exec` command allows an administrator or developer to enter the
|
||||
[VM root environment](#environments) which is not accessible by the container
|
||||
[workload](#workload).
|
||||
|
||||
See [the developer guide](../../Developer-Guide.md#connect-to-debug-console) for further details.
|
||||
|
||||
### Configuration
|
||||
|
||||
See the [configuration file details](../../../src/runtime/README.md#configuration).
|
||||
|
||||
The configuration file is also used to enable runtime [debug output](../../Developer-Guide.md#enable-full-debug).
|
||||
|
||||
## Process overview
|
||||
|
||||
The table below shows an example of the main processes running in the
|
||||
different [environments](#environments) when a Kata Container is
|
||||
created with containerd using our [example command](example-command.md):
|
||||
|
||||
| Description | Host | VM root environment | VM container environment |
|
||||
|-|-|-|-|
|
||||
| Container manager | `containerd` | |
|
||||
| Kata Containers | [runtime](#runtime), [`virtiofsd`](storage.md#virtio-fs), [hypervisor](#hypervisor) | [agent](#agent) |
|
||||
| User [workload](#workload) | | | [`ubuntu sh`](example-command.md) |
|
||||
|
||||
## Networking
|
||||
|
||||
See the [networking document](networking.md).
|
||||
|
||||
## Storage
|
||||
|
||||
See the [storage document](storage.md).
|
||||
|
||||
## Kubernetes support
|
||||
|
||||
See the [Kubernetes document](kubernetes.md).
|
||||
|
||||
#### OCI annotations
|
||||
|
||||
In order for the Kata Containers [runtime](#runtime) (or any VM based OCI compatible
|
||||
runtime) to be able to understand if it needs to create a full VM or if it
|
||||
has to create a new container inside an existing pod's VM, CRI-O adds
|
||||
specific annotations to the OCI configuration file (`config.json`) which is passed to
|
||||
the OCI compatible runtime.
|
||||
|
||||
Before calling its runtime, CRI-O will always add a `io.kubernetes.cri-o.ContainerType`
|
||||
annotation to the `config.json` configuration file it produces from the Kubelet CRI
|
||||
request. The `io.kubernetes.cri-o.ContainerType` annotation can either be set to `sandbox`
|
||||
or `container`. Kata Containers will then use this annotation to decide if it needs to
|
||||
respectively create a virtual machine or a container inside a virtual machine associated
|
||||
with a Kubernetes pod:
|
||||
|
||||
| Annotation value | Kata VM created? | Kata container created? |
|
||||
|-|-|-|
|
||||
| `sandbox` | yes | yes (inside new VM) |
|
||||
| `container`| no | yes (in existing VM) |
|
||||
|
||||
#### Mixing VM based and namespace based runtimes
|
||||
|
||||
> **Note:** Since Kubernetes 1.12, the [`Kubernetes RuntimeClass`](https://kubernetes.io/docs/concepts/containers/runtime-class/)
|
||||
> has been supported and the user can specify runtime without the non-standardized annotations.
|
||||
|
||||
With `RuntimeClass`, users can define Kata Containers as a
|
||||
`RuntimeClass` and then explicitly specify that a pod must be created
|
||||
as a Kata Containers pod. For details, please refer to [How to use
|
||||
Kata Containers and containerd](../../../docs/how-to/containerd-kata.md).
|
||||
|
||||
## Tracing
|
||||
|
||||
The [tracing document](../../tracing.md) provides details on the tracing
|
||||
architecture.
|
||||
|
||||
# Appendices
|
||||
|
||||
## DAX
|
||||
|
||||
Kata Containers utilizes the Linux kernel DAX
|
||||
[(Direct Access filesystem)](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/filesystems/dax.rst?h=v5.14)
|
||||
feature to efficiently map the [guest image](guest-assets.md#guest-image) in the
|
||||
[host environment](#environments) into the
|
||||
[guest VM environment](#environments) to become the VM's
|
||||
[rootfs](background.md#root-filesystem).
|
||||
|
||||
If the [configured](#configuration) [hypervisor](#hypervisor) is set
|
||||
to either QEMU or Cloud Hypervisor, DAX is used with the feature shown
|
||||
in the table below:
|
||||
|
||||
| Hypervisor | Feature used | rootfs device type |
|
||||
|-|-|-|
|
||||
| Cloud Hypervisor (CH) | `dax` `FsConfig` configuration option | PMEM (emulated Persistent Memory device) |
|
||||
| QEMU | NVDIMM memory device with a memory file backend | NVDIMM (emulated Non-Volatile Dual In-line Memory Module device) |
|
||||
|
||||
The features in the table above are equivalent in that they provide a memory-mapped
|
||||
virtual device which is used to DAX map the VM's
|
||||
[rootfs](background.md#root-filesystem) into the [VM guest](#environments) memory
|
||||
address space.
|
||||
|
||||
The VM is then booted, specifying the `root=` kernel parameter to make
|
||||
the [guest kernel](guest-assets.md#guest-kernel) use the appropriate emulated device
|
||||
as its rootfs.
|
||||
|
||||
### DAX advantages
|
||||
|
||||
Mapping files using [DAX](#dax) provides a number of benefits over
|
||||
more traditional VM file and device mapping mechanisms:
|
||||
|
||||
- Mapping as a direct access device allows the guest to directly
|
||||
access the host memory pages (such as via Execute In Place (XIP)),
|
||||
bypassing the [guest kernel](guest-assets.md#guest-kernel)'s page cache. This
|
||||
zero copy provides both time and space optimizations.
|
||||
|
||||
- Mapping as a direct access device inside the VM allows pages from the
|
||||
host to be demand loaded using page faults, rather than having to make requests
|
||||
via a virtualized device (causing expensive VM exits/hypercalls), thus providing
|
||||
a speed optimization.
|
||||
|
||||
- Utilizing `mmap(2)`'s `MAP_SHARED` shared memory option on the host
|
||||
allows the host to efficiently share pages.
|
||||
|
||||

|
||||
|
||||
For further details of the use of NVDIMM with QEMU, see the [QEMU
|
||||
project documentation](https://www.qemu.org).
|
||||
|
||||
## Agent control tool
|
||||
|
||||
The [agent control tool](../../../src/tools/agent-ctl) is a test and
|
||||
development tool that can be used to learn more about a Kata
|
||||
Containers system.
|
||||
|
||||
## Terminology
|
||||
|
||||
See the [project glossary](../../../Glossary.md).
|
||||
@@ -1,81 +0,0 @@
|
||||
# Kata Containers architecture background knowledge
|
||||
|
||||
The following sections explain some of the background concepts
|
||||
required to understand the [architecture document](README.md).
|
||||
|
||||
## Root filesystem
|
||||
|
||||
This document uses the term _rootfs_ to refer to a root filesystem
|
||||
which is mounted as the top-level directory ("`/`") and often referred
|
||||
to as _slash_.
|
||||
|
||||
It is important to understand this term since the overall system uses
|
||||
multiple different rootfs's (as explained in the
|
||||
[Environments](README.md#environments) section.
|
||||
|
||||
## Container image
|
||||
|
||||
In the [example command](example-command.md) the user has specified the
|
||||
type of container they wish to run via the container image name:
|
||||
`ubuntu`. This image name corresponds to a _container image_ that can
|
||||
be used to create a container with an Ubuntu Linux environment. Hence,
|
||||
in our [example](example-command.md), the `sh(1)` command will be run
|
||||
inside a container which has an Ubuntu rootfs.
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
> The term _container image_ is confusing since the image in question
|
||||
> is **not** a container: it is simply a set of files (_an image_)
|
||||
> that can be used to _create_ a container. The term _container
|
||||
> template_ would be more accurate but the term _container image_ is
|
||||
> commonly used so this document uses the standard term.
|
||||
|
||||
For the purposes of this document, the most important part of the
|
||||
[example command line](example-command.md) is the container image the
|
||||
user has requested. Normally, the container manager will _pull_
|
||||
(download) a container image from a remote site and store a copy
|
||||
locally. This local container image is used by the container manager
|
||||
to create an [OCI bundle](#oci-bundle) which will form the environment
|
||||
the container will run in. After creating the OCI bundle, the
|
||||
container manager launches a [runtime](README.md#runtime) which will create the
|
||||
container using the provided OCI bundle.
|
||||
|
||||
## OCI bundle
|
||||
|
||||
To understand what follows, it is important to know at a high level
|
||||
how an OCI ([Open Containers Initiative](https://opencontainers.org)) compatible container is created.
|
||||
|
||||
An OCI compatible container is created by taking a
|
||||
[container image](#container-image) and converting the embedded rootfs
|
||||
into an
|
||||
[OCI rootfs bundle](https://github.com/opencontainers/runtime-spec/blob/main/bundle.md),
|
||||
or more simply, an _OCI bundle_.
|
||||
|
||||
An OCI bundle is a `tar(1)` archive normally created by a container
|
||||
manager which is passed to an OCI [runtime](README.md#runtime) which converts
|
||||
it into a full container rootfs. The bundle contains two assets:
|
||||
|
||||
- A container image [rootfs](#root-filesystem)
|
||||
|
||||
This is simply a directory of files that will be used to represent
|
||||
the rootfs for the container.
|
||||
|
||||
For the [example command](example-command.md), the directory will
|
||||
contain the files necessary to create a minimal Ubuntu root
|
||||
filesystem.
|
||||
|
||||
- An [OCI configuration file](https://github.com/opencontainers/runtime-spec/blob/main/config.md)
|
||||
|
||||
This is a JSON file called `config.json`.
|
||||
|
||||
The container manager will create this file so that:
|
||||
|
||||
- The `root.path` value is set to the full path of the specified
|
||||
container rootfs.
|
||||
|
||||
In [the example](example-command.md) this value will be `ubuntu`.
|
||||
|
||||
- The `process.args` array specifies the list of commands the user
|
||||
wishes to run. This is known as the [workload](README.md#workload).
|
||||
|
||||
In [the example](example-command.md) the workload is `sh(1)`.
|
||||
@@ -1,30 +0,0 @@
|
||||
# Example command
|
||||
|
||||
The following containerd command creates a container. It is referred
|
||||
to throughout the architecture document to help explain various points:
|
||||
|
||||
```bash
|
||||
$ sudo ctr run --runtime "io.containerd.kata.v2" --rm -t "quay.io/libpod/ubuntu:latest" foo sh
|
||||
```
|
||||
|
||||
This command requests that containerd:
|
||||
|
||||
- Create a container (`ctr run`).
|
||||
- Use the Kata [shimv2](README.md#shim-v2-architecture) runtime (`--runtime "io.containerd.kata.v2"`).
|
||||
- Delete the container when it [exits](README.md#workload-exit) (`--rm`).
|
||||
- Attach the container to the user's terminal (`-t`).
|
||||
- Use the Ubuntu Linux [container image](background.md#container-image)
|
||||
to create the container [rootfs](background.md#root-filesystem) that will become
|
||||
the [container environment](README.md#environments)
|
||||
(`quay.io/libpod/ubuntu:latest`).
|
||||
- Create the container with the name "`foo`".
|
||||
- Run the `sh(1)` command in the Ubuntu rootfs based container
|
||||
environment.
|
||||
|
||||
The command specified here is referred to as the [workload](README.md#workload).
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
> For the purposes of this document and to keep explanations
|
||||
> simpler, we assume the user is running this command in the
|
||||
> [host environment](README.md#environments).
|
||||
@@ -1,152 +0,0 @@
|
||||
# Guest assets
|
||||
|
||||
Kata Containers creates a VM in which to run one or more containers.
|
||||
It does this by launching a [hypervisor](README.md#hypervisor) to
|
||||
create the VM. The hypervisor needs two assets for this task: a Linux
|
||||
kernel and a small root filesystem image to boot the VM.
|
||||
|
||||
## Guest kernel
|
||||
|
||||
The [guest kernel](../../../tools/packaging/kernel)
|
||||
is passed to the hypervisor and used to boot the VM.
|
||||
The default kernel provided in Kata Containers is highly optimized for
|
||||
kernel boot time and minimal memory footprint, providing only those
|
||||
services required by a container workload. It is based on the latest
|
||||
Linux LTS (Long Term Support) [kernel](https://www.kernel.org).
|
||||
|
||||
## Guest image
|
||||
|
||||
The hypervisor uses an image file which provides a minimal root
|
||||
filesystem used by the guest kernel to boot the VM and host the Kata
|
||||
Container. Kata Containers supports both initrd and rootfs based
|
||||
minimal guest images. The [default packages](../../install/) provide both
|
||||
an image and an initrd, both of which are created using the
|
||||
[`osbuilder`](../../../tools/osbuilder) tool.
|
||||
|
||||
> **Notes:**
|
||||
>
|
||||
> - Although initrd and rootfs based images are supported, not all
|
||||
> [hypervisors](README.md#hypervisor) support both types of image.
|
||||
>
|
||||
> - The guest image is *unrelated* to the image used in a container
|
||||
> workload.
|
||||
>
|
||||
> For example, if a user creates a container that runs a shell in a
|
||||
> BusyBox image, they will run that shell in a BusyBox environment.
|
||||
> However, the guest image running inside the VM that is used to
|
||||
> *host* that BusyBox image could be running Clear Linux, Ubuntu,
|
||||
> Fedora or any other distribution potentially.
|
||||
>
|
||||
> The `osbuilder` tool provides
|
||||
> [configurations for various common Linux distributions](../../../tools/osbuilder/rootfs-builder)
|
||||
> which can be built into either initrd or rootfs guest images.
|
||||
>
|
||||
> - If you are using a [packaged version of Kata
|
||||
> Containers](../../install), you can see image details by running the
|
||||
> [`kata-collect-data.sh`](../../../src/runtime/data/kata-collect-data.sh.in)
|
||||
> script as `root` and looking at the "Image details" section of the
|
||||
> output.
|
||||
|
||||
#### Root filesystem image
|
||||
|
||||
The default packaged rootfs image, sometimes referred to as the _mini
|
||||
O/S_, is a highly optimized container bootstrap system.
|
||||
|
||||
If this image type is [configured](README.md#configuration), when the
|
||||
user runs the [example command](example-command.md):
|
||||
|
||||
- The [runtime](README.md#runtime) will launch the configured [hypervisor](README.md#hypervisor).
|
||||
- The hypervisor will boot the mini-OS image using the [guest kernel](#guest-kernel).
|
||||
- The kernel will start the init daemon as PID 1 (`systemd`) inside the VM root environment.
|
||||
- `systemd`, running inside the mini-OS context, will launch the [agent](README.md#agent)
|
||||
in the root context of the VM.
|
||||
- The agent will create a new container environment, setting its root
|
||||
filesystem to that requested by the user (Ubuntu in [the example](example-command.md)).
|
||||
- The agent will then execute the command (`sh(1)` in [the example](example-command.md))
|
||||
inside the new container.
|
||||
|
||||
The table below summarises the default mini O/S showing the
|
||||
environments that are created, the services running in those
|
||||
environments (for all platforms) and the root filesystem used by
|
||||
each service:
|
||||
|
||||
| Process | Environment | systemd service? | rootfs | User accessible | Notes |
|
||||
|-|-|-|-|-|-|
|
||||
| systemd | VM root | n/a | [VM guest image](#guest-image)| [debug console][debug-console] | The init daemon, running as PID 1 |
|
||||
| [Agent](README.md#agent) | VM root | yes | [VM guest image](#guest-image)| [debug console][debug-console] | Runs as a systemd service |
|
||||
| `chronyd` | VM root | yes | [VM guest image](#guest-image)| [debug console][debug-console] | Used to synchronise the time with the host |
|
||||
| container workload (`sh(1)` in [the example](example-command.md)) | VM container | no | User specified (Ubuntu in [the example](example-command.md)) | [exec command](README.md#exec-command) | Managed by the agent |
|
||||
|
||||
See also the [process overview](README.md#process-overview).
|
||||
|
||||
> **Notes:**
|
||||
>
|
||||
> - The "User accessible" column shows how an administrator can access
|
||||
> the environment.
|
||||
>
|
||||
> - The container workload is running inside a full container
|
||||
> environment which itself is running within a VM environment.
|
||||
>
|
||||
> - See the [configuration files for the `osbuilder` tool](../../../tools/osbuilder/rootfs-builder)
|
||||
> for details of the default distribution for platforms other than
|
||||
> Intel x86_64.
|
||||
|
||||
#### Initrd image
|
||||
|
||||
The initrd image is a compressed `cpio(1)` archive, created from a
|
||||
rootfs which is loaded into memory and used as part of the Linux
|
||||
startup process. During startup, the kernel unpacks it into a special
|
||||
instance of a `tmpfs` mount that becomes the initial root filesystem.
|
||||
|
||||
If this image type is [configured](README.md#configuration), when the user runs
|
||||
the [example command](example-command.md):
|
||||
|
||||
- The [runtime](README.md#runtime) will launch the configured [hypervisor](README.md#hypervisor).
|
||||
- The hypervisor will boot the mini-OS image using the [guest kernel](#guest-kernel).
|
||||
- The kernel will start the init daemon as PID 1 (the
|
||||
[agent](README.md#agent))
|
||||
inside the VM root environment.
|
||||
- The [agent](README.md#agent) will create a new container environment, setting its root
|
||||
filesystem to that requested by the user (`ubuntu` in
|
||||
[the example](example-command.md)).
|
||||
- The agent will then execute the command (`sh(1)` in [the example](example-command.md))
|
||||
inside the new container.
|
||||
|
||||
The table below summarises the default mini O/S showing the environments that are created,
|
||||
the processes running in those environments (for all platforms) and
|
||||
the root filesystem used by each service:
|
||||
|
||||
| Process | Environment | rootfs | User accessible | Notes |
|
||||
|-|-|-|-|-|
|
||||
| [Agent](README.md#agent) | VM root | [VM guest image](#guest-image) | [debug console][debug-console] | Runs as the init daemon (PID 1) |
|
||||
| container workload | VM container | User specified (Ubuntu in this example) | [exec command](README.md#exec-command) | Managed by the agent |
|
||||
|
||||
> **Notes:**
|
||||
>
|
||||
> - The "User accessible" column shows how an administrator can access
|
||||
> the environment.
|
||||
>
|
||||
> - It is possible to use a standard init daemon such as systemd with
|
||||
> an initrd image if this is desirable.
|
||||
|
||||
See also the [process overview](README.md#process-overview).
|
||||
|
||||
#### Image summary
|
||||
|
||||
| Image type | Default distro | Init daemon | Reason | Notes |
|
||||
|-|-|-|-|-|
|
||||
| [image](background.md#root-filesystem-image) | [Clear Linux](https://clearlinux.org) (for x86_64 systems)| systemd | Minimal and highly optimized | systemd offers flexibility |
|
||||
| [initrd](#initrd-image) | [Alpine Linux](https://alpinelinux.org) | Kata [agent](README.md#agent) (as no systemd support) | Security hardened and tiny C library |
|
||||
|
||||
See also:
|
||||
|
||||
- The [osbuilder](../../../tools/osbuilder) tool
|
||||
|
||||
This is used to build all default image types.
|
||||
|
||||
- The [versions database](../../../versions.yaml)
|
||||
|
||||
The `default-image-name` and `default-initrd-name` options specify
|
||||
the default distributions for each image type.
|
||||
|
||||
[debug-console]: ../../Developer-Guide.md#connect-to-debug-console
|
||||
@@ -1,41 +0,0 @@
|
||||
# History
|
||||
|
||||
## Kata 1.x architecture
|
||||
|
||||
In the old [Kata 1.x architecture](https://github.com/kata-containers/documentation/blob/master/design/architecture.md),
|
||||
the Kata [runtime](README.md#runtime) was an executable called `kata-runtime`.
|
||||
The container manager called this executable multiple times when
|
||||
creating each container. Each time the runtime was called a different
|
||||
OCI command-line verb was provided. This architecture was simple, but
|
||||
not well suited to creating VM based containers due to the issue of
|
||||
handling state between calls. Additionally, the architecture suffered
|
||||
from performance issues related to continually having to spawn new
|
||||
instances of the runtime binary, and
|
||||
[Kata shim](https://github.com/kata-containers/shim) and
|
||||
[Kata proxy](https://github.com/kata-containers/proxy) processes for systems
|
||||
that did not provide VSOCK.
|
||||
|
||||
## Kata 2.x architecture
|
||||
|
||||
See the ["shimv2"](README.md#shim-v2-architecture) section of the
|
||||
architecture document.
|
||||
|
||||
## Architectural comparison
|
||||
|
||||
| Kata version | Kata Runtime process calls | Kata shim processes | Kata proxy processes (if no VSOCK) |
|
||||
|-|-|-|-|
|
||||
| 1.x | multiple per container | 1 per container connection | 1 |
|
||||
| 2.x | 1 per VM (hosting any number of containers) | 0 | 0 |
|
||||
|
||||
> **Notes:**
|
||||
>
|
||||
> - A single VM can host one or more containers.
|
||||
>
|
||||
> - The "Kata shim processes" column refers to the old
|
||||
> [Kata shim](https://github.com/kata-containers/shim) (`kata-shim` binary),
|
||||
> *not* the new shimv2 runtime instance (`containerd-shim-kata-v2` binary).
|
||||
|
||||
The diagram below shows how the original architecture was simplified
|
||||
with the advent of shimv2.
|
||||
|
||||

|
||||
@@ -1,35 +0,0 @@
|
||||
# Kubernetes support
|
||||
|
||||
[Kubernetes](https://github.com/kubernetes/kubernetes/), or K8s, is a popular open source
|
||||
container orchestration engine. In Kubernetes, a set of containers sharing resources
|
||||
such as networking, storage, mount, PID, etc. is called a
|
||||
[pod](https://kubernetes.io/docs/user-guide/pods/).
|
||||
|
||||
A node can have multiple pods, but at a minimum, a node within a Kubernetes cluster
|
||||
only needs to run a container runtime and a container agent (called a
|
||||
[Kubelet](https://kubernetes.io/docs/admin/kubelet/)).
|
||||
|
||||
Kata Containers represents a Kubelet pod as a VM.
|
||||
|
||||
A Kubernetes cluster runs a control plane where a scheduler (typically
|
||||
running on a dedicated master node) calls into a compute Kubelet. This
|
||||
Kubelet instance is responsible for managing the lifecycle of pods
|
||||
within the nodes and eventually relies on a container runtime to
|
||||
handle execution. The Kubelet architecture decouples lifecycle
|
||||
management from container execution through a dedicated gRPC based
|
||||
[Container Runtime Interface (CRI)](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node/container-runtime-interface-v1.md).
|
||||
|
||||
In other words, a Kubelet is a CRI client and expects a CRI
|
||||
implementation to handle the server side of the interface.
|
||||
[CRI-O](https://github.com/kubernetes-incubator/cri-o) and
|
||||
[containerd](https://github.com/containerd/containerd/) are CRI
|
||||
implementations that rely on
|
||||
[OCI](https://github.com/opencontainers/runtime-spec) compatible
|
||||
runtimes for managing container instances.
|
||||
|
||||
Kata Containers is an officially supported CRI-O and containerd
|
||||
runtime. Refer to the following guides on how to set up Kata
|
||||
Containers with Kubernetes:
|
||||
|
||||
- [How to use Kata Containers and containerd](../../how-to/containerd-kata.md)
|
||||
- [Run Kata Containers with Kubernetes](../../how-to/run-kata-with-k8s.md)
|
||||
@@ -1,48 +0,0 @@
|
||||
# Networking
|
||||
|
||||
See the [networking document](networking.md).
|
||||
|
||||
Containers will typically live in their own, possibly shared, networking namespace.
|
||||
At some point in a container lifecycle, container engines will set up that namespace
|
||||
to add the container to a network which is isolated from the host network, but
|
||||
which is shared between containers
|
||||
|
||||
In order to do so, container engines will usually add one end of a virtual
|
||||
ethernet (`veth`) pair into the container networking namespace. The other end of
|
||||
the `veth` pair is added to the host networking namespace.
|
||||
|
||||
This is a very namespace-centric approach as many hypervisors or VM
|
||||
Managers (VMMs) such as `virt-manager` cannot handle `veth`
|
||||
interfaces. Typically, `TAP` interfaces are created for VM
|
||||
connectivity.
|
||||
|
||||
To overcome incompatibility between typical container engines expectations
|
||||
and virtual machines, Kata Containers networking transparently connects `veth`
|
||||
interfaces with `TAP` ones using Traffic Control:
|
||||
|
||||

|
||||
|
||||
With a TC filter in place, a redirection is created between the container network and the
|
||||
virtual machine. As an example, the CNI may create a device, `eth0`, in the container's network
|
||||
namespace, which is a VETH device. Kata Containers will create a tap device for the VM, `tap0_kata`,
|
||||
and setup a TC redirection filter to mirror traffic from `eth0`'s ingress to `tap0_kata`'s egress,
|
||||
and a second to mirror traffic from `tap0_kata`'s ingress to `eth0`'s egress.
|
||||
|
||||
Kata Containers maintains support for MACVTAP, which was an earlier implementation used in Kata. TC-filter
|
||||
is the default because it allows for simpler configuration, better CNI plugin compatibility, and performance
|
||||
on par with MACVTAP.
|
||||
|
||||
Kata Containers has deprecated support for bridge due to lacking performance relative to TC-filter and MACVTAP.
|
||||
|
||||
Kata Containers supports both
|
||||
[CNM](https://github.com/docker/libnetwork/blob/master/docs/design.md#the-container-network-model)
|
||||
and [CNI](https://github.com/containernetworking/cni) for networking management.
|
||||
|
||||
## Network Hotplug
|
||||
|
||||
Kata Containers has developed a set of network sub-commands and APIs to add, list and
|
||||
remove a guest network endpoint and to manipulate the guest route table.
|
||||
|
||||
The following diagram illustrates the Kata Containers network hotplug workflow.
|
||||
|
||||

|
||||
@@ -1,44 +0,0 @@
|
||||
# Storage
|
||||
|
||||
## virtio SCSI
|
||||
|
||||
If a block-based graph driver is [configured](README.md#configuration),
|
||||
`virtio-scsi` is used to _share_ the workload image (such as
|
||||
`busybox:latest`) into the container's environment inside the VM.
|
||||
|
||||
## virtio FS
|
||||
|
||||
If a block-based graph driver is _not_ [configured](README.md#configuration), a
|
||||
[`virtio-fs`](https://virtio-fs.gitlab.io) (`VIRTIO`) overlay
|
||||
filesystem mount point is used to _share_ the workload image instead. The
|
||||
[agent](README.md#agent) uses this mount point as the root filesystem for the
|
||||
container processes.
|
||||
|
||||
For virtio-fs, the [runtime](README.md#runtime) starts one `virtiofsd` daemon
|
||||
(that runs in the host context) for each VM created.
|
||||
|
||||
## Devicemapper
|
||||
|
||||
The
|
||||
[devicemapper `snapshotter`](https://github.com/containerd/containerd/tree/master/snapshots/devmapper)
|
||||
is a special case. The `snapshotter` uses dedicated block devices
|
||||
rather than formatted filesystems, and operates at the block level
|
||||
rather than the file level. This knowledge is used to directly use the
|
||||
underlying block device instead of the overlay file system for the
|
||||
container root file system. The block device maps to the top
|
||||
read-write layer for the overlay. This approach gives much better I/O
|
||||
performance compared to using `virtio-fs` to share the container file
|
||||
system.
|
||||
|
||||
#### Hot plug and unplug
|
||||
|
||||
Kata Containers has the ability to hot plug add and hot plug remove
|
||||
block devices. This makes it possible to use block devices for
|
||||
containers started after the VM has been launched.
|
||||
|
||||
Users can check to see if the container uses the `devicemapper` block
|
||||
device as its rootfs by calling `mount(8)` within the container. If
|
||||
the `devicemapper` block device is used, the root filesystem (`/`)
|
||||
will be mounted from `/dev/vda`. Users can disable direct mounting of
|
||||
the underlying block device through the runtime
|
||||
[configuration](README.md#configuration).
|
||||
@@ -1825,8 +1825,12 @@ components:
|
||||
desc: ""
|
||||
- value: grpc.StartContainerRequest
|
||||
desc: ""
|
||||
- value: grpc.StartTracingRequest
|
||||
desc: ""
|
||||
- value: grpc.StatsContainerRequest
|
||||
desc: ""
|
||||
- value: grpc.StopTracingRequest
|
||||
desc: ""
|
||||
- value: grpc.TtyWinResizeRequest
|
||||
desc: ""
|
||||
- value: grpc.UpdateContainerRequest
|
||||
|
||||
@@ -12,244 +12,187 @@ The OCI [runtime specification][linux-config] provides guidance on where the con
|
||||
> [`cgroupsPath`][cgroupspath]: (string, OPTIONAL) path to the cgroups. It can be used to either control the cgroups
|
||||
> hierarchy for containers or to run a new process in an existing container
|
||||
|
||||
Cgroups are hierarchical, and this can be seen with the following pod example:
|
||||
cgroups are hierarchical, and this can be seen with the following pod example:
|
||||
|
||||
- Pod 1: `cgroupsPath=/kubepods/pod1`
|
||||
- Container 1: `cgroupsPath=/kubepods/pod1/container1`
|
||||
- Container 2: `cgroupsPath=/kubepods/pod1/container2`
|
||||
- Container 1:
|
||||
`cgroupsPath=/kubepods/pod1/container1`
|
||||
- Container 2:
|
||||
`cgroupsPath=/kubepods/pod1/container2`
|
||||
|
||||
- Pod 2: `cgroupsPath=/kubepods/pod2`
|
||||
- Container 1: `cgroupsPath=/kubepods/pod2/container2`
|
||||
- Container 2: `cgroupsPath=/kubepods/pod2/container2`
|
||||
- Container 1:
|
||||
`cgroupsPath=/kubepods/pod2/container2`
|
||||
- Container 2:
|
||||
`cgroupsPath=/kubepods/pod2/container2`
|
||||
|
||||
Depending on the upper-level orchestration layers, the cgroup under which the pod is placed is
|
||||
managed by the orchestrator or not. In the case of Kubernetes, the pod cgroup is created by Kubelet,
|
||||
while the container cgroups are to be handled by the runtime.
|
||||
Kubelet will size the pod cgroup based on the container resource requirements, to which it may add
|
||||
a configured set of [pod resource overheads](https://kubernetes.io/docs/concepts/scheduling-eviction/pod-overhead/).
|
||||
Depending on the upper-level orchestrator, the cgroup under which the pod is placed is
|
||||
managed by the orchestrator. In the case of Kubernetes, the pod-cgroup is created by Kubelet,
|
||||
while the container cgroups are to be handled by the runtime. Kubelet will size the pod-cgroup
|
||||
based on the container resource requirements.
|
||||
|
||||
Kata Containers introduces a non-negligible resource overhead for running a sandbox (pod). Typically, the Kata shim,
|
||||
through its underlying VMM invocation, will create many additional threads compared to process based container runtimes:
|
||||
the para-virtualized I/O back-ends, the VMM instance or even the Kata shim process, all of those host processes consume
|
||||
memory and CPU time not directly tied to the container workload, and introduces a sandbox resource overhead.
|
||||
In order for a Kata workload to run without significant performance degradation, its sandbox overhead must be
|
||||
provisioned accordingly. Two scenarios are possible:
|
||||
Kata Containers introduces a non-negligible overhead for running a sandbox (pod). Based on this, two scenarios are possible:
|
||||
1) The upper-layer orchestrator takes the overhead of running a sandbox into account when sizing the pod-cgroup, or
|
||||
2) Kata Containers do not fully constrain the VMM and associated processes, instead placing a subset of them outside of the pod-cgroup.
|
||||
|
||||
1) The upper-layer orchestrator takes the overhead of running a sandbox into account when sizing the pod cgroup.
|
||||
For example, Kubernetes [`PodOverhead`](https://kubernetes.io/docs/concepts/scheduling-eviction/pod-overhead/)
|
||||
feature lets the orchestrator add a configured sandbox overhead to the sum of all its containers resources. In
|
||||
that case, the pod sandbox is properly sized and all Kata created processes will run under the pod cgroup
|
||||
defined constraints and limits.
|
||||
2) The upper-layer orchestrator does **not** take the sandbox overhead into account and the pod cgroup is not
|
||||
sized to properly run all Kata created processes. With that scenario, attaching all the Kata processes to the sandbox
|
||||
cgroup may lead to non-negligible workload performance degradations. As a consequence, Kata Containers will move
|
||||
all processes but the vCPU threads into a dedicated overhead cgroup under `/kata_overhead`. The Kata runtime will
|
||||
not apply any constraints or limits to that cgroup, it is up to the infrastructure owner to optionally set it up.
|
||||
Kata Containers provides two options for how cgroups are handled on the host. Selection of these options is done through
|
||||
the `SandboxCgroupOnly` flag within the Kata Containers [configuration](../../src/runtime/README.md#configuration)
|
||||
file.
|
||||
|
||||
Those 2 scenarios are not dynamically detected by the Kata Containers runtime implementation, and thus the
|
||||
infrastructure owner must configure the runtime according to how the upper-layer orchestrator creates and sizes the
|
||||
pod cgroup. That configuration selection is done through the `sandbox_cgroup_only` flag within the Kata Containers
|
||||
[configuration](../../src/runtime/README.md#configuration) file.
|
||||
## `SandboxCgroupOnly` enabled
|
||||
|
||||
## `sandbox_cgroup_only = true`
|
||||
With `SandboxCgroupOnly` enabled, it is expected that the parent cgroup is sized to take the overhead of running
|
||||
a sandbox into account. This is ideal, as all the applicable Kata Containers components can be placed within the
|
||||
given cgroup-path.
|
||||
|
||||
Setting `sandbox_cgroup_only` to `true` from the Kata Containers configuration file means that the pod cgroup is
|
||||
properly sized and takes the pod overhead into account. This is ideal, as all the applicable Kata Containers processes
|
||||
can simply be placed within the given cgroup path.
|
||||
|
||||
In the context of Kubernetes, Kubelet can size the pod cgroup to take the overhead of running a Kata-based sandbox
|
||||
into account. This has been supported since the 1.16 Kubernetes release, through the
|
||||
[`PodOverhead`](https://kubernetes.io/docs/concepts/scheduling-eviction/pod-overhead/) feature.
|
||||
In the context of Kubernetes, Kubelet will size the pod-cgroup to take the overhead of running a Kata-based sandbox
|
||||
into account. This will be feasible in the 1.16 Kubernetes release through the `PodOverhead` feature.
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────┐
|
||||
│ │
|
||||
│ ┌──────────────────────────────────┐ │
|
||||
│ │ │ │
|
||||
│ │ ┌─────────────────────────────┐ │ │
|
||||
│ │ │ │ │ │
|
||||
│ │ │ ┌─────────────────────┐ │ │ │
|
||||
│ │ │ │ vCPU threads │ │ │ │
|
||||
│ │ │ │ I/O threads │ │ │ │
|
||||
│ │ │ │ VMM │ │ │ │
|
||||
│ │ │ │ Kata Shim │ │ │ │
|
||||
│ │ │ │ │ │ │ │
|
||||
│ │ │ │ /kata_<sandbox_id> │ │ │ │
|
||||
│ │ │ └─────────────────────┘ │ │ │
|
||||
│ │ │Pod 1 │ │ │
|
||||
│ │ └─────────────────────────────┘ │ │
|
||||
│ │ │ │
|
||||
│ │ ┌─────────────────────────────┐ │ │
|
||||
│ │ │ │ │ │
|
||||
│ │ │ ┌─────────────────────┐ │ │ │
|
||||
│ │ │ │ vCPU threads │ │ │ │
|
||||
│ │ │ │ I/O threads │ │ │ │
|
||||
│ │ │ │ VMM │ │ │ │
|
||||
│ │ │ │ Kata Shim │ │ │ │
|
||||
│ │ │ │ │ │ │ │
|
||||
│ │ │ │ /kata_<sandbox_id> │ │ │ │
|
||||
│ │ │ └─────────────────────┘ │ │ │
|
||||
│ │ │Pod 2 │ │ │
|
||||
│ │ └─────────────────────────────┘ │ │
|
||||
│ │ │ │
|
||||
│ │/kubepods │ │
|
||||
│ └──────────────────────────────────┘ │
|
||||
│ │
|
||||
│ Node │
|
||||
└─────────────────────────────────────────┘
|
||||
+----------------------------------------------------------+
|
||||
| +---------------------------------------------------+ |
|
||||
| | +---------------------------------------------+ | |
|
||||
| | | +--------------------------------------+ | | |
|
||||
| | | | kata-shimv2, VMM and threads: | | | |
|
||||
| | | | (VMM, IO-threads, vCPU threads, etc)| | | |
|
||||
| | | | | | | |
|
||||
| | | | kata_<sandbox-id> | | | |
|
||||
| | | +--------------------------------------+ | | |
|
||||
| | | | | |
|
||||
| | |Pod 1 | | |
|
||||
| | +---------------------------------------------+ | |
|
||||
| | | |
|
||||
| | +---------------------------------------------+ | |
|
||||
| | | +--------------------------------------+ | | |
|
||||
| | | | kata-shimv2, VMM and threads: | | | |
|
||||
| | | | (VMM, IO-threads, vCPU threads, etc)| | | |
|
||||
| | | | | | | |
|
||||
| | | | kata_<sandbox-id> | | | |
|
||||
| | | +--------------------------------------+ | | |
|
||||
| | |Pod 2 | | |
|
||||
| | +---------------------------------------------+ | |
|
||||
| |kubepods | |
|
||||
| +---------------------------------------------------+ |
|
||||
| |
|
||||
|Node |
|
||||
+----------------------------------------------------------+
|
||||
```
|
||||
|
||||
### Implementation details
|
||||
### What does Kata do in this configuration?
|
||||
1. Given a `PodSandbox` container creation, let:
|
||||
|
||||
When `sandbox_cgroup_only` is enabled, the Kata shim will create a per pod
|
||||
sub-cgroup under the pod's dedicated cgroup. For example, in the Kubernetes context,
|
||||
it will create a `/kata_<PodSandboxID>` under the `/kubepods` cgroup hierarchy.
|
||||
On a typical cgroup v1 hierarchy mounted under `/sys/fs/cgroup/`, the memory cgroup
|
||||
subsystem for a pod with sandbox ID `12345678` would live under
|
||||
`/sys/fs/cgroup/memory/kubepods/kata_12345678`.
|
||||
```
|
||||
podCgroup=Parent(container.CgroupsPath)
|
||||
KataSandboxCgroup=<podCgroup>/kata_<PodSandboxID>
|
||||
```
|
||||
|
||||
In most cases, the `/kata_<PodSandboxID>` created cgroup is unrestricted and inherits and shares all
|
||||
constraints and limits from the parent cgroup (`/kubepods` in the Kubernetes case). The exception is
|
||||
for the `cpuset` and `devices` cgroup subsystems, which are managed by the Kata shim.
|
||||
2. Create the cgroup, `KataSandboxCgroup`
|
||||
|
||||
After creating the `/kata_<PodSandboxID>` cgroup, the Kata Containers shim will move itself to it, **before** starting
|
||||
the virtual machine. As a consequence all processes subsequently created by the Kata Containers shim (the VMM itself, and
|
||||
all vCPU and I/O related threads) will be created in the `/kata_<PodSandboxID>` cgroup.
|
||||
3. Join the `KataSandboxCgroup`
|
||||
|
||||
### Why create a kata-cgroup under the parent cgroup?
|
||||
Any process created by the runtime will be created in `KataSandboxCgroup`.
|
||||
The runtime will limit the cgroup in the host only if the sandbox doesn't have a
|
||||
container type annotation, but the caller is free to set the proper limits for the `podCgroup`.
|
||||
|
||||
And why not directly adding the per sandbox shim directly to the pod cgroup (e.g.
|
||||
`/kubepods` in the Kubernetes context)?
|
||||
In the example above the pod cgroups are `/kubepods/pod1` and `/kubepods/pod2`.
|
||||
Kata creates the unrestricted sandbox cgroup under the pod cgroup.
|
||||
|
||||
The Kata Containers shim implementation creates a per-sandbox cgroup
|
||||
(`/kata_<PodSandboxID>`) to support the `Docker` use case. Although `Docker` does not
|
||||
have a notion of pods, Kata Containers still creates a sandbox to support the pod-less,
|
||||
single container use case that `Docker` implements. Since `Docker` does create any
|
||||
cgroup hierarchy to place a container into, it would be very complex for Kata to map
|
||||
a particular container to its sandbox without placing it under a `/kata_<containerID>>`
|
||||
sub-cgroup first.
|
||||
### Why create a Kata-cgroup under the parent cgroup?
|
||||
|
||||
### Advantages
|
||||
`Docker` does not have a notion of pods, and will not create a cgroup directory
|
||||
to place a particular container in (i.e., all containers would be in a path like
|
||||
`/docker/container-id`. To simplify the implementation and continue to support `Docker`,
|
||||
Kata Containers creates the sandbox-cgroup, in the case of Kubernetes, or a container cgroup, in the case
|
||||
of docker.
|
||||
|
||||
Keeping all Kata Containers processes under a properly sized pod cgroup is ideal
|
||||
and makes for a simpler Kata Containers implementation. It also helps with gathering
|
||||
accurate statistics and preventing Kata workloads from being noisy neighbors.
|
||||
### Improvements
|
||||
|
||||
#### Pod resources statistics
|
||||
- Get statistics about pod resources
|
||||
|
||||
If the Kata caller wants to know the resource usage on the host it can get
|
||||
statistics from the pod cgroup. All cgroups stats in the hierarchy will include
|
||||
the Kata overhead. This gives the possibility of gathering usage-statics at the
|
||||
pod level and the container level.
|
||||
|
||||
#### Better host resource isolation
|
||||
- Better host resource isolation
|
||||
|
||||
Because the Kata runtime will place all the Kata processes in the pod cgroup,
|
||||
the resource limits that the caller applies to the pod cgroup will affect all
|
||||
processes that belong to the Kata sandbox in the host. This will improve the
|
||||
isolation in the host preventing Kata to become a noisy neighbor.
|
||||
|
||||
## `sandbox_cgroup_only = false` (Default setting)
|
||||
|
||||
If the cgroup provided to Kata is not sized appropriately, Kata components will
|
||||
consume resources that the actual container workloads expect to see and use.
|
||||
This can cause instability and performance degradations.
|
||||
|
||||
To avoid that situation, Kata Containers creates an unconstrained overhead
|
||||
cgroup and moves all non workload related processes (Anything but the virtual CPU
|
||||
threads) to it. The name of this overhead cgroup is `/kata_overhead` and a per
|
||||
sandbox sub cgroup will be created under it for each sandbox Kata Containers creates.
|
||||
|
||||
Kata Containers does not add any constraints or limitations on the overhead cgroup. It is up to the infrastructure
|
||||
owner to either:
|
||||
|
||||
- Provision nodes with a pre-sized `/kata_overhead` cgroup. Kata Containers will
|
||||
load that existing cgroup and move all non workload related processes to it.
|
||||
- Let Kata Containers create the `/kata_overhead` cgroup, leave it
|
||||
unconstrained or resize it a-posteriori.
|
||||
## `SandboxCgroupOnly` disabled (default, legacy)
|
||||
|
||||
If the cgroup provided to Kata is not sized appropriately, instability will be
|
||||
introduced when fully constraining Kata components, and the user-workload will
|
||||
see a subset of resources that were requested. Based on this, the default
|
||||
handling for Kata Containers is to not fully constrain the VMM and Kata
|
||||
components on the host.
|
||||
|
||||
```
|
||||
┌────────────────────────────────────────────────────────────────────┐
|
||||
│ │
|
||||
│ ┌─────────────────────────────┐ ┌───────────────────────────┐ │
|
||||
│ │ │ │ │ │
|
||||
│ │ ┌─────────────────────────┼────┼─────────────────────────┐ │ │
|
||||
│ │ │ │ │ │ │ │
|
||||
│ │ │ ┌─────────────────────┐ │ │ ┌─────────────────────┐ │ │ │
|
||||
│ │ │ │ vCPU threads │ │ │ │ VMM │ │ │ │
|
||||
│ │ │ │ │ │ │ │ I/O threads │ │ │ │
|
||||
│ │ │ │ │ │ │ │ Kata Shim │ │ │ │
|
||||
│ │ │ │ │ │ │ │ │ │ │ │
|
||||
│ │ │ │ /kata_<sandbox_id> │ │ │ │ /<sandbox_id> │ │ │ │
|
||||
│ │ │ └─────────────────────┘ │ │ └─────────────────────┘ │ │ │
|
||||
│ │ │ │ │ │ │ │
|
||||
│ │ │ Pod 1 │ │ │ │ │
|
||||
│ │ └─────────────────────────┼────┼─────────────────────────┘ │ │
|
||||
│ │ │ │ │ │
|
||||
│ │ │ │ │ │
|
||||
│ │ ┌─────────────────────────┼────┼─────────────────────────┐ │ │
|
||||
│ │ │ │ │ │ │ │
|
||||
│ │ │ ┌─────────────────────┐ │ │ ┌─────────────────────┐ │ │ │
|
||||
│ │ │ │ vCPU threads │ │ │ │ VMM │ │ │ │
|
||||
│ │ │ │ │ │ │ │ I/O threads │ │ │ │
|
||||
│ │ │ │ │ │ │ │ Kata Shim │ │ │ │
|
||||
│ │ │ │ │ │ │ │ │ │ │ │
|
||||
│ │ │ │ /kata_<sandbox_id> │ │ │ │ /<sandbox_id> │ │ │ │
|
||||
│ │ │ └─────────────────────┘ │ │ └─────────────────────┘ │ │ │
|
||||
│ │ │ │ │ │ │ │
|
||||
│ │ │ Pod 2 │ │ │ │ │
|
||||
│ │ └─────────────────────────┼────┼─────────────────────────┘ │ │
|
||||
│ │ │ │ │ │
|
||||
│ │ /kubepods │ │ /kata_overhead │ │
|
||||
│ └─────────────────────────────┘ └───────────────────────────┘ │
|
||||
│ │
|
||||
│ │
|
||||
│ Node │
|
||||
└────────────────────────────────────────────────────────────────────┘
|
||||
+----------------------------------------------------------+
|
||||
| +---------------------------------------------------+ |
|
||||
| | +---------------------------------------------+ | |
|
||||
| | | +--------------------------------------+ | | |
|
||||
| | | |Container 1 |-|Container 2 | | | |
|
||||
| | | | |-| | | | |
|
||||
| | | | Shim+container1 |-| Shim+container2 | | | |
|
||||
| | | +--------------------------------------+ | | |
|
||||
| | | | | |
|
||||
| | |Pod 1 | | |
|
||||
| | +---------------------------------------------+ | |
|
||||
| | | |
|
||||
| | +---------------------------------------------+ | |
|
||||
| | | +--------------------------------------+ | | |
|
||||
| | | |Container 1 |-|Container 2 | | | |
|
||||
| | | | |-| | | | |
|
||||
| | | | Shim+container1 |-| Shim+container2 | | | |
|
||||
| | | +--------------------------------------+ | | |
|
||||
| | | | | |
|
||||
| | |Pod 2 | | |
|
||||
| | +---------------------------------------------+ | |
|
||||
| |kubepods | |
|
||||
| +---------------------------------------------------+ |
|
||||
| +---------------------------------------------------+ |
|
||||
| | Hypervisor | |
|
||||
| |Kata | |
|
||||
| +---------------------------------------------------+ |
|
||||
| |
|
||||
|Node |
|
||||
+----------------------------------------------------------+
|
||||
|
||||
```
|
||||
|
||||
### Implementation Details
|
||||
### What does this method do?
|
||||
|
||||
When `sandbox_cgroup_only` is disabled, the Kata Containers shim will create a per pod
|
||||
sub-cgroup under the pods dedicated cgroup, and another one under the overhead cgroup.
|
||||
For example, in the Kubernetes context, it will create a `/kata_<PodSandboxID>` under
|
||||
the `/kubepods` cgroup hierarchy, and a `/<PodSandboxID>` under the `/kata_overhead` one.
|
||||
1. Given a container creation let `containerCgroupHost=container.CgroupsPath`
|
||||
1. Rename `containerCgroupHost` path to add `kata_`
|
||||
1. Let `PodCgroupPath=PodSanboxContainerCgroup` where `PodSanboxContainerCgroup` is the cgroup of a container of type `PodSandbox`
|
||||
1. Limit the `PodCgroupPath` with the sum of all the container limits in the Sandbox
|
||||
1. Move only vCPU threads of hypervisor to `PodCgroupPath`
|
||||
1. Per each container, move its `kata-shim` to its own `containerCgroupHost`
|
||||
1. Move hypervisor and applicable threads to memory cgroup `/kata`
|
||||
|
||||
On a typical cgroup v1 hierarchy mounted under `/sys/fs/cgroup/`, for a pod which sandbox
|
||||
ID is `12345678`, create with `sandbox_cgroup_only` disabled, the 2 memory subsystems
|
||||
for the sandbox cgroup and the overhead cgroup would respectively live under
|
||||
`/sys/fs/cgroup/memory/kubepods/kata_12345678` and `/sys/fs/cgroup/memory/kata_overhead/12345678`.
|
||||
_Note_: the Kata Containers runtime will not add all the hypervisor threads to
|
||||
the cgroup path requested, only vCPUs. These threads are run unconstrained.
|
||||
|
||||
Unlike when `sandbox_cgroup_only` is enabled, the Kata Containers shim will move itself
|
||||
to the overhead cgroup first, and then move the vCPU threads to the sandbox cgroup as
|
||||
they're created. All Kata processes and threads will run under the overhead cgroup except for
|
||||
the vCPU threads.
|
||||
This mitigates the risk of the VMM and other threads receiving an out of memory scenario (`OOM`).
|
||||
|
||||
With `sandbox_cgroup_only` disabled, Kata Containers assumes the pod cgroup is only sized
|
||||
to accommodate for the actual container workloads processes. For Kata, this maps
|
||||
to the VMM created virtual CPU threads and so they are the only ones running under the pod
|
||||
cgroup. This mitigates the risk of the VMM, the Kata shim and the I/O threads going through
|
||||
a catastrophic out of memory scenario (`OOM`).
|
||||
|
||||
#### Pros and Cons
|
||||
#### Impact
|
||||
|
||||
Running all non vCPU threads under an unconstrained overhead cgroup could lead to workloads
|
||||
potentially consuming a large amount of host resources.
|
||||
If resources are reserved at a system level to account for the overheads of
|
||||
running sandbox containers, this configuration can be utilized with adequate
|
||||
stability. In this scenario, non-negligible amounts of CPU and memory will be
|
||||
utilized unaccounted for on the host.
|
||||
|
||||
On the other hand, running all non vCPU threads under a dedicated overhead cgroup can provide
|
||||
accurate metrics on the actual Kata Container pod overhead, allowing for tuning the overhead
|
||||
cgroup size and constraints accordingly.
|
||||
|
||||
[linux-config]: https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md
|
||||
[cgroupspath]: https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#cgroups-path
|
||||
[linux-config]: https://github.com/opencontainers/runtime-spec/blob/master/config-linux.md
|
||||
[cgroupspath]: https://github.com/opencontainers/runtime-spec/blob/master/config-linux.md#cgroups-path
|
||||
|
||||
# Supported cgroups
|
||||
|
||||
Kata Containers currently only supports cgroups `v1`.
|
||||
|
||||
In the following sections each cgroup is described briefly.
|
||||
Kata Containers supports cgroups `v1` and `v2`. In the following sections each cgroup is
|
||||
described briefly and what changes are needed in Kata Containers to support it.
|
||||
|
||||
## Cgroups V1
|
||||
|
||||
@@ -301,7 +244,7 @@ diagram:
|
||||
A process can join a cgroup by writing its process id (`pid`) to `cgroup.procs` file,
|
||||
or join a cgroup partially by writing the task (thread) id (`tid`) to the `tasks` file.
|
||||
|
||||
Kata Containers only supports `v1`.
|
||||
Kata Containers supports `v1` by default and no change in the configuration file is needed.
|
||||
To know more about `cgroups v1`, see [cgroupsv1(7)][2].
|
||||
|
||||
## Cgroups V2
|
||||
@@ -354,13 +297,22 @@ Same as `cgroups v1`, a process can join the cgroup by writing its process id (`
|
||||
`cgroup.procs` file, or join a cgroup partially by writing the task (thread) id (`tid`) to
|
||||
`cgroup.threads` file.
|
||||
|
||||
Kata Containers does not support cgroups `v2` on the host.
|
||||
For backwards compatibility Kata Containers defaults to supporting cgroups v1 by default.
|
||||
To change this to `v2`, set `sandbox_cgroup_only=true` in the `configuration.toml` file.
|
||||
To know more about `cgroups v2`, see [cgroupsv2(7)][3].
|
||||
|
||||
### Distro Support
|
||||
|
||||
Many Linux distributions do not yet support `cgroups v2`, as it is quite a recent addition.
|
||||
For more information about the status of this feature see [issue #2494][4].
|
||||
|
||||
# Summary
|
||||
|
||||
| cgroup option | default? | status | pros | cons | cgroups
|
||||
|-|-|-|-|-|-|
|
||||
| `SandboxCgroupOnly=false` | yes | legacy | Easiest to make Kata work | Unaccounted for memory and resource utilization | v1
|
||||
| `SandboxCgroupOnly=true` | no | recommended | Complete tracking of Kata memory and CPU utilization. In Kubernetes, the Kubelet can fully constrain Kata via the pod cgroup | Requires upper layer orchestrator which sizes sandbox cgroup appropriately | v1, v2
|
||||
|
||||
|
||||
[1]: http://man7.org/linux/man-pages/man5/tmpfs.5.html
|
||||
[2]: http://man7.org/linux/man-pages/man7/cgroups.7.html#CGROUPS_VERSION_1
|
||||
|
||||
@@ -1,21 +1,21 @@
|
||||
# Kata 2.0 Metrics Design
|
||||
|
||||
Kata implements CRI's API and supports [`ContainerStats`](https://github.com/kubernetes/kubernetes/blob/release-1.18/staging/src/k8s.io/cri-api/pkg/apis/runtime/v1alpha2/api.proto#L101) and [`ListContainerStats`](https://github.com/kubernetes/kubernetes/blob/release-1.18/staging/src/k8s.io/cri-api/pkg/apis/runtime/v1alpha2/api.proto#L103) interfaces to expose containers metrics. User can use these interfaces to get basic metrics about containers.
|
||||
Kata implement CRI's API and support [`ContainerStats`](https://github.com/kubernetes/kubernetes/blob/release-1.18/staging/src/k8s.io/cri-api/pkg/apis/runtime/v1alpha2/api.proto#L101) and [`ListContainerStats`](https://github.com/kubernetes/kubernetes/blob/release-1.18/staging/src/k8s.io/cri-api/pkg/apis/runtime/v1alpha2/api.proto#L103) interfaces to expose containers metrics. User can use these interface to get basic metrics about container.
|
||||
|
||||
Unlike `runc`, Kata is a VM-based runtime and has a different architecture.
|
||||
But unlike `runc`, Kata is a VM-based runtime and has a different architecture.
|
||||
|
||||
## Limitations of Kata 1.x and target of Kata 2.0
|
||||
## Limitations of Kata 1.x and the target of Kata 2.0
|
||||
|
||||
Kata 1.x has a number of limitations related to observability that may be obstacles to running Kata Containers at scale.
|
||||
|
||||
In Kata 2.0, the following components will be able to provide more details about the system:
|
||||
In Kata 2.0, the following components will be able to provide more details about the system.
|
||||
|
||||
- containerd shim v2 (effectively `kata-runtime`)
|
||||
- Hypervisor statistics
|
||||
- Agent process
|
||||
- Guest OS statistics
|
||||
|
||||
> **Note**: In Kata 1.x, the main user-facing component was the runtime (`kata-runtime`). From 1.5, Kata introduced the Kata containerd shim v2 (`containerd-shim-kata-v2`) which is essentially a modified runtime that is loaded by containerd to simplify and improve the way VM-based containers are created and managed.
|
||||
> **Note**: In Kata 1.x, the main user-facing component was the runtime (`kata-runtime`). From 1.5, Kata then introduced the Kata containerd shim v2 (`containerd-shim-kata-v2`) which is essentially a modified runtime that is loaded by containerd to simplify and improve the way VM-based containers are created and managed.
|
||||
>
|
||||
> For Kata 2.0, the main component is the Kata containerd shim v2, although the deprecated `kata-runtime` binary will be maintained for a period of time.
|
||||
>
|
||||
@@ -25,15 +25,14 @@ In Kata 2.0, the following components will be able to provide more details about
|
||||
|
||||
Kata 2.0 metrics strongly depend on [Prometheus](https://prometheus.io/), a graduated project from CNCF.
|
||||
|
||||
Kata Containers 2.0 introduces a new Kata component called `kata-monitor` which is used to monitor the Kata components on the host. It's shipped with the Kata runtime to provide an interface to:
|
||||
Kata Containers 2.0 introduces a new Kata component called `kata-monitor` which is used to monitor the other Kata components on the host. It's the monitor interface with Kata runtime, and we can do something like these:
|
||||
|
||||
- Get metrics
|
||||
- Get events
|
||||
|
||||
At present, `kata-monitor` supports retrieval of metrics only: this is what will be covered in this document.
|
||||
In this document we will cover metrics only. And until now it only supports metrics function.
|
||||
|
||||
|
||||
This is the architecture overview of metrics in Kata Containers 2.0:
|
||||
This is the architecture overview metrics in Kata Containers 2.0.
|
||||
|
||||

|
||||
|
||||
@@ -46,38 +45,38 @@ For a quick evaluation, you can check out [this how to](../how-to/how-to-set-pro
|
||||
|
||||
### Kata monitor
|
||||
|
||||
The `kata-monitor` management agent should be started on each node where the Kata containers runtime is installed. `kata-monitor` will:
|
||||
`kata-monitor` is a management agent on one node, where many Kata containers are running. `kata-monitor`'s work include:
|
||||
|
||||
> **Note**: a *node* running Kata containers will be either a single host system or a worker node belonging to a K8s cluster capable of running Kata pods.
|
||||
> **Note**: node is a single host system or a node in K8s clusters.
|
||||
|
||||
- Aggregate sandbox metrics running on the node, adding the `sandbox_id` label to them.
|
||||
- Expose a new Prometheus target, allowing all node metrics coming from the Kata shim to be collected by Prometheus indirectly. This simplifies the targets count in Prometheus and avoids exposing shim's metrics by `ip:port`.
|
||||
- Aggregate sandbox metrics running on this node, and add `sandbox_id` label
|
||||
- As a Prometheus target, all metrics from Kata shim on this node will be collected by Prometheus indirectly. This can easy the targets count in Prometheus, and also need not to expose shim's metrics by `ip:port`
|
||||
|
||||
Only one `kata-monitor` process runs in each node.
|
||||
Only one `kata-monitor` process are running on one node.
|
||||
|
||||
`kata-monitor` uses a different communication channel than the one used by the container engine (`containerd`/`CRI-O`) to communicate with the Kata shim. The Kata shim exposes a dedicated socket address reserved to `kata-monitor`.
|
||||
`kata-monitor` is using a different communication channel other than that `conatinerd` communicating with Kata shim, and Kata shim listen on a new socket address for communicating with `kata-monitor`.
|
||||
|
||||
The shim's metrics socket file is created under the virtcontainers sandboxes directory, i.e. `vc/sbs/${PODID}/shim-monitor.sock`.
|
||||
The way `kata-monitor` get shim's metrics socket file(`monitor_address`) like that `containerd` get shim address. The socket is an abstract socket and saved as file `abstract` with the same directory of `address` for `containerd`.
|
||||
|
||||
> **Note**: If there is no Prometheus server configured, i.e., there are no scrape operations, `kata-monitor` will not collect any metrics.
|
||||
> **Note**: If there is no Prometheus server is configured, i.e., there is no scrape operations, `kata-monitor` will do nothing initiative.
|
||||
|
||||
### Kata runtime
|
||||
|
||||
Kata runtime is responsible for:
|
||||
Runtime is responsible for:
|
||||
|
||||
- Gather metrics about shim process
|
||||
- Gather metrics about hypervisor process
|
||||
- Gather metrics about running sandbox
|
||||
- Get metrics from Kata agent (through `ttrpc`)
|
||||
- Get metrics from Kata agent(through `ttrpc`)
|
||||
|
||||
### Kata agent
|
||||
|
||||
Kata agent is responsible for:
|
||||
Agent is responsible for:
|
||||
|
||||
- Gather agent process metrics
|
||||
- Gather guest OS metrics
|
||||
|
||||
In Kata 2.0, the agent adds a new interface:
|
||||
And in Kata 2.0, agent will add a new interface:
|
||||
|
||||
```protobuf
|
||||
rpc GetMetrics(GetMetricsRequest) returns (Metrics);
|
||||
@@ -94,49 +93,33 @@ The `metrics` field is Prometheus encoded content. This can avoid defining a fix
|
||||
|
||||
### Performance and overhead
|
||||
|
||||
Metrics should not become a bottleneck for the system or downgrade the performance: they should run with minimal overhead.
|
||||
Metrics should not become the bottleneck of system, downgrade the performance, and run with minimal overhead.
|
||||
|
||||
Requirements:
|
||||
|
||||
* Metrics **MUST** be quick to collect
|
||||
* Metrics **MUST** be small
|
||||
* Metrics **MUST** be small.
|
||||
* Metrics **MUST** be generated only if there are subscribers to the Kata metrics service
|
||||
* Metrics **MUST** be stateless
|
||||
|
||||
In Kata 2.0, metrics are collected only when needed (pull mode), mainly from the `/proc` filesystem, and consumed by Prometheus. This means that if the Prometheus collector is not running (so no one cares about the metrics) the overhead will be zero.
|
||||
In Kata 2.0, metrics are collected mainly from `/proc` filesystem, and consumed by Prometheus, based on a pull mode, that is mean if there is no Prometheus collector is running, so there will be zero overhead if nobody cares the metrics.
|
||||
|
||||
The metrics service also doesn't hold any metrics in memory.
|
||||
|
||||
#### Metrics size ####
|
||||
Metrics service also doesn't hold any metrics in memory.
|
||||
|
||||
|\*|No Sandbox | 1 Sandbox | 2 Sandboxes |
|
||||
|---|---|---|---|
|
||||
|Metrics count| 39 | 106 | 173 |
|
||||
|Metrics size (bytes)| 9K | 144K | 283K |
|
||||
|Metrics size (`gzipped`, bytes)| 2K | 10K | 17K |
|
||||
|Metrics size(bytes)| 9K | 144K | 283K |
|
||||
|Metrics size(`gzipped`, bytes)| 2K | 10K | 17K |
|
||||
|
||||
*Metrics size*: response size of one Prometheus scrape request.
|
||||
*Metrics size*: Response size of one Prometheus scrape request.
|
||||
|
||||
It's easy to estimate the size of one metrics fetch request issued by Prometheus.
|
||||
The formula to calculate the expected size when no gzip compression is in place is:
|
||||
9 + (144 - 9) * `number of kata sandboxes`
|
||||
|
||||
Prometheus supports `gzip compression`. When enabled, the response size of each request will be smaller:
|
||||
2 + (10 - 2) * `number of kata sandboxes`
|
||||
|
||||
**Example**
|
||||
We have 10 sandboxes running on a node. The expected size of one metrics fetch request issued by Prometheus against the kata-monitor agent running on that node will be:
|
||||
9 + (144 - 9) * 10 = **1.35M**
|
||||
|
||||
If `gzip compression` is enabled:
|
||||
2 + (10 - 2) * 10 = **82K**
|
||||
|
||||
#### Metrics delay ####
|
||||
It's easy to estimated that if there are 10 sandboxes running in the host, the size of one metrics fetch request issued by Prometheus will be about to 9 + (144 - 9) * 10 = 1.35M (not `gzipped`) or 2 + (10 - 2) * 10 = 82K (`gzipped`). Of course Prometheus support `gzip` compression, that can reduce the response size of every request.
|
||||
|
||||
And here is some test data:
|
||||
|
||||
- End-to-end (from Prometheus server to `kata-monitor` and `kata-monitor` write response back): **20ms**(avg)
|
||||
- Agent (RPC all from shim to agent): **3ms**(avg)
|
||||
- End-to-end (from Prometheus server to `kata-monitor` and `kata-monitor` write response back): 20ms(avg)
|
||||
- Agent(RPC all from shim to agent): 3ms(avg)
|
||||
|
||||
Test infrastructure:
|
||||
|
||||
@@ -145,13 +128,13 @@ Test infrastructure:
|
||||
|
||||
**Scrape interval**
|
||||
|
||||
Prometheus default `scrape_interval` is 1 minute, but it is usually set to 15 seconds. A smaller `scrape_interval` causes more overhead, so users should set it depending on their monitoring needs.
|
||||
Prometheus default `scrape_interval` is 1 minute, and usually it is set to 15s. Small `scrape_interval` will cause more overhead, so user should set it on monitor demand.
|
||||
|
||||
## Metrics list
|
||||
|
||||
Here are listed all the metrics supported by Kata 2.0. Some metrics are dependent on the VM guest kernel, so the available ones may differ based on the environment.
|
||||
Here listed is all supported metrics by Kata 2.0. Some metrics is dependent on guest kernels in the VM, so there may be some different by your environment.
|
||||
|
||||
Metrics are categorized by the component from/for which the metrics are collected.
|
||||
Metrics is categorized by component where metrics are collected from and for.
|
||||
|
||||
* [Metric types](#metric-types)
|
||||
* [Kata agent metrics](#kata-agent-metrics)
|
||||
@@ -162,15 +145,15 @@ Metrics are categorized by the component from/for which the metrics are collecte
|
||||
* [Kata containerd shim v2 metrics](#kata-containerd-shim-v2-metrics)
|
||||
|
||||
> **Note**:
|
||||
> * Labels here do not include the `instance` and `job` labels added by Prometheus.
|
||||
> * Labels here are not include `instance` and `job` labels that added by Prometheus.
|
||||
> * Notes about metrics unit
|
||||
> * `Kibibytes`, abbreviated `KiB`. 1 `KiB` equals 1024 B.
|
||||
> * For some metrics (like network devices statistics from file `/proc/net/dev`), unit depends on label( for example `recv_bytes` and `recv_packets` have different units).
|
||||
> * Most of these metrics are collected from the `/proc` filesystem, so the unit of each metric matches the unit of the relevant `/proc` entry. See the `proc(5)` manual page for further details.
|
||||
> * For some metrics (like network devices statistics from file `/proc/net/dev`), unit is depend on label( for example `recv_bytes` and `recv_packets` are having different units).
|
||||
> * Most of these metrics is collected from `/proc` filesystem, so the unit of metrics are keeping the same unit as `/proc`. See the `proc(5)` manual page for further details.
|
||||
|
||||
### Metric types
|
||||
|
||||
Prometheus offers four core metric types.
|
||||
Prometheus offer four core metric types.
|
||||
|
||||
- Counter: A counter is a cumulative metric that represents a single monotonically increasing counter whose value can only increase.
|
||||
|
||||
@@ -224,7 +207,7 @@ Metrics for Firecracker vmm.
|
||||
| `kata_firecracker_uart`: <br> Metrics specific to the UART device. | `GAUGE` | | <ul><li>`item`<ul><li>`error_count`</li><li>`flush_count`</li><li>`missed_read_count`</li><li>`missed_write_count`</li><li>`read_count`</li><li>`write_count`</li></ul></li><li>`sandbox_id`</li></ul> | 2.0.0 |
|
||||
| `kata_firecracker_vcpu`: <br> Metrics specific to VCPUs' mode of functioning. | `GAUGE` | | <ul><li>`item`<ul><li>`exit_io_in`</li><li>`exit_io_out`</li><li>`exit_mmio_read`</li><li>`exit_mmio_write`</li><li>`failures`</li><li>`filter_cpuid`</li></ul></li><li>`sandbox_id`</li></ul> | 2.0.0 |
|
||||
| `kata_firecracker_vmm`: <br> Metrics specific to the machine manager as a whole. | `GAUGE` | | <ul><li>`item`<ul><li>`device_events`</li><li>`panic_count`</li></ul></li><li>`sandbox_id`</li></ul> | 2.0.0 |
|
||||
| `kata_firecracker_vsock`: <br> VSOCK-related metrics. | `GAUGE` | | <ul><li>`item`<ul><li>`activate_fails`</li><li>`cfg_fails`</li><li>`conn_event_fails`</li><li>`conns_added`</li><li>`conns_killed`</li><li>`conns_removed`</li><li>`ev_queue_event_fails`</li><li>`killq_resync`</li><li>`muxer_event_fails`</li><li>`rx_bytes_count`</li><li>`rx_packets_count`</li><li>`rx_queue_event_count`</li><li>`rx_queue_event_fails`</li><li>`rx_read_fails`</li><li>`tx_bytes_count`</li><li>`tx_flush_fails`</li><li>`tx_packets_count`</li><li>`tx_queue_event_count`</li><li>`tx_queue_event_fails`</li><li>`tx_write_fails`</li></ul></li><li>`sandbox_id`</li></ul> | 2.0.0 |
|
||||
| `kata_firecracker_vsock`: <br> Vsock-related metrics. | `GAUGE` | | <ul><li>`item`<ul><li>`activate_fails`</li><li>`cfg_fails`</li><li>`conn_event_fails`</li><li>`conns_added`</li><li>`conns_killed`</li><li>`conns_removed`</li><li>`ev_queue_event_fails`</li><li>`killq_resync`</li><li>`muxer_event_fails`</li><li>`rx_bytes_count`</li><li>`rx_packets_count`</li><li>`rx_queue_event_count`</li><li>`rx_queue_event_fails`</li><li>`rx_read_fails`</li><li>`tx_bytes_count`</li><li>`tx_flush_fails`</li><li>`tx_packets_count`</li><li>`tx_queue_event_count`</li><li>`tx_queue_event_fails`</li><li>`tx_write_fails`</li></ul></li><li>`sandbox_id`</li></ul> | 2.0.0 |
|
||||
|
||||
### Kata guest OS metrics
|
||||
|
||||
@@ -305,7 +288,7 @@ Metrics about Kata containerd shim v2 process.
|
||||
|
||||
| Metric name | Type | Units | Labels | Introduced in Kata version |
|
||||
|---|---|---|---|---|
|
||||
| `kata_shim_agent_rpc_durations_histogram_milliseconds`: <br> RPC latency distributions. | `HISTOGRAM` | `milliseconds` | <ul><li>`action` (RPC actions of Kata agent)<ul><li>`grpc.CheckRequest`</li><li>`grpc.CloseStdinRequest`</li><li>`grpc.CopyFileRequest`</li><li>`grpc.CreateContainerRequest`</li><li>`grpc.CreateSandboxRequest`</li><li>`grpc.DestroySandboxRequest`</li><li>`grpc.ExecProcessRequest`</li><li>`grpc.GetMetricsRequest`</li><li>`grpc.GuestDetailsRequest`</li><li>`grpc.ListInterfacesRequest`</li><li>`grpc.ListProcessesRequest`</li><li>`grpc.ListRoutesRequest`</li><li>`grpc.MemHotplugByProbeRequest`</li><li>`grpc.OnlineCPUMemRequest`</li><li>`grpc.PauseContainerRequest`</li><li>`grpc.RemoveContainerRequest`</li><li>`grpc.ReseedRandomDevRequest`</li><li>`grpc.ResumeContainerRequest`</li><li>`grpc.SetGuestDateTimeRequest`</li><li>`grpc.SignalProcessRequest`</li><li>`grpc.StartContainerRequest`</li><li>`grpc.StatsContainerRequest`</li><li>`grpc.TtyWinResizeRequest`</li><li>`grpc.UpdateContainerRequest`</li><li>`grpc.UpdateInterfaceRequest`</li><li>`grpc.UpdateRoutesRequest`</li><li>`grpc.WaitProcessRequest`</li><li>`grpc.WriteStreamRequest`</li></ul></li><li>`sandbox_id`</li></ul> | 2.0.0 |
|
||||
| `kata_shim_agent_rpc_durations_histogram_milliseconds`: <br> RPC latency distributions. | `HISTOGRAM` | `milliseconds` | <ul><li>`action` (RPC actions of Kata agent)<ul><li>`grpc.CheckRequest`</li><li>`grpc.CloseStdinRequest`</li><li>`grpc.CopyFileRequest`</li><li>`grpc.CreateContainerRequest`</li><li>`grpc.CreateSandboxRequest`</li><li>`grpc.DestroySandboxRequest`</li><li>`grpc.ExecProcessRequest`</li><li>`grpc.GetMetricsRequest`</li><li>`grpc.GuestDetailsRequest`</li><li>`grpc.ListInterfacesRequest`</li><li>`grpc.ListProcessesRequest`</li><li>`grpc.ListRoutesRequest`</li><li>`grpc.MemHotplugByProbeRequest`</li><li>`grpc.OnlineCPUMemRequest`</li><li>`grpc.PauseContainerRequest`</li><li>`grpc.RemoveContainerRequest`</li><li>`grpc.ReseedRandomDevRequest`</li><li>`grpc.ResumeContainerRequest`</li><li>`grpc.SetGuestDateTimeRequest`</li><li>`grpc.SignalProcessRequest`</li><li>`grpc.StartContainerRequest`</li><li>`grpc.StartTracingRequest`</li><li>`grpc.StatsContainerRequest`</li><li>`grpc.StopTracingRequest`</li><li>`grpc.TtyWinResizeRequest`</li><li>`grpc.UpdateContainerRequest`</li><li>`grpc.UpdateInterfaceRequest`</li><li>`grpc.UpdateRoutesRequest`</li><li>`grpc.WaitProcessRequest`</li><li>`grpc.WriteStreamRequest`</li></ul></li><li>`sandbox_id`</li></ul> | 2.0.0 |
|
||||
| `kata_shim_fds`: <br> Kata containerd shim v2 open FDs. | `GAUGE` | | <ul><li>`sandbox_id`</li></ul> | 2.0.0 |
|
||||
| `kata_shim_go_gc_duration_seconds`: <br> A summary of the pause duration of garbage collection cycles. | `SUMMARY` | `seconds` | <ul><li>`sandbox_id`</li></ul> | 2.0.0 |
|
||||
| `kata_shim_go_goroutines`: <br> Number of goroutines that currently exist. | `GAUGE` | | <ul><li>`sandbox_id`</li></ul> | 2.0.0 |
|
||||
|
||||
@@ -30,7 +30,7 @@ The Kata Containers runtime **MUST** implement the following command line option
|
||||
The Kata Containers project **MUST** provide two interfaces for CRI shims to manage hardware
|
||||
virtualization based Kubernetes pods and containers:
|
||||
- An OCI and `runc` compatible command line interface, as described in the previous section.
|
||||
This interface is used by implementations such as [`CRI-O`](http://cri-o.io) and [`containerd`](https://github.com/containerd/containerd), for example.
|
||||
This interface is used by implementations such as [`CRI-O`](http://cri-o.io) and [`cri-containerd`](https://github.com/containerd/cri-containerd), for example.
|
||||
- A hardware virtualization runtime library API for CRI shims to consume and provide a more
|
||||
CRI native implementation. The [`frakti`](https://github.com/kubernetes/frakti) CRI shim is an example of such a consumer.
|
||||
|
||||
|
||||
@@ -209,5 +209,5 @@ network accessible to the collector.
|
||||
- The trace collection proposals are still being considered.
|
||||
|
||||
[kata-1x-tracing]: https://github.com/kata-containers/agent/blob/master/TRACING.md
|
||||
[trace-forwarder]: /src/tools/trace-forwarder
|
||||
[trace-forwarder]: /src/trace-forwarder
|
||||
[tracing-doc-pr]: https://github.com/kata-containers/kata-containers/pull/1937
|
||||
|
||||
@@ -41,7 +41,7 @@ Kata Containers with QEMU has complete compatibility with Kubernetes.
|
||||
Depending on the host architecture, Kata Containers supports various machine types,
|
||||
for example `pc` and `q35` on x86 systems, `virt` on ARM systems and `pseries` on IBM Power systems. The default Kata Containers
|
||||
machine type is `pc`. The machine type and its [`Machine accelerators`](#machine-accelerators) can
|
||||
be changed by editing the runtime [`configuration`](architecture/README.md#configuration) file.
|
||||
be changed by editing the runtime [`configuration`](./architecture.md/#configuration) file.
|
||||
|
||||
Devices and features used:
|
||||
- virtio VSOCK or virtio serial
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
- [Run Kata containers with `crictl`](run-kata-with-crictl.md)
|
||||
- [Run Kata Containers with Kubernetes](run-kata-with-k8s.md)
|
||||
- [How to use Kata Containers and Containerd](containerd-kata.md)
|
||||
- [How to use Kata Containers and CRI (containerd) with Kubernetes](how-to-use-k8s-with-cri-containerd-and-kata.md)
|
||||
- [How to use Kata Containers and CRI (containerd plugin) with Kubernetes](how-to-use-k8s-with-cri-containerd-and-kata.md)
|
||||
- [Kata Containers and service mesh for Kubernetes](service-mesh.md)
|
||||
- [How to import Kata Containers logs into Fluentd](how-to-import-kata-logs-with-fluentd.md)
|
||||
|
||||
@@ -17,9 +17,10 @@
|
||||
- `firecracker`
|
||||
- `ACRN`
|
||||
|
||||
While `qemu` , `cloud-hypervisor` and `firecracker` work out of the box with installation of Kata,
|
||||
some additional configuration is needed in case of `ACRN`.
|
||||
While `qemu` and `cloud-hypervisor` work out of the box with installation of Kata,
|
||||
some additional configuration is needed in case of `firecracker` and `ACRN`.
|
||||
Refer to the following guides for additional configuration steps:
|
||||
- [Kata Containers with Firecracker](https://github.com/kata-containers/documentation/wiki/Initial-release-of-Kata-Containers-with-Firecracker-support)
|
||||
- [Kata Containers with ACRN Hypervisor](how-to-use-kata-containers-with-acrn.md)
|
||||
|
||||
## Advanced Topics
|
||||
@@ -34,6 +35,3 @@
|
||||
- [How to set sandbox Kata Containers configurations with pod annotations](how-to-set-sandbox-config-kata.md)
|
||||
- [How to monitor Kata Containers in K8s](how-to-set-prometheus-in-k8s.md)
|
||||
- [How to use hotplug memory on arm64 in Kata Containers](how-to-hotplug-memory-arm64.md)
|
||||
- [How to setup swap devices in guest kernel](how-to-setup-swap-devices-in-guest-kernel.md)
|
||||
- [How to run rootless vmm](how-to-run-rootless-vmm.md)
|
||||
- [How to run Docker with Kata Containers](how-to-run-docker-with-kata.md)
|
||||
|
||||
@@ -39,7 +39,7 @@ use `RuntimeClass` instead of the deprecated annotations.
|
||||
|
||||
### Containerd Runtime V2 API: Shim V2 API
|
||||
|
||||
The [`containerd-shim-kata-v2` (short as `shimv2` in this documentation)](../../src/runtime/cmd/containerd-shim-kata-v2/)
|
||||
The [`containerd-shim-kata-v2` (short as `shimv2` in this documentation)](../../src/runtime/containerd-shim-v2)
|
||||
implements the [Containerd Runtime V2 (Shim API)](https://github.com/containerd/containerd/tree/master/runtime/v2) for Kata.
|
||||
With `shimv2`, Kubernetes can launch Pod and OCI-compatible containers with one shim per Pod. Prior to `shimv2`, `2N+1`
|
||||
shims (i.e. a `containerd-shim` and a `kata-shim` for each container and the Pod sandbox itself) and no standalone `kata-proxy`
|
||||
|
||||
@@ -1,141 +0,0 @@
|
||||
# How to run Docker in Docker with Kata Containers
|
||||
|
||||
This document describes the why and how behind running Docker in a Kata Container.
|
||||
|
||||
> **Note:** While in other environments this might be described as "Docker in Docker", the new architecture of Kata 2.x means [Docker can no longer be used to create containers using a Kata Containers runtime](https://github.com/kata-containers/kata-containers/issues/722).
|
||||
|
||||
## Requirements
|
||||
|
||||
- A working Kata Containers installation
|
||||
|
||||
## Install and configure Kata Containers
|
||||
|
||||
Follow the [Kata Containers installation guide](../install/README.md) to Install Kata Containers on your Kubernetes cluster.
|
||||
|
||||
## Background
|
||||
|
||||
Docker in Docker ("DinD") is the colloquial name for the ability to run `docker` from inside a container.
|
||||
|
||||
You can learn more about about Docker-in-Docker at the following links:
|
||||
|
||||
- [The original announcement of DinD](https://www.docker.com/blog/docker-can-now-run-within-docker/)
|
||||
- [`docker` image Docker Hub page](https://hub.docker.com/_/docker/) (this page lists the `-dind` releases)
|
||||
|
||||
While normally DinD refers to running `docker` from inside a Docker container,
|
||||
Kata Containers 2.x allows only supported runtimes (such as [`containerd`](../install/container-manager/containerd/containerd-install.md)).
|
||||
|
||||
Running `docker` in a Kata Container implies creating Docker containers from inside a container managed by `containerd` (or another supported container manager), as illustrated below:
|
||||
|
||||
```
|
||||
container manager -> Kata Containers shim -> Docker Daemon -> Docker container
|
||||
(containerd) (containerd-shim-kata-v2) (dockerd) (busybox sh)
|
||||
```
|
||||
|
||||
[OverlayFS][OverlayFS] is the preferred storage driver for most container runtimes on Linux ([including Docker](https://docs.docker.com/storage/storagedriver/select-storage-driver)).
|
||||
|
||||
> **Note:** While in the past Kata Containers did not contain the [`overlay` kernel module (aka OverlayFS)][OverlayFS], the kernel modules have been included since the [Kata Containers v2.0.0 release][v2.0.0].
|
||||
|
||||
[OverlayFS]: https://www.kernel.org/doc/html/latest/filesystems/overlayfs.html
|
||||
[v2.0.0]: https://github.com/kata-containers/kata-containers/releases/tag/2.0.0
|
||||
[kata-2.x-supported-runtimes]: https://github.com/kata-containers/kata-containers/blob/5737b36a3513f4da11a9dc7301b0c97ea22a51cf/docs/install/container-manager/containerd/containerd-install.md
|
||||
|
||||
## Why Docker in Kata Containers 2.x requires special measures
|
||||
|
||||
Running Docker containers Kata Containers requires care because `VOLUME`s specified in `Dockerfile`s run by Kata Containers are given the `kataShared` mount type by default, which applies to the root directory `/`:
|
||||
|
||||
```console
|
||||
/ # mount
|
||||
kataShared on / type virtiofs (rw,relatime,dax)
|
||||
```
|
||||
|
||||
`kataShared` mount types are powered by [`virtio-fs`][virtio-fs], a marked improvement over `virtio-9p`, thanks to [PR #1016](https://github.com/kata-containers/runtime/pull/1016). While `virtio-fs` is normally an excellent choice, in the case of DinD workloads `virtio-fs` causes an issue -- [it *cannot* be used as a "upper layer" of `overlayfs` without a custom patch](http://lists.katacontainers.io/pipermail/kata-dev/2020-January/001216.html).
|
||||
|
||||
As `/var/lib/docker` is a `VOLUME` specified by DinD (i.e. the `docker` images tagged `*-dind`/`*-dind-rootless`), `docker` fill fail to start (or even worse, silently pick a worse storage driver like `vfs`) when started in a Kata Container. Special measures must be taken when running DinD-powered workloads in Kata Containers.
|
||||
|
||||
## Workarounds/Solutions
|
||||
|
||||
Thanks to various community contributions (see [issue references below](#references)) the following options, with various trade-offs have been uncovered:
|
||||
|
||||
### Use a memory backed volume
|
||||
|
||||
For small workloads (small container images, without much generated filesystem load), a memory-backed volume is sufficient. Kubernetes supports a variant of [the `EmptyDir` volume][k8s-emptydir], which allows for memdisk-backed storage -- the [the `medium: Memory` ][k8s-memory-volume-type]. An example of a `Pod` using such a setup [was contributed](https://github.com/kata-containers/runtime/issues/1429#issuecomment-477385283), and is reproduced below:
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: dind
|
||||
spec:
|
||||
runtimeClassName: kata
|
||||
containers:
|
||||
- name: dind
|
||||
securityContext:
|
||||
privileged: true
|
||||
image: docker:20.10-dind
|
||||
args: ["--storage-driver=overlay2"]
|
||||
resources:
|
||||
limits:
|
||||
memory: "3G"
|
||||
volumeMounts:
|
||||
- mountPath: /var/run/
|
||||
name: dockersock
|
||||
- mountPath: /var/lib/docker
|
||||
name: docker
|
||||
volumes:
|
||||
- name: dockersock
|
||||
emptyDir: {}
|
||||
- name: docker
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
```
|
||||
|
||||
Inside the container you can view the mount:
|
||||
|
||||
```console
|
||||
/ # mount | grep lib\/docker
|
||||
tmpfs on /var/lib/docker type tmpfs (rw,relatime)
|
||||
```
|
||||
|
||||
As is mentioned in the comment encapsulating this code, using volatile memory for container storage backing is a risky and could be possibly wasteful on machines that do not have a lot of RAM.
|
||||
|
||||
### Use a loop mounted disk
|
||||
|
||||
Using a loop mounted disk that is provisioned shortly before starting of the container workload is another approach that yields good performance.
|
||||
|
||||
Contributors provided [an example in issue #1888](https://github.com/kata-containers/runtime/issues/1888#issuecomment-739057384), which is reproduced in part below:
|
||||
|
||||
```yaml
|
||||
spec:
|
||||
containers:
|
||||
- name: docker
|
||||
image: docker:20.10-dind
|
||||
command: ["sh", "-c"]
|
||||
args:
|
||||
- if [[ $(df -PT /var/lib/docker | awk 'NR==2 {print $2}') == virtiofs ]]; then
|
||||
apk add e2fsprogs &&
|
||||
truncate -s 20G /tmp/disk.img &&
|
||||
mkfs.ext4 /tmp/disk.img &&
|
||||
mount /tmp/disk.img /var/lib/docker; fi &&
|
||||
dockerd-entrypoint.sh;
|
||||
securityContext:
|
||||
privileged: true
|
||||
```
|
||||
|
||||
Note that loop mounted disks are often sparse, which means they *do not* take up the full amount of space that has been provisioned. This solution seems to produce the best performance and flexibility, at the expense of increased complexity and additional required setup.
|
||||
|
||||
### Build a custom kernel
|
||||
|
||||
It's possible to [modify the kernel](https://github.com/kata-containers/runtime/issues/1888#issuecomment-616872558) (in addition to applying the earlier mentioned mailing list patch) to support using `virtio-fs` as an upper. Note that if you modify your kernel and use `virtio-fs` you may require [additional changes](https://github.com/kata-containers/runtime/issues/1888#issuecomment-739057384) for decent performance and to address other issues.
|
||||
|
||||
> **NOTE:** A future kernel release may rectify the usability and performance issues of using `virtio-fs` as an OverlayFS upper layer.
|
||||
|
||||
## References
|
||||
|
||||
The solutions proposed in this document are an amalgamation of thoughtful contributions from the Kata Containers community.
|
||||
|
||||
Find links to issues & related discussion and the fruits therein below:
|
||||
|
||||
- [How to run Docker in Docker with Kata Containers (#2474)](https://github.com/kata-containers/kata-containers/issues/2474)
|
||||
- [Does Kata-container support AUFS/OverlayFS? (#2493)](https://github.com/kata-containers/runtime/issues/2493)
|
||||
- [Unable to start docker in docker with virtio-fs (#1888)](https://github.com/kata-containers/runtime/issues/1888)
|
||||
- [Not using native diff for overlay2 (#1429)](https://github.com/kata-containers/runtime/issues/1429)
|
||||
@@ -1,33 +0,0 @@
|
||||
## Introduction
|
||||
To improve security, Kata Container supports running the VMM process (currently only QEMU) as a non-`root` user.
|
||||
This document describes how to enable the rootless VMM mode and its limitations.
|
||||
|
||||
## Pre-requisites
|
||||
The permission and ownership of the `kvm` device node (`/dev/kvm`) need to be configured to:
|
||||
```
|
||||
$ crw-rw---- 1 root kvm
|
||||
```
|
||||
use the following commands:
|
||||
```
|
||||
$ sudo groupadd kvm -r
|
||||
$ sudo chown root:kvm /dev/kvm
|
||||
$ sudo chmod 660 /dev/kvm
|
||||
```
|
||||
|
||||
## Configure rootless VMM
|
||||
By default, the VMM process still runs as the root user. There are two ways to enable rootless VMM:
|
||||
1. Set the `rootless` flag to `true` in the hypervisor section of `configuration.toml`.
|
||||
2. Set the Kubernetes annotation `io.katacontainers.hypervisor.rootless` to `true`.
|
||||
|
||||
## Implementation details
|
||||
When `rootless` flag is enabled, upon a request to create a Pod, Kata Containers runtime creates a random user and group (e.g. `kata-123`), and uses them to start the hypervisor process.
|
||||
The `kvm` group is also given to the hypervisor process as a supplemental group to give the hypervisor process access to the `/dev/kvm` device.
|
||||
Another necessary change is to move the hypervisor runtime files (e.g. `vhost-fs.sock`, `qmp.sock`) to a directory (under `/run/user/[uid]/`) where only the non-root hypervisor has access to.
|
||||
|
||||
## Limitations
|
||||
|
||||
1. Only the VMM process is running as a non-root user. Other processes such as Kata Container shimv2 and `virtiofsd` still run as the root user.
|
||||
2. Currently, this feature is only supported in QEMU. Still need to bring it to Firecracker and Cloud Hypervisor (see https://github.com/kata-containers/kata-containers/issues/2567).
|
||||
3. Certain features will not work when rootless VMM is enabled, including:
|
||||
1. Passing devices to the guest (`virtio-blk`, `virtio-scsi`) will not work if the non-privileged user does not have permission to access it (leading to a permission denied error). A more permissive permission (e.g. 666) may overcome this issue. However, you need to be aware of the potential security implications of reducing the security on such devices.
|
||||
2. `vfio` device will also not work because of permission denied error.
|
||||
@@ -34,6 +34,8 @@ There are several kinds of Kata configurations and they are listed below.
|
||||
| `io.katacontainers.config.agent.enable_tracing` | `boolean` | enable tracing for the agent |
|
||||
| `io.katacontainers.config.agent.container_pipe_size` | uint32 | specify the size of the std(in/out) pipes created for containers |
|
||||
| `io.katacontainers.config.agent.kernel_modules` | string | the list of kernel modules and their parameters that will be loaded in the guest kernel. Semicolon separated list of kernel modules and their parameters. These modules will be loaded in the guest kernel using `modprobe`(8). E.g., `e1000e InterruptThrottleRate=3000,3000,3000 EEE=1; i915 enable_ppgtt=0` |
|
||||
| `io.katacontainers.config.agent.trace_mode` | string | the trace mode for the agent |
|
||||
| `io.katacontainers.config.agent.trace_type` | string | the trace type for the agent |
|
||||
|
||||
## Hypervisor Options
|
||||
| Key | Value Type | Comments |
|
||||
@@ -89,13 +91,6 @@ There are several kinds of Kata configurations and they are listed below.
|
||||
| `io.katacontainers.config.hypervisor.virtio_fs_cache` | string | the cache mode for virtio-fs, valid values are `always`, `auto` and `none` |
|
||||
| `io.katacontainers.config.hypervisor.virtio_fs_daemon` | string | virtio-fs `vhost-user` daemon path |
|
||||
| `io.katacontainers.config.hypervisor.virtio_fs_extra_args` | string | extra options passed to `virtiofs` daemon |
|
||||
| `io.katacontainers.config.hypervisor.enable_guest_swap` | `boolean` | enable swap in the guest |
|
||||
|
||||
## Container Options
|
||||
| Key | Value Type | Comments |
|
||||
|-------| ----- | ----- |
|
||||
| `io.katacontainers.container.resource.swappiness"` | `uint64` | specify the `Resources.Memory.Swappiness` |
|
||||
| `io.katacontainers.container.resource.swap_in_bytes"` | `uint64` | specify the `Resources.Memory.Swap` |
|
||||
|
||||
# CRI-O Configuration
|
||||
|
||||
@@ -105,12 +100,11 @@ In case of CRI-O, all annotations specified in the pod spec are passed down to K
|
||||
|
||||
For containerd, annotations specified in the pod spec are passed down to Kata
|
||||
starting with version `1.3.0` of containerd. Additionally, extra configuration is
|
||||
needed for containerd, by providing `pod_annotations` field and
|
||||
`container_annotations` field in the containerd config
|
||||
file. The `pod_annotations` field and `container_annotations` field are two lists of
|
||||
annotations that can be passed down to Kata as OCI annotations. They support golang match
|
||||
patterns. Since annotations supported by Kata follow the pattern `io.katacontainers.*`,
|
||||
the following configuration would work for passing annotations to Kata from containerd:
|
||||
needed for containerd, by providing a `pod_annotations` field in the containerd config
|
||||
file. The `pod_annotations` field is a list of annotations that can be passed down to
|
||||
Kata as OCI annotations. It supports golang match patterns. Since annotations supported
|
||||
by Kata follow the pattern `io.katacontainers.*`, the following configuration would work
|
||||
for passing annotations to Kata from containerd:
|
||||
|
||||
```
|
||||
$ cat /etc/containerd/config
|
||||
@@ -119,7 +113,6 @@ $ cat /etc/containerd/config
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.kata]
|
||||
runtime_type = "io.containerd.kata.v2"
|
||||
pod_annotations = ["io.katacontainers.*"]
|
||||
container_annotations = ["io.katacontainers.*"]
|
||||
....
|
||||
|
||||
```
|
||||
|
||||
@@ -1,59 +0,0 @@
|
||||
# Setup swap device in guest kernel
|
||||
|
||||
## Introduction
|
||||
|
||||
Setup swap device in guest kernel can help to increase memory capacity, handle some memory issues and increase file access speed sometimes.
|
||||
Kata Containers can insert a raw file to the guest as the swap device.
|
||||
|
||||
## Requisites
|
||||
|
||||
The swap config of the containers should be set by [annotations](how-to-set-sandbox-config-kata.md#container-options). So [extra configuration is needed for containerd](how-to-set-sandbox-config-kata.md#containerd-configuration).
|
||||
|
||||
Kata Containers just supports setup swap device in guest kernel with QEMU.
|
||||
Install and setup Kata Containers as shown [here](../install/README.md).
|
||||
|
||||
Enable setup swap device in guest kernel as follows:
|
||||
```
|
||||
$ sudo sed -i -e 's/^#enable_guest_swap.*$/enable_guest_swap = true/g' /etc/kata-containers/configuration.toml
|
||||
```
|
||||
|
||||
## Run a Kata Container utilizing swap device
|
||||
|
||||
Use following command to start a Kata Container with swappiness 60 and 1GB swap device (swap_in_bytes - memory_limit_in_bytes).
|
||||
```
|
||||
$ pod_yaml=pod.yaml
|
||||
$ container_yaml=container.yaml
|
||||
$ image="quay.io/prometheus/busybox:latest"
|
||||
$ cat << EOF > "${pod_yaml}"
|
||||
metadata:
|
||||
name: busybox-sandbox1
|
||||
EOF
|
||||
$ cat << EOF > "${container_yaml}"
|
||||
metadata:
|
||||
name: busybox-test-swap
|
||||
annotations:
|
||||
io.katacontainers.container.resource.swappiness: "60"
|
||||
io.katacontainers.container.resource.swap_in_bytes: "2147483648"
|
||||
linux:
|
||||
resources:
|
||||
memory_limit_in_bytes: 1073741824
|
||||
image:
|
||||
image: "$image"
|
||||
command:
|
||||
- top
|
||||
EOF
|
||||
$ sudo crictl pull $image
|
||||
$ podid=$(sudo crictl runp $pod_yaml)
|
||||
$ cid=$(sudo crictl create $podid $container_yaml $pod_yaml)
|
||||
$ sudo crictl start $cid
|
||||
```
|
||||
|
||||
Kata Container setups swap device for this container only when `io.katacontainers.container.resource.swappiness` is set.
|
||||
|
||||
The following table shows the swap size how to decide if `io.katacontainers.container.resource.swappiness` is set.
|
||||
|`io.katacontainers.container.resource.swap_in_bytes`|`memory_limit_in_bytes`|swap size|
|
||||
|---|---|---|
|
||||
|set|set| `io.katacontainers.container.resource.swap_in_bytes` - `memory_limit_in_bytes`|
|
||||
|not set|set| `memory_limit_in_bytes`|
|
||||
|not set|not set| `io.katacontainers.config.hypervisor.default_memory`|
|
||||
|set|not set|cgroup doesn't support this usage|
|
||||
@@ -3,7 +3,7 @@
|
||||
This document describes how to set up a single-machine Kubernetes (k8s) cluster.
|
||||
|
||||
The Kubernetes cluster will use the
|
||||
[CRI containerd](https://github.com/containerd/containerd/) and
|
||||
[CRI containerd plugin](https://github.com/containerd/cri) and
|
||||
[Kata Containers](https://katacontainers.io) to launch untrusted workloads.
|
||||
|
||||
## Requirements
|
||||
@@ -71,12 +71,12 @@ $ for service in ${services}; do
|
||||
service_dir="/etc/systemd/system/${service}.service.d/"
|
||||
sudo mkdir -p ${service_dir}
|
||||
|
||||
cat << EOF | sudo tee "${service_dir}/proxy.conf"
|
||||
cat << EOT | sudo tee "${service_dir}/proxy.conf"
|
||||
[Service]
|
||||
Environment="HTTP_PROXY=${http_proxy}"
|
||||
Environment="HTTPS_PROXY=${https_proxy}"
|
||||
Environment="NO_PROXY=${no_proxy}"
|
||||
EOF
|
||||
EOT
|
||||
done
|
||||
|
||||
$ sudo systemctl daemon-reload
|
||||
@@ -172,7 +172,7 @@ If a pod has the `runtimeClassName` set to `kata`, the CRI plugin runs the pod w
|
||||
- Create an pod configuration that using Kata Containers runtime
|
||||
|
||||
```bash
|
||||
$ cat << EOF | tee nginx-kata.yaml
|
||||
$ cat << EOT | tee nginx-kata.yaml
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
@@ -183,7 +183,7 @@ If a pod has the `runtimeClassName` set to `kata`, the CRI plugin runs the pod w
|
||||
- name: nginx
|
||||
image: nginx
|
||||
|
||||
EOF
|
||||
EOT
|
||||
```
|
||||
|
||||
- Create the pod
|
||||
|
||||
@@ -16,9 +16,9 @@ from the host, a potentially undesirable side-effect that decreases the security
|
||||
|
||||
The following sections document how to configure this behavior in different container runtimes.
|
||||
|
||||
#### Containerd
|
||||
#### Containerd and CRI
|
||||
|
||||
The Containerd allows configuring the privileged host devices behavior for each runtime in the containerd config. This is
|
||||
The Containerd CRI allows configuring the privileged host devices behavior for each runtime in the CRI config. This is
|
||||
done with the `privileged_without_host_devices` option. Setting this to `true` will disable hot plugging of the host
|
||||
devices into the guest, even when privileged is enabled.
|
||||
|
||||
@@ -41,7 +41,7 @@ See below example config:
|
||||
```
|
||||
|
||||
- [Kata Containers with Containerd and CRI documentation](how-to-use-k8s-with-cri-containerd-and-kata.md)
|
||||
- [Containerd CRI config documentation](https://github.com/containerd/containerd/blob/main/docs/cri/config.md)
|
||||
- [Containerd CRI config documentation](https://github.com/containerd/cri/blob/master/docs/config.md)
|
||||
|
||||
#### CRI-O
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ Kubernetes CRI (Container Runtime Interface) implementations allow using any
|
||||
OCI-compatible runtime with Kubernetes, such as the Kata Containers runtime.
|
||||
|
||||
Kata Containers support both the [CRI-O](https://github.com/kubernetes-incubator/cri-o) and
|
||||
[containerd](https://github.com/containerd/containerd) CRI implementations.
|
||||
[CRI-containerd](https://github.com/containerd/cri) CRI implementations.
|
||||
|
||||
After choosing one CRI implementation, you must make the appropriate configuration
|
||||
to ensure it integrates with Kata Containers.
|
||||
@@ -20,9 +20,9 @@ required to spawn pods and containers, and this is the preferred way to run Kata
|
||||
An equivalent shim implementation for CRI-O is planned.
|
||||
|
||||
### CRI-O
|
||||
For CRI-O installation instructions, refer to the [CRI-O Tutorial](https://github.com/cri-o/cri-o/blob/main/tutorial.md) page.
|
||||
For CRI-O installation instructions, refer to the [CRI-O Tutorial](https://github.com/kubernetes-incubator/cri-o/blob/master/tutorial.md) page.
|
||||
|
||||
The following sections show how to set up the CRI-O snippet configuration file (default path: `/etc/crio/crio.conf`) for Kata.
|
||||
The following sections show how to set up the CRI-O configuration file (default path: `/etc/crio/crio.conf`) for Kata.
|
||||
|
||||
Unless otherwise stated, all the following settings are specific to the `crio.runtime` table:
|
||||
```toml
|
||||
@@ -30,7 +30,7 @@ Unless otherwise stated, all the following settings are specific to the `crio.ru
|
||||
# runtime used and options for how to set up and manage the OCI runtime.
|
||||
[crio.runtime]
|
||||
```
|
||||
A comprehensive documentation of the configuration file can be found [here](https://github.com/cri-o/cri-o/blob/main/docs/crio.conf.5.md).
|
||||
A comprehensive documentation of the configuration file can be found [here](https://github.com/cri-o/cri-o/blob/master/docs/crio.conf.5.md).
|
||||
|
||||
> **Note**: After any change to this file, the CRI-O daemon have to be restarted with:
|
||||
>````
|
||||
@@ -40,20 +40,82 @@ A comprehensive documentation of the configuration file can be found [here](http
|
||||
#### Kubernetes Runtime Class (CRI-O v1.12+)
|
||||
The [Kubernetes Runtime Class](https://kubernetes.io/docs/concepts/containers/runtime-class/)
|
||||
is the preferred way of specifying the container runtime configuration to run a Pod's containers.
|
||||
To use this feature, Kata must added as a runtime handler. This can be done by
|
||||
dropping a `50-kata` snippet file into `/etc/crio/crio.conf.d`, with the
|
||||
content shown below:
|
||||
To use this feature, Kata must added as a runtime handler with:
|
||||
|
||||
```toml
|
||||
[crio.runtime.runtimes.kata]
|
||||
runtime_path = "/usr/bin/containerd-shim-kata-v2"
|
||||
runtime_type = "vm"
|
||||
runtime_root = "/run/vc"
|
||||
privileged_without_host_devices = true
|
||||
[crio.runtime.runtimes.kata-runtime]
|
||||
runtime_path = "/usr/bin/kata-runtime"
|
||||
runtime_type = "oci"
|
||||
```
|
||||
|
||||
You can also add multiple entries to specify alternatives hypervisors, e.g.:
|
||||
```toml
|
||||
[crio.runtime.runtimes.kata-qemu]
|
||||
runtime_path = "/usr/bin/kata-runtime"
|
||||
runtime_type = "oci"
|
||||
|
||||
[crio.runtime.runtimes.kata-fc]
|
||||
runtime_path = "/usr/bin/kata-runtime"
|
||||
runtime_type = "oci"
|
||||
```
|
||||
|
||||
#### Untrusted annotation (until CRI-O v1.12)
|
||||
The untrusted annotation is used to specify a runtime for __untrusted__ workloads, i.e.
|
||||
a runtime to be used when the workload cannot be trusted and a higher level of security
|
||||
is required. An additional flag can be used to let CRI-O know if a workload
|
||||
should be considered _trusted_ or _untrusted_ by default.
|
||||
For further details, see the documentation
|
||||
[here](../design/architecture.md#mixing-vm-based-and-namespace-based-runtimes).
|
||||
|
||||
```toml
|
||||
# runtime is the OCI compatible runtime used for trusted container workloads.
|
||||
# This is a mandatory setting as this runtime will be the default one
|
||||
# and will also be used for untrusted container workloads if
|
||||
# runtime_untrusted_workload is not set.
|
||||
runtime = "/usr/bin/runc"
|
||||
|
||||
# runtime_untrusted_workload is the OCI compatible runtime used for untrusted
|
||||
# container workloads. This is an optional setting, except if
|
||||
# default_container_trust is set to "untrusted".
|
||||
runtime_untrusted_workload = "/usr/bin/kata-runtime"
|
||||
|
||||
# default_workload_trust is the default level of trust crio puts in container
|
||||
# workloads. It can either be "trusted" or "untrusted", and the default
|
||||
# is "trusted".
|
||||
# Containers can be run through different container runtimes, depending on
|
||||
# the trust hints we receive from kubelet:
|
||||
# - If kubelet tags a container workload as untrusted, crio will try first to
|
||||
# run it through the untrusted container workload runtime. If it is not set,
|
||||
# crio will use the trusted runtime.
|
||||
# - If kubelet does not provide any information about the container workload trust
|
||||
# level, the selected runtime will depend on the default_container_trust setting.
|
||||
# If it is set to "untrusted", then all containers except for the host privileged
|
||||
# ones, will be run by the runtime_untrusted_workload runtime. Host privileged
|
||||
# containers are by definition trusted and will always use the trusted container
|
||||
# runtime. If default_container_trust is set to "trusted", crio will use the trusted
|
||||
# container runtime for all containers.
|
||||
default_workload_trust = "untrusted"
|
||||
```
|
||||
|
||||
#### Network namespace management
|
||||
To enable networking for the workloads run by Kata, CRI-O needs to be configured to
|
||||
manage network namespaces, by setting the following key to `true`.
|
||||
|
||||
In CRI-O v1.16:
|
||||
```toml
|
||||
manage_network_ns_lifecycle = true
|
||||
```
|
||||
In CRI-O v1.17+:
|
||||
```toml
|
||||
manage_ns_lifecycle = true
|
||||
```
|
||||
|
||||
|
||||
### containerd
|
||||
### containerd with CRI plugin
|
||||
|
||||
If you select containerd with `cri` plugin, follow the "Getting Started for Developers"
|
||||
instructions [here](https://github.com/containerd/cri#getting-started-for-developers)
|
||||
to properly install it.
|
||||
|
||||
To customize containerd to select Kata Containers runtime, follow our
|
||||
"Configure containerd to use Kata Containers" internal documentation
|
||||
@@ -98,7 +160,7 @@ $ sudo systemctl restart kubelet
|
||||
# If using CRI-O
|
||||
$ sudo kubeadm init --ignore-preflight-errors=all --cri-socket /var/run/crio/crio.sock --pod-network-cidr=10.244.0.0/16
|
||||
|
||||
# If using containerd
|
||||
# If using CRI-containerd
|
||||
$ sudo kubeadm init --ignore-preflight-errors=all --cri-socket /run/containerd/containerd.sock --pod-network-cidr=10.244.0.0/16
|
||||
|
||||
$ export KUBECONFIG=/etc/kubernetes/admin.conf
|
||||
|
||||
@@ -34,7 +34,7 @@ as the proxy starts.
|
||||
|
||||
Follow the [instructions](../install/README.md)
|
||||
to get Kata Containers properly installed and configured with Kubernetes.
|
||||
You can choose between CRI-O and containerd, both are supported
|
||||
You can choose between CRI-O and CRI-containerd, both are supported
|
||||
through this document.
|
||||
|
||||
For both cases, select the workloads as _trusted_ by default. This way,
|
||||
@@ -159,7 +159,7 @@ containers with `privileged: true` to `privileged: false`.
|
||||
There is no difference between Istio and Linkerd in this section. It is
|
||||
about which CRI implementation you use.
|
||||
|
||||
For both CRI-O and containerd, you have to add an annotation indicating
|
||||
For both CRI-O and CRI-containerd, you have to add an annotation indicating
|
||||
the workload for this deployment is not _trusted_, which will trigger
|
||||
`kata-runtime` to be called instead of `runc`.
|
||||
|
||||
@@ -193,9 +193,9 @@ spec:
|
||||
...
|
||||
```
|
||||
|
||||
__containerd:__
|
||||
__CRI-containerd:__
|
||||
|
||||
Add the following annotation for containerd
|
||||
Add the following annotation for CRI-containerd
|
||||
```yaml
|
||||
io.kubernetes.cri.untrusted-workload: "true"
|
||||
```
|
||||
|
||||
@@ -12,26 +12,16 @@ Containers.
|
||||
|
||||
Packaged installation methods uses your distribution's native package format (such as RPM or DEB).
|
||||
|
||||
> **Note:** We encourage installation methods that provides automatic updates, it ensures security updates and bug fixes are
|
||||
> easily applied.
|
||||
*Note:* We encourage installation methods that provides automatic updates, it ensures security updates and bug fixes are
|
||||
easily applied.
|
||||
|
||||
| Installation method | Description | Automatic updates | Use case |
|
||||
|------------------------------------------------------|----------------------------------------------------------------------------------------------|-------------------|-----------------------------------------------------------------------------------------------|
|
||||
| [Using kata-deploy](#kata-deploy-installation) | The preferred way to deploy the Kata Containers distributed binaries on a Kubernetes cluster | **No!** | Best way to give it a try on kata-containers on an already up and running Kubernetes cluster. |
|
||||
| [Using official distro packages](#official-packages) | Kata packages provided by Linux distributions official repositories | yes | Recommended for most users. |
|
||||
| [Using snap](#snap-installation) | Easy to install | yes | Good alternative to official distro packages. |
|
||||
| [Automatic](#automatic-installation) | Run a single command to install a full system | **No!** | For those wanting the latest release quickly. |
|
||||
| [Manual](#manual-installation) | Follow a guide step-by-step to install a working system | **No!** | For those who want the latest release with more control. |
|
||||
| [Build from source](#build-from-source-installation) | Build the software components manually | **No!** | Power users and developers only. |
|
||||
|
||||
### Kata Deploy Installation
|
||||
|
||||
Kata Deploy provides a Dockerfile, which contains all of the binaries and
|
||||
artifacts required to run Kata Containers, as well as reference DaemonSets,
|
||||
which can be utilized to install Kata Containers on a running Kubernetes
|
||||
cluster.
|
||||
|
||||
[Use Kata Deploy](/tools/packaging/kata-deploy/README.md) to install Kata Containers on a Kubernetes Cluster.
|
||||
| Installation method | Description | Automatic updates | Use case |
|
||||
|------------------------------------------------------|---------------------------------------------------------------------|-------------------|----------------------------------------------------------|
|
||||
| [Using official distro packages](#official-packages) | Kata packages provided by Linux distributions official repositories | yes | Recommended for most users. |
|
||||
| [Using snap](#snap-installation) | Easy to install | yes | Good alternative to official distro packages. |
|
||||
| [Automatic](#automatic-installation) | Run a single command to install a full system | **No!** | For those wanting the latest release quickly. |
|
||||
| [Manual](#manual-installation) | Follow a guide step-by-step to install a working system | **No!** | For those who want the latest release with more control. |
|
||||
| [Build from source](#build-from-source-installation) | Build the software components manually | **No!** | Power users and developers only. |
|
||||
|
||||
### Official packages
|
||||
|
||||
@@ -58,9 +48,9 @@ Follow the [containerd installation guide](container-manager/containerd/containe
|
||||
|
||||
## Build from source installation
|
||||
|
||||
> **Note:** Power users who decide to build from sources should be aware of the
|
||||
> implications of using an unpackaged system which will not be automatically
|
||||
> updated as new [releases](../Stable-Branch-Strategy.md) are made available.
|
||||
*Note:* Power users who decide to build from sources should be aware of the
|
||||
implications of using an unpackaged system which will not be automatically
|
||||
updated as new [releases](../Stable-Branch-Strategy.md) are made available.
|
||||
|
||||
[Building from sources](../Developer-Guide.md#initial-setup) allows power users
|
||||
who are comfortable building software from source to use the latest component
|
||||
|
||||
File diff suppressed because one or more lines are too long
|
Before Width: | Height: | Size: 150 KiB |
@@ -1,137 +0,0 @@
|
||||
# Kata Containers threat model
|
||||
|
||||
This document discusses threat models associated with the Kata Containers project.
|
||||
Kata was designed to provide additional isolation of container workloads, protecting
|
||||
the host infrastructure from potentially malicious container users or workloads. Since
|
||||
Kata Containers adds a level of isolation on top of traditional containers, the focus
|
||||
is on the additional layer provided, not on traditional container security.
|
||||
|
||||
This document provides a brief background on containers and layered security, describes
|
||||
the interface to Kata from CRI runtimes, a review of utilized virtual machine interfaces, and then
|
||||
a review of threats.
|
||||
|
||||
## Kata security objective
|
||||
|
||||
Kata seeks to prevent an untrusted container workload or user of that container workload to gain
|
||||
control of, obtain information from, or tamper with the host infrastructure.
|
||||
|
||||
In our scenario, an asset is anything on the host system, or elsewhere in the cluster
|
||||
infrastructure. The attacker is assumed to be either a malicious user or the workload itself
|
||||
running within the container. The goal of Kata is to prevent attacks which would allow
|
||||
any access to the defined assets.
|
||||
|
||||
## Background on containers, layered security
|
||||
|
||||
Traditional containers leverage several key Linux kernel features to provide isolation and
|
||||
a view that the container workload is the only entity running on the host. Key features include
|
||||
`Namespaces`, `cgroups`, `capablities`, `SELinux` and `seccomp`. The canonical runtime for creating such
|
||||
a container is `runc`. In the remainder of the document, the term `traditional-container` will be used
|
||||
to describe a container workload created by runc.
|
||||
|
||||
Kata Containers provides a second layer of isolation on top of those provided by traditional-containers.
|
||||
The hardware virtualization interface is the basis of this additional layer. Kata launches a lightweight
|
||||
virtual machine, and uses the guest’s Linux kernel to create a container workload, or workloads in the case
|
||||
of multi-container pods. In Kubernetes and in the Kata implementation, the sandbox is carried out at the
|
||||
pod level. In Kata, this sandbox is created using a virtual machine.
|
||||
|
||||
## Interface to Kata Containers: CRI, v2-shim, OCI
|
||||
|
||||
A typical Kata Containers deployment uses Kubernetes with a CRI implementation.
|
||||
On every node, Kubelet will interact with a CRI implementor, which will in turn interface with
|
||||
an OCI based runtime, such as Kata Containers. Typical CRI implementors are `cri-o` and `containerd`.
|
||||
|
||||
The CRI API, as defined at the Kubernetes [CRI-API repo](https://github.com/kubernetes/cri-api/),
|
||||
results in a few constructs being supported by the CRI implementation, and ultimately in the OCI
|
||||
runtime creating the workloads.
|
||||
|
||||
In order to run a container inside of the Kata sandbox, several virtual machine devices and interfaces
|
||||
are required. Kata translates sandbox and container definitions to underlying virtualization technologies provided
|
||||
by a set of virtual machine monitors (VMMs) and hypervisors. These devices and their underlying
|
||||
implementations are discussed in detail in the following section.
|
||||
|
||||
## Interface to the Kata sandbox/virtual machine
|
||||
|
||||
In case of Kata, today the devices which we need in the guest are:
|
||||
- Storage: In the current design of Kata Containers, we are reliant on the CRI implementor to
|
||||
assist in image handling and volume management on the host. As a result, we need to support a way of passing to the sandbox the container rootfs, volumes requested
|
||||
by the workload, and any other volumes created to facilitate sharing of secrets and `configmaps` with the containers. Depending on how these are managed, a block based device or file-system
|
||||
sharing is required. Kata Containers does this by way of `virtio-blk` and/or `virtio-fs`.
|
||||
- Networking: A method for enabling network connectivity with the workload is required. Typically this will be done providing a `TAP` device
|
||||
to the VMM, and this will be exposed to the guest as a `virtio-net` device. It is feasible to pass in a NIC device directly, in which case `VFIO` is leveraged
|
||||
and the device itself will be exposed to the guest.
|
||||
- Control: In order to interact with the guest agent and retrieve `STDIO` from containers, a medium of communication is required.
|
||||
This is available via `virtio-vsock`.
|
||||
- Devices: `VFIO` is utilized when devices are passed directly to the virtual machine and exposed to the container.
|
||||
- Dynamic Resource Management: `ACPI` is utilized to allow for dynamic VM resource management (for example: CPU, memory, device hotplug). This is required when containers are resized,
|
||||
or more generally when containers are added to a pod.
|
||||
|
||||
How these devices are utilized varies depending on the VMM utilized. We clarify the default settings provided when integrating Kata
|
||||
with the QEMU, Firecracker and Cloud Hypervisor VMMs in the following sections.
|
||||
|
||||
### Devices
|
||||
|
||||
Each virtio device is implemented by a backend, which may execute within userspace on the host (vhost-user), the VMM itself, or within the host kernel (vhost). While it may provide enhanced performance,
|
||||
vhost devices are often seen as higher risk since an exploit would be already running within the kernel space. While VMM and vhost-user are both in userspace on the host, `vhost-user` generally allows for the back-end process to require less system calls and capabilities compared to a full VMM.
|
||||
|
||||
#### `virtio-blk` and `virtio-scsi`
|
||||
|
||||
The backend for `virtio-blk` and `virtio-scsi` are based in the VMM itself (ring3 in the context of x86) by default for Cloud Hypervisor, Firecracker and QEMU.
|
||||
While `vhost` based back-ends are available for QEMU, it is not recommended. `vhost-user` back-ends are being added for Cloud Hypervisor, they are not utilized in Kata today.
|
||||
|
||||
#### `virtio-fs`
|
||||
|
||||
`virtio-fs` is supported in Cloud Hypervisor and QEMU. `virtio-fs`'s interaction with the host filesystem is done through a vhost-user daemon, `virtiofsd`.
|
||||
The `virtio-fs` client, running in the guest, will generate requests to access files. `virtiofsd` will receive requests, open the file, and request the VMM
|
||||
to `mmap` it into the guest. When DAX is utilized, the guest will access the host's page cache, avoiding the need for copy and duplication. DAX is still an experimental feature,
|
||||
and is not enabled by default.
|
||||
|
||||
From the `virtiofsd` [documentation](https://qemu-project.gitlab.io/qemu/tools/virtiofsd.html):
|
||||
```This program must be run as the root user. Upon startup the program will switch into a new file system namespace with the shared directory tree as its root. This prevents “file system escapes” due to symlinks and other file system objects that might lead to files outside the shared directory. The program also sandboxes itself using seccomp(2) to prevent ptrace(2) and other vectors that could allow an attacker to compromise the system after gaining control of the virtiofsd process.```
|
||||
|
||||
DAX-less support for `virtio-fs` is available as of the 5.4 Linux kernel. QEMU VMM supports virtio-fs as of v4.2. Cloud Hypervisor
|
||||
supports `virtio-fs`.
|
||||
|
||||
#### `virtio-net`
|
||||
|
||||
`virtio-net` has many options, depending on the VMM and Kata configurations.
|
||||
|
||||
##### QEMU networking
|
||||
|
||||
While QEMU has options for `vhost`, `virtio-net` and `vhost-user`, the `virtio-net` backend
|
||||
for Kata defaults to `vhost-net` for performance reasons. The default configuration is being
|
||||
reevaluated.
|
||||
|
||||
##### Firecracker networking
|
||||
|
||||
For Firecracker, the `virtio-net` backend is within Firecracker's VMM.
|
||||
|
||||
##### Cloud Hypervisor networking
|
||||
|
||||
For Cloud Hypervisor, the current backend default is within the VMM. `vhost-user-net` support
|
||||
is being added (written in rust, Cloud Hypervisor specific).
|
||||
|
||||
#### virtio-vsock
|
||||
|
||||
##### QEMU vsock
|
||||
|
||||
In QEMU, vsock is backed by `vhost_vsock`, which runs within the kernel itself.
|
||||
|
||||
##### Firecracker and Cloud Hypervisor
|
||||
|
||||
In Firecracker and Cloud Hypervisor, vsock is backed by a unix-domain-socket in the hosts userspace.
|
||||
|
||||
#### VFIO
|
||||
|
||||
Utilizing VFIO, devices can be passed through to the virtual machine. We will assess this separately. Exposure to
|
||||
host is limited to gaps in device pass-through handling. This is supported in QEMU and Cloud Hypervisor, but not
|
||||
Firecracker.
|
||||
|
||||
#### ACPI
|
||||
|
||||
ACPI is necessary for hotplug of CPU, memory and devices. ACPI is available in QEMU and Cloud Hypervisor. Device, CPU and memory hotplug
|
||||
are not available in Firecracker.
|
||||
|
||||
## Devices and threat model
|
||||
|
||||

|
||||
|
||||
213
docs/tracing.md
213
docs/tracing.md
@@ -1,213 +0,0 @@
|
||||
# Overview
|
||||
|
||||
This document explains how to trace Kata Containers components.
|
||||
|
||||
# Introduction
|
||||
|
||||
The Kata Containers runtime and agent are able to generate
|
||||
[OpenTelemetry][opentelemetry] trace spans, which allow the administrator to
|
||||
observe what those components are doing and how much time they are spending on
|
||||
each operation.
|
||||
|
||||
# OpenTelemetry summary
|
||||
|
||||
An OpenTelemetry-enabled application creates a number of trace "spans". A span
|
||||
contains the following attributes:
|
||||
|
||||
- A name
|
||||
- A pair of timestamps (recording the start time and end time of some operation)
|
||||
- A reference to the span's parent span
|
||||
|
||||
All spans need to be *finished*, or *completed*, to allow the OpenTelemetry
|
||||
framework to generate the final trace information (by effectively closing the
|
||||
transaction encompassing the initial (root) span and all its children).
|
||||
|
||||
For Kata, the root span represents the total amount of time taken to run a
|
||||
particular component from startup to its shutdown (the "run time").
|
||||
|
||||
# Architecture
|
||||
|
||||
## Runtime tracing architecture
|
||||
|
||||
The runtime, which runs in the host environment, has been modified to
|
||||
optionally generate trace spans which are sent to a trace collector on the
|
||||
host.
|
||||
|
||||
## Agent tracing architecture
|
||||
|
||||
An OpenTelemetry system (such as [Jaeger][jaeger-tracing]) uses a collector to
|
||||
gather up trace spans from the application for viewing and processing. For an
|
||||
application to use the collector, it must run in the same context as
|
||||
the collector.
|
||||
|
||||
This poses a problem for tracing the Kata Containers agent since it does not
|
||||
run in the same context as the collector: it runs inside a virtual machine (VM).
|
||||
|
||||
To allow spans from the agent to be sent to the trace collector, Kata provides
|
||||
a [trace forwarder][trace-forwarder] component. This runs in the same context
|
||||
as the collector (generally on the host system) and listens on a
|
||||
[`VSOCK`][vsock] channel for traces generated by the agent, forwarding them on
|
||||
to the trace collector.
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
> This design supports agent tracing without having to make changes to the
|
||||
> image, but also means that [custom images][osbuilder] can also benefit from
|
||||
> agent tracing.
|
||||
|
||||
The following diagram summarises the architecture used to trace the Kata
|
||||
Containers agent:
|
||||
|
||||
```
|
||||
+--------------------------------------------+
|
||||
| Host |
|
||||
| |
|
||||
| +---------------+ |
|
||||
| | OpenTelemetry | |
|
||||
| | Trace | |
|
||||
| | Collector | |
|
||||
| +---------------+ |
|
||||
| ^ +---------------+ |
|
||||
| | spans | Kata VM | |
|
||||
| +-----+-----+ | | |
|
||||
| | Kata | spans o +-------+ | |
|
||||
| | Trace |<-----------------| Kata | | |
|
||||
| | Forwarder | VSOCK o | Agent | | |
|
||||
| +-----------+ Channel | +-------+ | |
|
||||
| +---------------+ |
|
||||
+--------------------------------------------+
|
||||
```
|
||||
|
||||
# Agent tracing prerequisites
|
||||
|
||||
- You must have a trace collector running.
|
||||
|
||||
Although the collector normally runs on the host, it can also be run from
|
||||
inside a Docker image configured to expose the appropriate host ports to the
|
||||
collector.
|
||||
|
||||
The [Jaeger "all-in-one" Docker image][jaeger-all-in-one] method
|
||||
is the quickest and simplest way to run the collector for testing.
|
||||
|
||||
- If you wish to trace the agent, you must start the
|
||||
[trace forwarder][trace-forwarder].
|
||||
|
||||
> **Notes:**
|
||||
>
|
||||
> - If agent tracing is enabled but the forwarder is not running,
|
||||
> the agent will log an error (signalling that it cannot generate trace
|
||||
> spans), but continue to work as normal.
|
||||
>
|
||||
> - The trace forwarder requires a trace collector (such as Jaeger) to be
|
||||
> running before it is started. If a collector is not running, the trace
|
||||
> forwarder will exit with an error.
|
||||
|
||||
# Enable tracing
|
||||
|
||||
By default, tracing is disabled for all components. To enable _any_ form of
|
||||
tracing an `enable_tracing` option must be enabled for at least one component.
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
> Enabling this option will only allow tracing for subsequently
|
||||
> started containers.
|
||||
|
||||
## Enable runtime tracing
|
||||
|
||||
To enable runtime tracing, set the tracing option as shown:
|
||||
|
||||
```toml
|
||||
[runtime]
|
||||
enable_tracing = true
|
||||
```
|
||||
|
||||
## Enable agent tracing
|
||||
|
||||
To enable agent tracing, set the tracing option as shown:
|
||||
|
||||
```toml
|
||||
[agent.kata]
|
||||
enable_tracing = true
|
||||
```
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
> If both agent tracing and runtime tracing are enabled, the resulting trace
|
||||
> spans will be "collated": expanding individual runtime spans in the Jaeger
|
||||
> web UI will show the agent trace spans resulting from the runtime
|
||||
> operation.
|
||||
|
||||
# Appendices
|
||||
|
||||
## Agent tracing requirements
|
||||
|
||||
### Host environment
|
||||
|
||||
- The host kernel must support the VSOCK socket type.
|
||||
|
||||
This will be available if the kernel is built with the
|
||||
`CONFIG_VHOST_VSOCK` configuration option.
|
||||
|
||||
- The VSOCK kernel module must be loaded:
|
||||
|
||||
```
|
||||
$ sudo modprobe vhost_vsock
|
||||
```
|
||||
|
||||
### Guest environment
|
||||
|
||||
- The guest kernel must support the VSOCK socket type:
|
||||
|
||||
This will be available if the kernel is built with the
|
||||
`CONFIG_VIRTIO_VSOCKETS` configuration option.
|
||||
|
||||
> **Note:** The default Kata Containers guest kernel provides this feature.
|
||||
|
||||
## Agent tracing limitations
|
||||
|
||||
- Agent tracing is only "completed" when the workload and the Kata agent
|
||||
process have exited.
|
||||
|
||||
Although trace information *can* be inspected before the workload and agent
|
||||
have exited, it is incomplete. This is shown as `<trace-without-root-span>`
|
||||
in the Jaeger web UI.
|
||||
|
||||
If the workload is still running, the trace transaction -- which spans the entire
|
||||
runtime of the Kata agent -- will not have been completed. To view the complete
|
||||
trace details, wait for the workload to end, or stop the container.
|
||||
|
||||
## Performance impact
|
||||
|
||||
[OpenTelemetry][opentelemetry] is designed for high performance. It combines
|
||||
the best of two previous generation projects (OpenTracing and OpenCensus) and
|
||||
uses a very efficient mechanism to capture trace spans. Further, the trace
|
||||
points inserted into the agent are generated dynamically at compile time. This
|
||||
is advantageous since new versions of the agent will automatically benefit
|
||||
from improvements in the tracing infrastructure. Overall, the impact of
|
||||
enabling runtime and agent tracing should be extremely low.
|
||||
|
||||
## Agent shutdown behaviour
|
||||
|
||||
In normal operation, the Kata runtime manages the VM shutdown and performs
|
||||
certain optimisations to speed up this process. However, if agent tracing is
|
||||
enabled, the agent itself is responsible for shutting down the VM. This it to
|
||||
ensure all agent trace transactions are completed. This means there will be a
|
||||
small performance impact for container shutdown when agent tracing is enabled
|
||||
as the runtime must wait for the VM to shutdown fully.
|
||||
|
||||
## Set up a tracing development environment
|
||||
|
||||
If you want to debug, further develop, or test tracing,
|
||||
[enabling full debug][enable-full-debug]
|
||||
is highly recommended. For working with the agent, you may also wish to
|
||||
[enable a debug console][setup-debug-console]
|
||||
to allow you to access the VM environment.
|
||||
|
||||
[enable-full-debug]: https://github.com/kata-containers/kata-containers/blob/main/docs/Developer-Guide.md#enable-full-debug
|
||||
[jaeger-all-in-one]: https://www.jaegertracing.io/docs/getting-started/
|
||||
[jaeger-tracing]: https://www.jaegertracing.io
|
||||
[opentelemetry]: https://opentelemetry.io
|
||||
[osbuilder]: https://github.com/kata-containers/kata-containers/blob/main/tools/osbuilder
|
||||
[setup-debug-console]: https://github.com/kata-containers/kata-containers/blob/main/docs/Developer-Guide.md#set-up-a-debug-console
|
||||
[trace-forwarder]: /src/tools/trace-forwarder
|
||||
[vsock]: https://wiki.qemu.org/Features/VirtioVsock
|
||||
@@ -67,7 +67,7 @@ To use large BARs devices (for example, Nvidia Tesla P100), you need Kata versio
|
||||
|
||||
The following configuration in the Kata `configuration.toml` file as shown below can work:
|
||||
|
||||
Hotplug for PCI devices by `acpi_pcihp` (Linux's ACPI PCI Hotplug driver):
|
||||
Hotplug for PCI devices by `shpchp` (Linux's SHPC PCI Hotplug driver):
|
||||
```
|
||||
machine_type = "q35"
|
||||
|
||||
@@ -91,6 +91,7 @@ The following kernel config options need to be enabled:
|
||||
```
|
||||
# Support PCI/PCIe device hotplug (Required for large BARs device)
|
||||
CONFIG_HOTPLUG_PCI_PCIE=y
|
||||
CONFIG_HOTPLUG_PCI_SHPC=y
|
||||
|
||||
# Support for loading modules (Required for load Nvidia drivers)
|
||||
CONFIG_MODULES=y
|
||||
|
||||
@@ -235,7 +235,7 @@ then [Kata-deploy](https://github.com/kata-containers/kata-containers/tree/main/
|
||||
is use to install Kata. This will make sure that the correct `agent` version
|
||||
is installed into the rootfs in the steps below.
|
||||
|
||||
The following instructions use Ubuntu as the root filesystem with systemd as
|
||||
The following instructions use Debian as the root filesystem with systemd as
|
||||
the init and will add in the `kmod` binary, which is not a standard binary in
|
||||
a Kata rootfs image. The `kmod` binary is necessary to load the Intel® QAT
|
||||
kernel modules when the virtual machine rootfs boots.
|
||||
@@ -257,7 +257,7 @@ $ cd $GOPATH
|
||||
$ export AGENT_VERSION=$(kata-runtime version | head -n 1 | grep -o "[0-9.]\+")
|
||||
$ cd ${OSBUILDER}/rootfs-builder
|
||||
$ sudo rm -rf ${ROOTFS_DIR}
|
||||
$ script -fec 'sudo -E GOPATH=$GOPATH USE_DOCKER=true SECCOMP=no ./rootfs.sh ubuntu'
|
||||
$ script -fec 'sudo -E GOPATH=$GOPATH USE_DOCKER=true SECCOMP=no ./rootfs.sh debian'
|
||||
```
|
||||
|
||||
### Compile Intel® QAT drivers for Kata Containers kernel and add to Kata Containers rootfs
|
||||
|
||||
@@ -1,113 +1,107 @@
|
||||
# Kata Containers with SGX
|
||||
|
||||
Intel Software Guard Extensions (SGX) is a set of instructions that increases the security
|
||||
Intel® Software Guard Extensions (SGX) is a set of instructions that increases the security
|
||||
of applications code and data, giving them more protections from disclosure or modification.
|
||||
|
||||
This document guides you to run containers with SGX enclaves with Kata Containers in Kubernetes.
|
||||
> **Note:** At the time of writing this document, SGX patches have not landed on the Linux kernel
|
||||
> project, so specific versions for guest and host kernels must be installed to enable SGX.
|
||||
|
||||
## Preconditions
|
||||
## Check if SGX is enabled
|
||||
|
||||
* Intel SGX capable bare metal nodes
|
||||
* Host kernel Linux 5.13 or later with SGX and SGX KVM enabled:
|
||||
Run the following command to check if your host supports SGX.
|
||||
|
||||
```sh
|
||||
$ grep SGX /boot/config-`uname -r`
|
||||
CONFIG_X86_SGX=y
|
||||
CONFIG_X86_SGX_KVM=y
|
||||
$ grep -o sgx /proc/cpuinfo
|
||||
```
|
||||
|
||||
* Kubernetes cluster configured with:
|
||||
* [`kata-deploy`](https://github.com/kata-containers/kata-containers/tree/main/tools/packaging/kata-deploy) based Kata Containers installation
|
||||
* [Intel SGX Kubernetes device plugin](https://github.com/intel/intel-device-plugins-for-kubernetes/tree/main/cmd/sgx_plugin#deploying-with-pre-built-images)
|
||||
Continue to the following section if the output of the above command is empty,
|
||||
otherwise continue to section [Install Guest kernel with SGX support](#install-guest-kernel-with-sgx-support)
|
||||
|
||||
> Note: Kata Containers supports creating VM sandboxes with Intel® SGX enabled
|
||||
> using [cloud-hypervisor](https://github.com/cloud-hypervisor/cloud-hypervisor/) VMM only. QEMU support is waiting to get the
|
||||
> Intel SGX enabled QEMU upstream release.
|
||||
## Install Host kernel with SGX support
|
||||
|
||||
## Installation
|
||||
|
||||
### Kata Containers Guest Kernel
|
||||
|
||||
Follow the instructions to [setup](../../tools/packaging/kernel/README.md#setup-kernel-source-code) and [build](../../tools/packaging/kernel/README.md#build-the-kernel) the experimental guest kernel. Then, install as:
|
||||
The following commands were tested on Fedora 32, they might work on other distros too.
|
||||
|
||||
```sh
|
||||
$ sudo cp kata-linux-experimental-*/vmlinux /opt/kata/share/kata-containers/vmlinux.sgx
|
||||
$ sudo sed -i 's|vmlinux.container|vmlinux.sgx|g' \
|
||||
/opt/kata/share/defaults/kata-containers/configuration-clh.toml
|
||||
$ git clone --depth=1 https://github.com/intel/kvm-sgx
|
||||
$ pushd kvm-sgx
|
||||
$ cp /boot/config-$(uname -r) .config
|
||||
$ yes "" | make oldconfig
|
||||
$ # In the following step, enable: INTEL_SGX and INTEL_SGX_VIRTUALIZATION
|
||||
$ make menuconfig
|
||||
$ make -j$(($(nproc)-1)) bzImage
|
||||
$ make -j$(($(nproc)-1)) modules
|
||||
$ sudo make modules_install
|
||||
$ sudo make install
|
||||
$ popd
|
||||
$ sudo reboot
|
||||
```
|
||||
|
||||
### Kata Containers Configuration
|
||||
> **Notes:**
|
||||
> * Run: `mokutil --sb-state` to check whether secure boot is enabled, if so, you will need to sign the kernel.
|
||||
> * You'll lose SGX support when a new distro kernel is installed and the system rebooted.
|
||||
|
||||
Once you have restarted your system with the new brand Linux Kernel with SGX support, run
|
||||
the following command to make sure it's enabled. If the output is empty, go to the BIOS
|
||||
setup and enable SGX manually.
|
||||
|
||||
```sh
|
||||
$ grep -o sgx /proc/cpuinfo
|
||||
```
|
||||
|
||||
## Install Guest kernel with SGX support
|
||||
|
||||
Install the guest kernel in the Kata Containers directory, this way it can be used to run
|
||||
Kata Containers.
|
||||
|
||||
```sh
|
||||
$ curl -LOk https://github.com/devimc/kvm-sgx/releases/download/v0.0.1/kata-virtiofs-sgx.tar.gz
|
||||
$ sudo tar -xf kata-virtiofs-sgx.tar.gz -C /usr/share/kata-containers/
|
||||
$ sudo sed -i 's|kernel =|kernel = "/usr/share/kata-containers/vmlinux-virtiofs-sgx.container"|g' \
|
||||
/usr/share/defaults/kata-containers/configuration.toml
|
||||
```
|
||||
|
||||
## Run Kata Containers with SGX enabled
|
||||
|
||||
Before running a Kata Container make sure that your version of `crio` or `containerd`
|
||||
supports annotations.
|
||||
|
||||
For `containerd` check in `/etc/containerd/config.toml` that the list of `pod_annotations` passed
|
||||
to the `sandbox` are: `["io.katacontainers.*", "sgx.intel.com/epc"]`.
|
||||
|
||||
## Usage
|
||||
|
||||
With the following sample job deployed using `kubectl apply -f`:
|
||||
|
||||
> `sgx.yaml`
|
||||
```yaml
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: oesgx-demo-job
|
||||
labels:
|
||||
jobgroup: oesgx-demo
|
||||
name: sgx
|
||||
annotations:
|
||||
sgx.intel.com/epc: "32Mi"
|
||||
spec:
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
jobgroup: oesgx-demo
|
||||
spec:
|
||||
runtimeClassName: kata-clh
|
||||
initContainers:
|
||||
- name: init-sgx
|
||||
image: busybox
|
||||
command: ['sh', '-c', 'mkdir /dev/sgx; ln -s /dev/sgx_enclave /dev/sgx/enclave; ln -s /dev/sgx_provision /dev/sgx/provision']
|
||||
volumeMounts:
|
||||
- mountPath: /dev
|
||||
name: dev-mount
|
||||
restartPolicy: Never
|
||||
containers:
|
||||
-
|
||||
name: eosgx-demo-job-1
|
||||
image: oeciteam/oe-helloworld:latest
|
||||
imagePullPolicy: IfNotPresent
|
||||
securityContext:
|
||||
readOnlyRootFilesystem: true
|
||||
capabilities:
|
||||
add: ["IPC_LOCK"]
|
||||
resources:
|
||||
limits:
|
||||
sgx.intel.com/epc: "512Ki"
|
||||
volumes:
|
||||
- name: dev-mount
|
||||
hostPath:
|
||||
path: /dev
|
||||
terminationGracePeriodSeconds: 0
|
||||
runtimeClassName: kata
|
||||
containers:
|
||||
- name: c1
|
||||
image: busybox
|
||||
command:
|
||||
- sh
|
||||
stdin: true
|
||||
tty: true
|
||||
volumeMounts:
|
||||
- mountPath: /dev/sgx/
|
||||
name: test-volume
|
||||
volumes:
|
||||
- name: test-volume
|
||||
hostPath:
|
||||
path: /dev/sgx/
|
||||
type: Directory
|
||||
```
|
||||
|
||||
You'll see the enclave output:
|
||||
|
||||
```sh
|
||||
$ kubectl logs oesgx-demo-job-wh42g
|
||||
Hello world from the enclave
|
||||
Enclave called into host to print: Hello World!
|
||||
$ kubectl apply -f sgx.yaml
|
||||
$ kubectl exec -ti sgx ls /dev/sgx/
|
||||
enclave provision
|
||||
```
|
||||
|
||||
### Notes
|
||||
The output of the latest command shouldn't be empty, otherwise check
|
||||
your system environment to make sure SGX is fully supported.
|
||||
|
||||
* The Kata VM's SGX Encrypted Page Cache (EPC) memory size is based on the sum of `sgx.intel.com/epc`
|
||||
resource requests within the pod.
|
||||
* `init-sgx` can be removed from the YAML configuration file if the Kata rootfs is modified with the
|
||||
necessary udev rules.
|
||||
See the [note on SGX backwards compatibility](https://github.com/intel/intel-device-plugins-for-kubernetes/tree/main/cmd/sgx_plugin#backwards-compatibility-note).
|
||||
* Intel SGX DCAP attestation is known to work from Kata sandboxes but it comes with one limitation: If
|
||||
the Intel SGX `aesm` daemon runs on the bare metal node and DCAP `out-of-proc` attestation is used,
|
||||
containers within the Kata sandbox cannot get the access to the host's `/var/run/aesmd/aesm.sock`
|
||||
because socket passthrough is not supported. An alternative is to deploy the `aesm` daemon as a side-car
|
||||
container.
|
||||
* Projects like [Gramine Shielded Containers (GSC)](https://gramine-gsc.readthedocs.io/en/latest/) are
|
||||
also known to work. For GSC specifically, the Kata guest kernel needs to have the `CONFIG_NUMA=y`
|
||||
enabled and at least one CPU online when running the GSC container.
|
||||
[1]: github.com/cloud-hypervisor/cloud-hypervisor/
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# Setup to run SPDK vhost-user devices with Kata Containers
|
||||
# Setup to run SPDK vhost-user devices with Kata Containers and Docker*
|
||||
|
||||
> **Note:** This guide only applies to QEMU, since the vhost-user storage
|
||||
> device is only available for QEMU now. The enablement work on other
|
||||
@@ -104,7 +104,7 @@ devices:
|
||||
|
||||
- `vhost-user-blk`
|
||||
- `vhost-user-scsi`
|
||||
- `vhost-user-nvme` (deprecated from SPDK 21.07 release)
|
||||
- `vhost-user-nvme`
|
||||
|
||||
For more information, visit [SPDK](https://spdk.io) and [SPDK vhost-user target](https://spdk.io/doc/vhost.html).
|
||||
|
||||
@@ -222,43 +222,26 @@ minor `0` should be created for it, in order to be recognized by Kata runtime:
|
||||
$ sudo mknod /var/run/kata-containers/vhost-user/block/devices/vhostblk0 b 241 0
|
||||
```
|
||||
|
||||
> **Note:** The enablement of vhost-user block device in Kata containers
|
||||
> is supported by Kata Containers `1.11.0-alpha1` or newer.
|
||||
> Make sure you have updated your Kata containers before evaluation.
|
||||
|
||||
## Launch a Kata container with SPDK vhost-user block device
|
||||
|
||||
To use `vhost-user-blk` device, use `ctr` to pass a host `vhost-user-blk`
|
||||
device to the container. In your `config.json`, you should use `devices`
|
||||
To use `vhost-user-blk` device, use Docker to pass a host `vhost-user-blk`
|
||||
device to the container. In docker, `--device=HOST-DIR:CONTAINER-DIR` is used
|
||||
to pass a host device to the container.
|
||||
|
||||
For example (only `vhost-user-blk` listed):
|
||||
|
||||
```json
|
||||
{
|
||||
"linux": {
|
||||
"devices": [
|
||||
{
|
||||
"path": "/dev/vda",
|
||||
"type": "b",
|
||||
"major": 241,
|
||||
"minor": 0,
|
||||
"fileMode": 420,
|
||||
"uid": 0,
|
||||
"gid": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
With `rootfs` provisioned under `bundle` directory, you can run your SPDK container:
|
||||
For example:
|
||||
|
||||
```bash
|
||||
$ sudo ctr run -d --runtime io.containerd.run.kata.v2 --config bundle/config.json spdk_container
|
||||
$ sudo docker run --runtime kata-runtime --device=/var/run/kata-containers/vhost-user/block/devices/vhostblk0:/dev/vda -it busybox sh
|
||||
```
|
||||
|
||||
Example of performing I/O operations on the `vhost-user-blk` device inside
|
||||
container:
|
||||
|
||||
```
|
||||
$ sudo ctr t exec --exec-id 1 -t spdk_container sh
|
||||
/ # ls -l /dev/vda
|
||||
brw-r--r-- 1 root root 254, 0 Jan 20 03:54 /dev/vda
|
||||
/ # dd if=/dev/vda of=/tmp/ddtest bs=4k count=20
|
||||
|
||||
@@ -7,15 +7,15 @@ edition = "2018"
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
serde_json = "1.0.73"
|
||||
serde_json = "1.0.39"
|
||||
# slog:
|
||||
# - Dynamic keys required to allow HashMap keys to be slog::Serialized.
|
||||
# - The 'max_*' features allow changing the log level at runtime
|
||||
# (by stopping the compiler from removing log calls).
|
||||
slog = { version = "2.7.0", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug"] }
|
||||
slog-json = "2.4.0"
|
||||
slog-async = "2.7.0"
|
||||
slog-scope = "4.4.0"
|
||||
slog = { version = "2.5.2", features = ["dynamic-keys", "max_level_trace", "release_max_level_info"] }
|
||||
slog-json = "2.3.0"
|
||||
slog-async = "2.3.0"
|
||||
slog-scope = "4.1.2"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3.2.0"
|
||||
tempfile = "3.1.0"
|
||||
@@ -20,8 +20,6 @@ const LOG_LEVELS: &[(&str, slog::Level)] = &[
|
||||
("critical", slog::Level::Critical),
|
||||
];
|
||||
|
||||
const DEFAULT_SUBSYSTEM: &str = "root";
|
||||
|
||||
// XXX: 'writer' param used to make testing possible.
|
||||
pub fn create_logger<W>(
|
||||
name: &str,
|
||||
@@ -52,7 +50,7 @@ where
|
||||
let logger = slog::Logger::root(
|
||||
async_drain.fuse(),
|
||||
o!("version" => env!("CARGO_PKG_VERSION"),
|
||||
"subsystem" => DEFAULT_SUBSYSTEM,
|
||||
"subsystem" => "root",
|
||||
"pid" => process::id().to_string(),
|
||||
"name" => name.to_string(),
|
||||
"source" => source.to_string()),
|
||||
@@ -218,8 +216,8 @@ where
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use serde_json::{json, Value};
|
||||
use slog::{crit, debug, error, info, warn, Logger};
|
||||
use serde_json::Value;
|
||||
use slog::info;
|
||||
use std::io::prelude::*;
|
||||
use tempfile::NamedTempFile;
|
||||
|
||||
@@ -297,15 +295,15 @@ mod tests {
|
||||
let result_level = result.unwrap();
|
||||
let expected_level = d.result.unwrap();
|
||||
|
||||
assert!(result_level == expected_level, "{}", msg);
|
||||
assert!(result_level == expected_level, msg);
|
||||
continue;
|
||||
} else {
|
||||
assert!(result.is_err(), "{}", msg);
|
||||
assert!(result.is_err(), msg);
|
||||
}
|
||||
|
||||
let expected_error = d.result.as_ref().unwrap_err();
|
||||
let actual_error = result.unwrap_err();
|
||||
assert!(&actual_error == expected_error, "{}", msg);
|
||||
let expected_error = format!("{}", d.result.as_ref().unwrap_err());
|
||||
let actual_error = format!("{}", result.unwrap_err());
|
||||
assert!(actual_error == expected_error, msg);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -352,13 +350,13 @@ mod tests {
|
||||
let msg = format!("{}, result: {:?}", msg, result);
|
||||
|
||||
if d.result.is_ok() {
|
||||
assert!(result == d.result, "{}", msg);
|
||||
assert!(result == d.result, msg);
|
||||
continue;
|
||||
}
|
||||
|
||||
let expected_error = d.result.as_ref().unwrap_err();
|
||||
let actual_error = result.unwrap_err();
|
||||
assert!(&actual_error == expected_error, "{}", msg);
|
||||
let expected_error = format!("{}", d.result.as_ref().unwrap_err());
|
||||
let actual_error = format!("{}", result.unwrap_err());
|
||||
assert!(actual_error == expected_error, msg);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -378,17 +376,14 @@ mod tests {
|
||||
let record_key = "record-key-1";
|
||||
let record_value = "record-key-2";
|
||||
|
||||
let (logger, guard) = create_logger(name, source, level, writer);
|
||||
let logger = create_logger(name, source, level, writer);
|
||||
|
||||
let msg = "foo, bar, baz";
|
||||
|
||||
// Call the logger (which calls the drain)
|
||||
// Note: This "mid level" log level should be available in debug or
|
||||
// release builds.
|
||||
info!(&logger, "{}", msg; "subsystem" => record_subsystem, record_key => record_value);
|
||||
info!(logger, "{}", msg; "subsystem" => record_subsystem, record_key => record_value);
|
||||
|
||||
// Force temp file to be flushed
|
||||
drop(guard);
|
||||
drop(logger);
|
||||
|
||||
let mut contents = String::new();
|
||||
@@ -435,168 +430,4 @@ mod tests {
|
||||
.expect("failed to find record key field");
|
||||
assert_eq!(field_record_value, record_value);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_logger_levels() {
|
||||
let name = "name";
|
||||
let source = "source";
|
||||
|
||||
let debug_msg = "a debug log level message";
|
||||
let info_msg = "an info log level message";
|
||||
let warn_msg = "a warn log level message";
|
||||
let error_msg = "an error log level message";
|
||||
let critical_msg = "a critical log level message";
|
||||
|
||||
// The slog crate will *remove* macro calls for log levels "above" the
|
||||
// configured log level.lock
|
||||
//
|
||||
// At the time of writing, the default slog log
|
||||
// level is "info", but this crate overrides that using the magic
|
||||
// "*max_level*" features in the "Cargo.toml" manifest.
|
||||
|
||||
// However, there are two log levels:
|
||||
//
|
||||
// - max_level_${level}
|
||||
//
|
||||
// This is the log level for normal "cargo build" (development/debug)
|
||||
// builds.
|
||||
//
|
||||
// - release_max_level_${level}
|
||||
//
|
||||
// This is the log level for "cargo install" and
|
||||
// "cargo build --release" (release) builds.
|
||||
//
|
||||
// This crate sets them to different values, which is sensible and
|
||||
// standard practice. However, that causes a problem: there is
|
||||
// currently no clean way for this test code to detect _which_
|
||||
// profile the test is being built for (development or release),
|
||||
// meaning we cannot know which macros are expected to produce output
|
||||
// and which aren't ;(
|
||||
//
|
||||
// The best we can do is test the following log levels which
|
||||
// are expected to work in all build profiles.
|
||||
|
||||
let debug_closure = |logger: &Logger, msg: String| debug!(logger, "{}", msg);
|
||||
let info_closure = |logger: &Logger, msg: String| info!(logger, "{}", msg);
|
||||
let warn_closure = |logger: &Logger, msg: String| warn!(logger, "{}", msg);
|
||||
let error_closure = |logger: &Logger, msg: String| error!(logger, "{}", msg);
|
||||
let critical_closure = |logger: &Logger, msg: String| crit!(logger, "{}", msg);
|
||||
|
||||
struct TestData<'a> {
|
||||
slog_level: slog::Level,
|
||||
slog_level_tag: &'a str,
|
||||
msg: String,
|
||||
closure: Box<dyn Fn(&Logger, String)>,
|
||||
}
|
||||
|
||||
let tests = &[
|
||||
TestData {
|
||||
slog_level: slog::Level::Debug,
|
||||
// Looks like a typo but tragically it isn't! ;(
|
||||
slog_level_tag: "DEBG",
|
||||
msg: debug_msg.into(),
|
||||
closure: Box::new(debug_closure),
|
||||
},
|
||||
TestData {
|
||||
slog_level: slog::Level::Info,
|
||||
slog_level_tag: "INFO",
|
||||
msg: info_msg.into(),
|
||||
closure: Box::new(info_closure),
|
||||
},
|
||||
TestData {
|
||||
slog_level: slog::Level::Warning,
|
||||
slog_level_tag: "WARN",
|
||||
msg: warn_msg.into(),
|
||||
closure: Box::new(warn_closure),
|
||||
},
|
||||
TestData {
|
||||
slog_level: slog::Level::Error,
|
||||
// Another language tragedy
|
||||
slog_level_tag: "ERRO",
|
||||
msg: error_msg.into(),
|
||||
closure: Box::new(error_closure),
|
||||
},
|
||||
TestData {
|
||||
slog_level: slog::Level::Critical,
|
||||
slog_level_tag: "CRIT",
|
||||
msg: critical_msg.into(),
|
||||
closure: Box::new(critical_closure),
|
||||
},
|
||||
];
|
||||
|
||||
for (i, d) in tests.iter().enumerate() {
|
||||
let msg = format!("test[{}]", i);
|
||||
|
||||
// Create a writer for the logger drain to use
|
||||
let writer =
|
||||
NamedTempFile::new().expect(&format!("{:}: failed to create tempfile", msg));
|
||||
|
||||
// Used to check file contents before the temp file is unlinked
|
||||
let mut writer_ref = writer
|
||||
.reopen()
|
||||
.expect(&format!("{:?}: failed to clone tempfile", msg));
|
||||
|
||||
let (logger, logger_guard) = create_logger(name, source, d.slog_level, writer);
|
||||
|
||||
// Call the logger (which calls the drain)
|
||||
(d.closure)(&logger, d.msg.to_owned());
|
||||
|
||||
// Force temp file to be flushed
|
||||
drop(logger_guard);
|
||||
drop(logger);
|
||||
|
||||
let mut contents = String::new();
|
||||
writer_ref
|
||||
.read_to_string(&mut contents)
|
||||
.expect(&format!("{:?}: failed to read tempfile contents", msg));
|
||||
|
||||
// Convert file to JSON
|
||||
let fields: Value = serde_json::from_str(&contents)
|
||||
.expect(&format!("{:?}: failed to convert logfile to json", msg));
|
||||
|
||||
// Check the expected JSON fields
|
||||
|
||||
let field_ts = fields
|
||||
.get("ts")
|
||||
.expect(&format!("{:?}: failed to find timestamp field", msg));
|
||||
assert_ne!(field_ts, "", "{}", msg);
|
||||
|
||||
let field_version = fields
|
||||
.get("version")
|
||||
.expect(&format!("{:?}: failed to find version field", msg));
|
||||
assert_eq!(field_version, env!("CARGO_PKG_VERSION"), "{}", msg);
|
||||
|
||||
let field_pid = fields
|
||||
.get("pid")
|
||||
.expect(&format!("{:?}: failed to find pid field", msg));
|
||||
assert_ne!(field_pid, "", "{}", msg);
|
||||
|
||||
let field_level = fields
|
||||
.get("level")
|
||||
.expect(&format!("{:?}: failed to find level field", msg));
|
||||
assert_eq!(field_level, d.slog_level_tag, "{}", msg);
|
||||
|
||||
let field_msg = fields
|
||||
.get("msg")
|
||||
.expect(&format!("{:?}: failed to find msg field", msg));
|
||||
assert_eq!(field_msg, &json!(d.msg), "{}", msg);
|
||||
|
||||
let field_name = fields
|
||||
.get("name")
|
||||
.expect(&format!("{:?}: failed to find name field", msg));
|
||||
assert_eq!(field_name, name, "{}", msg);
|
||||
|
||||
let field_source = fields
|
||||
.get("source")
|
||||
.expect(&format!("{:?}: failed to find source field", msg));
|
||||
assert_eq!(field_source, source, "{}", msg);
|
||||
|
||||
let field_subsystem = fields
|
||||
.get("subsystem")
|
||||
.expect(&format!("{:?}: failed to find subsystem field", msg));
|
||||
|
||||
// No explicit subsystem, so should be the default
|
||||
assert_eq!(field_subsystem, &json!(DEFAULT_SUBSYSTEM), "{}", msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -76,7 +76,7 @@ then a new configuration file can be [created](#configure-kata-containers)
|
||||
and [configured][7].
|
||||
|
||||
[1]: https://docs.snapcraft.io/snaps/intro
|
||||
[2]: ../docs/design/architecture/README.md#root-filesystem-image
|
||||
[2]: ../docs/design/architecture.md#root-filesystem-image
|
||||
[3]: https://docs.snapcraft.io/reference/confinement#classic
|
||||
[4]: https://github.com/kata-containers/runtime#configuration
|
||||
[5]: https://docs.docker.com/engine/reference/commandline/dockerd
|
||||
|
||||
@@ -59,7 +59,7 @@ parts:
|
||||
|
||||
yq_version=3.4.1
|
||||
yq_url="https://${yq_pkg}/releases/download/${yq_version}/yq_${goos}_${goarch}"
|
||||
curl -o "${yq_path}" -L "${yq_url}"
|
||||
curl -o "${yq_path}" -LSsf "${yq_url}"
|
||||
chmod +x "${yq_path}"
|
||||
|
||||
kata_dir=gopath/src/github.com/${SNAPCRAFT_PROJECT_NAME}/${SNAPCRAFT_PROJECT_NAME}
|
||||
@@ -118,19 +118,18 @@ parts:
|
||||
export AGENT_INIT=yes
|
||||
export USE_DOCKER=1
|
||||
export DEBUG=1
|
||||
arch="$(uname -m)"
|
||||
initrd_distro=$(${yq} r -X ${kata_dir}/versions.yaml assets.initrd.architecture.${arch}.name)
|
||||
image_distro=$(${yq} r -X ${kata_dir}/versions.yaml assets.image.architecture.${arch}.name)
|
||||
case "$arch" in
|
||||
case "$(uname -m)" in
|
||||
aarch64)
|
||||
sudo -E PATH=$PATH make initrd DISTRO=alpine
|
||||
;;
|
||||
ppc64le|s390x)
|
||||
# Cannot use alpine on ppc64le/s390x because it would require a musl agent
|
||||
sudo -E PATH=$PATH make initrd DISTRO=ubuntu
|
||||
;;
|
||||
x86_64)
|
||||
# In some build systems it's impossible to build a rootfs image, try with the initrd image
|
||||
sudo -E PATH=$PATH make image DISTRO=${image_distro} || sudo -E PATH=$PATH make initrd DISTRO=${initrd_distro}
|
||||
sudo -E PATH=$PATH make image DISTRO=clearlinux || sudo -E PATH=$PATH make initrd DISTRO=alpine
|
||||
;;
|
||||
|
||||
aarch64|ppc64le|s390x)
|
||||
sudo -E PATH=$PATH make initrd DISTRO=${initrd_distro}
|
||||
;;
|
||||
|
||||
*) echo "unsupported architecture: $(uname -m)"; exit 1;;
|
||||
esac
|
||||
|
||||
@@ -140,7 +139,7 @@ parts:
|
||||
cp kata-containers*.img ${kata_image_dir}
|
||||
|
||||
runtime:
|
||||
after: [godeps, image, cloud-hypervisor]
|
||||
after: [godeps, image]
|
||||
plugin: nil
|
||||
build-attributes: [no-patchelf]
|
||||
override-build: |
|
||||
@@ -186,7 +185,6 @@ parts:
|
||||
- flex
|
||||
override-build: |
|
||||
yq=${SNAPCRAFT_STAGE}/yq
|
||||
export PATH="${PATH}:${SNAPCRAFT_STAGE}"
|
||||
export GOPATH=${SNAPCRAFT_STAGE}/gopath
|
||||
kata_dir=${GOPATH}/src/github.com/${SNAPCRAFT_PROJECT_NAME}/${SNAPCRAFT_PROJECT_NAME}
|
||||
versions_file="${kata_dir}/versions.yaml"
|
||||
@@ -201,17 +199,10 @@ parts:
|
||||
kata_dir=${GOPATH}/src/github.com/${SNAPCRAFT_PROJECT_NAME}/${SNAPCRAFT_PROJECT_NAME}
|
||||
|
||||
cd ${kata_dir}/tools/packaging/kernel
|
||||
kernel_dir_prefix="kata-linux-"
|
||||
|
||||
# Setup and build kernel
|
||||
if [ "$(uname -m)" = "x86_64" ]; then
|
||||
kernel_version="$(${yq} r $versions_file assets.kernel-experimental.tag)"
|
||||
kernel_version=${kernel_version#v}
|
||||
kernel_dir_prefix="kata-linux-experimental-"
|
||||
./build-kernel.sh -e -v ${kernel_version} -d setup
|
||||
else
|
||||
./build-kernel.sh -v ${kernel_version} -d setup
|
||||
fi
|
||||
./build-kernel.sh -v ${kernel_version} -d setup
|
||||
kernel_dir_prefix="kata-linux-"
|
||||
cd ${kernel_dir_prefix}*
|
||||
make -j $(($(nproc)-1)) EXTRAVERSION=".container"
|
||||
|
||||
@@ -314,7 +305,7 @@ parts:
|
||||
;;
|
||||
|
||||
*)
|
||||
cp -a ${kata_dir}/tools/packaging/qemu/default-configs/* configs/devices/
|
||||
cp -a ${kata_dir}/tools/packaging/qemu/default-configs/* default-configs/devices/
|
||||
;;
|
||||
esac
|
||||
|
||||
@@ -336,22 +327,6 @@ parts:
|
||||
# Hack: move qemu to /
|
||||
"snap/kata-containers/current/": "./"
|
||||
|
||||
cloud-hypervisor:
|
||||
plugin: nil
|
||||
after: [godeps]
|
||||
override-build: |
|
||||
export GOPATH=${SNAPCRAFT_STAGE}/gopath
|
||||
yq=${SNAPCRAFT_STAGE}/yq
|
||||
kata_dir=${GOPATH}/src/github.com/${SNAPCRAFT_PROJECT_NAME}/${SNAPCRAFT_PROJECT_NAME}
|
||||
versions_file="${kata_dir}/versions.yaml"
|
||||
version="$(${yq} r ${versions_file} assets.hypervisor.cloud_hypervisor.version)"
|
||||
url="https://github.com/cloud-hypervisor/cloud-hypervisor/releases/download/${version}"
|
||||
curl -L ${url}/cloud-hypervisor-static -o cloud-hypervisor
|
||||
curl -LO ${url}/clh-remote
|
||||
|
||||
install -D cloud-hypervisor ${SNAPCRAFT_PART_INSTALL}/usr/bin/cloud-hypervisor
|
||||
install -D clh-remote ${SNAPCRAFT_PART_INSTALL}/usr/bin/clh-remote
|
||||
|
||||
apps:
|
||||
runtime:
|
||||
command: usr/bin/kata-runtime
|
||||
|
||||
840
src/agent/Cargo.lock
generated
840
src/agent/Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -5,29 +5,30 @@ authors = ["The Kata Containers community <kata-dev@lists.katacontainers.io>"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
oci = { path = "../libs/oci" }
|
||||
oci = { path = "oci" }
|
||||
logging = { path = "../../pkg/logging" }
|
||||
rustjail = { path = "rustjail" }
|
||||
protocols = { path = "../libs/protocols" }
|
||||
protocols = { path = "protocols" }
|
||||
lazy_static = "1.3.0"
|
||||
ttrpc = { version = "0.5.0", features = ["async", "protobuf-codec"], default-features = false }
|
||||
protobuf = "=2.14.0"
|
||||
libc = "0.2.58"
|
||||
nix = "0.23.0"
|
||||
nix = "0.21.0"
|
||||
capctl = "0.2.0"
|
||||
serde_json = "1.0.39"
|
||||
scan_fmt = "0.2.3"
|
||||
scopeguard = "1.0.0"
|
||||
thiserror = "1.0.26"
|
||||
regex = "1.5.4"
|
||||
regex = "1"
|
||||
serial_test = "0.5.1"
|
||||
|
||||
# Async helpers
|
||||
async-trait = "0.1.42"
|
||||
async-recursion = "0.3.2"
|
||||
futures = "0.3.17"
|
||||
futures = "0.3.12"
|
||||
|
||||
# Async runtime
|
||||
tokio = { version = "1.14.0", features = ["full"] }
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
tokio-vsock = "0.3.1"
|
||||
|
||||
netlink-sys = { version = "0.7.0", features = ["tokio_socket",]}
|
||||
@@ -35,20 +36,21 @@ rtnetlink = "0.8.0"
|
||||
netlink-packet-utils = "0.4.1"
|
||||
ipnetwork = "0.17.0"
|
||||
|
||||
# Note: this crate sets the slog 'max_*' features which allows the log level
|
||||
# to be modified at runtime.
|
||||
logging = { path = "../libs/logging" }
|
||||
slog = "2.5.2"
|
||||
# slog:
|
||||
# - Dynamic keys required to allow HashMap keys to be slog::Serialized.
|
||||
# - The 'max_*' features allow changing the log level at runtime
|
||||
# (by stopping the compiler from removing log calls).
|
||||
slog = { version = "2.5.2", features = ["dynamic-keys", "max_level_trace", "release_max_level_info"] }
|
||||
slog-scope = "4.1.2"
|
||||
|
||||
# Redirect ttrpc log calls
|
||||
slog-stdlog = "4.0.0"
|
||||
log = "0.4.11"
|
||||
|
||||
prometheus = { version = "0.13.0", features = ["process"] }
|
||||
procfs = "0.12.0"
|
||||
prometheus = { version = "0.9.0", features = ["process"] }
|
||||
procfs = "0.7.9"
|
||||
anyhow = "1.0.32"
|
||||
cgroups = { package = "cgroups-rs", version = "0.2.8" }
|
||||
cgroups = { package = "cgroups-rs", version = "0.2.5" }
|
||||
|
||||
# Tracing
|
||||
tracing = "0.1.26"
|
||||
@@ -57,21 +59,15 @@ tracing-opentelemetry = "0.13.0"
|
||||
opentelemetry = { version = "0.14.0", features = ["rt-tokio-current-thread"]}
|
||||
vsock-exporter = { path = "vsock-exporter" }
|
||||
|
||||
# Configuration
|
||||
serde = { version = "1.0.129", features = ["derive"] }
|
||||
toml = "0.5.8"
|
||||
clap = { version = "3.0.1", features = ["derive"] }
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3.1.0"
|
||||
|
||||
[workspace]
|
||||
members = [
|
||||
"oci",
|
||||
"protocols",
|
||||
"rustjail",
|
||||
]
|
||||
|
||||
[profile.release]
|
||||
lto = true
|
||||
|
||||
[features]
|
||||
seccomp = ["rustjail/seccomp"]
|
||||
|
||||
202
src/agent/LICENSE
Normal file
202
src/agent/LICENSE
Normal file
@@ -0,0 +1,202 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
||||
@@ -27,20 +27,6 @@ COMMIT_MSG = $(if $(COMMIT),$(COMMIT),unknown)
|
||||
# Exported to allow cargo to see it
|
||||
export VERSION_COMMIT := $(if $(COMMIT),$(VERSION)-$(COMMIT),$(VERSION))
|
||||
|
||||
EXTRA_RUSTFEATURES :=
|
||||
|
||||
##VAR SECCOMP=yes|no define if agent enables seccomp feature
|
||||
SECCOMP := yes
|
||||
|
||||
# Enable seccomp feature of rust build
|
||||
ifeq ($(SECCOMP),yes)
|
||||
override EXTRA_RUSTFEATURES += seccomp
|
||||
endif
|
||||
|
||||
ifneq ($(EXTRA_RUSTFEATURES),)
|
||||
override EXTRA_RUSTFEATURES := --features $(EXTRA_RUSTFEATURES)
|
||||
endif
|
||||
|
||||
include ../../utils.mk
|
||||
|
||||
TARGET_PATH = target/$(TRIPLE)/$(BUILD_TYPE)/$(TARGET)
|
||||
@@ -101,20 +87,18 @@ endef
|
||||
##TARGET default: build code
|
||||
default: $(TARGET) show-header
|
||||
|
||||
$(TARGET): $(GENERATED_CODE) logging-crate-tests $(TARGET_PATH)
|
||||
|
||||
logging-crate-tests:
|
||||
make -C $(CWD)/../libs/logging
|
||||
$(TARGET): $(GENERATED_CODE) $(TARGET_PATH)
|
||||
|
||||
$(TARGET_PATH): $(SOURCES) | show-summary
|
||||
@RUSTFLAGS="$(EXTRA_RUSTFLAGS) --deny warnings" cargo build --target $(TRIPLE) --$(BUILD_TYPE) $(EXTRA_RUSTFEATURES)
|
||||
@RUSTFLAGS="$(EXTRA_RUSTFLAGS) --deny warnings" cargo build --target $(TRIPLE) --$(BUILD_TYPE)
|
||||
|
||||
$(GENERATED_FILES): %: %.in
|
||||
@sed $(foreach r,$(GENERATED_REPLACEMENTS),-e 's|@$r@|$($r)|g') "$<" > "$@"
|
||||
|
||||
##TARGET optimize: optimized build
|
||||
optimize: $(SOURCES) | show-summary show-header
|
||||
@RUSTFLAGS="-C link-arg=-s $(EXTRA_RUSTFLAGS) --deny warnings" cargo build --target $(TRIPLE) --$(BUILD_TYPE) $(EXTRA_RUSTFEATURES)
|
||||
@RUSTFLAGS="-C link-arg=-s $(EXTRA_RUSTFLAGS) --deny-warnings" cargo build --target $(TRIPLE) --$(BUILD_TYPE)
|
||||
|
||||
|
||||
##TARGET clippy: run clippy linter
|
||||
clippy: $(GENERATED_CODE)
|
||||
@@ -143,7 +127,7 @@ vendor:
|
||||
|
||||
#TARGET test: run cargo tests
|
||||
test:
|
||||
@cargo test --all --target $(TRIPLE) $(EXTRA_RUSTFEATURES) -- --nocapture
|
||||
@cargo test --all --target $(TRIPLE)
|
||||
|
||||
##TARGET check: run test
|
||||
check: clippy format
|
||||
@@ -208,10 +192,9 @@ codecov-html: check_tarpaulin
|
||||
|
||||
.PHONY: \
|
||||
help \
|
||||
logging-crate-tests \
|
||||
optimize \
|
||||
show-header \
|
||||
show-summary \
|
||||
optimize \
|
||||
vendor
|
||||
|
||||
##TARGET generate-protocols: generate/update grpc agent protocols
|
||||
|
||||
@@ -1,38 +1,47 @@
|
||||
# Kata Agent
|
||||
# Kata Agent in Rust
|
||||
|
||||
## Overview
|
||||
This is a rust version of the [`kata-agent`](https://github.com/kata-containers/agent).
|
||||
|
||||
The Kata agent is a long running process that runs inside the Virtual Machine
|
||||
(VM) (also known as the "pod" or "sandbox").
|
||||
In Denver PTG, [we discussed about re-writing agent in rust](https://etherpad.openstack.org/p/katacontainers-2019-ptg-denver-agenda):
|
||||
|
||||
The agent is packaged inside the Kata Containers
|
||||
[guest image](../../docs/design/architecture/README.md#guest-image)
|
||||
which is used to boot the VM. Once the runtime has launched the configured
|
||||
[hypervisor](../../docs/hypervisors.md) to create a new VM, the agent is
|
||||
started. From this point on, the agent is responsible for creating and
|
||||
managing the life cycle of the containers inside the VM.
|
||||
> In general, we all think about re-write agent in rust to reduce the footprint of agent. Moreover, Eric mentioned the possibility to stop using gRPC, which may have some impact on footprint. We may begin to do some POC to show how much we could save by re-writing agent in rust.
|
||||
|
||||
For further details, see the
|
||||
[architecture document](../../docs/design/architecture).
|
||||
After that, we drafted the initial code here, and any contributions are welcome.
|
||||
|
||||
## Audience
|
||||
## Features
|
||||
|
||||
If you simply wish to use Kata Containers, it is not necessary to understand
|
||||
the details of how the agent operates. Please see the
|
||||
[installation documentation](../../docs/install) for details of how deploy
|
||||
Kata Containers (which will include the Kata agent).
|
||||
| Feature | Status |
|
||||
| :--|:--:|
|
||||
| **OCI Behaviors** |
|
||||
| create/start containers | :white_check_mark: |
|
||||
| signal/wait process | :white_check_mark: |
|
||||
| exec/list process | :white_check_mark: |
|
||||
| I/O stream | :white_check_mark: |
|
||||
| Cgroups | :white_check_mark: |
|
||||
| Capabilities, `rlimit`, readonly path, masked path, users | :white_check_mark: |
|
||||
| container stats (`stats_container`) | :white_check_mark: |
|
||||
| Hooks | :white_check_mark: |
|
||||
| **Agent Features & APIs** |
|
||||
| run agent as `init` (mount fs, udev, setup `lo`) | :white_check_mark: |
|
||||
| block device as root device | :white_check_mark: |
|
||||
| Health API | :white_check_mark: |
|
||||
| network, interface/routes (`update_container`) | :white_check_mark: |
|
||||
| File transfer API (`copy_file`) | :white_check_mark: |
|
||||
| Device APIs (`reseed_random_device`, , `online_cpu_memory`, `mem_hotplug_probe`, `set_guet_data_time`) | :white_check_mark: |
|
||||
| VSOCK support | :white_check_mark: |
|
||||
| virtio-serial support | :heavy_multiplication_x: |
|
||||
| OCI Spec validator | :white_check_mark: |
|
||||
| **Infrastructures**|
|
||||
| Debug Console | :white_check_mark: |
|
||||
| Command line | :white_check_mark: |
|
||||
| Tracing | :heavy_multiplication_x: |
|
||||
|
||||
The remainder of this document is only useful for developers and testers.
|
||||
## Getting Started
|
||||
|
||||
## Build from Source
|
||||
### Build from Source
|
||||
The rust-agent needs to be built statically and linked with `musl`
|
||||
|
||||
Since the agent is written in the Rust language this section assumes the tool
|
||||
chain has been installed using standard Rust `rustup` tool.
|
||||
|
||||
### Build with musl
|
||||
|
||||
If you wish to build the agent with the `musl` C library, you need to run the
|
||||
following commands:
|
||||
> **Note:** skip this step for ppc64le, the build scripts explicitly use gnu for ppc64le.
|
||||
|
||||
```bash
|
||||
$ arch=$(uname -m)
|
||||
@@ -40,15 +49,12 @@ $ rustup target add "${arch}-unknown-linux-musl"
|
||||
$ sudo ln -s /usr/bin/g++ /bin/musl-g++
|
||||
```
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
> It is not currently possible to build using `musl` on ppc64le and s390x
|
||||
> since both platforms lack the `musl` target.
|
||||
|
||||
### Build the agent binary
|
||||
|
||||
The following steps download the Kata Containers source files and build the agent:
|
||||
ppc64le-only: Manually install `protoc`, e.g.
|
||||
```bash
|
||||
$ sudo dnf install protobuf-compiler
|
||||
```
|
||||
|
||||
Download the source files in the Kata containers repository and build the agent:
|
||||
```bash
|
||||
$ GOPATH="${GOPATH:-$HOME/go}"
|
||||
$ dir="$GOPATH/src/github.com/kata-containers"
|
||||
@@ -56,60 +62,17 @@ $ git -C ${dir} clone --depth 1 https://github.com/kata-containers/kata-containe
|
||||
$ make -C ${dir}/kata-containers/src/agent
|
||||
```
|
||||
|
||||
## Change the agent API
|
||||
|
||||
The Kata runtime communicates with the Kata agent using a ttRPC based API protocol.
|
||||
|
||||
This ttRPC API is defined by a set of [protocol buffers files](protocols/protos).
|
||||
The protocol files are used to generate the bindings for the following components:
|
||||
|
||||
| Component | Language | Generation method `[*]` | Tooling required |
|
||||
|-|-|-|-|
|
||||
| runtime | Golang | Run, `make generate-protocols` | `protoc` |
|
||||
| agent | Rust | Run, `make` | |
|
||||
|
||||
> **Key:**
|
||||
>
|
||||
> `[*]` - All commands must be run in the agent repository.
|
||||
|
||||
If you wish to change the API, these files must be regenerated. Although the
|
||||
rust code will be automatically generated by the
|
||||
[build script](protocols/build.rs),
|
||||
the Golang code generation requires the external `protoc` command to be
|
||||
available in `$PATH`.
|
||||
|
||||
To install the `protoc` command on a Fedora/CentOS/RHEL system:
|
||||
## Run Kata CI with rust-agent
|
||||
* Firstly, install Kata as noted by ["how to install Kata"](../../docs/install/README.md)
|
||||
* Secondly, build your own Kata initrd/image following the steps in ["how to build your own initrd/image"](../../docs/Developer-Guide.md#create-and-install-rootfs-and-initrd-image).
|
||||
notes: Please use your rust agent instead of the go agent when building your initrd/image.
|
||||
* Clone the Kata CI test cases from: https://github.com/kata-containers/tests.git, and then run the CRI test with:
|
||||
|
||||
```bash
|
||||
$ sudo dnf install -y protobuf-compiler
|
||||
$sudo -E PATH=$PATH -E GOPATH=$GOPATH integration/containerd/shimv2/shimv2-tests.sh
|
||||
```
|
||||
|
||||
## Custom guest image and kernel assets
|
||||
|
||||
If you wish to develop or test changes to the agent, you will need to create a
|
||||
custom guest image using the [osbuilder tool](../../tools/osbuilder). You
|
||||
may also wish to create a custom [guest kernel](../../tools/packaging/kernel).
|
||||
|
||||
Once created, [configure](../runtime/README.md#configuration) Kata Containers to use
|
||||
these custom assets to allow you to test your changes.
|
||||
|
||||
> **Note:**
|
||||
>
|
||||
> To simplify development and testing, you may wish to run the agent
|
||||
> [stand alone](#run-the-agent-stand-alone) initially.
|
||||
|
||||
## Tracing
|
||||
|
||||
For details of tracing the operation of the agent, see the
|
||||
[tracing documentation](/docs/tracing.md).
|
||||
|
||||
## Run the agent stand alone
|
||||
|
||||
Although the agent is designed to run in a VM environment, for development and
|
||||
testing purposes it is possible to run it as a normal application.
|
||||
|
||||
When run in this way, the agent can be controlled using the low-level Kata
|
||||
agent control tool, rather than the Kata runtime.
|
||||
|
||||
For further details, see the
|
||||
[agent control tool documentation](../tools/agent-ctl/README.md#run-the-tool-and-the-agent-in-the-same-environment).
|
||||
## Mini Benchmark
|
||||
The memory of `RssAnon` consumed by the go-agent and rust-agent as below:
|
||||
go-agent: about 11M
|
||||
rust-agent: about 1.1M
|
||||
|
||||
@@ -5,7 +5,7 @@ authors = ["The Kata Containers community <kata-dev@lists.katacontainers.io>"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
serde = "1.0.131"
|
||||
serde_derive = "1.0.131"
|
||||
serde_json = "1.0.73"
|
||||
libc = "0.2.112"
|
||||
serde = "1.0.91"
|
||||
serde_derive = "1.0.91"
|
||||
serde_json = "1.0.39"
|
||||
libc = "0.2.58"
|
||||
@@ -4,16 +4,10 @@ version = "0.1.0"
|
||||
authors = ["The Kata Containers community <kata-dev@lists.katacontainers.io>"]
|
||||
edition = "2018"
|
||||
|
||||
[features]
|
||||
default = []
|
||||
with-serde = [ "serde", "serde_json" ]
|
||||
|
||||
[dependencies]
|
||||
ttrpc = { version = "0.5.0", features = ["async"] }
|
||||
async-trait = "0.1.42"
|
||||
protobuf = { version = "=2.14.0", features = ["with-serde"] }
|
||||
serde = { version = "1.0.130", features = ["derive"], optional = true }
|
||||
serde_json = { version = "1.0.68", optional = true }
|
||||
protobuf = "=2.14.0"
|
||||
|
||||
[build-dependencies]
|
||||
ttrpc-codegen = "0.2.0"
|
||||
44
src/agent/protocols/build.rs
Normal file
44
src/agent/protocols/build.rs
Normal file
@@ -0,0 +1,44 @@
|
||||
// Copyright (c) 2020 Ant Group
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
use std::fs;
|
||||
use ttrpc_codegen::{Codegen, Customize};
|
||||
|
||||
fn main() {
|
||||
let protos = vec![
|
||||
"protos/types.proto",
|
||||
"protos/agent.proto",
|
||||
"protos/health.proto",
|
||||
"protos/google/protobuf/empty.proto",
|
||||
"protos/oci.proto",
|
||||
];
|
||||
|
||||
Codegen::new()
|
||||
.out_dir("src")
|
||||
.inputs(&protos)
|
||||
.include("protos")
|
||||
.rust_protobuf()
|
||||
.customize(Customize {
|
||||
async_server: true,
|
||||
..Default::default()
|
||||
})
|
||||
.run()
|
||||
.expect("Gen codes failed.");
|
||||
|
||||
// There is a message named 'Box' in oci.proto
|
||||
// so there is a struct named 'Box', we should replace Box<Self> to ::std::boxed::Box<Self>
|
||||
// to avoid the conflict.
|
||||
replace_text_in_file(
|
||||
"src/oci.rs",
|
||||
"self: Box<Self>",
|
||||
"self: ::std::boxed::Box<Self>",
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
fn replace_text_in_file(file_name: &str, from: &str, to: &str) -> Result<(), std::io::Error> {
|
||||
let new_contents = fs::read_to_string(file_name)?.replace(from, to);
|
||||
fs::write(&file_name, new_contents.as_bytes())
|
||||
}
|
||||
@@ -52,6 +52,8 @@ service AgentService {
|
||||
rpc AddARPNeighbors(AddARPNeighborsRequest) returns (google.protobuf.Empty);
|
||||
|
||||
// observability
|
||||
rpc StartTracing(StartTracingRequest) returns (google.protobuf.Empty);
|
||||
rpc StopTracing(StopTracingRequest) returns (google.protobuf.Empty);
|
||||
rpc GetMetrics(GetMetricsRequest) returns (Metrics);
|
||||
|
||||
// misc (TODO: some rpcs can be replaced by hyperstart-exec)
|
||||
@@ -490,6 +492,12 @@ message CopyFileRequest {
|
||||
bytes data = 8;
|
||||
}
|
||||
|
||||
message StartTracingRequest {
|
||||
}
|
||||
|
||||
message StopTracingRequest {
|
||||
}
|
||||
|
||||
message GetOOMEventRequest {}
|
||||
|
||||
message OOMEvent {
|
||||
@@ -8,10 +8,10 @@ edition = "2018"
|
||||
serde = "1.0.91"
|
||||
serde_json = "1.0.39"
|
||||
serde_derive = "1.0.91"
|
||||
oci = { path = "../../libs/oci" }
|
||||
protocols = { path ="../../libs/protocols" }
|
||||
oci = { path = "../oci" }
|
||||
protocols = { path ="../protocols" }
|
||||
caps = "0.5.0"
|
||||
nix = "0.23.0"
|
||||
nix = "0.21.0"
|
||||
scopeguard = "1.0.0"
|
||||
capctl = "0.2.0"
|
||||
lazy_static = "1.3.0"
|
||||
@@ -19,22 +19,18 @@ libc = "0.2.58"
|
||||
protobuf = "=2.14.0"
|
||||
slog = "2.5.2"
|
||||
slog-scope = "4.1.2"
|
||||
scan_fmt = "0.2.6"
|
||||
regex = "1.5.4"
|
||||
scan_fmt = "0.2"
|
||||
regex = "1.1"
|
||||
path-absolutize = "1.2.0"
|
||||
anyhow = "1.0.32"
|
||||
cgroups = { package = "cgroups-rs", version = "0.2.8" }
|
||||
cgroups = { package = "cgroups-rs", version = "0.2.5" }
|
||||
rlimit = "0.5.3"
|
||||
|
||||
tokio = { version = "1.2.0", features = ["sync", "io-util", "process", "time", "macros"] }
|
||||
futures = "0.3.17"
|
||||
futures = "0.3"
|
||||
async-trait = "0.1.31"
|
||||
inotify = "0.9.2"
|
||||
libseccomp = { version = "0.1.3", optional = true }
|
||||
|
||||
[dev-dependencies]
|
||||
serial_test = "0.5.0"
|
||||
tempfile = "3.1.0"
|
||||
|
||||
[features]
|
||||
seccomp = ["libseccomp"]
|
||||
|
||||
@@ -22,6 +22,7 @@ use crate::cgroups::Manager as CgroupManager;
|
||||
use crate::container::DEFAULT_DEVICES;
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use libc::{self, pid_t};
|
||||
use nix::errno::Errno;
|
||||
use oci::{
|
||||
LinuxBlockIo, LinuxCpu, LinuxDevice, LinuxDeviceCgroup, LinuxHugepageLimit, LinuxMemory,
|
||||
LinuxNetwork, LinuxPids, LinuxResources,
|
||||
@@ -174,7 +175,7 @@ impl CgroupManager for Manager {
|
||||
freezer_controller.freeze()?;
|
||||
}
|
||||
_ => {
|
||||
return Err(anyhow!(nix::Error::EINVAL));
|
||||
return Err(nix::Error::Sys(Errno::EINVAL).into());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -25,8 +25,6 @@ use crate::cgroups::mock::Manager as FsManager;
|
||||
use crate::cgroups::Manager;
|
||||
use crate::log_child;
|
||||
use crate::process::Process;
|
||||
#[cfg(feature = "seccomp")]
|
||||
use crate::seccomp;
|
||||
use crate::specconv::CreateOpts;
|
||||
use crate::{mount, validator};
|
||||
|
||||
@@ -153,7 +151,7 @@ lazy_static! {
|
||||
},
|
||||
LinuxDevice {
|
||||
path: "/dev/full".to_string(),
|
||||
r#type: "c".to_string(),
|
||||
r#type: String::from("c"),
|
||||
major: 1,
|
||||
minor: 7,
|
||||
file_mode: Some(0o666),
|
||||
@@ -419,7 +417,7 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
|
||||
ns.r#type.clone(),
|
||||
ns.path.clone()
|
||||
);
|
||||
log_child!(cfd_log, "error is : {:?}", e);
|
||||
log_child!(cfd_log, "error is : {:?}", e.as_errno());
|
||||
e
|
||||
})?;
|
||||
|
||||
@@ -496,7 +494,7 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
|
||||
log_child!(cfd_log, "join namespace {:?}", s);
|
||||
sched::setns(fd, s).or_else(|e| {
|
||||
if s == CloneFlags::CLONE_NEWUSER {
|
||||
if e != Errno::EINVAL {
|
||||
if e.as_errno().unwrap() != Errno::EINVAL {
|
||||
let _ = write_sync(cwfd, SYNC_FAILED, format!("{:?}", e).as_str());
|
||||
return Err(e);
|
||||
}
|
||||
@@ -595,30 +593,11 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
|
||||
})?;
|
||||
}
|
||||
|
||||
// NoNewPrivileges
|
||||
// NoNewPeiviledges, Drop capabilities
|
||||
if oci_process.no_new_privileges {
|
||||
capctl::prctl::set_no_new_privs().map_err(|_| anyhow!("cannot set no new privileges"))?;
|
||||
}
|
||||
|
||||
// Log unknown seccomp system calls in advance before the log file descriptor closes.
|
||||
#[cfg(feature = "seccomp")]
|
||||
if let Some(ref scmp) = linux.seccomp {
|
||||
if let Some(syscalls) = seccomp::get_unknown_syscalls(scmp) {
|
||||
log_child!(cfd_log, "unknown seccomp system calls: {:?}", syscalls);
|
||||
}
|
||||
}
|
||||
|
||||
// Without NoNewPrivileges, we need to set seccomp
|
||||
// before dropping capabilities because the calling thread
|
||||
// must have the CAP_SYS_ADMIN.
|
||||
#[cfg(feature = "seccomp")]
|
||||
if !oci_process.no_new_privileges {
|
||||
if let Some(ref scmp) = linux.seccomp {
|
||||
seccomp::init_seccomp(scmp)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Drop capabilities
|
||||
if oci_process.capabilities.is_some() {
|
||||
let c = oci_process.capabilities.as_ref().unwrap();
|
||||
capabilities::drop_privileges(cfd_log, c)?;
|
||||
@@ -644,10 +623,11 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
|
||||
|
||||
// setup the envs
|
||||
for e in env.iter() {
|
||||
match valid_env(e) {
|
||||
Some((key, value)) => env::set_var(key, value),
|
||||
None => log_child!(cfd_log, "invalid env key-value: {:?}", e),
|
||||
let v: Vec<&str> = e.splitn(2, '=').collect();
|
||||
if v.len() != 2 {
|
||||
continue;
|
||||
}
|
||||
env::set_var(v[0], v[1]);
|
||||
}
|
||||
|
||||
// set the "HOME" env getting from "/etc/passwd", if
|
||||
@@ -661,7 +641,7 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
|
||||
let exec_file = Path::new(&args[0]);
|
||||
log_child!(cfd_log, "process command: {:?}", &args);
|
||||
if !exec_file.exists() {
|
||||
find_file(exec_file).ok_or_else(|| anyhow!("the file {} was not found", &args[0]))?;
|
||||
find_file(exec_file).ok_or_else(|| anyhow!("the file {} is not exist", &args[0]))?;
|
||||
}
|
||||
|
||||
// notify parent that the child's ready to start
|
||||
@@ -671,8 +651,8 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
|
||||
let _ = unistd::close(crfd);
|
||||
let _ = unistd::close(cwfd);
|
||||
|
||||
unistd::setsid().context("create a new session")?;
|
||||
if oci_process.terminal {
|
||||
unistd::setsid()?;
|
||||
unsafe {
|
||||
libc::ioctl(0, libc::TIOCSCTTY);
|
||||
}
|
||||
@@ -689,16 +669,6 @@ fn do_init_child(cwfd: RawFd) -> Result<()> {
|
||||
unistd::read(fd, &mut buf)?;
|
||||
}
|
||||
|
||||
// With NoNewPrivileges, we should set seccomp as close to
|
||||
// do_exec as possible in order to reduce the amount of
|
||||
// system calls in the seccomp profiles.
|
||||
#[cfg(feature = "seccomp")]
|
||||
if oci_process.no_new_privileges {
|
||||
if let Some(ref scmp) = linux.seccomp {
|
||||
seccomp::init_seccomp(scmp)?;
|
||||
}
|
||||
}
|
||||
|
||||
do_exec(&args);
|
||||
}
|
||||
|
||||
@@ -1002,6 +972,8 @@ impl BaseContainer for LinuxContainer {
|
||||
|
||||
info!(logger, "entered namespaces!");
|
||||
|
||||
self.created = SystemTime::now();
|
||||
|
||||
if p.init {
|
||||
let spec = self.config.spec.as_mut().unwrap();
|
||||
update_namespaces(&self.logger, spec, p.pid)?;
|
||||
@@ -1116,8 +1088,10 @@ fn do_exec(args: &[String]) -> ! {
|
||||
.collect();
|
||||
|
||||
let _ = unistd::execvp(p.as_c_str(), &sa).map_err(|e| match e {
|
||||
nix::Error::UnknownErrno => std::process::exit(-2),
|
||||
_ => std::process::exit(e as i32),
|
||||
nix::Error::Sys(errno) => {
|
||||
std::process::exit(errno as i32);
|
||||
}
|
||||
_ => std::process::exit(-2),
|
||||
});
|
||||
|
||||
unreachable!()
|
||||
@@ -1163,7 +1137,7 @@ fn get_pid_namespace(logger: &Logger, linux: &Linux) -> Result<Option<RawFd>> {
|
||||
ns.r#type.clone(),
|
||||
ns.path.clone()
|
||||
);
|
||||
error!(logger, "error is : {:?}", e);
|
||||
error!(logger, "error is : {:?}", e.as_errno());
|
||||
|
||||
e
|
||||
})?;
|
||||
@@ -1396,13 +1370,13 @@ impl LinuxContainer {
|
||||
.context(format!("cannot change onwer of container {} root", id))?;
|
||||
|
||||
if config.spec.is_none() {
|
||||
return Err(anyhow!(nix::Error::EINVAL));
|
||||
return Err(nix::Error::Sys(Errno::EINVAL).into());
|
||||
}
|
||||
|
||||
let spec = config.spec.as_ref().unwrap();
|
||||
|
||||
if spec.linux.is_none() {
|
||||
return Err(anyhow!(nix::Error::EINVAL));
|
||||
return Err(nix::Error::Sys(Errno::EINVAL).into());
|
||||
}
|
||||
|
||||
let linux = spec.linux.as_ref().unwrap();
|
||||
@@ -1479,7 +1453,7 @@ async fn execute_hook(logger: &Logger, h: &Hook, st: &OCIState) -> Result<()> {
|
||||
let binary = PathBuf::from(h.path.as_str());
|
||||
let path = binary.canonicalize()?;
|
||||
if !path.exists() {
|
||||
return Err(anyhow!(nix::Error::EINVAL));
|
||||
return Err(anyhow!(nix::Error::from_errno(Errno::EINVAL)));
|
||||
}
|
||||
|
||||
let args = h.args.clone();
|
||||
@@ -1548,7 +1522,7 @@ async fn execute_hook(logger: &Logger, h: &Hook, st: &OCIState) -> Result<()> {
|
||||
|
||||
if code != 0 {
|
||||
error!(logger, "hook {} exit status is {}", &path, code);
|
||||
return Err(anyhow!(nix::Error::UnknownErrno));
|
||||
return Err(anyhow!(nix::Error::from_errno(Errno::UnknownErrno)));
|
||||
}
|
||||
|
||||
debug!(logger, "hook {} exit status is 0", &path);
|
||||
@@ -1564,34 +1538,10 @@ async fn execute_hook(logger: &Logger, h: &Hook, st: &OCIState) -> Result<()> {
|
||||
|
||||
match tokio::time::timeout(Duration::new(timeout, 0), join_handle).await {
|
||||
Ok(r) => r.unwrap(),
|
||||
Err(_) => Err(anyhow!(nix::Error::ETIMEDOUT)),
|
||||
Err(_) => Err(anyhow!(nix::Error::from_errno(Errno::ETIMEDOUT))),
|
||||
}
|
||||
}
|
||||
|
||||
// valid environment variables according to https://doc.rust-lang.org/std/env/fn.set_var.html#panics
|
||||
fn valid_env(e: &str) -> Option<(&str, &str)> {
|
||||
// wherther key or value will contain NULL char.
|
||||
if e.as_bytes().contains(&b'\0') {
|
||||
return None;
|
||||
}
|
||||
|
||||
let v: Vec<&str> = e.splitn(2, '=').collect();
|
||||
|
||||
// key can't hold an `equal` sign, but value can
|
||||
if v.len() != 2 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let (key, value) = (v[0].trim(), v[1].trim());
|
||||
|
||||
// key can't be empty
|
||||
if key.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some((key, value))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -1670,7 +1620,7 @@ mod tests {
|
||||
)
|
||||
.await;
|
||||
|
||||
let expected_err = nix::Error::ETIMEDOUT;
|
||||
let expected_err = nix::Error::from_errno(Errno::ETIMEDOUT);
|
||||
assert_eq!(
|
||||
res.unwrap_err().downcast::<nix::Error>().unwrap(),
|
||||
expected_err
|
||||
@@ -2015,49 +1965,4 @@ mod tests {
|
||||
let ret = do_init_child(std::io::stdin().as_raw_fd());
|
||||
assert!(ret.is_err(), "Expecting Err, Got {:?}", ret);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_valid_env() {
|
||||
let env = valid_env("a=b=c");
|
||||
assert_eq!(Some(("a", "b=c")), env);
|
||||
|
||||
let env = valid_env("a=b");
|
||||
assert_eq!(Some(("a", "b")), env);
|
||||
let env = valid_env("a =b");
|
||||
assert_eq!(Some(("a", "b")), env);
|
||||
|
||||
let env = valid_env(" a =b");
|
||||
assert_eq!(Some(("a", "b")), env);
|
||||
|
||||
let env = valid_env("a= b");
|
||||
assert_eq!(Some(("a", "b")), env);
|
||||
|
||||
let env = valid_env("a=b ");
|
||||
assert_eq!(Some(("a", "b")), env);
|
||||
let env = valid_env("a=b c ");
|
||||
assert_eq!(Some(("a", "b c")), env);
|
||||
|
||||
let env = valid_env("=b");
|
||||
assert_eq!(None, env);
|
||||
|
||||
let env = valid_env("a=");
|
||||
assert_eq!(Some(("a", "")), env);
|
||||
|
||||
let env = valid_env("a==");
|
||||
assert_eq!(Some(("a", "=")), env);
|
||||
|
||||
let env = valid_env("a");
|
||||
assert_eq!(None, env);
|
||||
|
||||
let invalid_str = vec![97, b'\0', 98];
|
||||
let invalid_string = std::str::from_utf8(&invalid_str).unwrap();
|
||||
|
||||
let invalid_env = format!("{}=value", invalid_string);
|
||||
let env = valid_env(&invalid_env);
|
||||
assert_eq!(None, env);
|
||||
|
||||
let invalid_env = format!("key={}", invalid_string);
|
||||
let env = valid_env(&invalid_env);
|
||||
assert_eq!(None, env);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -34,8 +34,6 @@ pub mod container;
|
||||
pub mod mount;
|
||||
pub mod pipestream;
|
||||
pub mod process;
|
||||
#[cfg(feature = "seccomp")]
|
||||
pub mod seccomp;
|
||||
pub mod specconv;
|
||||
pub mod sync;
|
||||
pub mod sync_with_async;
|
||||
|
||||
@@ -3,8 +3,9 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use libc::uid_t;
|
||||
use nix::errno::Errno;
|
||||
use nix::fcntl::{self, OFlag};
|
||||
#[cfg(not(test))]
|
||||
use nix::mount;
|
||||
@@ -18,7 +19,7 @@ use std::fs::{self, OpenOptions};
|
||||
use std::mem::MaybeUninit;
|
||||
use std::os::unix;
|
||||
use std::os::unix::io::RawFd;
|
||||
use std::path::{Component, Path, PathBuf};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use path_absolutize::*;
|
||||
use std::fs::File;
|
||||
@@ -34,9 +35,17 @@ use crate::log_child;
|
||||
// struct is populated from the content in the /proc/<pid>/mountinfo file.
|
||||
#[derive(std::fmt::Debug)]
|
||||
pub struct Info {
|
||||
id: i32,
|
||||
parent: i32,
|
||||
major: i32,
|
||||
minor: i32,
|
||||
root: String,
|
||||
mount_point: String,
|
||||
opts: String,
|
||||
optional: String,
|
||||
fstype: String,
|
||||
source: String,
|
||||
vfs_opts: String,
|
||||
}
|
||||
|
||||
const MOUNTINFOFORMAT: &str = "{d} {d} {d}:{d} {} {} {} {}";
|
||||
@@ -103,7 +112,6 @@ lazy_static! {
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[cfg(not(test))]
|
||||
pub fn mount<
|
||||
P1: ?Sized + NixPath,
|
||||
P2: ?Sized + NixPath,
|
||||
@@ -116,42 +124,21 @@ pub fn mount<
|
||||
flags: MsFlags,
|
||||
data: Option<&P4>,
|
||||
) -> std::result::Result<(), nix::Error> {
|
||||
mount::mount(source, target, fstype, flags, data)
|
||||
#[cfg(not(test))]
|
||||
return mount::mount(source, target, fstype, flags, data);
|
||||
#[cfg(test)]
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[cfg(test)]
|
||||
pub fn mount<
|
||||
P1: ?Sized + NixPath,
|
||||
P2: ?Sized + NixPath,
|
||||
P3: ?Sized + NixPath,
|
||||
P4: ?Sized + NixPath,
|
||||
>(
|
||||
_source: Option<&P1>,
|
||||
_target: &P2,
|
||||
_fstype: Option<&P3>,
|
||||
_flags: MsFlags,
|
||||
_data: Option<&P4>,
|
||||
) -> std::result::Result<(), nix::Error> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[cfg(not(test))]
|
||||
pub fn umount2<P: ?Sized + NixPath>(
|
||||
target: &P,
|
||||
flags: MntFlags,
|
||||
) -> std::result::Result<(), nix::Error> {
|
||||
mount::umount2(target, flags)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[cfg(test)]
|
||||
pub fn umount2<P: ?Sized + NixPath>(
|
||||
_target: &P,
|
||||
_flags: MntFlags,
|
||||
) -> std::result::Result<(), nix::Error> {
|
||||
Ok(())
|
||||
#[cfg(not(test))]
|
||||
return mount::umount2(target, flags);
|
||||
#[cfg(test)]
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
pub fn init_rootfs(
|
||||
@@ -463,20 +450,14 @@ fn mount_cgroups(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(not(test))]
|
||||
fn pivot_root<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
|
||||
new_root: &P1,
|
||||
put_old: &P2,
|
||||
) -> anyhow::Result<(), nix::Error> {
|
||||
unistd::pivot_root(new_root, put_old)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn pivot_root<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
|
||||
_new_root: &P1,
|
||||
_put_old: &P2,
|
||||
) -> anyhow::Result<(), nix::Error> {
|
||||
Ok(())
|
||||
#[cfg(not(test))]
|
||||
return unistd::pivot_root(new_root, put_old);
|
||||
#[cfg(test)]
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
pub fn pivot_rootfs<P: ?Sized + NixPath + std::fmt::Debug>(path: &P) -> Result<()> {
|
||||
@@ -554,20 +535,7 @@ fn parse_mount_table() -> Result<Vec<Info>> {
|
||||
for (_index, line) in reader.lines().enumerate() {
|
||||
let line = line?;
|
||||
|
||||
//Example mountinfo format:
|
||||
// id
|
||||
// | / parent
|
||||
// | | / major:minor
|
||||
// | | | / root
|
||||
// | | | | / mount_point
|
||||
// | | | | | / opts
|
||||
// | | | | | | / optional
|
||||
// | | | | | | | / fstype
|
||||
// | | | | | | | | / source
|
||||
// | | | | | | | | | / vfs_opts
|
||||
// 22 96 0:21 / /sys rw,nosuid,nodev,noexec,relatime shared:2 - sysfs sysfs rw,seclabel
|
||||
|
||||
let (_id, _parent, _major, _minor, _root, mount_point, _opts, optional) = scan_fmt!(
|
||||
let (id, parent, major, minor, root, mount_point, opts, optional) = scan_fmt!(
|
||||
&line,
|
||||
MOUNTINFOFORMAT,
|
||||
i32,
|
||||
@@ -582,7 +550,7 @@ fn parse_mount_table() -> Result<Vec<Info>> {
|
||||
|
||||
let fields: Vec<&str> = line.split(" - ").collect();
|
||||
if fields.len() == 2 {
|
||||
let (fstype, _source, _vfs_opts) =
|
||||
let (fstype, source, vfs_opts) =
|
||||
scan_fmt!(fields[1], "{} {} {}", String, String, String)?;
|
||||
|
||||
let mut optional_new = String::new();
|
||||
@@ -591,9 +559,17 @@ fn parse_mount_table() -> Result<Vec<Info>> {
|
||||
}
|
||||
|
||||
let info = Info {
|
||||
id,
|
||||
parent,
|
||||
major,
|
||||
minor,
|
||||
root,
|
||||
mount_point,
|
||||
opts,
|
||||
optional: optional_new,
|
||||
fstype,
|
||||
source,
|
||||
vfs_opts,
|
||||
};
|
||||
|
||||
infos.push(info);
|
||||
@@ -606,15 +582,11 @@ fn parse_mount_table() -> Result<Vec<Info>> {
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[cfg(not(test))]
|
||||
fn chroot<P: ?Sized + NixPath>(path: &P) -> Result<(), nix::Error> {
|
||||
unistd::chroot(path)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
#[cfg(test)]
|
||||
fn chroot<P: ?Sized + NixPath>(_path: &P) -> Result<(), nix::Error> {
|
||||
Ok(())
|
||||
#[cfg(not(test))]
|
||||
return unistd::chroot(path);
|
||||
#[cfg(test)]
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
pub fn ms_move_root(rootfs: &str) -> Result<bool> {
|
||||
@@ -651,7 +623,7 @@ pub fn ms_move_root(rootfs: &str) -> Result<bool> {
|
||||
None::<&str>,
|
||||
)?;
|
||||
umount2(abs_mount_point, MntFlags::MNT_DETACH).or_else(|e| {
|
||||
if e.ne(&nix::Error::EINVAL) && e.ne(&nix::Error::EPERM) {
|
||||
if e.ne(&nix::Error::from(Errno::EINVAL)) && e.ne(&nix::Error::from(Errno::EPERM)) {
|
||||
return Err(anyhow!(e));
|
||||
}
|
||||
|
||||
@@ -773,7 +745,7 @@ fn mount_from(
|
||||
let _ = fs::create_dir_all(&dir).map_err(|e| {
|
||||
log_child!(
|
||||
cfd_log,
|
||||
"create dir {}: {}",
|
||||
"creat dir {}: {}",
|
||||
dir.to_str().unwrap(),
|
||||
e.to_string()
|
||||
)
|
||||
@@ -794,8 +766,14 @@ fn mount_from(
|
||||
}
|
||||
};
|
||||
|
||||
let _ = stat::stat(dest.as_str())
|
||||
.map_err(|e| log_child!(cfd_log, "dest stat error. {}: {:?}", dest.as_str(), e));
|
||||
let _ = stat::stat(dest.as_str()).map_err(|e| {
|
||||
log_child!(
|
||||
cfd_log,
|
||||
"dest stat error. {}: {:?}",
|
||||
dest.as_str(),
|
||||
e.as_errno()
|
||||
)
|
||||
});
|
||||
|
||||
mount(
|
||||
Some(src.as_str()),
|
||||
@@ -805,7 +783,7 @@ fn mount_from(
|
||||
Some(d.as_str()),
|
||||
)
|
||||
.map_err(|e| {
|
||||
log_child!(cfd_log, "mount error: {:?}", e);
|
||||
log_child!(cfd_log, "mount error: {:?}", e.as_errno());
|
||||
e
|
||||
})?;
|
||||
|
||||
@@ -827,7 +805,7 @@ fn mount_from(
|
||||
None::<&str>,
|
||||
)
|
||||
.map_err(|e| {
|
||||
log_child!(cfd_log, "remout {}: {:?}", dest.as_str(), e);
|
||||
log_child!(cfd_log, "remout {}: {:?}", dest.as_str(), e.as_errno());
|
||||
e
|
||||
})?;
|
||||
}
|
||||
@@ -850,35 +828,18 @@ fn default_symlinks() -> Result<()> {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn dev_rel_path(path: &str) -> Option<&Path> {
|
||||
let path = Path::new(path);
|
||||
|
||||
if !path.starts_with("/dev")
|
||||
|| path == Path::new("/dev")
|
||||
|| path.components().any(|c| c == Component::ParentDir)
|
||||
{
|
||||
return None;
|
||||
}
|
||||
path.strip_prefix("/").ok()
|
||||
}
|
||||
|
||||
fn create_devices(devices: &[LinuxDevice], bind: bool) -> Result<()> {
|
||||
let op: fn(&LinuxDevice, &Path) -> Result<()> = if bind { bind_dev } else { mknod_dev };
|
||||
let op: fn(&LinuxDevice) -> Result<()> = if bind { bind_dev } else { mknod_dev };
|
||||
let old = stat::umask(Mode::from_bits_truncate(0o000));
|
||||
for dev in DEFAULT_DEVICES.iter() {
|
||||
let path = Path::new(&dev.path[1..]);
|
||||
op(dev, path).context(format!("Creating container device {:?}", dev))?;
|
||||
op(dev)?;
|
||||
}
|
||||
for dev in devices {
|
||||
let path = dev_rel_path(&dev.path).ok_or_else(|| {
|
||||
if !dev.path.starts_with("/dev") || dev.path.contains("..") {
|
||||
let msg = format!("{} is not a valid device path", dev.path);
|
||||
anyhow!(msg)
|
||||
})?;
|
||||
if let Some(dir) = path.parent() {
|
||||
fs::create_dir_all(dir).context(format!("Creating container device {:?}", dev))?;
|
||||
bail!(anyhow!(msg));
|
||||
}
|
||||
op(dev, path).context(format!("Creating container device {:?}", dev))?;
|
||||
op(dev)?;
|
||||
}
|
||||
stat::umask(old);
|
||||
Ok(())
|
||||
@@ -900,21 +861,21 @@ lazy_static! {
|
||||
};
|
||||
}
|
||||
|
||||
fn mknod_dev(dev: &LinuxDevice, relpath: &Path) -> Result<()> {
|
||||
fn mknod_dev(dev: &LinuxDevice) -> Result<()> {
|
||||
let f = match LINUXDEVICETYPE.get(dev.r#type.as_str()) {
|
||||
Some(v) => v,
|
||||
None => return Err(anyhow!("invalid spec".to_string())),
|
||||
};
|
||||
|
||||
stat::mknod(
|
||||
relpath,
|
||||
&dev.path[1..],
|
||||
*f,
|
||||
Mode::from_bits_truncate(dev.file_mode.unwrap_or(0)),
|
||||
nix::sys::stat::makedev(dev.major as u64, dev.minor as u64),
|
||||
)?;
|
||||
|
||||
unistd::chown(
|
||||
relpath,
|
||||
&dev.path[1..],
|
||||
Some(Uid::from_raw(dev.uid.unwrap_or(0) as uid_t)),
|
||||
Some(Gid::from_raw(dev.gid.unwrap_or(0) as uid_t)),
|
||||
)?;
|
||||
@@ -922,9 +883,9 @@ fn mknod_dev(dev: &LinuxDevice, relpath: &Path) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn bind_dev(dev: &LinuxDevice, relpath: &Path) -> Result<()> {
|
||||
fn bind_dev(dev: &LinuxDevice) -> Result<()> {
|
||||
let fd = fcntl::open(
|
||||
relpath,
|
||||
&dev.path[1..],
|
||||
OFlag::O_RDWR | OFlag::O_CREAT,
|
||||
Mode::from_bits_truncate(0o644),
|
||||
)?;
|
||||
@@ -933,7 +894,7 @@ fn bind_dev(dev: &LinuxDevice, relpath: &Path) -> Result<()> {
|
||||
|
||||
mount(
|
||||
Some(&*dev.path),
|
||||
relpath,
|
||||
&dev.path[1..],
|
||||
None::<&str>,
|
||||
MsFlags::MS_BIND,
|
||||
None::<&str>,
|
||||
@@ -996,7 +957,7 @@ pub fn finish_rootfs(cfd_log: RawFd, spec: &Spec, process: &Process) -> Result<(
|
||||
|
||||
fn mask_path(path: &str) -> Result<()> {
|
||||
if !path.starts_with('/') || path.contains("..") {
|
||||
return Err(anyhow!(nix::Error::EINVAL));
|
||||
return Err(nix::Error::Sys(Errno::EINVAL).into());
|
||||
}
|
||||
|
||||
match mount(
|
||||
@@ -1006,30 +967,49 @@ fn mask_path(path: &str) -> Result<()> {
|
||||
MsFlags::MS_BIND,
|
||||
None::<&str>,
|
||||
) {
|
||||
Err(e) => match e {
|
||||
nix::Error::ENOENT | nix::Error::ENOTDIR => Ok(()),
|
||||
_ => Err(e.into()),
|
||||
},
|
||||
Ok(_) => Ok(()),
|
||||
Err(nix::Error::Sys(e)) => {
|
||||
if e != Errno::ENOENT && e != Errno::ENOTDIR {
|
||||
//info!("{}: {}", path, e.desc());
|
||||
return Err(nix::Error::Sys(e).into());
|
||||
}
|
||||
}
|
||||
|
||||
Err(e) => {
|
||||
return Err(e.into());
|
||||
}
|
||||
|
||||
Ok(_) => {}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn readonly_path(path: &str) -> Result<()> {
|
||||
if !path.starts_with('/') || path.contains("..") {
|
||||
return Err(anyhow!(nix::Error::EINVAL));
|
||||
return Err(nix::Error::Sys(Errno::EINVAL).into());
|
||||
}
|
||||
|
||||
if let Err(e) = mount(
|
||||
match mount(
|
||||
Some(&path[1..]),
|
||||
path,
|
||||
None::<&str>,
|
||||
MsFlags::MS_BIND | MsFlags::MS_REC,
|
||||
None::<&str>,
|
||||
) {
|
||||
match e {
|
||||
nix::Error::ENOENT => return Ok(()),
|
||||
_ => return Err(e.into()),
|
||||
};
|
||||
Err(nix::Error::Sys(e)) => {
|
||||
if e == Errno::ENOENT {
|
||||
return Ok(());
|
||||
} else {
|
||||
//info!("{}: {}", path, e.desc());
|
||||
return Err(nix::Error::Sys(e).into());
|
||||
}
|
||||
}
|
||||
|
||||
Err(e) => {
|
||||
return Err(e.into());
|
||||
}
|
||||
|
||||
Ok(_) => {}
|
||||
}
|
||||
|
||||
mount(
|
||||
@@ -1278,12 +1258,11 @@ mod tests {
|
||||
uid: Some(unistd::getuid().as_raw()),
|
||||
gid: Some(unistd::getgid().as_raw()),
|
||||
};
|
||||
let path = Path::new("fifo");
|
||||
|
||||
let ret = mknod_dev(&dev, path);
|
||||
let ret = mknod_dev(&dev);
|
||||
assert!(ret.is_ok(), "Should pass. Got: {:?}", ret);
|
||||
|
||||
let ret = stat::stat(path);
|
||||
let ret = stat::stat("fifo");
|
||||
assert!(ret.is_ok(), "Should pass. Got: {:?}", ret);
|
||||
}
|
||||
#[test]
|
||||
@@ -1400,26 +1379,4 @@ mod tests {
|
||||
assert!(result == t.result, "{}", msg);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dev_rel_path() {
|
||||
// Valid device paths
|
||||
assert_eq!(dev_rel_path("/dev/sda").unwrap(), Path::new("dev/sda"));
|
||||
assert_eq!(dev_rel_path("//dev/sda").unwrap(), Path::new("dev/sda"));
|
||||
assert_eq!(
|
||||
dev_rel_path("/dev/vfio/99").unwrap(),
|
||||
Path::new("dev/vfio/99")
|
||||
);
|
||||
assert_eq!(dev_rel_path("/dev/...").unwrap(), Path::new("dev/..."));
|
||||
assert_eq!(dev_rel_path("/dev/a..b").unwrap(), Path::new("dev/a..b"));
|
||||
assert_eq!(dev_rel_path("/dev//foo").unwrap(), Path::new("dev/foo"));
|
||||
|
||||
// Bad device paths
|
||||
assert!(dev_rel_path("/devfoo").is_none());
|
||||
assert!(dev_rel_path("/etc/passwd").is_none());
|
||||
assert!(dev_rel_path("/dev/../etc/passwd").is_none());
|
||||
assert!(dev_rel_path("dev/foo").is_none());
|
||||
assert!(dev_rel_path("").is_none());
|
||||
assert!(dev_rel_path("/dev").is_none());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,7 +30,7 @@ impl io::Read for &StreamFd {
|
||||
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
||||
match unistd::read(self.0, buf) {
|
||||
Ok(l) => Ok(l),
|
||||
Err(e) => Err(e.into()),
|
||||
Err(e) => Err(e.as_errno().unwrap().into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -39,7 +39,7 @@ impl io::Write for &StreamFd {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
match unistd::write(self.0, buf) {
|
||||
Ok(l) => Ok(l),
|
||||
Err(e) => Err(e.into()),
|
||||
Err(e) => Err(e.as_errno().unwrap().into()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -52,7 +52,7 @@ impl StreamFd {
|
||||
fn close(&mut self) -> io::Result<()> {
|
||||
match unistd::close(self.0) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(e) => Err(e.into()),
|
||||
Err(e) => Err(e.as_errno().unwrap().into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,16 +24,6 @@ use tokio::io::{split, ReadHalf, WriteHalf};
|
||||
use tokio::sync::Mutex;
|
||||
use tokio::sync::Notify;
|
||||
|
||||
macro_rules! close_process_stream {
|
||||
($self: ident, $stream:ident, $stream_type: ident) => {
|
||||
if $self.$stream.is_some() {
|
||||
$self.close_stream(StreamType::$stream_type);
|
||||
let _ = unistd::close($self.$stream.unwrap());
|
||||
$self.$stream = None;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
|
||||
pub enum StreamType {
|
||||
Stdin,
|
||||
@@ -157,22 +147,6 @@ impl Process {
|
||||
notify.notify_one();
|
||||
}
|
||||
|
||||
pub fn close_stdin(&mut self) {
|
||||
close_process_stream!(self, term_master, TermMaster);
|
||||
close_process_stream!(self, parent_stdin, ParentStdin);
|
||||
|
||||
self.notify_term_close();
|
||||
}
|
||||
|
||||
pub fn cleanup_process_stream(&mut self) {
|
||||
close_process_stream!(self, parent_stdin, ParentStdin);
|
||||
close_process_stream!(self, parent_stdout, ParentStdout);
|
||||
close_process_stream!(self, parent_stderr, ParentStderr);
|
||||
close_process_stream!(self, term_master, TermMaster);
|
||||
|
||||
self.notify_term_close();
|
||||
}
|
||||
|
||||
fn get_fd(&self, stream_type: &StreamType) -> Option<RawFd> {
|
||||
match stream_type {
|
||||
StreamType::Stdin => self.stdin,
|
||||
|
||||
@@ -1,272 +0,0 @@
|
||||
// Copyright 2021 Sony Group Corporation
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use libseccomp::*;
|
||||
use oci::{LinuxSeccomp, LinuxSeccompArg};
|
||||
use std::str::FromStr;
|
||||
|
||||
fn get_filter_attr_from_flag(flag: &str) -> Result<ScmpFilterAttr> {
|
||||
match flag {
|
||||
"SECCOMP_FILTER_FLAG_TSYNC" => Ok(ScmpFilterAttr::CtlTsync),
|
||||
"SECCOMP_FILTER_FLAG_LOG" => Ok(ScmpFilterAttr::CtlLog),
|
||||
"SECCOMP_FILTER_FLAG_SPEC_ALLOW" => Ok(ScmpFilterAttr::CtlSsb),
|
||||
_ => Err(anyhow!("Invalid seccomp flag")),
|
||||
}
|
||||
}
|
||||
|
||||
// get_rule_conditions gets rule conditions for a system call from the args.
|
||||
fn get_rule_conditions(args: &[LinuxSeccompArg]) -> Result<Vec<ScmpArgCompare>> {
|
||||
let mut conditions: Vec<ScmpArgCompare> = Vec::new();
|
||||
|
||||
for arg in args {
|
||||
if arg.op.is_empty() {
|
||||
return Err(anyhow!("seccomp opreator is required"));
|
||||
}
|
||||
|
||||
let cond = ScmpArgCompare::new(
|
||||
arg.index,
|
||||
ScmpCompareOp::from_str(&arg.op)?,
|
||||
arg.value,
|
||||
Some(arg.value_two),
|
||||
);
|
||||
|
||||
conditions.push(cond);
|
||||
}
|
||||
|
||||
Ok(conditions)
|
||||
}
|
||||
|
||||
pub fn get_unknown_syscalls(scmp: &LinuxSeccomp) -> Option<Vec<String>> {
|
||||
let mut unknown_syscalls: Vec<String> = Vec::new();
|
||||
|
||||
for syscall in &scmp.syscalls {
|
||||
for name in &syscall.names {
|
||||
if get_syscall_from_name(name, None).is_err() {
|
||||
unknown_syscalls.push(name.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if unknown_syscalls.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(unknown_syscalls)
|
||||
}
|
||||
}
|
||||
|
||||
// init_seccomp creates a seccomp filter and loads it for the current process
|
||||
// including all the child processes.
|
||||
pub fn init_seccomp(scmp: &LinuxSeccomp) -> Result<()> {
|
||||
let def_action = ScmpAction::from_str(scmp.default_action.as_str(), Some(libc::EPERM as u32))?;
|
||||
|
||||
// Create a new filter context
|
||||
let mut filter = ScmpFilterContext::new_filter(def_action)?;
|
||||
|
||||
// Add extra architectures
|
||||
for arch in &scmp.architectures {
|
||||
let scmp_arch = ScmpArch::from_str(arch)?;
|
||||
filter.add_arch(scmp_arch)?;
|
||||
}
|
||||
|
||||
// Unset no new privileges bit
|
||||
filter.set_no_new_privs_bit(false)?;
|
||||
|
||||
// Add a rule for each system call
|
||||
for syscall in &scmp.syscalls {
|
||||
if syscall.names.is_empty() {
|
||||
return Err(anyhow!("syscall name is required"));
|
||||
}
|
||||
|
||||
let action = ScmpAction::from_str(&syscall.action, Some(syscall.errno_ret))?;
|
||||
if action == def_action {
|
||||
continue;
|
||||
}
|
||||
|
||||
for name in &syscall.names {
|
||||
let syscall_num = match get_syscall_from_name(name, None) {
|
||||
Ok(num) => num,
|
||||
Err(_) => {
|
||||
// If we cannot resolve the given system call, we assume it is not supported
|
||||
// by the kernel. Hence, we skip it without generating an error.
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
if syscall.args.is_empty() {
|
||||
filter.add_rule(action, syscall_num, None)?;
|
||||
} else {
|
||||
let conditions = get_rule_conditions(&syscall.args)?;
|
||||
filter.add_rule(action, syscall_num, Some(&conditions))?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Set filter attributes for each seccomp flag
|
||||
for flag in &scmp.flags {
|
||||
let scmp_attr = get_filter_attr_from_flag(flag)?;
|
||||
filter.set_filter_attr(scmp_attr, 1)?;
|
||||
}
|
||||
|
||||
// Load the filter
|
||||
filter.load()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::skip_if_not_root;
|
||||
use libc::{dup3, process_vm_readv, EPERM, O_CLOEXEC};
|
||||
use std::io::Error;
|
||||
use std::ptr::null;
|
||||
|
||||
macro_rules! syscall_assert {
|
||||
($e1: expr, $e2: expr) => {
|
||||
let mut errno: i32 = 0;
|
||||
if $e1 < 0 {
|
||||
errno = -Error::last_os_error().raw_os_error().unwrap();
|
||||
}
|
||||
assert_eq!(errno, $e2);
|
||||
};
|
||||
}
|
||||
|
||||
const TEST_DATA: &str = r#"{
|
||||
"defaultAction": "SCMP_ACT_ALLOW",
|
||||
"architectures": [
|
||||
],
|
||||
"flags": [
|
||||
"SECCOMP_FILTER_FLAG_LOG"
|
||||
],
|
||||
"syscalls": [
|
||||
{
|
||||
"names": [
|
||||
"dup3",
|
||||
"invalid_syscall1",
|
||||
"invalid_syscall2"
|
||||
],
|
||||
"action": "SCMP_ACT_ERRNO"
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"process_vm_readv"
|
||||
],
|
||||
"action": "SCMP_ACT_ERRNO",
|
||||
"errnoRet": 111,
|
||||
"args": [
|
||||
{
|
||||
"index": 0,
|
||||
"value": 10,
|
||||
"op": "SCMP_CMP_EQ"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"process_vm_readv"
|
||||
],
|
||||
"action": "SCMP_ACT_ERRNO",
|
||||
"errnoRet": 111,
|
||||
"args": [
|
||||
{
|
||||
"index": 0,
|
||||
"value": 20,
|
||||
"op": "SCMP_CMP_EQ"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"names": [
|
||||
"process_vm_readv"
|
||||
],
|
||||
"action": "SCMP_ACT_ERRNO",
|
||||
"errnoRet": 222,
|
||||
"args": [
|
||||
{
|
||||
"index": 0,
|
||||
"value": 30,
|
||||
"op": "SCMP_CMP_EQ"
|
||||
},
|
||||
{
|
||||
"index": 2,
|
||||
"value": 40,
|
||||
"op": "SCMP_CMP_EQ"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}"#;
|
||||
|
||||
#[test]
|
||||
fn test_get_filter_attr_from_flag() {
|
||||
skip_if_not_root!();
|
||||
|
||||
assert_eq!(
|
||||
get_filter_attr_from_flag("SECCOMP_FILTER_FLAG_TSYNC").unwrap(),
|
||||
ScmpFilterAttr::CtlTsync
|
||||
);
|
||||
|
||||
assert_eq!(get_filter_attr_from_flag("ERROR").is_err(), true);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_get_unknown_syscalls() {
|
||||
let scmp: oci::LinuxSeccomp = serde_json::from_str(TEST_DATA).unwrap();
|
||||
let syscalls = get_unknown_syscalls(&scmp).unwrap();
|
||||
|
||||
assert_eq!(syscalls, vec!["invalid_syscall1", "invalid_syscall2"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_init_seccomp() {
|
||||
skip_if_not_root!();
|
||||
|
||||
let mut scmp: oci::LinuxSeccomp = serde_json::from_str(TEST_DATA).unwrap();
|
||||
let mut arch: Vec<oci::Arch>;
|
||||
|
||||
if cfg!(target_endian = "little") {
|
||||
// For little-endian architectures
|
||||
arch = vec![
|
||||
"SCMP_ARCH_X86".to_string(),
|
||||
"SCMP_ARCH_X32".to_string(),
|
||||
"SCMP_ARCH_X86_64".to_string(),
|
||||
"SCMP_ARCH_AARCH64".to_string(),
|
||||
"SCMP_ARCH_ARM".to_string(),
|
||||
"SCMP_ARCH_PPC64LE".to_string(),
|
||||
];
|
||||
} else {
|
||||
// For big-endian architectures
|
||||
arch = vec!["SCMP_ARCH_S390X".to_string()];
|
||||
}
|
||||
|
||||
scmp.architectures.append(&mut arch);
|
||||
|
||||
init_seccomp(&scmp).unwrap();
|
||||
|
||||
// Basic syscall with simple rule
|
||||
syscall_assert!(unsafe { dup3(0, 1, O_CLOEXEC) }, -EPERM);
|
||||
|
||||
// Syscall with permitted arguments
|
||||
syscall_assert!(unsafe { process_vm_readv(1, null(), 0, null(), 0, 0) }, 0);
|
||||
|
||||
// Multiple arguments with OR rules with ERRNO
|
||||
syscall_assert!(
|
||||
unsafe { process_vm_readv(10, null(), 0, null(), 0, 0) },
|
||||
-111
|
||||
);
|
||||
syscall_assert!(
|
||||
unsafe { process_vm_readv(20, null(), 0, null(), 0, 0) },
|
||||
-111
|
||||
);
|
||||
|
||||
// Multiple arguments with AND rules with ERRNO
|
||||
syscall_assert!(unsafe { process_vm_readv(30, null(), 0, null(), 0, 0) }, 0);
|
||||
syscall_assert!(
|
||||
unsafe { process_vm_readv(30, null(), 40, null(), 0, 0) },
|
||||
-222
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -3,6 +3,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
use nix::errno::Errno;
|
||||
use nix::unistd;
|
||||
use std::mem;
|
||||
use std::os::unix::io::RawFd;
|
||||
@@ -40,7 +41,7 @@ pub fn write_count(fd: RawFd, buf: &[u8], count: usize) -> Result<usize> {
|
||||
}
|
||||
|
||||
Err(e) => {
|
||||
if e != nix::Error::EINTR {
|
||||
if e != nix::Error::from_errno(Errno::EINTR) {
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
@@ -64,7 +65,7 @@ fn read_count(fd: RawFd, count: usize) -> Result<Vec<u8>> {
|
||||
}
|
||||
|
||||
Err(e) => {
|
||||
if e != nix::Error::EINTR {
|
||||
if e != nix::Error::from_errno(Errno::EINTR) {
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,12 +5,13 @@
|
||||
|
||||
use crate::container::Config;
|
||||
use anyhow::{anyhow, Context, Error, Result};
|
||||
use nix::errno::Errno;
|
||||
use oci::{Linux, LinuxIdMapping, LinuxNamespace, Spec};
|
||||
use std::collections::HashMap;
|
||||
use std::path::{Component, PathBuf};
|
||||
|
||||
fn einval() -> Error {
|
||||
anyhow!(nix::Error::EINVAL)
|
||||
anyhow!(nix::Error::from_errno(Errno::EINVAL))
|
||||
}
|
||||
|
||||
fn get_linux(oci: &Spec) -> Result<&Linux> {
|
||||
|
||||
@@ -1,39 +0,0 @@
|
||||
# This is an agent configuration file example.
|
||||
dev_mode = true
|
||||
server_addr = 'vsock://8:2048'
|
||||
|
||||
[endpoints]
|
||||
# All endpoints are allowed
|
||||
allowed = [
|
||||
"AddARPNeighborsRequest",
|
||||
"AddSwapRequest",
|
||||
"CloseStdinRequest",
|
||||
"CopyFileRequest",
|
||||
"CreateContainerRequest",
|
||||
"CreateSandboxRequest",
|
||||
"DestroySandboxRequest",
|
||||
"ExecProcessRequest",
|
||||
"GetMetricsRequest",
|
||||
"GetOOMEventRequest",
|
||||
"GuestDetailsRequest",
|
||||
"ListInterfacesRequest",
|
||||
"ListRoutesRequest",
|
||||
"MemHotplugByProbeRequest",
|
||||
"OnlineCPUMemRequest",
|
||||
"PauseContainerRequest",
|
||||
"PullImageRequest",
|
||||
"ReadStreamRequest",
|
||||
"RemoveContainerRequest",
|
||||
"ReseedRandomDevRequest",
|
||||
"ResumeContainerRequest",
|
||||
"SetGuestDateTimeRequest",
|
||||
"SignalProcessRequest",
|
||||
"StartContainerRequest",
|
||||
"StatsContainerRequest",
|
||||
"TtyWinResizeRequest",
|
||||
"UpdateContainerRequest",
|
||||
"UpdateInterfaceRequest",
|
||||
"UpdateRoutesRequest",
|
||||
"WaitProcessRequest",
|
||||
"WriteStreamRequest"
|
||||
]
|
||||
@@ -2,13 +2,10 @@
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
use crate::rpc;
|
||||
use crate::tracer;
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use serde::Deserialize;
|
||||
use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
use std::time;
|
||||
use tracing::instrument;
|
||||
|
||||
@@ -22,7 +19,6 @@ const DEBUG_CONSOLE_VPORT_OPTION: &str = "agent.debug_console_vport";
|
||||
const LOG_VPORT_OPTION: &str = "agent.log_vport";
|
||||
const CONTAINER_PIPE_SIZE_OPTION: &str = "agent.container_pipe_size";
|
||||
const UNIFIED_CGROUP_HIERARCHY_OPTION: &str = "agent.unified_cgroup_hierarchy";
|
||||
const CONFIG_FILE: &str = "agent.config_file";
|
||||
|
||||
const DEFAULT_LOG_LEVEL: slog::Level = slog::Level::Info;
|
||||
const DEFAULT_HOTPLUG_TIMEOUT: time::Duration = time::Duration::from_secs(3);
|
||||
@@ -33,7 +29,7 @@ const VSOCK_PORT: u16 = 1024;
|
||||
// Environment variables used for development and testing
|
||||
const SERVER_ADDR_ENV_VAR: &str = "KATA_AGENT_SERVER_ADDR";
|
||||
const LOG_LEVEL_ENV_VAR: &str = "KATA_AGENT_LOG_LEVEL";
|
||||
const TRACING_ENV_VAR: &str = "KATA_AGENT_TRACING";
|
||||
const TRACE_TYPE_ENV_VAR: &str = "KATA_AGENT_TRACE_TYPE";
|
||||
|
||||
const ERR_INVALID_LOG_LEVEL: &str = "invalid log level";
|
||||
const ERR_INVALID_LOG_LEVEL_PARAM: &str = "invalid log level parameter";
|
||||
@@ -51,17 +47,6 @@ const ERR_INVALID_CONTAINER_PIPE_SIZE_PARAM: &str = "unable to parse container p
|
||||
const ERR_INVALID_CONTAINER_PIPE_SIZE_KEY: &str = "invalid container pipe size key name";
|
||||
const ERR_INVALID_CONTAINER_PIPE_NEGATIVE: &str = "container pipe size should not be negative";
|
||||
|
||||
#[derive(Debug, Default, Deserialize)]
|
||||
pub struct EndpointsConfig {
|
||||
pub allowed: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct AgentEndpoints {
|
||||
pub allowed: HashSet<String>,
|
||||
pub all_allowed: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct AgentConfig {
|
||||
pub debug_console: bool,
|
||||
@@ -73,38 +58,7 @@ pub struct AgentConfig {
|
||||
pub container_pipe_size: i32,
|
||||
pub server_addr: String,
|
||||
pub unified_cgroup_hierarchy: bool,
|
||||
pub tracing: bool,
|
||||
pub endpoints: AgentEndpoints,
|
||||
pub supports_seccomp: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct AgentConfigBuilder {
|
||||
pub debug_console: Option<bool>,
|
||||
pub dev_mode: Option<bool>,
|
||||
pub log_level: Option<String>,
|
||||
pub hotplug_timeout: Option<time::Duration>,
|
||||
pub debug_console_vport: Option<i32>,
|
||||
pub log_vport: Option<i32>,
|
||||
pub container_pipe_size: Option<i32>,
|
||||
pub server_addr: Option<String>,
|
||||
pub unified_cgroup_hierarchy: Option<bool>,
|
||||
pub tracing: Option<bool>,
|
||||
pub endpoints: Option<EndpointsConfig>,
|
||||
}
|
||||
|
||||
macro_rules! config_override {
|
||||
($builder:ident, $config:ident, $field:ident) => {
|
||||
if let Some(v) = $builder.$field {
|
||||
$config.$field = v;
|
||||
}
|
||||
};
|
||||
|
||||
($builder:ident, $config:ident, $field:ident, $func: ident) => {
|
||||
if let Some(v) = $builder.$field {
|
||||
$config.$field = $func(&v)?;
|
||||
}
|
||||
};
|
||||
pub tracing: tracer::TraceType,
|
||||
}
|
||||
|
||||
// parse_cmdline_param parse commandline parameters.
|
||||
@@ -137,8 +91,8 @@ macro_rules! parse_cmdline_param {
|
||||
};
|
||||
}
|
||||
|
||||
impl Default for AgentConfig {
|
||||
fn default() -> Self {
|
||||
impl AgentConfig {
|
||||
pub fn new() -> AgentConfig {
|
||||
AgentConfig {
|
||||
debug_console: false,
|
||||
dev_mode: false,
|
||||
@@ -149,94 +103,34 @@ impl Default for AgentConfig {
|
||||
container_pipe_size: DEFAULT_CONTAINER_PIPE_SIZE,
|
||||
server_addr: format!("{}:{}", VSOCK_ADDR, VSOCK_PORT),
|
||||
unified_cgroup_hierarchy: false,
|
||||
tracing: false,
|
||||
endpoints: Default::default(),
|
||||
supports_seccomp: rpc::have_seccomp(),
|
||||
tracing: tracer::TraceType::Disabled,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for AgentConfig {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let agent_config_builder: AgentConfigBuilder =
|
||||
toml::from_str(s).map_err(anyhow::Error::new)?;
|
||||
let mut agent_config: AgentConfig = Default::default();
|
||||
|
||||
// Overwrite default values with the configuration files ones.
|
||||
config_override!(agent_config_builder, agent_config, debug_console);
|
||||
config_override!(agent_config_builder, agent_config, dev_mode);
|
||||
config_override!(
|
||||
agent_config_builder,
|
||||
agent_config,
|
||||
log_level,
|
||||
logrus_to_slog_level
|
||||
);
|
||||
config_override!(agent_config_builder, agent_config, hotplug_timeout);
|
||||
config_override!(agent_config_builder, agent_config, debug_console_vport);
|
||||
config_override!(agent_config_builder, agent_config, log_vport);
|
||||
config_override!(agent_config_builder, agent_config, container_pipe_size);
|
||||
config_override!(agent_config_builder, agent_config, server_addr);
|
||||
config_override!(agent_config_builder, agent_config, unified_cgroup_hierarchy);
|
||||
config_override!(agent_config_builder, agent_config, tracing);
|
||||
|
||||
// Populate the allowed endpoints hash set, if we got any from the config file.
|
||||
if let Some(endpoints) = agent_config_builder.endpoints {
|
||||
for ep in endpoints.allowed {
|
||||
agent_config.endpoints.allowed.insert(ep);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(agent_config)
|
||||
}
|
||||
}
|
||||
|
||||
impl AgentConfig {
|
||||
#[instrument]
|
||||
pub fn from_cmdline(file: &str, args: Vec<String>) -> Result<AgentConfig> {
|
||||
// If config file specified in the args, generate our config from it
|
||||
let config_position = args.iter().position(|a| a == "--config" || a == "-c");
|
||||
if let Some(config_position) = config_position {
|
||||
if let Some(config_file) = args.get(config_position + 1) {
|
||||
return AgentConfig::from_config_file(config_file);
|
||||
} else {
|
||||
panic!("The config argument wasn't formed properly: {:?}", args);
|
||||
}
|
||||
}
|
||||
|
||||
let mut config: AgentConfig = Default::default();
|
||||
pub fn parse_cmdline(&mut self, file: &str) -> Result<()> {
|
||||
let cmdline = fs::read_to_string(file)?;
|
||||
let params: Vec<&str> = cmdline.split_ascii_whitespace().collect();
|
||||
for param in params.iter() {
|
||||
// If we get a configuration file path from the command line, we
|
||||
// generate our config from it.
|
||||
// The agent will fail to start if the configuration file is not present,
|
||||
// or if it can't be parsed properly.
|
||||
if param.starts_with(format!("{}=", CONFIG_FILE).as_str()) {
|
||||
let config_file = get_string_value(param)?;
|
||||
return AgentConfig::from_config_file(&config_file);
|
||||
}
|
||||
|
||||
// parse cmdline flags
|
||||
parse_cmdline_param!(param, DEBUG_CONSOLE_FLAG, config.debug_console);
|
||||
parse_cmdline_param!(param, DEV_MODE_FLAG, config.dev_mode);
|
||||
parse_cmdline_param!(param, DEBUG_CONSOLE_FLAG, self.debug_console);
|
||||
parse_cmdline_param!(param, DEV_MODE_FLAG, self.dev_mode);
|
||||
|
||||
// Support "bare" tracing option for backwards compatibility with
|
||||
// Kata 1.x.
|
||||
if param == &TRACE_MODE_OPTION {
|
||||
config.tracing = true;
|
||||
self.tracing = tracer::TraceType::Isolated;
|
||||
continue;
|
||||
}
|
||||
|
||||
parse_cmdline_param!(param, TRACE_MODE_OPTION, config.tracing, get_bool_value);
|
||||
parse_cmdline_param!(param, TRACE_MODE_OPTION, self.tracing, get_trace_type);
|
||||
|
||||
// parse cmdline options
|
||||
parse_cmdline_param!(param, LOG_LEVEL_OPTION, config.log_level, get_log_level);
|
||||
parse_cmdline_param!(param, LOG_LEVEL_OPTION, self.log_level, get_log_level);
|
||||
parse_cmdline_param!(
|
||||
param,
|
||||
SERVER_ADDR_OPTION,
|
||||
config.server_addr,
|
||||
self.server_addr,
|
||||
get_string_value
|
||||
);
|
||||
|
||||
@@ -244,7 +138,7 @@ impl AgentConfig {
|
||||
parse_cmdline_param!(
|
||||
param,
|
||||
HOTPLUG_TIMOUT_OPTION,
|
||||
config.hotplug_timeout,
|
||||
self.hotplug_timeout,
|
||||
get_hotplug_timeout,
|
||||
|hotplug_timeout: time::Duration| hotplug_timeout.as_secs() > 0
|
||||
);
|
||||
@@ -253,14 +147,14 @@ impl AgentConfig {
|
||||
parse_cmdline_param!(
|
||||
param,
|
||||
DEBUG_CONSOLE_VPORT_OPTION,
|
||||
config.debug_console_vport,
|
||||
self.debug_console_vport,
|
||||
get_vsock_port,
|
||||
|port| port > 0
|
||||
);
|
||||
parse_cmdline_param!(
|
||||
param,
|
||||
LOG_VPORT_OPTION,
|
||||
config.log_vport,
|
||||
self.log_vport,
|
||||
get_vsock_port,
|
||||
|port| port > 0
|
||||
);
|
||||
@@ -268,47 +162,34 @@ impl AgentConfig {
|
||||
parse_cmdline_param!(
|
||||
param,
|
||||
CONTAINER_PIPE_SIZE_OPTION,
|
||||
config.container_pipe_size,
|
||||
self.container_pipe_size,
|
||||
get_container_pipe_size
|
||||
);
|
||||
parse_cmdline_param!(
|
||||
param,
|
||||
UNIFIED_CGROUP_HIERARCHY_OPTION,
|
||||
config.unified_cgroup_hierarchy,
|
||||
self.unified_cgroup_hierarchy,
|
||||
get_bool_value
|
||||
);
|
||||
}
|
||||
|
||||
if let Ok(addr) = env::var(SERVER_ADDR_ENV_VAR) {
|
||||
config.server_addr = addr;
|
||||
self.server_addr = addr;
|
||||
}
|
||||
|
||||
if let Ok(addr) = env::var(LOG_LEVEL_ENV_VAR) {
|
||||
if let Ok(level) = logrus_to_slog_level(&addr) {
|
||||
config.log_level = level;
|
||||
self.log_level = level;
|
||||
}
|
||||
}
|
||||
|
||||
if let Ok(value) = env::var(TRACING_ENV_VAR) {
|
||||
let name_value = format!("{}={}", TRACING_ENV_VAR, value);
|
||||
|
||||
config.tracing = get_bool_value(&name_value)?;
|
||||
if let Ok(value) = env::var(TRACE_TYPE_ENV_VAR) {
|
||||
if let Ok(result) = value.parse::<tracer::TraceType>() {
|
||||
self.tracing = result;
|
||||
}
|
||||
}
|
||||
|
||||
// We did not get a configuration file: allow all endpoints.
|
||||
config.endpoints.all_allowed = true;
|
||||
|
||||
Ok(config)
|
||||
}
|
||||
|
||||
#[instrument]
|
||||
pub fn from_config_file(file: &str) -> Result<AgentConfig> {
|
||||
let config = fs::read_to_string(file)?;
|
||||
AgentConfig::from_str(&config)
|
||||
}
|
||||
|
||||
pub fn is_allowed_endpoint(&self, ep: &str) -> bool {
|
||||
self.endpoints.all_allowed || self.endpoints.allowed.contains(ep)
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -355,6 +236,25 @@ fn get_log_level(param: &str) -> Result<slog::Level> {
|
||||
logrus_to_slog_level(fields[1])
|
||||
}
|
||||
|
||||
#[instrument]
|
||||
fn get_trace_type(param: &str) -> Result<tracer::TraceType> {
|
||||
ensure!(!param.is_empty(), "invalid trace type parameter");
|
||||
|
||||
let fields: Vec<&str> = param.split('=').collect();
|
||||
ensure!(
|
||||
fields[0] == TRACE_MODE_OPTION,
|
||||
"invalid trace type key name"
|
||||
);
|
||||
|
||||
if fields.len() == 1 {
|
||||
return Ok(tracer::TraceType::Isolated);
|
||||
}
|
||||
|
||||
let result = fields[1].parse::<tracer::TraceType>()?;
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
#[instrument]
|
||||
fn get_hotplug_timeout(param: &str) -> Result<time::Duration> {
|
||||
let fields: Vec<&str> = param.split('=').collect();
|
||||
@@ -439,6 +339,10 @@ mod tests {
|
||||
use std::time;
|
||||
use tempfile::tempdir;
|
||||
|
||||
const ERR_INVALID_TRACE_TYPE_PARAM: &str = "invalid trace type parameter";
|
||||
const ERR_INVALID_TRACE_TYPE: &str = "invalid trace type";
|
||||
const ERR_INVALID_TRACE_TYPE_KEY: &str = "invalid trace type key name";
|
||||
|
||||
// Parameters:
|
||||
//
|
||||
// 1: expected Result
|
||||
@@ -467,7 +371,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_new() {
|
||||
let config: AgentConfig = Default::default();
|
||||
let config = AgentConfig::new();
|
||||
assert!(!config.debug_console);
|
||||
assert!(!config.dev_mode);
|
||||
assert_eq!(config.log_level, DEFAULT_LOG_LEVEL);
|
||||
@@ -475,7 +379,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_from_cmdline() {
|
||||
fn test_parse_cmdline() {
|
||||
const TEST_SERVER_ADDR: &str = "vsock://-1:1024";
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -489,7 +393,7 @@ mod tests {
|
||||
container_pipe_size: i32,
|
||||
server_addr: &'a str,
|
||||
unified_cgroup_hierarchy: bool,
|
||||
tracing: bool,
|
||||
tracing: tracer::TraceType,
|
||||
}
|
||||
|
||||
impl Default for TestData<'_> {
|
||||
@@ -504,7 +408,7 @@ mod tests {
|
||||
container_pipe_size: DEFAULT_CONTAINER_PIPE_SIZE,
|
||||
server_addr: TEST_SERVER_ADDR,
|
||||
unified_cgroup_hierarchy: false,
|
||||
tracing: false,
|
||||
tracing: tracer::TraceType::Disabled,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -763,121 +667,64 @@ mod tests {
|
||||
},
|
||||
TestData {
|
||||
contents: "trace",
|
||||
tracing: false,
|
||||
tracing: tracer::TraceType::Disabled,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
contents: ".trace",
|
||||
tracing: false,
|
||||
tracing: tracer::TraceType::Disabled,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
contents: "agent.tracer",
|
||||
tracing: false,
|
||||
tracing: tracer::TraceType::Disabled,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
contents: "agent.trac",
|
||||
tracing: false,
|
||||
tracing: tracer::TraceType::Disabled,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
contents: "agent.trace",
|
||||
tracing: true,
|
||||
tracing: tracer::TraceType::Isolated,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
contents: "agent.trace=true",
|
||||
tracing: true,
|
||||
contents: "agent.trace=isolated",
|
||||
tracing: tracer::TraceType::Isolated,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
contents: "agent.trace=false",
|
||||
tracing: false,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
contents: "agent.trace=0",
|
||||
tracing: false,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
contents: "agent.trace=1",
|
||||
tracing: true,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
contents: "agent.trace=a",
|
||||
tracing: false,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
contents: "agent.trace=foo",
|
||||
tracing: false,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
contents: "agent.trace=.",
|
||||
tracing: false,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
contents: "agent.trace=,",
|
||||
tracing: false,
|
||||
contents: "agent.trace=disabled",
|
||||
tracing: tracer::TraceType::Disabled,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
contents: "",
|
||||
env_vars: vec!["KATA_AGENT_TRACING="],
|
||||
tracing: false,
|
||||
env_vars: vec!["KATA_AGENT_TRACE_TYPE=isolated"],
|
||||
tracing: tracer::TraceType::Isolated,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
contents: "",
|
||||
env_vars: vec!["KATA_AGENT_TRACING=''"],
|
||||
tracing: false,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
contents: "",
|
||||
env_vars: vec!["KATA_AGENT_TRACING=0"],
|
||||
tracing: false,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
contents: "",
|
||||
env_vars: vec!["KATA_AGENT_TRACING=."],
|
||||
tracing: false,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
contents: "",
|
||||
env_vars: vec!["KATA_AGENT_TRACING=,"],
|
||||
tracing: false,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
contents: "",
|
||||
env_vars: vec!["KATA_AGENT_TRACING=foo"],
|
||||
tracing: false,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
contents: "",
|
||||
env_vars: vec!["KATA_AGENT_TRACING=1"],
|
||||
tracing: true,
|
||||
..Default::default()
|
||||
},
|
||||
TestData {
|
||||
contents: "",
|
||||
env_vars: vec!["KATA_AGENT_TRACING=true"],
|
||||
tracing: true,
|
||||
env_vars: vec!["KATA_AGENT_TRACE_TYPE=disabled"],
|
||||
tracing: tracer::TraceType::Disabled,
|
||||
..Default::default()
|
||||
},
|
||||
];
|
||||
|
||||
let dir = tempdir().expect("failed to create tmpdir");
|
||||
|
||||
// First, check a missing file is handled
|
||||
let file_path = dir.path().join("enoent");
|
||||
|
||||
let filename = file_path.to_str().expect("failed to create filename");
|
||||
|
||||
let mut config = AgentConfig::new();
|
||||
let result = config.parse_cmdline(&filename.to_owned());
|
||||
assert!(result.is_err());
|
||||
|
||||
// Now, test various combinations of file contents and environment
|
||||
// variables.
|
||||
for (i, d) in tests.iter().enumerate() {
|
||||
@@ -906,8 +753,22 @@ mod tests {
|
||||
vars_to_unset.push(name);
|
||||
}
|
||||
|
||||
let config =
|
||||
AgentConfig::from_cmdline(filename, vec![]).expect("Failed to parse command line");
|
||||
let mut config = AgentConfig::new();
|
||||
assert!(!config.debug_console, "{}", msg);
|
||||
assert!(!config.dev_mode, "{}", msg);
|
||||
assert!(!config.unified_cgroup_hierarchy, "{}", msg);
|
||||
assert_eq!(
|
||||
config.hotplug_timeout,
|
||||
time::Duration::from_secs(3),
|
||||
"{}",
|
||||
msg
|
||||
);
|
||||
assert_eq!(config.container_pipe_size, 0, "{}", msg);
|
||||
assert_eq!(config.server_addr, TEST_SERVER_ADDR, "{}", msg);
|
||||
assert_eq!(config.tracing, tracer::TraceType::Disabled, "{}", msg);
|
||||
|
||||
let result = config.parse_cmdline(filename);
|
||||
assert!(result.is_ok(), "{}", msg);
|
||||
|
||||
assert_eq!(d.debug_console, config.debug_console, "{}", msg);
|
||||
assert_eq!(d.dev_mode, config.dev_mode, "{}", msg);
|
||||
@@ -928,40 +789,6 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_from_cmdline_with_args_overwrites() {
|
||||
let expected = AgentConfig {
|
||||
dev_mode: true,
|
||||
server_addr: "unix://@/tmp/foo.socket".to_string(),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let example_config_file_contents =
|
||||
"dev_mode = true\nserver_addr = 'unix://@/tmp/foo.socket'";
|
||||
let dir = tempdir().expect("failed to create tmpdir");
|
||||
let file_path = dir.path().join("config.toml");
|
||||
let filename = file_path.to_str().expect("failed to create filename");
|
||||
let mut file = File::create(filename).unwrap_or_else(|_| panic!("failed to create file"));
|
||||
file.write_all(example_config_file_contents.as_bytes())
|
||||
.unwrap_or_else(|_| panic!("failed to write file contents"));
|
||||
|
||||
let config =
|
||||
AgentConfig::from_cmdline("", vec!["--config".to_string(), filename.to_string()])
|
||||
.expect("Failed to parse command line");
|
||||
|
||||
assert_eq!(expected.debug_console, config.debug_console);
|
||||
assert_eq!(expected.dev_mode, config.dev_mode);
|
||||
assert_eq!(
|
||||
expected.unified_cgroup_hierarchy,
|
||||
config.unified_cgroup_hierarchy,
|
||||
);
|
||||
assert_eq!(expected.log_level, config.log_level);
|
||||
assert_eq!(expected.hotplug_timeout, config.hotplug_timeout);
|
||||
assert_eq!(expected.container_pipe_size, config.container_pipe_size);
|
||||
assert_eq!(expected.server_addr, config.server_addr);
|
||||
assert_eq!(expected.tracing, config.tracing);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_logrus_to_slog_level() {
|
||||
#[derive(Debug)]
|
||||
@@ -1393,33 +1220,60 @@ Caused by:
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_config_builder_from_string() {
|
||||
let config = AgentConfig::from_str(
|
||||
r#"
|
||||
dev_mode = true
|
||||
server_addr = 'vsock://8:2048'
|
||||
fn test_get_trace_type() {
|
||||
#[derive(Debug)]
|
||||
struct TestData<'a> {
|
||||
param: &'a str,
|
||||
result: Result<tracer::TraceType>,
|
||||
}
|
||||
|
||||
[endpoints]
|
||||
allowed = ["CreateContainer", "StartContainer"]
|
||||
"#,
|
||||
)
|
||||
.unwrap();
|
||||
let tests = &[
|
||||
TestData {
|
||||
param: "",
|
||||
result: Err(anyhow!(ERR_INVALID_TRACE_TYPE_PARAM)),
|
||||
},
|
||||
TestData {
|
||||
param: "agent.tracer",
|
||||
result: Err(anyhow!(ERR_INVALID_TRACE_TYPE_KEY)),
|
||||
},
|
||||
TestData {
|
||||
param: "agent.trac",
|
||||
result: Err(anyhow!(ERR_INVALID_TRACE_TYPE_KEY)),
|
||||
},
|
||||
TestData {
|
||||
param: "agent.trace=",
|
||||
result: Err(anyhow!(ERR_INVALID_TRACE_TYPE)),
|
||||
},
|
||||
TestData {
|
||||
param: "agent.trace==",
|
||||
result: Err(anyhow!(ERR_INVALID_TRACE_TYPE)),
|
||||
},
|
||||
TestData {
|
||||
param: "agent.trace=foo",
|
||||
result: Err(anyhow!(ERR_INVALID_TRACE_TYPE)),
|
||||
},
|
||||
TestData {
|
||||
param: "agent.trace",
|
||||
result: Ok(tracer::TraceType::Isolated),
|
||||
},
|
||||
TestData {
|
||||
param: "agent.trace=isolated",
|
||||
result: Ok(tracer::TraceType::Isolated),
|
||||
},
|
||||
TestData {
|
||||
param: "agent.trace=disabled",
|
||||
result: Ok(tracer::TraceType::Disabled),
|
||||
},
|
||||
];
|
||||
|
||||
// Verify that the all_allowed flag is false
|
||||
assert!(!config.endpoints.all_allowed);
|
||||
for (i, d) in tests.iter().enumerate() {
|
||||
let msg = format!("test[{}]: {:?}", i, d);
|
||||
|
||||
// Verify that the override worked
|
||||
assert!(config.dev_mode);
|
||||
assert_eq!(config.server_addr, "vsock://8:2048");
|
||||
assert_eq!(
|
||||
config.endpoints.allowed,
|
||||
vec!["CreateContainer".to_string(), "StartContainer".to_string()]
|
||||
.iter()
|
||||
.cloned()
|
||||
.collect()
|
||||
);
|
||||
let result = get_trace_type(d.param);
|
||||
|
||||
// Verify that the default values are valid
|
||||
assert_eq!(config.hotplug_timeout, DEFAULT_HOTPLUG_TIMEOUT);
|
||||
let msg = format!("{}: result: {:?}", msg, result);
|
||||
|
||||
assert_result!(d.result, result, msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -149,8 +149,10 @@ fn run_in_child(slave_fd: libc::c_int, shell: String) -> Result<()> {
|
||||
|
||||
// run shell
|
||||
let _ = unistd::execvp(cmd.as_c_str(), &args).map_err(|e| match e {
|
||||
nix::Error::UnknownErrno => std::process::exit(-2),
|
||||
_ => std::process::exit(e as i32),
|
||||
nix::Error::Sys(errno) => {
|
||||
std::process::exit(errno as i32);
|
||||
}
|
||||
_ => std::process::exit(-2),
|
||||
});
|
||||
|
||||
Ok(())
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -9,6 +9,7 @@
|
||||
use std::fs;
|
||||
|
||||
pub const SYSFS_DIR: &str = "/sys";
|
||||
pub const SYSFS_PCI_BUS_RESCAN_FILE: &str = "/sys/bus/pci/rescan";
|
||||
#[cfg(any(
|
||||
target_arch = "powerpc64",
|
||||
target_arch = "s390x",
|
||||
@@ -83,8 +84,6 @@ pub const SYSFS_MEMORY_ONLINE_PATH: &str = "/sys/devices/system/memory";
|
||||
|
||||
pub const SYSFS_SCSI_HOST_PATH: &str = "/sys/class/scsi_host";
|
||||
|
||||
pub const SYSFS_BUS_PCI_PATH: &str = "/sys/bus/pci";
|
||||
|
||||
pub const SYSFS_CGROUPPATH: &str = "/sys/fs/cgroup";
|
||||
pub const SYSFS_ONLINE_FILE: &str = "online";
|
||||
|
||||
|
||||
@@ -20,7 +20,6 @@ extern crate scopeguard;
|
||||
extern crate slog;
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use clap::{AppSettings, Parser};
|
||||
use nix::fcntl::OFlag;
|
||||
use nix::sys::socket::{self, AddressFamily, SockAddr, SockFlag, SockType};
|
||||
use nix::unistd::{self, dup, Pid};
|
||||
@@ -78,33 +77,11 @@ mod rpc;
|
||||
mod tracer;
|
||||
|
||||
const NAME: &str = "kata-agent";
|
||||
const KERNEL_CMDLINE_FILE: &str = "/proc/cmdline";
|
||||
|
||||
lazy_static! {
|
||||
static ref AGENT_CONFIG: Arc<RwLock<AgentConfig>> = Arc::new(RwLock::new(
|
||||
// Note: We can't do AgentOpts.parse() here to send through the processed arguments to AgentConfig
|
||||
// clap::Parser::parse() greedily process all command line input including cargo test parameters,
|
||||
// so should only be used inside main.
|
||||
AgentConfig::from_cmdline("/proc/cmdline", env::args().collect()).unwrap()
|
||||
));
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
// The default clap version info doesn't match our form, so we need to override it
|
||||
#[clap(global_setting(AppSettings::DisableVersionFlag))]
|
||||
struct AgentOpts {
|
||||
/// Print the version information
|
||||
#[clap(short, long)]
|
||||
version: bool,
|
||||
#[clap(subcommand)]
|
||||
subcmd: Option<SubCommand>,
|
||||
/// Specify a custom agent config file
|
||||
#[clap(short, long)]
|
||||
config: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
enum SubCommand {
|
||||
Init {},
|
||||
static ref AGENT_CONFIG: Arc<RwLock<AgentConfig>> =
|
||||
Arc::new(RwLock::new(config::AgentConfig::new()));
|
||||
}
|
||||
|
||||
#[instrument]
|
||||
@@ -136,10 +113,10 @@ async fn create_logger_task(rfd: RawFd, vsock_port: u32, shutdown: Receiver<bool
|
||||
)?;
|
||||
|
||||
let addr = SockAddr::new_vsock(libc::VMADDR_CID_ANY, vsock_port);
|
||||
socket::bind(listenfd, &addr)?;
|
||||
socket::listen(listenfd, 1)?;
|
||||
socket::bind(listenfd, &addr).unwrap();
|
||||
socket::listen(listenfd, 1).unwrap();
|
||||
|
||||
writer = Box::new(util::get_vsock_stream(listenfd).await?);
|
||||
writer = Box::new(util::get_vsock_stream(listenfd).await.unwrap());
|
||||
} else {
|
||||
writer = Box::new(tokio::io::stdout());
|
||||
}
|
||||
@@ -157,11 +134,15 @@ async fn real_main() -> std::result::Result<(), Box<dyn std::error::Error>> {
|
||||
|
||||
console::initialize();
|
||||
|
||||
lazy_static::initialize(&AGENT_CONFIG);
|
||||
|
||||
// support vsock log
|
||||
let (rfd, wfd) = unistd::pipe2(OFlag::O_CLOEXEC)?;
|
||||
|
||||
let (shutdown_tx, shutdown_rx) = channel(true);
|
||||
|
||||
let agent_config = AGENT_CONFIG.clone();
|
||||
|
||||
let init_mode = unistd::getpid() == Pid::from_raw(1);
|
||||
if init_mode {
|
||||
// dup a new file descriptor for this temporary logger writer,
|
||||
@@ -182,15 +163,20 @@ async fn real_main() -> std::result::Result<(), Box<dyn std::error::Error>> {
|
||||
e
|
||||
})?;
|
||||
|
||||
lazy_static::initialize(&AGENT_CONFIG);
|
||||
let mut config = agent_config.write().await;
|
||||
config.parse_cmdline(KERNEL_CMDLINE_FILE)?;
|
||||
|
||||
init_agent_as_init(&logger, AGENT_CONFIG.read().await.unified_cgroup_hierarchy)?;
|
||||
init_agent_as_init(&logger, config.unified_cgroup_hierarchy)?;
|
||||
drop(logger_async_guard);
|
||||
} else {
|
||||
lazy_static::initialize(&AGENT_CONFIG);
|
||||
// once parsed cmdline and set the config, release the write lock
|
||||
// as soon as possible in case other thread would get read lock on
|
||||
// it.
|
||||
let mut config = agent_config.write().await;
|
||||
config.parse_cmdline(KERNEL_CMDLINE_FILE)?;
|
||||
}
|
||||
let config = agent_config.read().await;
|
||||
|
||||
let config = AGENT_CONFIG.read().await;
|
||||
let log_vport = config.log_vport as u32;
|
||||
|
||||
let log_handle = tokio::spawn(create_logger_task(rfd, log_vport, shutdown_rx.clone()));
|
||||
@@ -219,16 +205,16 @@ async fn real_main() -> std::result::Result<(), Box<dyn std::error::Error>> {
|
||||
ttrpc_log_guard = Ok(slog_stdlog::init().map_err(|e| e)?);
|
||||
}
|
||||
|
||||
if config.tracing {
|
||||
tracer::setup_tracing(NAME, &logger)?;
|
||||
if config.tracing != tracer::TraceType::Disabled {
|
||||
let _ = tracer::setup_tracing(NAME, &logger, &config)?;
|
||||
}
|
||||
|
||||
let root_span = span!(tracing::Level::TRACE, "root-span");
|
||||
let root = span!(tracing::Level::TRACE, "root-span", work_units = 2);
|
||||
|
||||
// XXX: Start the root trace transaction.
|
||||
//
|
||||
// XXX: Note that *ALL* spans needs to start after this point!!
|
||||
let span_guard = root_span.enter();
|
||||
let _enter = root.enter();
|
||||
|
||||
// Start the sandbox and wait for its ttRPC server to end
|
||||
start_sandbox(&logger, &config, init_mode, &mut tasks, shutdown_rx.clone()).await?;
|
||||
@@ -252,35 +238,25 @@ async fn real_main() -> std::result::Result<(), Box<dyn std::error::Error>> {
|
||||
// Wait for all threads to finish
|
||||
let results = join_all(tasks).await;
|
||||
|
||||
// force flushing spans
|
||||
drop(span_guard);
|
||||
drop(root_span);
|
||||
for result in results {
|
||||
if let Err(e) = result {
|
||||
return Err(anyhow!(e).into());
|
||||
}
|
||||
}
|
||||
|
||||
if config.tracing {
|
||||
if config.tracing != tracer::TraceType::Disabled {
|
||||
tracer::end_tracing();
|
||||
}
|
||||
|
||||
eprintln!("{} shutdown complete", NAME);
|
||||
|
||||
let mut wait_errors: Vec<tokio::task::JoinError> = vec![];
|
||||
for result in results {
|
||||
if let Err(e) = result {
|
||||
eprintln!("wait task error: {:#?}", e);
|
||||
wait_errors.push(e);
|
||||
}
|
||||
}
|
||||
|
||||
if wait_errors.is_empty() {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(anyhow!("wait all tasks failed: {:#?}", wait_errors).into())
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
|
||||
let args = AgentOpts::parse();
|
||||
let args: Vec<String> = env::args().collect();
|
||||
|
||||
if args.version {
|
||||
if args.len() == 2 && args[1] == "--version" {
|
||||
println!(
|
||||
"{} version {} (api version: {}, commit version: {}, type: rust)",
|
||||
NAME,
|
||||
@@ -288,10 +264,11 @@ fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
|
||||
version::API_VERSION,
|
||||
version::VERSION_COMMIT,
|
||||
);
|
||||
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if let Some(SubCommand::Init {}) = args.subcmd {
|
||||
if args.len() == 2 && args[1] == "init" {
|
||||
reset_sigpipe();
|
||||
rustjail::container::init_child();
|
||||
exit(0);
|
||||
@@ -348,7 +325,7 @@ async fn start_sandbox(
|
||||
sandbox.lock().await.sender = Some(tx);
|
||||
|
||||
// vsock:///dev/vsock, port
|
||||
let mut server = rpc::start(sandbox.clone(), config.server_addr.as_str())?;
|
||||
let mut server = rpc::start(sandbox.clone(), config.server_addr.as_str());
|
||||
server.start().await?;
|
||||
|
||||
rx.await?;
|
||||
|
||||
@@ -8,7 +8,6 @@ extern crate procfs;
|
||||
use prometheus::{Encoder, Gauge, GaugeVec, IntCounter, TextEncoder};
|
||||
|
||||
use anyhow::Result;
|
||||
use slog::warn;
|
||||
use tracing::instrument;
|
||||
|
||||
const NAMESPACE_KATA_AGENT: &str = "kata_agent";
|
||||
@@ -24,50 +23,50 @@ macro_rules! sl {
|
||||
lazy_static! {
|
||||
|
||||
static ref AGENT_SCRAPE_COUNT: IntCounter =
|
||||
prometheus::register_int_counter!(format!("{}_{}",NAMESPACE_KATA_AGENT,"scrape_count"), "Metrics scrape count").unwrap();
|
||||
prometheus::register_int_counter!(format!("{}_{}",NAMESPACE_KATA_AGENT,"scrape_count").as_ref(), "Metrics scrape count").unwrap();
|
||||
|
||||
static ref AGENT_THREADS: Gauge =
|
||||
prometheus::register_gauge!(format!("{}_{}",NAMESPACE_KATA_AGENT,"threads"), "Agent process threads").unwrap();
|
||||
prometheus::register_gauge!(format!("{}_{}",NAMESPACE_KATA_AGENT,"threads").as_ref(), "Agent process threads").unwrap();
|
||||
|
||||
static ref AGENT_TOTAL_TIME: Gauge =
|
||||
prometheus::register_gauge!(format!("{}_{}",NAMESPACE_KATA_AGENT,"total_time"), "Agent process total time").unwrap();
|
||||
prometheus::register_gauge!(format!("{}_{}",NAMESPACE_KATA_AGENT,"total_time").as_ref(), "Agent process total time").unwrap();
|
||||
|
||||
static ref AGENT_TOTAL_VM: Gauge =
|
||||
prometheus::register_gauge!(format!("{}_{}",NAMESPACE_KATA_AGENT,"total_vm"), "Agent process total VM size").unwrap();
|
||||
prometheus::register_gauge!(format!("{}_{}",NAMESPACE_KATA_AGENT,"total_vm").as_ref(), "Agent process total VM size").unwrap();
|
||||
|
||||
static ref AGENT_TOTAL_RSS: Gauge =
|
||||
prometheus::register_gauge!(format!("{}_{}",NAMESPACE_KATA_AGENT,"total_rss"), "Agent process total RSS size").unwrap();
|
||||
prometheus::register_gauge!(format!("{}_{}",NAMESPACE_KATA_AGENT,"total_rss").as_ref(), "Agent process total RSS size").unwrap();
|
||||
|
||||
static ref AGENT_PROC_STATUS: GaugeVec =
|
||||
prometheus::register_gauge_vec!(format!("{}_{}",NAMESPACE_KATA_AGENT,"proc_status"), "Agent process status.", &["item"]).unwrap();
|
||||
prometheus::register_gauge_vec!(format!("{}_{}",NAMESPACE_KATA_AGENT,"proc_status").as_ref(), "Agent process status.", &["item"]).unwrap();
|
||||
|
||||
static ref AGENT_IO_STAT: GaugeVec =
|
||||
prometheus::register_gauge_vec!(format!("{}_{}",NAMESPACE_KATA_AGENT,"io_stat"), "Agent process IO statistics.", &["item"]).unwrap();
|
||||
prometheus::register_gauge_vec!(format!("{}_{}",NAMESPACE_KATA_AGENT,"io_stat").as_ref(), "Agent process IO statistics.", &["item"]).unwrap();
|
||||
|
||||
static ref AGENT_PROC_STAT: GaugeVec =
|
||||
prometheus::register_gauge_vec!(format!("{}_{}",NAMESPACE_KATA_AGENT,"proc_stat"), "Agent process statistics.", &["item"]).unwrap();
|
||||
prometheus::register_gauge_vec!(format!("{}_{}",NAMESPACE_KATA_AGENT,"proc_stat").as_ref(), "Agent process statistics.", &["item"]).unwrap();
|
||||
|
||||
// guest os metrics
|
||||
static ref GUEST_LOAD: GaugeVec =
|
||||
prometheus::register_gauge_vec!(format!("{}_{}",NAMESPACE_KATA_GUEST,"load") , "Guest system load.", &["item"]).unwrap();
|
||||
prometheus::register_gauge_vec!(format!("{}_{}",NAMESPACE_KATA_GUEST,"load").as_ref() , "Guest system load.", &["item"]).unwrap();
|
||||
|
||||
static ref GUEST_TASKS: GaugeVec =
|
||||
prometheus::register_gauge_vec!(format!("{}_{}",NAMESPACE_KATA_GUEST,"tasks") , "Guest system load.", &["item"]).unwrap();
|
||||
prometheus::register_gauge_vec!(format!("{}_{}",NAMESPACE_KATA_GUEST,"tasks").as_ref() , "Guest system load.", &["item"]).unwrap();
|
||||
|
||||
static ref GUEST_CPU_TIME: GaugeVec =
|
||||
prometheus::register_gauge_vec!(format!("{}_{}",NAMESPACE_KATA_GUEST,"cpu_time") , "Guest CPU statistics.", &["cpu","item"]).unwrap();
|
||||
prometheus::register_gauge_vec!(format!("{}_{}",NAMESPACE_KATA_GUEST,"cpu_time").as_ref() , "Guest CPU statistics.", &["cpu","item"]).unwrap();
|
||||
|
||||
static ref GUEST_VM_STAT: GaugeVec =
|
||||
prometheus::register_gauge_vec!(format!("{}_{}",NAMESPACE_KATA_GUEST,"vm_stat") , "Guest virtual memory statistics.", &["item"]).unwrap();
|
||||
prometheus::register_gauge_vec!(format!("{}_{}",NAMESPACE_KATA_GUEST,"vm_stat").as_ref() , "Guest virtual memory statistics.", &["item"]).unwrap();
|
||||
|
||||
static ref GUEST_NETDEV_STAT: GaugeVec =
|
||||
prometheus::register_gauge_vec!(format!("{}_{}",NAMESPACE_KATA_GUEST,"netdev_stat") , "Guest net devices statistics.", &["interface","item"]).unwrap();
|
||||
prometheus::register_gauge_vec!(format!("{}_{}",NAMESPACE_KATA_GUEST,"netdev_stat").as_ref() , "Guest net devices statistics.", &["interface","item"]).unwrap();
|
||||
|
||||
static ref GUEST_DISKSTAT: GaugeVec =
|
||||
prometheus::register_gauge_vec!(format!("{}_{}",NAMESPACE_KATA_GUEST,"diskstat") , "Disks statistics in system.", &["disk","item"]).unwrap();
|
||||
prometheus::register_gauge_vec!(format!("{}_{}",NAMESPACE_KATA_GUEST,"diskstat").as_ref() , "Disks statistics in system.", &["disk","item"]).unwrap();
|
||||
|
||||
static ref GUEST_MEMINFO: GaugeVec =
|
||||
prometheus::register_gauge_vec!(format!("{}_{}",NAMESPACE_KATA_GUEST,"meminfo") , "Statistics about memory usage in the system.", &["item"]).unwrap();
|
||||
prometheus::register_gauge_vec!(format!("{}_{}",NAMESPACE_KATA_GUEST,"meminfo").as_ref() , "Statistics about memory usage in the system.", &["item"]).unwrap();
|
||||
}
|
||||
|
||||
#[instrument]
|
||||
@@ -75,7 +74,7 @@ pub fn get_metrics(_: &protocols::agent::GetMetricsRequest) -> Result<String> {
|
||||
AGENT_SCRAPE_COUNT.inc();
|
||||
|
||||
// update agent process metrics
|
||||
update_agent_metrics()?;
|
||||
update_agent_metrics();
|
||||
|
||||
// update guest os metrics
|
||||
update_guest_metrics();
|
||||
@@ -85,26 +84,23 @@ pub fn get_metrics(_: &protocols::agent::GetMetricsRequest) -> Result<String> {
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let encoder = TextEncoder::new();
|
||||
encoder.encode(&metric_families, &mut buffer)?;
|
||||
encoder.encode(&metric_families, &mut buffer).unwrap();
|
||||
|
||||
Ok(String::from_utf8(buffer)?)
|
||||
Ok(String::from_utf8(buffer).unwrap())
|
||||
}
|
||||
|
||||
#[instrument]
|
||||
fn update_agent_metrics() -> Result<()> {
|
||||
fn update_agent_metrics() {
|
||||
let me = procfs::process::Process::myself();
|
||||
|
||||
let me = match me {
|
||||
Ok(p) => p,
|
||||
Err(e) => {
|
||||
// FIXME: return Ok for all errors?
|
||||
warn!(sl!(), "failed to create process instance: {:?}", e);
|
||||
if let Err(err) = me {
|
||||
error!(sl!(), "failed to create process instance: {:?}", err);
|
||||
return;
|
||||
}
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
let me = me.unwrap();
|
||||
|
||||
let tps = procfs::ticks_per_second()?;
|
||||
let tps = procfs::ticks_per_second().unwrap();
|
||||
|
||||
// process total time
|
||||
AGENT_TOTAL_TIME.set((me.stat.utime + me.stat.stime) as f64 / (tps as f64));
|
||||
@@ -113,7 +109,7 @@ fn update_agent_metrics() -> Result<()> {
|
||||
AGENT_TOTAL_VM.set(me.stat.vsize as f64);
|
||||
|
||||
// Total resident set
|
||||
let page_size = procfs::page_size()? as f64;
|
||||
let page_size = procfs::page_size().unwrap() as f64;
|
||||
AGENT_TOTAL_RSS.set(me.stat.rss as f64 * page_size);
|
||||
|
||||
// io
|
||||
@@ -136,11 +132,11 @@ fn update_agent_metrics() -> Result<()> {
|
||||
}
|
||||
|
||||
match me.status() {
|
||||
Err(err) => error!(sl!(), "failed to get process status: {:?}", err),
|
||||
Err(err) => {
|
||||
info!(sl!(), "failed to get process status: {:?}", err);
|
||||
}
|
||||
Ok(status) => set_gauge_vec_proc_status(&AGENT_PROC_STATUS, &status),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument]
|
||||
@@ -352,17 +348,17 @@ fn set_gauge_vec_cpu_time(gv: &prometheus::GaugeVec, cpu: &str, cpu_time: &procf
|
||||
gv.with_label_values(&[cpu, "idle"])
|
||||
.set(cpu_time.idle as f64);
|
||||
gv.with_label_values(&[cpu, "iowait"])
|
||||
.set(cpu_time.iowait.unwrap_or(0) as f64);
|
||||
.set(cpu_time.iowait.unwrap_or(0.0) as f64);
|
||||
gv.with_label_values(&[cpu, "irq"])
|
||||
.set(cpu_time.irq.unwrap_or(0) as f64);
|
||||
.set(cpu_time.irq.unwrap_or(0.0) as f64);
|
||||
gv.with_label_values(&[cpu, "softirq"])
|
||||
.set(cpu_time.softirq.unwrap_or(0) as f64);
|
||||
.set(cpu_time.softirq.unwrap_or(0.0) as f64);
|
||||
gv.with_label_values(&[cpu, "steal"])
|
||||
.set(cpu_time.steal.unwrap_or(0) as f64);
|
||||
.set(cpu_time.steal.unwrap_or(0.0) as f64);
|
||||
gv.with_label_values(&[cpu, "guest"])
|
||||
.set(cpu_time.guest.unwrap_or(0) as f64);
|
||||
.set(cpu_time.guest.unwrap_or(0.0) as f64);
|
||||
gv.with_label_values(&[cpu, "guest_nice"])
|
||||
.set(cpu_time.guest_nice.unwrap_or(0) as f64);
|
||||
.set(cpu_time.guest_nice.unwrap_or(0.0) as f64);
|
||||
}
|
||||
|
||||
#[instrument]
|
||||
@@ -474,7 +470,7 @@ fn set_gauge_vec_proc_status(gv: &prometheus::GaugeVec, status: &procfs::process
|
||||
gv.with_label_values(&["vmswap"])
|
||||
.set(status.vmswap.unwrap_or(0) as f64);
|
||||
gv.with_label_values(&["hugetlbpages"])
|
||||
.set(status.hugetlbpages.unwrap_or(0) as f64);
|
||||
.set(status.hugetblpages.unwrap_or(0) as f64);
|
||||
gv.with_label_values(&["voluntary_ctxt_switches"])
|
||||
.set(status.voluntary_ctxt_switches.unwrap_or(0) as f64);
|
||||
gv.with_label_values(&["nonvoluntary_ctxt_switches"])
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user