Compare commits

1 commit

Author: Fupan Li
SHA1: 3f45fcd3b3
Message: tests: fix the issue of missing teardown pods
Signed-off-by: Fupan Li <fupan.lfp@antgroup.com>
Date: 2025-11-17 19:24:26 +08:00
1118 changed files with 28759 additions and 45020 deletions

View File

@@ -1,7 +0,0 @@
root = true
[*]
charset = utf-8
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true

View File

@@ -1,30 +0,0 @@
{
"Verbose": false,
"Debug": false,
"IgnoreDefaults": false,
"SpacesAfterTabs": false,
"NoColor": false,
"Exclude": [
"src/runtime/vendor",
"src/tools/log-parser/vendor",
"tests/metrics/cmd/checkmetrics/vendor",
"tests/vendor",
"src/runtime/virtcontainers/pkg/cloud-hypervisor/client",
"\\.img$",
"\\.dtb$",
"\\.drawio$",
"\\.svg$",
"\\.patch$"
],
"AllowedContentTypes": [],
"PassedFiles": [],
"Disable": {
"EndOfLine": false,
"Indentation": false,
"IndentSize": false,
"InsertFinalNewline": false,
"TrimTrailingWhitespace": false,
"MaxLineLength": false,
"Charset": false
}
}

View File

@@ -10,6 +10,11 @@ self-hosted-runner:
- amd64-nvidia-a100
- amd64-nvidia-h100-snp
- arm64-k8s
- containerd-v1.7-overlayfs
- containerd-v2.0-overlayfs
- containerd-v2.1-overlayfs
- containerd-v2.2
- containerd-v2.2-overlayfs
- garm-ubuntu-2004
- garm-ubuntu-2004-smaller
- garm-ubuntu-2204
@@ -20,7 +25,6 @@ self-hosted-runner:
- ppc64le-k8s
- ppc64le-small
- ubuntu-24.04-ppc64le
- ubuntu-24.04-s390x
- metrics
- riscv-builder
- sev-snp

View File

@@ -17,7 +17,7 @@ runs:
uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: nightly
toolchain: nightly
override: true
- name: Cache

View File

@@ -12,6 +12,7 @@ updates:
- "/src/tools/agent-ctl"
- "/src/tools/genpolicy"
- "/src/tools/kata-ctl"
- "/src/tools/runk"
- "/src/tools/trace-forwarder"
schedule:
interval: "daily"

View File

@@ -71,7 +71,7 @@ jobs:
fail-fast: false
matrix:
containerd_version: ['lts', 'active']
vmm: ['clh', 'cloud-hypervisor', 'dragonball', 'qemu', 'qemu-runtime-rs']
vmm: ['clh', 'cloud-hypervisor', 'dragonball', 'qemu']
runs-on: ubuntu-22.04
env:
CONTAINERD_VERSION: ${{ matrix.containerd_version }}
@@ -117,7 +117,7 @@ jobs:
fail-fast: false
matrix:
containerd_version: ['lts', 'active']
vmm: ['clh', 'qemu', 'dragonball', 'qemu-runtime-rs']
vmm: ['clh', 'qemu', 'dragonball']
runs-on: ubuntu-22.04
env:
CONTAINERD_VERSION: ${{ matrix.containerd_version }}
@@ -147,22 +147,49 @@ jobs:
name: kata-static-tarball-amd64${{ inputs.tarball-suffix }}
path: kata-artifacts
- name: get-kata-tools-tarball
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: kata-tools-static-tarball-amd64${{ inputs.tarball-suffix }}
path: kata-tools-artifacts
- name: Install kata
run: bash tests/integration/nydus/gha-run.sh install-kata kata-artifacts
- name: Install kata-tools
run: bash tests/integration/nydus/gha-run.sh install-kata-tools kata-tools-artifacts
- name: Run nydus tests
timeout-minutes: 10
run: bash tests/integration/nydus/gha-run.sh run
run-runk:
name: run-runk
# Skip runk tests as we have no maintainers. TODO: Decide when to remove altogether
if: false
runs-on: ubuntu-22.04
env:
CONTAINERD_VERSION: lts
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
ref: ${{ inputs.commit-hash }}
fetch-depth: 0
persist-credentials: false
- name: Rebase atop of the latest target branch
run: |
./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
env:
TARGET_BRANCH: ${{ inputs.target-branch }}
- name: Install dependencies
run: bash tests/integration/runk/gha-run.sh install-dependencies
- name: get-kata-tarball
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: kata-static-tarball-amd64${{ inputs.tarball-suffix }}
path: kata-artifacts
- name: Install kata
run: bash tests/integration/runk/gha-run.sh install-kata kata-artifacts
- name: Run runk tests
timeout-minutes: 10
run: bash tests/integration/runk/gha-run.sh run
run-tracing:
name: run-tracing
strategy:
@@ -265,7 +292,6 @@ jobs:
- dragonball
- qemu
- cloud-hypervisor
- qemu-runtime-rs
runs-on: ubuntu-22.04
env:
KATA_HYPERVISOR: ${{ matrix.vmm }}
@@ -340,16 +366,8 @@ jobs:
name: kata-static-tarball-amd64${{ inputs.tarball-suffix }}
path: kata-artifacts
- name: get-kata-tools-tarball
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: kata-tools-static-tarball-amd64${{ inputs.tarball-suffix }}
path: kata-tools-artifacts
- name: Install kata & kata-tools
run: |
bash tests/functional/kata-agent-apis/gha-run.sh install-kata kata-artifacts
bash tests/functional/kata-agent-apis/gha-run.sh install-kata-tools kata-tools-artifacts
- name: Install kata
run: bash tests/functional/kata-agent-apis/gha-run.sh install-kata kata-artifacts
- name: Run kata agent api tests with agent-ctl
run: bash tests/functional/kata-agent-apis/gha-run.sh run

View File

@@ -12,12 +12,7 @@ name: Build checks
jobs:
check:
name: check
runs-on: >-
${{
( contains(inputs.instance, 's390x') && matrix.component.name == 'runtime' ) && 's390x' ||
( contains(inputs.instance, 'ppc64le') && (matrix.component.name == 'runtime' || matrix.component.name == 'agent') ) && 'ppc64le' ||
inputs.instance
}}
runs-on: ${{ matrix.runner || inputs.instance }}
strategy:
fail-fast: false
matrix:
@@ -74,7 +69,37 @@ jobs:
- rust
- protobuf-compiler
instance:
- ${{ inputs.instance }}
- ${{ inputs.instance }}
include:
- component:
name: runtime
path: src/runtime
needs:
- golang
- XDG_RUNTIME_DIR
instance: ubuntu-24.04-s390x
runner: s390x
- component:
name: runtime
path: src/runtime
needs:
- golang
- XDG_RUNTIME_DIR
instance: ubuntu-24.04-ppc64le
runner: ppc64le
- component:
name: agent
path: src/agent
needs:
- rust
- libdevmapper
- libseccomp
- protobuf-compiler
- clang
instance: ubuntu-24.04-ppc64le
runner: ppc64le
steps:
- name: Adjust a permission for repo

View File

@@ -41,22 +41,29 @@ jobs:
matrix:
asset:
- agent
- agent-ctl
- busybox
- cloud-hypervisor
- cloud-hypervisor-glibc
- coco-guest-components
- csi-kata-directvolume
- firecracker
- genpolicy
- kata-ctl
- kata-manager
- kernel
- kernel-confidential
- kernel-dragonball-experimental
- kernel-nvidia-gpu
- kernel-nvidia-gpu-confidential
- nydus
- ovmf
- ovmf-sev
- ovmf-tdx
- pause-image
- qemu
- qemu-snp-experimental
- qemu-tdx-experimental
- trace-forwarder
- virtiofsd
stage:
- ${{ inputs.stage }}
@@ -114,7 +121,7 @@ jobs:
echo "oci-name=${oci_image%@*}" >> "$GITHUB_OUTPUT"
echo "oci-digest=${oci_image#*@}" >> "$GITHUB_OUTPUT"
- uses: oras-project/setup-oras@22ce207df3b08e061f537244349aac6ae1d214f6 # v1.2.4
- uses: oras-project/setup-oras@5c0b487ce3fe0ce3ab0d034e63669e426e294e4d # v1.2.2
if: ${{ env.PERFORM_ATTESTATION == 'yes' }}
with:
version: "1.2.0"
@@ -143,11 +150,11 @@ jobs:
if-no-files-found: error
- name: store-extratarballs-artifact ${{ matrix.asset }}
if: ${{ matrix.asset == 'kernel' || startsWith(matrix.asset, 'kernel-nvidia-gpu') }}
if: ${{ startsWith(matrix.asset, 'kernel-nvidia-gpu') }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: kata-artifacts-amd64-${{ matrix.asset }}-modules${{ inputs.tarball-suffix }}
path: kata-build/kata-static-${{ matrix.asset }}-modules.tar.zst
name: kata-artifacts-amd64-${{ matrix.asset }}-headers${{ inputs.tarball-suffix }}
path: kata-build/kata-static-${{ matrix.asset }}-headers.tar.zst
retention-days: 15
if-no-files-found: error
@@ -164,8 +171,6 @@ jobs:
- rootfs-image
- rootfs-image-confidential
- rootfs-image-mariner
- rootfs-image-nvidia-gpu
- rootfs-image-nvidia-gpu-confidential
- rootfs-initrd
- rootfs-initrd-confidential
- rootfs-initrd-nvidia-gpu
@@ -235,8 +240,8 @@ jobs:
asset:
- busybox
- coco-guest-components
- kernel-modules
- kernel-nvidia-gpu-modules
- kernel-nvidia-gpu-headers
- kernel-nvidia-gpu-confidential-headers
- pause-image
steps:
- uses: geekyeggo/delete-artifact@f275313e70c08f6120db482d7a6b98377786765b # v5.1.0
@@ -357,104 +362,3 @@ jobs:
path: kata-static.tar.zst
retention-days: 15
if-no-files-found: error
build-tools-asset:
name: build-tools-asset
runs-on: ubuntu-22.04
permissions:
contents: read
packages: write
strategy:
matrix:
asset:
- agent-ctl
- csi-kata-directvolume
- genpolicy
- kata-ctl
- kata-manager
- trace-forwarder
stage:
- ${{ inputs.stage }}
steps:
- name: Login to Kata Containers quay.io
if: ${{ inputs.push-to-registry == 'yes' }}
uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
with:
registry: quay.io
username: ${{ vars.QUAY_DEPLOYER_USERNAME }}
password: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
ref: ${{ inputs.commit-hash }}
fetch-depth: 0 # This is needed in order to keep the commit ids history
persist-credentials: false
- name: Rebase atop of the latest target branch
run: |
./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
env:
TARGET_BRANCH: ${{ inputs.target-branch }}
- name: Build ${{ matrix.asset }}
id: build
run: |
make "${KATA_ASSET}-tarball"
build_dir=$(readlink -f build)
# store-artifact does not work with symlink
mkdir -p kata-tools-build && cp "${build_dir}"/kata-static-"${KATA_ASSET}"*.tar.* kata-tools-build/.
env:
KATA_ASSET: ${{ matrix.asset }}
TAR_OUTPUT: ${{ matrix.asset }}.tar.gz
PUSH_TO_REGISTRY: ${{ inputs.push-to-registry }}
ARTEFACT_REGISTRY: ghcr.io
ARTEFACT_REGISTRY_USERNAME: ${{ github.actor }}
ARTEFACT_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
TARGET_BRANCH: ${{ inputs.target-branch }}
RELEASE: ${{ inputs.stage == 'release' && 'yes' || 'no' }}
- name: store-artifact ${{ matrix.asset }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: kata-tools-artifacts-amd64-${{ matrix.asset }}${{ inputs.tarball-suffix }}
path: kata-tools-build/kata-static-${{ matrix.asset }}.tar.zst
retention-days: 15
if-no-files-found: error
create-kata-tools-tarball:
name: create-kata-tools-tarball
runs-on: ubuntu-22.04
needs: [build-tools-asset]
permissions:
contents: read
packages: write
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
ref: ${{ inputs.commit-hash }}
fetch-depth: 0
fetch-tags: true
persist-credentials: false
- name: Rebase atop of the latest target branch
run: |
./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
env:
TARGET_BRANCH: ${{ inputs.target-branch }}
- name: get-artifacts
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
pattern: kata-tools-artifacts-amd64-*${{ inputs.tarball-suffix }}
path: kata-tools-artifacts
merge-multiple: true
- name: merge-artifacts
run: |
./tools/packaging/kata-deploy/local-build/kata-deploy-merge-builds.sh kata-tools-artifacts versions.yaml kata-tools-static.tar.zst
env:
RELEASE: ${{ inputs.stage == 'release' && 'yes' || 'no' }}
- name: store-artifacts
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: kata-tools-static-tarball-amd64${{ inputs.tarball-suffix }}
path: kata-tools-static.tar.zst
retention-days: 15
if-no-files-found: error

View File

@@ -102,7 +102,7 @@ jobs:
echo "oci-name=${oci_image%@*}" >> "$GITHUB_OUTPUT"
echo "oci-digest=${oci_image#*@}" >> "$GITHUB_OUTPUT"
- uses: oras-project/setup-oras@22ce207df3b08e061f537244349aac6ae1d214f6 # v1.2.4
- uses: oras-project/setup-oras@5c0b487ce3fe0ce3ab0d034e63669e426e294e4d # v1.2.2
if: ${{ env.PERFORM_ATTESTATION == 'yes' }}
with:
version: "1.2.0"
@@ -134,8 +134,8 @@ jobs:
if: ${{ startsWith(matrix.asset, 'kernel-nvidia-gpu') }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: kata-artifacts-arm64-${{ matrix.asset }}-modules${{ inputs.tarball-suffix }}
path: kata-build/kata-static-${{ matrix.asset }}-modules.tar.zst
name: kata-artifacts-arm64-${{ matrix.asset }}-headers${{ inputs.tarball-suffix }}
path: kata-build/kata-static-${{ matrix.asset }}-headers.tar.zst
retention-days: 15
if-no-files-found: error
@@ -150,7 +150,6 @@ jobs:
matrix:
asset:
- rootfs-image
- rootfs-image-nvidia-gpu
- rootfs-initrd
- rootfs-initrd-nvidia-gpu
steps:
@@ -216,7 +215,7 @@ jobs:
matrix:
asset:
- busybox
- kernel-nvidia-gpu-modules
- kernel-nvidia-gpu-headers
steps:
- uses: geekyeggo/delete-artifact@f275313e70c08f6120db482d7a6b98377786765b # v5.1.0
with:

View File

@@ -32,7 +32,7 @@ jobs:
permissions:
contents: read
packages: write
runs-on: ubuntu-24.04-ppc64le
runs-on: ppc64le-small
strategy:
matrix:
asset:
@@ -89,7 +89,7 @@ jobs:
build-asset-rootfs:
name: build-asset-rootfs
runs-on: ubuntu-24.04-ppc64le
runs-on: ppc64le-small
needs: build-asset
permissions:
contents: read
@@ -170,7 +170,7 @@ jobs:
build-asset-shim-v2:
name: build-asset-shim-v2
runs-on: ubuntu-24.04-ppc64le
runs-on: ppc64le-small
needs: [build-asset, build-asset-rootfs, remove-rootfs-binary-artifacts]
permissions:
contents: read
@@ -230,7 +230,7 @@ jobs:
create-kata-tarball:
name: create-kata-tarball
runs-on: ubuntu-24.04-ppc64le
runs-on: ppc64le-small
needs: [build-asset, build-asset-rootfs, build-asset-shim-v2]
permissions:
contents: read

View File

@@ -32,7 +32,7 @@ permissions: {}
jobs:
build-asset:
name: build-asset
runs-on: ubuntu-24.04-s390x
runs-on: s390x
permissions:
contents: read
packages: write
@@ -44,6 +44,7 @@ jobs:
- agent
- coco-guest-components
- kernel
- kernel-confidential
- pause-image
- qemu
- virtiofsd
@@ -120,15 +121,6 @@ jobs:
retention-days: 15
if-no-files-found: error
- name: store-extratarballs-artifact ${{ matrix.asset }}
if: ${{ matrix.asset == 'kernel' }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: kata-artifacts-s390x-${{ matrix.asset }}-modules${{ inputs.tarball-suffix }}
path: kata-build/kata-static-${{ matrix.asset }}-modules.tar.zst
retention-days: 15
if-no-files-found: error
build-asset-rootfs:
name: build-asset-rootfs
runs-on: s390x
@@ -265,7 +257,7 @@ jobs:
build-asset-shim-v2:
name: build-asset-shim-v2
runs-on: ubuntu-24.04-s390x
runs-on: s390x
needs: [build-asset, build-asset-rootfs, remove-rootfs-binary-artifacts]
permissions:
contents: read
@@ -327,7 +319,7 @@ jobs:
create-kata-tarball:
name: create-kata-tarball
runs-on: ubuntu-24.04-s390x
runs-on: s390x
needs:
- build-asset
- build-asset-rootfs

View File

@@ -1,75 +0,0 @@
name: Build kubectl multi-arch image
on:
schedule:
# Run every Sunday at 00:00 UTC
- cron: '0 0 * * 0'
workflow_dispatch:
# Allow manual triggering
push:
branches:
- main
paths:
- 'tools/packaging/kubectl/Dockerfile'
- '.github/workflows/build-kubectl-image.yaml'
permissions: {}
env:
REGISTRY: quay.io
IMAGE_NAME: kata-containers/kubectl
jobs:
build-and-push:
name: Build and push multi-arch image
runs-on: ubuntu-24.04
permissions:
contents: read
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
persist-credentials: false
- name: Set up QEMU
uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
- name: Login to Quay.io
uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
with:
registry: ${{ env.REGISTRY }}
username: ${{ vars.QUAY_DEPLOYER_USERNAME }}
password: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
- name: Get kubectl version
id: kubectl-version
run: |
KUBECTL_VERSION=$(curl -L -s https://dl.k8s.io/release/stable.txt)
echo "version=${KUBECTL_VERSION}" >> "$GITHUB_OUTPUT"
- name: Generate image metadata
id: meta
uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804 # v5.7.0
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
tags: |
type=raw,value=latest
type=raw,value={{date 'YYYYMMDD'}}
type=raw,value=${{ steps.kubectl-version.outputs.version }}
type=sha,prefix=
- name: Build and push multi-arch image
uses: docker/build-push-action@ca052bb54ab0790a636c9b5f226502c73d547a25 # v5.4.0
with:
context: tools/packaging/kubectl/
file: tools/packaging/kubectl/Dockerfile
platforms: linux/amd64,linux/arm64,linux/s390x,linux/ppc64le
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max

View File

@@ -147,7 +147,7 @@ jobs:
tag: ${{ inputs.tag }}-s390x
commit-hash: ${{ inputs.commit-hash }}
target-branch: ${{ inputs.target-branch }}
runner: ubuntu-24.04-s390x
runner: s390x
arch: s390x
secrets:
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
@@ -165,7 +165,7 @@ jobs:
tag: ${{ inputs.tag }}-ppc64le
commit-hash: ${{ inputs.commit-hash }}
target-branch: ${{ inputs.target-branch }}
runner: ubuntu-24.04-ppc64le
runner: ppc64le-small
arch: ppc64le
secrets:
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
@@ -233,14 +233,14 @@ jobs:
env:
TARGET_BRANCH: ${{ inputs.target-branch }}
- name: get-kata-tools-tarball
- name: get-kata-tarball
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: kata-tools-static-tarball-amd64-${{ inputs.tag }}
path: kata-tools-artifacts
name: kata-static-tarball-amd64-${{ inputs.tag }}
path: kata-artifacts
- name: Install kata-tools
run: bash tests/integration/kubernetes/gha-run.sh install-kata-tools kata-tools-artifacts
- name: Install tools
run: bash tests/integration/kubernetes/gha-run.sh install-kata-tools kata-artifacts
- name: Copy binary into Docker context
run: |
@@ -314,7 +314,6 @@ jobs:
needs: publish-kata-deploy-payload-amd64
uses: ./.github/workflows/run-k8s-tests-on-nvidia-gpu.yaml
with:
tarball-suffix: -${{ inputs.tag }}
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-amd64
@@ -474,7 +473,7 @@ jobs:
vmm: ${{ matrix.params.vmm }}
run-cri-containerd-tests-arm64:
if: false
if: ${{ inputs.skip-test != 'yes' }}
needs: build-kata-static-tarball-arm64
strategy:
fail-fast: false

View File

@@ -1,32 +0,0 @@
name: Documentation
on:
push:
branches:
- main
permissions: {}
jobs:
deploy-docs:
name: deploy-docs
permissions:
contents: read
pages: write
id-token: write
environment:
name: github-pages
url: ${{ steps.deployment.outputs.page_url }}
runs-on: ubuntu-latest
steps:
- uses: actions/configure-pages@v5
- uses: actions/checkout@v5
with:
persist-credentials: false
- uses: actions/setup-python@v5
with:
python-version: 3.x
- run: pip install zensical
- run: zensical build --clean
- uses: actions/upload-pages-artifact@v4
with:
path: site
- uses: actions/deploy-pages@v4
id: deployment

View File

@@ -1,29 +0,0 @@
name: EditorConfig checker
on:
pull_request:
permissions: {}
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
editorconfig-checker:
name: editorconfig-checker
runs-on: ubuntu-24.04
steps:
- name: Checkout the code
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
persist-credentials: false
- name: Set up editorconfig-checker
uses: editorconfig-checker/action-editorconfig-checker@4b6cd6190d435e7e084fb35e36a096e98506f7b9 # v2.1.0
with:
version: v3.6.1
- name: Run editorconfig-checker
run: editorconfig-checker

View File

@@ -10,9 +10,7 @@ on:
- opened
- synchronize
- reopened
- edited
- labeled
- unlabeled
permissions: {}

View File

@@ -0,0 +1,43 @@
name: kata-runtime-classes-sync
on:
pull_request:
types:
- opened
- edited
- reopened
- synchronize
permissions: {}
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
kata-deploy-runtime-classes-check:
name: kata-deploy-runtime-classes-check
runs-on: ubuntu-22.04
steps:
- name: Checkout code
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
persist-credentials: false
- name: Ensure the split out runtime classes match the all-in-one file
run: |
pushd tools/packaging/kata-deploy/runtimeclasses/
echo "::group::Combine runtime classes"
for runtimeClass in $(find . -type f \( -name "*.yaml" -and -not -name "kata-runtimeClasses.yaml" \) | sort); do
echo "Adding ${runtimeClass} to the resultingRuntimeClasses.yaml"
cat "${runtimeClass}" >> resultingRuntimeClasses.yaml;
done
echo "::endgroup::"
echo "::group::Displaying the content of resultingRuntimeClasses.yaml"
cat resultingRuntimeClasses.yaml
echo "::endgroup::"
echo ""
echo "::group::Displaying the content of kata-runtimeClasses.yaml"
cat kata-runtimeClasses.yaml
echo "::endgroup::"
echo ""
diff resultingRuntimeClasses.yaml kata-runtimeClasses.yaml

View File

@@ -131,7 +131,7 @@ jobs:
repo: kata-containers/kata-deploy-ci
tag: kata-containers-latest-ppc64le
target-branch: ${{ github.ref_name }}
runner: ubuntu-24.04-ppc64le
runner: ppc64le-small
arch: ppc64le
secrets:
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}

View File

@@ -50,24 +50,6 @@ jobs:
fetch-depth: 0
persist-credentials: false
- name: Remove unnecessary directories to free up space
run: |
sudo rm -rf /usr/local/.ghcup
sudo rm -rf /opt/hostedtoolcache/CodeQL
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
sudo rm -rf /usr/lib/jvm
sudo rm -rf /usr/share/swift
sudo rm -rf /usr/local/share/powershell
sudo rm -rf /usr/local/julia*
sudo rm -rf /opt/az
sudo rm -rf /usr/local/share/chromium
sudo rm -rf /opt/microsoft
sudo rm -rf /opt/google
sudo rm -rf /usr/lib/firefox
- name: Rebase atop of the latest target branch
run: |
./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"

View File

@@ -1,43 +0,0 @@
# Push gperf and busybox tarballs to the ORAS cache (ghcr.io) so that
# download-with-oras-cache.sh can pull them instead of hitting upstream.
# Runs when versions.yaml changes on main (e.g. after a PR merge) or manually.
name: CI | Push ORAS tarball cache
on:
push:
branches:
- main
paths:
- 'versions.yaml'
workflow_dispatch:
permissions: {}
jobs:
push-oras-cache:
name: push-oras-cache
runs-on: ubuntu-22.04
permissions:
contents: read
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
persist-credentials: false
- name: Install yq
run: ./ci/install_yq.sh
- name: Install ORAS
uses: oras-project/setup-oras@22ce207df3b08e061f537244349aac6ae1d214f6 # v1.2.4
with:
version: "1.2.0"
- name: Populate ORAS tarball cache
run: ./tools/packaging/scripts/populate-oras-tarball-cache.sh all
env:
ARTEFACT_REGISTRY: ghcr.io
ARTEFACT_REPOSITORY: kata-containers
ARTEFACT_REGISTRY_USERNAME: ${{ github.actor }}
ARTEFACT_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -31,7 +31,7 @@ jobs:
permissions:
contents: read
packages: write
runs-on: ubuntu-24.04-ppc64le
runs-on: ppc64le-small
steps:
- name: Login to Kata Containers ghcr.io
uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0

View File

@@ -35,7 +35,7 @@ jobs:
permissions:
contents: read
packages: write
runs-on: ubuntu-24.04-s390x
runs-on: s390x
steps:
- name: Login to Kata Containers ghcr.io
uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0

View File

@@ -181,23 +181,6 @@ jobs:
GH_TOKEN: ${{ github.token }}
ARCHITECTURE: ppc64le
- name: Set KATA_TOOLS_STATIC_TARBALL env var
run: |
tarball=$(pwd)/kata-tools-static.tar.zst
echo "KATA_TOOLS_STATIC_TARBALL=${tarball}" >> "$GITHUB_ENV"
- name: Download amd64 tools artifacts
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: kata-tools-static-tarball-amd64
- name: Upload amd64 static tarball tools to GitHub
run: |
./tools/packaging/release/release.sh upload-kata-tools-static-tarball
env:
GH_TOKEN: ${{ github.token }}
ARCHITECTURE: amd64
upload-versions-yaml:
name: upload-versions-yaml
needs: release

View File

@@ -0,0 +1,167 @@
name: CI | Run containerd guest pull stability tests
on:
schedule:
- cron: "0 */1 * * *" #run every hour
permissions: {}
# This job relies on k8s pre-installed using kubeadm
jobs:
run-containerd-guest-pull-stability-tests:
name: run-containerd-guest-pull-stability-tests-${{ matrix.environment.test-type }}-${{ matrix.environment.containerd }}
strategy:
fail-fast: false
matrix:
environment: [
{ test-type: multi-snapshotter, containerd: v2.2 },
{ test-type: force-guest-pull, containerd: v1.7 },
{ test-type: force-guest-pull, containerd: v2.0 },
{ test-type: force-guest-pull, containerd: v2.1 },
{ test-type: force-guest-pull, containerd: v2.2 },
]
env:
# I don't want those to be inside double quotes, so I'm deliberately ignoring the double quotes here.
IMAGES_LIST: quay.io/mongodb/mongodb-community-server@sha256:8b73733842da21b6bbb6df4d7b2449229bb3135d2ec8c6880314d88205772a11 ghcr.io/edgelesssys/redis@sha256:ecb0a964c259a166a1eb62f0eb19621d42bd1cce0bc9bb0c71c828911d4ba93d
runs-on: containerd-${{ matrix.environment.test-type }}-${{ matrix.environment.containerd }}
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
persist-credentials: false
- name: Rotate the journal
run: sudo journalctl --rotate --vacuum-time 1s
- name: Pull the kata-deploy image to be used
run: sudo ctr -n k8s.io image pull quay.io/kata-containers/kata-deploy-ci:kata-containers-latest
- name: Deploy Kata Containers
run: bash tests/integration/kubernetes/gha-run.sh deploy-kata
env:
KATA_HYPERVISOR: qemu-coco-dev
KUBERNETES: vanilla
SNAPSHOTTER: ${{ matrix.environment.test-type == 'multi-snapshotter' && 'nydus' || '' }}
USE_EXPERIMENTAL_SETUP_SNAPSHOTTER: ${{ matrix.environment.test-type == 'multi-snapshotter' }}
EXPERIMENTAL_FORCE_GUEST_PULL: ${{ matrix.environment.test-type == 'force-guest-pull' && 'qemu-coco-dev' || '' }}
# This is needed as we may hit the createContainerTimeout
- name: Adjust Kata Containers' create_container_timeout
run: |
sudo sed -i -e 's/^\(create_container_timeout\).*=.*$/\1 = 600/g' /opt/kata/share/defaults/kata-containers/configuration-qemu-coco-dev.toml
grep "create_container_timeout.*=" /opt/kata/share/defaults/kata-containers/configuration-qemu-coco-dev.toml
# This is needed in order to have enough tmpfs space inside the guest to pull the image
- name: Adjust Kata Containers' default_memory
run: |
sudo sed -i -e 's/^\(default_memory\).*=.*$/\1 = 4096/g' /opt/kata/share/defaults/kata-containers/configuration-qemu-coco-dev.toml
grep "default_memory.*=" /opt/kata/share/defaults/kata-containers/configuration-qemu-coco-dev.toml
- name: Run a few containers using overlayfs
run: |
# I don't want those to be inside double quotes, so I'm deliberately ignoring the double quotes here
# shellcheck disable=SC2086
for img in ${IMAGES_LIST}; do
echo "overlayfs | Using on image: ${img}"
pod="$(echo ${img} | tr ':.@/' '-' | awk '{print substr($0,1,56)}')"
kubectl run "${pod}" \
-it --rm \
--restart=Never \
--image="${img}" \
--image-pull-policy=Always \
--pod-running-timeout=10m \
-- uname -r
done
- name: Run the same few containers using a different snapshotter
run: |
# I don't want those to be inside double quotes, so I'm deliberately ignoring the double quotes here
# shellcheck disable=SC2086
for img in ${IMAGES_LIST}; do
echo "nydus | Using on image: ${img}"
pod="kata-$(echo ${img} | tr ':.@/' '-' | awk '{print substr($0,1,56)}')"
kubectl run "${pod}" \
-it --rm \
--restart=Never \
--image="${img}" \
--image-pull-policy=Always \
--pod-running-timeout=10m \
--overrides='{
"spec": {
"runtimeClassName": "kata-qemu-coco-dev"
}
}' \
-- uname -r
done
- name: Uninstall Kata Containers
run: bash tests/integration/kubernetes/gha-run.sh cleanup
env:
KATA_HYPERVISOR: qemu-coco-dev
KUBERNETES: vanilla
SNAPSHOTTER: nydus
USE_EXPERIMENTAL_SETUP_SNAPSHOTTER: true
- name: Run a few containers using overlayfs
run: |
# I don't want those to be inside double quotes, so I'm deliberately ignoring the double quotes here
# shellcheck disable=SC2086
for img in ${IMAGES_LIST}; do
echo "overlayfs | Using on image: ${img}"
pod="$(echo ${img} | tr ':.@/' '-' | awk '{print substr($0,1,56)}')"
kubectl run "${pod}" \
-it --rm \
--restart=Never \
--image=${img} \
--image-pull-policy=Always \
--pod-running-timeout=10m \
-- uname -r
done
- name: Deploy Kata Containers
run: bash tests/integration/kubernetes/gha-run.sh deploy-kata
env:
KATA_HYPERVISOR: qemu-coco-dev
KUBERNETES: vanilla
SNAPSHOTTER: nydus
USE_EXPERIMENTAL_SETUP_SNAPSHOTTER: true
# This is needed as we may hit the createContainerTimeout
- name: Adjust Kata Containers' create_container_timeout
run: |
sudo sed -i -e 's/^\(create_container_timeout\).*=.*$/\1 = 600/g' /opt/kata/share/defaults/kata-containers/configuration-qemu-coco-dev.toml
grep "create_container_timeout.*=" /opt/kata/share/defaults/kata-containers/configuration-qemu-coco-dev.toml
# This is needed in order to have enough tmpfs space inside the guest to pull the image
- name: Adjust Kata Containers' default_memory
run: |
sudo sed -i -e 's/^\(default_memory\).*=.*$/\1 = 4096/g' /opt/kata/share/defaults/kata-containers/configuration-qemu-coco-dev.toml
grep "default_memory.*=" /opt/kata/share/defaults/kata-containers/configuration-qemu-coco-dev.toml
- name: Run the same few containers using a different snapshotter
run: |
# I don't want those to be inside double quotes, so I'm deliberately ignoring the double quotes here
# shellcheck disable=SC2086
for img in ${IMAGES_LIST}; do
echo "nydus | Using on image: ${img}"
pod="kata-$(echo ${img} | tr ':.@/' '-' | awk '{print substr($0,1,56)}')"
kubectl run "${pod}" \
-it --rm \
--restart=Never \
--image="${img}" \
--image-pull-policy=Always \
--pod-running-timeout=10m \
--overrides='{
"spec": {
"runtimeClassName": "kata-qemu-coco-dev"
}
}' \
-- uname -r
done
- name: Uninstall Kata Containers
run: bash tests/integration/kubernetes/gha-run.sh cleanup || true
if: always()
env:
KATA_HYPERVISOR: qemu-coco-dev
KUBERNETES: vanilla
SNAPSHOTTER: nydus
USE_EXPERIMENTAL_SETUP_SNAPSHOTTER: true

View File

@@ -93,14 +93,14 @@ jobs:
env:
TARGET_BRANCH: ${{ inputs.target-branch }}
- name: get-kata-tools-tarball
- name: get-kata-tarball
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: kata-tools-static-tarball-amd64${{ inputs.tarball-suffix }}
path: kata-tools-artifacts
name: kata-static-tarball-amd64${{ inputs.tarball-suffix }}
path: kata-artifacts
- name: Install kata-tools
run: bash tests/integration/kubernetes/gha-run.sh install-kata-tools kata-tools-artifacts
- name: Install kata
run: bash tests/integration/kubernetes/gha-run.sh install-kata-tools kata-artifacts
- name: Download Azure CLI
uses: azure/setup-kubectl@776406bce94f63e41d621b960d78ee25c8b76ede # v4.0.1
@@ -142,10 +142,6 @@ jobs:
timeout-minutes: 60
run: bash tests/integration/kubernetes/gha-run.sh run-tests
- name: Report tests
if: always()
run: bash tests/integration/kubernetes/gha-run.sh report-tests
- name: Refresh OIDC token in case access token expired
if: always()
uses: azure/login@a457da9ea143d694b1b9c7c869ebb04ebe844ef5 # v2.3.0

View File

@@ -32,7 +32,6 @@ jobs:
matrix:
vmm:
- qemu
- qemu-runtime-rs
k8s:
- kubeadm
runs-on: arm64-k8s
@@ -69,10 +68,6 @@ jobs:
timeout-minutes: 30
run: bash tests/integration/kubernetes/gha-run.sh run-tests
- name: Report tests
if: always()
run: bash tests/integration/kubernetes/gha-run.sh report-tests
- name: Collect artifacts ${{ matrix.vmm }}
if: always()
run: bash tests/integration/kubernetes/gha-run.sh collect-artifacts

View File

@@ -1,10 +1,7 @@
name: CI | Run NVIDIA GPU kubernetes tests on amd64
name: CI | Run NVIDIA GPU kubernetes tests on arm64
on:
workflow_call:
inputs:
tarball-suffix:
required: true
type: string
registry:
required: true
type: string
@@ -48,7 +45,6 @@ jobs:
GH_PR_NUMBER: ${{ inputs.pr-number }}
KATA_HYPERVISOR: ${{ matrix.environment.vmm }}
KUBERNETES: kubeadm
KBS: ${{ matrix.environment.name == 'nvidia-gpu-snp' && 'true' || 'false' }}
K8S_TEST_HOST_TYPE: baremetal
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -63,15 +59,6 @@ jobs:
env:
TARGET_BRANCH: ${{ inputs.target-branch }}
- name: get-kata-tools-tarball
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: kata-tools-static-tarball-amd64${{ inputs.tarball-suffix }}
path: kata-tools-artifacts
- name: Install kata-tools
run: bash tests/integration/kubernetes/gha-run.sh install-kata-tools kata-tools-artifacts
- name: Uninstall previous `kbs-client`
if: matrix.environment.name != 'nvidia-gpu'
timeout-minutes: 10
@@ -102,11 +89,6 @@ jobs:
run: bash tests/integration/kubernetes/gha-run.sh run-nv-tests
env:
NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
- name: Report tests
if: always()
run: bash tests/integration/kubernetes/gha-run.sh report-tests
- name: Collect artifacts ${{ matrix.environment.vmm }}
if: always()
run: bash tests/integration/kubernetes/gha-run.sh collect-artifacts
@@ -126,6 +108,5 @@ jobs:
- name: Delete CoCo KBS
if: always() && matrix.environment.name != 'nvidia-gpu'
timeout-minutes: 10
run: |
bash tests/integration/kubernetes/gha-run.sh delete-coco-kbs

View File

@@ -75,7 +75,3 @@ jobs:
- name: Run tests
timeout-minutes: 30
run: bash tests/integration/kubernetes/gha-run.sh run-tests
- name: Report tests
if: always()
run: bash tests/integration/kubernetes/gha-run.sh report-tests

View File

@@ -131,18 +131,12 @@ jobs:
timeout-minutes: 60
run: bash tests/integration/kubernetes/gha-run.sh run-tests
- name: Report tests
if: always()
run: bash tests/integration/kubernetes/gha-run.sh report-tests
- name: Delete kata-deploy
if: always()
timeout-minutes: 10
run: bash tests/integration/kubernetes/gha-run.sh cleanup-zvsi
- name: Delete CoCo KBS
if: always()
timeout-minutes: 10
run: |
if [ "${KBS}" == "true" ]; then
bash tests/integration/kubernetes/gha-run.sh delete-coco-kbs

View File

@@ -46,7 +46,6 @@ jobs:
matrix:
vmm:
- qemu-coco-dev
- qemu-coco-dev-runtime-rs
snapshotter:
- nydus
pull-type:
@@ -84,14 +83,14 @@ jobs:
env:
TARGET_BRANCH: ${{ inputs.target-branch }}
- name: get-kata-tools-tarball
- name: get-kata-tarball
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: kata-tools-static-tarball-amd64${{ inputs.tarball-suffix }}
path: kata-tools-artifacts
name: kata-static-tarball-amd64${{ inputs.tarball-suffix }}
path: kata-artifacts
- name: Install kata-tools
run: bash tests/integration/kubernetes/gha-run.sh install-kata-tools kata-tools-artifacts
- name: Install kata
run: bash tests/integration/kubernetes/gha-run.sh install-kata-tools kata-artifacts
- name: Log into the Azure account
uses: azure/login@a457da9ea143d694b1b9c7c869ebb04ebe844ef5 # v2.3.0
@@ -140,10 +139,6 @@ jobs:
timeout-minutes: 300
run: bash tests/stability/gha-stability-run.sh run-tests
- name: Report tests
if: always()
run: bash tests/integration/kubernetes/gha-run.sh report-tests
- name: Refresh OIDC token in case access token expired
if: always()
uses: azure/login@a457da9ea143d694b1b9c7c869ebb04ebe844ef5 # v2.3.0

View File

@@ -79,15 +79,6 @@ jobs:
env:
TARGET_BRANCH: ${{ inputs.target-branch }}
- name: get-kata-tools-tarball
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: kata-tools-static-tarball-amd64${{ inputs.tarball-suffix }}
path: kata-tools-artifacts
- name: Install kata-tools
run: bash tests/integration/kubernetes/gha-run.sh install-kata-tools kata-tools-artifacts
- name: Deploy Kata
timeout-minutes: 20
run: bash tests/integration/kubernetes/gha-run.sh deploy-kata
@@ -120,12 +111,10 @@ jobs:
- name: Delete kata-deploy
if: always()
timeout-minutes: 15
run: bash tests/integration/kubernetes/gha-run.sh cleanup
- name: Delete CoCo KBS
if: always()
timeout-minutes: 10
run: |
[[ "${KATA_HYPERVISOR}" == "qemu-tdx" ]] && echo "ITA_KEY=${GH_ITA_KEY}" >> "${GITHUB_ENV}"
bash tests/integration/kubernetes/gha-run.sh delete-coco-kbs
@@ -170,7 +159,6 @@ jobs:
AUTHENTICATED_IMAGE_USER: ${{ vars.AUTHENTICATED_IMAGE_USER }}
AUTHENTICATED_IMAGE_PASSWORD: ${{ secrets.AUTHENTICATED_IMAGE_PASSWORD }}
SNAPSHOTTER: ${{ matrix.snapshotter }}
EXPERIMENTAL_FORCE_GUEST_PULL: ${{ matrix.pull-type == 'experimental-force-guest-pull' && matrix.vmm || '' }}
# Caution: current ingress controller used to expose the KBS service
# requires much vCPUs, lefting only a few for the tests. Depending on the
# host type chose it will result on the creation of a cluster with
@@ -189,14 +177,14 @@ jobs:
env:
TARGET_BRANCH: ${{ inputs.target-branch }}
- name: get-kata-tools-tarball
- name: get-kata-tarball
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: kata-tools-static-tarball-amd64${{ inputs.tarball-suffix }}
path: kata-tools-artifacts
name: kata-static-tarball-amd64${{ inputs.tarball-suffix }}
path: kata-artifacts
- name: Install kata-tools
run: bash tests/integration/kubernetes/gha-run.sh install-kata-tools kata-tools-artifacts
- name: Install kata
run: bash tests/integration/kubernetes/gha-run.sh install-kata-tools kata-artifacts
- name: Log into the Azure account
uses: azure/login@a457da9ea143d694b1b9c7c869ebb04ebe844ef5 # v2.3.0
@@ -229,6 +217,7 @@ jobs:
timeout-minutes: 20
run: bash tests/integration/kubernetes/gha-run.sh deploy-kata-aks
env:
EXPERIMENTAL_FORCE_GUEST_PULL: ${{ env.PULL_TYPE == 'experimental-force-guest-pull' && env.KATA_HYPERVISOR || '' }}
USE_EXPERIMENTAL_SETUP_SNAPSHOTTER: ${{ env.SNAPSHOTTER == 'nydus' }}
AUTO_GENERATE_POLICY: ${{ env.PULL_TYPE == 'experimental-force-guest-pull' && 'no' || 'yes' }}
@@ -312,15 +301,6 @@ jobs:
env:
TARGET_BRANCH: ${{ inputs.target-branch }}
- name: get-kata-tools-tarball
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: kata-tools-static-tarball-amd64${{ inputs.tarball-suffix }}
path: kata-tools-artifacts
- name: Install kata-tools
run: bash tests/integration/kubernetes/gha-run.sh install-kata-tools kata-tools-artifacts
- name: Remove unnecessary directories to free up space
run: |
sudo rm -rf /usr/local/.ghcup
@@ -329,6 +309,7 @@ jobs:
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/lib/jvm
sudo rm -rf /usr/share/swift
sudo rm -rf /usr/local/share/powershell

View File

@@ -102,10 +102,6 @@ jobs:
- name: Run tests
run: bash tests/functional/kata-deploy/gha-run.sh run-tests
- name: Report tests
if: always()
run: bash tests/integration/kubernetes/gha-run.sh report-tests
- name: Refresh OIDC token in case access token expired
if: always()
uses: azure/login@a457da9ea143d694b1b9c7c869ebb04ebe844ef5 # v2.3.0

View File

@@ -66,6 +66,7 @@ jobs:
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf /usr/lib/jvm
sudo rm -rf /usr/share/swift
sudo rm -rf /usr/local/share/powershell
@@ -84,7 +85,3 @@ jobs:
- name: Run tests
run: bash tests/functional/kata-deploy/gha-run.sh run-tests
- name: Report tests
if: always()
run: bash tests/functional/kata-deploy/gha-run.sh report-tests

.github/workflows/run-runk-tests.yaml (new file, 54 lines)
View File

@@ -0,0 +1,54 @@
name: CI | Run runk tests
on:
workflow_call:
inputs:
tarball-suffix:
required: false
type: string
commit-hash:
required: false
type: string
target-branch:
required: false
type: string
default: ""
permissions: {}
jobs:
run-runk:
name: run-runk
# Skip runk tests as we have no maintainers. TODO: Decide when to remove altogether
if: false
runs-on: ubuntu-22.04
env:
CONTAINERD_VERSION: lts
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
ref: ${{ inputs.commit-hash }}
fetch-depth: 0
persist-credentials: false
- name: Rebase atop of the latest target branch
run: |
./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
env:
TARGET_BRANCH: ${{ inputs.target-branch }}
- name: Install dependencies
run: bash tests/integration/runk/gha-run.sh install-dependencies
env:
GH_TOKEN: ${{ github.token }}
- name: get-kata-tarball
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: kata-static-tarball-amd64${{ inputs.tarball-suffix }}
path: kata-artifacts
- name: Install kata
run: bash tests/integration/runk/gha-run.sh install-kata kata-artifacts
- name: Run runk tests
run: bash tests/integration/runk/gha-run.sh run

View File

@@ -6,21 +6,14 @@ on:
permissions: {}
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
stale:
name: stale
runs-on: ubuntu-22.04
permissions:
actions: write # Needed to manage caches for state persistence across runs
pull-requests: write # Needed to add/remove labels, post comments, or close PRs
steps:
- uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
with:
stale-pr-message: 'This PR has been opened without activity for 180 days. Please comment on the issue or it will be closed in 7 days.'
stale-pr-message: 'This PR has been opened without with no activity for 180 days. Comment on the issue otherwise it will be closed in 7 days'
days-before-pr-stale: 180
days-before-pr-close: 7
days-before-issue-stale: -1

View File

@@ -29,7 +29,7 @@ jobs:
matrix:
instance:
- "ubuntu-24.04-arm"
- "ubuntu-24.04-s390x"
- "s390x"
- "ubuntu-24.04-ppc64le"
uses: ./.github/workflows/build-checks.yaml
with:

View File

@@ -21,7 +21,7 @@ jobs:
persist-credentials: false
- name: Run zizmor
uses: zizmorcore/zizmor-action@135698455da5c3b3e55f73f4419e481ab68cdd95 # v0.4.1
uses: zizmorcore/zizmor-action@e673c3917a1aef3c65c972347ed84ccd013ecda4 # v0.2.0
with:
advanced-security: false
annotations: true

.gitignore (2 lines changed)
View File

@@ -18,5 +18,3 @@ src/tools/log-parser/kata-log-parser
tools/packaging/static-build/agent/install_libseccomp.sh
.envrc
.direnv
**/.DS_Store
site/

View File

@@ -1,140 +0,0 @@
[workspace.package]
authors = ["The Kata Containers community <kata-dev@lists.katacontainers.io>"]
edition = "2018"
license = "Apache-2.0"
rust-version = "1.88"
[workspace]
members = [
# Dragonball
"src/dragonball",
"src/dragonball/dbs_acpi",
"src/dragonball/dbs_address_space",
"src/dragonball/dbs_allocator",
"src/dragonball/dbs_arch",
"src/dragonball/dbs_boot",
"src/dragonball/dbs_device",
"src/dragonball/dbs_interrupt",
"src/dragonball/dbs_legacy_devices",
"src/dragonball/dbs_pci",
"src/dragonball/dbs_tdx",
"src/dragonball/dbs_upcall",
"src/dragonball/dbs_utils",
"src/dragonball/dbs_virtio_devices",
# runtime-rs
"src/runtime-rs",
"src/runtime-rs/crates/agent",
"src/runtime-rs/crates/hypervisor",
"src/runtime-rs/crates/persist",
"src/runtime-rs/crates/resource",
"src/runtime-rs/crates/runtimes",
"src/runtime-rs/crates/service",
"src/runtime-rs/crates/shim",
"src/runtime-rs/crates/shim-ctl",
"src/runtime-rs/tests/utils",
]
resolver = "2"
# TODO: Add all excluded crates to root workspace
exclude = [
"src/agent",
"src/tools",
"src/libs",
# kata-deploy binary is standalone and has its own Cargo.toml for now
"tools/packaging/kata-deploy/binary",
# We are cloning and building rust packages under
# "tools/packaging/kata-deploy/local-build/build" folder, which may mislead
# those packages to think they are part of the kata root workspace
"tools/packaging/kata-deploy/local-build/build",
]
[workspace.dependencies]
# Rust-VMM crates
event-manager = "0.2.1"
kvm-bindings = "0.6.0"
kvm-ioctls = "=0.12.1"
linux-loader = "0.8.0"
seccompiler = "0.5.0"
vfio-bindings = "0.3.0"
vfio-ioctls = "0.1.0"
virtio-bindings = "0.1.0"
virtio-queue = "0.7.0"
vm-fdt = "0.2.0"
vm-memory = "0.10.0"
vm-superio = "0.5.0"
vmm-sys-util = "0.11.0"
# Local dependencies from Dragonball Sandbox crates
dragonball = { path = "src/dragonball" }
dbs-acpi = { path = "src/dragonball/dbs_acpi" }
dbs-address-space = { path = "src/dragonball/dbs_address_space" }
dbs-allocator = { path = "src/dragonball/dbs_allocator" }
dbs-arch = { path = "src/dragonball/dbs_arch" }
dbs-boot = { path = "src/dragonball/dbs_boot" }
dbs-device = { path = "src/dragonball/dbs_device" }
dbs-interrupt = { path = "src/dragonball/dbs_interrupt" }
dbs-legacy-devices = { path = "src/dragonball/dbs_legacy_devices" }
dbs-pci = { path = "src/dragonball/dbs_pci" }
dbs-tdx = { path = "src/dragonball/dbs_tdx" }
dbs-upcall = { path = "src/dragonball/dbs_upcall" }
dbs-utils = { path = "src/dragonball/dbs_utils" }
dbs-virtio-devices = { path = "src/dragonball/dbs_virtio_devices" }
# Local dependencies from runtime-rs
agent = { path = "src/runtime-rs/crates/agent" }
hypervisor = { path = "src/runtime-rs/crates/hypervisor" }
persist = { path = "src/runtime-rs/crates/persist" }
resource = { path = "src/runtime-rs/crates/resource" }
runtimes = { path = "src/runtime-rs/crates/runtimes" }
service = { path = "src/runtime-rs/crates/service" }
tests_utils = { path = "src/runtime-rs/tests/utils" }
ch-config = { path = "src/runtime-rs/crates/hypervisor/ch-config" }
common = { path = "src/runtime-rs/crates/runtimes/common" }
linux_container = { path = "src/runtime-rs/crates/runtimes/linux_container" }
virt_container = { path = "src/runtime-rs/crates/runtimes/virt_container" }
wasm_container = { path = "src/runtime-rs/crates/runtimes/wasm_container" }
# Local dependencies from `src/lib`
kata-sys-util = { path = "src/libs/kata-sys-util" }
kata-types = { path = "src/libs/kata-types", features = ["safe-path"] }
logging = { path = "src/libs/logging" }
protocols = { path = "src/libs/protocols", features = ["async"] }
runtime-spec = { path = "src/libs/runtime-spec" }
safe-path = { path = "src/libs/safe-path" }
shim-interface = { path = "src/libs/shim-interface" }
test-utils = { path = "src/libs/test-utils" }
# Outside dependencies
actix-rt = "2.7.0"
anyhow = "1.0"
async-trait = "0.1.48"
containerd-shim = { version = "0.10.0", features = ["async"] }
containerd-shim-protos = { version = "0.10.0", features = ["async"] }
go-flag = "0.1.0"
hyper = "0.14.20"
hyperlocal = "0.8.0"
lazy_static = "1.4"
libc = "0.2"
log = "0.4.14"
netns-rs = "0.1.0"
# Note: nix needs to stay sync'd with libs versions
nix = "0.26.4"
oci-spec = { version = "0.8.1", features = ["runtime"] }
protobuf = "3.7.2"
rand = "0.8.4"
serde = { version = "1.0.145", features = ["derive"] }
serde_json = "1.0.91"
sha2 = "0.10.9"
slog = "2.5.2"
slog-scope = "4.4.0"
strum = { version = "0.24.0", features = ["derive"] }
tempfile = "3.19.1"
thiserror = "1.0"
tokio = "1.46.1"
tracing = "0.1.41"
tracing-opentelemetry = "0.18.0"
ttrpc = "0.8.4"
url = "2.5.4"

View File

@@ -18,6 +18,7 @@ TOOLS =
TOOLS += agent-ctl
TOOLS += kata-ctl
TOOLS += log-parser
TOOLS += runk
TOOLS += trace-forwarder
STANDARD_TARGETS = build check clean install static-checks-build test vendor
@@ -47,10 +48,7 @@ docs-url-alive-check:
bash ci/docs-url-alive-check.sh
build-and-publish-kata-debug:
bash tools/packaging/kata-debug/kata-debug-build-and-upload-payload.sh ${KATA_DEBUG_REGISTRY} ${KATA_DEBUG_TAG}
docs-serve:
docker run --rm -p 8000:8000 -v ./docs:/docs:ro -v ${PWD}/zensical.toml:/zensical.toml:ro zensical/zensical serve --config-file /zensical.toml -a 0.0.0.0:8000
bash tools/packaging/kata-debug/kata-debug-build-and-upload-payload.sh ${KATA_DEBUG_REGISTRY} ${KATA_DEBUG_TAG}
.PHONY: \
all \
@@ -58,5 +56,4 @@ docs-serve:
install-tarball \
default \
static-checks \
docs-url-alive-check \
docs-serve
docs-url-alive-check

View File

@@ -139,6 +139,7 @@ The table below lists the remaining parts of the project:
| [`agent-ctl`](src/tools/agent-ctl) | utility | Tool that provides low-level access for testing the agent. |
| [`kata-ctl`](src/tools/kata-ctl) | utility | Tool that provides advanced commands and debug facilities. |
| [`trace-forwarder`](src/tools/trace-forwarder) | utility | Agent tracing helper. |
| [`runk`](src/tools/runk) | utility | Standard OCI container runtime based on the agent. |
| [`ci`](.github/workflows) | CI | Continuous Integration configuration files and scripts. |
| [`ocp-ci`](ci/openshift-ci/README.md) | CI | Continuous Integration configuration for the OpenShift pipelines. |
| [`katacontainers.io`](https://github.com/kata-containers/www.katacontainers.io) | Source for the [`katacontainers.io`](https://www.katacontainers.io) site. |

View File

@@ -1 +1 @@
3.27.0
3.22.0

View File

@@ -11,10 +11,6 @@ script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${script_dir}/../tests/common.bash"
# Path to the ORAS cache helper for downloading tarballs (sourced when needed)
# Use ORAS_CACHE_HELPER env var (set by build.sh in Docker) or fallback to repo path
oras_cache_helper="${ORAS_CACHE_HELPER:-${script_dir}/../tools/packaging/scripts/download-with-oras-cache.sh}"
# The following variables if set on the environment will change the behavior
# of gperf and libseccomp configure scripts, that may lead this script to
# fail. So let's ensure they are unset here.
@@ -48,9 +44,6 @@ fi
gperf_tarball="gperf-${gperf_version}.tar.gz"
gperf_tarball_url="${gperf_url}/${gperf_tarball}"
# Use ORAS cache for gperf downloads (gperf upstream can be unreliable)
USE_ORAS_CACHE="${USE_ORAS_CACHE:-yes}"
# We need to build the libseccomp library from sources to create a static
# library for the musl libc.
# However, ppc64le, riscv64 and s390x have no musl targets in Rust. Hence, we do
@@ -75,23 +68,7 @@ trap finish EXIT
build_and_install_gperf() {
echo "Build and install gperf version ${gperf_version}"
mkdir -p "${gperf_install_dir}"
# Use ORAS cache if available and enabled
if [[ "${USE_ORAS_CACHE}" == "yes" ]] && [[ -f "${oras_cache_helper}" ]]; then
echo "Using ORAS cache for gperf download"
source "${oras_cache_helper}"
local cached_tarball
cached_tarball=$(download_component gperf "$(pwd)")
if [[ -f "${cached_tarball}" ]]; then
gperf_tarball="${cached_tarball}"
else
echo "ORAS cache download failed, falling back to direct download"
curl -sLO "${gperf_tarball_url}"
fi
else
curl -sLO "${gperf_tarball_url}"
fi
curl -sLO "${gperf_tarball_url}"
tar -xf "${gperf_tarball}"
pushd "gperf-${gperf_version}"
# Unset $CC for configure, we will always use native for gperf

View File

@@ -73,12 +73,12 @@ function install_yq() {
goarch=arm64
;;
"arm64")
# If we're on an apple silicon machine, just assign amd64.
# The version of yq we use doesn't have a darwin arm build,
# If we're on an apple silicon machine, just assign amd64.
# The version of yq we use doesn't have a darwin arm build,
# but Rosetta can come to the rescue here.
if [[ ${goos} == "Darwin" ]]; then
goarch=amd64
else
else
goarch=arm64
fi
;;

View File

@@ -46,12 +46,16 @@ fi
[[ ${SELINUX_PERMISSIVE} == "yes" ]] && oc delete -f "${deployments_dir}/machineconfig_selinux.yaml.in"
# Delete kata-containers
helm uninstall kata-deploy --wait --namespace kube-system
pushd "${katacontainers_repo_dir}/tools/packaging/kata-deploy" || { echo "Failed to push to ${katacontainers_repo_dir}/tools/packaging/kata-deploy"; exit 125; }
oc delete -f kata-deploy/base/kata-deploy.yaml
oc -n kube-system wait --timeout=10m --for=delete -l name=kata-deploy pod
oc apply -f kata-cleanup/base/kata-cleanup.yaml
echo "Wait for all related pods to be gone"
( repeats=1; for _ in $(seq 1 600); do
oc get pods -l name="kubelet-kata-cleanup" --no-headers=true -n kube-system 2>&1 | grep "No resources found" -q && ((repeats++)) || repeats=1
[[ "${repeats}" -gt 5 ]] && echo kata-cleanup finished && break
sleep 1
done) || { echo "There are still some kata-cleanup related pods after 600 iterations"; oc get all -n kube-system; exit 1; }
oc delete -f kata-cleanup/base/kata-cleanup.yaml
oc delete -f kata-rbac/base/kata-rbac.yaml
oc delete -f runtimeclasses/kata-runtimeClasses.yaml

View File

@@ -51,13 +51,13 @@ apply_kata_deploy() {
oc label --overwrite ns kube-system pod-security.kubernetes.io/enforce=privileged pod-security.kubernetes.io/warn=baseline pod-security.kubernetes.io/audit=baseline
local version chart
version='0.0.0-dev'
version=$(curl -sSL https://api.github.com/repos/kata-containers/kata-containers/releases/latest | jq .tag_name | tr -d '"')
chart="oci://ghcr.io/kata-containers/kata-deploy-charts/kata-deploy"
# Ensure any potential leftover is cleaned up ... and this secret usually is not in case of previous failures
oc delete secret sh.helm.release.v1.kata-deploy.v1 -n kube-system || true
echo "Installing kata using helm ${chart} ${version} (sha printed in helm output)"
echo "Installing kata using helm ${chart} ${version}"
helm install kata-deploy --wait --namespace kube-system --set "image.reference=${KATA_DEPLOY_IMAGE%%:*},image.tag=${KATA_DEPLOY_IMAGE##*:}" "${chart}" --version "${version}"
}

View File

@@ -157,16 +157,6 @@ if [[ -z "${CAA_IMAGE}" ]]; then
fi
# Get latest PP image
#
# You can list the CI images by:
# az sig image-version list-community --location "eastus" --public-gallery-name "cocopodvm-d0e4f35f-5530-4b9c-8596-112487cdea85" --gallery-image-definition "podvm_image0" --output table
# or the release images by:
# az sig image-version list-community --location "eastus" --public-gallery-name "cococommunity-42d8482d-92cd-415b-b332-7648bd978eff" --gallery-image-definition "peerpod-podvm-fedora" --output table
# or the release debug images by:
# az sig image-version list-community --location "eastus" --public-gallery-name "cococommunity-42d8482d-92cd-415b-b332-7648bd978eff" --gallery-image-definition "peerpod-podvm-fedora-debug" --output table
#
# Note there are other flavours of the released images, you can list them by:
# az sig image-definition list-community --location "eastus" --public-gallery-name "cococommunity-42d8482d-92cd-415b-b332-7648bd978eff" --output table
if [[ -z "${PP_IMAGE_ID}" ]]; then
SUCCESS_TIME=$(curl -s \
-H "Accept: application/vnd.github+json" \

View File

@@ -83,4 +83,4 @@ files to the repository and create a pull request when you are ready.
If you have an idea for a blog post and would like to get feedback from the
community about it or have any questions about the process, please reach out
on one of the community's [communication channels](https://katacontainers.io/community/).
on one of the community's [communication channels](https://katacontainers.io/community/).

View File

@@ -125,7 +125,7 @@ If you want to enable SELinux in Permissive mode, add `enforcing=0` to the kerne
Enable full debug as follows:
```bash
$ sudo sed -i -E 's/^(\s*enable_debug\s*=\s*)false/\1true/' /etc/kata-containers/configuration.toml
$ sudo sed -i -e 's/^# *\(enable_debug\).*=.*$/\1 = true/g' /etc/kata-containers/configuration.toml
$ sudo sed -i -e 's/^kernel_params = "\(.*\)"/kernel_params = "\1 agent.log=debug initcall_debug"/g' /etc/kata-containers/configuration.toml
```
@@ -289,14 +289,14 @@ provided by your distribution.
As a prerequisite, you need to install Docker. Otherwise, you will not be
able to run the `rootfs.sh` script with `USE_DOCKER=true` as expected in
the following example. Specifying the `OS_VERSION` is required when using `distro="ubuntu"`.
the following example.
```bash
$ export distro="ubuntu" # example
$ export ROOTFS_DIR="$(realpath kata-containers/tools/osbuilder/rootfs-builder/rootfs)"
$ sudo rm -rf "${ROOTFS_DIR}"
$ pushd kata-containers/tools/osbuilder/rootfs-builder
$ script -fec 'sudo -E USE_DOCKER=true OS_VERSION=noble ./rootfs.sh "${distro}"'
$ script -fec 'sudo -E USE_DOCKER=true ./rootfs.sh "${distro}"'
$ popd
```

View File

@@ -206,7 +206,7 @@ For security reasons, the following mounts are disallowed:
| `proc \|\| sysfs` | `*` | not a directory (e.g. symlink) | CVE-2019-19921 |
For bind mounts under /proc, these destinations are allowed:
* `/proc/cpuinfo`
* `/proc/diskstats`
* `/proc/meminfo`
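
For illustration only (this is not part of the diff): a minimal sketch of how one of the allowed `/proc` bind mounts could appear when added to an OCI `config.json` with `jq`; the tool and the file location are assumptions.

```bash
# Illustrative sketch only: append an allowed /proc bind mount (/proc/cpuinfo)
# to the mounts array of an OCI config.json. jq and the file path are assumptions.
jq '.mounts += [{"destination": "/proc/cpuinfo",
                 "source": "/proc/cpuinfo",
                 "type": "bind",
                 "options": ["rbind", "ro"]}]' config.json > config.tmp && mv config.tmp config.json
```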

View File

@@ -83,7 +83,3 @@ Documents that help to understand and contribute to Kata Containers.
If you have a suggestion for how we can improve the
[website](https://katacontainers.io), please raise an issue (or a PR) on
[the repository that holds the source for the website](https://github.com/OpenStackweb/kata-netlify-refresh).
### Toolchain Guidance
* [Toolchain Guidance](./Toochain-Guidance.md)

View File

@@ -1,39 +0,0 @@
# Toolchains
As a community we want to strike a balance between having up-to-date toolchains, to receive the
latest security fixes and to be able to benefit from new features and packages, whilst not being
too bleeding edge and disrupting downstream and other consumers. As a result we have the following
guidelines (note, not hard rules) for our go and rust toolchains that we are attempting to try out:
## Go toolchain
Go is released [every six months](https://go.dev/wiki/Go-Release-Cycle) with support for the
[last two major release versions](https://go.dev/doc/devel/release#policy). We always want to
ensure that we are on a supported version so we receive security fixes. To try and make
things easier for some of our users, we aim to be using the older of the two supported major
versions, unless there is a compelling reason to adopt the newer version.
In practice this means that we bump our major version of the go toolchain every six months to
version (1.x-1) in response to a new version (1.x) coming out, which makes our current version
(1.x-2) no longer supported. We will bump the minor version whenever required to satisfy
dependency updates, or security fixes.
Our go toolchain version is recorded in [`versions.yaml`](../versions.yaml) under
`.languages.golang.version` and should match with the version in our `go.mod` files.
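As an illustrative sketch (assuming `yq` v4 is available), the pinned version can be read straight out of `versions.yaml`:

```bash
$ yq '.languages.golang.version' versions.yaml
```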
## Rust toolchain
Rust has a [six week](https://doc.rust-lang.org/book/appendix-05-editions.html#:~:text=The%20Rust%20language%20and%20compiler,these%20tiny%20changes%20add%20up.)
release cycle and they only support the latest stable release, so if we wanted to remain on a
supported release we would only ever build with the latest stable and bump every 6 weeks.
However feedback from our community has indicated that this is a challenge as downstream consumers
often want to get rust from their distro, or downstream fork and these struggle to keep up with
the six week release schedule. As a result the community has agreed to try out a policy of
"stable-2", where we aim to build with a rust version that is two versions behind the latest stable
version.
In practice this should mean that we bump our rust toolchain every six weeks, to version
1.x-2 when 1.x is released as stable and we should be picking up the latest point release
of that version, if there were any.
The rust-toolchain that we are using is recorded in [`rust-toolchain.toml`](../rust-toolchain.toml).
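For a quick local check of what that pin resolves to, one possibility (assuming `rustup` is installed) is:

```bash
$ cd kata-containers
$ rustup show active-toolchain
```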

View File

@@ -198,7 +198,7 @@ fn join_params_with_dash(str: &str, num: i32) -> Result<String> {
return Err("number must be positive");
}
let result = format!("{str}-{num}");
let result = format!("{}-{}", str, num);
Ok(result)
}
@@ -253,13 +253,13 @@ mod tests {
// Run the tests
for (i, d) in tests.iter().enumerate() {
// Create a string containing details of the test
let msg = format!("test[{i}]: {d:?}");
let msg = format!("test[{}]: {:?}", i, d);
// Call the function under test
let result = join_params_with_dash(d.str, d.num);
// Update the test details string with the results of the call
let msg = format!("{msg}, result: {result:?}");
let msg = format!("{}, result: {:?}", msg, result);
// Perform the checks
if d.result.is_ok() {
@@ -267,8 +267,8 @@ mod tests {
continue;
}
let expected_error = format!("{d.result.as_ref().unwrap_err()}");
let actual_error = format!("{result.unwrap_err()}");
let expected_error = format!("{}", d.result.as_ref().unwrap_err());
let actual_error = format!("{}", result.unwrap_err());
assert!(actual_error == expected_error, msg);
}
}

View File

@@ -1,9 +0,0 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 32 32">
<!-- Dark background matching the site -->
<rect width="32" height="32" rx="4" fill="#1a1a2e"/>
<!-- Kata logo scaled and centered -->
<g transform="translate(-27, -2) scale(0.75)">
<path d="M70.925 25.22L58.572 37.523 46.27 25.22l2.192-2.192 10.11 10.11 10.11-10.11zm-6.575-.2l-3.188-3.188 3.188-3.188 3.188 3.188zm-4.93-2.54l3.736 3.736-3.736 3.736zm-1.694 7.422l-8.07-8.07 8.07-8.07zm1.694-16.14l3.686 3.686-3.686 3.686zm-13.15 4.682L58.572 6.143l12.353 12.303-2.192 2.192-10.16-10.11-10.11 10.11zm26.997 0L58.572 3.752 43.878 18.446l3.387 3.387-3.387 3.387 14.694 14.694L73.266 25.22l-3.337-3.387z" fill="#f15b3e"/>
</g>
</svg>


View File

@@ -4,7 +4,7 @@ As we know, we can interact with cgroups in two ways, **`cgroupfs`** and **`syst
## usage
For systemd, kata agent configures cgroups according to the following `linux.cgroupsPath` format standard provided by `runc` (`[slice]:[prefix]:[name]`). If you don't provide a valid `linux.cgroupsPath`, kata agent will treat it as `"system.slice:kata_agent:<container-id>"`.
For systemd, kata agent configures cgroups according to the following `linux.cgroupsPath` format standard provided by `runc` (`[slice]:[prefix]:[name]`). If you don't provide a valid `linux.cgroupsPath`, kata agent will treat it as `"system.slice:kata_agent:<container-id>"`.
> Here slice is a systemd slice under which the container is placed. If empty, it defaults to system.slice, except when cgroup v2 is used and rootless container is created, in which case it defaults to user.slice.
>
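As a hedged illustration, the `cgroupsPath` that ends up in a container's OCI spec can be inspected from the containerd bundle on the host (the path below is the usual containerd shim-v2 bundle location and the container ID is a placeholder):

```bash
$ sudo jq -r '.linux.cgroupsPath' \
    "/run/containerd/io.containerd.runtime.v2.task/k8s.io/<container-id>/config.json"
```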
@@ -65,7 +65,7 @@ The kata agent will translate the parameters in the `linux.resources` of `config
## Systemd Interface
`session.rs` and `system.rs` in `src/agent/rustjail/src/cgroups/systemd/interface` are automatically generated by `zbus-xmlgen`, which is an accompanying tool provided by `zbus` to generate Rust code from `D-Bus XML interface descriptions`. The specific commands to generate these two files are as follows:
`session.rs` and `system.rs` in `src/agent/rustjail/src/cgroups/systemd/interface` are automatically generated by `zbus-xmlgen`, which is an accompanying tool provided by `zbus` to generate Rust code from `D-Bus XML interface descriptions`. The specific commands to generate these two files are as follows:
```shell
// system.rs

View File

@@ -10,7 +10,7 @@ participant proxy
#Docker Exec
Docker->kata-runtime: exec
kata-runtime->virtcontainers: EnterContainer()
virtcontainers->agent: exec
virtcontainers->agent: exec
agent->virtcontainers: Process started in the container
virtcontainers->shim: start shim
shim->agent: ReadStdout()

View File

@@ -322,7 +322,7 @@ The runtime is responsible for starting the [hypervisor](#hypervisor)
and it's VM, and communicating with the [agent](#agent) using a
[ttRPC based protocol](#agent-communications-protocol) over a VSOCK
socket that provides a communications link between the VM and the
host.
host.
This protocol allows the runtime to send container management commands
to the agent. The protocol is also used to carry the standard I/O

View File

@@ -4,15 +4,15 @@ Containers typically live in their own, possibly shared, networking namespace.
At some point in a container lifecycle, container engines will set up that namespace
to add the container to a network which is isolated from the host network.
In order to setup the network for a container, container engines call into a
In order to setup the network for a container, container engines call into a
networking plugin. The network plugin will usually create a virtual
ethernet (`veth`) pair adding one end of the `veth` pair into the container
networking namespace, while the other end of the `veth` pair is added to the
ethernet (`veth`) pair adding one end of the `veth` pair into the container
networking namespace, while the other end of the `veth` pair is added to the
host networking namespace.
This is a very namespace-centric approach as many hypervisors or VM
Managers (VMMs) such as `virt-manager` cannot handle `veth`
interfaces. Typically, [`TAP`](https://www.kernel.org/doc/Documentation/networking/tuntap.txt)
interfaces. Typically, [`TAP`](https://www.kernel.org/doc/Documentation/networking/tuntap.txt)
interfaces are created for VM connectivity.
To overcome incompatibility between typical container engines expectations
@@ -22,15 +22,15 @@ interfaces with `TAP` ones using [Traffic Control](https://man7.org/linux/man-pa
![Kata Containers networking](../arch-images/network.png)
With a TC filter rules in place, a redirection is created between the container network
and the virtual machine. As an example, the network plugin may place a device,
`eth0`, in the container's network namespace, which is one end of a VETH device.
and the virtual machine. As an example, the network plugin may place a device,
`eth0`, in the container's network namespace, which is one end of a VETH device.
Kata Containers will create a tap device for the VM, `tap0_kata`,
and setup a TC redirection filter to redirect traffic from `eth0`'s ingress to `tap0_kata`'s egress,
and a second TC filter to redirect traffic from `tap0_kata`'s ingress to `eth0`'s egress.
Kata Containers maintains support for MACVTAP, which was an earlier implementation used in Kata.
With this method, Kata created a MACVTAP device to connect directly to the `eth0` device.
TC-filter is the default because it allows for simpler configuration, better CNI plugin
Kata Containers maintains support for MACVTAP, which was an earlier implementation used in Kata.
With this method, Kata created a MACVTAP device to connect directly to the `eth0` device.
TC-filter is the default because it allows for simpler configuration, better CNI plugin
compatibility, and performance on par with MACVTAP.
Kata Containers has deprecated support for bridge due to lacking performance relative to TC-filter and MACVTAP.

View File

@@ -51,7 +51,6 @@ containers started after the VM has been launched.
Users can check to see if the container uses the `devicemapper` block
device as its rootfs by calling `mount(8)` within the container. If
the `devicemapper` block device is used, the root filesystem (`/`)
will be mounted from `/dev/vda`. Users can enable direct mounting of
the underlying block device by setting the runtime
[configuration](README.md#configuration) flag `disable_block_device_use` to
`false`.
will be mounted from `/dev/vda`. Users can disable direct mounting of
the underlying block device through the runtime
[configuration](README.md#configuration).
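For example, a check along these lines (a sketch) from inside the container shows whether the rootfs comes from the block device:

```bash
$ mount | grep ' / '   # expect /dev/vda as the source when the devicemapper block device is used
```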

View File

@@ -111,20 +111,20 @@ In our case, there will be a variety of resources, and every resource has severa
- Are the "service", "message dispatcher" and "runtime handler" all part of the single Kata 3.x runtime binary?
Yes. They are components in Kata 3.x runtime. And they will be packed into one binary.
1. The service is an interface responsible for handling multiple services such as the task service, the image service, etc.
2. The message dispatcher is used to match multiple requests from the service module.
3. The runtime handler deals with operations on the sandbox and containers.
1. The service is an interface responsible for handling multiple services such as the task service, the image service, etc.
2. The message dispatcher is used to match multiple requests from the service module.
3. The runtime handler deals with operations on the sandbox and containers.
- What is the name of the Kata 3.x runtime binary?
Apparently we can't use `containerd-shim-v2-kata` because it's already used. We are facing the hardest issue of "naming" again. Any suggestions are welcomed.
Internally we use `containerd-shim-v2-rund`.
- Is the Kata 3.x design compatible with the containerd shimv2 architecture?
- Is the Kata 3.x design compatible with the containerd shimv2 architecture?
Yes. It is designed to follow the functionality of go version kata. And it implements the `containerd shim v2` interface/protocol.
- How will users migrate to the Kata 3.x architecture?
The migration plan will be provided before the Kata 3.x is merging into the main branch.
- Is `Dragonball` limited to its own built-in VMM? Can the `Dragonball` system be configured to work using an external `Dragonball` VMM/hypervisor?
@@ -134,35 +134,35 @@ In our case, there will be a variety of resources, and every resource has severa
`runD` is the `containerd-shim-v2` counterpart of `runC` and can run a pod/containers. `Dragonball` is a `microvm`/VMM that is designed to run container workloads. Instead of `microvm`/VMM, we sometimes refer to it as secure sandbox.
- QEMU, Cloud Hypervisor and Firecracker support are planned, but how that would work. Are they working in separate process?
Yes. They are unable to work as built in VMM.
- What is `upcall`?
The `upcall` is used to hotplug CPU/memory/MMIO devices, and it solves two issues.
1. avoid dependency on PCI/ACPI
2. avoid dependency on `udevd` within guest and get deterministic results for hotplug operations. So `upcall` is an alternative to ACPI based CPU/memory/device hotplug. And we may cooperate with the community to add support for ACPI based CPU/memory/device hotplug if needed.
`Dbs-upcall` is a `vsock-based` direct communication tool between VMM and guests. The server side of the `upcall` is a driver in guest kernel (kernel patches are needed for this feature) and it'll start to serve the requests once the kernel has started. And the client side is in VMM , it'll be a thread that communicates with VSOCK through `uds`. We have accomplished device hotplug / hot-unplug directly through `upcall` in order to avoid virtualization of ACPI to minimize virtual machine's overhead. And there could be many other usage through this direct communication channel. It's already open source.
https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-upcall
https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-upcall
- The URL below says the kernel patches work with 4.19, but do they also work with 5.15+ ?
Forward compatibility should be achievable, we have ported it to 5.10 based kernel.
- Are these patches platform-specific or would they work for any architecture that supports VSOCK?
It's almost platform independent, but some message related to CPU hotplug are platform dependent.
- Could the kernel driver be replaced with a userland daemon in the guest using loopback VSOCK?
We need to create device nodes for hot-added CPU/memory/devices, so it's not easy for userspace daemon to do these tasks.
- The fact that `upcall` allows communication between the VMM and the guest suggests that this architecture might be incompatible with https://github.com/confidential-containers where the VMM should have no knowledge of what happens inside the VM.
1. `TDX` doesn't support CPU/memory hotplug yet.
2. For ACPI based device hotplug, it depends on ACPI `DSDT` table, and the guest kernel will execute `ASL` code to handle during handling those hotplug event. And it should be easier to audit VSOCK based communication than ACPI `ASL` methods.
- What is the security boundary for the monolithic / "Built-in VMM" case?
It has the security boundary of virtualization. More details will be provided in next stage.

View File

@@ -1,62 +1,62 @@
# Motivation
Today, there exist a few gaps between Container Storage Interface (CSI) and virtual machine (VM) based runtimes such as Kata Containers
Today, there exist a few gaps between Container Storage Interface (CSI) and virtual machine (VM) based runtimes such as Kata Containers
that prevent them from working together smoothly.
First, it's cumbersome to use a persistent volume (PV) with Kata Containers. Today, for a PV with Filesystem volume mode, Virtio-fs
is the only way to surface it inside a Kata Container guest VM. But often mounting the filesystem (FS) within the guest operating system (OS) is
is the only way to surface it inside a Kata Container guest VM. But often mounting the filesystem (FS) within the guest operating system (OS) is
desired due to performance benefits, availability of native FS features and security benefits over the Virtio-fs mechanism.
Second, it's difficult if not impossible to resize a PV online with Kata Containers. While a PV can be expanded on the host OS,
the updated metadata needs to be propagated to the guest OS in order for the application container to use the expanded volume.
Second, it's difficult if not impossible to resize a PV online with Kata Containers. While a PV can be expanded on the host OS,
the updated metadata needs to be propagated to the guest OS in order for the application container to use the expanded volume.
Currently, there is not a way to propagate the PV metadata from the host OS to the guest OS without restarting the Pod sandbox.
# Proposed Solution
Because of the OS boundary, these features cannot be implemented in the CSI node driver plugin running on the host OS
as is normally done in the runc container. Instead, they can be done by the Kata Containers agent inside the guest OS,
but it requires the CSI driver to pass the relevant information to the Kata Containers runtime.
An ideal long term solution would be to have the `kubelet` coordinating the communication between the CSI driver and
the container runtime, as described in [KEP-2857](https://github.com/kubernetes/enhancements/pull/2893/files).
Because of the OS boundary, these features cannot be implemented in the CSI node driver plugin running on the host OS
as is normally done in the runc container. Instead, they can be done by the Kata Containers agent inside the guest OS,
but it requires the CSI driver to pass the relevant information to the Kata Containers runtime.
An ideal long term solution would be to have the `kubelet` coordinating the communication between the CSI driver and
the container runtime, as described in [KEP-2857](https://github.com/kubernetes/enhancements/pull/2893/files).
However, as the KEP is still under review, we would like to propose a short/medium term solution to unblock our use case.
The proposed solution is built on top of a previous [proposal](https://github.com/egernst/kata-containers/blob/da-proposal/docs/design/direct-assign-volume.md)
The proposed solution is built on top of a previous [proposal](https://github.com/egernst/kata-containers/blob/da-proposal/docs/design/direct-assign-volume.md)
described by Eric Ernst. The previous proposal has two gaps:
1. Writing a `csiPlugin.json` file to the volume root path introduced a security risk. A malicious user can gain unauthorized
access to a block device by writing their own `csiPlugin.json` to the above location through an ephemeral CSI plugin.
1. Writing a `csiPlugin.json` file to the volume root path introduced a security risk. A malicious user can gain unauthorized
access to a block device by writing their own `csiPlugin.json` to the above location through an ephemeral CSI plugin.
2. The proposal didn't describe how to establish a mapping between a volume and a kata sandbox, which is needed for
2. The proposal didn't describe how to establish a mapping between a volume and a kata sandbox, which is needed for
implementing CSI volume resize and volume stat collection APIs.
This document particularly focuses on how to address these two gaps.
## Assumptions and Limitations
1. The proposal assumes that a block device volume will only be used by one Pod on a node at a time, which we believe
is the most common pattern in Kata Containers use cases. It's also unsafe to have the same block device attached to more than
one Kata pod. In the context of Kubernetes, the `PersistentVolumeClaim` (PVC) needs to have the `accessMode` as `ReadWriteOncePod`.
2. More advanced Kubernetes volume features such as, `fsGroup`, `fsGroupChangePolicy`, and `subPath` are not supported.
1. The proposal assumes that a block device volume will only be used by one Pod on a node at a time, which we believe
is the most common pattern in Kata Containers use cases. It's also unsafe to have the same block device attached to more than
one Kata pod. In the context of Kubernetes, the `PersistentVolumeClaim` (PVC) needs to have the `accessMode` as `ReadWriteOncePod`.
2. More advanced Kubernetes volume features such as, `fsGroup`, `fsGroupChangePolicy`, and `subPath` are not supported.
## End User Interface
1. The user specifies a PV as a direct-assigned volume. How a PV is specified as a direct-assigned volume is left for each CSI implementation to decide.
There are a few options for reference:
1. A storage class parameter specifies whether it's a direct-assigned volume. This avoids any lookups of PVC
or Pod information from the CSI plugin (as external provisioner takes care of these). However, all PVs in the storage class with the parameter set
1. A storage class parameter specifies whether it's a direct-assigned volume. This avoids any lookups of PVC
or Pod information from the CSI plugin (as external provisioner takes care of these). However, all PVs in the storage class with the parameter set
will have host mounts skipped.
2. Use a PVC annotation. This approach requires the CSI plugins have `--extra-create-metadata` [set](https://kubernetes-csi.github.io/docs/external-provisioner.html#persistentvolumeclaim-and-persistentvolume-parameters)
to be able to perform a lookup of the PVC annotations from the API server. Pro: API server lookup of annotations only required during creation of PV.
to be able to perform a lookup of the PVC annotations from the API server. Pro: API server lookup of annotations only required during creation of PV.
Con: The CSI plugin will always skip host mounting of the PV.
3. The CSI plugin can also lookup pod `runtimeclass` during `NodePublish`. This approach can be found in the [ALIBABA CSI plugin](https://github.com/kubernetes-sigs/alibaba-cloud-csi-driver/blob/master/pkg/disk/nodeserver.go#L248).
2. The CSI node driver delegates the direct assigned volume to the Kata Containers runtime. The CSI node driver APIs need to
2. The CSI node driver delegates the direct assigned volume to the Kata Containers runtime. The CSI node driver APIs need to
be modified to pass the volume mount information and collect volume information to/from the Kata Containers runtime by invoking `kata-runtime` command line commands.
* **`NodePublishVolume`** -- It invokes `kata-runtime direct-volume add --volume-path [volumePath] --mount-info [mountInfo]`
* **NodePublishVolume** -- It invokes `kata-runtime direct-volume add --volume-path [volumePath] --mount-info [mountInfo]`
to propagate the volume mount information to the Kata Containers runtime for it to carry out the filesystem mount operation.
The `volumePath` is the [target_path](https://github.com/container-storage-interface/spec/blob/master/csi.proto#L1364) in the CSI `NodePublishVolumeRequest`.
The `mountInfo` is a serialized JSON string.
* **`NodeGetVolumeStats`** -- It invokes `kata-runtime direct-volume stats --volume-path [volumePath]` to retrieve the filesystem stats of direct-assigned volume.
* **`NodeExpandVolume`** -- It invokes `kata-runtime direct-volume resize --volume-path [volumePath] --size [size]` to send a resize request to the Kata Containers runtime to
The `mountInfo` is a serialized JSON string.
* **NodeGetVolumeStats** -- It invokes `kata-runtime direct-volume stats --volume-path [volumePath]` to retrieve the filesystem stats of direct-assigned volume.
* **NodeExpandVolume** -- It invokes `kata-runtime direct-volume resize --volume-path [volumePath] --size [size]` to send a resize request to the Kata Containers runtime to
resize the direct-assigned volume.
* **`NodeStageVolume/NodeUnStageVolume`** -- It invokes `kata-runtime direct-volume remove --volume-path [volumePath]` to remove the persisted metadata of a direct-assigned volume.
* **NodeStageVolume/NodeUnStageVolume** -- It invokes `kata-runtime direct-volume remove --volume-path [volumePath]` to remove the persisted metadata of a direct-assigned volume.
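Putting those sub-commands together, a hedged sketch of the calls a CSI node driver would make looks like this (volume path, device and size are placeholders):

```bash
$ kata-runtime direct-volume add --volume-path "/kubelet/pvc-123/mount" \
      --mount-info "{\"Device\": \"/dev/sdf\", \"fstype\": \"ext4\"}"
$ kata-runtime direct-volume stats  --volume-path "/kubelet/pvc-123/mount"
$ kata-runtime direct-volume resize --volume-path "/kubelet/pvc-123/mount" --size 21474836480
$ kata-runtime direct-volume remove --volume-path "/kubelet/pvc-123/mount"
```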
The `mountInfo` object is defined as follows:
```Golang
@@ -78,17 +78,17 @@ Notes: given that the `mountInfo` is persisted to the disk by the Kata runtime,
## Implementation Details
### Kata runtime
Instead of the CSI node driver writing the mount info into a `csiPlugin.json` file under the volume root,
as described in the original proposal, here we propose that the CSI node driver passes the mount information to
the Kata Containers runtime through a new `kata-runtime` commandline command. The `kata-runtime` then writes the mount
Instead of the CSI node driver writing the mount info into a `csiPlugin.json` file under the volume root,
as described in the original proposal, here we propose that the CSI node driver passes the mount information to
the Kata Containers runtime through a new `kata-runtime` commandline command. The `kata-runtime` then writes the mount
information to a `mountInfo.json` file in a predefined location (`/run/kata-containers/shared/direct-volumes/[volume_path]/`).
When the Kata Containers runtime starts a container, it verifies whether a volume mount is a direct-assigned volume by checking
whether there is a `mountInfo` file under the computed Kata `direct-volumes` directory. If it is, the runtime parses the `mountInfo` file,
When the Kata Containers runtime starts a container, it verifies whether a volume mount is a direct-assigned volume by checking
whether there is a `mountInfo` file under the computed Kata `direct-volumes` directory. If it is, the runtime parses the `mountInfo` file,
updates the mount spec with the data in `mountInfo`. The updated mount spec is then passed to the Kata agent in the guest VM together
with other mounts. The Kata Containers runtime also creates a file named by the sandbox id under the `direct-volumes/[volume_path]/`
directory. The reason for adding a sandbox id file is to establish a mapping between the volume and the sandbox using it.
Later, when the Kata Containers runtime handles the `get-stats` and `resize` commands, it uses the sandbox id to identify
with other mounts. The Kata Containers runtime also creates a file named by the sandbox id under the `direct-volumes/[volume_path]/`
directory. The reason for adding a sandbox id file is to establish a mapping between the volume and the sandbox using it.
Later, when the Kata Containers runtime handles the `get-stats` and `resize` commands, it uses the sandbox id to identify
the endpoint of the corresponding `containerd-shim-kata-v2`.
### containerd-shim-kata-v2 changes
@@ -101,12 +101,12 @@ $ curl --unix-socket "$shim_socket_path" -I -X GET 'http://localhost/direct-volu
$ curl --unix-socket "$shim_socket_path" -I -X POST 'http://localhost/direct-volume/resize' -d '{ "volumePath"": [volumePath], "Size": "123123" }'
```
The shim then forwards the corresponding request to the `kata-agent` to carry out the operations inside the guest VM. For `resize` operation,
the Kata runtime also needs to notify the hypervisor to resize the block device (e.g. call `block_resize` in QEMU).
The shim then forwards the corresponding request to the `kata-agent` to carry out the operations inside the guest VM. For `resize` operation,
the Kata runtime also needs to notify the hypervisor to resize the block device (e.g. call `block_resize` in QEMU).
### Kata agent changes
The mount spec of a direct-assigned volume is passed to `kata-agent` through the existing `Storage` GRPC object.
The mount spec of a direct-assigned volume is passed to `kata-agent` through the existing `Storage` GRPC object.
Two new APIs and three new GRPC objects are added to GRPC protocol between the shim and agent for resizing and getting volume stats:
```protobuf
@@ -226,7 +226,7 @@ Lets assume that changes have been made in the `aws-ebs-csi-driver` node driv
1. In the node CSI driver, the `NodePublishVolume` API invokes: `kata-runtime direct-volume add --volume-path "/kubelet/a/b/c/d/sdf" --mount-info "{\"Device\": \"/dev/sdf\", \"fstype\": \"ext4\"}"`.
2. The `Kata-runtime` writes the mount-info JSON to a file called `mountInfo.json` under `/run/kata-containers/shared/direct-volumes/kubelet/a/b/c/d/sdf`.
**Node `unstage` volume**
**Node unstage volume**
1. In the node CSI driver, the `NodeUnstageVolume` API invokes: `kata-runtime direct-volume remove --volume-path "/kubelet/a/b/c/d/sdf"`.
2. Kata-runtime deletes the directory `/run/kata-containers/shared/direct-volumes/kubelet/a/b/c/d/sdf`.

View File

@@ -59,5 +59,5 @@ The table below summarized when and where those different hooks will be executed
+ `Hook Path` specifies where hook's path be resolved.
+ `Exec Place` specifies in which namespace those hooks can be executed.
+ For `CreateContainer` Hooks, OCI requires to run them inside the container namespace while the hook executable path is in the host runtime, which is a non-starter for VM-based containers. So we design to keep them running in the *host vmm namespace.*
+ `Exec Time` specifies at which time point those hooks can be executed.
+ For `CreateContainer` Hooks, OCI requires to run them inside the container namespace while the hook executable path is in the host runtime, which is a non-starter for VM-based containers. So we design to keep them running in the *host vmm namespace.*
+ `Exec Time` specifies at which time point those hooks can be executed.

View File

@@ -118,7 +118,7 @@ all vCPU and I/O related threads) will be created in the `/kata_<PodSandboxID>`
### Why create a kata-cgroup under the parent cgroup?
And why not directly adding the per sandbox shim directly to the pod cgroup (e.g.
And why not directly adding the per sandbox shim directly to the pod cgroup (e.g.
`/kubepods` in the Kubernetes context)?
The Kata Containers shim implementation creates a per-sandbox cgroup
@@ -219,13 +219,13 @@ the `/kubepods` cgroup hierarchy, and a `/<PodSandboxID>` under the `/kata_overh
On a typical cgroup v1 hierarchy mounted under `/sys/fs/cgroup/`, for a pod which sandbox
ID is `12345678`, create with `sandbox_cgroup_only` disabled, the 2 memory subsystems
for the sandbox cgroup and the overhead cgroup would respectively live under
for the sandbox cgroup and the overhead cgroup would respectively live under
`/sys/fs/cgroup/memory/kubepods/kata_12345678` and `/sys/fs/cgroup/memory/kata_overhead/12345678`.
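For instance, on such a host the two hierarchies can be listed directly (a sketch using the example sandbox ID above):

```bash
$ ls -d /sys/fs/cgroup/memory/kubepods/kata_12345678 \
        /sys/fs/cgroup/memory/kata_overhead/12345678
```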
Unlike when `sandbox_cgroup_only` is enabled, the Kata Containers shim will move itself
to the overhead cgroup first, and then move the vCPU threads to the sandbox cgroup as
they're created. All Kata processes and threads will run under the overhead cgroup except for
the vCPU threads.
the vCPU threads.
With `sandbox_cgroup_only` disabled, Kata Containers assumes the pod cgroup is only sized
to accommodate for the actual container workloads processes. For Kata, this maps
@@ -247,7 +247,7 @@ cgroup size and constraints accordingly.
# Supported cgroups
Kata Containers currently supports cgroups `v1` and `v2`.
Kata Containers currently supports cgroups `v1` and `v2`.
In the following sections each cgroup is described briefly.

View File

@@ -119,17 +119,17 @@ The metrics service also doesn't hold any metrics in memory.
*Metrics size*: response size of one Prometheus scrape request.
It's easy to estimate the size of one metrics fetch request issued by Prometheus.
The formula to calculate the expected size when no gzip compression is in place is:
The formula to calculate the expected size when no gzip compression is in place is:
9 + (144 - 9) * `number of kata sandboxes`
Prometheus supports `gzip compression`. When enabled, the response size of each request will be smaller:
Prometheus supports `gzip compression`. When enabled, the response size of each request will be smaller:
2 + (10 - 2) * `number of kata sandboxes`
**Example**
We have 10 sandboxes running on a node. The expected size of one metrics fetch request issued by Prometheus against the kata-monitor agent running on that node will be:
**Example**
We have 10 sandboxes running on a node. The expected size of one metrics fetch request issued by Prometheus against the kata-monitor agent running on that node will be:
9 + (144 - 9) * 10 = **1.35M**
If `gzip compression` is enabled:
If `gzip compression` is enabled:
2 + (10 - 2) * 10 = **82K**
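The arithmetic behind those two numbers (assuming the constants are expressed in KB, as the results suggest) is simply:

```bash
$ echo $(( 9 + (144 - 9) * 10 ))   # 1359 KB, roughly 1.35M
$ echo $(( 2 + (10 - 2) * 10 ))    # 82 KB
```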
#### Metrics delay ####

View File

@@ -71,7 +71,7 @@ The Kata Containers runtime **MUST** support scalable I/O through the SRIOV tech
### Virtualization overhead reduction
A compelling aspect of containers is their minimal overhead compared to bare metal applications.
A container runtime should keep the overhead to a minimum in order to provide the expected user
experience.
experience.
The Kata Containers runtime implementation **SHOULD** be optimized for:
* Minimal workload boot and shutdown times

View File

@@ -5,7 +5,7 @@ To safeguard the integrity of container images and prevent tampering from the ho
## Introduction to remote snapshot
Containerd 1.7 introduced `remote snapshotter` feature which is the foundation for pulling images in the guest for Confidential Containers.
While it's beyond the scope of this document to fully explain how the container rootfs is created to the point it can be executed, a fundamental grasp of the snapshot concept is essential. Putting it in a simple way, containerd fetches the image layers from an OCI registry into its local content storage. However, they cannot be mounted as is (e.g. the layer can be tar+gzip compressed) as well as they should be immutable so the content can be shared among containers. Thus containerd leverages snapshots of those layers to build the container's rootfs.
While it's beyond the scope of this document to fully explain how the container rootfs is created to the point it can be executed, a fundamental grasp of the snapshot concept is essential. Putting it in a simple way, containerd fetches the image layers from an OCI registry into its local content storage. However, they cannot be mounted as is (e.g. the layer can be tar+gzip compressed) as well as they should be immutable so the content can be shared among containers. Thus containerd leverages snapshots of those layers to build the container's rootfs.
The role of `remote snapshotter` is to reuse snapshots that are stored in a remotely shared place, thus enabling containerd to prepare the containers rootfs in a manner similar to that of a local `snapshotter`. The key behavior that makes this the building block of Kata's guest image management for Confidential Containers is that containerd will not pull the image layers from registry, instead it assumes that `remote snapshotter` and/or an external entity will perform that operation on his behalf.
@@ -48,7 +48,7 @@ Pull the container image directly from the guest VM using `nydus snapshotter` ba
#### Architecture
The following diagram provides an overview of the architecture for pulling image in the guest with key components.
The following diagram provides an overview of the architecture for pulling image in the guest with key components.
```mermaid
flowchart LR
Kubelet[kubelet]--> |1\. Pull image request & metadata|Containerd
@@ -129,7 +129,7 @@ Next the `handleImageGuestPullBlockVolume()` is called to build the Storage obje
Below is an example of storage information packaged in the message sent to the kata-agent:
```json
"driver": "image_guest_pull",
"driver": "image_guest_pull",
"driver_options": [
"image_guest_pull"{
"metadata":{
@@ -145,15 +145,15 @@ Below is an example of storage information packaged in the message sent to the k
"io.kubernetes.cri.sandbox-uid": "de7c6a0c-79c0-44dc-a099-69bb39f180af",
}
}
],
"source": "quay.io/kata-containers/confidential-containers:unsigned",
"fstype": "overlay",
"options": [],
],
"source": "quay.io/kata-containers/confidential-containers:unsigned",
"fstype": "overlay",
"options": [],
"mount_point": "/run/kata-containers/cb0b47276ea66ee9f44cc53afa94d7980b57a52c3f306f68cb034e58d9fbd3c6/rootfs",
```
Next, the kata-agent's RPC module will handle the create container request which, among other things, involves adding storages to the sandbox. The storage module contains implementations of `StorageHandler` interface for various storage types, being the `ImagePullHandler` in charge of handling the storage object for the container image (the storage manager instantiates the handler based on the value of the "driver").
`ImagePullHandler` delegates the image pulling operation to the `confidential_data_hub.pull_image()` that is going to create the image's bundle directory on the guest filesystem and, in turn, the `ImagePullService` of Confidential Data Hub to fetch, uncompress and mount the image's rootfs.
`ImagePullHandler` delegates the image pulling operation to the `confidential_data_hub.pull_image()` that is going to create the image's bundle directory on the guest filesystem and, in turn, the `ImagePullService` of Confidential Data Hub to fetch, uncompress and mount the image's rootfs.
> **Notes:**
> In this flow, `confidential_data_hub.pull_image()` parses the image metadata, looking for either the `io.kubernetes.cri.container-type: sandbox` or `io.kubernetes.cri-o.ContainerType: sandbox` (CRI-IO case) annotation, then it never calls the `pull_image()` RPC of Confidential Data Hub because the pause image is expected to already be inside the guest's filesystem, so instead `confidential_data_hub.unpack_pause_image()` is called.

View File

@@ -1,6 +1,6 @@
# Virtual machine vCPU sizing in Kata Containers 3.0
> Preview:
> Preview:
> [Kubernetes(since 1.23)][1] and [Containerd(since 1.6.0-beta4)][2] will help calculate `Sandbox Size` info and pass it to Kata Containers through annotations.
> In order to adapt to this beneficial change and be compatible with the past, we have implemented the new vCPUs handling way in `runtime-rs`, which is slightly different from the original `runtime-go`'s design.
@@ -20,7 +20,7 @@ Our understanding and priority of these resources are as follows, which will aff
* `default_vcpus`: default number of vCPUs when starting a VM.
* `default_maxvcpus`: maximum number of vCPUs.
* From `Annotation`:
* `InitialSize`: we call the size of the resource passed from the annotations as `InitialSize`. Kubernetes will calculate the sandbox size according to the Pod's statement, which is the `InitialSize` here. This size should be the size we want to prioritize.
* `InitialSize`: we call the size of the resource passed from the annotations as `InitialSize`. Kubernetes will calculate the sandbox size according to the Pod's statement, which is the `InitialSize` here. This size should be the size we want to prioritize.
* From `Container Spec`:
* The amount of CPU resources that the Container wants to use will be declared through the spec. Including the aforementioned annotations, we mainly consider `cpu quota` and `cpuset` when calculating the number of vCPUs.
* `cpu quota`: `cpu quota` is the most common way to declare the amount of CPU resources. The number of vCPUs introduced by `cpu quota` declared in a container's spec is: `vCPUs = ceiling( quota / period )`.
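As a worked example of that last formula (values are illustrative), a container declaring a quota of 250000 with the default period of 100000 contributes three vCPUs:

```bash
$ quota=250000; period=100000
$ echo $(( (quota + period - 1) / period ))   # ceiling(250000 / 100000) = 3
```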

View File

@@ -1,9 +1,9 @@
# Design Doc for Kata Containers VCPUs Pinning Feature
# Design Doc for Kata Containers' VCPUs Pinning Feature
## Background
Currently, vCPU threads of Kata Containers are scheduled randomly to CPUs. And each pod would request a specific set of CPUs, which we call the CPU set (the CPU set meaning used in Linux cgroups).
Currently, vCPU threads of Kata Containers are scheduled randomly to CPUs. And each pod would request a specific set of CPUs, which we call the CPU set (the CPU set meaning used in Linux cgroups).
If the number of vCPU threads is equal to that of CPUs claimed in the CPU set, we can then pin each vCPU thread to one specified CPU, to reduce the cost of random scheduling.
If the number of vCPU threads is equal to that of CPUs claimed in the CPU set, we can then pin each vCPU thread to one specified CPU, to reduce the cost of random scheduling.
## Detailed Design
@@ -20,7 +20,7 @@ Two ways are provided to use this vCPU thread pinning feature: through `QEMU` co
### When is VCPUs Pinning Checked?
As shown in Section 1, when `num(vCPU threads) == num(CPUs in CPU set)`, we shall pin each vCPU thread to a specified CPU. And when this condition is broken, we should restore to the original random scheduling pattern.
As shown in Section 1, when `num(vCPU threads) == num(CPUs in CPU set)`, we shall pin each vCPU thread to a specified CPU. And when this condition is broken, we should restore to the original random scheduling pattern.
So when may `num(CPUs in CPU set)` change? There are 5 possible scenes:
| Possible scenes | Related Code |
@@ -34,4 +34,4 @@ So when may `num(CPUs in CPU set)` change? There are 5 possible scenes:
### Core Pinning Logics
We can split the whole process into the following steps. Related methods are `checkVCPUsPinning` and `resetVCPUsPinning`, in file Sandbox.go.
![](arch-images/vcpus-pinning-process.png)
![](arch-images/vcpus-pinning-process.png)

View File

@@ -49,4 +49,4 @@
- [How to use the Kata Agent Policy](how-to-use-the-kata-agent-policy.md)
- [How to pull images in the guest](how-to-pull-images-in-guest-with-kata.md)
- [How to use mem-agent to decrease the memory usage of Kata container](how-to-use-memory-agent.md)
- [How to use seccomp with runtime-rs](how-to-use-seccomp-with-runtime-rs.md)
- [How to use seccomp with runtime-rs](how-to-use-seccomp-with-runtime-rs.md)

View File

@@ -1,24 +1,24 @@
# How to use Kata Containers and Containerd
This document covers the installation and configuration of [containerd](https://containerd.io/)
This document covers the installation and configuration of [containerd](https://containerd.io/)
and [Kata Containers](https://katacontainers.io). The containerd provides not only the `ctr`
command line tool, but also the [CRI](https://kubernetes.io/blog/2016/12/container-runtime-interface-cri-in-kubernetes/)
command line tool, but also the [CRI](https://kubernetes.io/blog/2016/12/container-runtime-interface-cri-in-kubernetes/)
interface for [Kubernetes](https://kubernetes.io) and other CRI clients.
This document is primarily written for Kata Containers v1.5.0-rc2 or above, and containerd v1.2.0 or above.
This document is primarily written for Kata Containers v1.5.0-rc2 or above, and containerd v1.2.0 or above.
Previous versions are addressed here, but we suggest users upgrade to the newer versions for better support.
## Concepts
### Kubernetes `RuntimeClass`
[`RuntimeClass`](https://kubernetes.io/docs/concepts/containers/runtime-class/) is a Kubernetes feature first
introduced in Kubernetes 1.12 as alpha. It is the feature for selecting the container runtime configuration to
[`RuntimeClass`](https://kubernetes.io/docs/concepts/containers/runtime-class/) is a Kubernetes feature first
introduced in Kubernetes 1.12 as alpha. It is the feature for selecting the container runtime configuration to
use to run a pods containers. This feature is supported in `containerd` since [v1.2.0](https://github.com/containerd/containerd/releases/tag/v1.2.0).
Before the `RuntimeClass` was introduced, Kubernetes was not aware of the difference of runtimes on the node. `kubelet`
creates Pod sandboxes and containers through CRI implementations, and treats all the Pods equally. However, there
are requirements to run trusted Pods (i.e. Kubernetes plugin) in a native container like runc, and to run untrusted
are requirements to run trusted Pods (i.e. Kubernetes plugin) in a native container like runc, and to run untrusted
workloads with isolated sandboxes (i.e. Kata Containers).
As a result, the CRI implementations extended their semantics for the requirements:
@@ -32,17 +32,17 @@ As a result, the CRI implementations extended their semantics for the requiremen
```
- Similarly, CRI-O introduced the annotation `io.kubernetes.cri-o.TrustedSandbox` for untrusted Pods.
To eliminate the complexity of user configuration introduced by the non-standardized annotations and provide
extensibility, `RuntimeClass` was introduced. This gives users the ability to affect the runtime behavior
through `RuntimeClass` without the knowledge of the CRI daemons. We suggest that users with multiple runtimes
To eliminate the complexity of user configuration introduced by the non-standardized annotations and provide
extensibility, `RuntimeClass` was introduced. This gives users the ability to affect the runtime behavior
through `RuntimeClass` without the knowledge of the CRI daemons. We suggest that users with multiple runtimes
use `RuntimeClass` instead of the deprecated annotations.
### Containerd Runtime V2 API: Shim V2 API
The [`containerd-shim-kata-v2` (short as `shimv2` in this documentation)](../../src/runtime/cmd/containerd-shim-kata-v2/)
implements the [Containerd Runtime V2 (Shim API)](https://github.com/containerd/containerd/tree/main/core/runtime/v2) for Kata.
With `shimv2`, Kubernetes can launch Pod and OCI-compatible containers with one shim per Pod. Prior to `shimv2`, `2N+1`
shims (i.e. a `containerd-shim` and a `kata-shim` for each container and the Pod sandbox itself) and no standalone `kata-proxy`
With `shimv2`, Kubernetes can launch Pod and OCI-compatible containers with one shim per Pod. Prior to `shimv2`, `2N+1`
shims (i.e. a `containerd-shim` and a `kata-shim` for each container and the Pod sandbox itself) and no standalone `kata-proxy`
process were used, even with VSOCK not available.
![Kubernetes integration with shimv2](../design/arch-images/shimv2.svg)
@@ -87,7 +87,7 @@ $ popd
### Install `cri-tools`
> **Note:** `cri-tools` is a set of tools for CRI used for development and testing. Users who only want
> **Note:** `cri-tools` is a set of tools for CRI used for development and testing. Users who only want
> to use containerd with Kubernetes can skip the `cri-tools`.
You can install the `cri-tools` from source code:
@@ -104,7 +104,7 @@ $ popd
### Configure containerd to use Kata Containers
By default, the configuration of containerd is located at `/etc/containerd/config.toml`, and the
By default, the configuration of containerd is located at `/etc/containerd/config.toml`, and the
`cri` plugins are placed in the following section:
```toml
@@ -123,7 +123,7 @@ The following sections outline how to add Kata Containers to the configurations.
#### Kata Containers as a `RuntimeClass`
For
For
- Kata Containers v1.5.0 or above (including `1.5.0-rc`)
- Containerd v1.2.0 or above
- Kubernetes v1.12.0 or above
@@ -132,8 +132,8 @@ The `RuntimeClass` is suggested.
The following configuration includes two runtime classes:
- `plugins.cri.containerd.runtimes.runc`: the runc, and it is the default runtime.
- `plugins.cri.containerd.runtimes.kata`: The function in containerd (reference [the document here](https://github.com/containerd/containerd/tree/main/core/runtime/v2))
where the dot-connected string `io.containerd.kata.v2` is translated to `containerd-shim-kata-v2` (i.e. the
- `plugins.cri.containerd.runtimes.kata`: The function in containerd (reference [the document here](https://github.com/containerd/containerd/tree/main/core/runtime/v2))
where the dot-connected string `io.containerd.kata.v2` is translated to `containerd-shim-kata-v2` (i.e. the
binary name of the Kata implementation of [Containerd Runtime V2 (Shim API)](https://github.com/containerd/containerd/tree/main/core/runtime/v2)).
```toml
@@ -168,9 +168,9 @@ This `ConfigPath` option is optional. If you do not specify it, shimv2 first tri
#### Kata Containers as the runtime for untrusted workload
For cases without `RuntimeClass` support, we can use the legacy annotation method to support using Kata Containers
for an untrusted workload. With the following configuration, you can run trusted workloads with a runtime such as `runc`
and then, run an untrusted workload with Kata Containers:
For cases without `RuntimeClass` support, we can use the legacy annotation method to support using Kata Containers
for an untrusted workload. With the following configuration, you can run trusted workloads with a runtime such as `runc`
and then, run an untrusted workload with Kata Containers:
```toml
[plugins.cri.containerd]
@@ -201,9 +201,9 @@ If you want to set Kata Containers as the only runtime in the deployment, you ca
> **Note:** If you skipped the [Install `cri-tools`](#install-cri-tools) section, you can skip this section too.
First, add the CNI configuration in the containerd configuration.
First, add the CNI configuration in the containerd configuration.
The following is the configuration if you installed CNI as the *[Install CNI plugins](#install-cni-plugins)* section outlined.
The following is the configuration if you installed CNI as the *[Install CNI plugins](#install-cni-plugins)* section outlined.
Put the CNI configuration as `/etc/cni/net.d/10-mynet.conf`:
@@ -324,7 +324,7 @@ $ sudo crictl start 1aab7585530e6
1aab7585530e6
```
In Kubernetes, you need to create a `RuntimeClass` resource and add the `RuntimeClass` field in the Pod Spec
In Kubernetes, you need to create a `RuntimeClass` resource and add the `RuntimeClass` field in the Pod Spec
(see this [document](https://kubernetes.io/docs/concepts/containers/runtime-class/) for more information).
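A minimal sketch of that wiring (names are illustrative and assume the Kata handler configured above is registered as `kata`):

```bash
$ cat <<EOF | kubectl apply -f -
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: kata
handler: kata
---
apiVersion: v1
kind: Pod
metadata:
  name: busybox-kata
spec:
  runtimeClassName: kata
  containers:
  - name: busybox
    image: busybox
    command: ["sleep", "3600"]
EOF
```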
If `RuntimeClass` is not supported, you can use the following annotation in a Kubernetes pod to identify as an untrusted workload:

View File

@@ -3358,4 +3358,4 @@
"title": "Kata containers",
"uid": "75pdqURGk",
"version": 1
}
}

View File

@@ -27,7 +27,7 @@ spec:
containers:
- name: kata-monitor
image: quay.io/kata-containers/kata-monitor:2.0.0
args:
args:
- -log-level=debug
ports:
- containerPort: 8090

View File

@@ -79,7 +79,7 @@ metadata:
spec:
replicas: 1
selector:
matchLabels:
matchLabels:
app: prometheus
template:
metadata:

View File

@@ -16,7 +16,7 @@ To pull images in the guest, we need to do the following steps:
### Delete images used for pulling in the guest
Though the `CRI Runtime Specific Snapshotter` is still an [experimental feature](https://github.com/containerd/containerd/blob/main/RELEASES.md#experimental-features) in containerd, containerd does not support managing the same image with different `snapshotters` (the default `snapshotter` in containerd is `overlayfs`). To avoid errors caused by this, it is recommended to delete any images (including the pause image) in containerd that need to be pulled in the guest later, before configuring `nydus snapshotter` in containerd.
Though the `CRI Runtime Specific Snapshotter` is still an [experimental feature](https://github.com/containerd/containerd/blob/main/RELEASES.md#experimental-features) in containerd, containerd does not support managing the same image with different `snapshotters` (the default `snapshotter` in containerd is `overlayfs`). To avoid errors caused by this, it is recommended to delete any images (including the pause image) in containerd that need to be pulled in the guest later, before configuring `nydus snapshotter` in containerd.
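A hedged example of such a cleanup (the image names are placeholders; `crictl` and `ctr` must point at this containerd instance):

```bash
$ sudo crictl rmi registry.k8s.io/pause:3.9
$ sudo ctr -n k8s.io images rm docker.io/library/busybox:latest
```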
### Install `nydus snapshotter`
@@ -24,7 +24,7 @@ Though the `CRI Runtime Specific Snapshotter` is still an [experimental feature]
To use DaemonSet to install `nydus snapshotter`, we need to ensure that `yq` exists in the host.
1. Download `nydus snapshotter` repo
1. Download `nydus snapshotter` repo
```bash
$ nydus_snapshotter_install_dir="/tmp/nydus-snapshotter"
$ nydus_snapshotter_url=https://github.com/containerd/nydus-snapshotter
@@ -42,7 +42,7 @@ $ yq -i \
$ yq -i \
> 'data.ENABLE_CONFIG_FROM_VOLUME = "false"' -P \
> misc/snapshotter/base/nydus-snapshotter.yaml
# Enable to run snapshotter as a systemd service
# Enable to run snapshotter as a systemd service
# (skip if you want to run nydus snapshotter as a standalone process)
$ yq -i \
> 'data.ENABLE_SYSTEMD_SERVICE = "true"' -P \
@@ -79,7 +79,7 @@ Created symlink /etc/systemd/system/multi-user.target.wants/nydus-snapshotter.se
#### Install `nydus snapshotter` manually
1. Download `nydus snapshotter` binary from release
1. Download `nydus snapshotter` binary from release
```bash
$ ARCH=$(uname -m)
$ golang_arch=$(case "$ARCH" in
@@ -111,7 +111,7 @@ level=info msg="Run daemons monitor..."
Configure `nydus snapshotter` to enable `CRI Runtime Specific Snapshotter` in containerd. This ensures run kata containers with `nydus snapshotter`. Below, the steps are illustrated using `kata-qemu` as an example.
```toml
# Modify containerd configuration to ensure that the following lines appear in the containerd configuration
# Modify containerd configuration to ensure that the following lines appear in the containerd configuration
# (Assume that the containerd config is located in /etc/containerd/config.toml)
[plugins."io.containerd.grpc.v1.cri".containerd]
@@ -124,7 +124,7 @@ Configure `nydus snapshotter` to enable `CRI Runtime Specific Snapshotter` in co
snapshotter = "nydus"
```
> **Notes:**
> **Notes:**
> The `CRI Runtime Specific Snapshotter` feature only works for containerd v1.7.0 and above. So for Containerd v1.7.0 below, in addition to the above settings, we need to set the global `snapshotter` to `nydus` in containerd config. For example:
```toml
@@ -256,7 +256,7 @@ spec:
values:
- NODE_NAME
volumes:
- name: trusted-image-storage
- name: trusted-storage
persistentVolumeClaim:
claimName: trusted-pvc
containers:
@@ -280,7 +280,7 @@ quay.io/confidential-containers/test-images largeimage
```bash
$ lsblk --fs
NAME FSTYPE LABEL UUID FSAVAIL FSUSE% MOUNTPOINT
sda
sda
└─encrypted_disk_GsLDt
178M 87% /run/kata-containers/image
@@ -309,4 +309,4 @@ $ free -m
total used free shared buff/cache available
Mem: 1989 52 43 0 1893 1904
Swap: 0 0 0
```
```

View File

@@ -10,19 +10,19 @@ Currently, there is no widely applicable and convenient method available for use
## Solution
According to the proposal, it requires using the `kata-ctl direct-volume` command to add a direct assigned block volume device to the Kata Containers runtime.
According to the proposal, it requires using the `kata-ctl direct-volume` command to add a direct assigned block volume device to the Kata Containers runtime.
And then with the help of method [get_volume_mount_info](https://github.com/kata-containers/kata-containers/blob/099b4b0d0e3db31b9054e7240715f0d7f51f9a1c/src/libs/kata-types/src/mount.rs#L95), get information from JSON file: `(mountinfo.json)` and parse them into structure [Direct Volume Info](https://github.com/kata-containers/kata-containers/blob/099b4b0d0e3db31b9054e7240715f0d7f51f9a1c/src/libs/kata-types/src/mount.rs#L70) which is used to save device-related information.
And then with the help of method [get_volume_mount_info](https://github.com/kata-containers/kata-containers/blob/099b4b0d0e3db31b9054e7240715f0d7f51f9a1c/src/libs/kata-types/src/mount.rs#L95), get information from JSON file: `(mountinfo.json)` and parse them into structure [Direct Volume Info](https://github.com/kata-containers/kata-containers/blob/099b4b0d0e3db31b9054e7240715f0d7f51f9a1c/src/libs/kata-types/src/mount.rs#L70) which is used to save device-related information.
We only fill the `mountinfo.json`, such as `device` ,`volume_type`, `fs_type`, `metadata` and `options`, which correspond to the fields in [Direct Volume Info](https://github.com/kata-containers/kata-containers/blob/099b4b0d0e3db31b9054e7240715f0d7f51f9a1c/src/libs/kata-types/src/mount.rs#L70), to describe a device.
We only fill the `mountinfo.json`, such as `device` ,`volume_type`, `fs_type`, `metadata` and `options`, which correspond to the fields in [Direct Volume Info](https://github.com/kata-containers/kata-containers/blob/099b4b0d0e3db31b9054e7240715f0d7f51f9a1c/src/libs/kata-types/src/mount.rs#L70), to describe a device.
The JSON file `mountinfo.json` is placed in a sub-path `/kubelet/kata-test-vol-001/volume001` which is under the fixed path `/run/kata-containers/shared/direct-volumes/`.
And the full path looks like: `/run/kata-containers/shared/direct-volumes/kubelet/kata-test-vol-001/volume001`, but for security reasons it is
The JSON file `mountinfo.json` is placed in a sub-path `/kubelet/kata-test-vol-001/volume001` which is under the fixed path `/run/kata-containers/shared/direct-volumes/`.
And the full path looks like: `/run/kata-containers/shared/direct-volumes/kubelet/kata-test-vol-001/volume001`, but for security reasons it is
encoded as `/run/kata-containers/shared/direct-volumes/L2t1YmVsZXQva2F0YS10ZXN0LXZvbC0wMDEvdm9sdW1lMDAx`.
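That directory name appears to be the base64 encoding of the volume sub-path, so it can be reproduced (or reversed) by hand:

```bash
$ echo -n "/kubelet/kata-test-vol-001/volume001" | base64
L2t1YmVsZXQva2F0YS10ZXN0LXZvbC0wMDEvdm9sdW1lMDAx
```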
Finally, when running a Kata Containers with `ctr run --mount type=X, src=Y, dst=Z,,options=rbind:rw`, the `type=X` should be specified a proprietary type specifically designed for some kind of volume.
Finally, when running a Kata Containers with `ctr run --mount type=X, src=Y, dst=Z,,options=rbind:rw`, the `type=X` should be specified a proprietary type specifically designed for some kind of volume.
Now, supported types:
Now, supported types:
- `directvol` for direct volume
- `vfiovol` for VFIO device based volume
@@ -46,10 +46,10 @@ $ sudo mkfs.ext4 /tmp/stor/rawdisk01.20g
```json
{
"device": "/tmp/stor/rawdisk01.20g",
"volume_type": "directvol",
"fs_type": "ext4",
"metadata":"{}",
"device": "/tmp/stor/rawdisk01.20g",
"volume_type": "directvol",
"fs_type": "ext4",
"metadata":"{}",
"options": []
}
```
@@ -57,7 +57,7 @@ $ sudo mkfs.ext4 /tmp/stor/rawdisk01.20g
```bash
$ sudo kata-ctl direct-volume add /kubelet/kata-direct-vol-002/directvol002 "{\"device\": \"/tmp/stor/rawdisk01.20g\", \"volume_type\": \"directvol\", \"fs_type\": \"ext4\", \"metadata\":"{}", \"options\": []}"
$# /kubelet/kata-direct-vol-002/directvol002 <==> /run/kata-containers/shared/direct-volumes/W1lMa2F0ZXQva2F0YS10a2F0DAxvbC0wMDEvdm9sdW1lMDAx
$ cat W1lMa2F0ZXQva2F0YS10a2F0DAxvbC0wMDEvdm9sdW1lMDAx/mountInfo.json
$ cat W1lMa2F0ZXQva2F0YS10a2F0DAxvbC0wMDEvdm9sdW1lMDAx/mountInfo.json
{"volume_type":"directvol","device":"/tmp/stor/rawdisk01.20g","fs_type":"ext4","metadata":{},"options":[]}
```
@@ -79,8 +79,8 @@ In this scenario, the device's host kernel driver will be replaced by `vfio-pci`
And either device's BDF or its VFIO IOMMU group ID in `/dev/vfio/` is fine for "device" in `mountinfo.json`.
```bash
$ lspci -nn -k -s 45:00.1
45:00.1 SCSI storage controller
$ lspci -nn -k -s 45:00.1
45:00.1 SCSI storage controller
...
Kernel driver in use: vfio-pci
...
@@ -99,9 +99,9 @@ First, configure the `mountinfo.json`, as below:
```json
{
"device": "45:00.1",
"volume_type": "vfiovol",
"fs_type": "ext4",
"metadata":"{}",
"volume_type": "vfiovol",
"fs_type": "ext4",
"metadata":"{}",
"options": []
}
```
@@ -111,9 +111,9 @@ First, configure the `mountinfo.json`, as below:
```json
{
"device": "0000:45:00.1",
"volume_type": "vfiovol",
"fs_type": "ext4",
"metadata":"{}",
"volume_type": "vfiovol",
"fs_type": "ext4",
"metadata":"{}",
"options": []
}
```
@@ -122,10 +122,10 @@ First, configure the `mountinfo.json`, as below:
```json
{
"device": "/dev/vfio/110",
"volume_type": "vfiovol",
"fs_type": "ext4",
"metadata":"{}",
"device": "/dev/vfio/110",
"volume_type": "vfiovol",
"fs_type": "ext4",
"metadata":"{}",
"options": []
}
```
@@ -135,7 +135,7 @@ Second, run kata-containers with device(`/dev/vfio/110`) as an example:
```bash
$ sudo kata-ctl direct-volume add /kubelet/kata-vfio-vol-003/vfiovol003 "{\"device\": \"/dev/vfio/110\", \"volume_type\": \"vfiovol\", \"fs_type\": \"ext4\", \"metadata\":"{}", \"options\": []}"
$ # /kubelet/kata-vfio-vol-003/directvol003 <==> /run/kata-containers/shared/direct-volumes/F0va22F0ZvaS12F0YS10a2F0DAxvbC0F0ZXvdm9sdF0Z0YSx
$ cat F0va22F0ZvaS12F0YS10a2F0DAxvbC0F0ZXvdm9sdF0Z0YSx/mountInfo.json
{"volume_type":"vfiovol","device":"/dev/vfio/110","fs_type":"ext4","metadata":{},"options":[]}
```

View File

@@ -1,5 +1,5 @@
## Introduction
To improve security, Kata Containers supports running the VMM process (QEMU and cloud-hypervisor) as a non-`root` user.
This document describes how to enable the rootless VMM mode and its limitations.
## Pre-requisites
@@ -20,8 +20,8 @@ By default, the VMM process still runs as the root user. There are two ways to e
2. Set the Kubernetes annotation `io.katacontainers.hypervisor.rootless` to `true`.
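As an illustration of the second option, the annotation can be set in a pod manifest (a minimal sketch; the pod name, runtime class and image are placeholders, and the annotation must be permitted by your runtime configuration):

```bash
$ cat <<'EOF' | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: rootless-vmm-demo
  annotations:
    io.katacontainers.hypervisor.rootless: "true"
spec:
  runtimeClassName: kata
  containers:
  - name: app
    image: docker.io/library/busybox:latest
    command: ["sleep", "infinity"]
EOF
```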
## Implementation details
When the `rootless` flag is enabled, upon a request to create a Pod, the Kata Containers runtime creates a random user and group (e.g. `kata-123`) and uses them to start the hypervisor process.
The `kvm` group is also given to the hypervisor process as a supplemental group to grant it access to the `/dev/kvm` device.
Another necessary change is to move the hypervisor runtime files (e.g. `vhost-fs.sock`, `qmp.sock`) to a directory (under `/run/user/[uid]/`) that only the non-root hypervisor user can access.
## Limitations
@@ -30,4 +30,4 @@ Another necessary change is to move the hypervisor runtime files (e.g. `vhost-fs
2. Currently, this feature is only supported in QEMU and cloud-hypervisor. For firecracker, you can use jailer to run the VMM process with a non-root user.
3. Certain features will not work when rootless VMM is enabled, including:
1. Passing devices to the guest (`virtio-blk`, `virtio-scsi`) will not work if the non-privileged user does not have permission to access it (leading to a permission denied error). A more permissive permission (e.g. 666) may overcome this issue. However, you need to be aware of the potential security implications of reducing the security on such devices.
2. `vfio` devices will also not work because of permission denied errors.

View File

@@ -50,7 +50,7 @@ There are several kinds of Kata configurations and they are listed below.
| `io.katacontainers.config.hypervisor.default_max_vcpus` | uint32| the maximum number of vCPUs allocated for the VM by the hypervisor |
| `io.katacontainers.config.hypervisor.default_memory` | uint32| the memory assigned for a VM by the hypervisor in `MiB` |
| `io.katacontainers.config.hypervisor.default_vcpus` | float32| the default vCPUs assigned for a VM by the hypervisor |
| `io.katacontainers.config.hypervisor.disable_block_device_use` | `boolean` | disable hotplugging host block devices to guest VMs for container rootfs |
| `io.katacontainers.config.hypervisor.disable_block_device_use` | `boolean` | disallow a block device from being used |
| `io.katacontainers.config.hypervisor.disable_image_nvdimm` | `boolean` | specify if a `nvdimm` device should be used as rootfs for the guest (QEMU) |
| `io.katacontainers.config.hypervisor.disable_vhost_net` | `boolean` | specify if `vhost-net` is not available on the host |
| `io.katacontainers.config.hypervisor.enable_hugepages` | `boolean` | if the memory should be `pre-allocated` from huge pages |
@@ -59,7 +59,7 @@ There are several kinds of Kata configurations and they are listed below.
| `io.katacontainers.config.hypervisor.enable_iothreads` | `boolean`| enable IO to be processed in a separate thread. Supported currently for virtio-`scsi` driver |
| `io.katacontainers.config.hypervisor.enable_mem_prealloc` | `boolean` | the memory space used for `nvdimm` device by the hypervisor |
| `io.katacontainers.config.hypervisor.enable_vhost_user_store` | `boolean` | enable vhost-user storage device (QEMU) |
| `io.katacontainers.config.hypervisor.vhost_user_reconnect_timeout_sec` | `string`| the timeout for reconnecting vhost user socket (QEMU) |
| `io.katacontainers.config.hypervisor.enable_virtio_mem` | `boolean` | enable virtio-mem (QEMU) |
| `io.katacontainers.config.hypervisor.entropy_source` (R) | string| the path to a host source of entropy (`/dev/random`, `/dev/urandom` or real hardware RNG device) |
| `io.katacontainers.config.hypervisor.file_mem_backend` (R) | string | file based memory backend root directory |
@@ -97,8 +97,6 @@ There are several kinds of Kata configurations and they are listed below.
| `io.katacontainers.config.hypervisor.use_legacy_serial` | `boolean` | uses legacy serial device for guest's console (QEMU) |
| `io.katacontainers.config.hypervisor.default_gpus` | uint32 | the minimum number of GPUs required for the VM. Only used by remote hypervisor to help with instance selection |
| `io.katacontainers.config.hypervisor.default_gpu_model` | string | the GPU model required for the VM. Only used by remote hypervisor to help with instance selection |
| `io.katacontainers.config.hypervisor.block_device_num_queues` | `usize` | The number of queues to use for block devices (runtime-rs only) |
| `io.katacontainers.config.hypervisor.block_device_queue_size` | uint32 | The size of the queue to use for block devices (runtime-rs only) |
## Container Options
| Key | Value Type | Comments |

View File

@@ -19,7 +19,7 @@ The Kubernetes cluster will use the
## Install and configure containerd
First, follow the [How to use Kata Containers and Containerd](containerd-kata.md) to install and configure containerd.
Then, make sure the containerd works with the [examples in it](containerd-kata.md#run).
## Install and configure Kubernetes
@@ -44,7 +44,7 @@ In order to allow Kubelet to use containerd (using the CRI interface), configure
```bash
$ sudo mkdir -p /etc/systemd/system/kubelet.service.d/
$ cat << EOF | sudo tee /etc/systemd/system/kubelet.service.d/0-containerd.conf
[Service]
Environment="KUBELET_EXTRA_ARGS=--container-runtime=remote --runtime-request-timeout=15m --container-runtime-endpoint=unix:///run/containerd/containerd.sock"
EOF
```
@@ -182,7 +182,7 @@ If a pod has the `runtimeClassName` set to `kata`, the CRI runs the pod with the
containers:
- name: nginx
image: nginx
EOF
```

View File

@@ -2,9 +2,9 @@
## Sysctls
In Linux, the sysctl interface allows an administrator to modify kernel
parameters at runtime. Parameters are available via the `/proc/sys/` virtual
process file system.
The parameters include the following subsystems among others:
- `fs` (file systems)
@@ -17,9 +17,9 @@ To get a complete list of kernel parameters, run:
$ sudo sysctl -a
```
Kubernetes provides mechanisms for setting namespaced sysctls.
Namespaced sysctls can be set per pod in the case of Kubernetes.
The following sysctls are known to be namespaced and can be set with
Kubernetes:
- `kernel.shm*`
@@ -84,14 +84,14 @@ The recommendation is to set them directly on the host or use a privileged
container in the case of Kubernetes.
In the case of Kata, the approach of setting sysctls on the host does not
work since the host sysctls have no effect on a Kata Container running
inside a guest. Kata gives you the ability to set non-namespaced sysctls using a privileged container.
This has the advantage that the non-namespaced sysctls are set inside the guest
without having any effect on the `/proc/sys` values of any other pod or the
host itself.
The recommended approach to do this would be to set the sysctl value in a
privileged init container. In this way, the application containers do not need any elevated
privileges.
```

View File

@@ -116,4 +116,4 @@ time for I in $(seq 100); do
done
# Display the memory usage again after running the test
free -h

View File

@@ -1,6 +1,6 @@
# Kata Agent Policy
Agent Policy is a Kata Containers feature that enables the Guest VM to perform additional validation for each [ttRPC API](../../src/libs/protocols/protos/agent.proto) request.
The Policy is commonly used for implementing confidential containers, where the Kata Shim and the Kata Agent have different trust properties. However, the Policy can be used for non-confidential containers too - e.g., for a basic defense in depth step of blocking the Host from starting an application on the Guest. However, for non-confidential containers, the Host might be able to modify the Policy and/or replace the Agent and disable its Policy rules, so a Policy is more helpful for confidential containers.
@@ -35,7 +35,7 @@ Kubernetes users can encode in `base64` format their Policy documents, and add t
For example, the [`allow-all-except-exec-process.rego`](../../src/kata-opa/allow-all-except-exec-process.rego) sample policy file is different from the [default Policy](../../src/kata-opa/allow-all.rego) because it rejects any `ExecProcess` requests. To encode this policy file, you need to:
- Embed the policy inside an init data struct
- Compress
- Base64 encode
For example:
```bash

View File

@@ -22,7 +22,7 @@ mitigation does not affect a container's ability to mount *guest* devices.
## Containerd
Containerd allows configuring the privileged host devices behavior for each runtime in the containerd config. This is
done with the `privileged_without_host_devices` option. Setting this to `true` will disable hot plugging of the host
devices into the guest, even when privileged is enabled.
Support for configuring privileged host devices behaviour was added in containerd version `1.3.0`.
@@ -49,7 +49,7 @@ See below example config:
## CRI-O
Similar to containerd, CRI-O allows configuring the privileged host devices
behavior for each runtime in the CRI config. This is done with the
`privileged_without_host_devices` option. Setting this to `true` will disable
hot plugging of the host devices into the guest, even when privileged is enabled.
@@ -74,4 +74,4 @@ See below example config:
```
- [Kata Containers with CRI-O](../how-to/run-kata-with-k8s.md#cri-o)

View File

@@ -30,7 +30,7 @@ POD ID CREATED STATE NAME
#### Create container in the pod sandbox with config file
```bash
$ sudo crictl create 16a62b035940f container_config.json sandbox_config.json
e6ca0e0f7f532686236b8b1f549e4878e4fe32ea6b599a5d684faf168b429202
```
@@ -66,7 +66,7 @@ $ sudo crictl exec -it e6ca0e0f7f532 sh
And run commands in it:
```
/ # hostname
busybox_host
/ # id
uid=0(root) gid=0(root)

View File

@@ -169,7 +169,7 @@ Add the following annotation for CRI-O
```yaml
io.kubernetes.cri-o.TrustedSandbox: "false"
```
The following is an example of what your YAML can look like:
```yaml
...
@@ -199,7 +199,7 @@ Add the following annotation for containerd
```yaml
io.kubernetes.cri.untrusted-workload: "true"
```
The following is an example of what your YAML can look like:
```yaml
...

View File

@@ -3,12 +3,12 @@
### What is VMCache
VMCache is a new function that creates VMs as caches before they are used.
It helps speed up new container creation.
The function consists of a server and some clients communicating
through Unix socket. The protocol is gRPC in [`protocols/cache/cache.proto`](../../src/runtime/protocols/cache/cache.proto).
The VMCache server will create some VMs and cache them by factory cache.
It will convert the VM to gRPC format and transport it when it gets
requested from clients.
Factory `grpccache` is the VMCache client. It will request gRPC format
VM and convert it back to a VM. If VMCache function is enabled,
`kata-runtime` will request VM from factory `grpccache` when it creates
@@ -16,8 +16,8 @@ a new sandbox.
### How is this different to VM templating
Both [VM templating](../how-to/what-is-vm-templating-and-how-do-I-use-it.md) and VMCache help speed up new container creation.
When VM templating is enabled, new VMs are created by cloning from a pre-created template VM, and they will share the same initramfs, kernel and agent memory in readonly mode. So it saves a lot of memory if there are many Kata Containers running on the same host.
VMCache is not vulnerable to [share memory CVE](../how-to/what-is-vm-templating-and-how-do-I-use-it.md#what-are-the-cons) because each VM doesn't share the memory.
### How to enable VMCache
@@ -25,9 +25,9 @@ VMCache is not vulnerable to [share memory CVE](../how-to/what-is-vm-templating-
VMCache can be enabled by changing your Kata Containers config file (`/usr/share/defaults/kata-containers/configuration.toml`,
overridden by `/etc/kata-containers/configuration.toml` if provided) such that:
* `vm_cache_number` specifies the number of caches of VMCache:
* unspecified or == 0
VMCache is disabled
* `> 0`
will be set to the specified number
* `vm_cache_endpoint` specifies the address of the Unix socket.
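For illustration, both settings live in the `[factory]` section of the configuration file; the values below are examples only (a sketch, assuming the override file at `/etc/kata-containers/configuration.toml` is in use):

```bash
# Append (or merge into an existing) [factory] section with VMCache settings.
$ cat <<'EOF' | sudo tee -a /etc/kata-containers/configuration.toml
[factory]
# Keep 3 VMs cached; unset or 0 disables VMCache.
vm_cache_number = 3
# Unix socket used by the VMCache server and its clients.
vm_cache_endpoint = "/var/run/kata-containers/cache.sock"
EOF
```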

View File

@@ -10,8 +10,8 @@ much like a process fork done by the kernel but here we *fork* VMs.
### How is this different from VMCache
Both [VMCache](../how-to/what-is-vm-cache-and-how-do-I-use-it.md) and VM templating help speed up new container creation.
When VMCache is enabled, new VMs are created by the VMCache server. So it is not vulnerable to share memory CVE because each VM doesn't share the memory.
VM templating saves a lot of memory if there are many Kata Containers running on the same host.
### What are the Pros

View File

@@ -1,11 +1,11 @@
# Kata Containers installation guides
The following is an overview of the different installation methods available.
## Prerequisites
Kata Containers requires nested virtualization or bare metal. Check
[hardware requirements](./../../README.md#hardware-requirements) to see if your system is capable of running Kata
Containers.
The Kata Deploy Helm chart is the preferred way to install all of the binaries and

View File

@@ -101,7 +101,7 @@
```
> **Note:**
>
> The containerd daemon needs to be able to find the
> `containerd-shim-kata-v2` binary to allow Kata Containers to be created.

View File

@@ -1,10 +1,10 @@
# Kata Containers 3.0 rust runtime installation
The following is an overview of the different installation methods available.
## Prerequisites
Kata Containers 3.0 rust runtime requires nested virtualization or bare metal. Check
[hardware requirements](/src/runtime/README.md#hardware-requirements) to see if your system is capable of running Kata
Containers.
### Platform support
@@ -25,10 +25,10 @@ architectures:
| Installation method | Description | Automatic updates | Use case | Availability
|------------------------------------------------------|----------------------------------------------------------------------------------------------|-------------------|-----------------------------------------------------------------------------------------------|----------- |
| [Using kata-deploy](#kata-deploy-installation) | The preferred way to deploy the Kata Containers distributed binaries on a Kubernetes cluster | **No!** | Best way to give it a try on kata-containers on an already up and running Kubernetes cluster. | Yes |
| [Using official distro packages](#official-packages) | Kata packages provided by Linux distributions official repositories | yes | Recommended for most users. | No |
| [Automatic](#automatic-installation) | Run a single command to install a full system | **No!** | For those wanting the latest release quickly. | No |
| [Manual](#manual-installation) | Follow a guide step-by-step to install a working system | **No!** | For those who want the latest release with more control. | No |
| [Build from source](#build-from-source-installation) | Build the software components manually | **No!** | Power users and developers only. | Yes |
### Kata Deploy Installation
@@ -57,7 +57,7 @@ Follow the [`kata-deploy`](../../tools/packaging/kata-deploy/helm-chart/README.m
```
* Musl support for fully static binary
Example for `x86_64`
```
$ rustup target add x86_64-unknown-linux-musl

View File

@@ -103,8 +103,48 @@ $ minikube ssh "grep -c -E 'vmx|svm' /proc/cpuinfo"
## Installing Kata Containers
You can now install the Kata Containers runtime components
[following the official instructions](../../tools/packaging/kata-deploy/helm-chart).
You can now install the Kata Containers runtime components. You will need a local copy of some Kata
Containers components to help with this, and then use `kubectl` on the host (that Minikube has already
configured for you) to deploy them:
```sh
$ git clone https://github.com/kata-containers/kata-containers.git
$ cd kata-containers/tools/packaging/kata-deploy
$ kubectl apply -f kata-rbac/base/kata-rbac.yaml
$ kubectl apply -f kata-deploy/base/kata-deploy.yaml
```
This installs the Kata Containers components into `/opt/kata` inside the Minikube node. It can take
a few minutes for the operation to complete. You can check the installation has worked by checking
the status of the `kata-deploy` pod, which will be executing
[this script](../../tools/packaging/kata-deploy/scripts/kata-deploy.sh),
and will be executing a `sleep infinity` once it has successfully completed its work.
You can accomplish this by running the following:
```sh
$ podname=$(kubectl -n kube-system get pods -o=name | grep -F kata-deploy | sed 's?pod/??')
$ kubectl -n kube-system exec ${podname} -- ps -ef | grep -F infinity
```
> *NOTE:* This check only works for single node clusters, which is the default for Minikube.
> For multi-node clusters, the check would need to be adapted to check `kata-deploy` had
> completed on all nodes.
## Enabling Kata Containers
Now you have installed the Kata Containers components in the Minikube node. Next, you need to configure
Kubernetes `RuntimeClass` to know when to use Kata Containers to run a pod.
### Register the runtime
Now register the `kata qemu` runtime with that class. This should result in no errors:
```sh
$ cd kata-containers/tools/packaging/kata-deploy/runtimeclasses
$ kubectl apply -f kata-runtimeClasses.yaml
```
The Kata Containers installation process should be complete and enabled in the Minikube cluster.
## Testing Kata Containers

View File

@@ -48,7 +48,7 @@ $ make test
- Run a test in the current package in verbose mode:
```bash
# Example
$ test="config::tests::test_get_log_level"
$ cargo test "$test" -vv -- --exact --nocapture
@@ -223,7 +223,7 @@ What's wrong with this function?
```rust
fn foo(config: &Config, path_prefix: String, container_id: String, pid: String) -> Result<()> {
let mut full_path = format!("{path_prefix}/{container_id}");
let mut full_path = format!("{}/{}", path_prefix, container_id);
let _ = remove_recursively(&mut full_path);

View File

@@ -80,8 +80,8 @@ In case of Kata, today the devices which we need in the guest are:
- Dynamic Resource Management: `ACPI` is utilized to allow for dynamic VM
resource management (for example: CPU, memory, device hotplug). This is
required when containers are resized, or more generally when containers are
added to a pod.
How these devices are utilized varies depending on the VMM utilized. We clarify
the default settings provided when integrating Kata with the QEMU, Dragonball,
Firecracker and Cloud Hypervisor VMMs in the following sections.

View File

@@ -107,7 +107,7 @@ Containers agent:
By default, tracing is disabled for all components. To enable _any_ form of
tracing an `enable_tracing` option must be enabled for at least one component.
> **Note:**
>
> Enabling this option will only allow tracing for subsequently
> started containers.
@@ -187,7 +187,7 @@ from improvements in the tracing infrastructure. Overall, the impact of
enabling runtime and agent tracing should be extremely low.
## Agent shutdown behaviour
In normal operation, the Kata runtime manages the VM shutdown and performs
certain optimisations to speed up this process. However, if agent tracing is
enabled, the agent itself is responsible for shutting down the VM. This is to

View File

@@ -83,10 +83,10 @@ If you have [s390-tools](https://github.com/ibm-s390-linux/s390-tools) available
```
[container]# lszcrypt -V
CARD.DOM TYPE MODE STATUS REQUESTS PENDING HWTYPE QDEPTH FUNCTIONS DRIVER SESTAT
--------------------------------------------------------------------------------------------------------
03 CEX8P EP11-Coproc online 2 0 14 08 -----XN-F- cex4card -
03.0041 CEX8P EP11-Coproc online 2 0 14 08 -----XN-F- cex4queue usable
```
---

View File

@@ -3,4 +3,4 @@
Kata Containers supports passing certain GPUs from the host into the container. Select the GPU vendor for detailed information:
- [Intel Discrete GPUs](Intel-Discrete-GPU-passthrough-and-Kata.md)/[Intel Integrated GPUs](Intel-GPU-passthrough-and-Kata.md)
- [NVIDIA GPUs](NVIDIA-GPU-passthrough-and-Kata.md) and [Enabling NVIDIA GPU workloads using GPU passthrough with Kata Containers](NVIDIA-GPU-passthrough-and-Kata-QEMU.md)
- [NVIDIA](NVIDIA-GPU-passthrough-and-Kata.md)

View File

@@ -6,15 +6,15 @@ For integrated GPUs please refer to [Integrate-Intel-GPUs-with-Kata](Intel-GPU-p
> **Note:** These instructions are for a system that has an x86_64 CPU.
An Intel Discrete GPU can be passed to a Kata Container using GPU passthrough,
or SR-IOV passthrough.
In Intel GPU pass-through mode, an entire physical GPU is directly assigned to one VM.
In this mode of operation, the GPU is accessed exclusively by the Intel driver running in
the VM to which it is assigned. The GPU is not shared among VMs.
With SR-IOV mode, it is possible to pass a Virtual GPU instance to a virtual machine.
With this, multiple Virtual GPU instances can be carved out of a single physical GPU
and be passed to different VMs, allowing the GPU to be shared.
| Technology | Description |
@@ -28,13 +28,13 @@ Intel GPUs Recommended for Virtualization:
- Intel® Data Center GPU Max Series (`Ponte Vecchio`)
- Intel® Data Center GPU Flex Series (`Arctic Sound-M`)
- Intel® Data Center GPU Arc Series
The following steps outline the workflow for using an Intel Graphics device with Kata Containers.
## Host BIOS requirements
Hardware such as the Intel Max and Flex series requires larger PCI BARs.
For large BAR devices, MMIO mapping above the 4GB address space should be enabled in the PCI configuration of the BIOS.
@@ -89,7 +89,7 @@ CONFIG_VFIO_IOMMU_TYPE1
CONFIG_VFIO_PCI
```
## Host kernel command line
Your host kernel needs to be booted with `intel_iommu=on` and `i915.enable_iaf=0` on the kernel command
line.
@@ -112,8 +112,8 @@ $ sudo update-grub
For CentOS/RHEL:
```bash
$ sudo grub2-mkconfig -o /boot/grub2/grub.cfg
```
4. Reboot the system
```bash
@@ -234,7 +234,7 @@ Use the following steps to pass an Intel Graphics device in SR-IOV mode to a Kat
$ BDF="0000:3a:00.0"
$ cat "/sys/bus/pci/devices/$BDF/sriov_totalvfs"
63
```
Create SR-IOV interfaces for the GPU:
```sh

View File

@@ -1,569 +0,0 @@
# Enabling NVIDIA GPU workloads using GPU passthrough with Kata Containers
This page provides:
1. A description of the components involved when running GPU workloads with
Kata Containers using the NVIDIA TEE and non-TEE GPU runtime classes.
1. An explanation of the orchestration flow on a Kubernetes node for this
scenario.
1. A deployment guide enabling you to utilize these runtime classes.
The goal is to educate readers familiar with Kubernetes and Kata Containers
on NVIDIA's reference implementation which is reflected in Kata CI's build
and test framework. With this, we aim to enable readers to leverage this
stack, or to use the principles behind this stack in order to run GPU
workloads on their variant of the Kata Containers stack.
We assume the reader is familiar with Kubernetes, Kata Containers, and
Confidential Containers.
> **Note:**
>
> The current supported mode for enabling GPU workloads in the TEE scenario
> is single GPU passthrough (one GPU per pod) on AMD64 platforms (AMD SEV-SNP
> being the only supported TEE scenario so far with support for Intel TDX being
> on the way).
## Component Overview
Before providing deployment guidance, we describe the components involved to
support running GPU workloads. We start from a top to bottom perspective
from the NVIDIA GPU operator via the Kata runtime to the components within
the NVIDIA GPU Utility Virtual Machine (UVM) root filesystem.
### NVIDIA GPU Operator
A central component is the
[NVIDIA GPU operator](https://github.com/NVIDIA/gpu-operator) which can be
deployed onto your cluster as a helm chart. Installing the GPU operator
delivers various operands on your nodes in the form of Kubernetes DaemonSets.
These operands are vital to support the flow of orchestrating pod manifests
using NVIDIA GPU runtime classes with GPU passthrough on your nodes. Without
getting into the details, the most important operands and their
responsibilities are:
- **nvidia-vfio-manager:** Binding discovered NVIDIA GPUs to the `vfio-pci`
driver for VFIO passthrough.
- **nvidia-cc-manager:** Transitioning GPUs into confidential computing (CC)
and non-CC mode (see the
[NVIDIA/k8s-cc-manager](https://github.com/NVIDIA/k8s-cc-manager)
repository).
- **nvidia-kata-manager:** Creating host-side CDI specifications for GPU
passthrough, resulting in the file `/var/run/cdi/nvidia.yaml`, containing
`kind: nvidia.com/pgpu` (see the
[NVIDIA/k8s-kata-manager](https://github.com/NVIDIA/k8s-kata-manager)
repository).
- **nvidia-sandbox-device-plugin** (see the
[NVIDIA/sandbox-device-plugin](https://github.com/NVIDIA/sandbox-device-plugin)
repository):
- Allocating GPUs during pod deployment.
- Discovering NVIDIA GPUs, their capabilities, and advertising these to
the Kubernetes control plane (allocatable resources as type
`nvidia.com/pgpu` resources will appear for the node and GPU Device IDs
will be registered with Kubelet). These GPUs can thus be allocated as
container resources in your pod manifests. See below GPU operator
deployment instructions for the use of the key `pgpu`, controlled via a
variable.
To summarize, the GPU operator manages the GPUs on each node, allowing for
simple orchestration of pod manifests using Kata Containers. Once the cluster
with GPU operator and Kata bits is up and running, the end user can schedule
Kata NVIDIA GPU workloads, using resource limits and the
`kata-qemu-nvidia-gpu` or `kata-qemu-nvidia-gpu-snp` runtime classes, for
example:
```yaml
apiVersion: v1
kind: Pod
...
spec:
...
runtimeClassName: kata-qemu-nvidia-gpu-snp
...
resources:
limits:
"nvidia.com/pgpu": 1
...
```
When this happens, the Kubelet calls into the sandbox device plugin to
allocate a GPU. The sandbox device plugin returns `DeviceSpec` entries to the
Kubelet for the allocated GPU. The Kubelet uses internal device IDs for
tracking of allocated GPUs and includes the device specifications in the CRI
request when scheduling the pod through containerd. Containerd processes the
device specifications and includes the device configuration in the OCI
runtime spec used to invoke the Kata runtime during the create container
request.
### Kata runtime
The Kata runtime for the NVIDIA GPU handlers is configured to cold-plug VFIO
devices (`cold_plug_vfio` is set to `root-port` while
`hot_plug_vfio` is set to `no-port`). Cold-plug is by design the only
supported mode for NVIDIA GPU passthrough of the NVIDIA reference stack.
With cold-plug, the Kata runtime attaches the GPU at VM launch time, when
creating the pod sandbox. This happens *before* the create container request,
i.e., before the Kata runtime receives the OCI spec including device
configurations from containerd. Thus, a mechanism to acquire the device
information is required. This is done by the runtime calling the
`coldPlugDevices()` function during sandbox creation. In this function,
the runtime queries Kubelet's Pod Resources API to discover allocated GPU
device IDs (e.g., `nvidia.com/pgpu = [vfio0]`). The runtime formats these as
CDI device identifiers and injects them into the OCI spec using
`config.InjectCDIDevices()`. The runtime then consults the host CDI
specifications and determines the device path the GPU is backed by
(e.g., `/dev/vfio/devices/vfio0`). Finally, the runtime resolves the device's
PCI BDF (e.g., `0000:21:00`) and cold-plugs the GPU by launching QEMU with
relevant parameters for device passthrough (e.g.,
`-device vfio-pci,host=0000:21:00.0,x-pci-vendor-id=0x10de,x-pci-device-id=0x2321,bus=rp0,iommufd=iommufdvfio-faf829f2ea7aec330`).
The runtime also creates *inner runtime* CDI annotations
which map host VFIO devices to guest GPU devices. These are annotations
intended for the kata-agent, here referred to as the inner runtime (inside the
UVM), to properly handle GPU passthrough into containers. These annotations
serve as metadata providing the kata-agent with the information needed to
attach the passthrough devices to the correct container.
The annotations are key-value pairs consisting of `cdi.k8s.io/vfio<num>` keys
(derived from the host VFIO device path, e.g., `/dev/vfio/devices/vfio1`) and
`nvidia.com/gpu=<index>` values (referencing the corresponding device in the
guest CDI spec). These annotations are injected by the runtime during container
creation via the `annotateContainerWithVFIOMetadata` function (see
`container.go`).
We continue describing the orchestration flow inside the UVM in the next
section.
### Kata NVIDIA GPU UVM
#### UVM composition
To better understand the orchestration flow inside the NVIDIA GPU UVM, we
first look at the components its root filesystem contains. Should you decide
to use your own root filesystem to enable NVIDIA GPU scenarios, this should
give you a good idea on what ingredients you need.
From a file system perspective, the UVM is composed of two files: a standard
Kata kernel image and the NVIDIA GPU rootfs in initrd or disk image format.
These two files are being utilized for the QEMU launch command when the UVM
is created.
The two most important pieces in Kata Container's build recipes for the
NVIDIA GPU root filesystem are the `nvidia_chroot.sh` and `nvidia_rootfs.sh`
files. The build follows a two-stage process. In the first stage, a
full-fledged Ubuntu-based root filesystem is composed within a chroot
environment. In this stage, NVIDIA kernel modules are built and signed
against the current Kata kernel and relevant NVIDIA packages are installed.
In the second stage, a chiseled build is performed: Only relevant contents
from the first stage are copied and compressed into a new distro-less root
filesystem folder. Kata's build infrastructure then turns this root
filesystem into the NVIDIA initrd and image files.
The resulting root filesystem contains the following software components:
- NVRC - the
[NVIDIA Runtime Container init system](https://github.com/NVIDIA/nvrc/tree/main)
- NVIDIA drivers (kernel modules)
- NVIDIA user space driver libraries
- NVIDIA user space tools
- kata-agent
- confidential computing guest components: the attestation agent,
confidential data hub and api-server-rest binaries
- CRI-O pause container (for the guest image-pull method)
- BusyBox utilities (provides a base set of libraries and binaries, and a
linker)
- some supporting files, such as file containing a list of supported GPU
device IDs which NVRC reads
#### UVM orchestration flow
When the Kata runtime asks QEMU to launch the VM, the UVM's Linux kernel
boots and mounts the root filesystem. After this, NVRC starts as the initial
process.
NVRC scans for NVIDIA GPUs on the PCI bus, loads the
NVIDIA kernel modules, waits for driver initialization, creates the device nodes,
and initializes the GPU hardware (using the `nvidia-smi` binary). NVRC also
creates the guest-side CDI specification file (using the
`nvidia-ctk cdi generate` command). This file specifies devices of
`kind: nvidia.com/gpu`, i.e., GPUs appearing to be physical GPUs on regular
bare metal systems. The guest CDI specification also contains `containerEdits`
for each device, specifying device nodes (e.g., `/dev/nvidia0`,
`/dev/nvidiactl`), library mounts, and environment variables to be mounted
into the container which receives the passthrough GPU.
Then, NVRC forks the Kata agent while continuing to run as the
init system. This allows NVRC to handle ongoing GPU management tasks
while kata-agent focuses on container lifecycle management. See the
[NVRC sources](https://github.com/NVIDIA/nvrc/blob/main/src/main.rs) for an
overview on the steps carried out by NVRC.
When the Kata runtime sends the create container request, the Kata agent
parses the inner runtime CDI annotation. For example, for the inner runtime
annotation `"cdi.k8s.io/vfio1": "nvidia.com/gpu=0"`, the agent looks up device
`0` in the guest CDI specification with `kind: nvidia.com/gpu`.
The Kata agent also reads the guest CDI specification's `containerEdits`
section and injects relevant contents into the OCI spec of the respective
container. The kata agent then creates and starts a `rustjail` container
based on the final OCI spec. The container now has relevant device nodes,
binaries and low-level libraries available, and can start a user application
linked against the CUDA runtime API (e.g., `libcudart.so` and other
libraries). When used, the CUDA runtime API in turn calls the CUDA driver
API and kernel drivers, interacting with the pass-through GPU device.
An additional step is exercised in our CI samples: when using images from an
authenticated registry, the guest-pull mechanism triggers attestation using
trustee's Key Broker Service (KBS) for secure release of the NGC API
authentication key used to access the NVCR container registry. As part of
this, the attestation agent exercises composite attestation and transitions
the GPU into `Ready` state (without this, the GPU has to explicitly be
transitioned into `Ready` state by passing the `nvrc.smi.srs=1` kernel
parameter via the shim config, causing NVRC to transition the GPU into the
`Ready` state).
## Deployment Guidance
This guidance assumes you use bare-metal machines with proper support for
Kata's non-TEE and TEE GPU workload deployment scenarios for your Kubernetes
nodes. We provide guidance based on the upstream Kata CI procedures for the
NVIDIA GPU CI validation jobs. Note that this setup:
- uses the guest image pull method to pull container image layers
- uses the genpolicy tool to attach Kata agent security policies to the pod
manifest
- has dedicated (composite) attestation tests, a CUDA vectorAdd test, and a
NIM/RA test sample with secure API key release
A similar deployment guide and scenario description can be found in NVIDIA resources
under
[Early Access: NVIDIA GPU Operator with Confidential Containers based on Kata](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/confidential-containers.html).
### Requirements
The requirements for the TEE scenario are:
- Ubuntu 25.10 as host OS
- CPU with AMD SEV-SNP support with proper BIOS/UEFI version and settings
- CC-capable Hopper/Blackwell GPU with proper VBIOS version.
BIOS and VBIOS configuration is out of scope for this guide. Other resources,
such as the documentation found on the
[NVIDIA Trusted Computing Solutions](https://docs.nvidia.com/nvtrust/index.html)
page and the above linked NVIDIA documentation, provide guidance on
selecting proper hardware and on properly configuring its firmware and OS.
### Installation
#### Containerd and Kubernetes
First, set up your Kubernetes cluster. For instance, in Kata CI, our NVIDIA
jobs use a single-node vanilla Kubernetes cluster with a 2.x containerd
version and Kata's current supported Kubernetes version. We set this cluster
up using the `deploy_k8s` function from `tests/integration/kubernetes/gha-run.sh`
as follows:
```bash
$ export KUBERNETES="vanilla"
$ export CONTAINER_ENGINE="containerd"
$ export CONTAINER_ENGINE_VERSION="v2.1"
$ source tests/gha-run-k8s-common.sh
$ deploy_k8s
```
> **Note:**
>
> We recommend configuring your Kubelet with a higher
> `runtimeRequestTimeout` value than the default two-minute timeout.
> Using the guest-pull mechanism, pulling large images may take a significant
> amount of time and may delay container start, possibly leading your Kubelet
> to de-allocate your pod before it transitions from the *container created*
> to the *container running* state.
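For example, on a kubeadm-provisioned node this could look as follows (a sketch; the config path, timeout value and restart method are assumptions for your environment):

```bash
# Raise the Kubelet runtime request timeout, then restart the Kubelet.
$ sudo sed -i 's/^runtimeRequestTimeout:.*/runtimeRequestTimeout: 30m/' /var/lib/kubelet/config.yaml
$ grep -q '^runtimeRequestTimeout:' /var/lib/kubelet/config.yaml || \
    echo 'runtimeRequestTimeout: 30m' | sudo tee -a /var/lib/kubelet/config.yaml
$ sudo systemctl restart kubelet
```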
> **Note:**
>
> The NVIDIA GPU runtime classes use VFIO cold-plug which, as
> described above, requires the Kata runtime to query Kubelet's Pod Resources
> API to discover allocated GPU devices during sandbox creation. For
> Kubernetes versions **older than 1.34**, you must explicitly enable the
> `KubeletPodResourcesGet` feature gate in your Kubelet configuration. For
> Kubernetes 1.34 and later, this feature is enabled by default.
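For Kubernetes versions before 1.34, the feature gate could be enabled like this (a sketch, assuming the Kubelet config is at `/var/lib/kubelet/config.yaml` and does not already define a `featureGates` section):

```bash
# Enable the Pod Resources "Get" API in the Kubelet, then restart it.
$ cat <<'EOF' | sudo tee -a /var/lib/kubelet/config.yaml
featureGates:
  KubeletPodResourcesGet: true
EOF
$ sudo systemctl restart kubelet
```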
#### GPU Operator
Assuming you have the helm tools installed, deploy the latest version of the
GPU Operator as a helm chart (minimum version: `v25.10.0`):
```bash
$ helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update
$ helm install --wait --generate-name \
-n gpu-operator --create-namespace \
nvidia/gpu-operator \
--set sandboxWorkloads.enabled=true \
--set sandboxWorkloads.defaultWorkload=vm-passthrough \
--set kataManager.enabled=true \
--set kataManager.config.runtimeClasses=null \
--set kataManager.repository=nvcr.io/nvidia/cloud-native \
--set kataManager.image=k8s-kata-manager \
--set kataManager.version=v0.2.4 \
--set ccManager.enabled=true \
--set ccManager.defaultMode=on \
--set ccManager.repository=nvcr.io/nvidia/cloud-native \
--set ccManager.image=k8s-cc-manager \
--set ccManager.version=v0.2.0 \
--set sandboxDevicePlugin.repository=nvcr.io/nvidia/cloud-native \
--set sandboxDevicePlugin.image=nvidia-sandbox-device-plugin \
--set sandboxDevicePlugin.version=v0.0.1 \
--set 'sandboxDevicePlugin.env[0].name=P_GPU_ALIAS' \
--set 'sandboxDevicePlugin.env[0].value=pgpu' \
--set nfd.enabled=true \
--set nfd.nodefeaturerules=true
```
> **Note:**
>
> For heterogeneous clusters with different GPU types, you can omit
> the `P_GPU_ALIAS` environment variable lines. This will cause the sandbox
> device plugin to create GPU model-specific resource types (e.g.,
> `nvidia.com/GH100_H100L_94GB`) instead of the generic `nvidia.com/pgpu`,
> which in turn can be used by pods through respective resource limits.
> For simplicity, this guide uses the generic alias.
> **Note:**
>
> Using `--set sandboxWorkloads.defaultWorkload=vm-passthrough` causes all
> your nodes to be labeled for GPU VM passthrough. Remove this parameter if
> you intend to only use selected nodes for this scenario, and label these
> nodes by hand, using:
> `kubectl label node <node-name> nvidia.com/gpu.workload.config=vm-passthrough`.
#### Kata Containers
Install the latest Kata Containers helm chart, similar to
[existing documentation](https://github.com/kata-containers/kata-containers/blob/main/tools/packaging/kata-deploy/helm-chart/README.md)
(minimum version: `3.24.0`).
```bash
$ export VERSION=$(curl -sSL https://api.github.com/repos/kata-containers/kata-containers/releases/latest | jq .tag_name | tr -d '"')
$ export CHART="oci://ghcr.io/kata-containers/kata-deploy-charts/kata-deploy"
$ helm install kata-deploy \
--namespace kata-system \
--create-namespace \
-f "https://raw.githubusercontent.com/kata-containers/kata-containers/refs/tags/${VERSION}/tools/packaging/kata-deploy/helm-chart/kata-deploy/try-kata-nvidia-gpu.values.yaml" \
--set nfd.enabled=false \
--set shims.qemu-nvidia-gpu-tdx.enabled=false \
--wait --timeout 10m --atomic \
"${CHART}" --version "${VERSION}"
```
#### Trustee's KBS for remote attestation
For our Kata CI runners we use Trustee's KBS for composite attestation for
secure key release, for instance, for test scenarios which use authenticated
container images. In such scenarios, the credentials to access the
authenticated container registry are only released to the confidential guest
after successful attestation. Please see the section below for more
information about this.
```bash
$ export NVIDIA_VERIFIER_MODE="remote"
$ export KBS_INGRESS="nodeport"
$ bash tests/integration/kubernetes/gha-run.sh deploy-coco-kbs
$ bash tests/integration/kubernetes/gha-run.sh install-kbs-client
```
Please note that Trustee can also be deployed via any other upstream
mechanism as documented by the
[confidential-containers repository](https://github.com/confidential-containers/trustee).
For our architecture it is important to set up KBS in the remote verifier
mode which requires entering a licensing agreement with NVIDIA, see the
[notes in confidential-containers repository](https://github.com/confidential-containers/trustee/blob/main/deps/verifier/src/nvidia/README.md).
### Cluster validation and preparation
If you did not use the `sandboxWorkloads.defaultWorkload=vm-passthrough`
parameter during GPU operator deployment, label your nodes for GPU VM
passthrough, for the example of using all nodes for GPU passthrough, run:
```bash
$ kubectl label nodes --all nvidia.com/gpu.workload.config=vm-passthrough --overwrite
```
Check if the `nvidia-cc-manager` pod is running if you intend to run GPU TEE
scenarios. If not, you need to manually label the node as CC capable. Current
GPU Operator node feature rules do not yet recognize all CC capable GPU PCI
IDs. Run the following command:
```bash
$ kubectl label nodes --all nvidia.com/cc.capable=true
```
After this, ensure the `nvidia-cc-manager` pod is running. With the suggested
parameters for GPU Operator deployment, the `nvidia-cc-manager` will
automatically transition the GPU into CC mode.
After deployment, you can transition your node(s) to the desired CC state,
using either the `on` or `off` value, depending on your scenario. For the
non-CC scenario, transition to the `off` state via:
`kubectl label nodes --all nvidia.com/cc.mode=off` and wait until all pods
are back running. When an actual change is exercised, various GPU operator
operands will be restarted.
Ensure all pods are running:
```bash
$ kubectl get pods -A
```
On your node(s), ensure correct driver binding. Your GPU device should be
bound to the VFIO driver, i.e., showing `Kernel driver in use: vfio-pci`
when running:
```bash
$ lspci -nnk -d 10de:
```
### Run the CUDA vectorAdd sample
Create the following file:
```yaml
apiVersion: v1
kind: Pod
metadata:
name: cuda-vectoradd-kata
namespace: default
annotations:
io.katacontainers.config.hypervisor.kernel_params: "nvrc.smi.srs=1"
spec:
runtimeClassName: ${GPU_RUNTIME_CLASS_NAME}
restartPolicy: Never
containers:
- name: cuda-vectoradd
image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0-ubuntu22.04"
resources:
limits:
nvidia.com/pgpu: "1"
memory: 16Gi
```
Depending on your scenario and on the CC state, export your desired runtime
class name by defining the environment variable:
```bash
$ export GPU_RUNTIME_CLASS_NAME="kata-qemu-nvidia-gpu-snp"
```
Then, deploy the sample Kubernetes pod manifest and observe the pod logs:
```bash
$ envsubst < ./cuda-vectoradd-kata.yaml.in | kubectl apply -f -
$ kubectl wait --for=condition=Ready pod/cuda-vectoradd-kata --timeout=60s
$ kubectl logs -n default cuda-vectoradd-kata
```
Expect the following output:
```
[Vector addition of 50000 elements]
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 196 blocks of 256 threads
Copy output data from the CUDA device to the host memory
Test PASSED
Done
```
To stop the pod, run: `kubectl delete pod cuda-vectoradd-kata`.
### Next steps
#### Transition between CC and non-CC mode
Use the previously described node labeling approach to transition between
the CC and non-CC mode. In case of the non-CC mode, you can use the
`kata-qemu-nvidia-gpu` value for the `GPU_RUNTIME_CLASS_NAME` runtime class
variable in the above CUDA vectorAdd sample. The `kata-qemu-nvidia-gpu-snp`
runtime class will **NOT** work in this mode - and vice versa.
#### Run Kata CI tests locally
Upstream Kata CI runs the CUDA vectorAdd test, a composite attestation test,
and a basic NIM/RAG deployment. Running CI tests for the TEE GPU scenario
requires KBS to be deployed (except for the CUDA vectorAdd test). The best
place to get started running these tests locally is to look into our
[NVIDIA CI workflow manifest](https://github.com/kata-containers/kata-containers/blob/main/.github/workflows/run-k8s-tests-on-nvidia-gpu.yaml)
and into the underlying
[run_kubernetes_nv_tests.sh](https://github.com/kata-containers/kata-containers/blob/main/tests/integration/kubernetes/run_kubernetes_nv_tests.sh)
script. For example, to run the CUDA vectorAdd scenario against the TEE GPU
runtime class use the following commands:
```bash
# create the kata runtime class the test framework uses
$ export KATA_HYPERVISOR=qemu-nvidia-gpu-snp
$ kubectl delete runtimeclass kata --ignore-not-found
$ kubectl get runtimeclass "kata-${KATA_HYPERVISOR}" -o json | \
jq '.metadata.name = "kata" | del(.metadata.uid, .metadata.resourceVersion, .metadata.creationTimestamp)' | \
kubectl apply -f -
$ cd tests/integration/kubernetes
$ K8S_TEST_NV="k8s-nvidia-cuda.bats" ./gha-run.sh run-nv-tests
```
> **Note:**
>
> The other scenarios require an NGC API key to run, i.e., to export the
> `NGC_API_KEY` variable with a valid NGC API key.
#### Deploy pods using attestation
Attestation is a fundamental piece of the confidential containers solution.
In our upstream CI we use attestation, for example by leveraging the
authenticated container image pull mechanism where container images reside
in the authenticated NVCR registry (`k8s-nvidia-nim.bats`), and for
requesting secrets from KBS (`k8s-confidential-attestation.bats`). KBS will
release the image pull secret to a confidential guest. To get the
authentication credentials from inside the guest, KBS must already be
deployed and configured. In our CI samples, we configure KBS with the guest
image pull secret, a resource policy, and launch the pod with certain kernel
command line parameters:
`"agent.image_registry_auth=kbs:///default/credentials/nvcr agent.aa_kbc_params=cc_kbc::${CC_KBS_ADDR}"`.
The `agent.aa_kbc_params` option is a general configuration for attestation.
For your use case, you need to set the IP address and port under which KBS
is reachable through the `CC_KBS_ADDR` variable (see our CI sample). This
tells the guest how to reach KBS. Something like this must be set whenever
attestation is used, but on its own this parameter does not trigger
attestation. The `agent.image_registry_auth` option tells the guest to ask
for a resource from KBS and use it as the authentication configuration. When
this is set, the guest will request this resource at boot (and trigger
attestation) regardless of which image is being pulled.
To deploy your own pods using authenticated container images, or secure key
release for attestation, follow steps similar to our mentioned CI samples.
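As a sketch only, such a pod manifest could carry the kernel parameters via the usual Kata annotation; the KBS address, resource URI, image and pod name below are placeholders to adapt for your environment:

```bash
# Write a pod manifest that points the guest at your KBS instance for the
# image pull secret; substitute the placeholders before applying it.
$ export CC_KBS_ADDR="http://<kbs-ip>:<kbs-port>"
$ cat <<EOF > gpu-attestation-demo.yaml
apiVersion: v1
kind: Pod
metadata:
  name: gpu-attestation-demo
  annotations:
    io.katacontainers.config.hypervisor.kernel_params: "agent.image_registry_auth=kbs:///default/credentials/nvcr agent.aa_kbc_params=cc_kbc::${CC_KBS_ADDR}"
spec:
  runtimeClassName: kata-qemu-nvidia-gpu-snp
  restartPolicy: Never
  containers:
  - name: app
    image: nvcr.io/<org>/<authenticated-image>:<tag>
    resources:
      limits:
        nvidia.com/pgpu: "1"
EOF
```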
#### Deploy pods with Kata agent security policies
With GPU passthrough being supported by the
[genpolicy tool](https://github.com/kata-containers/kata-containers/tree/main/src/tools/genpolicy),
you can use the tool to create a Kata agent security policy. Our CI deploys
all sample pod manifests with a Kata agent security policy.
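For illustration, a policy could be generated and attached to a pod manifest roughly as follows (a sketch; check `genpolicy --help` for the exact flags of your build, and note that the tool is assumed to rewrite the manifest in place with the policy annotation):

```bash
# Build genpolicy from the Kata sources and annotate a pod manifest with a policy.
$ cd src/tools/genpolicy && cargo build --release
$ ./target/release/genpolicy -y /path/to/my-gpu-pod.yaml
```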
#### Deploy pods using your own containers and manifests
You can author pod manifests leveraging your own containers, for instance,
containers built using the CUDA container toolkit. We recommend starting
with a CUDA base container.
The GPU is transitioned into the `Ready` state via attestation, for instance,
when pulling authenticated images. If your deployment scenario does not use
attestation, please refer back to the CUDA vectorAdd pod manifest. In this
manifest, we ensure that NVRC sets the GPU to `Ready` state by adding the
following annotation in the manifest:
`io.katacontainers.config.hypervisor.kernel_params: "nvrc.smi.srs=1"`
> **Notes:**
>
> - musl-based container images (e.g., using Alpine), or distro-less
> containers are not supported.
> - for the TEE scenario, only single-GPU passthrough per pod is supported,
> so your pod resource limit must be: `nvidia.com/pgpu: "1"` (on a system
> with multiple GPUs, you can thus pass through one GPU per pod).

View File

@@ -1,25 +1,10 @@
# Using NVIDIA GPU device with Kata Containers
This page gives an overview on the different modes in which GPUs can be passed
to a Kata Containers container, provides host system requirements, explains how
Kata Containers guest components can be built to support the NVIDIA GPU
scenario, and gives practical usage examples using `ctr`.
Please see the guide
[Enabling NVIDIA GPU workloads using GPU passthrough with Kata Containers](NVIDIA-GPU-passthrough-and-Kata-QEMU.md)
for a documentation of an end-to-end reference implementation of a Kata
Containers stack for GPU passthrough using QEMU, the go-based Kata Runtime,
and an NVIDIA-specific root filesystem. This reference implementation is built
and validated in Kata's CI, and it can be used to test GPU workloads with Kata
components and Kubernetes out of the box.
## Comparison between Passthrough and vGPU Modes
An NVIDIA GPU device can be passed to a Kata Containers container using GPU
passthrough (NVIDIA GPU passthrough mode) as well as GPU mediated passthrough
passthrough (NVIDIA GPU pass-through mode) as well as GPU mediated passthrough
(NVIDIA `vGPU` mode).
NVIDIA GPU passthrough mode, an entire physical GPU is directly assigned to one
NVIDIA GPU pass-through mode, an entire physical GPU is directly assigned to one
VM, bypassing the NVIDIA Virtual GPU Manager. In this mode of operation, the GPU
is accessed exclusively by the NVIDIA driver running in the VM to which it is
assigned. The GPU is not shared among VMs.
@@ -35,20 +20,18 @@ with [MIG-slices](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/).
| Technology | Description | Behavior | Detail |
| --- | --- | --- | --- |
| NVIDIA GPU passthrough mode | GPU passthrough | Physical GPU assigned to a single VM | Direct GPU assignment to VM without limitation |
| NVIDIA GPU pass-through mode | GPU passthrough | Physical GPU assigned to a single VM | Direct GPU assignment to VM without limitation |
| NVIDIA vGPU time-sliced | GPU time-sliced | Physical GPU time-sliced for multiple VMs | Mediated passthrough |
| NVIDIA vGPU MIG-backed | GPU with MIG-slices | Physical GPU MIG-sliced for multiple VMs | Mediated passthrough |
## Host Requirements
## Hardware Requirements
### Hardware
NVIDIA GPUs recommended for virtualization:
NVIDIA GPUs Recommended for Virtualization:
- NVIDIA Tesla (T4, M10, P6, V100 or newer)
- NVIDIA Quadro RTX 6000/8000
### Firmware
## Host BIOS Requirements
Some hardware requires a larger PCI BARs window, for example, NVIDIA Tesla P100,
K40m
@@ -72,7 +55,9 @@ Some hardware vendors use a different name in BIOS, such as:
If one is using a GPU based on the Ampere architecture or later, SR-IOV
additionally needs to be enabled for the `vGPU` use-case.
### Kernel
The following steps outline the workflow for using an NVIDIA GPU with Kata.
## Host Kernel Requirements
The following configurations need to be enabled on your host kernel:
@@ -85,13 +70,7 @@ The following configurations need to be enabled on your host kernel:
Your host kernel needs to be booted with `intel_iommu=on` on the kernel command
line.
## Build the Kata Components
This section explains how to build an environment with Kata Containers bits
supporting the GPU scenario. We first deploy and configure the regular Kata
components, then describe how to build the guest kernel and root filesystem.
### Install and configure Kata Containers
## Install and configure Kata Containers
To use non-large BARs devices (for example, NVIDIA Tesla T4), you need Kata
version 1.3.0 or above. Follow the [Kata Containers setup
@@ -122,7 +101,7 @@ hotplug_vfio_on_root_bus = true
pcie_root_port = 1
```
### Build guest kernel with GPU support
## Build Kata Containers kernel with GPU support
The default guest kernel installed with Kata Containers does not provide GPU
support. To use an NVIDIA GPU with Kata Containers, you need to build a kernel
@@ -181,11 +160,11 @@ code, using `Dragonball VMM` for NVIDIA GPU `hot-plug/hot-unplug` requires apply
addition to the above kernel configuration items. Follow these steps to build for NVIDIA GPU `hot-[un]plug`
for `Dragonball`:
```sh
# Prepare .config to support both upcall and nvidia gpu
$ ./build-kernel.sh -v 5.10.25 -e -t dragonball -g nvidia -f setup
# Build guest kernel to support both upcall and nvidia gpu
$ ./build-kernel.sh -v 5.10.25 -e -t dragonball -g nvidia build
# Install guest kernel to support both upcall and nvidia gpu
@@ -217,7 +196,303 @@ Before using the new guest kernel, please update the `kernel` parameters in
kernel = "/usr/share/kata-containers/vmlinuz-nvidia-gpu.container"
```
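If you prefer to script this change, a `sed` one-liner like the one below can be used; the configuration path `/etc/kata-containers/configuration.toml` is only an assumed example, adjust it to wherever your `configuration.toml` lives:

```sh
# Point the Kata configuration at the NVIDIA-enabled guest kernel
# (the configuration path below is an assumed example).
$ sudo sed -i 's|^kernel = .*|kernel = "/usr/share/kata-containers/vmlinuz-nvidia-gpu.container"|' /etc/kata-containers/configuration.toml
```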
### Build Guest OS with NVIDIA Driver and Toolkit
## NVIDIA GPU pass-through mode with Kata Containers
Use the following steps to pass an NVIDIA GPU device in pass-through mode with Kata:
1. Find the Bus-Device-Function (BDF) for the GPU device on the host:
```sh
$ sudo lspci -nn -D | grep -i nvidia
0000:d0:00.0 3D controller [0302]: NVIDIA Corporation Device [10de:20b9] (rev a1)
```
> PCI address `0000:d0:00.0` is assigned to the hardware GPU device.
> `10de:20b9` is the device ID of the hardware GPU device.
2. Find the IOMMU group for the GPU device:
```sh
$ BDF="0000:d0:00.0"
$ readlink -e /sys/bus/pci/devices/$BDF/iommu_group
```
The output of the `readlink` command shows the IOMMU group the GPU belongs to,
in this example group 192. The next step is to bind the GPU to the VFIO-PCI driver.
```sh
$ BDF="0000:d0:00.0"
$ DEV="/sys/bus/pci/devices/$BDF"
$ echo "vfio-pci" > $DEV/driver_override
$ echo $BDF > $DEV/driver/unbind
$ echo $BDF > /sys/bus/pci/drivers_probe
# To return the device to the standard driver, we simply clear the
# driver_override and reprobe the device, e.g.:
$ echo > $DEV/driver_override
$ echo $BDF > $DEV/driver/unbind
$ echo $BDF > /sys/bus/pci/drivers_probe
```
3. Check the IOMMU group number under `/dev/vfio`:
```sh
$ ls -l /dev/vfio
total 0
crw------- 1 zvonkok zvonkok 243, 0 Mar 18 03:06 192
crw-rw-rw- 1 root root 10, 196 Mar 18 02:27 vfio
```
4. Start a Kata container with the GPU device:
```sh
# You may need to `modprobe vhost-vsock` if you get
# host system doesn't support vsock: stat /dev/vhost-vsock
$ sudo ctr --debug run --runtime "io.containerd.kata.v2" --device /dev/vfio/192 --rm -t "docker.io/library/archlinux:latest" arch uname -r
```
5. Run `lspci` within the container to verify the GPU device is seen in the list
of the PCI devices. Note the vendor-device id of the GPU (`10de:20b9`) in the `lspci` output.
```sh
$ sudo ctr --debug run --runtime "io.containerd.kata.v2" --device /dev/vfio/192 --rm -t "docker.io/library/archlinux:latest" arch sh -c "lspci -nn | grep '10de:20b9'"
```
6. Additionally, you can check the PCI BARs space of the NVIDIA GPU device in the container:
```sh
$ sudo ctr --debug run --runtime "io.containerd.kata.v2" --device /dev/vfio/192 --rm -t "docker.io/library/archlinux:latest" arch sh -c "lspci -s 02:00.0 -vv | grep Region"
```
> **Note**: If the `lspci` output lists the GPU's memory regions, the BAR space of the NVIDIA
> GPU has been successfully allocated.
## NVIDIA vGPU mode with Kata Containers
NVIDIA vGPU is a licensed product on all supported GPU boards. A software license
is required to enable all vGPU features within the guest VM. NVIDIA vGPU manager
needs to be installed on the host to configure GPUs in vGPU mode. See [NVIDIA Virtual GPU Software Documentation v14.0 through 14.1](https://docs.nvidia.com/grid/14.0/) for more details.
### NVIDIA vGPU time-sliced
In the time-sliced mode, the GPU is not partitioned; each workload uses the
whole GPU and shares access to the GPU engines. Processes are scheduled in
series. The best-effort scheduler is the default and can be exchanged for
other scheduling policies; see the documentation above for how to do that.
If `MIG` was enabled before, be sure to disable it on the GPU if you want
to use `time-sliced` `vGPU`.
```sh
$ sudo nvidia-smi -mig 0
```
Enable the virtual functions for the physical GPU in the `sysfs` file system.
```sh
$ sudo /usr/lib/nvidia/sriov-manage -e 0000:41:00.0
```
Get the `BDF`s of the available virtual functions on the GPU, and choose one for the
following steps.
```sh
$ cd /sys/bus/pci/devices/0000:41:00.0/
$ ls -l | grep virtfn
```
#### List all available vGPU instances
The following shell snippet walks the `sysfs` and prints only the instances
that are available, i.e. those that can be created.
```sh
# The 00.0 is often the PF of the device. The VFs will have the function in the
# BDF incremented by some values so e.g. the very first VF is 0000:41:00.4
cd /sys/bus/pci/devices/0000:41:00.0/
for vf in $(ls -d virtfn*)
do
    BDF=$(basename $(readlink -f $vf))
    for md in $(ls -d $vf/mdev_supported_types/*)
    do
        AVAIL=$(cat $md/available_instances)
        NAME=$(cat $md/name)
        DIR=$(basename $md)
        if [ $AVAIL -gt 0 ]; then
            echo "| BDF | INSTANCES | NAME | DIR |"
            echo "+--------------+-----------+----------------+------------+"
            printf "| %12s |%10d |%15s | %10s |\n\n" "$BDF" "$AVAIL" "$NAME" "$DIR"
        fi
    done
done
```
If there are available instances, you get something like this (for the first VF).
Beware that the output is highly dependent on the GPU you have; if there is no
output, check again that `MIG` is really disabled.
```sh
| BDF | INSTANCES | NAME | DIR |
+--------------+-----------+----------------+------------+
| 0000:41:00.4 | 1 | GRID A100D-4C | nvidia-692 |
| BDF | INSTANCES | NAME | DIR |
+--------------+-----------+----------------+------------+
| 0000:41:00.4 | 1 | GRID A100D-8C | nvidia-693 |
| BDF | INSTANCES | NAME | DIR |
+--------------+-----------+----------------+------------+
| 0000:41:00.4 | 1 | GRID A100D-10C | nvidia-694 |
| BDF | INSTANCES | NAME | DIR |
+--------------+-----------+----------------+------------+
| 0000:41:00.4 | 1 | GRID A100D-16C | nvidia-695 |
| BDF | INSTANCES | NAME | DIR |
+--------------+-----------+----------------+------------+
| 0000:41:00.4 | 1 | GRID A100D-20C | nvidia-696 |
| BDF | INSTANCES | NAME | DIR |
+--------------+-----------+----------------+------------+
| 0000:41:00.4 | 1 | GRID A100D-40C | nvidia-697 |
| BDF | INSTANCES | NAME | DIR |
+--------------+-----------+----------------+------------+
| 0000:41:00.4 | 1 | GRID A100D-80C | nvidia-698 |
```
Change to the `mdev_supported_types` directory for the virtual function on which
you want to create the `vGPU`. Taking the first output as an example:
```sh
$ cd virtfn0/mdev_supported_types/nvidia-692
$ UUIDGEN=$(uuidgen)
$ sudo bash -c "echo $UUIDGEN > create"
```
Confirm that the `vGPU` was created. You should see the `UUID` pointing to a
subdirectory of the `sysfs` space.
```sh
$ ls -l /sys/bus/mdev/devices/
```
Get the `IOMMU` group number and verify there is a `VFIO` device created to use
with Kata.
```sh
$ ls -l /sys/bus/mdev/devices/*/
$ ls -l /dev/vfio
```
Use the `VFIO` device created in the same way as in the pass-through use-case.
Beware that the guest needs the NVIDIA guest drivers, so one would need to build
a new guest `OS` image.
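For illustration, assuming the `mdev` device ended up in IOMMU group 193 (a hypothetical number, check `/dev/vfio` on your system), the container invocation mirrors the pass-through example:

```sh
# 193 is a placeholder IOMMU group number; substitute the one listed under /dev/vfio.
$ sudo ctr --debug run --runtime "io.containerd.kata.v2" --device /dev/vfio/193 --rm -t "docker.io/library/archlinux:latest" arch sh -c "lspci -nn | grep -i nvidia"
```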
### NVIDIA vGPU MIG-backed
We will not go into detail about what `MIG` is; briefly, it is a technology to
partition the hardware into independent instances with guaranteed quality of
service. For more details see [NVIDIA Multi-Instance GPU User Guide](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/).
First enable `MIG` mode for a GPU; depending on the platform you're running,
a reboot may be necessary. Some platforms support a GPU reset.
```sh
$ sudo nvidia-smi -mig 1
```
If the platform supports a GPU reset, one can run the following command; otherwise
you will get a warning to reboot the server.
```sh
$ sudo nvidia-smi --gpu-reset
```
By default, the driver provides a number of profiles that users can opt into when
configuring the MIG feature.
```sh
$ sudo nvidia-smi mig -lgip
+-----------------------------------------------------------------------------+
| GPU instance profiles: |
| GPU Name ID Instances Memory P2P SM DEC ENC |
| Free/Total GiB CE JPEG OFA |
|=============================================================================|
| 0 MIG 1g.10gb 19 7/7 9.50 No 14 0 0 |
| 1 0 0 |
+-----------------------------------------------------------------------------+
| 0 MIG 1g.10gb+me 20 1/1 9.50 No 14 1 0 |
| 1 1 1 |
+-----------------------------------------------------------------------------+
| 0 MIG 2g.20gb 14 3/3 19.50 No 28 1 0 |
| 2 0 0 |
+-----------------------------------------------------------------------------+
...
```
Create the GPU instances that correspond to the `vGPU` types of the `MIG-backed`
`vGPUs` that you will create; see [NVIDIA A100 PCIe 80GB Virtual GPU Types](https://docs.nvidia.com/grid/13.0/grid-vgpu-user-guide/index.html#vgpu-types-nvidia-a100-pcie-80gb).
```sh
# MIG 1g.10gb --> vGPU A100D-1-10C
$ sudo nvidia-smi mig -cgi 19
```
List the GPU instances and get the GPU instance id to create the compute
instance.
```sh
$ sudo nvidia-smi mig -lgi # list the created GPU instances
$ sudo nvidia-smi mig -cci -gi 9 # each GPU instance can have several compute
# instances. Instance -> Workload
```
Verify that the compute instances were created within the GPU instance:
```sh
$ nvidia-smi
... snip ...
+-----------------------------------------------------------------------------+
| MIG devices: |
+------------------+----------------------+-----------+-----------------------+
| GPU GI CI MIG | Memory-Usage | Vol| Shared |
| ID ID Dev | BAR1-Usage | SM Unc| CE ENC DEC OFA JPG|
| | | ECC| |
|==================+======================+===========+=======================|
| 0 9 0 0 | 0MiB / 9728MiB | 14 0 | 1 0 0 0 0 |
| | 0MiB / 4095MiB | | |
+------------------+----------------------+-----------+-----------------------+
... snip ...
```
We can use the [snippet](#list-all-available-vgpu-instances) from before to list
the available `vGPU` instances, this time `MIG-backed`.
```sh
| BDF | INSTANCES | NAME | DIR |
+--------------+-----------+----------------+------------+
| 0000:41:00.4 | 1 |GRID A100D-1-10C | nvidia-699 |
| BDF | INSTANCES | NAME | DIR |
+--------------+-----------+----------------+------------+
| 0000:41:00.5 | 1 |GRID A100D-1-10C | nvidia-699 |
| BDF | INSTANCES | NAME | DIR |
+--------------+-----------+----------------+------------+
| 0000:41:01.6 | 1 |GRID A100D-1-10C | nvidia-699 |
... snip ...
```
Repeat the steps that follow the [snippet](#list-all-available-vgpu-instances) listing
to create the corresponding `mdev` device, and reuse the guest `OS` built in the
previous section for `time-sliced` `vGPUs`.
## Install NVIDIA Driver + Toolkit in Kata Containers Guest OS
Consult the [Developer-Guide](https://github.com/kata-containers/kata-containers/blob/main/docs/Developer-Guide.md#create-a-rootfs-image) on how to create a
rootfs base image for a distribution of your choice. This is going to be used as
@@ -308,12 +583,9 @@ Enable the `guest_hook_path` in Kata's `configuration.toml`
guest_hook_path = "/usr/share/oci/hooks"
```
As the last step one can remove the additional packages and files that were added
to the `$ROOTFS_DIR` to keep it as small as possible.
Having built an NVIDIA rootfs and kernel, we can now run any GPU container
without installing the drivers into the container. Check the NVIDIA device status
with `nvidia-smi`:
```sh
$ sudo ctr --debug run --runtime "io.containerd.kata.v2" --device /dev/vfio/192 --rm -t "docker.io/nvidia/cuda:11.6.0-base-ubuntu20.04" cuda nvidia-smi
@@ -339,309 +611,8 @@ Fri Mar 18 10:36:59 2022
+-----------------------------------------------------------------------------+
```
## Usage Examples with Kata Containers
The following sections give usage examples for the different modes.
### NVIDIA GPU passthrough mode
Use the following steps to pass an NVIDIA GPU device in passthrough mode with Kata:
1. Find the Bus-Device-Function (BDF) for the GPU device on the host:
```sh
$ sudo lspci -nn -D | grep -i nvidia
0000:d0:00.0 3D controller [0302]: NVIDIA Corporation Device [10de:20b9] (rev a1)
```
> PCI address `0000:d0:00.0` is assigned to the hardware GPU device.
> `10de:20b9` is the device ID of the hardware GPU device.
2. Find the IOMMU group for the GPU device:
```sh
$ BDF="0000:d0:00.0"
$ readlink -e /sys/bus/pci/devices/$BDF/iommu_group
```
The output of the `readlink` command shows the IOMMU group the GPU belongs to,
in this example group 192. The next step is to bind the GPU to the VFIO-PCI driver.
```sh
$ BDF="0000:d0:00.0"
$ DEV="/sys/bus/pci/devices/$BDF"
$ echo "vfio-pci" > $DEV/driver_override
$ echo $BDF > $DEV/driver/unbind
$ echo $BDF > /sys/bus/pci/drivers_probe
# To return the device to the standard driver, we simply clear the
# driver_override and reprobe the device, e.g.:
$ echo > $DEV/driver_override
$ echo $BDF > $DEV/driver/unbind
$ echo $BDF > /sys/bus/pci/drivers_probe
```
3. Check the IOMMU group number under `/dev/vfio`:
```sh
$ ls -l /dev/vfio
total 0
crw------- 1 zvonkok zvonkok 243, 0 Mar 18 03:06 192
crw-rw-rw- 1 root root 10, 196 Mar 18 02:27 vfio
```
4. Start a Kata container with the GPU device:
```sh
# You may need to `modprobe vhost-vsock` if you get
# host system doesn't support vsock: stat /dev/vhost-vsock
$ sudo ctr --debug run --runtime "io.containerd.kata.v2" --device /dev/vfio/192 --rm -t "docker.io/library/archlinux:latest" arch uname -r
```
5. Run `lspci` within the container to verify the GPU device is seen in the list
of the PCI devices. Note the vendor-device id of the GPU (`10de:20b9`) in the `lspci` output.
```sh
$ sudo ctr --debug run --runtime "io.containerd.kata.v2" --device /dev/vfio/192 --rm -t "docker.io/library/archlinux:latest" arch sh -c "lspci -nn | grep '10de:20b9'"
```
6. Additionally, you can check the PCI BARs space of the NVIDIA GPU device in the container:
```sh
$ sudo ctr --debug run --runtime "io.containerd.kata.v2" --device /dev/vfio/192 --rm -t "docker.io/library/archlinux:latest" arch sh -c "lspci -s 02:00.0 -vv | grep Region"
```
> **Note**: If the `lspci` output lists the GPU's memory regions, the BAR space of the NVIDIA
> GPU has been successfully allocated.
### NVIDIA vGPU mode
NVIDIA vGPU is a licensed product on all supported GPU boards. A software license
is required to enable all vGPU features within the guest VM. NVIDIA vGPU manager
needs to be installed on the host to configure GPUs in vGPU mode. See
[NVIDIA Virtual GPU Software Documentation v14.0 through 14.1](https://docs.nvidia.com/grid/14.0/)
for more details.
#### NVIDIA vGPU time-sliced
In the time-sliced mode, the GPU is not partitioned; each workload uses the
whole GPU and shares access to the GPU engines. Processes are scheduled in
series. The best-effort scheduler is the default and can be exchanged for
other scheduling policies; see the documentation above for how to do that.
If `MIG` was enabled before, be sure to disable it on the GPU if you want
to use `time-sliced` `vGPU`.
```sh
$ sudo nvidia-smi -mig 0
```
Enable the virtual functions for the physical GPU in the `sysfs` file system.
```sh
$ sudo /usr/lib/nvidia/sriov-manage -e 0000:41:00.0
```
Get the `BDF`s of the available virtual functions on the GPU, and choose one for the
following steps.
```sh
$ cd /sys/bus/pci/devices/0000:41:00.0/
$ ls -l | grep virtfn
```
##### List all available vGPU instances
The following shell snippet walks the `sysfs` and prints only the instances
that are available, i.e. those that can be created.
```sh
# The 00.0 is often the PF of the device. The VFs will have the function in the
# BDF incremented by some values so e.g. the very first VF is 0000:41:00.4
cd /sys/bus/pci/devices/0000:41:00.0/
for vf in $(ls -d virtfn*)
do
    BDF=$(basename $(readlink -f $vf))
    for md in $(ls -d $vf/mdev_supported_types/*)
    do
        AVAIL=$(cat $md/available_instances)
        NAME=$(cat $md/name)
        DIR=$(basename $md)
        if [ $AVAIL -gt 0 ]; then
            echo "| BDF | INSTANCES | NAME | DIR |"
            echo "+--------------+-----------+----------------+------------+"
            printf "| %12s |%10d |%15s | %10s |\n\n" "$BDF" "$AVAIL" "$NAME" "$DIR"
        fi
    done
done
```
If there are available instances, you get something like this (for the first VF).
Beware that the output is highly dependent on the GPU you have; if there is no
output, check again that `MIG` is really disabled.
```sh
| BDF | INSTANCES | NAME | DIR |
+--------------+-----------+----------------+------------+
| 0000:41:00.4 | 1 | GRID A100D-4C | nvidia-692 |
| BDF | INSTANCES | NAME | DIR |
+--------------+-----------+----------------+------------+
| 0000:41:00.4 | 1 | GRID A100D-8C | nvidia-693 |
| BDF | INSTANCES | NAME | DIR |
+--------------+-----------+----------------+------------+
| 0000:41:00.4 | 1 | GRID A100D-10C | nvidia-694 |
| BDF | INSTANCES | NAME | DIR |
+--------------+-----------+----------------+------------+
| 0000:41:00.4 | 1 | GRID A100D-16C | nvidia-695 |
| BDF | INSTANCES | NAME | DIR |
+--------------+-----------+----------------+------------+
| 0000:41:00.4 | 1 | GRID A100D-20C | nvidia-696 |
| BDF | INSTANCES | NAME | DIR |
+--------------+-----------+----------------+------------+
| 0000:41:00.4 | 1 | GRID A100D-40C | nvidia-697 |
| BDF | INSTANCES | NAME | DIR |
+--------------+-----------+----------------+------------+
| 0000:41:00.4 | 1 | GRID A100D-80C | nvidia-698 |
```
Change to the `mdev_supported_types` directory for the virtual function on which
you want to create the `vGPU`. Taking the first output as an example:
```sh
$ cd virtfn0/mdev_supported_types/nvidia-692
$ UUIDGEN=$(uuidgen)
$ sudo bash -c "echo $UUIDGEN > create"
```
Confirm that the `vGPU` was created. You should see the `UUID` pointing to a
subdirectory of the `sysfs` space.
```sh
$ ls -l /sys/bus/mdev/devices/
```
Get the `IOMMU` group number and verify there is a `VFIO` device created to use
with Kata.
```sh
$ ls -l /sys/bus/mdev/devices/*/
$ ls -l /dev/vfio
```
Use the `VFIO` device created in the same way as in the passthrough use-case.
Beware that the guest needs the NVIDIA guest drivers, so one would need to build
a new guest `OS` image.
#### NVIDIA vGPU MIG-backed
We will not go into detail about what `MIG` is; briefly, it is a technology to
partition the hardware into independent instances with guaranteed quality of
service. For more details see
[NVIDIA Multi-Instance GPU User Guide](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/).
First enable `MIG` mode for a GPU; depending on the platform you're running,
a reboot may be necessary. Some platforms support a GPU reset.
```sh
$ sudo nvidia-smi -mig 1
```
If the platform supports a GPU reset, one can run the following command; otherwise
you will get a warning to reboot the server.
```sh
$ sudo nvidia-smi --gpu-reset
```
By default, the driver provides a number of profiles that users can opt into when
configuring the MIG feature.
```sh
$ sudo nvidia-smi mig -lgip
+-----------------------------------------------------------------------------+
| GPU instance profiles: |
| GPU Name ID Instances Memory P2P SM DEC ENC |
| Free/Total GiB CE JPEG OFA |
|=============================================================================|
| 0 MIG 1g.10gb 19 7/7 9.50 No 14 0 0 |
| 1 0 0 |
+-----------------------------------------------------------------------------+
| 0 MIG 1g.10gb+me 20 1/1 9.50 No 14 1 0 |
| 1 1 1 |
+-----------------------------------------------------------------------------+
| 0 MIG 2g.20gb 14 3/3 19.50 No 28 1 0 |
| 2 0 0 |
+-----------------------------------------------------------------------------+
...
```
Create the GPU instances that correspond to the `vGPU` types of the `MIG-backed`
`vGPUs` that you will create; see
[NVIDIA A100 PCIe 80GB Virtual GPU Types](https://docs.nvidia.com/grid/13.0/grid-vgpu-user-guide/index.html#vgpu-types-nvidia-a100-pcie-80gb).
```sh
# MIG 1g.10gb --> vGPU A100D-1-10C
$ sudo nvidia-smi mig -cgi 19
```
List the GPU instances and get the GPU instance id to create the compute
instance.
```sh
$ sudo nvidia-smi mig -lgi # list the created GPU instances
$ sudo nvidia-smi mig -cci -gi 9 # each GPU instance can have several compute
# instances. Instance -> Workload
```
Verify that the compute instances were created within the GPU instance:
```sh
$ nvidia-smi
... snip ...
+-----------------------------------------------------------------------------+
| MIG devices: |
+------------------+----------------------+-----------+-----------------------+
| GPU GI CI MIG | Memory-Usage | Vol| Shared |
| ID ID Dev | BAR1-Usage | SM Unc| CE ENC DEC OFA JPG|
| | | ECC| |
|==================+======================+===========+=======================|
| 0 9 0 0 | 0MiB / 9728MiB | 14 0 | 1 0 0 0 0 |
| | 0MiB / 4095MiB | | |
+------------------+----------------------+-----------+-----------------------+
... snip ...
```
We can use the [snippet](#list-all-available-vgpu-instances) from before to list
the available `vGPU` instances, this time `MIG-backed`.
```sh
| BDF | INSTANCES | NAME | DIR |
+--------------+-----------+----------------+------------+
| 0000:41:00.4 | 1 |GRID A100D-1-10C | nvidia-699 |
| BDF | INSTANCES | NAME | DIR |
+--------------+-----------+----------------+------------+
| 0000:41:00.5 | 1 |GRID A100D-1-10C | nvidia-699 |
| BDF | INSTANCES | NAME | DIR |
+--------------+-----------+----------------+------------+
| 0000:41:01.6 | 1 |GRID A100D-1-10C | nvidia-699 |
... snip ...
```
Repeat the steps that follow the [snippet](#list-all-available-vgpu-instances) listing
to create the corresponding `mdev` device, and reuse the guest `OS` built in the
previous section for `time-sliced` `vGPUs`.
As the last step one can remove the additional packages and files that were added
to the `$ROOTFS_DIR` to keep it as small as possible.
## References
View File
@@ -1,20 +1,24 @@
# Table of Contents
**Note:** This guide used to contain an end-to-end flow to build a
custom Kata Containers root filesystem with the QAT out-of-tree SR-IOV virtual
function driver and run QAT-enabled containers. That flow is no longer necessary,
so the instructions have been dropped. If the use-case is still of interest, please file
an issue in either of the QAT Kubernetes-specific repos linked below.
# Introduction
Intel® QuickAssist Technology (QAT) provides hardware acceleration
for security (cryptography) and compression. Kata Containers can enable
these acceleration functions for containers using QAT SR-IOV with the
support from [Intel QAT Device Plugin for Kubernetes](https://github.com/intel/intel-device-plugins-for-kubernetes)
or [Intel QAT DRA Resource Driver for Kubernetes](https://github.com/intel/intel-resource-drivers-for-kubernetes).
for security (cryptography) and compression. These instructions cover the
steps for the latest [Ubuntu LTS release](https://ubuntu.com/download/desktop)
which already includes the QAT host driver. These instructions can be adapted to
any Linux distribution. These instructions guide the user on how to download
the kernel sources, compile kernel driver modules against those sources, and
load them onto the host as well as preparing a specially built Kata Containers
kernel and custom Kata Containers rootfs.
## More Information
* Download kernel sources
* Compile Kata kernel
* Compile kernel driver modules against those sources
* Download rootfs
* Add driver modules to rootfs
* Build rootfs image
## Helpful Links before starting
[Intel® QuickAssist Technology at `01.org`](https://www.intel.com/content/www/us/en/developer/topic-technology/open/quick-assist-technology/overview.html)
@@ -22,6 +26,554 @@ or [Intel QAT DRA Resource Driver for Kubernetes](https://github.com/intel/intel
[Intel Device Plugin for Kubernetes](https://github.com/intel/intel-device-plugins-for-kubernetes)
[Intel DRA Resource Driver for Kubernetes](https://github.com/intel/intel-resource-drivers-for-kubernetes)
[Intel® QuickAssist Technology for Crypto Poll Mode Driver](https://dpdk-docs.readthedocs.io/en/latest/cryptodevs/qat.html)
## Steps to enable Intel® QAT in Kata Containers
There are some steps to complete only once, some steps to complete with every
reboot, and some steps to complete when the host kernel changes.
## Script variables
The following list of variables must be set before running through the
scripts. These variables refer to locations to store modules and configuration
files on the host and links to the drivers to use. Modify these as
needed to point to updated drivers or different install locations.
### Set environment variables (Every Reboot)
Make sure to check [`01.org`](https://www.intel.com/content/www/us/en/developer/topic-technology/open/quick-assist-technology/overview.html) for
the latest driver.
```bash
$ export QAT_DRIVER_VER=qat1.7.l.4.14.0-00031.tar.gz
$ export QAT_DRIVER_URL=https://downloadmirror.intel.com/30178/eng/${QAT_DRIVER_VER}
$ export QAT_CONF_LOCATION=~/QAT_conf
$ export QAT_DOCKERFILE=https://raw.githubusercontent.com/intel/intel-device-plugins-for-kubernetes/main/demo/openssl-qat-engine/Dockerfile
$ export QAT_SRC=~/src/QAT
$ export GOPATH=~/src/go
$ export KATA_KERNEL_LOCATION=~/kata
$ export KATA_ROOTFS_LOCATION=~/kata
```
## Prepare the Ubuntu Host
The host could be a bare metal instance or a virtual machine. If using a
virtual machine, make sure that KVM nesting is enabled. The following
instructions reference an Intel® C62X chipset. Some of the instructions must be
modified if using a different Intel® QAT device. The Intel® QAT chipset can be
identified by executing the following.
### Identify which PCI Bus the Intel® QAT card is on
```bash
$ for i in 0434 0435 37c8 1f18 1f19; do lspci -d 8086:$i; done
```
### Install necessary packages for Ubuntu
These packages are necessary to compile the Kata kernel, Intel® QAT driver, and to
prepare the rootfs for Kata. [Docker](https://docs.docker.com/engine/install/ubuntu/)
also needs to be installed to be able to build the rootfs. To test that
everything works, a Kubernetes pod is started requesting Intel® QAT resources. For
passthrough of the virtual functions, the kernel boot parameter needs to include
`intel_iommu=on`.
```bash
$ sudo apt update
$ sudo apt install -y golang-go build-essential python pkg-config zlib1g-dev libudev-dev bison libelf-dev flex libtool automake autotools-dev autoconf bc libpixman-1-dev coreutils libssl-dev
$ sudo sed -i 's/GRUB_CMDLINE_LINUX_DEFAULT=""/GRUB_CMDLINE_LINUX_DEFAULT="intel_iommu=on"/' /etc/default/grub
$ sudo update-grub
$ sudo reboot
```
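After the reboot, it is worth confirming that the parameter actually took effect before continuing (an optional check, not part of the original steps):

```bash
$ grep -o "intel_iommu=on" /proc/cmdline
```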
### Download Intel® QAT drivers
This will download the [Intel® QAT drivers](https://www.intel.com/content/www/us/en/developer/topic-technology/open/quick-assist-technology/overview.html).
Make sure to check the website for the latest version.
```bash
$ mkdir -p $QAT_SRC
$ cd $QAT_SRC
$ curl -L $QAT_DRIVER_URL | tar zx
```
### Copy Intel® QAT configuration files and enable virtual functions
Modify the instructions below as necessary if using a different Intel® QAT hardware
platform. You can learn more about customizing configuration files at the
[Intel® QAT Engine repository](https://github.com/intel/QAT_Engine/#copy-the-correct-intel-quickassist-technology-driver-config-files)
This section starts from a base config file and changes the `SSL` section to
`SHIM` to support the OpenSSL engine. There are more tweaks that you can make
depending on the use case and how many Intel® QAT engines should be run. You
can find more information about how to customize in the
[Intel® QuickAssist Technology Software for Linux* - Programmer's Guide.](https://www.intel.com/content/www/us/en/content-details/709196/intel-quickassist-technology-api-programmer-s-guide.html)
> **Note: This section assumes that an Intel® QAT `c6xx` platform is used.**
```bash
$ mkdir -p $QAT_CONF_LOCATION
$ cp $QAT_SRC/quickassist/utilities/adf_ctl/conf_files/c6xxvf_dev0.conf.vm $QAT_CONF_LOCATION/c6xxvf_dev0.conf
$ sed -i 's/\[SSL\]/\[SHIM\]/g' $QAT_CONF_LOCATION/c6xxvf_dev0.conf
```
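As an optional check, confirm that the `SSL` section was renamed to `SHIM` in the copied configuration file:

```bash
$ grep -n "^\[SHIM\]" $QAT_CONF_LOCATION/c6xxvf_dev0.conf
```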
### Expose and Bind Intel® QAT virtual functions to VFIO-PCI (Every reboot)
To enable virtual functions, the host OS should have IOMMU groups enabled. In
the UEFI firmware, Intel® Virtualization Technology for Directed I/O
(Intel® VT-d) must be enabled. Also, the kernel boot parameter should be
`intel_iommu=on` or `intel_iommu=igfx_off`. This should have been set from
the instructions above. Check the output of `/proc/cmdline` to confirm. The
following commands assume you installed an Intel® QAT card, IOMMU is on, and
VT-d is enabled. The vendor and device ID are added to the `VFIO-PCI` driver so that
each exposed virtual function can be bound to the `VFIO-PCI` driver. Once
complete, each virtual function passes into a Kata Containers container using
the PCIe device passthrough feature. For Kubernetes, the
[Intel device plugin](https://github.com/intel/intel-device-plugins-for-kubernetes)
for Kubernetes handles the binding of the driver, but the VFs still must be
enabled.
```bash
$ sudo modprobe vfio-pci
$ QAT_PCI_BUS_PF_NUMBERS=$((lspci -d :435 && lspci -d :37c8 && lspci -d :19e2 && lspci -d :6f54) | cut -d ' ' -f 1)
$ QAT_PCI_BUS_PF_1=$(echo $QAT_PCI_BUS_PF_NUMBERS | cut -d ' ' -f 1)
$ echo 16 | sudo tee /sys/bus/pci/devices/0000:$QAT_PCI_BUS_PF_1/sriov_numvfs
$ QAT_PCI_ID_VF=$(cat /sys/bus/pci/devices/0000:${QAT_PCI_BUS_PF_1}/virtfn0/uevent | grep PCI_ID)
$ QAT_VENDOR_AND_ID_VF=$(echo ${QAT_PCI_ID_VF/PCI_ID=} | sed 's/:/ /')
$ echo $QAT_VENDOR_AND_ID_VF | sudo tee --append /sys/bus/pci/drivers/vfio-pci/new_id
```
Loop through all the virtual functions and bind to the VFIO driver
```bash
$ for f in /sys/bus/pci/devices/0000:$QAT_PCI_BUS_PF_1/virtfn*
do QAT_PCI_BUS_VF=$(basename $(readlink $f))
echo $QAT_PCI_BUS_VF | sudo tee --append /sys/bus/pci/drivers/c6xxvf/unbind
echo $QAT_PCI_BUS_VF | sudo tee --append /sys/bus/pci/drivers/vfio-pci/bind
done
```
### Check Intel® QAT virtual functions are enabled
If the following command returns empty, then the virtual functions are not
properly enabled. This command checks the enumerated device IDs for just the
virtual functions. Using this Intel® QAT device as an example, the physical device ID
is `37c8` and the virtual function device ID is `37c9`. The following command checks
whether VFs are enabled for any of the currently known Intel® QAT device IDs. The
subsequent `ls` command should show the 16 VFs bound to `VFIO-PCI`.
```bash
$ for i in 0442 0443 37c9 19e3; do lspci -d 8086:$i; done
```
Another way to check is to see what PCI devices that `VFIO-PCI` is mapped to.
It should match the device ID's of the VF's.
```bash
$ ls -la /sys/bus/pci/drivers/vfio-pci
```
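As an additional sanity check (assuming the 16 virtual functions enabled above), counting the device entries bound to `VFIO-PCI` should return 16:

```bash
$ ls /sys/bus/pci/drivers/vfio-pci | grep -c "^0000:"
```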
## Prepare Kata Containers
### Download Kata kernel Source
This example automatically uses the latest Kata kernel supported by Kata. It
follows the instructions from the
[packaging kernel repository](../../tools/packaging/kernel)
and uses the latest Kata kernel
[config](../../tools/packaging/kernel/configs).
There are some patches that must be installed as well, which the
`build-kernel.sh` script should automatically apply. If you are using a
different kernel version, then you might need to manually apply them. Since
the Kata Containers kernel has a minimal set of kernel flags set, you must
create an Intel® QAT kernel fragment with the necessary `CONFIG_CRYPTO_*` options set.
Update the config to set some of the `CRYPTO` flags to enabled. This might
change with different kernel versions. The following instructions were tested
with kernel `v5.4.0-64-generic`.
```bash
$ mkdir -p $GOPATH
$ cd $GOPATH
$ go get -v github.com/kata-containers/kata-containers
$ cat << EOF > $GOPATH/src/github.com/kata-containers/kata-containers/tools/packaging/kernel/configs/fragments/common/qat.conf
CONFIG_PCIEAER=y
CONFIG_UIO=y
CONFIG_CRYPTO_HW=y
CONFIG_CRYPTO_DEV_QAT_C62XVF=m
CONFIG_CRYPTO_CBC=y
CONFIG_MODULES=y
CONFIG_MODULE_SIG=y
CONFIG_CRYPTO_AUTHENC=y
CONFIG_CRYPTO_DH=y
EOF
$ $GOPATH/src/github.com/kata-containers/kata-containers/tools/packaging/kernel/build-kernel.sh setup
```
### Build Kata kernel
```bash
$ cd $GOPATH
$ export LINUX_VER=$(ls -d kata-linux-*)
$ sed -i 's/EXTRAVERSION =/EXTRAVERSION = .qat.container/' $LINUX_VER/Makefile
$ $GOPATH/src/github.com/kata-containers/kata-containers/tools/packaging/kernel/build-kernel.sh build
```
### Copy Kata kernel
```bash
$ export KATA_KERNEL_NAME=vmlinux-${LINUX_VER}_qat
$ mkdir -p $KATA_KERNEL_LOCATION
$ cp ${GOPATH}/${LINUX_VER}/vmlinux ${KATA_KERNEL_LOCATION}/${KATA_KERNEL_NAME}
```
### Prepare Kata root filesystem
These instructions build upon the OS builder instructions located in the
[Developer Guide](../Developer-Guide.md). At this point it is recommended that
[Docker](https://docs.docker.com/engine/install/ubuntu/) is installed first, and
then [Kata-deploy](../../tools/packaging/kata-deploy)
is used to install Kata. This will make sure that the correct `agent` version
is installed into the rootfs in the steps below.
The following instructions use Ubuntu as the root filesystem with systemd as
the init and will add in the `kmod` binary, which is not a standard binary in
a Kata rootfs image. The `kmod` binary is necessary to load the Intel® QAT
kernel modules when the virtual machine rootfs boots.
```bash
$ export OSBUILDER=$GOPATH/src/github.com/kata-containers/kata-containers/tools/osbuilder
$ export ROOTFS_DIR=${OSBUILDER}/rootfs-builder/rootfs
$ export EXTRA_PKGS='kmod'
```
Make sure that the `kata-agent` version matches the installed `kata-runtime`
version. Also make sure the `kata-runtime` install location is in your `PATH`
variable. The following `AGENT_VERSION` can be set manually to match
the `kata-runtime` version if the following commands don't work.
```bash
$ export PATH=$PATH:/opt/kata/bin
$ cd $GOPATH
$ export AGENT_VERSION=$(kata-runtime version | head -n 1 | grep -o "[0-9.]\+")
$ cd ${OSBUILDER}/rootfs-builder
$ sudo rm -rf ${ROOTFS_DIR}
$ script -fec 'sudo -E GOPATH=$GOPATH USE_DOCKER=true SECCOMP=no ./rootfs.sh ubuntu'
```
### Compile Intel® QAT drivers for Kata Containers kernel and add to Kata Containers rootfs
After the Kata Containers kernel builds with the proper configuration flags,
you must build the Intel® QAT drivers against that Kata Containers kernel
version in a similar way they were previously built for the host OS. You must
set the `KERNEL_SOURCE_ROOT` variable to the Kata Containers kernel source
directory and build the Intel® QAT drivers again. The `make` command will
install the Intel® QAT modules into the Kata rootfs.
```bash
$ cd $GOPATH
$ export LINUX_VER=$(ls -d kata*)
$ export KERNEL_MAJOR_VERSION=$(awk '/^VERSION =/{print $NF}' $GOPATH/$LINUX_VER/Makefile)
$ export KERNEL_PATHLEVEL=$(awk '/^PATCHLEVEL =/{print $NF}' $GOPATH/$LINUX_VER/Makefile)
$ export KERNEL_SUBLEVEL=$(awk '/^SUBLEVEL =/{print $NF}' $GOPATH/$LINUX_VER/Makefile)
$ export KERNEL_EXTRAVERSION=$(awk '/^EXTRAVERSION =/{print $NF}' $GOPATH/$LINUX_VER/Makefile)
$ export KERNEL_ROOTFS_DIR=${KERNEL_MAJOR_VERSION}.${KERNEL_PATHLEVEL}.${KERNEL_SUBLEVEL}${KERNEL_EXTRAVERSION}
$ cd $QAT_SRC
$ KERNEL_SOURCE_ROOT=$GOPATH/$LINUX_VER ./configure --enable-icp-sriov=guest
$ sudo -E make all -j $(nproc)
$ sudo -E make INSTALL_MOD_PATH=$ROOTFS_DIR qat-driver-install -j $(nproc)
```
The `usdm_drv` module also needs to be copied into the rootfs modules path and
`depmod` should be run.
```bash
$ sudo cp $QAT_SRC/build/usdm_drv.ko $ROOTFS_DIR/lib/modules/${KERNEL_ROOTFS_DIR}/updates/drivers
$ sudo depmod -a -b ${ROOTFS_DIR} ${KERNEL_ROOTFS_DIR}
$ cd ${OSBUILDER}/image-builder
$ script -fec 'sudo -E USE_DOCKER=true ./image_builder.sh ${ROOTFS_DIR}'
```
> **Note: Ignore any errors on modules.builtin and modules.order when running
> `depmod`.**
### Copy Kata rootfs
```bash
$ mkdir -p $KATA_ROOTFS_LOCATION
$ cp ${OSBUILDER}/image-builder/kata-containers.img $KATA_ROOTFS_LOCATION
```
## Verify Intel® QAT works in a container
The following instructions use an OpenSSL Dockerfile that builds the
Intel® QAT engine to allow OpenSSL to offload crypto functions. It is a
convenient way to test that VFIO device passthrough for the Intel® QAT VFs is
working properly with the Kata Containers VM.
### Build OpenSSL Intel® QAT engine container
Use the OpenSSL Intel® QAT [Dockerfile](https://github.com/intel/intel-device-plugins-for-kubernetes/tree/main/demo/openssl-qat-engine)
to build a container image with an optimized OpenSSL engine for
Intel® QAT. Using `docker build` with the Kata Containers runtime can sometimes
have issues. Therefore, make sure that `runc` is the default Docker container
runtime.
```bash
$ cd $QAT_SRC
$ curl -O $QAT_DOCKERFILE
$ sudo docker build -t openssl-qat-engine .
```
> **Note: The Intel® QAT driver version in this container might not match the
> Intel® QAT driver compiled and loaded on the host when compiling.**
### Test Intel® QAT with the ctr tool
The `ctr` tool can be used to interact with the containerd daemon. It may be
more convenient to use this tool to verify the kernel and image instead of
setting up a Kubernetes cluster. The correct Kata runtimes need to be added
to the containerd `config.toml`. Below is a sample snippet that can be added
to allow QEMU and Cloud Hypervisor (CLH) to work with `ctr`.
```
[plugins.cri.containerd.runtimes.kata-qemu]
runtime_type = "io.containerd.kata-qemu.v2"
privileged_without_host_devices = true
pod_annotations = ["io.katacontainers.*"]
[plugins.cri.containerd.runtimes.kata-qemu.options]
ConfigPath = "/opt/kata/share/defaults/kata-containers/configuration-qemu.toml"
[plugins.cri.containerd.runtimes.kata-clh]
runtime_type = "io.containerd.kata-clh.v2"
privileged_without_host_devices = true
pod_annotations = ["io.katacontainers.*"]
[plugins.cri.containerd.runtimes.kata-clh.options]
ConfigPath = "/opt/kata/share/defaults/kata-containers/configuration-clh.toml"
```
In addition, containerd expects the shim binary to be in `/usr/local/bin`, so add
the following small wrapper scripts that redirect to the Kata shim, making it
possible to use either QEMU or Cloud Hypervisor with Kata.
```bash
$ echo '#!/usr/bin/env bash' | sudo tee /usr/local/bin/containerd-shim-kata-qemu-v2
$ echo 'KATA_CONF_FILE=/opt/kata/share/defaults/kata-containers/configuration-qemu.toml /opt/kata/bin/containerd-shim-kata-v2 $@' | sudo tee -a /usr/local/bin/containerd-shim-kata-qemu-v2
$ sudo chmod +x /usr/local/bin/containerd-shim-kata-qemu-v2
$ echo '#!/usr/bin/env bash' | sudo tee /usr/local/bin/containerd-shim-kata-clh-v2
$ echo 'KATA_CONF_FILE=/opt/kata/share/defaults/kata-containers/configuration-clh.toml /opt/kata/bin/containerd-shim-kata-v2 $@' | sudo tee -a /usr/local/bin/containerd-shim-kata-clh-v2
$ sudo chmod +x /usr/local/bin/containerd-shim-kata-clh-v2
```
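After updating `config.toml` and adding the wrapper scripts, restart containerd so it picks up the new runtime entries (this assumes containerd is managed by systemd):

```bash
$ sudo systemctl restart containerd
```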
After the OpenSSL image is built and imported into containerd, an Intel® QAT
virtual function exposed in the step above can be added to the `ctr` command.
Make sure to change the `/dev/vfio` number to one that actually exists on the
host system. When using the `ctr` tool, the `configuration.toml` for Kata needs
to point to the custom Kata kernel and rootfs built above and the Intel® QAT
modules in the Kata rootfs need to load at boot. The following steps assume that
`kata-deploy` was used to install Kata and QEMU is being tested. If using a
different hypervisor, different install method for Kata, or a different
Intel® QAT chipset then the command will need to be modified.
> **Note: The following was tested with
[containerd v1.4.6](https://github.com/containerd/containerd/releases/tag/v1.4.6).**
```bash
$ config_file="/opt/kata/share/defaults/kata-containers/configuration-qemu.toml"
$ sudo sed -i "/kernel =/c kernel = "\"${KATA_ROOTFS_LOCATION}/${KATA_KERNEL_NAME}\""" $config_file
$ sudo sed -i "/image =/c image = "\"${KATA_KERNEL_LOCATION}/kata-containers.img\""" $config_file
$ sudo sed -i -e 's/^kernel_params = "\(.*\)"/kernel_params = "\1 modules-load=usdm_drv,qat_c62xvf"/g' $config_file
$ sudo docker save -o openssl-qat-engine.tar openssl-qat-engine:latest
$ sudo ctr images import openssl-qat-engine.tar
$ sudo ctr run --runtime io.containerd.run.kata-qemu.v2 --privileged -t --rm --device=/dev/vfio/180 --mount type=bind,src=/dev,dst=/dev,options=rbind:rw --mount type=bind,src=${QAT_CONF_LOCATION}/c6xxvf_dev0.conf,dst=/etc/c6xxvf_dev0.conf,options=rbind:rw docker.io/library/openssl-qat-engine:latest bash
```
Below are some commands to run in the container image to verify Intel® QAT is
working
```sh
root@67561dc2757a/ # cat /proc/modules
qat_c62xvf 16384 - - Live 0xffffffffc00d9000 (OE)
usdm_drv 86016 - - Live 0xffffffffc00e8000 (OE)
intel_qat 249856 - - Live 0xffffffffc009b000 (OE)
root@67561dc2757a/ # adf_ctl restart
Restarting all devices.
Processing /etc/c6xxvf_dev0.conf
root@67561dc2757a/ # adf_ctl status
Checking status of all devices.
There is 1 QAT acceleration device(s) in the system:
qat_dev0 - type: c6xxvf, inst_id: 0, node_id: 0, bsf: 0000:01:01.0, #accel: 1 #engines: 1 state: up
root@67561dc2757a/ # openssl engine -c -t qat-hw
(qat-hw) Reference implementation of QAT crypto engine v0.6.1
[RSA, DSA, DH, AES-128-CBC-HMAC-SHA1, AES-128-CBC-HMAC-SHA256, AES-256-CBC-HMAC-SHA1, AES-256-CBC-HMAC-SHA256, TLS1-PRF, HKDF, X25519, X448]
[ available ]
```
### Test Intel® QAT in Kubernetes
Start a Kubernetes cluster with containerd as the CRI. The host should
already be set up with 16 virtual functions of the Intel® QAT card bound to
`VFIO-PCI`. Verify this by looking in `/dev/vfio` for a listing of devices.
You might need to disable Docker before initializing Kubernetes. Be aware
that the OpenSSL container image built above will need to be exported from
Docker and imported into containerd.
If Kata is installed through [`kata-deploy`](../../tools/packaging/kata-deploy/helm-chart/README.md)
there will be multiple `configuration.toml` files associated with different
hypervisors. Rather than adding the custom Kata kernel, Kata rootfs, and
kernel modules to each `configuration.toml` as the default, use
[annotations](../how-to/how-to-load-kernel-modules-with-kata.md)
in the Kubernetes YAML file to tell Kata which kernel and rootfs to use. The
easy way to do this is to use `kata-deploy` which will install the Kata binaries
to `/opt` and properly configure the `/etc/containerd/config.toml` with annotation
support. However, the `configuration.toml` needs to enable support for
annotations as well. The following configures both QEMU and Cloud Hypervisor
`configuration.toml` files that are currently available with Kata Containers
versions 2.0 and higher.
```bash
$ sudo sed -i 's/enable_annotations\s=\s\[\]/enable_annotations = [".*"]/' /opt/kata/share/defaults/kata-containers/configuration-qemu.toml
$ sudo sed -i 's/enable_annotations\s=\s\[\]/enable_annotations = [".*"]/' /opt/kata/share/defaults/kata-containers/configuration-clh.toml
```
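A quick way to confirm that annotations are now enabled in both files (an optional check):

```bash
$ grep enable_annotations /opt/kata/share/defaults/kata-containers/configuration-qemu.toml /opt/kata/share/defaults/kata-containers/configuration-clh.toml
```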
Export the OpenSSL image from Docker and import into containerd.
```bash
$ sudo docker save -o openssl-qat-engine.tar openssl-qat-engine:latest
$ sudo ctr -n=k8s.io images import openssl-qat-engine.tar
```
The [Intel® QAT Plugin](https://github.com/intel/intel-device-plugins-for-kubernetes/blob/main/cmd/qat_plugin/README.md)
needs to be started so that the virtual functions can be discovered and
used by Kubernetes.
The following YAML file can be used to start a Kata container with Intel® QAT
support. If Kata is installed with `kata-deploy`, then the containerd
`configuration.toml` should have all of the Kata runtime classes already
populated and annotations supported. To use an Intel® QAT virtual function, the
Intel® QAT plugin needs to be started after the VFs are bound to `VFIO-PCI` as
described [above](#expose-and-bind-intel-qat-virtual-functions-to-vfio-pci-every-reboot).
Edit the following to point to the correct Kata kernel and rootfs location
built with Intel® QAT support.
```bash
$ cat << EOF > kata-openssl-qat.yaml
apiVersion: v1
kind: Pod
metadata:
  name: kata-openssl-qat
  labels:
    app: kata-openssl-qat
  annotations:
    io.katacontainers.config.hypervisor.kernel: "$KATA_KERNEL_LOCATION/$KATA_KERNEL_NAME"
    io.katacontainers.config.hypervisor.image: "$KATA_ROOTFS_LOCATION/kata-containers.img"
    io.katacontainers.config.hypervisor.kernel_params: "modules-load=usdm_drv,qat_c62xvf"
spec:
  runtimeClassName: kata-qemu
  containers:
  - name: kata-openssl-qat
    image: docker.io/library/openssl-qat-engine:latest
    imagePullPolicy: IfNotPresent
    resources:
      limits:
        qat.intel.com/generic: 1
        cpu: 1
    securityContext:
      capabilities:
        add: ["IPC_LOCK", "SYS_ADMIN"]
    volumeMounts:
    - mountPath: /etc/c6xxvf_dev0.conf
      name: etc-mount
    - mountPath: /dev
      name: dev-mount
  volumes:
  - name: dev-mount
    hostPath:
      path: /dev
  - name: etc-mount
    hostPath:
      path: $QAT_CONF_LOCATION/c6xxvf_dev0.conf
EOF
```
Use `kubectl` to start the pod. Verify that Intel® QAT card acceleration is
working with the Intel® QAT engine.
```bash
$ kubectl apply -f kata-openssl-qat.yaml
```
```sh
$ kubectl exec -it kata-openssl-qat -- adf_ctl restart
Restarting all devices.
Processing /etc/c6xxvf_dev0.conf
$ kubectl exec -it kata-openssl-qat -- adf_ctl status
Checking status of all devices.
There is 1 QAT acceleration device(s) in the system:
qat_dev0 - type: c6xxvf, inst_id: 0, node_id: 0, bsf: 0000:01:01.0, #accel: 1 #engines: 1 state: up
$ kubectl exec -it kata-openssl-qat -- openssl engine -c -t qat-hw
(qat-hw) Reference implementation of QAT crypto engine v0.6.1
[RSA, DSA, DH, AES-128-CBC-HMAC-SHA1, AES-128-CBC-HMAC-SHA256, AES-256-CBC-HMAC-SHA1, AES-256-CBC-HMAC-SHA256, TLS1-PRF, HKDF, X25519, X448]
[ available ]
```
### Troubleshooting
* Check that `/dev/vfio` has VFs enabled.
```sh
$ ls /dev/vfio
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 vfio
```
* Check that the modules load when inside the Kata Container.
```sh
bash-5.0# grep -E "qat|usdm_drv" /proc/modules
qat_c62xvf 16384 - - Live 0x0000000000000000 (O)
usdm_drv 86016 - - Live 0x0000000000000000 (O)
intel_qat 184320 - - Live 0x0000000000000000 (O)
```
* Verify that at least the first `c6xxvf_dev0.conf` file mounts inside the
container image in `/etc`. You will need one configuration file for each VF
passed into the container.
```sh
bash-5.0# ls /etc
c6xxvf_dev0.conf c6xxvf_dev11.conf c6xxvf_dev14.conf c6xxvf_dev3.conf c6xxvf_dev6.conf c6xxvf_dev9.conf resolv.conf
c6xxvf_dev1.conf c6xxvf_dev12.conf c6xxvf_dev15.conf c6xxvf_dev4.conf c6xxvf_dev7.conf hostname
c6xxvf_dev10.conf c6xxvf_dev13.conf c6xxvf_dev2.conf c6xxvf_dev5.conf c6xxvf_dev8.conf hosts
```
* Check `dmesg` inside the container to see if there are any issues with the
Intel® QAT driver.
* If there are issues building the OpenSSL Intel® QAT container image, then
check to make sure that runc is the default runtime for building the container image.
```sh
$ cat /etc/systemd/system/docker.service.d/50-runtime.conf
[Service]
Environment="DOCKER_DEFAULT_RUNTIME=--default-runtime runc"
```
## Optional Scripts
### Verify Intel® QAT card counters are incremented
To check the built-in firmware counters, the Intel® QAT driver has to be compiled
and installed on the host; the built-in host driver cannot be relied on for this. The
counters increase when the accelerator is actively being used. To verify that
Intel® QAT is actively accelerating the containerized application, use the
following instructions to check if any of the counters increment. Make
sure to change the PCI device ID to match what's in the system.
```bash
$ for i in 0434 0435 37c8 1f18 1f19; do lspci -d 8086:$i; done
$ sudo watch cat /sys/kernel/debug/qat_c6xx_0000\:b1\:00.0/fw_counters
$ sudo watch cat /sys/kernel/debug/qat_c6xx_0000\:b3\:00.0/fw_counters
$ sudo watch cat /sys/kernel/debug/qat_c6xx_0000\:b5\:00.0/fw_counters
```
View File
@@ -1,3 +1,3 @@
[toolchain]
# Keep in sync with versions.yaml
channel = "1.91"
channel = "1.85.1"
Some files were not shown because too many files have changed in this diff.