Files
kata-containers/.github/workflows/run-kata-coco-tests.yaml
Hyounggyu Choi bc131a84b9 GHA: Set timeout for kata-deploy and kbs cleanup
It was observed that some kata-deploy cleanup steps could hang,
causing the workflow to never finish properly. In these cases,
a QEMU process was not cleaned up and kept printing debug logs
to the journal. Over time, this maxed out the runner’s disk
usage and caused the runner service to stop.

Set timeouts for the relevant cleanup steps to avoid this.

Signed-off-by: Hyounggyu Choi <Hyounggyu.Choi@ibm.com>
2026-01-22 10:32:24 +01:00

366 lines
12 KiB
YAML

name: CI | Run kata coco tests
on:
workflow_call:
inputs:
tarball-suffix:
required: false
type: string
registry:
required: true
type: string
repo:
required: true
type: string
tag:
required: true
type: string
pr-number:
required: true
type: string
commit-hash:
required: false
type: string
target-branch:
required: false
type: string
default: ""
secrets:
AUTHENTICATED_IMAGE_PASSWORD:
required: true
AZ_APPID:
required: true
AZ_TENANT_ID:
required: true
AZ_SUBSCRIPTION_ID:
required: true
ITA_KEY:
required: true
permissions: {}
jobs:
run-k8s-tests-on-tee:
name: run-k8s-tests-on-tee
strategy:
fail-fast: false
matrix:
include:
- runner: tdx
vmm: qemu-tdx
- runner: sev-snp
vmm: qemu-snp
runs-on: ${{ matrix.runner }}
env:
DOCKER_REGISTRY: ${{ inputs.registry }}
DOCKER_REPO: ${{ inputs.repo }}
DOCKER_TAG: ${{ inputs.tag }}
GH_PR_NUMBER: ${{ inputs.pr-number }}
KATA_HYPERVISOR: ${{ matrix.vmm }}
KUBERNETES: "vanilla"
KBS: "true"
K8S_TEST_HOST_TYPE: "baremetal"
KBS_INGRESS: "nodeport"
SNAPSHOTTER: "nydus"
PULL_TYPE: "guest-pull"
AUTHENTICATED_IMAGE_USER: ${{ vars.AUTHENTICATED_IMAGE_USER }}
AUTHENTICATED_IMAGE_PASSWORD: ${{ secrets.AUTHENTICATED_IMAGE_PASSWORD }}
GH_ITA_KEY: ${{ secrets.ITA_KEY }}
AUTO_GENERATE_POLICY: "yes"
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
ref: ${{ inputs.commit-hash }}
fetch-depth: 0
persist-credentials: false
- name: Rebase atop of the latest target branch
run: |
./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
env:
TARGET_BRANCH: ${{ inputs.target-branch }}
- name: get-kata-tools-tarball
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: kata-tools-static-tarball-amd64${{ inputs.tarball-suffix }}
path: kata-tools-artifacts
- name: Install kata-tools
run: bash tests/integration/kubernetes/gha-run.sh install-kata-tools kata-tools-artifacts
- name: Deploy Kata
timeout-minutes: 20
run: bash tests/integration/kubernetes/gha-run.sh deploy-kata
- name: Uninstall previous `kbs-client`
timeout-minutes: 10
run: bash tests/integration/kubernetes/gha-run.sh uninstall-kbs-client
- name: Deploy CoCo KBS
timeout-minutes: 10
run: bash tests/integration/kubernetes/gha-run.sh deploy-coco-kbs
env:
ITA_KEY: ${{ env.KATA_HYPERVISOR == 'qemu-tdx' && env.GH_ITA_KEY || '' }}
- name: Install `kbs-client`
timeout-minutes: 10
run: bash tests/integration/kubernetes/gha-run.sh install-kbs-client
- name: Deploy CSI driver
timeout-minutes: 5
run: bash tests/integration/kubernetes/gha-run.sh deploy-csi-driver
- name: Run tests
timeout-minutes: 100
run: bash tests/integration/kubernetes/gha-run.sh run-tests
- name: Report tests
if: always()
run: bash tests/integration/kubernetes/gha-run.sh report-tests
- name: Delete kata-deploy
if: always()
timeout-minutes: 15
run: bash tests/integration/kubernetes/gha-run.sh cleanup
- name: Delete CoCo KBS
if: always()
timeout-minutes: 10
run: |
[[ "${KATA_HYPERVISOR}" == "qemu-tdx" ]] && echo "ITA_KEY=${GH_ITA_KEY}" >> "${GITHUB_ENV}"
bash tests/integration/kubernetes/gha-run.sh delete-coco-kbs
- name: Delete CSI driver
timeout-minutes: 5
run: bash tests/integration/kubernetes/gha-run.sh delete-csi-driver
# Generate jobs for testing CoCo on non-TEE environments
run-k8s-tests-coco-nontee:
name: run-k8s-tests-coco-nontee
strategy:
fail-fast: false
matrix:
vmm:
- qemu-coco-dev
- qemu-coco-dev-runtime-rs
snapshotter:
- nydus
pull-type:
- guest-pull
include:
- pull-type: experimental-force-guest-pull
vmm: qemu-coco-dev
snapshotter: ""
runs-on: ubuntu-22.04
permissions:
id-token: write # Used for OIDC access to log into Azure
environment: ci
env:
DOCKER_REGISTRY: ${{ inputs.registry }}
DOCKER_REPO: ${{ inputs.repo }}
DOCKER_TAG: ${{ inputs.tag }}
GH_PR_NUMBER: ${{ inputs.pr-number }}
KATA_HYPERVISOR: ${{ matrix.vmm }}
# Some tests rely on that variable to run (or not)
KBS: "true"
# Set the KBS ingress handler (empty string disables handling)
KBS_INGRESS: "aks"
KUBERNETES: "vanilla"
PULL_TYPE: ${{ matrix.pull-type }}
AUTHENTICATED_IMAGE_USER: ${{ vars.AUTHENTICATED_IMAGE_USER }}
AUTHENTICATED_IMAGE_PASSWORD: ${{ secrets.AUTHENTICATED_IMAGE_PASSWORD }}
SNAPSHOTTER: ${{ matrix.snapshotter }}
EXPERIMENTAL_FORCE_GUEST_PULL: ${{ matrix.pull-type == 'experimental-force-guest-pull' && matrix.vmm || '' }}
# Caution: current ingress controller used to expose the KBS service
# requires much vCPUs, lefting only a few for the tests. Depending on the
# host type chose it will result on the creation of a cluster with
# insufficient resources.
K8S_TEST_HOST_TYPE: "all"
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
ref: ${{ inputs.commit-hash }}
fetch-depth: 0
persist-credentials: false
- name: Rebase atop of the latest target branch
run: |
./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
env:
TARGET_BRANCH: ${{ inputs.target-branch }}
- name: get-kata-tools-tarball
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: kata-tools-static-tarball-amd64${{ inputs.tarball-suffix }}
path: kata-tools-artifacts
- name: Install kata-tools
run: bash tests/integration/kubernetes/gha-run.sh install-kata-tools kata-tools-artifacts
- name: Log into the Azure account
uses: azure/login@a457da9ea143d694b1b9c7c869ebb04ebe844ef5 # v2.3.0
with:
client-id: ${{ secrets.AZ_APPID }}
tenant-id: ${{ secrets.AZ_TENANT_ID }}
subscription-id: ${{ secrets.AZ_SUBSCRIPTION_ID }}
- name: Create AKS cluster
uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # v3.0.2
with:
timeout_minutes: 15
max_attempts: 20
retry_on: error
retry_wait_seconds: 10
command: bash tests/integration/kubernetes/gha-run.sh create-cluster
- name: Install `bats`
run: bash tests/integration/kubernetes/gha-run.sh install-bats
- name: Install `kubectl`
uses: azure/setup-kubectl@776406bce94f63e41d621b960d78ee25c8b76ede # v4.0.1
with:
version: 'latest'
- name: Download credentials for the Kubernetes CLI to use them
run: bash tests/integration/kubernetes/gha-run.sh get-cluster-credentials
- name: Deploy Kata
timeout-minutes: 20
run: bash tests/integration/kubernetes/gha-run.sh deploy-kata-aks
env:
USE_EXPERIMENTAL_SETUP_SNAPSHOTTER: ${{ env.SNAPSHOTTER == 'nydus' }}
AUTO_GENERATE_POLICY: ${{ env.PULL_TYPE == 'experimental-force-guest-pull' && 'no' || 'yes' }}
- name: Deploy CoCo KBS
timeout-minutes: 10
run: bash tests/integration/kubernetes/gha-run.sh deploy-coco-kbs
- name: Install `kbs-client`
timeout-minutes: 10
run: bash tests/integration/kubernetes/gha-run.sh install-kbs-client
- name: Deploy CSI driver
timeout-minutes: 5
run: bash tests/integration/kubernetes/gha-run.sh deploy-csi-driver
- name: Run tests
timeout-minutes: 80
run: bash tests/integration/kubernetes/gha-run.sh run-tests
- name: Report tests
if: always()
run: bash tests/integration/kubernetes/gha-run.sh report-tests
- name: Refresh OIDC token in case access token expired
if: always()
uses: azure/login@a457da9ea143d694b1b9c7c869ebb04ebe844ef5 # v2.3.0
with:
client-id: ${{ secrets.AZ_APPID }}
tenant-id: ${{ secrets.AZ_TENANT_ID }}
subscription-id: ${{ secrets.AZ_SUBSCRIPTION_ID }}
- name: Delete AKS cluster
if: always()
timeout-minutes: 15
run: bash tests/integration/kubernetes/gha-run.sh delete-cluster
# Generate jobs for testing CoCo on non-TEE environments with erofs-snapshotter
run-k8s-tests-coco-nontee-with-erofs-snapshotter:
name: run-k8s-tests-coco-nontee-with-erofs-snapshotter
strategy:
fail-fast: false
matrix:
vmm:
- qemu-coco-dev
snapshotter:
- erofs
pull-type:
- default
runs-on: ubuntu-24.04
environment: ci
env:
DOCKER_REGISTRY: ${{ inputs.registry }}
DOCKER_REPO: ${{ inputs.repo }}
DOCKER_TAG: ${{ inputs.tag }}
GH_PR_NUMBER: ${{ inputs.pr-number }}
KATA_HYPERVISOR: ${{ matrix.vmm }}
# Some tests rely on that variable to run (or not)
KBS: "false"
# Set the KBS ingress handler (empty string disables handling)
KBS_INGRESS: ""
KUBERNETES: "vanilla"
CONTAINER_ENGINE: "containerd"
CONTAINER_ENGINE_VERSION: "v2.2"
PULL_TYPE: ${{ matrix.pull-type }}
SNAPSHOTTER: ${{ matrix.snapshotter }}
USE_EXPERIMENTAL_SETUP_SNAPSHOTTER: "true"
K8S_TEST_HOST_TYPE: "all"
# We are skipping the auto generated policy tests for now,
# but those should be enabled as soon as we work on that.
AUTO_GENERATE_POLICY: "no"
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
ref: ${{ inputs.commit-hash }}
fetch-depth: 0
persist-credentials: false
- name: Rebase atop of the latest target branch
run: |
./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
env:
TARGET_BRANCH: ${{ inputs.target-branch }}
- name: get-kata-tools-tarball
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
with:
name: kata-tools-static-tarball-amd64${{ inputs.tarball-suffix }}
path: kata-tools-artifacts
- name: Install kata-tools
run: bash tests/integration/kubernetes/gha-run.sh install-kata-tools kata-tools-artifacts
- name: Remove unnecessary directories to free up space
run: |
sudo rm -rf /usr/local/.ghcup
sudo rm -rf /opt/hostedtoolcache/CodeQL
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
sudo rm -rf /usr/lib/jvm
sudo rm -rf /usr/share/swift
sudo rm -rf /usr/local/share/powershell
sudo rm -rf /usr/local/julia*
sudo rm -rf /opt/az
sudo rm -rf /usr/local/share/chromium
sudo rm -rf /opt/microsoft
sudo rm -rf /opt/google
sudo rm -rf /usr/lib/firefox
- name: Deploy kubernetes
timeout-minutes: 15
run: bash tests/integration/kubernetes/gha-run.sh deploy-k8s
env:
GH_TOKEN: ${{ github.token }}
- name: Install `bats`
run: bash tests/integration/kubernetes/gha-run.sh install-bats
- name: Deploy Kata
timeout-minutes: 20
run: bash tests/integration/kubernetes/gha-run.sh deploy-kata
- name: Deploy CSI driver
timeout-minutes: 5
run: bash tests/integration/kubernetes/gha-run.sh deploy-csi-driver
- name: Run tests
timeout-minutes: 80
run: bash tests/integration/kubernetes/gha-run.sh run-tests
- name: Report tests
if: always()
run: bash tests/integration/kubernetes/gha-run.sh report-tests