diff --git a/.github/workflows/basic-ci-amd64.yaml b/.github/workflows/basic-ci-amd64.yaml index d63b979efd..1198918530 100644 --- a/.github/workflows/basic-ci-amd64.yaml +++ b/.github/workflows/basic-ci-amd64.yaml @@ -165,6 +165,43 @@ jobs: - name: Run tracing tests run: bash tests/integration/runk/gha-run.sh run + run-tracing: + strategy: + fail-fast: false + matrix: + vmm: + - clh # cloud-hypervisor + - qemu + runs-on: garm-ubuntu-2204-smaller + env: + KATA_HYPERVISOR: ${{ matrix.vmm }} + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ inputs.commit-hash }} + fetch-depth: 0 + + - name: Rebase atop of the latest target branch + run: | + ./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch" + env: + TARGET_BRANCH: ${{ inputs.target-branch }} + + - name: Install dependencies + run: bash tests/functional/tracing/gha-run.sh install-dependencies + + - name: get-kata-tarball + uses: actions/download-artifact@v3 + with: + name: kata-static-tarball-amd64${{ inputs.tarball-suffix }} + path: kata-artifacts + + - name: Install kata + run: bash tests/functional/tracing/gha-run.sh install-kata kata-artifacts + + - name: Run tracing tests + run: bash tests/functional/tracing/gha-run.sh run + run-vfio: strategy: fail-fast: false @@ -198,3 +235,86 @@ jobs: - name: Run vfio tests timeout-minutes: 15 run: bash tests/functional/vfio/gha-run.sh run + + run-docker-tests: + strategy: + # We can set this to true whenever we're 100% sure that + # all the tests are not flaky, otherwise we'll fail them + # all due to a single flaky instance. 
+ fail-fast: false + matrix: + vmm: + - clh + - qemu + runs-on: garm-ubuntu-2304-smaller + env: + KATA_HYPERVISOR: ${{ matrix.vmm }} + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ inputs.commit-hash }} + fetch-depth: 0 + + - name: Rebase atop of the latest target branch + run: | + ./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch" + env: + TARGET_BRANCH: ${{ inputs.target-branch }} + + - name: Install dependencies + run: bash tests/integration/docker/gha-run.sh install-dependencies + + - name: get-kata-tarball + uses: actions/download-artifact@v3 + with: + name: kata-static-tarball-amd64${{ inputs.tarball-suffix }} + path: kata-artifacts + + - name: Install kata + run: bash tests/integration/docker/gha-run.sh install-kata kata-artifacts + + - name: Run docker smoke test + timeout-minutes: 5 + run: bash tests/integration/docker/gha-run.sh run + + run-nerdctl-tests: + strategy: + # We can set this to true whenever we're 100% sure that + # all the tests are not flaky, otherwise we'll fail them + # all due to a single flaky instance. 
+ fail-fast: false + matrix: + vmm: + - clh + - dragonball + - qemu + runs-on: garm-ubuntu-2304-smaller + env: + KATA_HYPERVISOR: ${{ matrix.vmm }} + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ inputs.commit-hash }} + fetch-depth: 0 + + - name: Rebase atop of the latest target branch + run: | + ./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch" + env: + TARGET_BRANCH: ${{ inputs.target-branch }} + + - name: Install dependencies + run: bash tests/integration/nerdctl/gha-run.sh install-dependencies + + - name: get-kata-tarball + uses: actions/download-artifact@v3 + with: + name: kata-static-tarball-amd64${{ inputs.tarball-suffix }} + path: kata-artifacts + + - name: Install kata + run: bash tests/integration/nerdctl/gha-run.sh install-kata kata-artifacts + + - name: Run nerdctl smoke test + timeout-minutes: 5 + run: bash tests/integration/nerdctl/gha-run.sh run diff --git a/.github/workflows/run-docker-tests-on-garm.yaml b/.github/workflows/run-docker-tests-on-garm.yaml deleted file mode 100644 index cc18240dd1..0000000000 --- a/.github/workflows/run-docker-tests-on-garm.yaml +++ /dev/null @@ -1,56 +0,0 @@ -name: CI | Run docker integration tests -on: - workflow_call: - inputs: - tarball-suffix: - required: false - type: string - commit-hash: - required: false - type: string - target-branch: - required: false - type: string - default: "" - -jobs: - run-docker-tests: - strategy: - # We can set this to true whenever we're 100% sure that - # all the tests are not flaky, otherwise we'll fail them - # all due to a single flaky instance. 
- fail-fast: false - matrix: - vmm: - - clh - - qemu - runs-on: garm-ubuntu-2304-smaller - env: - KATA_HYPERVISOR: ${{ matrix.vmm }} - steps: - - uses: actions/checkout@v4 - with: - ref: ${{ inputs.commit-hash }} - fetch-depth: 0 - - - name: Rebase atop of the latest target branch - run: | - ./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch" - env: - TARGET_BRANCH: ${{ inputs.target-branch }} - - - name: Install dependencies - run: bash tests/integration/docker/gha-run.sh install-dependencies - - - name: get-kata-tarball - uses: actions/download-artifact@v3 - with: - name: kata-static-tarball-amd64${{ inputs.tarball-suffix }} - path: kata-artifacts - - - name: Install kata - run: bash tests/integration/docker/gha-run.sh install-kata kata-artifacts - - - name: Run docker smoke test - timeout-minutes: 5 - run: bash tests/integration/docker/gha-run.sh run diff --git a/.github/workflows/run-nerdctl-tests-on-garm.yaml b/.github/workflows/run-nerdctl-tests-on-garm.yaml deleted file mode 100644 index e9133c9720..0000000000 --- a/.github/workflows/run-nerdctl-tests-on-garm.yaml +++ /dev/null @@ -1,57 +0,0 @@ -name: CI | Run nerdctl integration tests -on: - workflow_call: - inputs: - tarball-suffix: - required: false - type: string - commit-hash: - required: false - type: string - target-branch: - required: false - type: string - default: "" - -jobs: - run-nerdctl-tests: - strategy: - # We can set this to true whenever we're 100% sure that - # all the tests are not flaky, otherwise we'll fail them - # all due to a single flaky instance. 
- fail-fast: false - matrix: - vmm: - - clh - - dragonball - - qemu - runs-on: garm-ubuntu-2304-smaller - env: - KATA_HYPERVISOR: ${{ matrix.vmm }} - steps: - - uses: actions/checkout@v4 - with: - ref: ${{ inputs.commit-hash }} - fetch-depth: 0 - - - name: Rebase atop of the latest target branch - run: | - ./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch" - env: - TARGET_BRANCH: ${{ inputs.target-branch }} - - - name: Install dependencies - run: bash tests/integration/nerdctl/gha-run.sh install-dependencies - - - name: get-kata-tarball - uses: actions/download-artifact@v3 - with: - name: kata-static-tarball-amd64${{ inputs.tarball-suffix }} - path: kata-artifacts - - - name: Install kata - run: bash tests/integration/nerdctl/gha-run.sh install-kata kata-artifacts - - - name: Run nerdctl smoke test - timeout-minutes: 5 - run: bash tests/integration/nerdctl/gha-run.sh run diff --git a/tests/common.bash b/tests/common.bash index 8497542ead..b3f8ac10b3 100644 --- a/tests/common.bash +++ b/tests/common.bash @@ -552,6 +552,24 @@ EOF sudo systemctl enable --now crio } +function install_docker() { + # Add Docker's official GPG key + sudo apt-get update + sudo apt-get -y install ca-certificates curl gnupg + sudo install -m 0755 -d /etc/apt/keyrings + curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg + sudo chmod a+r /etc/apt/keyrings/docker.gpg + + # Add the repository to Apt sources: + echo \ + "deb [arch="$(dpkg --print-architecture)" signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ + "$(. 
/etc/os-release && echo "$VERSION_CODENAME")" stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null + sudo apt-get update + + sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin +} + # Convert architecture to the name used by golang function arch_to_golang() { local arch="$(uname -m)" diff --git a/tests/functional/tracing/configure_tracing_for_kata.sh b/tests/functional/tracing/configure_tracing_for_kata.sh new file mode 100755 index 0000000000..ed97571139 --- /dev/null +++ b/tests/functional/tracing/configure_tracing_for_kata.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# +# Copyright (c) 2019-2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +set -o errexit +set -o nounset +set -o pipefail + +SCRIPT_PATH=$(dirname "$(readlink -f "$0")") +source "${SCRIPT_PATH}/../../common.bash" + +[ "$#" -eq 1 ] || die "Specify enable or disable" + +kata_cfg_file=$(kata-runtime kata-env --json |jq '.Runtime | .Config | .Path' |cut -d\" -f2) + +enable_tracing() { + info "Enabling kata tracing on $kata_cfg_file" + sudo crudini --set "$kata_cfg_file" agent.kata enable_tracing true + sudo crudini --set "$kata_cfg_file" runtime enable_tracing true +} + +disable_tracing() { + info "Disabling kata tracing on $kata_cfg_file" + sudo crudini --set "$kata_cfg_file" agent.kata enable_tracing false + sudo crudini --set "$kata_cfg_file" runtime enable_tracing false +} + +main() { + cmd="$1" + case "$cmd" in + enable ) enable_tracing ;; + disable ) disable_tracing ;; + *) die "invalid command: '$cmd'" ;; + esac +} + +main "$@" diff --git a/tests/functional/tracing/gha-run.sh b/tests/functional/tracing/gha-run.sh new file mode 100755 index 0000000000..3369926629 --- /dev/null +++ b/tests/functional/tracing/gha-run.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# +# Copyright (c) 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +set -o errexit +set -o nounset +set -o pipefail + 
+kata_tarball_dir="${2:-kata-artifacts}" +tracing_dir="$(dirname "$(readlink -f "$0")")" +source "${tracing_dir}/../../common.bash" + +function install_dependencies() { + info "Installing the dependencies needed for running the tracing tests" + + # Dependency list of projects that we can rely on the system packages + # - crudini + # - jq + # - socat + # - tmux + declare -a system_deps=( + crudini + jq + socat + tmux + ) + + sudo apt-get update + sudo apt-get -y install "${system_deps[@]}" + + # Install docker according to the docker's website documentation + install_docker +} + +function run() { + info "Running tracing tests using ${KATA_HYPERVISOR} hypervisor" + + enabling_hypervisor + bash -c ${tracing_dir}/test-agent-shutdown.sh + bash -c ${tracing_dir}/tracing-test.sh +} + +function main() { + action="${1:-}" + case "${action}" in + install-dependencies) install_dependencies ;; + install-kata) install_kata ;; + run) run ;; + *) >&2 die "Invalid argument" ;; + esac +} + +main "$@" diff --git a/tests/functional/tracing/test-agent-shutdown.sh b/tests/functional/tracing/test-agent-shutdown.sh new file mode 100755 index 0000000000..6bad962d8b --- /dev/null +++ b/tests/functional/tracing/test-agent-shutdown.sh @@ -0,0 +1,1502 @@ +#!/bin/bash +# Copyright (c) 2021 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +#--------------------------------------------------------------------- +# Description: Test the Kata Containers 2.x rust agent shutdown behaviour. +# +# Normally, the kata-agent process running inside the VM is not shut down; +# once the workload ends and the agent has returned the workload return +# value back to the runtime, the runtime simply kills the VM. This is safe +# since nothing the user cares about is running any more. +# +# However, for agent tracing, a graceful agent shutdown is necessary to ensure +# all trace spans are generated. 
When *static* agent tracing is enabled, the +# runtime relies entirely on the agent to perform a graceful shutdown _and_ +# shut down the VM. +# +# This script tests the kata-agent in two ways: +# +# - "manually" / "standalone" where the agent binary is run directly. +# - Inside a Kata VM, started by a shimv2-capable container manager +# (containerd). +# +# In both cases, the agent is shut down using the agent-ctl tool +# to request the agent shut down gracefully. +# +# Various configuration options are also tested. One of these enables +# the agents built-in (VSOCK) debug console. This test not only enables +# the option but also connects to the created console. +# +# Since this script needs to start various programs with a terminal, +# it uses tmux(1) consistently to simplify the handling logic. +#--------------------------------------------------------------------- + +readonly script_name=${0##*/} + +set -o errexit +set -o nounset +set -o pipefail +set -o errtrace + +SCRIPT_PATH=$(dirname "$(readlink -f "$0")") +source "${SCRIPT_PATH}/../../common.bash" +source "/etc/os-release" || source "/usr/lib/os-release" + +CTR_RUNTIME=${CTR_RUNTIME:-"io.containerd.kata.v2"} + +# Kata always uses this value +EXPECTED_VSOCK_PORT="1024" + +DOCKER_IMAGE=${DOCKER_IMAGE:-"busybox"} +CTR_IMAGE=${CTR_IMAGE:-"quay.io/prometheus/busybox:latest"} + +# Number of times the test should be run +KATA_AGENT_SHUTDOWN_TEST_COUNT=${KATA_AGENT_SHUTDOWN_TEST_COUNT:-1} + +# Default VSOCK port used by the agent +KATA_AGENT_VSOCK_CONSOLE_PORT=${KATA_AGENT_VSOCK_CONSOLE_PORT:-1026} + +# The shutdown test type that represents a "default" / vanilla Kata +# installation (where no debug options are enabled). 
+VANILLA_TEST_TYPE='default' + +# Name of tmux(1) sessions to create to run Kata VM and local agent in +KATA_TMUX_VM_SESSION="kata-shutdown-test-vm-session" +KATA_TMUX_LOCAL_SESSION="kata-shutdown-test-local-agent-session" + +# Name of tmux(1) session to create to run a debug console in +KATA_TMUX_CONSOLE_SESSION="kata-shutdown-test-console-session" + +# tmux(1) session to run the trace forwarder in +KATA_TMUX_FORWARDER_SESSION="kata-shutdown-test-trace-forwarder-session" + +KATA_HYPERVISOR="${KATA_HYPERVISOR:-qemu}" + +# List of test types used by configure_kata(). +# +# Each element contains four colon delimited fields: +# +# 1: Name. +# 2: Whether debug should be enabled for the agent+runtime. +# 3: Whether hypervisor debug should be enabled. +# (handled separately due to a previous bug which blocked agent shutdown). +# 4: Whether a VSOCK debug console should be configured and used. +# +# Notes: +# +# - Tests are run in the order found in this array. +# - An array is used (rather than a hash) to ensure the standard/vanilla +# configuration is run *last*. The reason for this being that debug is +# needed to diagnose shutdown errors, so there is no point in runnning +# the default scenario first, in case it fails (and it thus "undebuggable"). +shutdown_test_types=( + 'with-debug:true:false:false' + 'with-debug-console:false:false:true' + 'with-hypervisor-debug:true:true:false' + 'with-everything:true:true:true' + "${VANILLA_TEST_TYPE}:false:false:false" +) + +# Number of fields each entry in the 'shutdown_test_types' array should have. +shutdown_test_type_fields=4 + +# Pseudo test type name that represents all test types defined +# in the 'shutdown_test_types' array. +ALL_TEST_TYPES='all' + +DEFAULT_SHUTDOWN_TEST_TYPE="${ALL_TEST_TYPES}" + +# List of ways of running the agent: +# +# Each element contains two colon delimited fields: +# +# 1: Name used for a particular way of running the agent. +# 2: Description. 
+agent_test_types=( + 'local:Run agent using agent-ctl tool' + 'vm:Run agent inside a Kata Container' +) + +# Default value from the 'agent_test_types' array. +DEFAULT_AGENT_TEST_TYPE='vm' + +# Set by every call to run_single_agent() +test_start_time= +test_end_time= + +#------------------------------------------------------------------------------- +# Settings + +# values used to wait for local and VM processes to start and end. +wait_time_secs=${WAIT_TIME_SECS:-20} +sleep_time_secs=${SLEEP_TIME_SECS:-1} + +# Time to allow for the agent and VM to shutdown +shutdown_time_secs=${SHUTDOWN_TIME_SECS:-120} + +# Name for the container that will be created +container_id="${CONTAINER_ID:-kata-agent-shutdown-test}" + +# If 'true', don't run any commands, just show what would be run. +dry_run="${DRY_RUN:-false}" + +# If 'true', don't remove logs on a successful run. +keep_logs="${KEEP_LOGS:-false}" + +# Name of socket file used by a local agent. +agent_socket_file="kata-agent.socket" + +# Kata Agent socket URI. +# +# Notes: +# +# - The file is an abstract socket +# (meaning it is not visible in the filesystem). +# +# - The agent and the agent-ctl tool use slightly different +# address formats for abstract sockets. +local_agent_server_addr="unix://${agent_socket_file}" +local_agent_ctl_server_addr="unix://@${agent_socket_file}" + +# Address that is dynamically configured when using CLH before +# starting trace forwarder or container +clh_socket_path= +clh_socket_prefix="/run/vc/vm/" + +ctl_log_file="${PWD}/agent-ctl.log" + +# Log file that must contain agent output. +agent_log_file="${PWD}/kata-agent.log" + +# Set in setup() based on KATA_HYPERVISOR +# Supported hypervisors are qemu and clh +configured_hypervisor= +# String that would appear in config file (qemu or clh) +configured_hypervisor_cfg= + +# Full path to directory containing an OCI bundle based on "$DOCKER_IMAGE", +# which is required by the agent control tool. 
+bundle_dir=${BUNDLE_DIR:-""}
+
+#---------------------------------------
+# Default values
+
+default_arch=$(uname -m)
+arch="${arch:-${default_arch}}"
+
+#-------------------------------------------------------------------------------
+
+agent_binary="/usr/bin/kata-agent"
+
+# Maximum debug level
+default_agent_log_level="trace"
+
+agent_log_level=${agent_log_level:-${default_agent_log_level}}
+
+# Full path to the main configuration file (set by setup()).
+kata_cfg_file=
+
+# Set in setup() based on KATA_HYPERVISOR
+hypervisor_binary=
+
+
+#-------------------------------------------------------------------------------
+
+[ -n "${DEBUG:-}" ] && set -o xtrace
+
+# Print the command-line help text to stdout.
+#
+# NOTE(review): the here-document opener, the "Usage:" header and the
+# <placeholder> option arguments were lost from this text and have been
+# reconstructed - confirm the wording against the option parser.
+usage()
+{
+    cat <<EOF
+Usage: $script_name [options]
+
+Options:
+
+ -a <agent-test-type>    : Agent test type to use
+                           (default: '$DEFAULT_AGENT_TEST_TYPE').
+ -c <count>              : Run specified number of iterations
+                           (default: $KATA_AGENT_SHUTDOWN_TEST_COUNT).
+ -d                      : Enable debug (shell trace) output.
+ -h                      : Show this help statement.
+ -k                      : Keep logs on successful run
+                           (default: logs will be deleted on success).
+ -l                      : List all available agent and shutdown test types.
+ -n                      : Dry-run mode - show the commands that would be run.
+ -t <shutdown-test-type> : Only run the specified shutdown test type
+                           (default: '$DEFAULT_SHUTDOWN_TEST_TYPE').
+
+Notes:
+
+- These tests should be run *before* the Kata Agent tracing tests, since if
+  the agent cannot be shut down, static tracing will not work reliably.
+
+- By default all shutdown test types are run, but only the default agent test
+  type is run.
+
+EOF
+}
+
+# Print a warning message (all arguments) to stderr.
+warn()
+{
+    echo >&2 "WARNING: $*"
+}
+
+# Run the specified command, or if dry-run mode is enabled,
+# just show the command that would be run.
+run_cmd() +{ + local cmdline="$@" + + if [ "$dry_run" = 'true' ] + then + info "dry-run: Would run: '$cmdline'" + else + eval $cmdline + fi +} + +# Show a subset of processes (for debugging) +show_procs() +{ + info "Processes" + + local hypervisor + hypervisor="qemu" + [ ${configured_hypervisor} = "clh" ] && hypervisor="cloud-hypervisor" + + local patterns=() + + patterns+=("kata-agent-ctl") + patterns+=("${hypervisor}") + patterns+=("containerd") + patterns+=("ctr") + + local pattern_list + pattern_list=$(echo "${patterns[@]}"|tr ' ' '|') + + local regex + regex="(${pattern_list})" + + ps -efww | egrep -i "$regex" || true +} + +kill_tmux_sessions() +{ + local session + + for session in \ + "$KATA_TMUX_CONSOLE_SESSION" \ + "$KATA_TMUX_FORWARDER_SESSION" \ + "$KATA_TMUX_LOCAL_SESSION" \ + "$KATA_TMUX_VM_SESSION" + do + tmux kill-session -t "$session" &>/dev/null || true + done + + true +} + +get_shutdown_test_type_entry() +{ + local shutdown_test_type="${1:-}" + [ -z "$shutdown_test_type" ] && die "need shutdown test type name" + + local entry + + for entry in "${shutdown_test_types[@]}" + do + local count + count=$(echo "$entry"|tr ':' '\n'|wc -l) + [ "$count" -eq "$shutdown_test_type_fields" ] \ + || die "expected $shutdown_test_type_fields fields, found $count: '$entry'" + + local name + + name=$(echo "$entry"|cut -d: -f1) + + [ "$name" = "$shutdown_test_type" ] \ + && echo "$entry" \ + && break + done + + echo +} + +list_shutdown_test_types() +{ + local entry + local debug_value + local hypervisor_debug_value + local debug_console_value + + printf "# Shutdown test types:\n\n" + + printf "%-24s %-15s %-23s %s\n\n" \ + "Test type" \ + "Debug enabled" \ + "Hypervisor debug" \ + "Debug console used" + + for entry in "${shutdown_test_types[@]}" + do + local name + local debug_value + local hypervisor_debug_value + local debug_console_value + + name=$(echo "$entry"|cut -d: -f1) + debug_value=$(echo "$entry"|cut -d: -f2) + hypervisor_debug_value=$(echo "$entry"|cut 
-d: -f3) + debug_console_value=$(echo "$entry"|cut -d: -f4) + + printf "%-24s %-15s %-23s %s\n" \ + "$name" \ + "$debug_value" \ + "$hypervisor_debug_value" \ + "$debug_console_value" + done + + echo +} + +list_agent_test_types() +{ + local entry + + printf "# Agent test types:\n\n" + + printf "%-12s %s\n\n" \ + "Agent type" \ + "Description" + + for entry in "${agent_test_types[@]}" + do + local name + local descr + + name=$(echo "$entry"|cut -d: -f1) + descr=$(echo "$entry"|cut -d: -f2-) + + local msg="" + + [ "$name" = "$DEFAULT_AGENT_TEST_TYPE" ] && msg=" (default)" + + printf "%-12s %s%s.\n" \ + "$name" \ + "$descr" \ + "$msg" + done + + echo +} + +list_test_types() +{ + list_agent_test_types + list_shutdown_test_types +} + +# Set Kata options according to test type. +configure_kata() +{ + local shutdown_test_type="${1:-}" + [ -z "$shutdown_test_type" ] && die "need shutdown test type" + + local entry + local debug_value + local hypervisor_debug_value + local debug_console_value + + local entry + entry=$(get_shutdown_test_type_entry "$shutdown_test_type" || true) + [ -z "$entry" ] && die "invalid test type: '$shutdown_test_type'" + + debug_value=$(echo "$entry"|cut -d: -f2) + hypervisor_debug_value=$(echo "$entry"|cut -d: -f3) + debug_console_value=$(echo "$entry"|cut -d: -f4) + + [ -z "$debug_value" ] && \ + die "need debug value for $shutdown_test_type" + + [ -z "$hypervisor_debug_value" ] && \ + die "need hypervisor debug value for $shutdown_test_type" + + [ -z "$debug_console_value" ] && \ + die "need debug console value for $shutdown_test_type" + + toggle_debug "$debug_value" "$hypervisor_debug_value" + toggle_vsock_debug_console "$debug_console_value" + + # Enable agent tracing + # + # Even though this program only tests agent shutdown, static tracing + # must be configured. This is because normally (with tracing + # disabled), the runtime kills the VM after the workload has exited. 
+    # However, if static tracing is enabled, the runtime will not kill the
+    # VM - the responsibility for shutting down the VM is given to the
+    # agent process running inside the VM.
+
+    if [ "$shutdown_test_type" = "$VANILLA_TEST_TYPE" ]
+    then
+        # We don't need to worry about the 'trace_mode' here since agent tracing
+        # is *only* enabled if the 'enable_tracing' variable is set.
+        run_cmd sudo crudini --set "${kata_cfg_file}" 'agent.kata' 'enable_tracing' 'false'
+    else
+        run_cmd sudo crudini --set "${kata_cfg_file}" 'agent.kata' 'enable_tracing' 'true'
+    fi
+}
+
+# Re-apply the vanilla (no debug, no tracing) test type settings.
+unconfigure_kata()
+{
+    info "Resetting configuration to defaults"
+
+    configure_kata "$VANILLA_TEST_TYPE"
+}
+
+# Enable/disable the agent's built-in VSOCK debug console
+#
+# $1: value to store in 'debug_console_enabled' (true/false).
+toggle_vsock_debug_console()
+{
+    run_cmd sudo crudini --set "${kata_cfg_file}" \
+        'agent.kata' 'debug_console_enabled' "$1"
+}
+
+# Enable/disable debug options.
+#
+# $1: value for 'enable_debug' in the agent and runtime sections.
+# $2: value for 'enable_debug' in the configured hypervisor section.
+#
+# Note: Don't use 'kata-manager.sh "enable-debug"' since this
+# enables all debug (including the problematic hypervisor
+# debug - see below).
+toggle_debug()
+{
+    local value="${1:-}"
+    local hypervisor_debug="${2:-}"
+
+    [ -z "$value" ] && die "need value"
+    [ -z "$hypervisor_debug" ] && die "need hypervisor debug value"
+
+    # list of configuration.toml sections that have debug options we care about
+    local debug_sections=()
+
+    debug_sections+=('agent.kata')
+    debug_sections+=('runtime')
+
+    local section
+
+    for section in "${debug_sections[@]}"
+    do
+        run_cmd sudo crudini --set "$kata_cfg_file" "$section" \
+            'enable_debug' "$value"
+    done
+
+    # XXX: Enabling hypervisor debug for QEMU will make a systemd debug
+    # console service inoperable (*), but we need to test it anyhow.
+    #
+    # (*) - If enabled, it stops "kata-debug.service" from attaching to
+    # the console and the socat call made on the client hangs until
+    # the VM is shut down!
+    section=$(printf "hypervisor.%s" "$configured_hypervisor_cfg")
+
+    # Use this function's own argument: the caller's variable of a similar
+    # name is only visible here by accident of dynamic scoping.
+    run_cmd sudo crudini --set "$kata_cfg_file" "$section" \
+        'enable_debug' "$hypervisor_debug"
+}
+
+# Provide a "semi-valid" vsock address for when dry-run mode is active.
+# The URI includes a message telling the user to change it and replace
+# with the real VSOCK CID value.
+get_dry_run_agent_vsock_address()
+{
+    echo "vsock://FIXME-CHANGE-TO-VSOCK-CID:${EXPECTED_VSOCK_PORT}"
+}
+
+# Start a debug console shell using the agent's built-in debug console
+# feature.
+#
+# Note: You should be able to use "kata-runtime exec $cid", but that isn't
+# working currently.
+connect_to_vsock_debug_console()
+{
+    local agent_addr
+
+    if [ "$dry_run" = 'true' ]
+    then
+        agent_addr=$(get_dry_run_agent_vsock_address)
+    else
+        agent_addr=$(get_agent_vsock_address || true)
+        [ -z "$agent_addr" ] && die "cannot determine agent VSOCK address"
+    fi
+
+    local socat_connect=
+    if [ $configured_hypervisor = "qemu" ]; then
+        socat_connect=$(echo "$agent_addr"|sed 's!^vsock://!vsock-connect:!')
+    elif [ $configured_hypervisor = "clh" ]; then
+        socat_connect="unix-connect:${clh_socket_path}"
+    else
+        die "Cannot configure address for socat, unknown hypervisor: '$configured_hypervisor'"
+    fi
+
+    run_cmd \
+        "tmux new-session \
+        -d \
+        -s \"$KATA_TMUX_CONSOLE_SESSION\" \
+        \"socat \
+        '${socat_connect}' \
+        stdout\""
+
+}
+
+# Exit handler: tidy up after a test run.
+#
+# $1: optional; when set to 'initial' a non-zero status does not trigger the
+#     "leave everything for debugging" path and the OCI bundle is not removed.
+#
+# On failure (and not 'initial') nothing is cleaned; the logs are shown
+# (if CI=true) or their paths are listed instead.
+cleanup()
+{
+    # Save the result of the last call made before
+    # this handler was called.
+    #
+    # XXX: This *MUST* be the first command in this function!
+    local failure_ret="$?"
+
+    [ "$dry_run" = 'true' ] && return 0
+
+    if [ "$failure_ret" -eq 0 ] && [ "$keep_logs" = 'true' ]
+    then
+        info "SUCCESS: Test passed, but leaving logs:"
+        info ""
+        info "agent log file : ${agent_log_file}"
+        info "agent-ctl log file : ${ctl_log_file}"
+        info "OCI bundle directory : ${bundle_dir}"
+
+        return 0
+    fi
+
+    local arg="${1:-}"
+
+    if [ $failure_ret -ne 0 ] && [ "$arg" != 'initial' ]; then
+        warn "ERROR: Test failed"
+        warn ""
+        warn "Not cleaning up to help debug failure:"
+        warn ""
+
+        if [ "${CI:-}" = "true" ]
+        then
+            show_procs
+
+            info "VSOCK details"
+            ss -Hp --vsock || true
+
+            info "agent-ctl log file"
+            sudo cat "${ctl_log_file}" || true
+            echo
+
+            info "agent log file"
+            sudo cat "${agent_log_file}" || true
+            echo
+
+        else
+            info "agent-ctl log file : ${ctl_log_file}"
+            info "agent log file : ${agent_log_file}"
+        fi
+
+        info "OCI bundle directory : ${bundle_dir}"
+
+        return 0
+    fi
+
+    kill_tmux_sessions
+
+    unconfigure_kata
+
+    [ "$arg" != 'initial' ] && [ -d "$bundle_dir" ] && rm -rf "$bundle_dir"
+
+    sudo rm -f \
+        "$agent_log_file" \
+        "$ctl_log_file"
+
+    clean_env_ctr &>/dev/null || true
+
+    local sandbox_dir="/run/sandbox-ns/"
+
+    # XXX: Without doing this, the agent will hang attempting to create the
+    # XXX: namespaces (in function "setup_shared_namespaces()")
+    sudo umount -f "${sandbox_dir}/uts" "${sandbox_dir}/ipc" &>/dev/null || true
+    sudo rm -rf "${sandbox_dir}" &>/dev/null || true
+
+    # Check that clh socket was deleted.
+    #
+    # Use -e rather than -f: the path is a UNIX socket, which is never a
+    # "regular file", so a -f test could not detect a leftover socket.
+    if [ "$configured_hypervisor" = "clh" ] && [ -n "$clh_socket_path" ]; then
+        [ -e "$clh_socket_path" ] && die "CLH socket path $clh_socket_path was not properly cleaned up"
+    fi
+
+    sudo systemctl restart containerd
+}
+
+setup_containerd()
+{
+    local file="/etc/containerd/config.toml"
+
+    [ -e "$file" ] || die "missing containerd config file: '$file'"
+
+    # Although the containerd config file is in TOML format, crudini(1)
+    # won't parse it due to the indentation it uses.
+ local containerd_debug_enabled + + containerd_debug_enabled=$(sed \ + -e '/./{H;$!d;}' \ + -e 'x;/\[debug\]/!d;' \ + "$file" |\ + grep "level *= *\"debug\"" || true) + + if [ -z "$containerd_debug_enabled" ] + then + cat <<-EOF | sudo tee -a "$file" + [debug] + # Allow Kata Containers debug messages to be propageted + # into the hosts journal. + # (use "journalctl -t kata" to view). + level = "debug" + EOF + + sudo systemctl restart containerd + fi + + sudo ctr image pull "$CTR_IMAGE" + + true +} + +create_oci_rootfs() +{ + local dir="${1:-}" + + [ -z "$dir" ] && die "Need OCI rootfs dir" + + sudo docker export $(sudo docker create "$DOCKER_IMAGE") |\ + tar -C "${dir}" -xvf - >/dev/null +} + +setup_oci_bundle() +{ + bundle_dir="$(mktemp -d)" + export bundle_dir + + info "Creating OCI bundle in directory: '$bundle_dir'" + + local config="${bundle_dir}/config.json" + local rootfs_dir="${bundle_dir}/rootfs/" + + mkdir -p "$rootfs_dir" + + create_oci_rootfs "$rootfs_dir" + + pushd "$bundle_dir" &>/dev/null + runc spec + popd &>/dev/null + + [ -e "$config" ] || die "no OCI config file at ${config}" +} + +setup() +{ + configured_hypervisor="${KATA_HYPERVISOR:-}" + + if [ "${KATA_HYPERVISOR:-}" = "qemu" ]; then + hypervisor_binary="qemu-system-${arch}" + configured_hypervisor_cfg="qemu" + elif [ "${KATA_HYPERVISOR:-}" = "clh" ]; then + hypervisor_binary="cloud-hypervisor" + configured_hypervisor_cfg="clh" + else + local msg="" + msg+="Exiting as hypervisor test dependency not met" + msg+=" (expected 'qemu' or 'cloud-hypervisor', found '$KATA_HYPERVISOR')" + die "$msg" + fi + info "Configured hypervisor is $configured_hypervisor" + + trap cleanup EXIT + + # Don't mess with an existing tmux session + unset TMUX + + [ "$dry_run" = 'false' ] && \ + [ -z "$bundle_dir" ] && \ + setup_oci_bundle || true + + local cmds=() + + # For parsing TOML config files + cmds+=('crudini') + + # For container manager (containerd) + cmds+=('ctr') + + # for OCI bundle creation + 
cmds+=('docker') + cmds+=('runc') + + # For querying VSOCK sockets + cmds+=('socat') + + # For launching processes + cmds+=('tmux') + + local cmd + + for cmd in "${cmds[@]}" + do + local result + result=$(command -v "$cmd" || true) + [ -n "$result" ] || die "need $cmd" + done + + kata_cfg_file=$(kata-runtime kata-env \ + --json |\ + jq '.Runtime | .Config | .Path' |\ + cut -d\" -f2 || true) + + [ -z "$kata_cfg_file" ] && die "Cannot determine config file" + + sudo mkdir -p $(dirname "$kata_cfg_file") + + #------------------------------ + # Check configured hypervisor + + local hypervisor_section + + hypervisor_section=$(printf "hypervisor.%s\n" "${configured_hypervisor_cfg}") + + local ret + + { crudini --get "${kata_cfg_file}" "${hypervisor_section}" &>/dev/null; ret=$?; } || true + + [ "$ret" -eq 0 ] || \ + die "Configured hypervisor ${configured_hypervisor} does not match config file ${kata_cfg_file}" + + setup_containerd +} + +start_local_agent() +{ + local log_file="${1:-}" + [ -z "$log_file" ] && die "need agent log file" + + local running + running=$(get_local_agent_pid || true) + + [ -n "$running" ] && die "agent already running: '$running'" + + # Note: it's imperative that we capture stderr to the log file + # as the agent writes the shutdown message to this stream! 
+    run_cmd \
+        "tmux new-session \
+        -d \
+        -s \"$KATA_TMUX_LOCAL_SESSION\" \
+        \"sudo \
+        RUST_BACKTRACE=full \
+        KATA_AGENT_LOG_LEVEL=${agent_log_level} \
+        KATA_AGENT_SERVER_ADDR=${local_agent_server_addr} \
+        ${agent_binary} \
+        &> ${log_file}\""
+
+    [ "$dry_run" = 'false' ] && wait_for_local_agent_to_start || true
+}
+
+# Wait for the agent to finish starting
+#
+# $1: ID of the container whose task must reach the RUNNING state.
+#
+# Polls (via waitForProcess) first for the containerd task, then for a
+# usable agent address.
+wait_for_kata_vm_agent_to_start()
+{
+    local cid="${1:-}"
+    # Validate this function's own argument (not the caller's log file
+    # variable, which is only visible here through dynamic scoping).
+    [ -z "$cid" ] && die "need container ID"
+
+    # First, check the containerd status of the container
+    local cmd="sudo ctr task list | grep \"${cid}\" | grep -q \"RUNNING\""
+
+    info "Waiting for VM to start (cid: '$cid')"
+
+    waitForProcess \
+        "$wait_time_secs" \
+        "$sleep_time_secs" \
+        "$cmd"
+
+    show_procs
+
+    # Next, ensure there is a valid VSOCK address for the VM
+    info "Waiting for agent VSOCK server"
+
+    cmd="get_agent_vsock_address_simple >/dev/null"
+
+    waitForProcess \
+        "$wait_time_secs" \
+        "$sleep_time_secs" \
+        "$cmd"
+
+    info "Kata VM running"
+}
+
+# Send a single "Check" request to the local agent using the agent
+# control tool. Always returns success (ends with 'true').
+check_local_agent_alive()
+{
+    local cmds=()
+
+    cmds+=("-c Check")
+
+    run_agent_ctl \
+        "${local_agent_ctl_server_addr}" \
+        "${cmds[@]}"
+
+    true
+}
+
+# Poll (via waitForProcess) until check_local_agent_alive succeeds.
+wait_for_local_agent_to_start()
+{
+    local cmd="check_local_agent_alive"
+
+    info "Waiting for agent process to start"
+
+    waitForProcess \
+        "$wait_time_secs" \
+        "$sleep_time_secs" \
+        "$cmd"
+
+    info "Kata agent process running"
+}
+
+# Create a Kata Container that blocks "forever"
+start_agent_in_kata_vm()
+{
+    local log_file="${1:-}"
+    [ -z "$log_file" ] && die "need agent log file"
+
+    local snapshotter=""
+    local ret
+
+    # Allow containerd to run on a ZFS root filesystem
+    { zfs list &>/dev/null; ret=$?; } || true
+    [ "$ret" = 0 ] && snapshotter='zfs'
+
+    # Ensure the container blocks forever
+    local cmd='tail -f /dev/null'
+
+    run_cmd \
+        "tmux new-session \
+        -d \
+        -s \"$KATA_TMUX_VM_SESSION\" \
+        \"sudo ctr run \
+        --snapshotter '$snapshotter' \
+        --runtime '${CTR_RUNTIME}' \
+        --rm \
+        -t '${CTR_IMAGE}' \
'$container_id' \ + $cmd\"" + + [ "$dry_run" = 'false' ] && \ + wait_for_kata_vm_agent_to_start "$container_id" || true +} + +start_agent() +{ + local agent_test_type="${1:-}" + [ -z "$agent_test_type" ] && die "need agent test type" + + local log_file="${2:-}" + [ -z "$log_file" ] && die "need agent log file" + + case "$agent_test_type" in + 'local') start_local_agent "$log_file" ;; + 'vm') start_agent_in_kata_vm "$log_file" ;; + *) die "invalid agent test type: '$agent_test_type'" ;; + esac + + true +} + +run_agent_ctl() +{ + local server_addr="${1:-}" + + shift + + local cmds="${*:-}" + + [ -n "$server_addr" ] || die "need agent ttRPC server address" + [ -n "$cmds" ] || die "need commands for agent control tool" + + local agent_ctl_path + agent_ctl_path="/opt/kata/bin/kata-agent-ctl" + + local redirect="&>\"${ctl_log_file}\"" + + if [ "$dry_run" = 'true' ] + then + redirect="" + bundle_dir="FIXME-set-to-OCI-bundle-directory" + fi + + local server_address= + if [ $configured_hypervisor = "qemu" ]; then + server_address="--server-address \"${server_addr}\"" + elif [ $configured_hypervisor = "clh" ]; then + server_address="--server-address \"${server_addr}\" --hybrid-vsock" + else + die "Cannot configure server address, unknown hypervisor: '$configured_hypervisor'" + fi + + run_cmd \ + sudo \ + RUST_BACKTRACE=full \ + "${agent_ctl_path}" \ + -l debug \ + connect \ + "${server_address}" \ + --bundle-dir "${bundle_dir}" \ + "${cmds}" \ + "${redirect}" +} + +# This function "cheats" a little - it gets the agent +# to do some work *and then* stops it. 
# Send a few API requests to the local agent, ending with DestroySandbox,
# which makes the agent shut down.
stop_local_agent()
{
    local cmds=()

    cmds+=("-c Check")
    cmds+=("-c GetGuestDetails")
    cmds+=("-c 'sleep 1s'")
    cmds+=("-c DestroySandbox")

    run_agent_ctl \
        "${local_agent_ctl_server_addr}" \
        "${cmds[@]}"
}

# Print the established agent socket addresses for the configured hypervisor.
#
# Output is a single line of whitespace-separated addresses (empty if none
# were found). Callers must count *words*, not lines.
get_addresses()
{
    local addresses=

    if [ "$configured_hypervisor" = "qemu" ]; then
        # NOTE(review): the exclusion pattern was garbled in the copy this
        # was restored from (it read '"\"'); 'socat' as a whole word is the
        # presumed original - confirm against the upstream script.
        #
        # Finding no match is not an error, hence "|| true".
        addresses=$(ss -Hp --vsock |\
            grep -E -v "\<socat\>" |\
            awk '$2 ~ /^ESTAB$/ {print $6}' |\
            grep ":${EXPECTED_VSOCK_PORT}$" || true)
    elif [ "$configured_hypervisor" = "clh" ]; then
        # since we preconfigured the socket, we are checking to see if it is reported
        addresses=$(ss -Hp |\
            grep "${clh_socket_path}" |\
            awk '$2 ~ /^ESTAB$/ {print $5}' || true)
    else
        die "Cannot retrieve address, unknown hypervisor: '$configured_hypervisor'"
    fi

    # Deliberately unquoted: flattens multiple results onto one line.
    echo ${addresses}
}

# Print the agent address URI ("vsock://…" for QEMU, "unix://…" for
# Cloud Hypervisor).
#
# Doesn't fail. Instead it will return the empty string on error
# (and a non-zero status), which makes it suitable for polling.
get_agent_vsock_address_simple()
{
    local addresses
    addresses=$(get_addresses || true)

    [ -z "$addresses" ] && return 1

    local expected_count=1

    if [ "$configured_hypervisor" = "qemu" ]; then
        # get_addresses() prints everything on one line, so count words.
        # (Counting lines always yielded 1, making this check a no-op.)
        # Only QEMU is checked: Cloud Hypervisor may legitimately report
        # more than one entry when a debug console is running.
        local count
        count=$(echo "$addresses" | wc -w || true)

        [ "$count" -eq "$expected_count" ] || return 1

        local cid
        local port

        cid=$(echo "$addresses"|cut -d: -f1)
        port=$(echo "$addresses"|cut -d: -f2)

        echo "vsock://${cid}:${port}"
    elif [ "$configured_hypervisor" = "clh" ]; then
        local address
        address=$(echo "$addresses" | awk 'NR==1{print $1}')
        echo "unix://${address}"
    else
        die "Cannot get agent vsock address, unknown hypervisor: '$configured_hypervisor'"
    fi

    return 0
}

# Print the agent address URI, dying with a diagnostic if it cannot be
# determined.
get_agent_vsock_address()
{
    local addresses
    addresses=$(get_addresses || true)

    [ -z "$addresses" ] && die "no VSOCK connections found"

    local expected_count=1

    if [ "$configured_hypervisor" = "qemu" ]; then
        # For QEMU we always expect 1 result. For Cloud Hypervisor, if a debug console is configured
        # and running, we will have more than 1 result, so only run this check for QEMU
        #
        # Count words: get_addresses() prints all results on a single line.
        local count
        count=$(echo "$addresses" | wc -w || true)

        [ "$count" -eq "$expected_count" ] \
            || die "expected $expected_count VSOCK entry, found $count: '$addresses'"

        local cid
        local port

        cid=$(echo "$addresses"|cut -d: -f1)
        port=$(echo "$addresses"|cut -d: -f2)

        echo "vsock://${cid}:${port}"
    elif [ "$configured_hypervisor" = "clh" ]; then
        local address
        address=$(echo "$addresses" | awk 'NR==1{print $1}')
        echo "unix://${address}"
    else
        die "Cannot get agent vsock address, unknown hypervisor: '$configured_hypervisor'"
    fi
}

# Ask the agent running inside the Kata VM to destroy the sandbox.
stop_agent_in_kata_vm()
{
    local agent_addr

    if [ "$dry_run" = 'true' ]
    then
        agent_addr=$(get_dry_run_agent_vsock_address)
    else
        agent_addr=$(get_agent_vsock_address || true)
        [ -z "$agent_addr" ] && \
            die "cannot determine agent VSOCK address for $hypervisor_binary"
    fi

    # List of API commands to send to the agent.
    local cmds=()

    # Run a couple of query commands first to ensure
    # the agent is listening.
    cmds+=("-c Check")
    cmds+=("-c GetGuestDetails")

    # Creating a container implies creating a sandbox, so request
    # agent/VM/container shutdown by asking the agent
    # to destroy the sandbox.
    cmds+=("-c DestroySandbox")

    run_agent_ctl \
        "${agent_addr}" \
        "${cmds[@]}"

    true
}

# Stop the agent using the mechanism matching how it was started.
#
# Parameters:
#   $1 - agent test type: 'local' or 'vm'.
#   $2 - agent-ctl log file (only checked for presence here).
stop_agent()
{
    info "Stopping agent"

    local agent_test_type="${1:-}"
    [ -z "$agent_test_type" ] && die "need agent test type"

    local log_file="${2:-}"
    [ -z "$log_file" ] && die "need agent-ctl log file"

    case "$agent_test_type" in
        'local') stop_local_agent ;;
        'vm') stop_agent_in_kata_vm ;;
        *) die "invalid agent test type: '$agent_test_type'" ;;
    esac

    true
}

# Print the PID of the running local agent process.
#
# Prints nothing (and succeeds) if no agent is running; dies if more than
# one matching process is found.
get_local_agent_pid()
{
    local pids

    local name
    name=$(basename "$agent_binary")

    pids=$(pgrep "$name" || true)
    [ -z "$pids" ] && return 0

    local count
    count=$(echo "$pids"|wc -l)

    [ "$count" -gt 1 ] && \
        die "too many agent processes running ($count, '$pids')"

    echo "$pids"
}

# Function that writes all agent logs to '$agent_log_file'.
#
# Parameters:
#   $1 - agent test type: 'local' or 'vm'.
#   $2 - agent log file; must exist and be non-empty on return.
get_agent_log_file()
{
    local agent_test_type="${1:-}"
    [ -z "$agent_test_type" ] && die "need agent test type"

    local log_file="${2:-}"
    [ -z "$log_file" ] && die "need agent log file"

    info "Getting agent log details"

    case "$agent_test_type" in
        # NOP: File should have been created by start_local_agent()
        'local') true ;;

        # Extract journal entries for the duration of the test
        'vm')
            sudo journalctl \
                -q \
                -a \
                -o cat \
                -t 'kata' \
                --since="$test_start_time" \
                > "$log_file"
            ;;

        *) die "invalid agent test type: '$agent_test_type'" ;;
    esac

    [ -e "$log_file" ] || die "no log file: '$log_file'"
    [ -s "$log_file" ] || die "empty log file: '$log_file'"

    true
}

# Function to run to ensure correct behaviour
#
# Parameters:
#   $1 - agent test type: 'local' or 'vm'.
#   $2 - shutdown test type name.
#   $3 - agent log file to inspect.
#
# Dies if the log shows an agent failure, or if the agent shutdown message
# is expected but missing.
validate_agent()
{
    local agent_test_type="${1:-}"
    local shutdown_test_type="${2:-}"
    local log_file="${3:-}"

    [ -z "$agent_test_type" ] && die "need agent test type"
    [ -z "$shutdown_test_type" ] && die "need shutdown test type"
    [ -z "$log_file" ] && die "need agent log file"

    info "validating"

    get_agent_log_file \
        "$agent_test_type" \
        "$log_file"

    # Regular expression that describes possible agent failures
    local regex="(slog::Fuse|Drain|Custom|serialization error|thread.*panicked|stack backtrace:)"

    if grep -E -q "$regex" "$log_file"
    then
        cat "$log_file"
        die "Found agent error in log file: '$log_file'"
    fi

    local entry
    entry=$(get_shutdown_test_type_entry "$shutdown_test_type" || true)
    [ -z "$entry" ] && die "invalid test type: '$shutdown_test_type'"

    local hypervisor_debug
    local vsock_console

    hypervisor_debug=$(echo "$entry"|cut -d: -f3)
    vsock_console=$(echo "$entry"|cut -d: -f4)

    local agent_debug_logs_available='false'

    [ "$hypervisor_debug" = 'true' ] && \
        [ "$vsock_console" = 'false' ] && \
        agent_debug_logs_available='true'

    if [ "$agent_debug_logs_available" = 'true' ] || [ "$agent_test_type" = 'local' ]
    then
        # The message the agent writes to stderr just before it exits.
        #
        # NOTE(review): this pattern was garbled in the copy this was
        # restored from (it read '"\"'); 'shutdown complete' as whole words
        # is the presumed original - confirm against the upstream script.
        local done_msg="\<shutdown complete\>"

        # Call die() from the current shell: inside "( ... )" it would only
        # terminate the subshell rather than the test.
        if ! grep -E -q "$done_msg" "$log_file"
        then
            cat "$log_file"
            die "missing agent shutdown message"
        fi
    else
        # We can only check for the shutdown message if the agent debug
        # logs are available.
        info "Not checking for agent shutdown message as hypervisor debug disabled"
    fi
}

# Prepare the environment for one test run.
#
# Parameters:
#   $1 - shutdown test type name.
setup_agent()
{
    local shutdown_test_type="${1:-}"
    [ -z "$shutdown_test_type" ] && die "need shutdown test type"

    kill_tmux_sessions

    configure_kata "$shutdown_test_type"

    true
}

# Even though this test is not testing tracing, agent tracing needs to be
# enabled to stop the runtime from killing the VM. However, if tracing is
# enabled, the forwarder must be running. To remove the need for Jaeger to
# also be running, run the forwarder in "NOP" mode.
# Start the trace forwarder in "dump only" mode inside a detached tmux
# session.
run_trace_forwarder()
{
    local forwarder_binary_path
    forwarder_binary_path="/opt/kata/bin/kata-trace-forwarder"

    local socket_path_tf=""

    # If using CLH, socket path must be passed to trace forwarder
    if [ "$configured_hypervisor" = "clh" ]; then
        socket_path_tf="--socket-path ${clh_socket_path}"
    fi

    run_cmd \
        "tmux new-session \
        -d \
        -s \"$KATA_TMUX_FORWARDER_SESSION\" \
        sudo \"$forwarder_binary_path --dump-only -l trace ${socket_path_tf}\""
}

# Wait (up to "$shutdown_time_secs") for the agent to stop.
#
# Parameters:
#   $1 - agent test type: 'local' or 'vm'.
check_agent_stopped()
{
    info "Checking agent stopped"

    local agent_test_type="${1:-}"
    [ -z "$agent_test_type" ] && die "need agent test type"

    local cmd=

    case "$agent_test_type" in
        'local') cmd=check_local_agent_stopped ;;
        'vm') cmd=check_vm_stopped ;;
        *) die "invalid agent test type: '$agent_test_type'" ;;
    esac

    waitForProcess \
        "$shutdown_time_secs" \
        "$sleep_time_secs" \
        "$cmd"

    true
}

# Succeed once the local agent process has gone; die if it is still alive
# after 20 probes spaced 0.2 seconds apart.
check_local_agent_stopped()
{
    local ret=0
    local max=20

    agent_ended="false"

    local agent_pid
    agent_pid=$(get_local_agent_pid || true)

    # Agent has finished
    [ -z "$agent_pid" ] && return 0

    for _ in $(seq "$max")
    do
        # "kill -0" only tests whether the process exists.
        { sudo kill -0 "$agent_pid"; ret=$?; } || true

        [ "$ret" -ne 0 ] && agent_ended="true" && break

        sleep 0.2
    done

    [ "$agent_ended" = "false" ] && die "agent still running: pid $agent_pid" || true
}

# Print the PID(s) of processes matching the hypervisor binary name.
get_vm_pid()
{
    pgrep "$hypervisor_binary"
}

# Succeed if the tmux session that runs the Kata VM container no longer
# exists.
check_vm_stopped()
{
    # tmux complains on stderr when no server is running; that simply
    # means the session is gone, so hide the noise.
    tmux list-sessions 2>/dev/null |\
        grep -q "^${KATA_TMUX_VM_SESSION}:" \
        && return 1

    return 0
}

# Connect to the agent's debug console where that is possible.
#
# Parameters:
#   $1 - agent test type: 'local' or 'vm'.
#   $2 - shutdown test type name.
start_debug_console()
{
    local agent_test_type="${1:-}"
    local shutdown_test_type="${2:-}"

    [ -z "$agent_test_type" ] && die "need agent test type"
    [ -z "$shutdown_test_type" ] && die "need shutdown test type"

    info "Starting debug console"

    case "$agent_test_type" in
        'vm') connect_to_vsock_debug_console ;;
        # NOP for a local agent since we cannot connect to the agents
        # VSOCK console socket from *outside* the host!
        'local') true ;;
        *) die "invalid agent test type: '$agent_test_type'" ;;
    esac

    true
}

# Run one complete start / stop / validate cycle.
#
# Parameters:
#   $1 - agent test type: 'local' or 'vm'.
#   $2 - shutdown test type name.
#
# In dry-run mode the script exits once the stop commands have been shown.
run_single_agent()
{
    local agent_test_type="${1:-}"
    local shutdown_test_type="${2:-}"

    [ -z "$agent_test_type" ] && die "need agent test type"
    [ -z "$shutdown_test_type" ] && die "need shutdown test type"

    local msg
    msg=$(printf \
        "Testing agent (agent test type: '%s', shutdown test type: '%s')" \
        "$agent_test_type" \
        "$shutdown_test_type")
    info "$msg"

    setup_agent "$shutdown_test_type"

    if [ "$configured_hypervisor" = "clh" ]; then
        # CLH uses hybrid VSOCK which uses a local UNIX socket that we need to specify
        local socket_path_template
        socket_path_template=$clh_socket_prefix$(sudo kata-runtime env --json | jq '.Hypervisor.SocketPath')

        # Not local: other functions in this script read "$clh_socket_path".
        clh_socket_path=$(echo "$socket_path_template" | sed "s/{ID}/${container_id}/g" | tr -d '"')

        if [ "$dry_run" = 'false' ]; then
            sudo mkdir -p "$(dirname "$clh_socket_path")"
        fi
    fi

    run_trace_forwarder "$shutdown_test_type"

    sleep 5s

    test_start_time=$(date '+%F %T')

    start_agent \
        "$agent_test_type" \
        "$agent_log_file"

    info "Testing agent: shutdown test type: '$shutdown_test_type', agent test type: $agent_test_type"

    local entry
    entry=$(get_shutdown_test_type_entry "$shutdown_test_type" || true)

    local debug_console
    debug_console=$(echo "$entry"|cut -d: -f4)

    if [ "$debug_console" = 'true' ]; then
        start_debug_console \
            "$agent_test_type" \
            "$shutdown_test_type"
    fi

    stop_agent \
        "$agent_test_type" \
        "$ctl_log_file"

    # We only need to show the set of commands once
    [ "$dry_run" = 'true' ] && exit 0

    test_end_time=$(date '+%F %T')

    check_agent_stopped "$agent_test_type"

    validate_agent \
        "$agent_test_type" \
        "$shutdown_test_type" \
        "$agent_log_file"
}

# Run either a single shutdown test type, or every known type.
#
# Parameters:
#   $1 - agent test type: 'local' or 'vm'.
#   $2 - shutdown test type name, or "$ALL_TEST_TYPES".
run_agent()
{
    local agent_test_type="${1:-}"
    local shutdown_test_type="${2:-}"

    [ -z "$agent_test_type" ] && die "need agent test type"
    [ -z "$shutdown_test_type" ] && die "need shutdown test type"

    case "$shutdown_test_type" in
        "$ALL_TEST_TYPES")
            local entry

            # Run all shutdown types
            for entry in "${shutdown_test_types[@]}"
            do
                local name
                name=$(echo "$entry"|cut -d: -f1)

                run_single_agent \
                    "$agent_test_type" \
                    "$name"

                # Clean up between iterations
                sudo rm -f \
                    "$ctl_log_file" \
                    "$agent_log_file"

                local addresses
                addresses=$(get_addresses || true)

                [ -z "$addresses" ] || \
                    die "found unexpected vsock addresses: '$addresses'"

            done
            ;;

        *)
            run_single_agent \
                "$agent_test_type" \
                "$shutdown_test_type"
            ;;
    esac

}

# Run the agent shutdown test a number of times.
#
# Parameters:
#   $1 - number of runs (non-negative integer).
#   $2 - agent test type: 'local' or 'vm'.
#   $3 - shutdown test type name, or "$ALL_TEST_TYPES".
test_agent_shutdown()
{
    local count="${1:-}"
    local agent_test_type="${2:-}"
    local shutdown_test_type="${3:-}"

    [ -z "$count" ] && die "need count"
    [ -z "$agent_test_type" ] && die "need agent test type"
    [ -z "$shutdown_test_type" ] && die "need shutdown test type"

    # A non-numeric count makes seq(1) fail, the loop below then runs zero
    # times and the test would still report success.
    case "$count" in
        *[!0-9]*) die "invalid count: '$count'" ;;
    esac

    # Start with a clean environment
    [ "$dry_run" = 'false' ] && cleanup initial || true

    local i

    for i in $(seq "$count")
    do
        [ "$dry_run" = 'false' ] && \
            info "testing agent: run $i of $count" || true
        run_agent \
            "$agent_test_type" \
            "$shutdown_test_type"
    done

    info "testing agent: completed $count runs"
}

# Parse the command-line options, then set up and run the tests.
handle_args()
{
    local opt

    local count="${KATA_AGENT_SHUTDOWN_TEST_COUNT}"
    local shutdown_test_type="$DEFAULT_SHUTDOWN_TEST_TYPE"
    local agent_test_type="$DEFAULT_AGENT_TEST_TYPE"

    while getopts "a:c:dhklnt:" opt "$@"
    do
        case "$opt" in
            a) agent_test_type="$OPTARG" ;;
            c) count="$OPTARG" ;;
            d) set -o xtrace ;;
            h) usage; exit 0 ;;
            k) keep_logs='true' ;;
            l) list_test_types; exit 0 ;;
            n) dry_run='true' ;;
            t) shutdown_test_type="$OPTARG" ;;
            *) die "invalid option: '$opt'" ;;
        esac
    done

    setup

    test_agent_shutdown \
        "$count" \
        "$agent_test_type" \
        "$shutdown_test_type"
}

main()
{
    handle_args "$@"
}

main "$@"
diff --git a/tests/functional/tracing/tracing-test.sh b/tests/functional/tracing/tracing-test.sh
new file mode 100755
index 0000000000..e8256ba381
--- /dev/null
+++ 
b/tests/functional/tracing/tracing-test.sh @@ -0,0 +1,540 @@ +#!/bin/bash +# Copyright (c) 2019-2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +set -o errexit +set -o nounset +set -o pipefail +set -o errtrace + +script_name=${0##*/} +source "/etc/os-release" || source "/usr/lib/os-release" + +# Set to true if all tests pass +success="false" + +DEBUG=${DEBUG:-} + +# If set to any value, do not shut down the Jaeger service. +DEBUG_KEEP_JAEGER=${DEBUG_KEEP_JAEGER:-} +# If set to any value, do not shut down the trace forwarder. +DEBUG_KEEP_FORWARDER=${DEBUG_KEEP_FORWARDER:-} + +[ -n "$DEBUG" ] && set -o xtrace + +SCRIPT_PATH=$(dirname "$(readlink -f "$0")") +source "${SCRIPT_PATH}/../../common.bash" + +RUNTIME="io.containerd.kata.v2" +CONTAINER_IMAGE="quay.io/prometheus/busybox:latest" + +TRACE_LOG_DIR=${TRACE_LOG_DIR:-${KATA_TESTS_LOGDIR}/traces} + +KATA_HYPERVISOR="${KATA_HYPERVISOR:-qemu}" + +# files for output +formatted_traces_file="kata-traces-formatted.json" +trace_summary_file="span-summary.txt" + +# tmux(1) session to run the trace forwarder in +KATA_TMUX_FORWARDER_SESSION="kata-trace-forwarder-session" + +forwarder_binary="/opt/kata/bin/kata-trace-forwarder" + +# path prefix for CLH socket path +socket_path_prefix="/run/vc/vm/" + +container_id="tracing-test" + +jaeger_server=${jaeger_server:-localhost} +jaeger_ui_port=${jaeger_ui_port:-16686} +jaeger_docker_container_name="jaeger" + +# Span data for testing: +# 1. Existence of spans in jaeger output +# 2. That the relative ordering in the data occurs +# in the jaeger output. +# This is tested to make sure specific spans exist in the output and +# that the order of spans is preserved. +# Ordered in latest in sequence to earliest. +# +# Fields are all span existing span names in relative order from latest +# to earliest call in a sequence of calls. 
Example (pseudocode): +# func1() { +# span = trace("func1") +# func2() +# end span +# } +# func2() { +# span = trace("func2") +# func3() +# end span +# } +# func3() { +# span = trace("func3") +# end span +# } +# The following data should result in a passing test: +# 'func3:func2:func1' +# 'func3:func2' +# 'func3:func1' +# 'func2:func1' +span_ordering_data=( + 'StartVM:createSandboxFromConfig:create:rootSpan' + 'setup_shared_namespaces:StartVM:createSandboxFromConfig:create:rootSpan' + 'start_container:Start:rootSpan' + 'stopSandbox:Stop:Start:rootSpan' +) + +# Cleanup will remove Jaeger container and +# disable tracing. +cleanup() +{ + local fp="die" + local result="failed" + local dest="$logdir" + + if [ "$success" = "true" ]; then + local fp="info" + result="passed" + + [ -z "$DEBUG_KEEP_JAEGER" ] && stop_jaeger 2>/dev/null || true + + [ -z "$DEBUG_KEEP_FORWARDER" ] && kill_trace_forwarder + + # The tests worked so remove the logs + if [ -n "$DEBUG" ]; then + eval "$fp" "test $result - logs left in '$dest'" + else + "${SCRIPT_PATH}/configure_tracing_for_kata.sh" disable + + [ -d "$logdir" ] && rm -rf "$logdir" || true + fi + + return 0 + fi + + if [ -n "${CI:-}" ]; then + # Running under the CI, so copy the logs to allow them + # to be added as test artifacts. 
+ sudo mkdir -p "$TRACE_LOG_DIR" + sudo cp -a "$logdir"/* "$TRACE_LOG_DIR" + + dest="$TRACE_LOG_DIR" + fi + + eval "$fp" "test $result - logs left in '$dest'" +} + +# Run an operation to generate Jaeger trace spans +create_traces() +{ + sudo ctr image pull "$CONTAINER_IMAGE" + sudo ctr run --runtime "$RUNTIME" --rm "$CONTAINER_IMAGE" "$container_id" true +} + +start_jaeger() +{ + local jaeger_docker_image="jaegertracing/all-in-one:latest" + + sudo docker rm -f "${jaeger_docker_container_name}" + + # Defaults - see https://www.jaegertracing.io/docs/getting-started/ + sudo docker run -d --runtime runc --name "${jaeger_docker_container_name}" \ + -e COLLECTOR_ZIPKIN_HTTP_PORT=9411 \ + -p 5775:5775/udp \ + -p 6831:6831/udp \ + -p 6832:6832/udp \ + -p 5778:5778 \ + -p "${jaeger_ui_port}:${jaeger_ui_port}" \ + -p 14268:14268 \ + -p 9411:9411 \ + "$jaeger_docker_image" + + sudo mkdir -m 0750 -p "$TRACE_LOG_DIR" +} + +stop_jaeger() +{ + sudo docker stop "${jaeger_docker_container_name}" + sudo docker rm -f "${jaeger_docker_container_name}" +} + +get_jaeger_traces() +{ + local service="$1" + [ -z "$service" ] && die "need jaeger service name" + + local traces_url="http://${jaeger_server}:${jaeger_ui_port}/api/traces?service=${service}" + curl -s "${traces_url}" 2>/dev/null +} + +get_trace_summary() +{ + local status="$1" + [ -z "$status" ] && die "need jaeger status JSON" + + echo "${status}" | jq -S '.data[].spans[] | [.spanID, .operationName] | @sh' +} + +get_span_count() +{ + local status="$1" + [ -z "$status" ] && die "need jaeger status JSON" + + # This could be simplified but creating a variable holding the + # summary is useful in debug mode as the summary is displayed. 
+ local trace_summary=$(get_trace_summary "$status" || true) + + [ -z "$trace_summary" ] && die "failed to get trace summary" + + local count=$(echo "${trace_summary}" | wc -l) + + [ -z "$count" ] && count=0 + + echo "$count" +} + +# Returns status from Jaeger web UI +get_jaeger_status() +{ + local service="$1" + local logdir="$2" + + [ -z "$service" ] && die "need jaeger service name" + [ -z "$logdir" ] && die "need logdir" + + local status="" + local span_count=0 + + # Find spans + status=$(get_jaeger_traces "$service" || true) + if [ -n "$status" ]; then + echo "$status" | tee "$logdir/${service}-status.json" + span_count=$(get_span_count "$status") + fi + + [ -z "$status" ] && die "failed to query Jaeger for status" + [ "$span_count" -eq 0 ] && die "failed to find any trace spans" + [ "$span_count" -le 0 ] && die "invalid span count" + + get_trace_summary "$status" > "$logdir/$trace_summary_file" +} + +# Check Jaeger spans for the specified service. +check_jaeger_output() +{ + local service="$1" + local min_spans="$2" + local logdir="$3" + + [ -z "$service" ] && die "need jaeger service name" + [ -z "$min_spans" ] && die "need minimum trace span count" + [ -z "$logdir" ] && die "need logdir" + + local status + local errors=0 + + info "Checking Jaeger status" + + status=$(get_jaeger_status "$service" "$logdir") + + #------------------------------ + # Basic sanity checks + [ -z "$status" ] && die "failed to query status via HTTP" + + local span_lines=$(echo "$status"|jq -S '.data[].spans | length') + [ -z "$span_lines" ] && die "no span status" + + # Log the spans to allow for analysis in case the test fails + echo "$status"|jq -S . 
> "$logdir/${service}-traces-formatted.json" + + local span_lines_count=$(echo "$span_lines"|wc -l) + + # Total up all span counts + local spans=$(echo "$span_lines"|paste -sd+ -|bc) + [ -z "$spans" ] && die "no spans" + + # Ensure total span count is numeric + echo "$spans"|grep -q "^[0-9][0-9]*$" || die "invalid span count: '$spans'" + + info "found $spans spans (across $span_lines_count traces)" + + # Validate + [ "$spans" -lt "$min_spans" ] && die "expected >= $min_spans spans, got $spans" + + # Look for common errors in span data + local error_msg=$(echo "$status"|jq -S . 2>/dev/null|grep "invalid parent span" || true) + + if [ -n "$error_msg" ]; then + errors=$((errors+1)) + warn "Found invalid parent span errors: $error_msg" + else + errors=$((errors-1)) + [ "$errors" -lt 0 ] && errors=0 + fi + + # Crude but it works + error_or_warning_msgs=$(echo "$status" |\ + jq -S . 2>/dev/null |\ + jq '.data[].spans[].warnings' |\ + grep -E -v "\" |\ + grep -E -v "\[" |\ + grep -E -v "\]" |\ + grep -E -v "clock skew" || true) # ignore clock skew error + + if [ -n "$error_or_warning_msgs" ]; then + errors=$((errors+1)) + warn "Found errors/warnings: $error_or_warning_msgs" + else + errors=$((errors-1)) + [ "$errors" -lt 0 ] && errors=0 + fi + + [ "$errors" -eq 0 ] || die "errors detected" +} + +# Check output for spans in span_ordering_data +check_spans() +{ + local logdir="$1" + [ -z "$logdir" ] && die "need logdir" + + local errors=0 + + # Check for existence of spans in output so we do not do the more + # time consuming test of checking span ordering if it will fail + info "Checking spans: ${span_ordering_data[@]}" + local missing_spans=() + for span_ordering in "${span_ordering_data[@]}"; do + local test_spans=(`echo $span_ordering | tr ':' ' '`) + for s in "${test_spans[@]}"; do + grep -q \'$s\' "$logdir/$trace_summary_file" || missing_spans+=( "$s" ) + done + done + if [ "${#missing_spans[@]}" -gt 0 ]; then + die "Fail: Missing spans: ${missing_spans[@]}" + fi + + 
# Check relative ordering of spans. We are not checking full trace, just + # that known calls are not out of order based on the test input. + for span_ordering in "${span_ordering_data[@]}"; do # runs maximum length of span_ordering_data + local test_spans=(`echo $span_ordering | tr ':' ' '`) + + # create array for span IDs that match span string + local span_ids=() + for span in "${test_spans[@]}"; do + grep -q \'$span\' "$logdir/$trace_summary_file" || die "Fail: Missing span: $span" + id=$(cat "$logdir/$formatted_traces_file" | jq ".data[].spans[] | select(.operationName==\"$span\") | .spanID") || die "Fail: error with span $span retrieved from traces" + id_formatted=$(echo $id | tr -d '\"' | tr '\n' ':') # format to a string for parsing later, not an array + span_ids+=("$id_formatted") + done + + # We now have 2 parallel arrays where test_spans[n] is the string name and + # span_ids[n] has all possible span IDs for that string separated by a colon + + # Since functions can be called multiple times, we may have multiple results + # for span IDs. 
+ initial_span_ids=(`echo ${span_ids[0]} | tr ':' ' '`) + for initial in "${initial_span_ids[@]}"; do # test parents for all initial spans + # construct array of all parents of first span + local retrieved_spans=() + local current_span="$initial" + [ "$current_span" != "" ] || break + + MAX_DEPTH=20 # to prevent infinite loop due to unforeseen errors + for i in `seq 1 $MAX_DEPTH`; do + retrieved_spans+=("$current_span") + current_span=$(cat "$logdir/$formatted_traces_file" | jq ".data[].spans[] | select(.spanID==\"$current_span\") | .references[].spanID") || die "Fail: error with current_span $current_span retrieved from formatted traces" + [ "$current_span" != "" ] || break + current_span=$(echo $current_span | tr -d '"') + [ $i -lt $MAX_DEPTH ] || die "Fail: max depth reached, error in jq or adjust test depth" + done + + # Keep track of this index so we can ensure we are testing the constructed array in order + # Increment when there is a match between test case and constructed path + local retrieved_ids_index=0 + + local matches=0 + local index=0 + + # TODO: Optimize + for ((index=0; index<${#span_ids[@]}; index++)); do + for ((r_index=$retrieved_ids_index; r_index<${#retrieved_spans[@]}; r_index++)); do + grep -q "${retrieved_spans[$r_index]}" <<< ${span_ids[$index]} && (( retrieved_ids_index=$r_index+1 )) && (( matches+=1 )) && break + done + done + + local last_initial_span_index=${#initial_span_ids[@]}-1 + if [ $matches -eq ${#span_ids[@]} ]; then + info "Pass: spans \"${test_spans[@]}\" found in jaeger output" + break + elif [ $matches -lt ${#span_ids[@]} ] && [ "$initial" = "${initial_span_ids[$last_initial_span_index]}" ]; then + die "Fail: spans \"${test_spans[@]}\" NOT in jaeger output" + fi + # else repeat test for next initial span ID + done + done + + +} + +run_trace_forwarder() +{ + if [ $KATA_HYPERVISOR = "qemu" ]; then + tmux new-session -d -s "$KATA_TMUX_FORWARDER_SESSION" "sudo $forwarder_binary -l trace" + elif [ $KATA_HYPERVISOR = "clh" ]; 
then + # CLH uses hybrid VSOCK which uses a local UNIX socket that we need to specify + socket_path_template=$socket_path_prefix$(sudo kata-runtime env --json | jq '.Hypervisor.SocketPath') + socket_path=$(echo "$socket_path_template" | sed "s/{ID}/${container_id}/g" | tr -d '"') + sudo mkdir -p $(dirname "$socket_path") + + tmux new-session -d -s "$KATA_TMUX_FORWARDER_SESSION" "sudo $forwarder_binary -l trace --socket-path $socket_path" + else + die "Unsupported hypervisor $KATA_HYPERVISOR" + fi + + info "Verifying trace forwarder in tmux session $KATA_TMUX_FORWARDER_SESSION" + + local cmd="tmux capture-pane -pt $KATA_TMUX_FORWARDER_SESSION | tr -d '\n' | tr -d '\"' | grep -q \"source:kata-trace-forwarder\"" + waitForProcess 10 1 "$cmd" +} + +kill_trace_forwarder() +{ + tmux kill-session -t "$KATA_TMUX_FORWARDER_SESSION" +} + +setup() +{ + # containerd must be running in order to use ctr to generate traces + restart_containerd_service + + local cmds=() + # For container manager (containerd) + cmds+=('ctr') + # For jaeger + cmds+=('docker') + # For launching processes + cmds+=('tmux') + + local cmd + for cmd in "${cmds[@]}" + do + local result + result=$(command -v "$cmd" || true) + [ -n "$result" ] || die "need $cmd" + done + + run_trace_forwarder + + start_jaeger + + "${SCRIPT_PATH}/configure_tracing_for_kata.sh" enable +} + +run_test() +{ + local service="$1" + local min_spans="$2" + local logdir="$3" + + [ -z "$service" ] && die "need service name" + [ -z "$min_spans" ] && die "need minimum span count" + [ -z "$logdir" ] && die "need logdir" + + info "Running test for service '$service'" + + logdir="$logdir/$service" + mkdir -p "$logdir" + + check_jaeger_output "$service" "$min_spans" "$logdir" + check_spans "$logdir" + + info "test passed" +} + +run_tests() +{ + # List of services to check + # + # Format: "name:min-spans" + # + # Where: + # + # - 'name' is the Jaeger service name. 
+ # - 'min-spans' is an integer representing the minimum number of + # trace spans this service should generate. + # + # Notes: + # + # - Uses an array to ensure predictable ordering. + # - All services listed are expected to generate traces + # when create_traces() is called a single time. + local -a services + + services+=("kata:125") + + create_traces + + logdir=$(mktemp -d) + + for service in "${services[@]}" + do + local name=$(echo "${service}"|cut -d: -f1) + local min_spans=$(echo "${service}"|cut -d: -f2) + + run_test "${name}" "${min_spans}" "${logdir}" + done + + info "all tests passed" + success="true" +} + +usage() +{ + cat <] + +Commands: + + clean - Perform cleanup phase only. + help - Show usage. + run - Only run tests (no setup or cleanup). + setup - Perform setup phase only. + +Environment variables: + + CI - if set, save logs of all tests to ${TRACE_LOG_DIR}. + DEBUG - if set, enable tracing and do not cleanup after tests. + DEBUG_KEEP_JAEGER - if set, do not shut down the Jaeger service. + DEBUG_KEEP_FORWARDER - if set, do not shut down the trace forwarder. + +Notes: + - Runs all test phases if no arguments are specified. 
+ +EOF +} + +main() +{ + local cmd="${1:-}" + + case "$cmd" in + clean) success="true"; cleanup; exit 0;; + help|-h|-help|--help) usage; exit 0;; + run) run_tests; exit 0;; + setup) setup; exit 0;; + esac + + trap cleanup EXIT + + setup + + run_tests +} + +main "$@" diff --git a/tests/integration/docker/gha-run.sh b/tests/integration/docker/gha-run.sh index cab3401ea1..0a01006684 100755 --- a/tests/integration/docker/gha-run.sh +++ b/tests/integration/docker/gha-run.sh @@ -16,21 +16,7 @@ source "${docker_dir}/../../common.bash" function install_dependencies() { info "Installing the dependencies needed for running the docker smoke test" - # Add Docker's official GPG key: - sudo apt-get update - sudo apt-get -y install ca-certificates curl gnupg - sudo install -m 0755 -d /etc/apt/keyrings - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg - sudo chmod a+r /etc/apt/keyrings/docker.gpg - - # Add the repository to Apt sources: - echo \ - "deb [arch="$(dpkg --print-architecture)" signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ - "$(. /etc/os-release && echo "$VERSION_CODENAME")" stable" | \ - sudo tee /etc/apt/sources.list.d/docker.list > /dev/null - sudo apt-get update - - sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin + install_docker } function run() {