#!/bin/bash
# Copyright (c) 2021 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
#---------------------------------------------------------------------
# Description: Test the Kata Containers 2.x rust agent shutdown behaviour.
#
# Normally, the kata-agent process running inside the VM is not shut down;
# once the workload ends and the agent has returned the workload return
# value back to the runtime, the runtime simply kills the VM. This is safe
# since nothing the user cares about is running any more.
#
# However, for agent tracing, a graceful agent shutdown is necessary to ensure
# all trace spans are generated. When *static* agent tracing is enabled, the
# runtime relies entirely on the agent to perform a graceful shutdown _and_
# shut down the VM.
#
# This script tests the kata-agent in two ways:
#
# - "manually" / "standalone" where the agent binary is run directly.
# - Inside a Kata VM, started by a shimv2-capable container manager
#   (containerd).
#
# In both cases, the agent is shut down using the agent-ctl tool
# to request the agent shut down gracefully.
#
# Various configuration options are also tested. One of these enables
# the agent's built-in (VSOCK) debug console. This test not only enables
# the option but also connects to the created console.
#
# Since this script needs to start various programs with a terminal,
# it uses tmux(1) consistently to simplify the handling logic.
#---------------------------------------------------------------------

# Name of this script (without any leading directories); used by usage().
readonly script_name=${0##*/}

# Strict mode: abort on errors, unset variables and failed pipeline stages,
# and make ERR traps visible inside functions.
set -o errexit
set -o nounset
set -o pipefail
set -o errtrace

# Shared helpers come from the common library, located relative to this
# script.
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
source "${SCRIPT_PATH}/../../common.bash"
source "/etc/os-release" || source "/usr/lib/os-release"

CTR_RUNTIME=${CTR_RUNTIME:-"io.containerd.kata.v2"}

# Kata always uses this value
EXPECTED_VSOCK_PORT="1024"

DOCKER_IMAGE=${DOCKER_IMAGE:-"busybox"}
CTR_IMAGE=${CTR_IMAGE:-"quay.io/prometheus/busybox:latest"}

# Number of times the test should be run
KATA_AGENT_SHUTDOWN_TEST_COUNT=${KATA_AGENT_SHUTDOWN_TEST_COUNT:-1}

# Default VSOCK port used by the agent
KATA_AGENT_VSOCK_CONSOLE_PORT=${KATA_AGENT_VSOCK_CONSOLE_PORT:-1026}

# The shutdown test type that represents a "default" / vanilla Kata
# installation (where no debug options are enabled).
VANILLA_TEST_TYPE='default'

# Name of tmux(1) sessions to create to run Kata VM and local agent in
KATA_TMUX_VM_SESSION="kata-shutdown-test-vm-session"
KATA_TMUX_LOCAL_SESSION="kata-shutdown-test-local-agent-session"

# Name of tmux(1) session to create to run a debug console in
KATA_TMUX_CONSOLE_SESSION="kata-shutdown-test-console-session"

# tmux(1) session to run the trace forwarder in
KATA_TMUX_FORWARDER_SESSION="kata-shutdown-test-trace-forwarder-session"

KATA_HYPERVISOR="${KATA_HYPERVISOR:-qemu}"

# List of test types used by configure_kata().
#
# Each element contains four colon delimited fields:
#
# 1: Name.
# 2: Whether debug should be enabled for the agent+runtime.
# 3: Whether hypervisor debug should be enabled.
#    (handled separately due to a previous bug which blocked agent shutdown).
# 4: Whether a VSOCK debug console should be configured and used.
#
# Notes:
#
# - Tests are run in the order found in this array.
# - An array is used (rather than a hash) to ensure the standard/vanilla
#   configuration is run *last*.
#   The reason for this is that debug is needed to diagnose shutdown
#   errors, so there is no point in running the default scenario first,
#   in case it fails (and is thus "undebuggable").
shutdown_test_types=(
    'with-debug:true:false:false'
    'with-debug-console:false:false:true'
    'with-hypervisor-debug:true:true:false'
    'with-everything:true:true:true'
    "${VANILLA_TEST_TYPE}:false:false:false"
)

# Number of fields each entry in the 'shutdown_test_types' array should have.
shutdown_test_type_fields=4

# Pseudo test type name that represents all test types defined
# in the 'shutdown_test_types' array.
ALL_TEST_TYPES='all'

DEFAULT_SHUTDOWN_TEST_TYPE="${ALL_TEST_TYPES}"

# List of ways of running the agent:
#
# Each element contains two colon delimited fields:
#
# 1: Name used for a particular way of running the agent.
# 2: Description.
agent_test_types=(
    'local:Run agent using agent-ctl tool'
    'vm:Run agent inside a Kata Container'
)

# Default value from the 'agent_test_types' array.
DEFAULT_AGENT_TEST_TYPE='vm'

# Set by every call to run_single_agent()
test_start_time=
test_end_time=

#-------------------------------------------------------------------------------
# Settings

# values used to wait for local and VM processes to start and end.
wait_time_secs=${WAIT_TIME_SECS:-20}
sleep_time_secs=${SLEEP_TIME_SECS:-1}

# Time to allow for the agent and VM to shutdown
shutdown_time_secs=${SHUTDOWN_TIME_SECS:-120}

# Name for the container that will be created
container_id="${CONTAINER_ID:-kata-agent-shutdown-test}"

# If 'true', don't run any commands, just show what would be run.
dry_run="${DRY_RUN:-false}"

# If 'true', don't remove logs on a successful run.
keep_logs="${KEEP_LOGS:-false}"

# Name of socket file used by a local agent.
agent_socket_file="kata-agent.socket"

# Kata Agent socket URI.
#
# Notes:
#
# - The file is an abstract socket
#   (meaning it is not visible in the filesystem).
#
# - The agent and the agent-ctl tool use slightly different
#   address formats for abstract sockets.
local_agent_server_addr="unix://${agent_socket_file}"
local_agent_ctl_server_addr="unix://@${agent_socket_file}"

# Address that is dynamically configured when using CLH before
# starting trace forwarder or container
clh_socket_path=
clh_socket_prefix="/run/vc/vm/"

ctl_log_file="${PWD}/agent-ctl.log"

# Log file that must contain agent output.
agent_log_file="${PWD}/kata-agent.log"

# Set in setup() based on KATA_HYPERVISOR
# Supported hypervisors are qemu and clh
configured_hypervisor=
# String that would appear in config file (qemu or clh)
configured_hypervisor_cfg=

# Full path to directory containing an OCI bundle based on "$DOCKER_IMAGE",
# which is required by the agent control tool.
bundle_dir=${BUNDLE_DIR:-""}

#---------------------------------------
# Default values

default_arch=$(uname -m)
arch="${arch:-${default_arch}}"

#-------------------------------------------------------------------------------

agent_binary="/usr/bin/kata-agent"

# Maximum debug level
default_agent_log_level="trace"

agent_log_level=${agent_log_level:-${default_agent_log_level}}

# Full path to the main configuration file (set by setup()).
kata_cfg_file=

# Set in setup() based on KATA_HYPERVISOR
hypervisor_binary=

#-------------------------------------------------------------------------------

[ -n "${DEBUG:-}" ] && set -o xtrace

# Display the help text on stdout.
#
# NOTE(review): the "Usage:" / "Summary:" lines and the <...> option argument
# placeholders were missing from the text this function was restored from;
# they have been reconstructed - confirm the wording.
usage()
{
    cat <<EOF
Usage: $script_name [options]

Summary: Run Kata Agent shutdown tests.

Options:

 -a <agent-test-type>    : Agent test type to use
                           (default: '$DEFAULT_AGENT_TEST_TYPE').
 -c <count>              : Run specified number of iterations
                           (default: $KATA_AGENT_SHUTDOWN_TEST_COUNT).
 -d                      : Enable debug (shell trace) output.
 -h                      : Show this help statement.
 -k                      : Keep logs on successful run
                           (default: logs will be deleted on success).
 -l                      : List all available agent and shutdown test types.
 -n                      : Dry-run mode - show the commands that would be run.
 -t <shutdown-test-type> : Only run the specified shutdown test type
                           (default: '$DEFAULT_SHUTDOWN_TEST_TYPE').

Notes:

- These tests should be run *before* the Kata Agent tracing tests, since if
  the agent cannot be shut down, static tracing will not work reliably.

- By default all shutdown test types are run, but only the default agent test
  type is run.

EOF
}

# Write a warning message to stderr.
warn()
{
    echo >&2 "WARNING: $*"
}

# Run the specified command, or if dry-run mode is enabled,
# just show the command that would be run.
#
# All arguments are joined with single spaces into one command line which
# is passed to eval, so callers may embed quoting and redirections.
run_cmd()
{
    # "$*" (not "$@") since a single string is wanted.
    local cmdline="$*"

    if [ "$dry_run" = 'true' ]
    then
        info "dry-run: Would run: '$cmdline'"
    else
        # Quoted so the command line is not word-split and glob-expanded
        # *before* eval parses it.
        eval "$cmdline"
    fi
}

# Show a subset of processes (for debugging)
show_procs()
{
    info "Processes"

    local hypervisor="qemu"
    [ "${configured_hypervisor}" = "clh" ] && hypervisor="cloud-hypervisor"

    local patterns=(
        "kata-agent-ctl"
        "${hypervisor}"
        "containerd"
        "ctr"
    )

    # Join the patterns into a single alternation.
    local regex
    regex="($(IFS='|'; echo "${patterns[*]}"))"

    ps -efww | grep -i -E "$regex" || true
}

# Kill all tmux(1) sessions this script may have created.
# Never fails: sessions that do not exist are ignored.
kill_tmux_sessions()
{
    local session

    for session in \
        "$KATA_TMUX_CONSOLE_SESSION" \
        "$KATA_TMUX_FORWARDER_SESSION" \
        "$KATA_TMUX_LOCAL_SESSION" \
        "$KATA_TMUX_VM_SESSION"
    do
        tmux kill-session -t "$session" &>/dev/null || true
    done

    true
}

# Print the 'shutdown_test_types' entry whose name matches the
# specified shutdown test type name.
#
# A trailing blank line is always printed, so callers using command
# substitution get the empty string if there is no such entry.
get_shutdown_test_type_entry()
{
    local shutdown_test_type="${1:-}"
    [ -z "$shutdown_test_type" ] && die "need shutdown test type name"

    local entry
    local count
    local name

    for entry in "${shutdown_test_types[@]}"
    do
        # Every entry is sanity checked until a match is found.
        count=$(echo "$entry"|tr ':' '\n'|wc -l)
        [ "$count" -eq "$shutdown_test_type_fields" ] \
            || die "expected $shutdown_test_type_fields fields, found $count: '$entry'"

        name=$(echo "$entry"|cut -d: -f1)

        if [ "$name" = "$shutdown_test_type" ]
        then
            echo "$entry"
            break
        fi
    done

    echo
}

# Display a table of all shutdown test types.
list_shutdown_test_types()
{
    local entry
    local name
    local debug_value
    local hypervisor_debug_value
    local debug_console_value

    printf "# Shutdown test types:\n\n"

    printf "%-24s %-15s %-23s %s\n\n" \
        "Test type" \
        "Debug enabled" \
        "Hypervisor debug" \
        "Debug console used"

    for entry in "${shutdown_test_types[@]}"
    do
        # Split the colon delimited entry into its four fields.
        IFS=: read -r \
            name \
            debug_value \
            hypervisor_debug_value \
            debug_console_value \
            <<<"$entry"

        printf "%-24s %-15s %-23s %s\n" \
            "$name" \
            "$debug_value" \
            "$hypervisor_debug_value" \
            "$debug_console_value"
    done

    echo
}

# Display a table of all agent test types, marking the default one.
list_agent_test_types()
{
    local entry
    local name
    local descr
    local msg

    printf "# Agent test types:\n\n"

    printf "%-12s %s\n\n" \
        "Agent type" \
        "Description"

    for entry in "${agent_test_types[@]}"
    do
        # The description is everything after the first colon.
        IFS=: read -r name descr <<<"$entry"

        msg=""
        [ "$name" = "$DEFAULT_AGENT_TEST_TYPE" ] && msg=" (default)"

        printf "%-12s %s%s.\n" \
            "$name" \
            "$descr" \
            "$msg"
    done

    echo
}

# Display all agent and shutdown test types.
list_test_types()
{
    list_agent_test_types
    list_shutdown_test_types
}

# Set Kata options according to test type.
#
# $1: name of an entry in the 'shutdown_test_types' array.
configure_kata()
{
    local shutdown_test_type="${1:-}"
    [ -z "$shutdown_test_type" ] && die "need shutdown test type"

    local entry
    local debug_value
    local hypervisor_debug_value
    local debug_console_value

    entry=$(get_shutdown_test_type_entry "$shutdown_test_type" || true)
    [ -z "$entry" ] && die "invalid test type: '$shutdown_test_type'"

    debug_value=$(echo "$entry"|cut -d: -f2)
    hypervisor_debug_value=$(echo "$entry"|cut -d: -f3)
    debug_console_value=$(echo "$entry"|cut -d: -f4)

    [ -z "$debug_value" ] && \
        die "need debug value for $shutdown_test_type"

    [ -z "$hypervisor_debug_value" ] && \
        die "need hypervisor debug value for $shutdown_test_type"

    [ -z "$debug_console_value" ] && \
        die "need debug console value for $shutdown_test_type"

    toggle_debug \
        "$debug_value" \
        "$hypervisor_debug_value"

    toggle_vsock_debug_console "$debug_console_value"

    # Enable agent tracing
    #
    # Even though this program only tests agent shutdown, static tracing
    # must be configured. This is because normally (with tracing
    # disabled), the runtime kills the VM after the workload has exited.
    # However, if static tracing is enabled, the runtime will not kill the
    # VM - the responsibility for shutting down the VM is given to the
    # agent process running inside the VM.

    local tracing='true'

    # We don't need to worry about the 'trace_mode' here since agent tracing
    # is *only* enabled if the 'enable_tracing' variable is set.
    [ "$shutdown_test_type" = "$VANILLA_TEST_TYPE" ] && tracing='false'

    run_cmd sudo crudini --set "${kata_cfg_file}" 'agent.kata' 'enable_tracing' "'$tracing'"
}

# Restore the configuration used by the vanilla test type.
unconfigure_kata()
{
    info "Resetting configuration to defaults"

    configure_kata "$VANILLA_TEST_TYPE"
}

# Enable/disable the agent's built-in VSOCK debug console
#
# $1: 'true' or 'false'.
toggle_vsock_debug_console()
{
    local value="${1:-}"
    [ -z "$value" ] && die "need debug console value"

    run_cmd sudo crudini --set "${kata_cfg_file}" \
        'agent.kata' 'debug_console_enabled' "$value"
}

# Enable/disable debug options.
#
# $1: value for the agent and runtime 'enable_debug' options.
# $2: value for the hypervisor 'enable_debug' option.
#
# Note: Don't use 'kata-manager.sh "enable-debug"' since this
# enables all debug (including the problematic hypervisor
# debug - see below).
toggle_debug()
{
    local value="${1:-}"
    local hypervisor_debug="${2:-}"

    [ -z "$value" ] && die "need value"
    [ -z "$hypervisor_debug" ] && die "need hypervisor debug value"

    # list of configuration.toml sections that have debug options we care about
    local debug_sections=(
        'agent.kata'
        'runtime'
    )

    local section

    for section in "${debug_sections[@]}"
    do
        run_cmd sudo crudini --set "$kata_cfg_file" "$section" \
            'enable_debug' "$value"
    done

    # XXX: Enabling hypervisor debug for QEMU will make a systemd debug
    # console service inoperable (*), but we need to test it anyhow.
    #
    # (*) - If enabled, it stops "kata-debug.service" from attaching to
    # the console and the socat call made on the client hangs until
    # the VM is shut down!
    section=$(printf "hypervisor.%s" "$configured_hypervisor_cfg")

    # Use this function's own parameter rather than a variable that only
    # happens to be visible when called from configure_kata().
    run_cmd sudo crudini --set "$kata_cfg_file" "$section" \
        'enable_debug' "$hypervisor_debug"
}

# Provide a "semi-valid" vsock address for when dry-run mode is active.
# The URI includes a message telling the user to change it and replace
# with the real VSOCK CID value.
get_dry_run_agent_vsock_address()
{
    echo "vsock://FIXME-CHANGE-TO-VSOCK-CID:${EXPECTED_VSOCK_PORT}"
}

# Start a debug console shell using the agent's built-in debug console
# feature.
#
# Note: You should be able to use "kata-runtime exec $cid", but that isn't
# working currently.
#
# NOTE(review): the address used here carries EXPECTED_VSOCK_PORT, while
# KATA_AGENT_VSOCK_CONSOLE_PORT is defined but never referenced - confirm
# which port the console connection is meant to use.
connect_to_vsock_debug_console()
{
    local agent_addr

    if [ "$dry_run" = 'true' ]
    then
        agent_addr=$(get_dry_run_agent_vsock_address)
    else
        agent_addr=$(get_agent_vsock_address || true)
        [ -z "$agent_addr" ] && die "cannot determine agent VSOCK address"
    fi

    local socat_connect=

    case "$configured_hypervisor" in
        'qemu')
            # Convert the URI scheme into a socat(1) address type.
            socat_connect="${agent_addr/#vsock:\/\//vsock-connect:}"
            ;;

        'clh')
            socat_connect="unix-connect:${clh_socket_path}"
            ;;

        *)
            die "Cannot configure address for socat, unknown hypervisor: '$configured_hypervisor'"
            ;;
    esac

    run_cmd \
        "tmux new-session \
        -d \
        -s \"$KATA_TMUX_CONSOLE_SESSION\" \
        \"socat \
        '${socat_connect}' \
        stdout\""
}

# Exit handler, also called directly (with argument 'initial') to
# clean the environment before the first test run.
#
# On failure, nothing is removed (to allow the problem to be debugged).
cleanup()
{
    # Save the result of the last call made before
    # this handler was called.
    #
    # XXX: This *MUST* be the first command in this function!
    local failure_ret="$?"

    [ "$dry_run" = 'true' ] && return 0

    if [ "$failure_ret" -eq 0 ] && [ "$keep_logs" = 'true' ]
    then
        info "SUCCESS: Test passed, but leaving logs:"
        info ""
        info "agent log file       : ${agent_log_file}"
        info "agent-ctl log file   : ${ctl_log_file}"
        info "OCI bundle directory : ${bundle_dir}"

        return 0
    fi

    local arg="${1:-}"

    if [ "$failure_ret" -ne 0 ] && [ "$arg" != 'initial' ]
    then
        warn "ERROR: Test failed"
        warn ""
        warn "Not cleaning up to help debug failure:"
        warn ""

        info "agent-ctl log file   : ${ctl_log_file}"
        info "agent log file       : ${agent_log_file}"

        info "OCI bundle directory : ${bundle_dir}"

        return 0
    fi

    kill_tmux_sessions

    unconfigure_kata

    # The bundle is kept on the initial call since the tests still need it.
    if [ "$arg" != 'initial' ] && [ -d "$bundle_dir" ]
    then
        rm -rf "$bundle_dir"
    fi

    sudo rm -f \
        "$agent_log_file" \
        "$ctl_log_file"

    clean_env_ctr &>/dev/null || true

    local sandbox_dir="/run/sandbox-ns/"

    # XXX: Without doing this, the agent will hang attempting to create the
    # XXX: namespaces (in function "setup_shared_namespaces()")
    sudo umount -f "${sandbox_dir}/uts" "${sandbox_dir}/ipc" &>/dev/null || true
    sudo rm -rf "${sandbox_dir}" &>/dev/null || true

    # Check that clh socket was deleted.
    #
    # Note: '-e' (not '-f') since a socket is not a regular file, so a
    # '-f' test could never detect a socket that was left behind.
    if [ "$configured_hypervisor" = "clh" ] && [ -n "$clh_socket_path" ]
    then
        if [ -e "$clh_socket_path" ]
        then
            die "CLH socket path $clh_socket_path was not properly cleaned up"
        fi
    fi

    sudo systemctl restart containerd
}

# Ensure containerd debug is enabled and pull the test image.
setup_containerd()
{
    local file="/etc/containerd/config.toml"

    [ -e "$file" ] || die "missing containerd config file: '$file'"

    # Although the containerd config file is in TOML format, crudini(1)
    # won't parse it due to the indentation it uses.

    # Look for 'level = "debug"' inside the blank-line delimited
    # paragraph containing the "[debug]" section header.
    local containerd_debug_enabled

    containerd_debug_enabled=$(sed \
        -e '/./{H;$!d;}' \
        -e 'x;/\[debug\]/!d;' \
        "$file" |\
        grep "level *= *\"debug\"" || true)

    if [ -z "$containerd_debug_enabled" ]
    then
        printf '%s\n' \
            '[debug]' \
            '# Allow Kata Containers debug messages to be propageted' \
            '# into the hosts journal.' \
            '# (use "journalctl -t kata" to view).' \
            'level = "debug"' |\
            sudo tee -a "$file"

        sudo systemctl restart containerd
    fi

    sudo ctr image pull "$CTR_IMAGE"

    true
}

# Unpack the filesystem of "$DOCKER_IMAGE" into the specified directory.
#
# $1: directory to populate.
create_oci_rootfs()
{
    local dir="${1:-}"

    [ -z "$dir" ] && die "Need OCI rootfs dir"

    local container
    container=$(sudo docker create "$DOCKER_IMAGE")

    sudo docker export "$container" |\
        tar -C "${dir}" -xf -

    # The container was only needed as an export source: don't leak it.
    sudo docker rm "$container" >/dev/null || true
}

# Create an OCI bundle in a new temporary directory and
# export its path as 'bundle_dir'.
setup_oci_bundle()
{
    bundle_dir="$(mktemp -d)"
    export bundle_dir

    info "Creating OCI bundle in directory: '$bundle_dir'"

    local config="${bundle_dir}/config.json"
    local rootfs_dir="${bundle_dir}/rootfs/"

    mkdir -p "$rootfs_dir"

    create_oci_rootfs "$rootfs_dir"

    pushd "$bundle_dir" &>/dev/null
    runc spec
    popd &>/dev/null

    [ -e "$config" ] || die "no OCI config file at ${config}"
}

# Validate the environment and establish the global settings
# (hypervisor details, configuration file, OCI bundle) used by the tests.
setup()
{
    configured_hypervisor="${KATA_HYPERVISOR:-}"

    case "$configured_hypervisor" in
        'qemu')
            hypervisor_binary="qemu-system-${arch}"
            configured_hypervisor_cfg="qemu"
            ;;

        'clh')
            hypervisor_binary="cloud-hypervisor"
            configured_hypervisor_cfg="clh"
            ;;

        *)
            local msg=""
            msg+="Exiting as hypervisor test dependency not met"
            msg+=" (expected 'qemu' or 'clh', found '${KATA_HYPERVISOR:-}')"
            die "$msg"
            ;;
    esac

    info "Configured hypervisor is $configured_hypervisor"

    trap cleanup EXIT

    # Don't mess with an existing tmux session
    unset TMUX

    # Check for required commands *before* doing any work that needs them.
    local cmds=()

    # For parsing TOML config files
    cmds+=('crudini')

    # For container manager (containerd)
    cmds+=('ctr')

    # for OCI bundle creation
    cmds+=('docker')
    cmds+=('runc')

    # For parsing JSON output
    cmds+=('jq')

    # For querying VSOCK sockets
    cmds+=('socat')

    # For launching processes
    cmds+=('tmux')

    local cmd

    for cmd in "${cmds[@]}"
    do
        command -v "$cmd" >/dev/null || die "need $cmd"
    done

    if [ "$dry_run" = 'false' ] && [ -z "$bundle_dir" ]
    then
        setup_oci_bundle || warn "failed to create OCI bundle"
    fi

    kata_cfg_file=$(kata-runtime kata-env \
        --json |\
        jq -r '.Runtime | .Config | .Path' || true)

    [ -z "$kata_cfg_file" ] && die "Cannot determine config file"

    sudo mkdir -p "$(dirname "$kata_cfg_file")"

    #------------------------------
    # Check configured hypervisor

    local hypervisor_section

    hypervisor_section=$(printf "hypervisor.%s\n" "${configured_hypervisor_cfg}")

    crudini --get "${kata_cfg_file}" "${hypervisor_section}" &>/dev/null || \
        die "Configured hypervisor ${configured_hypervisor} does not match config file ${kata_cfg_file}"

    setup_containerd
}

# Start the agent binary directly on the host, inside a tmux(1) session.
#
# $1: file to write the agent output to.
start_local_agent()
{
    local log_file="${1:-}"
    [ -z "$log_file" ] && die "need agent log file"

    local running
    running=$(get_local_agent_pid || true)

    [ -n "$running" ] && die "agent already running: '$running'"

    # Note: it's imperative that we capture stderr to the log file
    # as the agent writes the shutdown message to this stream!
    run_cmd \
        "tmux new-session \
        -d \
        -s \"$KATA_TMUX_LOCAL_SESSION\" \
        \"sudo \
        RUST_BACKTRACE=full \
        KATA_AGENT_LOG_LEVEL=${agent_log_level} \
        KATA_AGENT_SERVER_ADDR=${local_agent_server_addr} \
        ${agent_binary} \
        &> ${log_file}\""

    if [ "$dry_run" = 'false' ]
    then
        wait_for_local_agent_to_start || \
            warn "failed waiting for local agent to start"
    fi
}

# Wait for the agent to finish starting
#
# $1: container ID.
wait_for_kata_vm_agent_to_start()
{
    local cid="${1:-}"
    [ -z "$cid" ] && die "need container ID"

    # First, check the containerd status of the container
    local cmd="sudo ctr task list | grep \"${cid}\" | grep -q \"RUNNING\""

    info "Waiting for VM to start (cid: '$cid')"

    waitForProcess \
        "$wait_time_secs" \
        "$sleep_time_secs" \
        "$cmd"

    show_procs

    # Next, ensure there is a valid VSOCK address for the VM
    info "Waiting for agent VSOCK server"

    cmd="get_agent_vsock_address_simple >/dev/null"

    waitForProcess \
        "$wait_time_secs" \
        "$sleep_time_secs" \
        "$cmd"

    info "Kata VM running"
}

# Send a "Check" request to the local agent.
check_local_agent_alive()
{
    local cmds=()

    cmds+=("-c Check")

    run_agent_ctl \
        "${local_agent_ctl_server_addr}" \
        "${cmds[@]}"

    true
}

# Block until the local agent answers requests.
wait_for_local_agent_to_start()
{
    local cmd="check_local_agent_alive"

    info "Waiting for agent process to start"

    waitForProcess \
        "$wait_time_secs" \
        "$sleep_time_secs" \
        "$cmd"

    info "Kata agent process running"
}

# Create a Kata Container that blocks "forever"
#
# $1: agent log file (validated only).
start_agent_in_kata_vm()
{
    local log_file="${1:-}"
    [ -z "$log_file" ] && die "need agent log file"

    local snapshotter=""

    # Allow containerd to run on a ZFS root filesystem
    if zfs list &>/dev/null
    then
        snapshotter='zfs'
    fi

    # Ensure the container blocks forever
    local cmd='tail -f /dev/null'

    run_cmd \
        "tmux new-session \
        -d \
        -s \"$KATA_TMUX_VM_SESSION\" \
        \"sudo ctr run \
        --snapshotter '$snapshotter' \
        --runtime '${CTR_RUNTIME}' \
        --rm \
        -t '${CTR_IMAGE}' \
        '$container_id' \
        $cmd\""

    if [ "$dry_run" = 'false' ]
    then
        wait_for_kata_vm_agent_to_start "$container_id" || \
            warn "failed waiting for Kata VM agent to start"
    fi
}

# Start the agent using the specified agent test type.
#
# $1: agent test type ('local' or 'vm').
# $2: agent log file.
start_agent()
{
    local agent_test_type="${1:-}"
    [ -z "$agent_test_type" ] && die "need agent test type"

    local log_file="${2:-}"
    [ -z "$log_file" ] && die "need agent log file"

    case "$agent_test_type" in
        'local') start_local_agent "$log_file" ;;
        'vm') start_agent_in_kata_vm "$log_file" ;;
        *) die "invalid agent test type: '$agent_test_type'" ;;
    esac

    true
}

# Run the agent control tool.
#
# $1: agent ttRPC server address.
# $2..: agent-ctl command options (such as "-c Check").
#
# Unless in dry-run mode, all output is written to "$ctl_log_file".
run_agent_ctl()
{
    local server_addr="${1:-}"

    # Validate before shifting: shift fails if there are no arguments.
    [ -n "$server_addr" ] || die "need agent ttRPC server address"

    shift

    local cmds="${*:-}"

    [ -n "$cmds" ] || die "need commands for agent control tool"

    local agent_ctl_path
    agent_ctl_path="/opt/kata/bin/kata-agent-ctl"

    local redirect="&>\"${ctl_log_file}\""

    if [ "$dry_run" = 'true' ]
    then
        redirect=""
        bundle_dir="FIXME-set-to-OCI-bundle-directory"
    fi

    local server_address=

    case "$configured_hypervisor" in
        'qemu')
            server_address="--server-address \"${server_addr}\""
            ;;

        'clh')
            server_address="--server-address \"${server_addr}\" --hybrid-vsock"
            ;;

        *)
            die "Cannot configure server address, unknown hypervisor: '$configured_hypervisor'"
            ;;
    esac

    run_cmd \
        sudo \
        RUST_BACKTRACE=full \
        "${agent_ctl_path}" \
        -l debug \
        connect \
        "${server_address}" \
        --bundle-dir "${bundle_dir}" \
        "${cmds}" \
        "${redirect}"
}

# This function "cheats" a little - it gets the agent
# to do some work *and then* stops it.
stop_local_agent()
{
    local cmds=()

    cmds+=("-c Check")
    cmds+=("-c GetGuestDetails")
    cmds+=("-c 'sleep 1s'")
    cmds+=("-c DestroySandbox")

    run_agent_ctl \
        "${local_agent_ctl_server_addr}" \
        "${cmds[@]}"
}

# Print the addresses of all established agent connections on a single
# line, separated by spaces (nothing is printed if there are none).
#
# - QEMU: VSOCK peer addresses ("cid:port") using "$EXPECTED_VSOCK_PORT".
# - CLH:  local addresses of sockets matching "$clh_socket_path".
get_addresses()
{
    local addresses=

    case "$configured_hypervisor" in
        'qemu')
            # NOTE(review): the pattern given to "grep -v" was garbled in
            # the text this function was restored from; "\<socat\>" is a
            # reconstruction - confirm.
            addresses=$(ss -Hp --vsock |\
                grep -v -E "\<socat\>" |\
                awk '$2 ~ /^ESTAB$/ {print $6}' |\
                grep ":${EXPECTED_VSOCK_PORT}$") || true
            ;;

        'clh')
            # since we preconfigured the socket, we are checking to see if it is reported
            addresses=$(ss -Hp |\
                grep "${clh_socket_path}" |\
                awk '$2 ~ /^ESTAB$/ {print $5}') || true
            ;;

        *)
            die "Cannot retrieve address, unknown hypervisor: '$configured_hypervisor'"
            ;;
    esac

    # Deliberately unquoted: flattens multiple lines into a single line.
    # shellcheck disable=SC2086
    echo ${addresses}
}

# Print the agent address as a URI.
#
# Doesn't fail. Instead it will return the empty string on error.
#
# For QEMU, exactly one address must be found. For Cloud Hypervisor, the
# first address found is used (see get_agent_vsock_address()).
get_agent_vsock_address_simple()
{
    local addresses
    addresses=$(get_addresses) || return 1

    [ -z "$addresses" ] && return 1

    # get_addresses() prints all addresses on one line, so count words
    # (counting lines would always give a result of 1).
    local -a addr_list
    read -r -a addr_list <<<"$addresses"

    local expected_count=1

    case "$configured_hypervisor" in
        'qemu')
            [ "${#addr_list[@]}" -eq "$expected_count" ] || return 1

            local cid
            local port

            IFS=: read -r cid port _ <<<"${addr_list[0]}"

            echo "vsock://${cid}:${port}"
            ;;

        'clh')
            echo "unix://${addr_list[0]}"
            ;;

        *)
            die "Cannot get agent vsock address, unknown hypervisor: '$configured_hypervisor'"
            ;;
    esac

    return 0
}

# Print the agent address as a URI, calling die() if it cannot be determined.
get_agent_vsock_address()
{
    local addresses
    addresses=$(get_addresses || true)

    [ -z "$addresses" ] && die "no VSOCK connections found"

    # get_addresses() prints all addresses on one line, so count words
    # (counting lines would always give a result of 1).
    local -a addr_list
    read -r -a addr_list <<<"$addresses"

    local expected_count=1
    local count="${#addr_list[@]}"

    case "$configured_hypervisor" in
        'qemu')
            # For QEMU we always expect 1 result. For Cloud Hypervisor, if a debug console is configured
            # and running, we will have more than 1 result, so only run this check for QEMU
            [ "$count" -eq "$expected_count" ] \
                || die "expected $expected_count VSOCK entry, found $count: '$addresses'"

            local cid
            local port

            IFS=: read -r cid port _ <<<"${addr_list[0]}"

            echo "vsock://${cid}:${port}"
            ;;

        'clh')
            echo "unix://${addr_list[0]}"
            ;;

        *)
            die "Cannot get agent vsock address, unknown hypervisor: '$configured_hypervisor'"
            ;;
    esac
}

# Ask the agent running inside the Kata VM to shut down.
stop_agent_in_kata_vm()
{
    local agent_addr

    if [ "$dry_run" = 'true' ]
    then
        agent_addr=$(get_dry_run_agent_vsock_address)
    else
        agent_addr=$(get_agent_vsock_address || true)
        [ -z "$agent_addr" ] && \
            die "cannot determine agent VSOCK address for $hypervisor_binary"
    fi

    # List of API commands to send to the agent.
    local cmds=()

    # Run a couple of query commands first to ensure
    # the agent is listening.
    cmds+=("-c Check")
    cmds+=("-c GetGuestDetails")

    # Creating a container implies creating a sandbox, so request
    # agent/VM/container shutdown by asking the agent
    # to destroy the sandbox.
    cmds+=("-c DestroySandbox")

    run_agent_ctl \
        "${agent_addr}" \
        "${cmds[@]}"

    true
}

# Stop the agent using the specified agent test type.
#
# $1: agent test type ('local' or 'vm').
# $2: agent-ctl log file (validated only).
stop_agent()
{
    info "Stopping agent"

    local agent_test_type="${1:-}"
    [ -z "$agent_test_type" ] && die "need agent test type"

    local log_file="${2:-}"
    [ -z "$log_file" ] && die "need agent-ctl log file"

    case "$agent_test_type" in
        'local') stop_local_agent ;;
        'vm') stop_agent_in_kata_vm ;;
        *) die "invalid agent test type: '$agent_test_type'" ;;
    esac

    true
}

# Print the PID of the local agent process
# (nothing is printed if no agent is running).
get_local_agent_pid()
{
    local name
    name=$(basename "$agent_binary")

    local pids
    pids=$(pgrep "$name" || true)
    [ -z "$pids" ] && return 0

    local count
    count=$(echo "$pids"|wc -l)

    [ "$count" -gt 1 ] && \
        die "too many agent processes running ($count, '$pids')"

    echo "$pids"
}

# Function that writes all agent logs to '$agent_log_file'.
get_agent_log_file()
{
    local agent_test_type="${1:-}"
    [ -z "$agent_test_type" ] && die "need agent test type"

    local log_file="${2:-}"
    [ -z "$log_file" ] && die "need agent log file"

    info "Getting agent log details"

    case "$agent_test_type" in
        # NOP: File should have been created by start_local_agent()
        'local') true ;;

        # Extract journal entries for the duration of the test
        'vm')
            sudo journalctl \
                -q \
                -a \
                -o cat \
                -t 'kata' \
                --since="$test_start_time" \
                > "$log_file"
            ;;

        *) die "invalid agent test type: '$agent_test_type'" ;;
    esac

    [ -e "$log_file" ] || die "no log file: '$log_file'"
    [ -s "$log_file" ] || die "empty log file: '$log_file'"

    true
}

# Function to run to ensure correct behaviour
#
# $1: agent test type ('local' or 'vm').
# $2: shutdown test type.
# $3: agent log file.
validate_agent()
{
    local agent_test_type="${1:-}"
    local shutdown_test_type="${2:-}"
    local log_file="${3:-}"

    [ -z "$agent_test_type" ] && die "need agent test type"
    [ -z "$shutdown_test_type" ] && die "need shutdown test type"
    [ -z "$log_file" ] && die "need agent log file"

    info "validating"

    get_agent_log_file \
        "$agent_test_type" \
        "$log_file"

    # Regular expression that describes possible agent failures
    local regex="(slog::Fuse|Drain|Custom|serialization error|thread.*panicked|stack backtrace:)"

    if grep -q -E "$regex" "$log_file"
    then
        cat "$log_file"
        die "Found agent error in log file: '$log_file'"
    fi

    local entry
    entry=$(get_shutdown_test_type_entry "$shutdown_test_type" || true)
    [ -z "$entry" ] && die "invalid test type: '$shutdown_test_type'"

    local hypervisor_debug
    local vsock_console

    hypervisor_debug=$(echo "$entry"|cut -d: -f3)
    vsock_console=$(echo "$entry"|cut -d: -f4)

    local agent_debug_logs_available='false'

    [ "$hypervisor_debug" = 'true' ] && \
        [ "$vsock_console" = 'false' ] && \
        agent_debug_logs_available='true'

    if [ "$agent_debug_logs_available" = 'true' ] || [ "$agent_test_type" = 'local' ]
    then
        # The message the agent writes to stderr just before it exits.
        #
        # NOTE(review): this pattern was garbled in the text this function
        # was restored from; "\<shutdown complete\>" is a reconstruction -
        # confirm against the agent's actual message.
        local done_msg="\<shutdown complete\>"

        if ! grep -q -E "$done_msg" "$log_file"
        then
            cat "$log_file"
            die "missing agent shutdown message"
        fi
    else
        # We can only check for the shutdown message if the agent debug
        # logs are available.
        info "Not checking for agent shutdown message as hypervisor debug disabled"
    fi
}

# Prepare the environment for a single test run.
#
# $1: shutdown test type.
setup_agent()
{
    local shutdown_test_type="${1:-}"
    [ -z "$shutdown_test_type" ] && die "need shutdown test type"

    kill_tmux_sessions

    configure_kata "$shutdown_test_type"

    true
}

# Even though this test is not testing tracing, agent tracing needs to be
# enabled to stop the runtime from killing the VM. However, if tracing is
# enabled, the forwarder must be running. To remove the need for Jaeger to
# also be running, run the forwarder in "NOP" mode.
run_trace_forwarder()
{
    local forwarder_binary_path
    forwarder_binary_path="/opt/kata/bin/kata-trace-forwarder"

    local socket_path_tf=""

    # If using CLH, socket path must be passed to trace forwarder
    if [ "$configured_hypervisor" = "clh" ]
    then
        socket_path_tf="--socket-path ${clh_socket_path}"
    fi

    run_cmd \
        "tmux new-session \
        -d \
        -s \"$KATA_TMUX_FORWARDER_SESSION\" \
        sudo \"$forwarder_binary_path --dump-only -l trace ${socket_path_tf}\""
}

# Wait for the agent (or the VM it is running in) to end.
#
# $1: agent test type ('local' or 'vm').
check_agent_stopped()
{
    info "Checking agent stopped"

    local agent_test_type="${1:-}"
    [ -z "$agent_test_type" ] && die "need agent test type"

    local cmd=

    case "$agent_test_type" in
        'local') cmd=check_local_agent_stopped ;;
        'vm') cmd=check_vm_stopped ;;
        *) die "invalid agent test type: '$agent_test_type'" ;;
    esac

    waitForProcess \
        "$shutdown_time_secs" \
        "$sleep_time_secs" \
        "$cmd"

    true
}

# Poll until the local agent process has ended,
# calling die() if it is still running after the final attempt.
check_local_agent_stopped()
{
    local max=20
    local agent_ended="false"

    local agent_pid
    agent_pid=$(get_local_agent_pid || true)

    # Agent has finished
    [ -z "$agent_pid" ] && return 0

    local attempt

    for ((attempt = 0; attempt < max; attempt++))
    do
        # Signal 0 only tests whether the process still exists.
        if ! sudo kill -0 "$agent_pid"
        then
            agent_ended="true"
            break
        fi

        sleep 0.2
    done

    if [ "$agent_ended" = "false" ]
    then
        die "agent still running: pid $agent_pid"
    fi

    return 0
}

# Print the PID(s) of the hypervisor process(es).
get_vm_pid()
{
    pgrep "$hypervisor_binary"
}

# Succeeds once the tmux(1) session running the Kata VM no longer exists.
check_vm_stopped()
{
    if tmux list-sessions | grep -q "^${KATA_TMUX_VM_SESSION}:"
    then
        return 1
    fi

    return 0
}

# Connect to the agent's debug console.
#
# $1: agent test type ('local' or 'vm').
# $2: shutdown test type (validated only).
start_debug_console()
{
    local agent_test_type="${1:-}"
    local shutdown_test_type="${2:-}"

    [ -z "$agent_test_type" ] && die "need agent test type"
    [ -z "$shutdown_test_type" ] && die "need shutdown test type"

    info "Starting debug console"

    case "$agent_test_type" in
        'vm') connect_to_vsock_debug_console ;;

        # NOP for a local agent since we cannot connect to the agents
        # VSOCK console socket from *outside* the host!
        'local') true ;;

        *) die "invalid agent test type: '$agent_test_type'" ;;
    esac

    true
}

# Run a single test: start the agent, ask it to shut down and
# check it did so cleanly.
#
# $1: agent test type ('local' or 'vm').
# $2: shutdown test type (name of a 'shutdown_test_types' entry).
run_single_agent()
{
    local agent_test_type="${1:-}"
    local shutdown_test_type="${2:-}"

    [ -z "$agent_test_type" ] && die "need agent test type"
    [ -z "$shutdown_test_type" ] && die "need shutdown test type"

    local msg
    msg=$(printf \
        "Testing agent (agent test type: '%s', shutdown test type: '%s')" \
        "$agent_test_type" \
        "$shutdown_test_type")
    info "$msg"

    setup_agent "$shutdown_test_type"

    if [ "$configured_hypervisor" = "clh" ]
    then
        # CLH uses hybrid VSOCK which uses a local UNIX socket that we need to specify
        local socket_path_template
        socket_path_template=$clh_socket_prefix$(sudo kata-runtime env --json | jq -r '.Hypervisor.SocketPath')

        # Replace the "{ID}" placeholder with the container ID.
        clh_socket_path="${socket_path_template//\{ID\}/${container_id}}"

        if [ "$dry_run" = 'false' ]
        then
            sudo mkdir -p "$(dirname "$clh_socket_path")"
        fi
    fi

    run_trace_forwarder "$shutdown_test_type"

    sleep 5s

    test_start_time=$(date '+%F %T')

    start_agent \
        "$agent_test_type" \
        "$agent_log_file"

    info "Testing agent: shutdown test type: '$shutdown_test_type', agent test type: $agent_test_type"

    local entry
    entry=$(get_shutdown_test_type_entry "$shutdown_test_type" || true)

    local debug_console
    debug_console=$(echo "$entry"|cut -d: -f4)

    if [ "$debug_console" = 'true' ]
    then
        start_debug_console \
            "$agent_test_type" \
            "$shutdown_test_type"
    fi

    stop_agent \
        "$agent_test_type" \
        "$ctl_log_file"

    # We only need to show the set of commands once
    [ "$dry_run" = 'true' ] && exit 0

    test_end_time=$(date '+%F %T')

    check_agent_stopped "$agent_test_type"

    validate_agent \
        "$agent_test_type" \
        "$shutdown_test_type" \
        "$agent_log_file"
}

# Run one, or all, shutdown test types.
#
# $1: agent test type ('local' or 'vm').
# $2: shutdown test type, or "$ALL_TEST_TYPES" to run every entry in the
#     'shutdown_test_types' array.
run_agent()
{
    local agent_test_type="${1:-}"
    local shutdown_test_type="${2:-}"

    [ -z "$agent_test_type" ] && die "need agent test type"
    [ -z "$shutdown_test_type" ] && die "need shutdown test type"

    case "$shutdown_test_type" in
        "$ALL_TEST_TYPES")
            local entry
            local name
            local addresses

            # Run all shutdown types
            for entry in "${shutdown_test_types[@]}"
            do
                name=$(echo "$entry"|cut -d: -f1)

                run_single_agent \
                    "$agent_test_type" \
                    "$name"

                # Clean up between iterations
                sudo rm -f \
                    "$ctl_log_file" \
                    "$agent_log_file"

                addresses=$(get_addresses || true)

                [ -z "$addresses" ] || \
                    die "found unexpected vsock addresses: '$addresses'"
            done
            ;;

        *)
            run_single_agent \
                "$agent_test_type" \
                "$shutdown_test_type"
            ;;
    esac
}

# Run the tests the specified number of times.
#
# $1: number of iterations (non-negative integer).
# $2: agent test type.
# $3: shutdown test type.
test_agent_shutdown()
{
    local count="${1:-}"
    local agent_test_type="${2:-}"
    local shutdown_test_type="${3:-}"

    [ -z "$count" ] && die "need count"
    [ -z "$agent_test_type" ] && die "need agent test type"
    [ -z "$shutdown_test_type" ] && die "need shutdown test type"

    # Reject a non-numeric count: it would otherwise result in
    # zero iterations being run and success being reported.
    [[ "$count" =~ ^[0-9]+$ ]] || die "invalid count: '$count'"

    # Force base 10 so a value with leading zeros is not treated as octal.
    local total=$((10#$count))

    # Start with a clean environment
    if [ "$dry_run" = 'false' ]
    then
        cleanup initial || true
    fi

    local i

    for ((i = 1; i <= total; i++))
    do
        if [ "$dry_run" = 'false' ]
        then
            info "testing agent: run $i of $count"
        fi

        run_agent \
            "$agent_test_type" \
            "$shutdown_test_type"
    done

    info "testing agent: completed $count runs"
}

# Parse the command-line options, then set up and run the tests.
handle_args()
{
    local opt

    local count="${KATA_AGENT_SHUTDOWN_TEST_COUNT}"
    local shutdown_test_type="$DEFAULT_SHUTDOWN_TEST_TYPE"
    local agent_test_type="$DEFAULT_AGENT_TEST_TYPE"

    # The leading ':' selects silent error reporting so the
    # offending option can be named in our own messages.
    while getopts ":a:c:dhklnt:" opt "$@"
    do
        case "$opt" in
            a) agent_test_type="$OPTARG" ;;
            c) count="$OPTARG" ;;
            d) set -o xtrace ;;
            h) usage; exit 0 ;;
            k) keep_logs='true' ;;
            l) list_test_types; exit 0 ;;
            n) dry_run='true' ;;
            t) shutdown_test_type="$OPTARG" ;;
            :) die "option '-${OPTARG:-}' requires an argument" ;;
            *) die "invalid option: '-${OPTARG:-}'" ;;
        esac
    done

    setup

    test_agent_shutdown \
        "$count" \
        "$agent_test_type" \
        "$shutdown_test_type"
}

# Script entry point.
main()
{
    handle_args "$@"
}

main "$@"