kata-containers/tests/functional/tracing/test-agent-shutdown.sh
2025-01-29 11:26:27 +01:00

1486 lines
35 KiB
Bash
Executable File

#!/bin/bash
# Copyright (c) 2021 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
#---------------------------------------------------------------------
# Description: Test the Kata Containers 2.x rust agent shutdown behaviour.
#
# Normally, the kata-agent process running inside the VM is not shut down;
# once the workload ends and the agent has returned the workload return
# value back to the runtime, the runtime simply kills the VM. This is safe
# since nothing the user cares about is running any more.
#
# However, for agent tracing, a graceful agent shutdown is necessary to ensure
# all trace spans are generated. When *static* agent tracing is enabled, the
# runtime relies entirely on the agent to perform a graceful shutdown _and_
# shut down the VM.
#
# This script tests the kata-agent in two ways:
#
# - "manually" / "standalone" where the agent binary is run directly.
# - Inside a Kata VM, started by a shimv2-capable container manager
# (containerd).
#
# In both cases, the agent is shut down using the agent-ctl tool
# to request the agent shut down gracefully.
#
# Various configuration options are also tested. One of these enables
# the agent's built-in (VSOCK) debug console. This test not only enables
# the option but also connects to the created console.
#
# Since this script needs to start various programs with a terminal,
# it uses tmux(1) consistently to simplify the handling logic.
#---------------------------------------------------------------------
# Basename of this script (shown in the usage text).
readonly script_name=${0##*/}
# Strict mode: abort on command failure, on use of unset variables and on
# failure of any pipeline stage; errtrace lets functions and subshells
# inherit an ERR trap.
set -o errexit
set -o nounset
set -o pipefail
set -o errtrace
# Directory containing this script (symlinks resolved); used to locate
# the shared helper library sourced below.
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
source "${SCRIPT_PATH}/../../common.bash"
source "/etc/os-release" || source "/usr/lib/os-release"
# containerd runtime name passed to "ctr run --runtime".
CTR_RUNTIME=${CTR_RUNTIME:-"io.containerd.kata.v2"}
# Kata always uses this value
EXPECTED_VSOCK_PORT="1024"
# Image exported by docker to build the OCI bundle root filesystem.
DOCKER_IMAGE=${DOCKER_IMAGE:-"busybox"}
# Image pulled and run by ctr for the VM based tests.
CTR_IMAGE=${CTR_IMAGE:-"quay.io/prometheus/busybox:latest"}
# Number of times the test should be run
KATA_AGENT_SHUTDOWN_TEST_COUNT=${KATA_AGENT_SHUTDOWN_TEST_COUNT:-1}
# Default VSOCK port used by the agent
KATA_AGENT_VSOCK_CONSOLE_PORT=${KATA_AGENT_VSOCK_CONSOLE_PORT:-1026}
# The shutdown test type that represents a "default" / vanilla Kata
# installation (where no debug options are enabled).
VANILLA_TEST_TYPE='default'
# Name of tmux(1) sessions to create to run Kata VM and local agent in
KATA_TMUX_VM_SESSION="kata-shutdown-test-vm-session"
KATA_TMUX_LOCAL_SESSION="kata-shutdown-test-local-agent-session"
# Name of tmux(1) session to create to run a debug console in
KATA_TMUX_CONSOLE_SESSION="kata-shutdown-test-console-session"
# tmux(1) session to run the trace forwarder in
KATA_TMUX_FORWARDER_SESSION="kata-shutdown-test-trace-forwarder-session"
# Hypervisor under test; setup() accepts only "qemu" or "clh".
KATA_HYPERVISOR="${KATA_HYPERVISOR:-qemu}"
# List of test types used by configure_kata().
#
# Each element contains four colon delimited fields:
#
# 1: Name.
# 2: Whether debug should be enabled for the agent+runtime.
# 3: Whether hypervisor debug should be enabled.
# (handled separately due to a previous bug which blocked agent shutdown).
# 4: Whether a VSOCK debug console should be configured and used.
#
# Notes:
#
# - Tests are run in the order found in this array.
# - An array is used (rather than a hash) to ensure the standard/vanilla
# configuration is run *last*. The reason for this being that debug is
# needed to diagnose shutdown errors, so there is no point in running
# the default scenario first, in case it fails (and is thus "undebuggable").
shutdown_test_types=(
'with-debug:true:false:false'
'with-debug-console:false:false:true'
'with-hypervisor-debug:true:true:false'
'with-everything:true:true:true'
"${VANILLA_TEST_TYPE}:false:false:false"
)
# Number of fields each entry in the 'shutdown_test_types' array should have.
shutdown_test_type_fields=4
# Pseudo test type name that represents all test types defined
# in the 'shutdown_test_types' array.
ALL_TEST_TYPES='all'
DEFAULT_SHUTDOWN_TEST_TYPE="${ALL_TEST_TYPES}"
# List of ways of running the agent:
#
# Each element contains two colon delimited fields:
#
# 1: Name used for a particular way of running the agent.
# 2: Description.
agent_test_types=(
'local:Run agent using agent-ctl tool'
'vm:Run agent inside a Kata Container'
)
# Default value from the 'agent_test_types' array.
DEFAULT_AGENT_TEST_TYPE='vm'
# Set by every call to run_single_agent()
test_start_time=
test_end_time=
#-------------------------------------------------------------------------------
# Settings
# values used to wait for local and VM processes to start and end.
wait_time_secs=${WAIT_TIME_SECS:-20}
sleep_time_secs=${SLEEP_TIME_SECS:-1}
# Time to allow for the agent and VM to shutdown
shutdown_time_secs=${SHUTDOWN_TIME_SECS:-120}
# Name for the container that will be created
container_id="${CONTAINER_ID:-kata-agent-shutdown-test}"
# If 'true', don't run any commands, just show what would be run.
dry_run="${DRY_RUN:-false}"
# If 'true', don't remove logs on a successful run.
keep_logs="${KEEP_LOGS:-false}"
# Name of socket file used by a local agent.
agent_socket_file="kata-agent.socket"
# Kata Agent socket URI.
#
# Notes:
#
# - The file is an abstract socket
# (meaning it is not visible in the filesystem).
#
# - The agent and the agent-ctl tool use slightly different
# address formats for abstract sockets.
local_agent_server_addr="unix://${agent_socket_file}"
local_agent_ctl_server_addr="unix://@${agent_socket_file}"
# Address that is dynamically configured when using CLH before
# starting trace forwarder or container
clh_socket_path=
clh_socket_prefix="/run/vc/vm/"
# Log file that receives the agent-ctl tool output.
ctl_log_file="${PWD}/agent-ctl.log"
# Log file that must contain agent output.
agent_log_file="${PWD}/kata-agent.log"
# Set in setup() based on KATA_HYPERVISOR
# Supported hypervisors are qemu and clh
configured_hypervisor=
# String that would appear in config file (qemu or clh)
configured_hypervisor_cfg=
# Full path to directory containing an OCI bundle based on "$DOCKER_IMAGE",
# which is required by the agent control tool.
bundle_dir=${BUNDLE_DIR:-""}
#---------------------------------------
# Default values
default_arch=$(uname -m)
arch="${arch:-${default_arch}}"
#-------------------------------------------------------------------------------
agent_binary="/usr/bin/kata-agent"
# Maximum debug level
default_agent_log_level="trace"
agent_log_level=${agent_log_level:-${default_agent_log_level}}
# Full path to the main configuration file (set by setup()).
kata_cfg_file=
# Set in setup() based on KATA_HYPERVISOR
hypervisor_binary=
#-------------------------------------------------------------------------------
# Setting DEBUG to any non-empty value enables shell tracing from here on.
[ -n "${DEBUG:-}" ] && set -o xtrace
# Print the command-line help text to stdout.
#
# The text is an unquoted here-document, so the script name and the
# default option values are expanded when the function runs.
usage()
{
cat <<EOF
Usage: $script_name [options]
Summary: Run Kata Agent shutdown tests.
Description: Run a set of tests to ensure the Kata Containers agent process
running inside the virtual machine can shut down cleanly. This is required for
static tracing. A number of variations of the test are run to exercise as many
code paths as possible, specifically different code paths for when particular
debug options are enabled.
Options:
-a <agent-test-type> : Agent test type to use
(default: '$DEFAULT_AGENT_TEST_TYPE').
-c <count> : Run specified number of iterations
(default: $KATA_AGENT_SHUTDOWN_TEST_COUNT).
-d : Enable debug (shell trace) output.
-h : Show this help statement.
-k : Keep logs on successful run
(default: logs will be deleted on success).
-l : List all available agent and shutdown test types.
-n : Dry-run mode - show the commands that would be run.
-t <shutdown-test-type> : Only run the specified shutdown test type
(default: '$DEFAULT_SHUTDOWN_TEST_TYPE').
Notes:
- These tests should be run *before* the Kata Agent tracing tests, since if
the agent cannot be shut down, static tracing will not work reliably.
- By default all shutdown test types are run, but only the default agent test
type is run.
EOF
}
# Write a warning to stderr.
#
# Arguments: the words of the message; they are joined with single
# spaces and prefixed with "WARNING: ".
warn()
{
    printf 'WARNING: %s\n' "$*" >&2
}
# Run the specified command, or if dry-run mode is enabled,
# just show the command that would be run.
#
# Arguments: the command line. All arguments are joined with single
# spaces into one string which is then evaluated by the shell, so
# callers may embed quoting and redirections in the arguments.
#
# Globals: dry_run (read).
run_cmd()
{
    # "$*" (not "$@") is the explicit way to flatten the arguments
    # into a single string.
    local cmdline="$*"

    if [ "$dry_run" = 'true' ]
    then
        info "dry-run: Would run: '$cmdline'"
    else
        # Quote the expansion: an unquoted expansion would be
        # word-split and glob-expanded *before* eval re-parses it,
        # which corrupts whitespace inside quoted arguments.
        eval "$cmdline"
    fi
}
# Show a subset of processes (for debugging).
#
# Lists every "ps -efww" line that mentions (case-insensitively) the
# agent control tool, the configured hypervisor, containerd or ctr.
# Never fails, even if nothing matches.
#
# Globals: configured_hypervisor (read).
show_procs()
{
    info "Processes"

    local hypervisor="qemu"
    if [ "${configured_hypervisor}" = "clh" ]
    then
        hypervisor="cloud-hypervisor"
    fi

    local -a patterns=(
        "kata-agent-ctl"
        "${hypervisor}"
        "containerd"
        "ctr"
    )

    # Join the patterns into a single alternation: "(a|b|c|d)".
    local regex
    regex=$(IFS='|'; printf '(%s)' "${patterns[*]}")

    ps -efww | grep -i -E "$regex" || true
}
# Kill every tmux(1) session this script may have created.
#
# Sessions that do not exist are silently ignored; the function
# always succeeds.
kill_tmux_sessions()
{
    local -a sessions=(
        "$KATA_TMUX_CONSOLE_SESSION"
        "$KATA_TMUX_FORWARDER_SESSION"
        "$KATA_TMUX_LOCAL_SESSION"
        "$KATA_TMUX_VM_SESSION"
    )

    local name
    for name in "${sessions[@]}"
    do
        tmux kill-session -t "$name" &>/dev/null || true
    done

    return 0
}
# Look up an entry in the 'shutdown_test_types' array by name.
#
# Arguments:
#   $1 - shutdown test type name (first field of an entry).
#
# Outputs: the full colon-delimited entry followed by an empty line if
# the name was found, or just an empty line if it was not. Callers
# capture the output with $(...), which drops the trailing newlines.
#
# Every entry inspected before a match is validated to have exactly
# 'shutdown_test_type_fields' fields; a malformed entry is fatal.
get_shutdown_test_type_entry()
{
    local shutdown_test_type="${1:-}"
    [ -z "$shutdown_test_type" ] && die "need shutdown test type name"

    local candidate
    local separators
    local field_total

    for candidate in "${shutdown_test_types[@]}"
    do
        # Number of fields == number of colons + 1.
        separators="${candidate//[^:]/}"
        field_total=$(( ${#separators} + 1 ))

        if [ "$field_total" -ne "$shutdown_test_type_fields" ]
        then
            die "expected $shutdown_test_type_fields fields, found $field_total: '$candidate'"
        fi

        if [ "${candidate%%:*}" = "$shutdown_test_type" ]
        then
            echo "$candidate"
            break
        fi
    done

    echo
}
# Print a table of all shutdown test types to stdout.
#
# One row per entry of the 'shutdown_test_types' array, showing the
# name and the three boolean settings stored in the entry.
list_shutdown_test_types()
{
    # Column layout shared by the heading and every row.
    local row_format="%-24s %-15s %-23s %s\n"

    local entry
    local name
    local debug_value
    local hypervisor_debug_value
    local debug_console_value

    printf "# Shutdown test types:\n\n"
    printf "${row_format}\n" \
        "Test type" \
        "Debug enabled" \
        "Hypervisor debug" \
        "Debug console used"

    for entry in "${shutdown_test_types[@]}"
    do
        # Split the entry on ':'; anything beyond the fourth field
        # is discarded.
        IFS=: read -r \
            name \
            debug_value \
            hypervisor_debug_value \
            debug_console_value \
            _ <<<"$entry"

        printf "$row_format" \
            "$name" \
            "$debug_value" \
            "$hypervisor_debug_value" \
            "$debug_console_value"
    done

    echo
}
# Print a table of the ways the agent can be run to stdout.
#
# One row per entry of the 'agent_test_types' array; the row whose
# name equals DEFAULT_AGENT_TEST_TYPE is marked "(default)".
list_agent_test_types()
{
    local entry
    local name
    local descr
    local msg

    printf "# Agent test types:\n\n"
    printf "%-12s %s\n\n" \
        "Agent type" \
        "Description"

    for entry in "${agent_test_types[@]}"
    do
        # First field is the name, everything after the first
        # colon is the description.
        name="${entry%%:*}"
        descr="${entry#*:}"

        msg=""
        if [ "$name" = "$DEFAULT_AGENT_TEST_TYPE" ]
        then
            msg=" (default)"
        fi

        printf "%-12s %s%s.\n" \
            "$name" \
            "$descr" \
            "$msg"
    done

    echo
}
# Print both listings: agent test types first, then shutdown test types.
list_test_types()
{
    local lister
    for lister in list_agent_test_types list_shutdown_test_types
    do
        "$lister"
    done
}
# Set Kata options according to test type.
#
# Arguments:
#   $1 - shutdown test type name (must exist in 'shutdown_test_types').
#
# Applies the debug, hypervisor debug and debug console settings stored
# in the entry, then switches agent tracing on (off for the vanilla
# test type).
configure_kata()
{
    local shutdown_test_type="${1:-}"
    [ -z "$shutdown_test_type" ] && die "need shutdown test type"

    local entry
    entry=$(get_shutdown_test_type_entry "$shutdown_test_type" || true)
    [ -z "$entry" ] && die "invalid test type: '$shutdown_test_type'"

    # Split the entry into its fields (the name is not needed here).
    local debug_value
    local hypervisor_debug_value
    local debug_console_value
    IFS=: read -r \
        _ \
        debug_value \
        hypervisor_debug_value \
        debug_console_value \
        _ <<<"$entry"

    [ -z "$debug_value" ] && \
        die "need debug value for $shutdown_test_type"
    [ -z "$hypervisor_debug_value" ] && \
        die "need hypervisor debug value for $shutdown_test_type"
    [ -z "$debug_console_value" ] && \
        die "need debug console value for $shutdown_test_type"

    toggle_debug "$debug_value" "$hypervisor_debug_value"
    toggle_vsock_debug_console "$debug_console_value"

    # Enable agent tracing
    #
    # Even though this program only tests agent shutdown, static tracing
    # must be configured. This is because normally (with tracing
    # disabled), the runtime kills the VM after the workload has exited.
    # However, if static tracing is enabled, the runtime will not kill the
    # VM - the responsibility for shutting down the VM is given to the
    # agent process running inside the VM.
    #
    # We don't need to worry about the 'trace_mode' for the vanilla type
    # since agent tracing is *only* enabled if the 'enable_tracing'
    # variable is set.
    local tracing='true'
    if [ "$shutdown_test_type" = "$VANILLA_TEST_TYPE" ]
    then
        tracing='false'
    fi

    run_cmd sudo crudini --set "${kata_cfg_file}" 'agent.kata' 'enable_tracing' "$tracing"
}
# Put the Kata configuration back to the vanilla (no debug, no tracing)
# settings by applying the VANILLA_TEST_TYPE configuration.
unconfigure_kata()
{
    info "Resetting configuration to defaults"

    local vanilla_type="$VANILLA_TEST_TYPE"
    configure_kata "$vanilla_type"
}
# Enable/disable the agent's built-in VSOCK debug console.
#
# Arguments:
#   $1 - value to store in the 'debug_console_enabled' key of the
#        'agent.kata' section of the Kata configuration file.
toggle_vsock_debug_console()
{
    local enabled="$1"

    run_cmd sudo crudini --set "${kata_cfg_file}" 'agent.kata' 'debug_console_enabled' "$enabled"
}
# Enable/disable debug options.
#
# Note: Don't use 'kata-manager.sh "enable-debug"' since this
# enables all debug (including the problematic hypervisor
# debug - see below).
#
# Arguments:
#   $1 - value for 'enable_debug' in the agent and runtime sections.
#   $2 - value for 'enable_debug' in the hypervisor section.
#
# Globals: kata_cfg_file, configured_hypervisor_cfg (read).
toggle_debug()
{
    local value="${1:-}"
    local hypervisor_debug="${2:-}"

    [ -z "$value" ] && die "need value"
    [ -z "$hypervisor_debug" ] && die "need hypervisor debug value"

    # list of configuration.toml sections that have debug options we care about
    local debug_sections=()
    debug_sections+=('agent.kata')
    debug_sections+=('runtime')

    local section
    for section in "${debug_sections[@]}"
    do
        run_cmd sudo crudini --set "$kata_cfg_file" "$section" \
            'enable_debug' "$value"
    done

    # XXX: Enabling hypervisor debug for QEMU will make a systemd debug
    # console service inoperable (*), but we need to test it anyhow.
    #
    # (*) - If enabled, it stops "kata-debug.service" from attaching to
    # the console and the socat call made on the client hangs until
    # the VM is shut down!
    section="hypervisor.${configured_hypervisor_cfg}"

    # Use this function's own parameter; do not rely on a variable
    # that merely happens to be visible from the caller's scope.
    run_cmd sudo crudini --set "$kata_cfg_file" "$section" \
        'enable_debug' "$hypervisor_debug"
}
# Provide a "semi-valid" vsock address for when dry-run mode is active.
# The URI includes a message telling the user to change it and replace
# with the real VSOCK CID value.
#
# Outputs: the placeholder URI, using EXPECTED_VSOCK_PORT as the port.
get_dry_run_agent_vsock_address()
{
    printf 'vsock://FIXME-CHANGE-TO-VSOCK-CID:%s\n' "${EXPECTED_VSOCK_PORT}"
}
# Start a debug console shell using the agent's built-in debug console
# feature.
#
# Note: You should be able to use "kata-runtime exec $cid", but that isn't
# working currently.
#
# The console is a socat(1) process running in its own detached tmux
# session. For QEMU socat connects to the agent VSOCK address, for
# Cloud Hypervisor it connects to the UNIX socket in 'clh_socket_path'.
connect_to_vsock_debug_console()
{
    local agent_addr

    if [ "$dry_run" != 'true' ]
    then
        agent_addr=$(get_agent_vsock_address || true)
        [ -z "$agent_addr" ] && die "cannot determine agent VSOCK address"
    else
        agent_addr=$(get_dry_run_agent_vsock_address)
    fi

    local socat_connect=

    case "$configured_hypervisor" in
        qemu)
            # Turn the "vsock://" scheme into socat's address type.
            socat_connect="${agent_addr/#vsock:\/\//vsock-connect:}"
            ;;
        clh)
            socat_connect="unix-connect:${clh_socket_path}"
            ;;
        *)
            die "Cannot configure address for socat, unknown hypervisor: '$configured_hypervisor'"
            ;;
    esac

    local console_cmd="socat '${socat_connect}' stdout"

    run_cmd "tmux new-session -d -s \"$KATA_TMUX_CONSOLE_SESSION\" \"${console_cmd}\""
}
# Clean up the test environment.
#
# Installed as the EXIT trap by setup() and also called explicitly as
# "cleanup initial" before the first test run.
#
# Arguments:
#   $1 - optional; 'initial' when called to prepare a clean environment
#        rather than to tidy up after a run.
#
# Behaviour:
#   - dry-run mode: nothing is done.
#   - success with keep_logs set (and not 'initial'): logs are kept and
#     their locations shown.
#   - failure (and not 'initial'): nothing is removed so the failure
#     can be debugged; the log locations are shown.
#   - otherwise: tmux sessions, configuration changes, logs, containers
#     and the sandbox namespace directory are removed, and containerd
#     is restarted.
cleanup()
{
    # Save the result of the last call made before
    # this handler was called.
    #
    # XXX: This *MUST* be the first command in this function!
    local failure_ret="$?"

    local arg="${1:-}"

    [ "$dry_run" = 'true' ] && return 0

    # An 'initial' call happens before any test has run: it must
    # always clean, never report success.
    if [ "$failure_ret" -eq 0 ] && \
        [ "$keep_logs" = 'true' ] && \
        [ "$arg" != 'initial' ]
    then
        info "SUCCESS: Test passed, but leaving logs:"
        info ""
        info "agent log file : ${agent_log_file}"
        info "agent-ctl log file : ${ctl_log_file}"
        info "OCI bundle directory : ${bundle_dir}"
        return 0
    fi

    if [ "$failure_ret" -ne 0 ] && [ "$arg" != 'initial' ]
    then
        warn "ERROR: Test failed"
        warn ""
        warn "Not cleaning up to help debug failure:"
        warn ""
        info "agent-ctl log file : ${ctl_log_file}"
        info "agent log file : ${agent_log_file}"
        info "OCI bundle directory : ${bundle_dir}"
        return 0
    fi

    kill_tmux_sessions
    unconfigure_kata

    if [ "$arg" != 'initial' ] && [ -d "$bundle_dir" ]
    then
        rm -rf "$bundle_dir"
    fi

    sudo rm -f \
        "$agent_log_file" \
        "$ctl_log_file"

    clean_env_ctr &>/dev/null || true

    local sandbox_dir="/run/sandbox-ns/"

    # XXX: Without doing this, the agent will hang attempting to create the
    # XXX: namespaces (in function "setup_shared_namespaces()")
    sudo umount -f "${sandbox_dir}/uts" "${sandbox_dir}/ipc" &>/dev/null || true
    sudo rm -rf "${sandbox_dir}" &>/dev/null || true

    # Check that clh socket was deleted.
    #
    # Use -e rather than -f: a socket is not a regular file, so -f
    # could never detect a leftover socket.
    if [ "$configured_hypervisor" = "clh" ] && \
        [ -n "$clh_socket_path" ] && \
        [ -e "$clh_socket_path" ]
    then
        die "CLH socket path $clh_socket_path was not properly cleaned up"
    fi

    sudo systemctl restart containerd
}
# Prepare containerd for the tests.
#
# Ensures the containerd configuration file has a "[debug]" section with
# level "debug" (appending one and restarting containerd if not), then
# pulls the test image.
setup_containerd()
{
    local file="/etc/containerd/config.toml"

    if [ ! -e "$file" ]
    then
        die "missing containerd config file: '$file'"
    fi

    # Although the containerd config file is in TOML format, crudini(1)
    # won't parse it due to the indentation it uses.
    #
    # The sed script collects each blank-line separated paragraph and
    # keeps only the one containing "[debug]".
    local containerd_debug_enabled
    containerd_debug_enabled=$(sed -e '/./{H;$!d;}' -e 'x;/\[debug\]/!d;' "$file" | grep "level *= *\"debug\"" || true)

    if [ -z "$containerd_debug_enabled" ]
    then
        sudo tee -a "$file" <<-EOF
[debug]
# Allow Kata Containers debug messages to be propageted
# into the hosts journal.
# (use "journalctl -t kata" to view).
level = "debug"
EOF
        sudo systemctl restart containerd
    fi

    sudo ctr image pull "$CTR_IMAGE"

    true
}
# Populate a directory with the root filesystem of "$DOCKER_IMAGE".
#
# Arguments:
#   $1 - existing directory to unpack the root filesystem into.
#
# A temporary container is created from the image, exported as a tar
# stream and unpacked. The temporary container is removed afterwards so
# that repeated runs do not accumulate stopped containers.
#
# Returns: the status of the export/unpack pipeline.
create_oci_rootfs()
{
    local dir="${1:-}"
    [ -z "$dir" ] && die "Need OCI rootfs dir"

    local cid
    cid=$(sudo docker create "$DOCKER_IMAGE")
    [ -n "$cid" ] || die "failed to create container from image '$DOCKER_IMAGE'"

    local ret=0
    sudo docker export "$cid" |\
        tar -C "${dir}" -xvf - >/dev/null || ret=$?

    # Best effort: failing to remove the temporary container must
    # not hide the result of the export.
    sudo docker rm -f "$cid" >/dev/null || true

    return "$ret"
}
# Create an OCI bundle in a new temporary directory.
#
# The directory path is stored in (and exported as) 'bundle_dir'. The
# bundle consists of a root filesystem created by create_oci_rootfs()
# and a config.json generated by "runc spec".
setup_oci_bundle()
{
    bundle_dir="$(mktemp -d)"
    export bundle_dir

    info "Creating OCI bundle in directory: '$bundle_dir'"

    local rootfs_dir="${bundle_dir}/rootfs/"
    local config="${bundle_dir}/config.json"

    mkdir -p "$rootfs_dir"
    create_oci_rootfs "$rootfs_dir"

    # "runc spec" writes config.json into the current directory, so
    # run it from inside the bundle without changing our own cwd.
    (
        cd "$bundle_dir" &>/dev/null
        runc spec
    )

    [ -e "$config" ] || die "no OCI config file at ${config}"
}
# One-time test setup.
#
# - Resolves the hypervisor under test from KATA_HYPERVISOR (only
#   'qemu' and 'clh' are accepted).
# - Installs cleanup() as the EXIT trap.
# - Checks that every required external command is available.
# - Creates an OCI bundle unless one was supplied or dry-run is active.
# - Determines the Kata configuration file and checks it has a section
#   for the configured hypervisor.
# - Prepares containerd.
#
# Globals written: configured_hypervisor, configured_hypervisor_cfg,
# hypervisor_binary, kata_cfg_file (and bundle_dir via
# setup_oci_bundle()).
setup()
{
    configured_hypervisor="${KATA_HYPERVISOR:-}"

    case "$configured_hypervisor" in
        qemu)
            hypervisor_binary="qemu-system-${arch}"
            configured_hypervisor_cfg="qemu"
            ;;
        clh)
            hypervisor_binary="cloud-hypervisor"
            configured_hypervisor_cfg="clh"
            ;;
        *)
            # Name the values that are actually accepted.
            local msg=""
            msg+="Exiting as hypervisor test dependency not met"
            msg+=" (expected 'qemu' or 'clh', found '${configured_hypervisor}')"
            die "$msg"
            ;;
    esac

    info "Configured hypervisor is $configured_hypervisor"

    trap cleanup EXIT

    # Don't mess with an existing tmux session
    unset TMUX

    local cmds=()
    # For parsing TOML config files
    cmds+=('crudini')
    # For container manager (containerd)
    cmds+=('ctr')
    # for OCI bundle creation
    cmds+=('docker')
    cmds+=('runc')
    # For parsing the JSON output of kata-runtime
    cmds+=('jq')
    # For querying VSOCK sockets
    cmds+=('socat')
    # For launching processes
    cmds+=('tmux')

    # Check the dependencies *before* anything uses them so a
    # missing tool is reported by name.
    local cmd
    for cmd in "${cmds[@]}"
    do
        command -v "$cmd" &>/dev/null || die "need $cmd"
    done

    # A plain 'if' (rather than "a && b && c || true") keeps a
    # failure to create the bundle fatal.
    if [ "$dry_run" = 'false' ] && [ -z "$bundle_dir" ]
    then
        setup_oci_bundle
    fi

    kata_cfg_file=$(kata-runtime kata-env --json |\
        jq -r '.Runtime.Config.Path // empty' || true)
    [ -z "$kata_cfg_file" ] && die "Cannot determine config file"

    sudo mkdir -p "$(dirname "$kata_cfg_file")"

    #------------------------------
    # Check configured hypervisor
    local hypervisor_section="hypervisor.${configured_hypervisor_cfg}"

    crudini --get "${kata_cfg_file}" "${hypervisor_section}" &>/dev/null || \
        die "Configured hypervisor ${configured_hypervisor} does not match config file ${kata_cfg_file}"

    setup_containerd
}
# Start the agent binary directly on the host in a detached tmux
# session.
#
# Arguments:
#   $1 - file that receives the agent's stdout and stderr.
#
# Dies if an agent process is already running. Unless dry-run mode is
# active, waits for the agent to start and returns the result of that
# wait.
start_local_agent()
{
    local log_file="${1:-}"
    [ -z "$log_file" ] && die "need agent log file"

    local running
    running=$(get_local_agent_pid || true)
    [ -n "$running" ] && die "agent already running: '$running'"

    # Note: it's imperative that we capture stderr to the log file
    # as the agent writes the shutdown message to this stream!
    local agent_cmd="sudo"
    agent_cmd+=" RUST_BACKTRACE=full"
    agent_cmd+=" KATA_AGENT_LOG_LEVEL=${agent_log_level}"
    agent_cmd+=" KATA_AGENT_SERVER_ADDR=${local_agent_server_addr}"
    agent_cmd+=" ${agent_binary}"
    agent_cmd+=" &> ${log_file}"

    run_cmd "tmux new-session -d -s \"$KATA_TMUX_LOCAL_SESSION\" \"${agent_cmd}\""

    # A plain 'if' (rather than "a && b || true") so that a failed
    # wait is reported to the caller instead of being swallowed.
    if [ "$dry_run" = 'false' ]
    then
        wait_for_local_agent_to_start
    fi
}
# Wait for the agent inside a Kata VM to finish starting.
#
# Arguments:
#   $1 - container ID.
#
# First waits for containerd to report the container task as RUNNING,
# then waits until an agent VSOCK address can be determined.
# waitForProcess is not defined in this file (presumably provided by
# the sourced common.bash).
wait_for_kata_vm_agent_to_start()
{
    local cid="${1:-}"
    # Validate the argument actually passed in.
    [ -z "$cid" ] && die "need container ID"

    # First, check the containerd status of the container
    local cmd="sudo ctr task list | grep \"${cid}\" | grep -q \"RUNNING\""

    info "Waiting for VM to start (cid: '$cid')"
    waitForProcess \
        "$wait_time_secs" \
        "$sleep_time_secs" \
        "$cmd"

    show_procs

    # Next, ensure there is a valid VSOCK address for the VM
    info "Waiting for agent VSOCK server"
    cmd="get_agent_vsock_address_simple >/dev/null"
    waitForProcess \
        "$wait_time_secs" \
        "$sleep_time_secs" \
        "$cmd"

    info "Kata VM running"
}
# Probe the local agent by sending it a "Check" request with the agent
# control tool.
#
# Returns: the status of the agent control tool, so that a polling
# caller can tell whether the agent is answering yet. (Unconditionally
# returning success would make any wait based on this probe finish
# immediately.)
check_local_agent_alive()
{
    local cmds=()
    cmds+=("-c Check")

    run_agent_ctl \
        "${local_agent_ctl_server_addr}" \
        "${cmds[@]}"
}
# Wait until the local agent answers the liveness probe.
#
# Polls check_local_agent_alive() using waitForProcess (not defined in
# this file; presumably provided by the sourced common.bash) with the
# configured wait and sleep times.
wait_for_local_agent_to_start()
{
    info "Waiting for agent process to start"

    waitForProcess "$wait_time_secs" "$sleep_time_secs" "check_local_agent_alive"

    info "Kata agent process running"
}
# Create a Kata Container that blocks "forever".
#
# Arguments:
#   $1 - agent log file (validated only; the VM agent's logs are
#        collected from the journal later).
#
# The container is started with "ctr run" inside a detached tmux
# session. Unless dry-run mode is active, waits for the VM and agent to
# start and returns the result of that wait.
start_agent_in_kata_vm()
{
    local log_file="${1:-}"
    [ -z "$log_file" ] && die "need agent log file"

    # Allow containerd to run on a ZFS root filesystem
    local snapshotter=""
    if zfs list &>/dev/null
    then
        snapshotter='zfs'
    fi

    # Ensure the container blocks forever
    local cmd='tail -f /dev/null'

    local ctr_cmd="sudo ctr run"
    ctr_cmd+=" --snapshotter '$snapshotter'"
    ctr_cmd+=" --runtime '${CTR_RUNTIME}'"
    ctr_cmd+=" --rm"
    ctr_cmd+=" -t '${CTR_IMAGE}'"
    ctr_cmd+=" '$container_id'"
    ctr_cmd+=" $cmd"

    run_cmd "tmux new-session -d -s \"$KATA_TMUX_VM_SESSION\" \"${ctr_cmd}\""

    # A plain 'if' (rather than "a && b || true") so that a VM that
    # never starts is reported instead of being silently ignored.
    if [ "$dry_run" = 'false' ]
    then
        wait_for_kata_vm_agent_to_start "$container_id"
    fi
}
# Start the agent in the way selected by the agent test type.
#
# Arguments:
#   $1 - agent test type ('local' or 'vm').
#   $2 - agent log file.
start_agent()
{
    local agent_test_type="${1:-}"
    local log_file="${2:-}"

    [ -z "$agent_test_type" ] && die "need agent test type"
    [ -z "$log_file" ] && die "need agent log file"

    if [ "$agent_test_type" = 'local' ]
    then
        start_local_agent "$log_file"
    elif [ "$agent_test_type" = 'vm' ]
    then
        start_agent_in_kata_vm "$log_file"
    else
        die "invalid agent test type: '$agent_test_type'"
    fi

    return 0
}
# Run the agent control tool against an agent.
#
# Arguments:
#   $1     - agent ttRPC server address.
#   $2.. - agent-ctl command options (for example "-c Check"); they are
#          joined into a single string.
#
# The command line is handed to run_cmd(), which re-parses it, so the
# embedded quotes and the output redirection below are deliberate.
# In dry-run mode the redirection is dropped and 'bundle_dir' is set to
# a placeholder.
run_agent_ctl()
{
    local server_addr="${1:-}"
    shift
    local cmds="${*:-}"

    [ -n "$server_addr" ] || die "need agent ttRPC server address"
    [ -n "$cmds" ] || die "need commands for agent control tool"

    local agent_ctl_path="/opt/kata/bin/kata-agent-ctl"

    # Send all tool output to the agent-ctl log file.
    local redirect="&>\"${ctl_log_file}\""

    if [ "$dry_run" = 'true' ]
    then
        redirect=""
        bundle_dir="FIXME-set-to-OCI-bundle-directory"
    fi

    local server_address="--server-address \"${server_addr}\""

    case "$configured_hypervisor" in
        qemu)
            ;;
        clh)
            server_address+=" --hybrid-vsock"
            ;;
        *)
            die "Cannot configure server address, unknown hypervisor: '$configured_hypervisor'"
            ;;
    esac

    local -a ctl_cmdline=(
        sudo
        RUST_BACKTRACE=full
        "${agent_ctl_path}"
        -l debug
        connect
        "${server_address}"
        --bundle-dir "${bundle_dir}"
        "${cmds}"
        "${redirect}"
    )

    run_cmd "${ctl_cmdline[@]}"
}
# Ask the local agent to shut down.
#
# This function "cheats" a little - it gets the agent
# to do some work *and then* stops it.
stop_local_agent()
{
    local -a cmds=(
        "-c Check"
        "-c GetGuestDetails"
        "-c 'sleep 1s'"
        "-c DestroySandbox"
    )

    run_agent_ctl "${local_agent_ctl_server_addr}" "${cmds[@]}"
}
# Print the socket addresses that currently belong to the agent
# connection, as reported by ss(8).
#
# - qemu: peer addresses of established VSOCK connections (ignoring
#   socat) whose port is EXPECTED_VSOCK_PORT.
# - clh: local addresses of established connections that mention
#   'clh_socket_path'.
#
# Outputs: all addresses on ONE line, separated by single spaces (an
# empty line if there are none).
get_addresses()
{
    local found=

    case "$configured_hypervisor" in
        qemu)
            found=$(ss -Hp --vsock \
                | grep -v -E "\<socat\>" \
                | awk '$2 ~ /^ESTAB$/ {print $6}' \
                | grep ":${EXPECTED_VSOCK_PORT}$")
            ;;
        clh)
            # since we preconfigured the socket, we are checking to see if it is reported
            found=$(ss -Hp \
                | grep "${clh_socket_path}" \
                | awk '$2 ~ /^ESTAB$/ {print $5}')
            ;;
        *)
            die "Cannot retrieve address, unknown hypervisor: '$configured_hypervisor'"
            ;;
    esac

    # NOTE(review): the expansion is left unquoted on purpose here -
    # it folds multiple results onto a single line, which the callers
    # in this file appear to rely on. Confirm before quoting it.
    echo ${found}
}
# Non-fatal variant of get_agent_vsock_address(), suitable for polling.
#
# Outputs: the agent address URI ("vsock://CID:PORT" for qemu,
# "unix://PATH" for clh) on success.
#
# Returns: 0 if an address was printed, 1 if no usable address is
# available yet (nothing is printed). An unknown hypervisor is still
# fatal.
get_agent_vsock_address_simple()
{
    local addresses
    addresses=$(get_addresses) || return 1
    [ -z "$addresses" ] && return 1

    # Count whitespace-separated entries so the result does not
    # depend on whether they arrive on one line or several.
    local expected_count=1
    local count
    count=$(( $(echo "$addresses" | wc -w) ))

    case "$configured_hypervisor" in
        qemu)
            # Exactly one VSOCK entry is expected; anything else
            # cannot be turned into a single valid URI.
            [ "$count" -eq "$expected_count" ] || return 1

            local cid
            local port
            cid=$(echo "$addresses"|cut -d: -f1)
            port=$(echo "$addresses"|cut -d: -f2)
            echo "vsock://${cid}:${port}"
            ;;
        clh)
            # More than one entry is possible; use the first.
            local address
            address=$(echo "$addresses" | awk 'NR==1{print $1}')
            echo "unix://${address}"
            ;;
        *)
            die "Cannot get agent vsock address, unknown hypervisor: '$configured_hypervisor'"
            ;;
    esac

    return 0
}
# Determine the address of the agent running in the Kata VM.
#
# Outputs: the agent address URI ("vsock://CID:PORT" for qemu,
# "unix://PATH" for clh).
#
# Fatal errors: no address found, more than one address for qemu, or
# an unknown hypervisor.
get_agent_vsock_address()
{
    local addresses
    addresses=$(get_addresses) || true
    [ -z "$addresses" ] && die "no VSOCK connections found"

    # Count whitespace-separated entries so the check works whether
    # they arrive on one line or several.
    local expected_count=1
    local count
    count=$(( $(echo "$addresses" | wc -w) ))

    case "$configured_hypervisor" in
        qemu)
            # For QEMU we always expect 1 result. For Cloud Hypervisor, if a debug console is configured
            # and running, we will have more than 1 result, so only run this check for QEMU
            [ "$count" -eq "$expected_count" ] \
                || die "expected $expected_count VSOCK entry, found $count: '$addresses'"

            local cid
            local port
            cid=$(echo "$addresses"|cut -d: -f1)
            port=$(echo "$addresses"|cut -d: -f2)
            echo "vsock://${cid}:${port}"
            ;;
        clh)
            local address
            address=$(echo "$addresses" | awk 'NR==1{print $1}')
            echo "unix://${address}"
            ;;
        *)
            die "Cannot get agent vsock address, unknown hypervisor: '$configured_hypervisor'"
            ;;
    esac
}
# Ask the agent running inside the Kata VM to shut down.
#
# Determines the agent address (a placeholder in dry-run mode) and uses
# the agent control tool to send a few query requests followed by
# DestroySandbox. Always returns success.
stop_agent_in_kata_vm()
{
    local agent_addr

    if [ "$dry_run" != 'true' ]
    then
        agent_addr=$(get_agent_vsock_address || true)
        [ -z "$agent_addr" ] && \
            die "cannot determine agent VSOCK address for $hypervisor_binary"
    else
        agent_addr=$(get_dry_run_agent_vsock_address)
    fi

    # List of API commands to send to the agent:
    #
    # - A couple of query commands first to ensure the agent is
    #   listening.
    # - Creating a container implies creating a sandbox, so request
    #   agent/VM/container shutdown by asking the agent to destroy
    #   the sandbox.
    local -a cmds=(
        "-c Check"
        "-c GetGuestDetails"
        "-c DestroySandbox"
    )

    run_agent_ctl "${agent_addr}" "${cmds[@]}"

    return 0
}
# Stop the agent in the way selected by the agent test type.
#
# Arguments:
#   $1 - agent test type ('local' or 'vm').
#   $2 - agent-ctl log file (must be non-empty; not otherwise used here).
stop_agent()
{
    info "Stopping agent"

    local agent_test_type="${1:-}"
    [ -z "$agent_test_type" ] && die "need agent test type"

    local log_file="${2:-}"
    [ -z "$log_file" ] && die "need agent-ctl log file"

    if [ "$agent_test_type" = 'local' ]
    then
        stop_local_agent
    elif [ "$agent_test_type" = 'vm' ]
    then
        stop_agent_in_kata_vm
    else
        die "invalid agent test type: '$agent_test_type'"
    fi

    return 0
}
# Print the PID of the locally running agent process, if any.
#
# Outputs: the PID, or nothing if no agent process is running.
#
# Dies if more than one agent process is found.
get_local_agent_pid()
{
    local name
    name=$(basename "$agent_binary")

    # Match the process name exactly: a plain pattern match would
    # also hit other processes whose name merely contains the agent
    # name (such as the agent control tool).
    local pids
    pids=$(pgrep -x "$name" || true)
    [ -z "$pids" ] && return 0

    local count
    count=$(echo "$pids"|wc -l)
    [ "$count" -gt 1 ] && \
        die "too many agent processes running ($count, '$pids')"

    echo "$pids"
}
# Make sure the agent log file exists and contains the agent logs.
#
# Arguments:
#   $1 - agent test type ('local' or 'vm').
#   $2 - agent log file.
#
# For 'local' nothing is written: the file should have been created by
# start_local_agent(). For 'vm' the journal entries tagged 'kata' since
# 'test_start_time' are written to the file.
#
# Dies if the file is missing or empty afterwards.
get_agent_log_file()
{
    local agent_test_type="${1:-}"
    [ -z "$agent_test_type" ] && die "need agent test type"

    local log_file="${2:-}"
    [ -z "$log_file" ] && die "need agent log file"

    info "Getting agent log details"

    if [ "$agent_test_type" = 'vm' ]
    then
        # Extract journal entries for the duration of the test
        sudo journalctl -q -a -o cat -t 'kata' \
            --since="$test_start_time" \
            > "$log_file"
    elif [ "$agent_test_type" != 'local' ]
    then
        die "invalid agent test type: '$agent_test_type'"
    fi

    [ -e "$log_file" ] || die "no log file: '$log_file'"
    [ -s "$log_file" ] || die "empty log file: '$log_file'"

    true
}
# Function to run to ensure correct behaviour.
#
# Arguments:
#   $1 - agent test type ('local' or 'vm').
#   $2 - shutdown test type name.
#   $3 - agent log file.
#
# Collects the agent log, fails if it contains a known error pattern
# and - when the agent debug logs are available - fails if the agent's
# shutdown message is missing. In both failure cases the log is dumped
# to stdout first.
validate_agent()
{
    local agent_test_type="${1:-}"
    local shutdown_test_type="${2:-}"
    local log_file="${3:-}"

    [ -z "$agent_test_type" ] && die "need agent test type"
    [ -z "$shutdown_test_type" ] && die "need shutdown test type"
    [ -z "$log_file" ] && die "need agent log file"

    info "validating"

    get_agent_log_file \
        "$agent_test_type" \
        "$log_file"

    # Regular expression that describes possible agent failures
    local regex="(slog::Fuse|Drain|Custom|serialization error|thread.*panicked|stack backtrace:)"

    if grep -q -E "$regex" "$log_file"
    then
        cat "$log_file"
        die "Found agent error in log file: '$log_file'"
    fi

    local entry
    entry=$(get_shutdown_test_type_entry "$shutdown_test_type" || true)
    [ -z "$entry" ] && die "invalid test type: '$shutdown_test_type'"

    local hypervisor_debug
    local vsock_console
    hypervisor_debug=$(echo "$entry"|cut -d: -f3)
    vsock_console=$(echo "$entry"|cut -d: -f4)

    local agent_debug_logs_available='false'
    if [ "$hypervisor_debug" = 'true' ] && [ "$vsock_console" = 'false' ]
    then
        agent_debug_logs_available='true'
    fi

    if [ "$agent_debug_logs_available" = 'true' ] || [ "$agent_test_type" = 'local' ]
    then
        # The message the agent writes to stderr just before it exits.
        local done_msg="\<shutdown complete\>"

        # Call die from this shell, not from a "( ... )" subshell:
        # inside a subshell it would only end the subshell, leaving
        # the failure to be noticed (or not) by the errexit setting.
        if ! grep -q -E "$done_msg" "$log_file"
        then
            cat "$log_file"
            die "missing agent shutdown message"
        fi
    else
        # We can only check for the shutdown message if the agent debug
        # logs are available.
        info "Not checking for agent shutdown message as hypervisor debug disabled"
    fi
}
# Prepare for a single test run: remove any tmux sessions left over
# from an earlier run and apply the Kata configuration for the given
# shutdown test type.
#
# Arguments:
#   $1 - shutdown test type name.
setup_agent()
{
    local shutdown_test_type="${1:-}"

    if [ -z "$shutdown_test_type" ]
    then
        die "need shutdown test type"
    fi

    kill_tmux_sessions
    configure_kata "$shutdown_test_type"

    return 0
}
# Start the trace forwarder in a detached tmux session.
#
# Even though this test is not testing tracing, agent tracing needs to be
# enabled to stop the runtime from killing the VM. However, if tracing is
# enabled, the forwarder must be running. To remove the need for Jaeger to
# also be running, run the forwarder in "NOP" mode.
run_trace_forwarder()
{
    local forwarder_binary_path="/opt/kata/bin/kata-trace-forwarder"

    # If using CLH, socket path must be passed to trace forwarder
    local socket_path_tf=""
    if [ "$configured_hypervisor" = "clh" ]
    then
        socket_path_tf="--socket-path ${clh_socket_path}"
    fi

    local forwarder_cmd="$forwarder_binary_path --dump-only -l trace ${socket_path_tf}"

    run_cmd "tmux new-session -d -s \"$KATA_TMUX_FORWARDER_SESSION\" sudo \"${forwarder_cmd}\""
}
# Wait for the agent (or the VM it runs in) to stop.
#
# Arguments:
#   $1 - agent test type ('local' or 'vm').
#
# Polls the matching "stopped" check through waitForProcess (not
# defined in this file; presumably provided by the sourced common.bash)
# for up to 'shutdown_time_secs'. Always returns success.
check_agent_stopped()
{
    info "Checking agent stopped"

    local agent_test_type="${1:-}"
    [ -z "$agent_test_type" ] && die "need agent test type"

    local cmd=
    if [ "$agent_test_type" = 'local' ]
    then
        cmd=check_local_agent_stopped
    elif [ "$agent_test_type" = 'vm' ]
    then
        cmd=check_vm_stopped
    else
        die "invalid agent test type: '$agent_test_type'"
    fi

    waitForProcess "$shutdown_time_secs" "$sleep_time_secs" "$cmd"

    return 0
}
# Check that the local agent process has ended.
#
# Returns 0 as soon as no agent process is found, or once the agent PID
# no longer accepts signal 0. The PID is probed up to 20 times, 0.2
# seconds apart; if the process is still there afterwards the script
# dies.
check_local_agent_stopped()
{
    local agent_pid
    agent_pid=$(get_local_agent_pid || true)

    # Agent has finished
    [ -z "$agent_pid" ] && return 0

    local max=20
    local _
    for _ in $(seq "$max")
    do
        # "kill -0" sends no signal; it only reports whether the
        # process still exists.
        if ! sudo kill -0 "$agent_pid"
        then
            return 0
        fi

        sleep 0.2
    done

    die "agent still running: pid $agent_pid"
}
# Print the PID(s) of processes matching the hypervisor binary name.
#
# Returns: the status of pgrep(1) (non-zero if nothing matched).
get_vm_pid()
{
    local process_name="$hypervisor_binary"

    pgrep "$process_name"
}
# Check whether the Kata VM has stopped.
#
# The VM is considered running for as long as the tmux session it was
# started in is still listed.
#
# Returns: 0 if the VM session is gone, 1 if it still exists.
check_vm_stopped()
{
    local session_regex="^${KATA_TMUX_VM_SESSION}:"

    if tmux list-sessions | grep -q "$session_regex"
    then
        return 1
    fi

    return 0
}
# Connect to the agent's debug console, if that is possible for the
# given agent test type.
#
# Arguments:
#   $1 - agent test type ('local' or 'vm').
#   $2 - shutdown test type name (must be non-empty).
start_debug_console()
{
    local agent_test_type="${1:-}"
    local shutdown_test_type="${2:-}"

    [ -z "$agent_test_type" ] && die "need agent test type"
    [ -z "$shutdown_test_type" ] && die "need shutdown test type"

    info "Starting debug console"

    if [ "$agent_test_type" = 'vm' ]
    then
        connect_to_vsock_debug_console
    elif [ "$agent_test_type" = 'local' ]
    then
        # NOP for a local agent since we cannot connect to the agents
        # VSOCK console socket from *outside* the host!
        true
    else
        die "invalid agent test type: '$agent_test_type'"
    fi

    return 0
}
# Run one shutdown test: configure Kata, start the trace forwarder and
# the agent, optionally connect to the debug console, ask the agent to
# shut down, then check that it stopped and validate its logs.
#
# Arguments:
#   $1 - agent test type ('local' or 'vm').
#   $2 - shutdown test type name.
#
# Globals written: test_start_time, test_end_time and (for clh)
# clh_socket_path.
#
# In dry-run mode the script exits successfully once the stop commands
# have been shown.
run_single_agent()
{
    local agent_test_type="${1:-}"
    local shutdown_test_type="${2:-}"

    [ -z "$agent_test_type" ] && die "need agent test type"
    [ -z "$shutdown_test_type" ] && die "need shutdown test type"

    local msg
    msg=$(printf \
        "Testing agent (agent test type: '%s', shutdown test type: '%s')" \
        "$agent_test_type" \
        "$shutdown_test_type")
    info "$msg"

    setup_agent "$shutdown_test_type"

    if [ "$configured_hypervisor" = "clh" ]
    then
        # CLH uses hybrid VSOCK which uses a local UNIX socket that we need to specify
        local socket_path_template
        socket_path_template="${clh_socket_prefix}$(sudo kata-runtime env --json | jq '.Hypervisor.SocketPath')"

        # Fill in the container ID and drop the JSON string quotes.
        # Shell substitutions are used so that a '/' in the ID
        # cannot break the replacement.
        clh_socket_path=${socket_path_template//\{ID\}/${container_id}}
        clh_socket_path=${clh_socket_path//\"/}

        if [ "$dry_run" = 'false' ]
        then
            sudo mkdir -p "$(dirname "$clh_socket_path")"
        fi
    fi

    run_trace_forwarder "$shutdown_test_type"
    sleep 5s

    test_start_time=$(date '+%F %T')

    start_agent \
        "$agent_test_type" \
        "$agent_log_file"

    info "Testing agent: shutdown test type: '$shutdown_test_type', agent test type: $agent_test_type"

    local entry
    entry=$(get_shutdown_test_type_entry "$shutdown_test_type" || true)

    local debug_console
    debug_console=$(echo "$entry"|cut -d: -f4)

    if [ "$debug_console" = 'true' ]
    then
        start_debug_console \
            "$agent_test_type" \
            "$shutdown_test_type"
    fi

    stop_agent \
        "$agent_test_type" \
        "$ctl_log_file"

    # We only need to show the set of commands once
    [ "$dry_run" = 'true' ] && exit 0

    test_end_time=$(date '+%F %T')

    check_agent_stopped "$agent_test_type"

    validate_agent \
        "$agent_test_type" \
        "$shutdown_test_type" \
        "$agent_log_file"
}
# Run the requested shutdown test type(s) for one agent test type.
#
# Arguments:
#   $1 - agent test type ('local' or 'vm').
#   $2 - shutdown test type name, or ALL_TEST_TYPES to run every entry
#        of the 'shutdown_test_types' array in order.
#
# When all types are run, the log files are removed after each one and
# the script dies if any agent socket address is still reported.
run_agent()
{
    local agent_test_type="${1:-}"
    local shutdown_test_type="${2:-}"

    [ -z "$agent_test_type" ] && die "need agent test type"
    [ -z "$shutdown_test_type" ] && die "need shutdown test type"

    # A single, named shutdown test type.
    if [ "$shutdown_test_type" != "$ALL_TEST_TYPES" ]
    then
        run_single_agent \
            "$agent_test_type" \
            "$shutdown_test_type"
        return
    fi

    # Run all shutdown types
    local entry
    local name
    local addresses

    for entry in "${shutdown_test_types[@]}"
    do
        name="${entry%%:*}"

        run_single_agent \
            "$agent_test_type" \
            "$name"

        # Clean up between iterations
        sudo rm -f \
            "$ctl_log_file" \
            "$agent_log_file"

        addresses=$(get_addresses || true)
        [ -z "$addresses" ] || \
            die "found unexpected vsock addresses: '$addresses'"
    done
}
# Run the agent shutdown test the requested number of times.
#
# Arguments:
#   $1 - number of iterations.
#   $2 - agent test type ('local' or 'vm').
#   $3 - shutdown test type name (or the "all" pseudo type).
test_agent_shutdown()
{
    local count="${1:-}"
    local agent_test_type="${2:-}"
    local shutdown_test_type="${3:-}"

    [ -z "$count" ] && die "need count"
    [ -z "$agent_test_type" ] && die "need agent test type"
    [ -z "$shutdown_test_type" ] && die "need shutdown test type"

    # Start with a clean environment (best effort).
    if [ "$dry_run" = 'false' ]
    then
        cleanup initial || true
    fi

    local i
    for i in $(seq "$count")
    do
        if [ "$dry_run" = 'false' ]
        then
            info "testing agent: run $i of $count" || true
        fi

        run_agent \
            "$agent_test_type" \
            "$shutdown_test_type"
    done

    info "testing agent: completed $count runs"
}
# Parse the command-line options, then set up and run the tests.
#
# Arguments: the script's command-line arguments.
#
# Options -h and -l print their output and exit successfully; an
# unknown option is fatal. Options -k and -n set the global 'keep_logs'
# and 'dry_run' flags.
handle_args()
{
    # Values that options may override.
    local count="${KATA_AGENT_SHUTDOWN_TEST_COUNT}"
    local agent_test_type="$DEFAULT_AGENT_TEST_TYPE"
    local shutdown_test_type="$DEFAULT_SHUTDOWN_TEST_TYPE"

    local opt
    while getopts "a:c:dhklnt:" opt "$@"
    do
        case "$opt" in
            a)
                agent_test_type="$OPTARG"
                ;;
            c)
                count="$OPTARG"
                ;;
            d)
                set -o xtrace
                ;;
            h)
                usage
                exit 0
                ;;
            k)
                keep_logs='true'
                ;;
            l)
                list_test_types
                exit 0
                ;;
            n)
                dry_run='true'
                ;;
            t)
                shutdown_test_type="$OPTARG"
                ;;
            *)
                die "invalid option: '$opt'"
                ;;
        esac
    done

    setup

    test_agent_shutdown "$count" "$agent_test_type" "$shutdown_test_type"
}
# Script entry point: hand all command-line arguments to handle_args(),
# which parses them and runs the tests.
main()
{
handle_args "$@"
}
# Run the script, passing the command-line arguments through unchanged.
main "$@"