kata-containers/tests/functional/tracing/tracing-test.sh
Gabriela Cervantes 8a79b1449e tests: Remove CI variable in tracing test
This PR removes the CI variable as well as the instructions related
to this as this was part of the jenkins environment which is not
longer supported it.

Signed-off-by: Gabriela Cervantes <gabriela.cervantes.tellez@intel.com>
2024-07-08 20:12:41 +00:00

532 lines
14 KiB
Bash
Executable File

#!/bin/bash
# Copyright (c) 2019-2022 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
set -o errexit
set -o nounset
set -o pipefail
set -o errtrace
script_name=${0##*/}
source "/etc/os-release" || source "/usr/lib/os-release"
# Set to true if all tests pass
success="false"
DEBUG=${DEBUG:-}
# If set to any value, do not shut down the Jaeger service.
DEBUG_KEEP_JAEGER=${DEBUG_KEEP_JAEGER:-}
# If set to any value, do not shut down the trace forwarder.
DEBUG_KEEP_FORWARDER=${DEBUG_KEEP_FORWARDER:-}
[ -n "$DEBUG" ] && set -o xtrace
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
source "${SCRIPT_PATH}/../../common.bash"
RUNTIME="io.containerd.kata.v2"
CONTAINER_IMAGE="quay.io/prometheus/busybox:latest"
TRACE_LOG_DIR=${TRACE_LOG_DIR:-${KATA_TESTS_LOGDIR}/traces}
KATA_HYPERVISOR="${KATA_HYPERVISOR:-qemu}"
# files for output
formatted_traces_file="kata-traces-formatted.json"
trace_summary_file="span-summary.txt"
# tmux(1) session to run the trace forwarder in
KATA_TMUX_FORWARDER_SESSION="kata-trace-forwarder-session"
forwarder_binary="/opt/kata/bin/kata-trace-forwarder"
# path prefix for CLH socket path
socket_path_prefix="/run/vc/vm/"
container_id="tracing-test"
jaeger_server=${jaeger_server:-localhost}
jaeger_ui_port=${jaeger_ui_port:-16686}
jaeger_docker_container_name="jaeger"
# Span data for testing:
# 1. Existence of spans in jaeger output
# 2. That the relative ordering in the data occurs
# in the jaeger output.
# This is tested to make sure specific spans exist in the output and
# that the order of spans is preserved.
# Ordered in latest in sequence to earliest.
#
# Fields are all span existing span names in relative order from latest
# to earliest call in a sequence of calls. Example (pseudocode):
# func1() {
# span = trace("func1")
# func2()
# end span
# }
# func2() {
# span = trace("func2")
# func3()
# end span
# }
# func3() {
# span = trace("func3")
# end span
# }
# The following data should result in a passing test:
# 'func3:func2:func1'
# 'func3:func2'
# 'func3:func1'
# 'func2:func1'
span_ordering_data=(
'StartVM:createSandboxFromConfig:create:rootSpan'
'setup_shared_namespaces:StartVM:createSandboxFromConfig:create:rootSpan'
'start_container:Start:rootSpan'
'stopSandbox:Stop:Start:rootSpan'
)
# Cleanup will remove Jaeger container and
# disable tracing.
cleanup()
{
local fp="die"
local result="failed"
local dest="$logdir"
if [ "$success" = "true" ]; then
local fp="info"
result="passed"
[ -z "$DEBUG_KEEP_JAEGER" ] && stop_jaeger 2>/dev/null || true
[ -z "$DEBUG_KEEP_FORWARDER" ] && kill_trace_forwarder
# The tests worked so remove the logs
if [ -n "$DEBUG" ]; then
eval "$fp" "test $result - logs left in '$dest'"
else
"${SCRIPT_PATH}/configure_tracing_for_kata.sh" disable
[ -d "$logdir" ] && rm -rf "$logdir" || true
fi
return 0
fi
eval "$fp" "test $result - logs left in '$dest'"
}
# Run an operation to generate Jaeger trace spans
create_traces()
{
sudo ctr image pull "$CONTAINER_IMAGE"
sudo ctr run --runtime "$RUNTIME" --rm "$CONTAINER_IMAGE" "$container_id" true
}
start_jaeger()
{
local jaeger_docker_image="jaegertracing/all-in-one:latest"
sudo docker rm -f "${jaeger_docker_container_name}"
# Defaults - see https://www.jaegertracing.io/docs/getting-started/
sudo docker run -d --runtime runc --name "${jaeger_docker_container_name}" \
-e COLLECTOR_ZIPKIN_HTTP_PORT=9411 \
-p 5775:5775/udp \
-p 6831:6831/udp \
-p 6832:6832/udp \
-p 5778:5778 \
-p "${jaeger_ui_port}:${jaeger_ui_port}" \
-p 14268:14268 \
-p 9411:9411 \
"$jaeger_docker_image"
sudo mkdir -m 0750 -p "$TRACE_LOG_DIR"
}
stop_jaeger()
{
sudo docker stop "${jaeger_docker_container_name}"
sudo docker rm -f "${jaeger_docker_container_name}"
}
get_jaeger_traces()
{
local service="$1"
[ -z "$service" ] && die "need jaeger service name"
local traces_url="http://${jaeger_server}:${jaeger_ui_port}/api/traces?service=${service}"
curl -s "${traces_url}" 2>/dev/null
}
get_trace_summary()
{
local status="$1"
[ -z "$status" ] && die "need jaeger status JSON"
echo "${status}" | jq -S '.data[].spans[] | [.spanID, .operationName] | @sh'
}
get_span_count()
{
local status="$1"
[ -z "$status" ] && die "need jaeger status JSON"
# This could be simplified but creating a variable holding the
# summary is useful in debug mode as the summary is displayed.
local trace_summary=$(get_trace_summary "$status" || true)
[ -z "$trace_summary" ] && die "failed to get trace summary"
local count=$(echo "${trace_summary}" | wc -l)
[ -z "$count" ] && count=0
echo "$count"
}
# Returns status from Jaeger web UI
get_jaeger_status()
{
local service="$1"
local logdir="$2"
[ -z "$service" ] && die "need jaeger service name"
[ -z "$logdir" ] && die "need logdir"
local status=""
local span_count=0
# Find spans
status=$(get_jaeger_traces "$service" || true)
if [ -n "$status" ]; then
echo "$status" | tee "$logdir/${service}-status.json"
span_count=$(get_span_count "$status")
fi
[ -z "$status" ] && die "failed to query Jaeger for status"
[ "$span_count" -eq 0 ] && die "failed to find any trace spans"
[ "$span_count" -le 0 ] && die "invalid span count"
get_trace_summary "$status" > "$logdir/$trace_summary_file"
}
# Check Jaeger spans for the specified service.
check_jaeger_output()
{
local service="$1"
local min_spans="$2"
local logdir="$3"
[ -z "$service" ] && die "need jaeger service name"
[ -z "$min_spans" ] && die "need minimum trace span count"
[ -z "$logdir" ] && die "need logdir"
local status
local errors=0
info "Checking Jaeger status"
status=$(get_jaeger_status "$service" "$logdir")
#------------------------------
# Basic sanity checks
[ -z "$status" ] && die "failed to query status via HTTP"
local span_lines=$(echo "$status"|jq -S '.data[].spans | length')
[ -z "$span_lines" ] && die "no span status"
# Log the spans to allow for analysis in case the test fails
echo "$status"|jq -S . > "$logdir/${service}-traces-formatted.json"
local span_lines_count=$(echo "$span_lines"|wc -l)
# Total up all span counts
local spans=$(echo "$span_lines"|paste -sd+ -|bc)
[ -z "$spans" ] && die "no spans"
# Ensure total span count is numeric
echo "$spans"|grep -q "^[0-9][0-9]*$" || die "invalid span count: '$spans'"
info "found $spans spans (across $span_lines_count traces)"
# Validate
[ "$spans" -lt "$min_spans" ] && die "expected >= $min_spans spans, got $spans"
# Look for common errors in span data
local error_msg=$(echo "$status"|jq -S . 2>/dev/null|grep "invalid parent span" || true)
if [ -n "$error_msg" ]; then
errors=$((errors+1))
warn "Found invalid parent span errors: $error_msg"
else
errors=$((errors-1))
[ "$errors" -lt 0 ] && errors=0
fi
# Crude but it works
error_or_warning_msgs=$(echo "$status" |\
jq -S . 2>/dev/null |\
jq '.data[].spans[].warnings' |\
grep -E -v "\<null\>" |\
grep -E -v "\[" |\
grep -E -v "\]" |\
grep -E -v "clock skew" || true) # ignore clock skew error
if [ -n "$error_or_warning_msgs" ]; then
errors=$((errors+1))
warn "Found errors/warnings: $error_or_warning_msgs"
else
errors=$((errors-1))
[ "$errors" -lt 0 ] && errors=0
fi
[ "$errors" -eq 0 ] || die "errors detected"
}
# Check output for spans in span_ordering_data
check_spans()
{
local logdir="$1"
[ -z "$logdir" ] && die "need logdir"
local errors=0
# Check for existence of spans in output so we do not do the more
# time consuming test of checking span ordering if it will fail
info "Checking spans: ${span_ordering_data[@]}"
local missing_spans=()
for span_ordering in "${span_ordering_data[@]}"; do
local test_spans=(`echo $span_ordering | tr ':' ' '`)
for s in "${test_spans[@]}"; do
grep -q \'$s\' "$logdir/$trace_summary_file" || missing_spans+=( "$s" )
done
done
if [ "${#missing_spans[@]}" -gt 0 ]; then
die "Fail: Missing spans: ${missing_spans[@]}"
fi
# Check relative ordering of spans. We are not checking full trace, just
# that known calls are not out of order based on the test input.
for span_ordering in "${span_ordering_data[@]}"; do # runs maximum length of span_ordering_data
local test_spans=(`echo $span_ordering | tr ':' ' '`)
# create array for span IDs that match span string
local span_ids=()
for span in "${test_spans[@]}"; do
grep -q \'$span\' "$logdir/$trace_summary_file" || die "Fail: Missing span: $span"
id=$(cat "$logdir/$formatted_traces_file" | jq ".data[].spans[] | select(.operationName==\"$span\") | .spanID") || die "Fail: error with span $span retrieved from traces"
id_formatted=$(echo $id | tr -d '\"' | tr '\n' ':') # format to a string for parsing later, not an array
span_ids+=("$id_formatted")
done
# We now have 2 parallel arrays where test_spans[n] is the string name and
# span_ids[n] has all possible span IDs for that string separated by a colon
# Since functions can be called multiple times, we may have multiple results
# for span IDs.
initial_span_ids=(`echo ${span_ids[0]} | tr ':' ' '`)
for initial in "${initial_span_ids[@]}"; do # test parents for all initial spans
# construct array of all parents of first span
local retrieved_spans=()
local current_span="$initial"
[ "$current_span" != "" ] || break
MAX_DEPTH=20 # to prevent infinite loop due to unforeseen errors
for i in `seq 1 $MAX_DEPTH`; do
retrieved_spans+=("$current_span")
current_span=$(cat "$logdir/$formatted_traces_file" | jq ".data[].spans[] | select(.spanID==\"$current_span\") | .references[].spanID") || die "Fail: error with current_span $current_span retrieved from formatted traces"
[ "$current_span" != "" ] || break
current_span=$(echo $current_span | tr -d '"')
[ $i -lt $MAX_DEPTH ] || die "Fail: max depth reached, error in jq or adjust test depth"
done
# Keep track of this index so we can ensure we are testing the constructed array in order
# Increment when there is a match between test case and constructed path
local retrieved_ids_index=0
local matches=0
local index=0
# TODO: Optimize
for ((index=0; index<${#span_ids[@]}; index++)); do
for ((r_index=$retrieved_ids_index; r_index<${#retrieved_spans[@]}; r_index++)); do
grep -q "${retrieved_spans[$r_index]}" <<< ${span_ids[$index]} && (( retrieved_ids_index=$r_index+1 )) && (( matches+=1 )) && break
done
done
local last_initial_span_index=${#initial_span_ids[@]}-1
if [ $matches -eq ${#span_ids[@]} ]; then
info "Pass: spans \"${test_spans[@]}\" found in jaeger output"
break
elif [ $matches -lt ${#span_ids[@]} ] && [ "$initial" = "${initial_span_ids[$last_initial_span_index]}" ]; then
die "Fail: spans \"${test_spans[@]}\" NOT in jaeger output"
fi
# else repeat test for next initial span ID
done
done
}
run_trace_forwarder()
{
if [ $KATA_HYPERVISOR = "qemu" ]; then
tmux new-session -d -s "$KATA_TMUX_FORWARDER_SESSION" "sudo $forwarder_binary -l trace"
elif [ $KATA_HYPERVISOR = "clh" ]; then
# CLH uses hybrid VSOCK which uses a local UNIX socket that we need to specify
socket_path_template=$socket_path_prefix$(sudo kata-runtime env --json | jq '.Hypervisor.SocketPath')
socket_path=$(echo "$socket_path_template" | sed "s/{ID}/${container_id}/g" | tr -d '"')
sudo mkdir -p $(dirname "$socket_path")
tmux new-session -d -s "$KATA_TMUX_FORWARDER_SESSION" "sudo $forwarder_binary -l trace --socket-path $socket_path"
else
die "Unsupported hypervisor $KATA_HYPERVISOR"
fi
info "Verifying trace forwarder in tmux session $KATA_TMUX_FORWARDER_SESSION"
local cmd="tmux capture-pane -pt $KATA_TMUX_FORWARDER_SESSION | tr -d '\n' | tr -d '\"' | grep -q \"source:kata-trace-forwarder\""
waitForProcess 10 1 "$cmd"
}
kill_trace_forwarder()
{
tmux kill-session -t "$KATA_TMUX_FORWARDER_SESSION"
}
setup()
{
# containerd must be running in order to use ctr to generate traces
restart_containerd_service
local cmds=()
# For container manager (containerd)
cmds+=('ctr')
# For jaeger
cmds+=('docker')
# For launching processes
cmds+=('tmux')
local cmd
for cmd in "${cmds[@]}"
do
local result
result=$(command -v "$cmd" || true)
[ -n "$result" ] || die "need $cmd"
done
run_trace_forwarder
start_jaeger
"${SCRIPT_PATH}/configure_tracing_for_kata.sh" enable
}
run_test()
{
local service="$1"
local min_spans="$2"
local logdir="$3"
[ -z "$service" ] && die "need service name"
[ -z "$min_spans" ] && die "need minimum span count"
[ -z "$logdir" ] && die "need logdir"
info "Running test for service '$service'"
logdir="$logdir/$service"
mkdir -p "$logdir"
check_jaeger_output "$service" "$min_spans" "$logdir"
check_spans "$logdir"
info "test passed"
}
run_tests()
{
# List of services to check
#
# Format: "name:min-spans"
#
# Where:
#
# - 'name' is the Jaeger service name.
# - 'min-spans' is an integer representing the minimum number of
# trace spans this service should generate.
#
# Notes:
#
# - Uses an array to ensure predictable ordering.
# - All services listed are expected to generate traces
# when create_traces() is called a single time.
local -a services
services+=("kata:125")
create_traces
logdir=$(mktemp -d)
for service in "${services[@]}"
do
local name=$(echo "${service}"|cut -d: -f1)
local min_spans=$(echo "${service}"|cut -d: -f2)
run_test "${name}" "${min_spans}" "${logdir}"
done
info "all tests passed"
success="true"
}
usage()
{
cat <<EOF
Usage: $script_name [<command>]
Commands:
clean - Perform cleanup phase only.
help - Show usage.
run - Only run tests (no setup or cleanup).
setup - Perform setup phase only.
Environment variables:
CI - if set, save logs of all tests to ${TRACE_LOG_DIR}.
DEBUG - if set, enable tracing and do not cleanup after tests.
DEBUG_KEEP_JAEGER - if set, do not shut down the Jaeger service.
DEBUG_KEEP_FORWARDER - if set, do not shut down the trace forwarder.
Notes:
- Runs all test phases if no arguments are specified.
EOF
}
main()
{
local cmd="${1:-}"
case "$cmd" in
clean) success="true"; cleanup; exit 0;;
help|-h|-help|--help) usage; exit 0;;
run) run_tests; exit 0;;
setup) setup; exit 0;;
esac
trap cleanup EXIT
setup
run_tests
}
main "$@"