kata-containers/tests/functional/kata-monitor/kata-monitor-tests.sh
Gabriela Cervantes a8432880f8 tests: Increase timeout to crictl calls on kata monitor tests
This PR increases the timeout of the crictl calls in the kata-monitor
tests, to avoid hitting intermittent timeouts and the resulting random failures.
This PR is very similar to PR #7640.

Signed-off-by: Gabriela Cervantes <gabriela.cervantes.tellez@intel.com>
2024-06-25 22:32:47 +00:00

296 lines
6.4 KiB
Bash
Executable File

#!/bin/bash
#
# Copyright (c) 2022 Red Hat
#
# SPDX-License-Identifier: Apache-2.0
#
# This test file will test kata-monitor for basic functionality (retrieve kata sandboxes)
# It will assume an environment where:
# - a CRI container manager (container engine) will be up and running
# - crictl is installed and configured
# - the kata-monitor binary is available on the host
#
# Strict mode: stop on failing commands, on unset variables and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Load the os-release variables; fall back to the /usr/lib copy.
. "/etc/os-release" || . "/usr/lib/os-release"

# errtrace makes the ERR trap installed below apply inside functions as well.
if [[ -n "${BASH_VERSION:-}" ]]; then
  set -o errtrace
fi

# Export DEBUG=<anything> to trace every command.
if [[ -n "${DEBUG:-}" ]]; then
  set -o xtrace
fi

# Address of the kata-monitor HTTP server queried by the checks.
declare -r MONITOR_HTTP_ENDPOINT="127.0.0.1:8090"

# Lower bound on the number of metric lines expected for the kata pod
# (a few hundred are normally collected).
declare -r MONITOR_MIN_METRICS_NUM=200

# Value handed to every `crictl --timeout=` call.
declare -r TIMEOUT="20s"

# Settings that can be overridden from the environment.
: "${CONTAINER_ENGINE:=containerd}"
: "${CRICTL_RUNTIME:=kata}"
: "${KATA_MONITOR_BIN:=$(command -v kata-monitor || true)}"
# Number of one-second polls granted to kata-monitor to refresh its cache.
: "${CACHE_UPD_TIMEOUT_SEC:=20}"

# Scratch directory holding the generated JSON files, metrics and logs.
TMPATH=$(mktemp -d -t kata-monitor-test-XXXXXXXXX)
METRICS_FILE="${TMPATH}/metrics.txt"
MONITOR_LOG_FILE="${TMPATH}/kata-monitor.log"

# Runtime state, filled in while the test progresses.
KATA_MONITOR_PID=""
POD_ID=""
CID=""
RUNC_POD_ID=""
RUNC_CID=""
CURRENT_TASK=""

# Shell-style truth values used as function return codes.
TRUE=0
FALSE=1

# Any unhandled command failure is reported through error_with_msg.
trap 'error_with_msg' ERR
# Print the banner that introduces a test step.
# Arguments: $1 - step description (backslash escapes are interpreted)
title() {
  local heading="${1}"

  echo -e "\n* STEP: ${heading}"
}
# Report a successfully completed task.
# Arguments: $1 - text printed after the "OK: " prefix
echo_ok() {
  local text="${1}"

  echo "OK: ${text}"
}
# Quiet crictl: run crictl through sudo and discard its standard output.
# Arguments: passed to crictl unchanged
# Returns:   the exit status of the sudo/crictl invocation
qcrictl() {
  local -a crictl_cmd=(sudo crictl "$@")

  "${crictl_cmd[@]}" >/dev/null
}
# Print an identifier for generated names: the MD5 digest of the current
# time of day, nanoseconds included.
gen_unique_id() {
  date +%T:%N | md5sum | awk '{ print $1 }'
}
# Report a fatal error, dump the kata-monitor log when there is one, clean up
# and terminate the script with status 1. Also installed as the ERR trap.
# Globals:   CURRENT_TASK, MONITOR_LOG_FILE (read)
# Arguments: $1 - error text (optional, defaults to "cannot $CURRENT_TASK")
error_with_msg() {
  local reason="${1:-cannot $CURRENT_TASK}"

  # Reset the ERR trap before doing anything else (presumably so that a
  # failure while reporting or cleaning up does not re-enter this handler).
  trap - ERR

  echo -e "\nERROR: ${reason}"

  if [[ -f "${MONITOR_LOG_FILE}" ]]; then
    echo -e "\nkata-monitor logs:\n----------------"
    cat "${MONITOR_LOG_FILE}"
  fi

  echo -e "\nkata-monitor testing: FAILED!"
  cleanup
  exit 1
}
# Tear down what the test created: both workloads, the kata-monitor process
# and the temporary directory.
# Globals: RUNC_CID, RUNC_POD_ID, KATA_MONITOR_PID, TMPATH (read)
# Returns: 0 on success; otherwise the status of the failed kill / rm.
cleanup() {
  local rc=0

  # Without arguments stop_workload acts on the pod/container in POD_ID/CID.
  stop_workload
  stop_workload "$RUNC_CID" "$RUNC_POD_ID"

  if [ -n "$KATA_MONITOR_PID" ] && [ -d "/proc/$KATA_MONITOR_PID" ]; then
    # Remember a failed kill instead of letting errexit abort right here:
    # the temporary directory below must be removed in any case.
    kill -9 "$KATA_MONITOR_PID" || rc=$?
  fi

  rm -rf -- "$TMPATH" || rc=$?

  return "$rc"
}
# Write a pod sandbox configuration for crictl into a uniquely named JSON file
# under $TMPATH.
# Globals: TMPATH (read)
# Outputs: the path of the generated file on stdout
# Returns: non-zero if the unique id cannot be generated or the file cannot be
#          written
create_sandbox_json() {
  local uid_name_suffix
  local sbfile

  # Declared separately from the assignment so a failure is not masked by
  # the exit status of `local`.
  uid_name_suffix="$(gen_unique_id)" || return
  sbfile="$TMPATH/sandbox-$uid_name_suffix.json"

  # The redirect target is quoted: an unquoted path containing whitespace
  # makes bash fail with "ambiguous redirect".
  cat <<EOF >"$sbfile" || return
{
"metadata": {
"name": "nginx-$uid_name_suffix",
"namespace": "default",
"uid": "nginx-container-uid",
"attempt": 1
},
"logDirectory": "/tmp",
"linux": {
}
}
EOF
  echo "$sbfile"
}
# Write a container configuration for crictl (a busybox container running
# `top`) into a uniquely named JSON file under $TMPATH.
# Globals: TMPATH (read)
# Outputs: the path of the generated file on stdout
# Returns: non-zero if the unique id cannot be generated or the file cannot be
#          written
create_container_json() {
  local uid_name_suffix
  local cntfile

  # Declared separately from the assignment so a failure is not masked by
  # the exit status of `local`.
  uid_name_suffix="$(gen_unique_id)" || return
  cntfile="$TMPATH/container-$uid_name_suffix.json"

  # The redirect target is quoted: an unquoted path containing whitespace
  # makes bash fail with "ambiguous redirect".
  cat <<EOF >"$cntfile" || return
{
"metadata": {
"name": "busybox",
"namespace": "default",
"uid": "busybox-container-uid"
},
"image":{
"image": "busybox"
},
"command": [
"top"
],
"log_path":"busybox.log",
"linux": {
}
}
EOF
  echo "$cntfile"
}
# Create a pod sandbox, create a container inside it and start the container.
# Globals:   TIMEOUT (read); POD_ID, CID (written with the new ids)
# Arguments: $1 - runtime handler passed to `crictl runp -r` (optional; when
#                 omitted no -r option is given)
# Returns:   the status of the final `crictl start`
start_workload() {
  local runtime="${1:-}"
  local sbfile=""
  local cntfile=""
  # The command line is kept in an array so that every argument, including
  # file paths containing whitespace, reaches crictl as a single word.
  local -a runp_cmd=(sudo crictl "--timeout=$TIMEOUT" runp)

  if [ -n "$runtime" ]; then
    runp_cmd+=(-r "$runtime")
  fi

  sbfile="$(create_sandbox_json)"
  cntfile="$(create_container_json)"
  runp_cmd+=("$sbfile")

  POD_ID="$("${runp_cmd[@]}")"
  CID="$(sudo crictl "--timeout=$TIMEOUT" create "$POD_ID" "$cntfile" "$sbfile")"
  qcrictl "--timeout=$TIMEOUT" start "$CID"
}
# Stop and remove a container and its pod sandbox.
# Globals:   TIMEOUT, CID, POD_ID (read)
# Arguments: $1 - container id (optional, defaults to $CID)
#            $2 - pod id (optional, defaults to $POD_ID)
# Returns:   0 when there is nothing to do (no pod id, or the pod is not
#            listed by crictl); otherwise the status of the last crictl call
stop_workload() {
  local cid="${1:-$CID}"
  local pod_id="${2:-$POD_ID}"
  local check

  [ -n "$pod_id" ] || return 0

  # Only act on pods the container engine still lists.
  check="$(sudo crictl "--timeout=$TIMEOUT" pods -q -id "$pod_id")"
  [ -n "$check" ] || return 0

  # A pod can exist without a container id being known (for instance when
  # the container was never created): skip the container steps then, rather
  # than calling crictl without an id, and still remove the pod.
  if [ -n "$cid" ]; then
    qcrictl "--timeout=$TIMEOUT" stop "$cid"
    qcrictl "--timeout=$TIMEOUT" rm "$cid"
  fi

  qcrictl "--timeout=$TIMEOUT" stopp "$pod_id"
  qcrictl "--timeout=$TIMEOUT" rmp "$pod_id"
}
# Tell whether kata-monitor currently lists a given sandbox.
# Globals:   MONITOR_HTTP_ENDPOINT, TRUE, FALSE (read)
# Arguments: $1 - pod (sandbox) id to look for
# Returns:   $TRUE when one of the whitespace-separated entries returned by
#            the /sandboxes endpoint equals $1, $FALSE otherwise
is_sandbox_there() {
  local podid="${1}"
  local sbs s
  local -a sandboxes=()

  sbs="$(sudo curl -s "${MONITOR_HTTP_ENDPOINT}/sandboxes")"

  # Split the reply into words without pathname expansion. `read -d ''`
  # reports EOF with a non-zero status even when it read data, hence the
  # `|| true`.
  read -r -d '' -a sandboxes <<<"$sbs" || true

  # The `+` form keeps an empty array from tripping `set -o nounset` on
  # older bash versions.
  for s in ${sandboxes[@]+"${sandboxes[@]}"}; do
    if [ "$s" = "$podid" ]; then
      return "$TRUE"
    fi
  done

  return "$FALSE"
}
# Poll kata-monitor once per second until it lists the given sandbox.
# Globals:   CACHE_UPD_TIMEOUT_SEC (maximum number of polls), TRUE, FALSE
# Arguments: $1 - pod (sandbox) id to wait for
# Outputs:   one "." per unsuccessful poll
# Returns:   $TRUE as soon as the sandbox is listed, $FALSE once all the
#            polls have been used up
is_sandbox_there_iterate() {
  local podid="${1}"
  # Kept local so that the loop does not clobber a caller's variable.
  local attempt

  for ((attempt = 1; attempt <= CACHE_UPD_TIMEOUT_SEC; attempt++)); do
    is_sandbox_there "$podid" && return "$TRUE"
    echo -n "."
    sleep 1
  done

  return "$FALSE"
}
# Poll kata-monitor once per second until it no longer lists the given
# sandbox.
# Globals:   CACHE_UPD_TIMEOUT_SEC (maximum number of polls), TRUE, FALSE
# Arguments: $1 - pod (sandbox) id expected to disappear
# Outputs:   one "." per poll in which the sandbox is still listed
# Returns:   $TRUE as soon as the sandbox is not listed, $FALSE once all the
#            polls have been used up
is_sandbox_missing_iterate() {
  local podid="${1}"
  # Kept local so that the loop does not clobber a caller's variable.
  local attempt

  for ((attempt = 1; attempt <= CACHE_UPD_TIMEOUT_SEC; attempt++)); do
    is_sandbox_there "$podid" || return "$TRUE"
    echo -n "."
    sleep 1
  done

  return "$FALSE"
}
# Run the whole kata-monitor test:
#   1. check the container engine is reachable and pull the busybox image;
#   2. start one workload with the default runtime and one with
#      $CRICTL_RUNTIME;
#   3. start kata-monitor and check that its cache lists the $CRICTL_RUNTIME
#      pod but not the default-runtime one;
#   4. fetch the metrics and check that enough of them mention the pod;
#   5. remove the $CRICTL_RUNTIME workload and check it leaves the cache;
#   6. clean up.
# CURRENT_TASK is updated before each step: the ERR trap handler uses it to
# build its default "cannot <task>" message.
main() {
  local -a monitor_args=()

  ###########################
  title "pre-checks"

  CURRENT_TASK="connect to the container engine"
  qcrictl "--timeout=$TIMEOUT" pods
  echo_ok "$CURRENT_TASK"

  ###########################
  title "pull the image to be used"

  # Name the task so a failed pull is not reported as a connection problem.
  CURRENT_TASK="pull the busybox image"
  sudo crictl "--timeout=$TIMEOUT" pull busybox

  ###########################
  title "create workloads"

  CURRENT_TASK="start workload (runc)"
  start_workload
  # start_workload overwrites POD_ID/CID on every call: save these ids.
  RUNC_POD_ID="$POD_ID"
  RUNC_CID="$CID"
  echo_ok "$CURRENT_TASK - POD ID:$POD_ID, CID:$CID"

  CURRENT_TASK="start workload ($CRICTL_RUNTIME)"
  start_workload "$CRICTL_RUNTIME"
  echo_ok "$CURRENT_TASK - POD ID:$POD_ID, CID:$CID"

  ###########################
  title "start kata-monitor"

  if [ ! -x "$KATA_MONITOR_BIN" ]; then
    error_with_msg "kata-monitor binary not found"
  fi

  # Arguments are collected in an array so each one stays a single word.
  if [ "$CONTAINER_ENGINE" = "crio" ]; then
    monitor_args+=(--runtime-endpoint /run/crio/crio.sock)
  fi
  monitor_args+=(--log-level trace)

  CURRENT_TASK="start kata-monitor"
  sudo "$KATA_MONITOR_BIN" "${monitor_args[@]}" > "$MONITOR_LOG_FILE" 2>&1 &
  # $! is the PID of the background job, i.e. of the sudo process.
  KATA_MONITOR_PID="$!"
  echo_ok "$CURRENT_TASK ($KATA_MONITOR_PID)"

  ###########################
  title "kata-monitor cache update checks"

  CURRENT_TASK="retrieve $POD_ID in kata-monitor cache"
  is_sandbox_there_iterate "$POD_ID" || error_with_msg
  echo_ok "$CURRENT_TASK"

  CURRENT_TASK="look for runc pod $RUNC_POD_ID in kata-monitor cache"
  if is_sandbox_there_iterate "$RUNC_POD_ID"; then
    error_with_msg "cache: got runc pod $RUNC_POD_ID"
  fi
  echo_ok "runc pod $RUNC_POD_ID skipped from kata-monitor cache"

  ###########################
  title "kata-monitor metrics retrieval"

  CURRENT_TASK="retrieve metrics from kata-monitor"
  curl -s "${MONITOR_HTTP_ENDPOINT}/metrics" > "$METRICS_FILE"
  echo_ok "$CURRENT_TASK"

  CURRENT_TASK="retrieve metrics for pod $POD_ID"
  # `grep -c` exits with 1 when it counts zero matches: tolerate that, so the
  # explicit "too few metrics" check below reports the problem instead of
  # the ERR trap.
  METRICS_COUNT="$(grep -c -- "$POD_ID" "$METRICS_FILE")" \
    || METRICS_COUNT="${METRICS_COUNT:-0}"
  if [ "$METRICS_COUNT" -lt "$MONITOR_MIN_METRICS_NUM" ]; then
    error_with_msg "got too few metrics (#${METRICS_COUNT})"
  fi
  echo_ok "$CURRENT_TASK - found #${METRICS_COUNT} metrics"

  ###########################
  title "remove kata workload"

  CURRENT_TASK="stop workload ($CRICTL_RUNTIME)"
  stop_workload
  echo_ok "$CURRENT_TASK"

  ###########################
  title "kata-monitor cache update checks (removal)"

  CURRENT_TASK="verify removal of $POD_ID from kata-monitor cache"
  is_sandbox_missing_iterate "$POD_ID" || error_with_msg "pod $POD_ID was not removed"
  echo_ok "$CURRENT_TASK"

  ###########################
  CURRENT_TASK="cleanup"
  cleanup

  echo -e "\nkata-monitor testing: PASSED!\n"
}
# Entry point: forward the script's command-line arguments to main.
main "$@"