Files
kata-containers/tests/metrics/density/fast_footprint.sh
David Esparza b2ce8b4d61 metrics: Add memory footprint tests to the CI
This PR adds memory footprint metrics to the tests/metrics/density
folder.

Intentionally, each test exits with zero in all cases, so that the
tests are green when added; they will be enabled in a subsequent PR.

A workflow matrix was added to define the hypervisor variant for
each job, so that the jobs run sequentially.

The launch-times test was updated to make use of the matrix
environment variables.

Fixes: #7066

Signed-off-by: David Esparza <david.esparza.borquez@intel.com>
2023-06-30 09:52:27 -06:00

434 lines
11 KiB
Bash
Executable File

#!/bin/bash
# Copyright (c) 2017-2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# A script to gather memory 'footprint' information as we launch more
# and more containers
#
# The script gathers information about both user and kernel space consumption
# Output goes into a .json file, named using some of the config component names
# (such as fast-footprint-busybox.json)
# Pull in some common, useful, items
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
source "${SCRIPT_PATH}/../lib/common.bash"
# Note that all vars that can be set from outside the script (that is,
# passed in the ENV), use the ':-' setting to allow being overridden
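# For example (values here are illustrative only), a run could be tuned via:
#   PAYLOAD_SLEEP=5 NUM_CONTAINERS=20 ./fast_footprint.sh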
# Default sleep, in seconds, to let containers come up and finish their
# initialisation before we take the measurements. Some of the larger
# containers can take a number of seconds to get running.
PAYLOAD_SLEEP="${PAYLOAD_SLEEP:-10}"
# How long, in seconds, do we wait for KSM to 'settle down', before we
# time out and just continue anyway.
KSM_WAIT_TIME="${KSM_WAIT_TIME:-300}"
# How long, in seconds, do we poll for ctr to complete launching all the
# containers?
CTR_POLL_TIMEOUT="${CTR_POLL_TIMEOUT:-300}"
# How many containers do we launch in parallel before taking the PAYLOAD_SLEEP
# nap
PARALLELISM="${PARALLELISM:-10}"
### The default config - run a small busybox image
# Define what we will be running (app under test)
# Default is we run busybox, as a 'small' workload
PAYLOAD="${PAYLOAD:-quay.io/prometheus/busybox:latest}"
PAYLOAD_ARGS="${PAYLOAD_ARGS:-tail -f /dev/null}"
###
# Which runtime (CTR_RUNTIME) we use is picked up from the env in
# common.bash. You can override it by setting CTR_RUNTIME in your env
###
# Define the cutoff checks for when we stop running the test
# Run up to this many containers
NUM_CONTAINERS="${NUM_CONTAINERS:-100}"
# Run until we have consumed this much memory (from MemFree)
MAX_MEMORY_CONSUMED="${MAX_MEMORY_CONSUMED:-256*1024*1024*1024}"
# Run until we have this much MemFree left
MIN_MEMORY_FREE="${MIN_MEMORY_FREE:-2*1024*1024*1024}"
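# Note: the limit defaults above are arithmetic expressions kept as strings;
# bash evaluates them when they are used inside the (( )) comparisons in
# check_limits(), giving 256 GiB and 2 GiB respectively.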
# Tools we need to have installed in order to operate
REQUIRED_COMMANDS="smem awk"
# If we 'dump' the system caches before we measure then we get less
# noise in the results - they better reflect our unreclaimable footprint
DUMP_CACHES="${DUMP_CACHES:-1}"
# Affects the name of the file to store the results in
TEST_NAME="${TEST_NAME:-fast-footprint-busybox}"
############# end of configurable items ###################
# vars to remember where we started so we can calc diffs
base_mem_avail=0
base_mem_free=0
# dump the kernel caches, so we get a more precise (or just different)
# view of what our footprint really is.
function dump_caches() {
sudo bash -c "echo 3 > /proc/sys/vm/drop_caches"
}
function init() {
restart_containerd_service
check_cmds $REQUIRED_COMMANDS
sudo -E "${CTR_EXE}" image pull "$PAYLOAD"
# Modify the test name if running with KSM enabled
check_for_ksm
# Use the common init func to get to a known state
init_env
# Prepare to start storing results
metrics_json_init
# Store up baseline measures
base_mem_avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
base_mem_free=$(get_memfree)
# Store our configuration for this run
save_config
}
function save_config() {
metrics_json_start_array
local json="$(cat << EOF
{
"testname": "${TEST_NAME}",
"payload": "${PAYLOAD}",
"payload_args": "${PAYLOAD_ARGS}",
"payload_sleep": ${PAYLOAD_SLEEP},
"ksm_settle_time": ${KSM_WAIT_TIME},
"num_containers": ${NUM_CONTAINERS},
"parallelism": ${PARALLELISM},
"max_memory_consumed": "${MAX_MEMORY_CONSUMED}",
"min_memory_free": "${MIN_MEMORY_FREE}",
"dump_caches": "${DUMP_CACHES}"
}
EOF
)"
metrics_json_add_array_element "$json"
metrics_json_end_array "Config"
}
function cleanup() {
# Finish storing the results
metrics_json_save
clean_env_ctr
}
# helper function to get USS of process in arg1
function get_proc_uss() {
item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $4}')
((item*=1024))
echo $item
}
# helper function to get PSS of process in arg1
function get_proc_pss() {
item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $5}')
((item*=1024))
echo $item
}
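# Note on the metrics: USS counts only pages unique to the process, while PSS
# also adds each shared page divided by the number of processes mapping it,
# so summing PSS across processes approximates total userspace usage.
# smem reports kilobytes by default, hence the *1024 conversions to bytes.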
# Get the PSS for the whole of userspace (all processes)
# This allows us to see if we had any impact on the rest of the system; for instance,
# containerd grows as we launch containers, so we should account for that in our total
# memory breakdown
function grab_all_pss() {
item=$(sudo smem -t | tail -1 | awk '{print $5}')
((item*=1024))
local json="$(cat << EOF
"all_pss": {
"pss": $item,
"Units": "bytes"
}
EOF
)"
metrics_json_add_array_fragment "$json"
}
function grab_user_smem() {
# userspace
item=$(sudo smem -w | head -5 | tail -1 | awk '{print $3}')
((item*=1024))
local json="$(cat << EOF
"user_smem": {
"userspace": $item,
"Units": "bytes"
}
EOF
)"
metrics_json_add_array_fragment "$json"
}
function grab_slab() {
# Grabbing slab total from meminfo is easier than doing the math
# on slabinfo
item=$(fgrep "Slab:" /proc/meminfo | awk '{print $2}')
((item*=1024))
local json="$(cat << EOF
"slab": {
"slab": $item,
"Units": "bytes"
}
EOF
)"
metrics_json_add_array_fragment "$json"
}
function get_memfree() {
mem_free=$(sudo smem -w | head -6 | tail -1 | awk '{print $4}')
((mem_free*=1024))
echo $mem_free
}
function grab_system() {
# avail memory, from 'free'
local avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
local avail_decr=$((base_mem_avail-avail))
# cached memory, from 'free'
local cached=$(free -b | head -2 | tail -1 | awk '{print $6}')
# free memory from smem
local smem_free=$(get_memfree)
local free_decr=$((base_mem_free-smem_free))
# Anon pages
local anon=$(fgrep "AnonPages:" /proc/meminfo | awk '{print $2}')
((anon*=1024))
# Mapped pages
local mapped=$(egrep "^Mapped:" /proc/meminfo | awk '{print $2}')
((mapped*=1024))
# Cached
local meminfo_cached=$(grep "^Cached:" /proc/meminfo | awk '{print $2}')
((meminfo_cached*=1024))
local json="$(cat << EOF
"system": {
"avail": $avail,
"avail_decr": $avail_decr,
"cached": $cached,
"smem_free": $smem_free,
"free_decr": $free_decr,
"anon": $anon,
"mapped": $mapped,
"meminfo_cached": $meminfo_cached,
"Units": "bytes"
}
EOF
)"
metrics_json_add_array_fragment "$json"
}
function grab_stats() {
# If configured, dump the caches so we get a more stable
# view of what our static footprint really is
if (( DUMP_CACHES )); then
dump_caches
fi
# user space data
# PSS taken all userspace
grab_all_pss
# user as reported by smem
grab_user_smem
# System overview data
# System free and cached
grab_system
# kernel data
# The 'total kernel space taken' we can work out as:
# ktotal = ((free-avail)-user)
# So, we don't grab that number from smem, as that is what it does
# internally anyhow.
# Still try to grab any finer kernel details that we can though
# totals from slabinfo
grab_slab
metrics_json_close_array_element
}
function check_limits() {
mem_free=$(get_memfree)
if ((mem_free <= MIN_MEMORY_FREE)); then
echo 1
return
fi
mem_consumed=$((base_mem_avail-mem_free))
if ((mem_consumed >= MAX_MEMORY_CONSUMED)); then
echo 1
return
fi
echo 0
}
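# As an illustration (numbers are examples only): on a host where
# base_mem_avail starts at ~500 GiB, the defaults above end the run once
# ~256 GiB has been consumed, or sooner if free memory falls below 2 GiB.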
function launch_containers() {
local parloops leftovers
(( parloops=${NUM_CONTAINERS}/${PARALLELISM} ))
(( leftovers=${NUM_CONTAINERS} - (${parloops}*${PARALLELISM}) ))
echo "Launching ${parloops}x${PARALLELISM} containers + ${leftovers} extras"
containers=()
local iter n
for iter in $(seq 1 $parloops); do
echo "Launch iteration ${iter}"
for n in $(seq 1 $PARALLELISM); do
containers+=($(random_name))
sudo -E "${CTR_EXE}" run -d --runtime="${CTR_RUNTIME}" "${PAYLOAD}" "${containers[-1]}" sh -c "${PAYLOAD_ARGS}" &
done
if [[ $PAYLOAD_SLEEP ]]; then
sleep $PAYLOAD_SLEEP
fi
# check if we have hit one of our limits and need to wrap up the tests
if (($(check_limits))); then
echo "Ran out of resources, check_limits failed"
return
fi
done
for n in $(seq 1 $leftovers); do
containers+=($(random_name))
sudo -E "${CTR_EXE}" run -d --runtime="${CTR_RUNTIME}" "${PAYLOAD}" "${containers[-1]}" sh -c "${PAYLOAD_ARGS}" &
done
}
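# Note: every 'ctr run' above is backgrounded with '&', so returning from
# launch_containers does not mean the containers are up yet; wait_containers
# below polls 'ctr c list' until the expected count is reached.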
function wait_containers() {
local t numcontainers
# nap 3s between checks
local step=3
for ((t=0; t<${CTR_POLL_TIMEOUT}; t+=step)); do
numcontainers=$(sudo -E "${CTR_EXE}" c list -q | wc -l)
if (( numcontainers >= ${NUM_CONTAINERS} )); then
echo "All containers now launched (${t}s)"
return
else
echo "Waiting for containers to launch (${numcontainers} at ${t}s)"
fi
sleep ${step}
done
echo "Timed out waiting for containers to launch (${t}s)"
cleanup
die "Timed out waiting for containers to launch (${t}s)"
}
function go() {
# Init the json cycle for this save
metrics_json_start_array
# Grab the first set of stats before we run any containers.
grab_stats
launch_containers
wait_containers
if [[ "${ksm_on}" == "1" ]]; then
echo "Waiting for KSM to settle..."
wait_ksm_settle ${KSM_WAIT_TIME}
fi
grab_stats
# Wrap up the results array
metrics_json_end_array "Results"
}
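# Note: grab_stats runs twice - once before any containers are launched and
# once after they are all up (and KSM has settled, when enabled) - so the
# "Results" array carries a baseline element and a loaded-system element.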
function show_vars()
{
echo -e "\nEnvironment variables:"
echo -e "\tName (default)"
echo -e "\t\tDescription"
echo -e "\tPAYLOAD (${PAYLOAD})"
echo -e "\t\tThe ctr image to run"
echo -e "\tPAYLOAD_ARGS (${PAYLOAD_ARGS})"
echo -e "\t\tAny extra arguments passed into the ctr 'run' command"
echo -e "\tPAYLOAD_SLEEP (${PAYLOAD_SLEEP})"
echo -e "\t\tSeconds to sleep between launch and measurement, to allow settling"
echo -e "\tKSM_WAIT_TIME (${KSM_WAIT_TIME})"
echo -e "\t\tSeconds to wait for KSM to settle before we take the final measure"
echo -e "\tCTR_POLL_TIMEOUT (${CTR_POLL_TIMEOUT})"
echo -e "\t\tSeconds to poll for ctr to finish launching containers"
echo -e "\tPARALLELISM (${PARALLELISM})"
echo -e "\t\tNumber of containers we launch in parallel"
echo -e "\tNUM_CONTAINERS (${NUM_CONTAINERS})"
echo -e "\t\tThe total number of containers to run"
echo -e "\tMAX_MEMORY_CONSUMED (${MAX_MEMORY_CONSUMED})"
echo -e "\t\tThe maximum amount of memory to be consumed before terminating"
echo -e "\tMIN_MEMORY_FREE (${MIN_MEMORY_FREE})"
echo -e "\t\tThe minimum amount of memory allowed to be free before terminating"
echo -e "\tDUMP_CACHES (${DUMP_CACHES})"
echo -e "\t\tA flag to note if the system caches should be dumped before capturing stats"
echo -e "\tTEST_NAME (${TEST_NAME})"
echo -e "\t\tCan be set to override the default JSON results filename"
}
function help()
{
usage=$(cat << EOF
Usage: $0 [-h] [options]
Description:
Launch a series of workloads and take memory metric measurements after
each launch.
Options:
-h, Help page.
EOF
)
echo "$usage"
show_vars
}
function main() {
local OPTIND
while getopts "h" opt;do
case ${opt} in
h)
help
exit 0;
;;
esac
done
shift $((OPTIND-1))
init
go
cleanup
}
main "$@"