kata-containers/ci/openshift-ci/cluster/install_kata.sh

#!/bin/bash
#
# Copyright (c) 2020 Red Hat, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# This script installs the built kata-containers in the test cluster,
# and configures a runtime.

# All the locations are resolved relative to this script. The expansions are
# quoted so that a checkout path containing spaces does not get word-split.
scripts_dir=$(dirname "$0")
deployments_dir="${scripts_dir}/deployments"
configs_dir="${scripts_dir}/configs"

# Shared CI helpers.
# NOTE(review): info, warn, die and katacontainers_repo_dir are used below but
# are not defined in this file; presumably they come from lib.sh - confirm.
source "${scripts_dir}/../lib.sh"

# Every setting below can be overridden from the environment; the value given
# here is only used when the variable is unset or empty.

# 'yes' switches SELinux to permissive on the cluster workers.
: "${SELINUX_PERMISSIVE:=no}"

# 'yes' makes Kata Containers use the system's QEMU (from the RHCOS
# extension).
: "${KATA_WITH_SYSTEM_QEMU:=no}"

# 'yes' makes Kata Containers use the host kernel.
: "${KATA_WITH_HOST_KERNEL:=no}"

# The kata-deploy image used to deploy kata (defaults to the CI image that is
# built for each pull request).
: "${KATA_DEPLOY_IMAGE:=quay.io/kata-containers/kata-deploy-ci:kata-containers-latest}"

# 'yes' enables the workaround for OCP 4.13, see
# https://github.com/kata-containers/kata-containers/pull/9206
: "${WORKAROUND_9206_CRIO:=no}"

# Leverage kata-deploy to install Kata Containers in the cluster.
#
# Rewrites the image of the kata-deploy manifest to ${KATA_DEPLOY_IMAGE},
# applies the RBAC and kata-deploy manifests, waits up to 10 minutes for the
# kata-deploy pods to be Ready and finally adds the kata runtime classes.
#
# Globals:
#   katacontainers_repo_dir (read) - directory the manifest paths are relative
#                                    to (not defined in this file).
#   KATA_DEPLOY_IMAGE (read)       - image written into the kata-deploy manifest.
#
apply_kata_deploy() {
	local deploy_file="tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml"
	# Every manifest path below is relative to the repository, so going on
	# from the wrong directory would only apply (or edit) the wrong files.
	pushd "${katacontainers_repo_dir}" || \
		die "Unable to enter the directory '${katacontainers_repo_dir}'"
	sed -ri "s#(\s+image:) .*#\1 ${KATA_DEPLOY_IMAGE}#" "${deploy_file}"

	info "Applying kata-deploy"
	oc apply -f tools/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml
	oc label --overwrite ns kube-system pod-security.kubernetes.io/enforce=privileged pod-security.kubernetes.io/warn=baseline pod-security.kubernetes.io/audit=baseline
	oc apply -f "${deploy_file}"
	oc -n kube-system wait --timeout=10m --for=condition=Ready -l name=kata-deploy pod

	info "Adding the kata runtime classes"
	oc apply -f tools/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml
	popd
}


# Wait until all the worker nodes have rebooted.
#
# A node is considered rebooted once its boot ID differs from the one that
# was recorded when this function was called.
#
# Params:
#   $1 - timeout in seconds (default to 900).
#
# Returns:
#   0 when every worker rebooted, 1 when the timeout expired first.
#
wait_for_reboot() {
	local delta="${1:-900}"
	local sleep_time=60
	local -A boot_ids=()
	local -a workers=()
	local node
	local i
	local current_id
	local timer_start
	local now

	mapfile -t workers < <(oc get nodes | \
		awk '$3 == "worker" { print $1 }')
	# Record the current boot IDs so that a later change can be detected.
	for node in "${workers[@]}"; do
		boot_ids["${node}"]=$(oc get \
			-o jsonpath='{.status.nodeInfo.bootID}' \
			"node/${node}")
		echo "Wait ${node} reboot"
	done

	echo "Set timeout to ${delta} seconds"
	timer_start=$(date +%s)
	while (( ${#workers[@]} > 0 )); do
		sleep "${sleep_time}"
		now=$(date +%s)
		if (( timer_start + delta < now )); then
			echo "Timeout: not all workers rebooted"
			return 1
		fi
		echo "Checking after $(( now - timer_start )) seconds"
		for i in "${!workers[@]}"; do
			current_id=$(oc get \
				-o jsonpath='{.status.nodeInfo.bootID}' \
				"node/${workers[i]}")
			# An empty answer means the query failed; it tells nothing
			# about the node having rebooted, so check again later.
			if [[ -n "${current_id}" && \
				"${current_id}" != "${boot_ids[${workers[i]}]}" ]]; then
				echo "${workers[i]} rebooted"
				# Quoted, otherwise the subscript is a glob pattern that
				# may match a file in the current directory.
				unset 'workers[i]'
			fi
		done
	done
	return 0
}

# Wait until the 'worker' machineconfigpool finishes updating.
#
# Polls the pool until its ready machine count equals its machine count, or
# until its degraded machine count is no longer 0.
#
# Params:
#   $1 - timeout in seconds (default to 3600).
#
# Returns:
#   0 when all the machines are ready and none is degraded; non-zero when the
#   machine count cannot be obtained, on timeout, or when machines degraded.
#
wait_mcp_update() {
	local delta="${1:-3600}"
	local sleep_time=30
	# The machineconfigpool is fine when all the workers updated and are ready,
	# and none are degraded.
	local ready_count=0
	local degraded_count=0
	local machine_count
	local deadline
	local now

	machine_count=$(oc get mcp worker -o jsonpath='{.status.machineCount}')
	# Anything but a positive integer (empty, zero, garbage) is unusable:
	# with a count of 0 the loop below would be skipped and success reported.
	if [[ ! "${machine_count}" =~ ^[1-9][0-9]*$ ]]; then
		warn "Unabled to obtain the machine count"
		return 1
	fi

	echo "Set timeout to ${delta} seconds"
	deadline=$(( $(date +%s) + delta ))
	# The ready count might not have changed yet, so wait a little.
	while [[ "${ready_count}" != "${machine_count}" && \
		"${degraded_count}" == 0 ]]; do
		# Let's check it hit the timeout (or not).
		now=$(date +%s)
		if (( deadline < now )); then
			echo "Timeout: not all workers updated" >&2
			return 1
		fi
		sleep "${sleep_time}"
		ready_count=$(oc get mcp worker \
			-o jsonpath='{.status.readyMachineCount}')
		degraded_count=$(oc get mcp worker \
			-o jsonpath='{.status.degradedMachineCount}')
		echo "check machineconfigpool - ready_count: ${ready_count} degraded_count: ${degraded_count}"
	done
	# String comparison on purpose: an empty count (failed query) must not
	# be taken for zero degraded machines.
	[[ "${degraded_count}" == 0 ]]
}

# Enable the RHCOS extension for the Sandboxed Containers.
#
# Applies the extension machineconfig found in ${deployments_dir}, checks that
# it exists in the cluster and waits for the machineconfigpool to be updated.
# Calls die if the machineconfig is not found or the pool update fails.
#
enable_sandboxedcontainers_extension() {
	info "Enabling the RHCOS extension for Sandboxed Containers"
	local deployment_file="${deployments_dir}/machineconfig_sandboxedcontainers_extension.yaml"
	# Quoted: the path derives from the script location and must reach oc as
	# a single argument even if it contains spaces.
	oc apply -f "${deployment_file}"
	oc get -f "${deployment_file}" || \
		die "Sandboxed Containers extension machineconfig not found"
	wait_mcp_update || die "Failed to update the machineconfigpool"
}

# Dump what is known about a pod to help with debugging.
#
# Params:
#   $1 - the pod name
debug_pod() {
	local name="$1"

	info "Debug pod: ${name}"
	# The description of the pod first, then its logs.
	oc describe pods "${name}"
	oc logs "${name}"
}

# Wait for all pods of the app label to contain expected message
#
# First waits until at least the expected number of pods with the label
# 'app=$1' exist, then waits until the log of each of them contains the
# message. The timeout covers the whole operation.
#
# Params:
#   $1 - app label
#   $2 - expected pods count (>=1)
#   $3 - message to be present in the logs (a grep pattern)
#   $4 - timeout (60)
#   $5 - namespace (the current one)
#
# Returns:
#   0 when the message was found in all the pods, 1 on timeout.
#
wait_for_app_pods_message() {
	local app="$1"
	local pod_count="${2:-1}"
	local message="$3"
	local timeout="${4:-60}"
	local namespace="$5"
	# Optional arguments are kept in an array so they never depend on
	# word-splitting.
	local -a ns_args=()
	local -a pods=()
	local pod
	local log
	# Measure from a local starting point instead of resetting the global
	# SECONDS counter.
	local start="${SECONDS}"

	if [[ -n "${namespace}" ]]; then
		ns_args=(-n "${namespace}")
	fi

	while :; do
		mapfile -t pods < <(oc get pods -l app="${app}" --no-headers=true \
			"${ns_args[@]}" | awk 'NF { print $1 }')
		# Compare the number of pods; "${#pods}" would be the length of
		# the first pod's name.
		[[ "${#pods[@]}" -ge "${pod_count}" ]] && break
		if (( SECONDS - start > timeout )); then
			# "${pods[*]}" is a single argument, so the message is
			# printed once whatever the number of pods.
			printf "Unable to find %s pods for '-l app=\"%s\"' in %ss (%s)\n" \
				"${pod_count}" "${app}" "$(( SECONDS - start ))" "${pods[*]}"
			return 1
		fi
		# Do not hammer the API server while the pods are being created.
		sleep 1
	done

	for pod in "${pods[@]}"; do
		while :; do
			log=$(oc logs "${ns_args[@]}" "${pod}")
			if grep -q -- "${message}" <<< "${log}"; then
				echo "Found $(grep -- "${message}" <<< "${log}") in ${pod}'s log ($(( SECONDS - start )))"
				break
			fi
			if (( SECONDS - start > timeout )); then
				printf "Message '%s' not present in '%s' pod of the '-l app=\"%s\"' pods after %ss :(%s)\n" \
					"${message}" "${pod}" "${app}" "$(( SECONDS - start ))" "${pods[*]}"
				echo "Pod ${pod}'s output so far:"
				echo "${log}"
				return 1
			fi
			sleep 1
		done
	done
	return 0
}

oc config set-context --current --namespace=default

# The worker nodes are the ones whose ROLES column is exactly "worker".
worker_nodes=$(oc get nodes |  awk '{if ($3 == "worker") { print $1 } }')
num_nodes=$(wc -w <<< "${worker_nodes}")
[[ "${num_nodes}" -ne 0 ]] || \
	die "No worker nodes detected. Something is wrong with the cluster"

if [[ "${KATA_WITH_SYSTEM_QEMU}" == "yes" ]]; then
	# QEMU is deployed on the workers via RHCOS extension.
	enable_sandboxedcontainers_extension
	oc apply -f "${deployments_dir}/configmap_installer_qemu.yaml"
fi

if [[ "${KATA_WITH_HOST_KERNEL}" == "yes" ]]; then
	oc apply -f "${deployments_dir}/configmap_installer_kernel.yaml"
fi

apply_kata_deploy

# Set SELinux to permissive mode
if [[ "${SELINUX_PERMISSIVE}" == "yes" ]]; then
	info "Configuring SELinux"
	if [[ -z "${SELINUX_CONF_BASE64}" ]]; then
		# base64 may wrap its output; the machineconfig template needs the
		# encoded configuration as a single line without any whitespace.
		SELINUX_CONF_BASE64=$(base64 < "${configs_dir}/selinux.conf" | \
			tr -d '[:space:]')
		export SELINUX_CONF_BASE64
	fi
	envsubst < "${deployments_dir}/machineconfig_selinux.yaml.in" | \
		oc apply -f -
	oc get machineconfig/51-kata-selinux || \
		die "SELinux machineconfig not found"
	# The new SELinux configuration will trigger another reboot.
	wait_for_reboot
fi

if [[ "${WORKAROUND_9206_CRIO}" == "yes" ]]; then
	info "Applying workaround to enable skip_mount_home in crio on OCP 4.13"
	oc apply -f "${deployments_dir}/workaround-9206-crio.yaml"
	oc apply -f "${deployments_dir}/workaround-9206-crio-ds.yaml"
	# Best effort: a failure is reported but does not stop the installation.
	wait_for_app_pods_message workaround-9206-crio-ds "${num_nodes}" "Config file present" 1200 || echo "Failed to apply the workaround, proceeding anyway..."
fi

# FIXME: Remove when https://github.com/kata-containers/kata-containers/pull/8417 is resolved
# Selinux context is currently not handled by kata-deploy
oc apply -f "${deployments_dir}/relabel_selinux.yaml"
# Best effort as well: report the failure and carry on.
wait_for_app_pods_message restorecon "${num_nodes}" "NSENTER_FINISHED_WITH:" 120 "kube-system" || echo "Failed to treat selinux, proceeding anyway..."