#!/bin/bash
#
# Copyright (c) 2020 Red Hat, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# This script installs the built kata-containers in the test cluster,
# and configures a runtime.

scripts_dir=$(dirname "$0")
deployments_dir=${scripts_dir}/deployments
configs_dir=${scripts_dir}/configs

# shellcheck disable=SC1091 # import based on variable
source "${scripts_dir}/../lib.sh"

# Set your katacontainers repo dir location
[[ -z "${katacontainers_repo_dir}" ]] && echo "Please set the katacontainers_repo_dir variable to your kata repo"

# Set to 'yes' if you want to configure SELinux to permissive on the cluster
# workers.
#
SELINUX_PERMISSIVE=${SELINUX_PERMISSIVE:-no}

# Set to 'yes' if you want to configure Kata Containers to use the system's
# QEMU (from the RHCOS extension).
#
KATA_WITH_SYSTEM_QEMU=${KATA_WITH_SYSTEM_QEMU:-no}

# Set to 'yes' if you want to configure Kata Containers to use the host kernel.
#
KATA_WITH_HOST_KERNEL=${KATA_WITH_HOST_KERNEL:-no}

# kata-deploy image used to deploy Kata Containers (by default the CI image
# that is built for each pull request).
#
KATA_DEPLOY_IMAGE=${KATA_DEPLOY_IMAGE:-quay.io/kata-containers/kata-deploy-ci:kata-containers-latest}

# Enable workaround for OCP 4.13 https://github.com/kata-containers/kata-containers/pull/9206
#
WORKAROUND_9206_CRIO=${WORKAROUND_9206_CRIO:-no}

# Leverage kata-deploy to install Kata Containers in the cluster.
#
apply_kata_deploy() {
    local deploy_file="tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml"
    pushd "${katacontainers_repo_dir}" || die
    sed -ri "s#(\s+image:) .*#\1 ${KATA_DEPLOY_IMAGE}#" "${deploy_file}"

    info "Applying kata-deploy"
    oc apply -f tools/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml
    oc label --overwrite ns kube-system pod-security.kubernetes.io/enforce=privileged pod-security.kubernetes.io/warn=baseline pod-security.kubernetes.io/audit=baseline
    oc apply -f "${deploy_file}"
    oc -n kube-system wait --timeout=10m --for=condition=Ready -l name=kata-deploy pod

    info "Adding the kata runtime classes"
    oc apply -f tools/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml
    popd || die
}

# Wait for all worker nodes to reboot.
#
# Params:
#   $1 - timeout in seconds (default to 900).
#
wait_for_reboot() {
    local delta="${1:-900}"
    local sleep_time=60
    declare -A BOOTIDS
    local workers
    mapfile -t workers < <(oc get nodes | awk '{if ($3 == "worker") { print $1 } }')
    # Get the boot ID to compare whether it changed over time.
    for node in "${workers[@]}"; do
        BOOTIDS[${node}]=$(oc get -o jsonpath='{.status.nodeInfo.bootID}' \
            "node/${node}")
        echo "Wait ${node} reboot"
    done

    echo "Set timeout to ${delta} seconds"
    timer_start=$(date +%s)
    while [[ ${#workers[@]} -gt 0 ]]; do
        sleep "${sleep_time}"
        now=$(date +%s)
        if [[ $((timer_start + delta)) -lt ${now} ]]; then
            echo "Timeout: not all workers rebooted"
            return 1
        fi
        echo "Checking after $((now - timer_start)) seconds"
        for i in "${!workers[@]}"; do
            current_id=$(oc get \
                -o jsonpath='{.status.nodeInfo.bootID}' \
                "node/${workers[i]}")
            if [[ "${current_id}" != "${BOOTIDS[${workers[i]}]}" ]]; then
                echo "${workers[i]} rebooted"
                unset "workers[i]"
            fi
        done
    done
}

# Wait for the worker machineconfigpool update to complete.
#
# Params:
#   $1 - timeout in seconds (default to 3600).
#
wait_mcp_update() {
    local delta="${1:-3600}"
    local sleep_time=30
    # The machineconfigpool is fine when all the workers have updated and are
    # ready, and none are degraded.
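    # For reference only: the fields polled below come from the worker
    # MachineConfigPool status and can be inspected manually in one call
    # (illustrative query, not used by this script):
    #   oc get mcp worker -o jsonpath='{.status.machineCount} {.status.readyMachineCount} {.status.degradedMachineCount}'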
    local ready_count=0
    local degraded_count=0
    local machine_count
    machine_count=$(oc get mcp worker -o jsonpath='{.status.machineCount}')
    if [[ -z "${machine_count}" || "${machine_count}" -lt 1 ]]; then
        warn "Unable to obtain the machine count"
        return 1
    fi

    echo "Set timeout to ${delta} seconds"
    local deadline=$(($(date +%s) + delta))
    local now
    # The ready count might not have changed yet, so wait a little.
    while [[ "${ready_count}" != "${machine_count}" && \
        "${degraded_count}" == 0 ]]; do
        # Let's check it hit the timeout (or not).
        now=$(date +%s)
        if [[ ${deadline} -lt ${now} ]]; then
            echo "Timeout: not all workers updated" >&2
            return 1
        fi
        sleep "${sleep_time}"
        ready_count=$(oc get mcp worker \
            -o jsonpath='{.status.readyMachineCount}')
        degraded_count=$(oc get mcp worker \
            -o jsonpath='{.status.degradedMachineCount}')
        echo "check machineconfigpool - ready_count: ${ready_count} degraded_count: ${degraded_count}"
    done
    [[ ${degraded_count} -eq 0 ]]
}

# Enable the RHCOS extension for the Sandboxed Containers.
#
enable_sandboxedcontainers_extension() {
    info "Enabling the RHCOS extension for Sandboxed Containers"
    local deployment_file="${deployments_dir}/machineconfig_sandboxedcontainers_extension.yaml"
    oc apply -f "${deployment_file}"
    oc get -f "${deployment_file}" || \
        die "Sandboxed Containers extension machineconfig not found"
    wait_mcp_update 3600 || die "Failed to update the machineconfigpool"
}

# Print useful information for debugging.
#
# Params:
#   $1 - the pod name
debug_pod() {
    local pod="$1"
    info "Debug pod: ${pod}"
    oc describe pods "${pod}"
    oc logs "${pod}"
}

# Wait for all pods of the app label to contain the expected message.
#
# Params:
#   $1 - app label
#   $2 - expected pods count (>=1)
#   $3 - message to be present in the logs
#   $4 - timeout (60)
#   $5 - namespace (the current one)
wait_for_app_pods_message() {
    local app="$1"
    local pod_count="$2"
    local message="$3"
    local timeout="$4"
    local namespace="$5"
    [[ -z "${pod_count}" ]] && pod_count=1
    [[ -z "${timeout}" ]] && timeout=60
    [[ -n "${namespace}" ]] && namespace=" -n ${namespace} "
    local pod
    local pods
    local i
    SECONDS=0
    while :; do
        # shellcheck disable=SC2086 # ${namespace} carries "-n <ns>" and must not be quoted
        mapfile -t pods < <(oc get pods -l app="${app}" --no-headers=true ${namespace} | awk '{print $1}')
        [[ "${#pods[@]}" -ge "${pod_count}" ]] && break
        if [[ "${SECONDS}" -gt "${timeout}" ]]; then
            printf "Unable to find ${pod_count} pods for '-l app=\"${app}\"' in ${SECONDS}s (%s)" "${pods[@]}"
            return 1
        fi
    done
    local log
    for pod in "${pods[@]}"; do
        while :; do
            # shellcheck disable=SC2086 # ${namespace} carries "-n <ns>" and must not be quoted
            log=$(oc logs ${namespace} "${pod}")
            echo "${log}" | grep "${message}" -q && \
                echo "Found $(echo "${log}" | grep "${message}") in ${pod}'s log (${SECONDS})" && break
            if [[ "${SECONDS}" -gt "${timeout}" ]]; then
                echo -n "Message '${message}' not present in '${pod}' pod of the '-l app=\"${app}\"' "
                printf "pods after ${SECONDS}s :(%s)\n" "${pods[@]}"
                echo "Pod ${pod}'s output so far:"
                echo "${log}"
                return 1
            fi
            sleep 1
        done
    done
}

oc config set-context --current --namespace=default

worker_nodes=$(oc get nodes | awk '{if ($3 == "worker") { print $1 } }')
num_nodes=$(echo "${worker_nodes}" | wc -w)
[[ ${num_nodes} -ne 0 ]] || \
    die "No worker nodes detected. Something is wrong with the cluster"

if [[ "${KATA_WITH_SYSTEM_QEMU}" == "yes" ]]; then
    # QEMU is deployed on the workers via the RHCOS extension.
    enable_sandboxedcontainers_extension
    oc apply -f "${deployments_dir}/configmap_installer_qemu.yaml"
fi

if [[ "${KATA_WITH_HOST_KERNEL}" == "yes" ]]; then
    oc apply -f "${deployments_dir}/configmap_installer_kernel.yaml"
fi

apply_kata_deploy

# Set SELinux to permissive mode
if [[ ${SELINUX_PERMISSIVE} == "yes" ]]; then
    info "Configuring SELinux"
    if [[ -z "${SELINUX_CONF_BASE64}" ]]; then
        SELINUX_CONF_BASE64=$(base64 -w0 < "${configs_dir}/selinux.conf")
        export SELINUX_CONF_BASE64
    fi
    envsubst < "${deployments_dir}"/machineconfig_selinux.yaml.in | \
        oc apply -f -
    oc get machineconfig/51-kata-selinux || \
        die "SELinux machineconfig not found"
    # The new SELinux configuration will trigger another reboot.
    wait_for_reboot 900
fi

if [[ "${WORKAROUND_9206_CRIO}" == "yes" ]]; then
    info "Applying workaround to enable skip_mount_home in crio on OCP 4.13"
    oc apply -f "${deployments_dir}/workaround-9206-crio.yaml"
    oc apply -f "${deployments_dir}/workaround-9206-crio-ds.yaml"
    wait_for_app_pods_message workaround-9206-crio-ds "${num_nodes}" "Config file present" 1200 || \
        echo "Failed to apply the workaround, proceeding anyway..."
fi

# FIXME: Remove when https://github.com/kata-containers/kata-containers/pull/8417 is resolved
# SELinux context is currently not handled by kata-deploy
oc apply -f "${deployments_dir}/relabel_selinux.yaml"
wait_for_app_pods_message restorecon "${num_nodes}" "NSENTER_FINISHED_WITH:" 120 "kube-system" || \
    echo "Failed to treat selinux, proceeding anyway..."
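
# A minimal invocation sketch, using the variables defined at the top of this
# script. The script file name and repo path below are examples only, not
# taken from this file:
#
#   export katacontainers_repo_dir="${HOME}/src/kata-containers"
#   KATA_WITH_SYSTEM_QEMU=no SELINUX_PERMISSIVE=no \
#       KATA_DEPLOY_IMAGE="quay.io/kata-containers/kata-deploy-ci:kata-containers-latest" \
#       ./install_kata.sh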