From fbbea68f1f5b9fa66e58f02391b1a31f0afd1ce8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Doktor?= Date: Tue, 5 Mar 2024 16:55:47 +0100 Subject: [PATCH 01/10] ci.ocp: Ignore selinux setup on non-selinux cluster MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit improve our selinux workaround to work well on non-selinux clusters. Signed-off-by: Lukáš Doktor --- ci/openshift-ci/cluster/deployments/relabel_selinux.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/openshift-ci/cluster/deployments/relabel_selinux.yaml b/ci/openshift-ci/cluster/deployments/relabel_selinux.yaml index ab6cdf8c3f..854965ec51 100644 --- a/ci/openshift-ci/cluster/deployments/relabel_selinux.yaml +++ b/ci/openshift-ci/cluster/deployments/relabel_selinux.yaml @@ -23,6 +23,7 @@ spec: set -e; echo Starting the relabel; nsenter --target 1 --mount bash -xc ' + command -v semanage &>/dev/null || { echo Does not look like a SELINUX cluster, skipping; exit 0; }; for ENTRY in \ \"/(.*/)?opt/kata/bin(/.*)?\" \ \"/(.*/)?opt/kata/runtime-rs/bin(/.*)?\" \ From f7febd07a010af5ac65a142e182287ed017515d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Doktor?= Date: Tue, 5 Mar 2024 16:58:31 +0100 Subject: [PATCH 02/10] ci.ocp: Allow to re-apply the selinux workaround MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit in case we re-apply the selinux workaround or if user had already existing similar rule the relabel_selinux was failing. Let's allow it to modify the existing rules as well to avoid such issues. 
Signed-off-by: Lukáš Doktor --- ci/openshift-ci/cluster/deployments/relabel_selinux.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/openshift-ci/cluster/deployments/relabel_selinux.yaml b/ci/openshift-ci/cluster/deployments/relabel_selinux.yaml index 854965ec51..de814c9091 100644 --- a/ci/openshift-ci/cluster/deployments/relabel_selinux.yaml +++ b/ci/openshift-ci/cluster/deployments/relabel_selinux.yaml @@ -32,7 +32,7 @@ spec: \"/(.*/)?opt/kata/share/tdvf(/.*)?\" \ \"/(.*/)?opt/kata/libexec(/.*)?\"; do - semanage fcontext -a -t qemu_exec_t \"$ENTRY\" || { echo \"Error in semanage command\"; exit 1; } + semanage fcontext -a -t qemu_exec_t \"$ENTRY\" || semanage fcontext -m -t qemu_exec_t \"$ENTRY\" || { echo \"Error in semanage command\"; exit 1; } done; restorecon -v -R /opt/kata || { echo \"Error in restorecon command\"; exit 1; } '; From 76c452d4e09b5be4a787af5616fab4422a7d7dc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Doktor?= Date: Wed, 6 Mar 2024 11:01:45 +0100 Subject: [PATCH 03/10] ci.ocp: Wait for all pods to finish the work MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit previously we only waited for a random pod to finish the selinux relabel, which could be error-prone. Let's wait for all of the pods to contain the expected message. Increase the timeout to 120s as some pods might take a little bit longer to finish. 
Signed-off-by: Lukáš Doktor --- ci/openshift-ci/cluster/install_kata.sh | 47 ++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/ci/openshift-ci/cluster/install_kata.sh b/ci/openshift-ci/cluster/install_kata.sh index 82e80d6432..61b9fe57d4 100755 --- a/ci/openshift-ci/cluster/install_kata.sh +++ b/ci/openshift-ci/cluster/install_kata.sh @@ -147,6 +147,48 @@ debug_pod() { oc logs "$pod" } +# Wait for all pods of the app label to contain expected message +# +# Params: +# $1 - app label +# $2 - expected pods count (>=1) +# $3 - message to be present in the logs +# $4 - timeout (60) +# $5 - namespace (the current one) +wait_for_app_pods_message() { + local app="$1" + local pod_count="$2" + local message="$3" + local timeout="$4" + local namespace="$5" + [ -z "$pod_count" ] && pod_count=1 + [ -z "$timeout" ] && timeout=60 + [ -n "$namespace" ] && namespace=" -n $namespace " + local pod + local pods + local i + SECONDS=0 + while :; do + pods=($(oc get pods -l app="$app" --no-headers=true $namespace | awk '{print $1}')) + [ "${#pods[@]}" -ge "$pod_count" ] && break + if [ "$SECONDS" -gt "$timeout" ]; then + echo "Unable to find ${pod_count} pods for '-l app=\"$app\"' in ${SECONDS}s (${pods[@]})" + return -1 + fi + done + for pod in "${pods[@]}"; do + while :; do + oc logs $namespace "$pod" | grep "$message" -q && echo "Found $message in $pod's log ($SECONDS)" && break; + if [ "$SECONDS" -gt "$timeout" ]; then + echo -n "Message '$message' not present in '${pod}' pod of the '-l app=\"$app\"' " + echo "pods after ${SECONDS}s (${pods[@]})" + return -1 + fi + sleep 1; + done + done +} + oc config set-context --current --namespace=default worker_nodes=$(oc get nodes | awk '{if ($3 == "worker") { print $1 } }') @@ -185,7 +227,4 @@ fi # FIXME: Remove when https://github.com/kata-containers/kata-containers/pull/8417 is resolved # Selinux context is currently not handled by kata-deploy oc apply -f ${deployments_dir}/relabel_selinux.yaml -( for I 
in $(seq 30); do - sleep 10 - oc logs -n kube-system ds/relabel-selinux-daemonset | grep "NSENTER_FINISHED_WITH:" && exit -done ) || { echo "Selinux relabel failed, check the logs"; exit -1; } +wait_for_app_pods_message restorecon "$num_nodes" "NSENTER_FINISHED_WITH:" 120 "kube-system" || { echo "Selinux relabel failed, check the logs"; exit -1; } From 739d627b4ebc0c289bd7872012597ebd28f9721e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Doktor?= Date: Tue, 12 Mar 2024 16:35:52 +0100 Subject: [PATCH 04/10] ci.ocp: Turn selinux relabel failures into warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of failing the pipeline let's proceed with an error message that selinux setup failed so, in case of a later failure, we know what might have caused it while keeping the coverage in case of a false setup issue. Signed-off-by: Lukáš Doktor --- ci/openshift-ci/cluster/install_kata.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/openshift-ci/cluster/install_kata.sh b/ci/openshift-ci/cluster/install_kata.sh index 61b9fe57d4..eaca843a63 100755 --- a/ci/openshift-ci/cluster/install_kata.sh +++ b/ci/openshift-ci/cluster/install_kata.sh @@ -227,4 +227,4 @@ fi # FIXME: Remove when https://github.com/kata-containers/kata-containers/pull/8417 is resolved # Selinux context is currently not handled by kata-deploy oc apply -f ${deployments_dir}/relabel_selinux.yaml -wait_for_app_pods_message restorecon "$num_nodes" "NSENTER_FINISHED_WITH:" 120 "kube-system" || { echo "Selinux relabel failed, check the logs"; exit -1; } +wait_for_app_pods_message restorecon "$num_nodes" "NSENTER_FINISHED_WITH:" 120 "kube-system" || echo "Failed to treat selinux, proceeding anyway..." 
From 6525c94065b46ec04bb5ebd4ccc566dadb8a3c0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Doktor?= Date: Wed, 6 Mar 2024 11:04:50 +0100 Subject: [PATCH 05/10] ci.ocp: Add a workaround to optionally enable skip_mount_home MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit the latest upstream kata-containers requires the skip_mount_home to be enabled, which is default on OCP 4.14+ but disabled on OCP 4.13-. Let's use a "WORKAROUND_9206_CRIO" (called by kata-containers GH issue) variable to allow users to enable this treatment when needed. Related to: #9206 Signed-off-by: Lukáš Doktor --- .../deployments/workaround-9206-crio-ds.yaml | 28 +++++++++++++++++++ .../deployments/workaround-9206-crio.yaml | 18 ++++++++++++ ci/openshift-ci/cluster/install_kata.sh | 11 ++++++++ 3 files changed, 57 insertions(+) create mode 100644 ci/openshift-ci/cluster/deployments/workaround-9206-crio-ds.yaml create mode 100644 ci/openshift-ci/cluster/deployments/workaround-9206-crio.yaml diff --git a/ci/openshift-ci/cluster/deployments/workaround-9206-crio-ds.yaml b/ci/openshift-ci/cluster/deployments/workaround-9206-crio-ds.yaml new file mode 100644 index 0000000000..0a5cf8a5ee --- /dev/null +++ b/ci/openshift-ci/cluster/deployments/workaround-9206-crio-ds.yaml @@ -0,0 +1,28 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: workaround-9206-crio-ds +spec: + selector: + matchLabels: + app: workaround-9206-crio-ds + template: + metadata: + labels: + app: workaround-9206-crio-ds + spec: + containers: + - name: workaround-9206-crio-ds + image: alpine + volumeMounts: + - name: host-dir + mountPath: /tmp/config + securityContext: + runAsUser: 0 + privileged: true + command: ["/bin/sh", "-c", "while [ ! 
-f '/tmp/config/10-workaround-9206-crio' ]; do sleep 1; done; echo 'Config file present'; sleep infinity"] + volumes: + - name: host-dir + hostPath: + path: /etc/crio/crio.conf.d/ diff --git a/ci/openshift-ci/cluster/deployments/workaround-9206-crio.yaml b/ci/openshift-ci/cluster/deployments/workaround-9206-crio.yaml new file mode 100644 index 0000000000..18313a0b5e --- /dev/null +++ b/ci/openshift-ci/cluster/deployments/workaround-9206-crio.yaml @@ -0,0 +1,18 @@ +--- +apiVersion: machineconfiguration.openshift.io/v1 +kind: MachineConfig +metadata: + labels: + machineconfiguration.openshift.io/role: worker + name: 10-workaround-9206-crio +spec: + config: + ignition: + version: 2.2.0 + storage: + files: + - contents: + source: data:text/plain;charset=utf-8;base64,W2NyaW9dCnN0b3JhZ2Vfb3B0aW9uID0gWwoJIm92ZXJsYXkuc2tpcF9tb3VudF9ob21lPXRydWUiLApdCg== + filesystem: root + mode: 0644 + path: /etc/crio/crio.conf.d/10-workaround-9206-crio diff --git a/ci/openshift-ci/cluster/install_kata.sh b/ci/openshift-ci/cluster/install_kata.sh index eaca843a63..ef719f7482 100755 --- a/ci/openshift-ci/cluster/install_kata.sh +++ b/ci/openshift-ci/cluster/install_kata.sh @@ -27,6 +27,10 @@ KATA_WITH_SYSTEM_QEMU=${KATA_WITH_SYSTEM_QEMU:-no} # KATA_WITH_HOST_KERNEL=${KATA_WITH_HOST_KERNEL:-no} +# Enable workaround for OCP 4.13 https://github.com/kata-containers/kata-containers/pull/9206 +# +WORKAROUND_9206_CRIO=${WORKAROUND_9206_CRIO:-no} + # Leverage kata-deploy to install Kata Containers in the cluster. 
# apply_kata_deploy() { @@ -224,6 +228,13 @@ if [ ${SELINUX_PERMISSIVE} == "yes" ]; then wait_for_reboot fi +if [[ "$WORKAROUND_9206_CRIO" == "yes" ]]; then + info "Applying workaround to enable skip_mount_home in crio on OCP 4.13" + oc apply -f "${deployments_dir}/workaround-9206-crio.yaml" + oc apply -f "${deployments_dir}/workaround-9206-crio-ds.yaml" + wait_for_app_pods_message workaround-9206-crio-ds "$num_nodes" "Config file present" 1200 || echo "Failed to apply the workaround, proceeding anyway..." +fi + # FIXME: Remove when https://github.com/kata-containers/kata-containers/pull/8417 is resolved # Selinux context is currently not handled by kata-deploy oc apply -f ${deployments_dir}/relabel_selinux.yaml From 2936503b246a611ceaa6ba58737d07e308422de9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Doktor?= Date: Wed, 6 Mar 2024 11:18:18 +0100 Subject: [PATCH 06/10] ci.ocp: Always replace the kata-deploy image in OCP pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit previously we only replaced the image when the previously defined one matched the "old_img". This is good to avoid modifying developers custom changes, but it might lead to hard-to-debug issues when the image stays different. Let's ensure we always replace the image with the one we asked for. 
Signed-off-by: Lukáš Doktor --- ci/openshift-ci/cluster/install_kata.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ci/openshift-ci/cluster/install_kata.sh b/ci/openshift-ci/cluster/install_kata.sh index ef719f7482..595ecee239 100755 --- a/ci/openshift-ci/cluster/install_kata.sh +++ b/ci/openshift-ci/cluster/install_kata.sh @@ -35,12 +35,11 @@ WORKAROUND_9206_CRIO=${WORKAROUND_9206_CRIO:-no} # apply_kata_deploy() { local deploy_file="tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" - local old_img="quay.io/kata-containers/kata-deploy:latest" # Use the kata-deploy CI image which is built for each pull request merged local new_img="quay.io/kata-containers/kata-deploy-ci:kata-containers-latest" pushd "$katacontainers_repo_dir" - sed -i "s#${old_img}#${new_img}#" "$deploy_file" + sed -ri "s#(\s+image:) .*#\1 ${new_img}#" "$deploy_file" info "Applying kata-deploy" oc apply -f tools/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml From b811ee0650ded0560f0a1da74238819bcda8ce5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Doktor?= Date: Wed, 6 Mar 2024 11:20:44 +0100 Subject: [PATCH 07/10] ci.ocp: Allow to override the kata-deploy image MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sometimes we want to test a different than the latest image (eg. when verifying a PR via ghcr images or when bisecting a failure over older builds). Let's add a KATA_DEPLOY_IMAGE variable for that while keeping the latest image by default. 
Fixes: #9228 Signed-off-by: Lukáš Doktor --- ci/openshift-ci/cluster/install_kata.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ci/openshift-ci/cluster/install_kata.sh b/ci/openshift-ci/cluster/install_kata.sh index 595ecee239..93315f77f7 100755 --- a/ci/openshift-ci/cluster/install_kata.sh +++ b/ci/openshift-ci/cluster/install_kata.sh @@ -27,6 +27,11 @@ KATA_WITH_SYSTEM_QEMU=${KATA_WITH_SYSTEM_QEMU:-no} # KATA_WITH_HOST_KERNEL=${KATA_WITH_HOST_KERNEL:-no} +# kata-deploy image to be used to deploy the kata (by default use CI image +# that is built for each pull request) +# +KATA_DEPLOY_IMAGE=${KATA_DEPLOY_IMAGE:-quay.io/kata-containers/kata-deploy-ci:kata-containers-latest} + # Enable workaround for OCP 4.13 https://github.com/kata-containers/kata-containers/pull/9206 # WORKAROUND_9206_CRIO=${WORKAROUND_9206_CRIO:-no} @@ -35,11 +40,8 @@ WORKAROUND_9206_CRIO=${WORKAROUND_9206_CRIO:-no} # apply_kata_deploy() { local deploy_file="tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" - # Use the kata-deploy CI image which is built for each pull request merged - local new_img="quay.io/kata-containers/kata-deploy-ci:kata-containers-latest" - pushd "$katacontainers_repo_dir" - sed -ri "s#(\s+image:) .*#\1 ${new_img}#" "$deploy_file" + sed -ri "s#(\s+image:) .*#\1 ${KATA_DEPLOY_IMAGE}#" "$deploy_file" info "Applying kata-deploy" oc apply -f tools/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml From cc02329fd1d14826cbf0018d2469ce6d18793183 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Doktor?= Date: Wed, 6 Mar 2024 12:39:26 +0100 Subject: [PATCH 08/10] ci.ocp: Add a cleanup script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This script doesn't serve as a complete cleanup, but it can be used as a best-effort cleaner between deploying different versions of kata-containers on the same OCP cluster. 
Signed-off-by: Lukáš Doktor --- ci/openshift-ci/cleanup.sh | 55 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100755 ci/openshift-ci/cleanup.sh diff --git a/ci/openshift-ci/cleanup.sh b/ci/openshift-ci/cleanup.sh new file mode 100755 index 0000000000..587c431478 --- /dev/null +++ b/ci/openshift-ci/cleanup.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# +# Copyright (c) 2024 Red Hat, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This script tries to remove most of the resources added by `test.sh` script +# from the cluster. + +scripts_dir=$(dirname $0) +deployments_dir=${scripts_dir}/cluster/deployments +configs_dir=${scripts_dir}/configs + +source ${scripts_dir}/lib.sh + +# Set to 'yes' if you want to configure SELinux to permissive on the cluster +# workers. +# +SELINUX_PERMISSIVE=${SELINUX_PERMISSIVE:-no} + +# Enable workaround for OCP 4.13 https://github.com/kata-containers/kata-containers/pull/9206 +# +WORKAROUND_9206_CRIO=${WORKAROUND_9206_CRIO:-no} + +# Ignore errors as we want best-effort-approach here +trap - ERR + +# Delete potential smoke-test resources +oc delete -f "${scripts_dir}/smoke/service.yaml" +oc delete -f "${scripts_dir}/smoke/service_kubernetes.yaml" +oc delete -f "${scripts_dir}/smoke/http-server.yaml" + +# Delete test.sh resources +oc delete -f "${deployments_dir}/relabel_selinux.yaml" +if [[ "$WORKAROUND_9206_CRIO" == "yes" ]]; then + oc delete -f "${deployments_dir}/workaround-9206-crio-ds.yaml" + oc delete -f "${deployments_dir}/workaround-9206-crio.yaml" +fi +[ ${SELINUX_PERMISSIVE} == "yes" ] && oc delete -f "${deployments_dir}/machineconfig_selinux.yaml.in" + +# Delete kata-containers +pushd "$katacontainers_repo_dir/tools/packaging/kata-deploy" +oc delete -f kata-deploy/base/kata-deploy.yaml +oc -n kube-system wait --timeout=10m --for=delete -l name=kata-deploy pod +oc apply -f kata-cleanup/base/kata-cleanup.yaml +echo "Wait for all related pods to be gone" +( repeats=1; for i in $(seq 1 600); do + oc 
get pods -l name="kubelet-kata-cleanup" --no-headers=true -n kube-system 2>&1 | grep "No resources found" -q && ((repeats++)) || repeats=1 + [ "$repeats" -gt 5 ] && echo kata-cleanup finished && break + sleep 1 +done; [ "$repeats" -gt 5 ]) || { echo "There are still some kata-cleanup related pods after 600 iterations"; oc get all -n kube-system; exit -1; } +oc delete -f kata-cleanup/base/kata-cleanup.yaml +oc delete -f kata-rbac/base/kata-rbac.yaml +oc delete -f runtimeclasses/kata-runtimeClasses.yaml + From 7ff2eb508e4251f5d551ba050efcd47af06b44a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Doktor?= Date: Wed, 6 Mar 2024 18:05:04 +0100 Subject: [PATCH 09/10] ci.ocp: Increase the mcp update timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit we're hitting this timeout quite often, looks like newer OCP takes longer to reconfigure. Increase the timeout to 1200. Signed-off-by: Lukáš Doktor --- ci/openshift-ci/cluster/install_kata.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/openshift-ci/cluster/install_kata.sh b/ci/openshift-ci/cluster/install_kata.sh index 93315f77f7..ac2581ebab 100755 --- a/ci/openshift-ci/cluster/install_kata.sh +++ b/ci/openshift-ci/cluster/install_kata.sh @@ -96,7 +96,7 @@ wait_for_reboot() { } wait_mcp_update() { - local delta="${1:-900}" + local delta="${1:-1200}" local sleep_time=30 # The machineconfigpool is fine when all the workers updated and are ready, # and none are degraded. From 46e62eecb1010d5c46e0b99b800ec93e428b748f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Doktor?= Date: Tue, 12 Mar 2024 16:58:02 +0100 Subject: [PATCH 10/10] ci.ocp: Log the full grepped line rather than the expected msg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit we are grepping for an expected message but it might contain extra bits of information fruitful for later debugging. 
Let's include it in the output and the full log in case of an error. Signed-off-by: Lukáš Doktor --- ci/openshift-ci/cluster/install_kata.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/openshift-ci/cluster/install_kata.sh b/ci/openshift-ci/cluster/install_kata.sh index ac2581ebab..d544aa70c3 100755 --- a/ci/openshift-ci/cluster/install_kata.sh +++ b/ci/openshift-ci/cluster/install_kata.sh @@ -183,10 +183,13 @@ wait_for_app_pods_message() { done for pod in "${pods[@]}"; do while :; do - oc logs $namespace "$pod" | grep "$message" -q && echo "Found $message in $pod's log ($SECONDS)" && break; + local log=$(oc logs $namespace "$pod") + echo "$log" | grep "$message" -q && echo "Found $(echo "$log" | grep "$message") in $pod's log ($SECONDS)" && break; if [ "$SECONDS" -gt "$timeout" ]; then echo -n "Message '$message' not present in '${pod}' pod of the '-l app=\"$app\"' " echo "pods after ${SECONDS}s (${pods[@]})" + echo "Pod $pod's output so far:" + echo "$log" return -1 fi sleep 1;