From 2c2941122caa177aaab6a8bfc4444e9d2783442c Mon Sep 17 00:00:00 2001 From: Hyounggyu Choi Date: Tue, 24 Sep 2024 14:33:44 +0200 Subject: [PATCH 1/2] tests: Fail fast in assert_pod_fail() `assert_pod_fail()` currently calls `k8s_create_pod()` to ensure that a pod does not become ready within the default 120s. However, this delays the test's completion even if an error message is detected earlier in the journal. This commit removes the use of `k8s_create_pod()` and modifies `assert_pod_fail()` to fail as soon as the pod enters a failed state. All failing pods end up in one of the following states: - CrashLoopBackOff - ImagePullBackOff The function now polls the pod's state every 5 seconds to check for these conditions. If the pod enters a failed state, the function immediately returns 0. If the pod does not reach a failed state within 120 seconds, it returns 1. Signed-off-by: Hyounggyu Choi --- tests/integration/kubernetes/lib.sh | 30 ++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/tests/integration/kubernetes/lib.sh b/tests/integration/kubernetes/lib.sh index 77ec57307c..eb732fdde9 100644 --- a/tests/integration/kubernetes/lib.sh +++ b/tests/integration/kubernetes/lib.sh @@ -90,15 +90,39 @@ assert_logs_contain() { # # Parameters: # $1 - the pod configuration file. +# $2 - the duration to wait for the container to fail. Defaults to 120. (optional) # assert_pod_fail() { local container_config="$1" + local duration="${2:-120}" + echo "In assert_pod_fail: $container_config" - echo "Attempt to create the container but it should fail" - ! k8s_create_pod "$container_config" || /bin/false -} + kubectl apply -f "${container_config}" + if ! pod_name=$(kubectl get pods -o jsonpath='{.items..metadata.name}'); then + echo "Failed to create the pod" + return 1 + fi + + local elapsed_time=0 + local sleep_time=5 + while true; do + echo "Waiting for a container to fail" + sleep ${sleep_time} + elapsed_time=$((elapsed_time+sleep_time)) + if [[ $(kubectl get pod "${pod_name}" \ + -o jsonpath='{.status.containerStatuses[0].state.waiting.reason}') = *BackOff* ]]; then + return 0 + fi + if [ $elapsed_time -gt $duration ]; then + echo "The container does not get into a failing state" >&2 + break + fi + done + return 1 + +} # Check the pulled rootfs on host for given node and sandbox_id # From c70588fafea88fa4e997295d301844162c954c55 Mon Sep 17 00:00:00 2001 From: Hyounggyu Choi Date: Tue, 24 Sep 2024 16:11:17 +0200 Subject: [PATCH 2/2] tests: Use custom-node-debugger pod MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With #10232 merged, we now have a persistent node debugger pod throughout the test. As a result, there’s no need to spawn another debugger pod using `kubectl debug`, which could lead to false negatives due to premature pod termination, as reported in #10081. This commit removes the `print_node_journal()` call that uses `kubectl debug` and instead uses `exec_host()` to capture the host journal. The `exec_host()` function is relocated to `tests/integration/kubernetes/lib.sh` to prevent cyclical dependencies between `tests_common.sh` and `lib.sh`. Signed-off-by: Hyounggyu Choi --- .../k8s-confidential-attestation.bats | 2 +- .../kubernetes/k8s-file-volume.bats | 1 + .../k8s-guest-pull-image-authenticated.bats | 2 +- .../k8s-guest-pull-image-encrypted.bats | 2 +- .../k8s-guest-pull-image-signature.bats | 2 +- .../kubernetes/k8s-liveness-probes.bats | 2 +- .../kubernetes/k8s-sealed-secret.bats | 2 +- tests/integration/kubernetes/k8s-volume.bats | 1 + tests/integration/kubernetes/lib.sh | 63 ++++++++++++------- tests/integration/kubernetes/tests_common.sh | 40 ------------ 10 files changed, 49 insertions(+), 68 deletions(-) diff --git a/tests/integration/kubernetes/k8s-confidential-attestation.bats b/tests/integration/kubernetes/k8s-confidential-attestation.bats index 7d57db2770..88764ae5f5 100644 --- a/tests/integration/kubernetes/k8s-confidential-attestation.bats +++ b/tests/integration/kubernetes/k8s-confidential-attestation.bats @@ -95,6 +95,6 @@ teardown() { if [[ -n "${node_start_time:-}" && -z "$BATS_TEST_COMPLETED" ]]; then echo "DEBUG: system logs of node '$node' since test start time ($node_start_time)" - print_node_journal "$node" "kata" --since "$node_start_time" || true + exec_host "${node}" journalctl -x -t "kata" --since '"'$node_start_time'"' || true fi } diff --git a/tests/integration/kubernetes/k8s-file-volume.bats b/tests/integration/kubernetes/k8s-file-volume.bats index 35891d1dc0..f063f444d5 100644 --- a/tests/integration/kubernetes/k8s-file-volume.bats +++ b/tests/integration/kubernetes/k8s-file-volume.bats @@ -5,6 +5,7 @@ # SPDX-License-Identifier: Apache-2.0 # +load "${BATS_TEST_DIRNAME}/lib.sh" load "${BATS_TEST_DIRNAME}/../../common.bash" load "${BATS_TEST_DIRNAME}/tests_common.sh" TEST_INITRD="${TEST_INITRD:-no}" diff --git a/tests/integration/kubernetes/k8s-guest-pull-image-authenticated.bats b/tests/integration/kubernetes/k8s-guest-pull-image-authenticated.bats index cf9ab5a469..ff1c50cc1a 100644 --- a/tests/integration/kubernetes/k8s-guest-pull-image-authenticated.bats +++ b/tests/integration/kubernetes/k8s-guest-pull-image-authenticated.bats @@ -117,6 +117,6 @@ teardown() { if [[ -n "${node_start_time:-}" && -z "$BATS_TEST_COMPLETED" ]]; then echo "DEBUG: system logs of node '$node' since test start time ($node_start_time)" - print_node_journal "$node" "kata" --since "$node_start_time" || true + exec_host "${node}" journalctl -x -t "kata" --since '"'$node_start_time'"' || true fi } diff --git a/tests/integration/kubernetes/k8s-guest-pull-image-encrypted.bats b/tests/integration/kubernetes/k8s-guest-pull-image-encrypted.bats index 0753af5170..5ecff42570 100644 --- a/tests/integration/kubernetes/k8s-guest-pull-image-encrypted.bats +++ b/tests/integration/kubernetes/k8s-guest-pull-image-encrypted.bats @@ -100,6 +100,6 @@ teardown() { if [[ -n "${node_start_time:-}" && -z "$BATS_TEST_COMPLETED" ]]; then echo "DEBUG: system logs of node '$node' since test start time ($node_start_time)" - print_node_journal "$node" "kata" --since "$node_start_time" || true + exec_host "${node}" journalctl -x -t "kata" --since '"'$node_start_time'"' || true fi } diff --git a/tests/integration/kubernetes/k8s-guest-pull-image-signature.bats b/tests/integration/kubernetes/k8s-guest-pull-image-signature.bats index 4866ae84a0..69a2909152 100644 --- a/tests/integration/kubernetes/k8s-guest-pull-image-signature.bats +++ b/tests/integration/kubernetes/k8s-guest-pull-image-signature.bats @@ -151,6 +151,6 @@ teardown() { if [[ -n "${node_start_time:-}" && -z "$BATS_TEST_COMPLETED" ]]; then echo "DEBUG: system logs of node '$node' since test start time ($node_start_time)" - print_node_journal "$node" "kata" --since "$node_start_time" || true + exec_host "${node}" journalctl -x -t "kata" --since '"'$node_start_time'"' || true fi } diff --git a/tests/integration/kubernetes/k8s-liveness-probes.bats b/tests/integration/kubernetes/k8s-liveness-probes.bats index 7540f8422d..42557be8bc 100644 --- a/tests/integration/kubernetes/k8s-liveness-probes.bats +++ b/tests/integration/kubernetes/k8s-liveness-probes.bats @@ -101,6 +101,6 @@ teardown() { if [[ -n "${node_start_time:-}" && -z "$BATS_TEST_COMPLETED" ]]; then echo "DEBUG: system logs of node '$node' since test start time ($node_start_time)" - print_node_journal "$node" "kata" --since "$node_start_time" || true + exec_host "${node}" journalctl -x -t "kata" --since '"'$node_start_time'"' || true fi } diff --git a/tests/integration/kubernetes/k8s-sealed-secret.bats b/tests/integration/kubernetes/k8s-sealed-secret.bats index b36ff41bd5..bdcd56066a 100644 --- a/tests/integration/kubernetes/k8s-sealed-secret.bats +++ b/tests/integration/kubernetes/k8s-sealed-secret.bats @@ -115,6 +115,6 @@ teardown() { if [[ -n "${node_start_time:-}" && -z "$BATS_TEST_COMPLETED" ]]; then echo "DEBUG: system logs of node '$node' since test start time ($node_start_time)" - print_node_journal "$node" "kata" --since "$node_start_time" || true + exec_host "${node}" journalctl -x -t "kata" --since '"'$node_start_time'"' || true fi } diff --git a/tests/integration/kubernetes/k8s-volume.bats b/tests/integration/kubernetes/k8s-volume.bats index 58c2b51c3a..b1386b2096 100644 --- a/tests/integration/kubernetes/k8s-volume.bats +++ b/tests/integration/kubernetes/k8s-volume.bats @@ -5,6 +5,7 @@ # SPDX-License-Identifier: Apache-2.0 # +load "${BATS_TEST_DIRNAME}/lib.sh" load "${BATS_TEST_DIRNAME}/../../common.bash" load "${BATS_TEST_DIRNAME}/tests_common.sh" TEST_INITRD="${TEST_INITRD:-no}" diff --git a/tests/integration/kubernetes/lib.sh b/tests/integration/kubernetes/lib.sh index eb732fdde9..56713e950b 100644 --- a/tests/integration/kubernetes/lib.sh +++ b/tests/integration/kubernetes/lib.sh @@ -64,6 +64,46 @@ k8s_create_pod() { fi } +# Runs a command in the host filesystem. +# +# Parameters: +# $1 - the node name +# +exec_host() { + local node="$1" + # `kubectl debug` always returns 0, so we hack it to return the right exit code. + local command="${@:2}" + # Make 7 character hash from the node name + local pod_name="custom-node-debugger-$(echo -n "$node" | sha1sum | cut -c1-7)" + + # Run a debug pod + # Check if there is an existing node debugger pod and reuse it + # Otherwise, create a new one + if ! kubectl get pod -n kube-system "${pod_name}" > /dev/null 2>&1; then + POD_NAME="${pod_name}" NODE_NAME="${node}" envsubst < runtimeclass_workloads/custom-node-debugger.yaml | \ + kubectl apply -n kube-system -f - > /dev/null + # Wait for the newly created pod to be ready + kubectl wait pod -n kube-system --timeout="30s" --for=condition=ready "${pod_name}" > /dev/null + # Manually check the exit status of the previous command to handle errors explicitly + # since `set -e` is not enabled, allowing subsequent commands to run if needed. + if [ $? -ne 0 ]; then + return $? + fi + fi + + # Execute the command and capture the output + # We're trailing the `\r` here due to: https://github.com/kata-containers/kata-containers/issues/8051 + # tl;dr: When testing with CRI-O we're facing the following error: + # ``` + # (from function `exec_host' in file tests_common.sh, line 51, + # in test file k8s-file-volume.bats, line 25) + # `exec_host "echo "$file_body" > $tmp_file"' failed with status 127 + # [bats-exec-test:38] INFO: k8s configured to use runtimeclass + # bash: line 1: $'\r': command not found + # ``` + kubectl exec -qi -n kube-system "${pod_name}" -- chroot /host bash -c "${command}" | tr -d '\r' +} + # Check the logged messages on host have a given message. # # Parameters: @@ -79,7 +119,7 @@ assert_logs_contain() { local message="$4" # Note: with image-rs we get more than the default 1000 lines of logs - print_node_journal "$node" "$log_id" --since "$datetime" | grep "$message" + exec_host "${node}" journalctl -x -t $log_id --since '"'$datetime'"' | grep "$message" } # Create a pod then assert it fails to run. Use in tests that you expect the @@ -262,27 +302,6 @@ set_node() { "${yaml}" } -# Get the systemd's journal from a worker node -# -# Parameters: -# $1 - the k8s worker node name -# $2 - the syslog identifier as in journalctl's -t option -# $N - (optional) any extra parameters to journalctl -# -print_node_journal() { - local node="$1" - local id="$2" - shift 2 - local img="quay.io/prometheus/busybox" - - kubectl debug --image "$img" -q -i "node/${node}" \ - -- chroot /host journalctl -x -t "$id" --no-pager "$@" - # Delete the debugger pod - kubectl get pods -o name | grep "node-debugger-${node}" | \ - xargs kubectl delete > /dev/null -} - - # Get the sandbox id for kata container from a worker node # # Parameters: diff --git a/tests/integration/kubernetes/tests_common.sh b/tests/integration/kubernetes/tests_common.sh index d21e40d0ec..5339c64f07 100644 --- a/tests/integration/kubernetes/tests_common.sh +++ b/tests/integration/kubernetes/tests_common.sh @@ -76,46 +76,6 @@ get_one_kata_node() { echo "${resource_name/"node/"}" } -# Runs a command in the host filesystem. -# -# Parameters: -# $1 - the node name -# -exec_host() { - local node="$1" - # `kubectl debug` always returns 0, so we hack it to return the right exit code. - local command="${@:2}" - # Make 7 character hash from the node name - local pod_name="custom-node-debugger-$(echo -n "$node" | sha1sum | cut -c1-7)" - - # Run a debug pod - # Check if there is an existing node debugger pod and reuse it - # Otherwise, create a new one - if ! kubectl get pod -n kube-system "${pod_name}" > /dev/null 2>&1; then - POD_NAME="${pod_name}" NODE_NAME="${node}" envsubst < runtimeclass_workloads/custom-node-debugger.yaml | \ - kubectl apply -n kube-system -f - > /dev/null - # Wait for the newly created pod to be ready - kubectl wait pod -n kube-system --timeout="30s" --for=condition=ready "${pod_name}" > /dev/null - # Manually check the exit status of the previous command to handle errors explicitly - # since `set -e` is not enabled, allowing subsequent commands to run if needed. - if [ $? -ne 0 ]; then - return $? - fi - fi - - # Execute the command and capture the output - # We're trailing the `\r` here due to: https://github.com/kata-containers/kata-containers/issues/8051 - # tl;dr: When testing with CRI-O we're facing the following error: - # ``` - # (from function `exec_host' in file tests_common.sh, line 51, - # in test file k8s-file-volume.bats, line 25) - # `exec_host "echo "$file_body" > $tmp_file"' failed with status 127 - # [bats-exec-test:38] INFO: k8s configured to use runtimeclass - # bash: line 1: $'\r': command not found - # ``` - kubectl exec -qi -n kube-system "${pod_name}" -- chroot /host bash -c "${command}" | tr -d '\r' -} - auto_generate_policy_enabled() { [ "${AUTO_GENERATE_POLICY}" == "yes" ] }