From aedf14b244b739775a27841ae67a65cb6152ed9a Mon Sep 17 00:00:00 2001
From: Hyounggyu Choi
Date: Mon, 2 Sep 2024 09:44:47 +0200
Subject: [PATCH 1/5] tests: Mimic node debugger with full privileges

This commit addresses an issue where loop devices could not be handled
via a node debugger because of its restricted privileges. It runs a pod
with full privileges, allowing it to mount the host root at `/host`,
similar to the node debugger. This change enables us to run the trusted
image storage tests using the `qemu-coco-dev` runtime class.

Fixes: #10133

Signed-off-by: Hyounggyu Choi
---
 .../custom-node-debugger.yaml                 | 37 +++++++++++++++++++
 tests/integration/kubernetes/tests_common.sh  |  2 +-
 2 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100644 tests/integration/kubernetes/runtimeclass_workloads/custom-node-debugger.yaml

diff --git a/tests/integration/kubernetes/runtimeclass_workloads/custom-node-debugger.yaml b/tests/integration/kubernetes/runtimeclass_workloads/custom-node-debugger.yaml
new file mode 100644
index 0000000000..58a6a8cfae
--- /dev/null
+++ b/tests/integration/kubernetes/runtimeclass_workloads/custom-node-debugger.yaml
@@ -0,0 +1,37 @@
+#
+# Copyright (c) IBM Corp. 2024
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+apiVersion: v1
+kind: Pod
+metadata:
+  name: custom-node-debugger
+spec:
+  affinity:
+    nodeAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        nodeSelectorTerms:
+        - matchExpressions:
+          - key: kubernetes.io/hostname
+            operator: In
+            values:
+            - ${NODE_NAME}
+  containers:
+  - name: node-debugger-container
+    image: quay.io/bedrock/ubuntu:latest
+    command: ["/bin/sh", "-c", "sleep infinity"]
+    stdin: true
+    tty: true
+    securityContext:
+      privileged: true
+      runAsUser: 0
+      allowPrivilegeEscalation: true
+    volumeMounts:
+    - name: host-root
+      mountPath: /host
+  volumes:
+  - name: host-root
+    hostPath:
+      path: /
+      type: Directory
diff --git a/tests/integration/kubernetes/tests_common.sh b/tests/integration/kubernetes/tests_common.sh
index c552f5bf2c..98a3c60c9e 100644
--- a/tests/integration/kubernetes/tests_common.sh
+++ b/tests/integration/kubernetes/tests_common.sh
@@ -104,7 +104,7 @@ exec_host() {
     local old_debugger_pods=($(kubectl get pods -o name | grep node-debugger))
 
     # Run a debug pod
-    kubectl debug -q "node/${node}" --image=quay.io/bedrock/ubuntu:latest -- chroot /host bash -c "sleep infinity" >&2
+    NODE_NAME="${node}" envsubst < runtimeclass_workloads/custom-node-debugger.yaml | kubectl apply -f - > /dev/null
 
     # Identify the new debugger pod
     local new_debugger_pod=$(get_new_debugger_pod "${old_debugger_pods[@]}")

From 374b8d2534c02834eaf92afc7cccc83ad8f97cd1 Mon Sep 17 00:00:00 2001
From: Hyounggyu Choi
Date: Mon, 2 Sep 2024 10:11:20 +0200
Subject: [PATCH 2/5] tests: Create and delete node debugger pod only once

Creating and deleting a node debugger pod for every `exec_host()` call
is inefficient. This commit changes the test suite to create and delete
the pod only once, globally.
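
For illustration, repeated `exec_host()` calls against the same node now
share a single pod (an indicative sketch, not part of the diff;
`get_one_kata_node` and `exec_host` are the existing helpers):

    node="$(get_one_kata_node)"
    exec_host "$node" touch /tmp/foo.txt  # first call creates the debugger pod
    exec_host "$node" rm /tmp/foo.txt     # later calls reuse the same pod
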
Signed-off-by: Hyounggyu Choi
---
 .../kubernetes/k8s-file-volume.bats           |  3 +-
 tests/integration/kubernetes/k8s-volume.bats  |  3 +-
 .../kubernetes/run_kubernetes_tests.sh        |  5 +++
 .../custom-node-debugger.yaml                 |  2 +-
 tests/integration/kubernetes/tests_common.sh  | 38 ++++++-------------
 5 files changed, 21 insertions(+), 30 deletions(-)

diff --git a/tests/integration/kubernetes/k8s-file-volume.bats b/tests/integration/kubernetes/k8s-file-volume.bats
index f35ab1decc..35891d1dc0 100644
--- a/tests/integration/kubernetes/k8s-file-volume.bats
+++ b/tests/integration/kubernetes/k8s-file-volume.bats
@@ -16,7 +16,8 @@ setup() {
     pod_name="test-file-volume"
     container_name="busybox-file-volume-container"
     node="$(get_one_kata_node)"
-    tmp_file=$(exec_host "$node" mktemp /tmp/file-volume-test-foo.XXXXX)
+    tmp_file=$(mktemp -u /tmp/file-volume-test-foo.XXXXX)
+    exec_host "$node" touch $tmp_file
     mount_path="/tmp/foo.txt"
     file_body="test"
     get_pod_config_dir
diff --git a/tests/integration/kubernetes/k8s-volume.bats b/tests/integration/kubernetes/k8s-volume.bats
index 4178f8b1e1..58c2b51c3a 100644
--- a/tests/integration/kubernetes/k8s-volume.bats
+++ b/tests/integration/kubernetes/k8s-volume.bats
@@ -16,7 +16,8 @@ setup() {
     get_pod_config_dir
 
     node=$(get_one_kata_node)
-    tmp_file=$(exec_host "$node" mktemp -d /tmp/data.XXXX)
+    tmp_file=$(mktemp -u /tmp/data.XXXX)
+    exec_host "$node" mkdir $tmp_file
     pv_yaml=$(mktemp --tmpdir pv_config.XXXXXX.yaml)
     pod_yaml=$(mktemp --tmpdir pod_config.XXXXXX.yaml)
     msg="Hello from Kubernetes"
diff --git a/tests/integration/kubernetes/run_kubernetes_tests.sh b/tests/integration/kubernetes/run_kubernetes_tests.sh
index 67753849d1..ab12babc29 100755
--- a/tests/integration/kubernetes/run_kubernetes_tests.sh
+++ b/tests/integration/kubernetes/run_kubernetes_tests.sh
@@ -129,6 +129,11 @@ do
     fi
 done
 
+# Clean up all node debugger pods whose name starts with `custom-node-debugger` if pods exist
+pods_to_be_deleted=$(kubectl get pods -n kube-system --no-headers -o custom-columns=:metadata.name \
+    | grep '^custom-node-debugger' || true)
+[ -n "$pods_to_be_deleted" ] && kubectl delete pod -n kube-system $pods_to_be_deleted || true
+
 [ ${#tests_fail[@]} -ne 0 ] && die "Tests FAILED from suites: ${tests_fail[*]}"
 
 info "All tests SUCCEEDED"
diff --git a/tests/integration/kubernetes/runtimeclass_workloads/custom-node-debugger.yaml b/tests/integration/kubernetes/runtimeclass_workloads/custom-node-debugger.yaml
index 58a6a8cfae..cb77fdfc7e 100644
--- a/tests/integration/kubernetes/runtimeclass_workloads/custom-node-debugger.yaml
+++ b/tests/integration/kubernetes/runtimeclass_workloads/custom-node-debugger.yaml
@@ -6,7 +6,7 @@
 apiVersion: v1
 kind: Pod
 metadata:
-  name: custom-node-debugger
+  name: ${POD_NAME}
 spec:
   affinity:
     nodeAffinity:
diff --git a/tests/integration/kubernetes/tests_common.sh b/tests/integration/kubernetes/tests_common.sh
index 98a3c60c9e..501a0fc42e 100644
--- a/tests/integration/kubernetes/tests_common.sh
+++ b/tests/integration/kubernetes/tests_common.sh
@@ -76,19 +76,6 @@ get_one_kata_node() {
     echo "${resource_name/"node/"}"
 }
 
-# Get the new debugger pod that wasn't present in the old_pods array.
-get_new_debugger_pod() {
-    local old_pods=("$@")
-    local new_pod_list=($(kubectl get pods -o name | grep node-debugger))
-
-    for new_pod in "${new_pod_list[@]}"; do
-        if [[ ! " ${old_pods[*]} " =~ " ${new_pod} " ]]; then
-            echo "${new_pod}"
-            return
-        fi
-    done
-}
-
 # Runs a command in the host filesystem.
 #
 # Parameters:
@@ -99,18 +86,18 @@ exec_host() {
     # `kubectl debug` always returns 0, so we hack it to return the right exit code.
     local command="${@:2}"
     command+='; echo -en \\n$?'
-
-    # Get the already existing debugger pods
-    local old_debugger_pods=($(kubectl get pods -o name | grep node-debugger))
+    # Make 7 character hash from the node name
+    local pod_name="custom-node-debugger-$(echo -n "$node" | sha1sum | cut -c1-7)"
 
     # Run a debug pod
-    NODE_NAME="${node}" envsubst < runtimeclass_workloads/custom-node-debugger.yaml | kubectl apply -f - > /dev/null
-
-    # Identify the new debugger pod
-    local new_debugger_pod=$(get_new_debugger_pod "${old_debugger_pods[@]}")
-
-    # Wait for the newly created pod to be ready
-    kubectl wait --timeout="30s" --for=condition=ready "${new_debugger_pod}" > /dev/null
+    # Check if there is an existing node debugger pod and reuse it
+    # Otherwise, create a new one
+    if ! kubectl get pod -n kube-system "${pod_name}" > /dev/null 2>&1; then
+        POD_NAME="${pod_name}" NODE_NAME="${node}" envsubst < runtimeclass_workloads/custom-node-debugger.yaml | \
+            kubectl apply -n kube-system -f - > /dev/null
+        # Wait for the newly created pod to be ready
+        kubectl wait pod -n kube-system --timeout="30s" --for=condition=ready "${pod_name}" > /dev/null
+    fi
 
     # Execute the command and capture the output
     # We're trailing the `\r` here due to: https://github.com/kata-containers/kata-containers/issues/8051
@@ -118,10 +105,7 @@ exec_host() {
     # [bats-exec-test:38] INFO: k8s configured to use runtimeclass
     # bash: line 1: $'\r': command not found
     # ```
-    local output="$(kubectl exec -qi "${new_debugger_pod}" -- chroot /host bash -c "${command}" | tr -d '\r')"
-
-    # Delete the newly created pod
-    kubectl delete "${new_debugger_pod}" >&2
+    local output="$(kubectl exec -qi -n kube-system "${pod_name}" -- chroot /host bash -c "${command}" | tr -d '\r')"
 
     # Output the command result
     local exit_code="$(echo "${output}" | tail -1)"

From 9cff9271bc8e9cf001c2706c56f5747c14d29e2b Mon Sep 17 00:00:00 2001
From: Hyounggyu Choi
Date: Fri, 6 Sep 2024 14:12:30 +0200
Subject: [PATCH 3/5] tests: Run all commands in *_loop_device() using exec_host()

If the host running the tests is different from the host where the
cluster is running, the *_loop_device() functions do not work as
expected, because the device is created on the test host while the
cluster expects the device to be local.

This commit ensures that all commands in the relevant functions are
executed via exec_host(), so that the device is handled on a cluster
node.

Additionally, it modifies exec_host() to return the exit code of the
last executed command, because the output captured by the existing
`kubectl debug` logic sometimes includes unexpected characters that are
difficult to handle. `kubectl exec` appears to properly return the exit
code of the command given to it.
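
For example, a caller can now branch on the propagated exit status (an
indicative sketch, not part of the diff; it assumes the caller's shell
runs with `set -o pipefail`, since the status of `kubectl exec` would
otherwise be masked by the trailing `tr`):

    node="$(get_one_kata_node)"
    if ! exec_host "$node" "losetup -fP /tmp/trusted-image-storage.img"; then
        echo "losetup failed on ${node}" >&2
    fi
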
Signed-off-by: Hyounggyu Choi
---
 .../integration/kubernetes/confidential_common.sh | 15 +++++++------
 tests/integration/kubernetes/tests_common.sh      | 13 ++++++-------
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/tests/integration/kubernetes/confidential_common.sh b/tests/integration/kubernetes/confidential_common.sh
index 5e5fb019ae..5b3e59ba71 100644
--- a/tests/integration/kubernetes/confidential_common.sh
+++ b/tests/integration/kubernetes/confidential_common.sh
@@ -87,27 +87,30 @@ function is_confidential_hardware() {
 
 function create_loop_device(){
     local loop_file="${1:-/tmp/trusted-image-storage.img}"
+    local node="$(get_one_kata_node)"
     cleanup_loop_device "$loop_file"
-    sudo dd if=/dev/zero of=$loop_file bs=1M count=2500
-    sudo losetup -fP $loop_file >/dev/null 2>&1
-    local device=$(sudo losetup -j $loop_file | awk -F'[: ]' '{print $1}')
+    exec_host "$node" "dd if=/dev/zero of=$loop_file bs=1M count=2500"
+    exec_host "$node" "losetup -fP $loop_file >/dev/null 2>&1"
+    local device=$(exec_host "$node" losetup -j $loop_file | awk -F'[: ]' '{print $1}')
+    echo $device
 }
 
 function cleanup_loop_device(){
     local loop_file="${1:-/tmp/trusted-image-storage.img}"
+    local node="$(get_one_kata_node)"
 
     # Find all loop devices associated with $loop_file
-    local existed_devices=$(sudo losetup -j $loop_file | awk -F'[: ]' '{print $1}')
+    local existed_devices=$(exec_host "$node" losetup -j $loop_file | awk -F'[: ]' '{print $1}')
 
     if [ -n "$existed_devices" ]; then
        # Iterate over each found loop device and detach it
        for d in $existed_devices; do
-           sudo losetup -d "$d" >/dev/null 2>&1
+           exec_host "$node" "losetup -d "$d" >/dev/null 2>&1"
        done
     fi
 
-    sudo rm -f "$loop_file" >/dev/null 2>&1 || true
+    exec_host "$node" "rm -f "$loop_file" >/dev/null 2>&1 || true"
 }
 

 # This function creates pod yaml. Parameters
diff --git a/tests/integration/kubernetes/tests_common.sh b/tests/integration/kubernetes/tests_common.sh
index 501a0fc42e..d21e40d0ec 100644
--- a/tests/integration/kubernetes/tests_common.sh
+++ b/tests/integration/kubernetes/tests_common.sh
@@ -85,7 +85,6 @@ exec_host() {
     local node="$1"
     # `kubectl debug` always returns 0, so we hack it to return the right exit code.
     local command="${@:2}"
-    command+='; echo -en \\n$?'
     # Make 7 character hash from the node name
     local pod_name="custom-node-debugger-$(echo -n "$node" | sha1sum | cut -c1-7)"
 
@@ -97,6 +96,11 @@ exec_host() {
             kubectl apply -n kube-system -f - > /dev/null
         # Wait for the newly created pod to be ready
         kubectl wait pod -n kube-system --timeout="30s" --for=condition=ready "${pod_name}" > /dev/null
+        # Manually check the exit status of the previous command to handle errors explicitly
+        # since `set -e` is not enabled, allowing subsequent commands to run if needed.
+        if [ $? -ne 0 ]; then
+            return $?
+        fi
     fi
 
     # Execute the command and capture the output
     # We're trailing the `\r` here due to: https://github.com/kata-containers/kata-containers/issues/8051
@@ -105,12 +109,7 @@ exec_host() {
     # [bats-exec-test:38] INFO: k8s configured to use runtimeclass
     # bash: line 1: $'\r': command not found
     # ```
-    local output="$(kubectl exec -qi -n kube-system "${pod_name}" -- chroot /host bash -c "${command}" | tr -d '\r')"
-
-    # Output the command result
-    local exit_code="$(echo "${output}" | tail -1)"
-    echo "$(echo "${output}" | head -n -1)"
-    return ${exit_code}
+    kubectl exec -qi -n kube-system "${pod_name}" -- chroot /host bash -c "${command}" | tr -d '\r'
 }
 
 auto_generate_policy_enabled() {

From c6b86e88e4e89672e333ffb48c415903edbfe526 Mon Sep 17 00:00:00 2001
From: Hyounggyu Choi
Date: Tue, 3 Sep 2024 13:22:20 +0200
Subject: [PATCH 4/5] tests: Increase timeouts for qemu-coco-dev in trusted
 image storage tests

Timeouts (e.g. `create_container_timeout` and `wait_time`) occur when
using `qemu-coco-dev`. This commit increases these timeouts for the
trusted image storage test cases.

Signed-off-by: Hyounggyu Choi
---
 .../kubernetes/k8s-guest-pull-image.bats      | 18 ++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/tests/integration/kubernetes/k8s-guest-pull-image.bats b/tests/integration/kubernetes/k8s-guest-pull-image.bats
index 2de58bd36e..46381810a9 100644
--- a/tests/integration/kubernetes/k8s-guest-pull-image.bats
+++ b/tests/integration/kubernetes/k8s-guest-pull-image.bats
@@ -110,6 +110,15 @@ setup() {
 
     pod_config=$(mktemp "${BATS_FILE_TMPDIR}/$(basename "${pod_config_template}").XXX")
     IMAGE="$image_pulled_time_less_than_default_time" NODE_NAME="$node" envsubst < "$pod_config_template" > "$pod_config"
+
+    # Set CreateContainerRequest timeout for qemu-coco-dev
+    if [ "${KATA_HYPERVISOR}" == "qemu-coco-dev" ]; then
+        create_container_timeout=300
+        set_metadata_annotation "$pod_config" \
+            "io.katacontainers.config.runtime.create_container_timeout" \
+            "${create_container_timeout}"
+    fi
+
     # Enable dm-integrity in guest
     set_metadata_annotation "${pod_config}" \
         "io.katacontainers.config.hypervisor.kernel_params" \
@@ -125,7 +134,9 @@ setup() {
 
     cat $pod_config
     add_allow_all_policy_to_yaml "$pod_config"
-    k8s_create_pod "$pod_config"
+    local wait_time=120
+    [ "${KATA_HYPERVISOR}" == "qemu-coco-dev" ] && wait_time=300
+    k8s_create_pod "$pod_config" "$wait_time"
 }
 
 @test "Test we cannot pull a large image that pull time exceeds createcontainer timeout inside the guest" {
@@ -195,6 +206,7 @@ setup() {
 
     # Set CreateContainerRequest timeout in the annotation to pull large image in guest
     create_container_timeout=120
+    [ "${KATA_HYPERVISOR}" == "qemu-coco-dev" ] && create_container_timeout=600
     set_metadata_annotation "$pod_config" \
         "io.katacontainers.config.runtime.create_container_timeout" \
         "${create_container_timeout}"
@@ -214,7 +226,9 @@ setup() {
 
     cat $pod_config
     add_allow_all_policy_to_yaml "$pod_config"
-    k8s_create_pod "$pod_config"
+    local wait_time=120
+    [ "${KATA_HYPERVISOR}" == "qemu-coco-dev" ] && wait_time=600
+    k8s_create_pod "$pod_config" "$wait_time"
 }
 
 teardown() {

From 2d6ac3d85d04778b5afbda7a30fa663bdf388ed7 Mon Sep 17 00:00:00 2001
From: Hyounggyu Choi
Date: Mon, 2 Sep 2024 10:48:36 +0200
Subject: [PATCH 5/5] tests: Re-enable guest-pull-image tests for qemu-coco-dev

Now that the issue with handling loop devices has been resolved, this
commit re-enables the guest-pull-image tests for `qemu-coco-dev`.
Signed-off-by: Hyounggyu Choi
---
 .../integration/kubernetes/k8s-guest-pull-image.bats | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/tests/integration/kubernetes/k8s-guest-pull-image.bats b/tests/integration/kubernetes/k8s-guest-pull-image.bats
index 46381810a9..4b2a896523 100644
--- a/tests/integration/kubernetes/k8s-guest-pull-image.bats
+++ b/tests/integration/kubernetes/k8s-guest-pull-image.bats
@@ -92,10 +92,6 @@ setup() {
     # The image pulled in the guest will be downloaded and unpacked in the `/run/kata-containers/image` directory.
     # The tests will use `cryptsetup` to encrypt a block device and mount it at `/run/kata-containers/image`.
 
-    if [ "${KATA_HYPERVISOR}" = "qemu-coco-dev" ]; then
-        skip "skip this specific one due to issue https://github.com/kata-containers/kata-containers/issues/10133"
-    fi
-
     storage_config=$(mktemp "${BATS_FILE_TMPDIR}/$(basename "${storage_config_template}").XXX")
     local_device=$(create_loop_device)
     LOCAL_DEVICE="$local_device" NODE_NAME="$node" envsubst < "$storage_config_template" > "$storage_config"
@@ -141,10 +137,6 @@ setup() {
 }
 
 @test "Test we cannot pull a large image that pull time exceeds createcontainer timeout inside the guest" {
-    if [ "${KATA_HYPERVISOR}" = "qemu-coco-dev" ]; then
-        skip "skip this specific one due to issue https://github.com/kata-containers/kata-containers/issues/10133"
-    fi
-
     storage_config=$(mktemp "${BATS_FILE_TMPDIR}/$(basename "${storage_config_template}").XXX")
     local_device=$(create_loop_device)
     LOCAL_DEVICE="$local_device" NODE_NAME="$node" envsubst < "$storage_config_template" > "$storage_config"
@@ -187,8 +179,8 @@ setup() {
 }
 
 @test "Test we can pull a large image inside the guest with large createcontainer timeout" {
-    if [ "${KATA_HYPERVISOR}" = "qemu-coco-dev" ]; then
-        skip "skip this specific one due to issue https://github.com/kata-containers/kata-containers/issues/10133"
+    if [ "${KATA_HYPERVISOR}" = "qemu-coco-dev" ] && [ "${KBS_INGRESS}" = "aks" ]; then
+        skip "skip this specific one due to issue https://github.com/kata-containers/kata-containers/issues/10299"
     fi
     storage_config=$(mktemp "${BATS_FILE_TMPDIR}/$(basename "${storage_config_template}").XXX")
     local_device=$(create_loop_device)