From 14e74b8fc9d6dae7423d88c0d59ddd9aa8325af7 Mon Sep 17 00:00:00 2001
From: Wainer dos Santos Moschetta
Date: Tue, 15 Apr 2025 18:53:02 -0300
Subject: [PATCH 1/4] tests/k8s: fix kbs installation on Azure AKS

The Azure AKS addon-http-application-routing add-on is deprecated and
cannot be enabled on new clusters, which has caused some CI jobs to
fail. Migrate our code to use the approuting add-on instead.

Unlike addon-http-application-routing, this add-on doesn't configure a
managed cluster DNS zone, but the created ingress has a public IP. To
avoid having to deal with DNS setup, we will be using that address from
now on. Thus, some functions that are no longer used are deleted.

Fixes #11156
Signed-off-by: Wainer dos Santos Moschetta
---
 tests/gha-run-k8s-common.sh                  | 25 ++----
 .../kubernetes/confidential_kbs.sh           | 39 ++++++++-----------
 2 files changed, 20 insertions(+), 44 deletions(-)

diff --git a/tests/gha-run-k8s-common.sh b/tests/gha-run-k8s-common.sh
index 55878b4e71..d175e38c1d 100644
--- a/tests/gha-run-k8s-common.sh
+++ b/tests/gha-run-k8s-common.sh
@@ -75,10 +75,10 @@ function _print_rg_name() {
     echo "${AZ_RG:-"kataCI-$(_print_cluster_name "${test_type}")"}"
 }
 
-# Enable the HTTP application routing add-on to AKS.
+# Enable the approuting add-on on AKS.
 # Use with ingress to expose a service API externally.
 #
-function enable_cluster_http_application_routing() {
+function enable_cluster_approuting() {
     local test_type="${1:-k8s}"
     local cluster_name
     local rg
@@ -86,8 +86,7 @@ function enable_cluster_http_application_routing() {
     rg="$(_print_rg_name "${test_type}")"
     cluster_name="$(_print_cluster_name "${test_type}")"
 
-    az aks enable-addons -g "${rg}" -n "${cluster_name}" \
-        --addons http_application_routing
+    az aks approuting enable -g "${rg}" -n "${cluster_name}"
 }
 
 function install_azure_cli() {
@@ -194,24 +193,6 @@ function get_cluster_credentials() {
         -n "$(_print_cluster_name "${test_type}")"
 }
 
-
-# Get the AKS DNS zone name of HTTP application routing.
-#
-# Note: if the HTTP application routing add-on isn't installed in the cluster
-# then it will return an empty string.
-#
-function get_cluster_specific_dns_zone() {
-    local test_type="${1:-k8s}"
-    local cluster_name
-    local rg
-    local q="addonProfiles.httpApplicationRouting.config.HTTPApplicationRoutingZoneName"
-
-    rg="$(_print_rg_name "${test_type}")"
-    cluster_name="$(_print_cluster_name "${test_type}")"
-
-    az aks show -g "${rg}" -n "${cluster_name}" --query "${q}" | tr -d \"
-}
-
 function delete_cluster() {
     test_type="${1:-k8s}"
     local rg
diff --git a/tests/integration/kubernetes/confidential_kbs.sh b/tests/integration/kubernetes/confidential_kbs.sh
index d0d9c2f6b8..03a1a15f02 100644
--- a/tests/integration/kubernetes/confidential_kbs.sh
+++ b/tests/integration/kubernetes/confidential_kbs.sh
@@ -419,13 +419,20 @@ function kbs_k8s_deploy() {
     fi
 }
 
-# Return the kbs service host name in case ingress is configured
+# Return the kbs service public IP in case ingress is configured,
 # otherwise the cluster IP.
 #
 kbs_k8s_svc_host() {
     if kubectl get ingress -n "$KBS_NS" 2>/dev/null | grep -q kbs; then
-        kubectl get ingress "$KBS_INGRESS_NAME" -n "$KBS_NS" \
-            -o jsonpath='{.spec.rules[0].host}' 2>/dev/null
+        local host
+        # The ingress IP address can take a while to show up.
+        SECONDS=0
+        while true; do
+            host=$(kubectl get ingress "${KBS_INGRESS_NAME}" -n "${KBS_NS}" -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
+            [[ -z "${host}" && ${SECONDS} -lt 30 ]] || break
+            sleep 5
+        done
+        echo "${host}"
     elif kubectl get svc "$KBS_SVC_NAME" -n "$KBS_NS" &>/dev/null; then
         local host
         host=$(kubectl get nodes -o jsonpath='{.items[0].status.addresses[?(@.type=="InternalIP")].address}' -n "$KBS_NS")
@@ -514,29 +521,17 @@ _handle_ingress() {
 # Implement the ingress handler for AKS.
 #
 _handle_ingress_aks() {
-    local dns_zone
-
-    dns_zone=$(get_cluster_specific_dns_zone "")
-
-    # In case the DNS zone name is empty, the cluster might not have the HTTP
-    # application routing add-on. Let's try to enable it.
-    if [ -z "$dns_zone" ]; then
-        echo "::group::Enable HTTP application routing add-on"
-        enable_cluster_http_application_routing ""
-        echo "::endgroup::"
-        dns_zone=$(get_cluster_specific_dns_zone "")
-    fi
-
-    if [ -z "$dns_zone" ]; then
-        echo "ERROR: the DNS zone name is nil, it cannot configure Ingress"
-        return 1
-    fi
+    echo "::group::Enable approuting (application routing) add-on"
+    enable_cluster_approuting ""
+    echo "::endgroup::"
 
     pushd "${COCO_KBS_DIR}/config/kubernetes/overlays/"
 
     echo "::group::$(pwd)/ingress.yaml"
-    KBS_INGRESS_CLASS="addon-http-application-routing" \
-    KBS_INGRESS_HOST="kbs.${dns_zone}" \
+    # We don't use a cluster DNS zone; instead we get the ingress public
+    # IP, thus KBS_INGRESS_HOST is set empty.
+    KBS_INGRESS_CLASS="webapprouting.kubernetes.azure.com" \
+    KBS_INGRESS_HOST="\"\"" \
     envsubst < ingress.yaml | tee ingress.yaml.tmp
     echo "::endgroup::"
 
     mv ingress.yaml.tmp ingress.yaml
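The core of the change above is polling until the approuting ingress is
assigned a public IP. A minimal standalone sketch of that pattern,
assuming a reachable cluster; INGRESS and NS below are hypothetical
placeholders, not the KBS_INGRESS_NAME/KBS_NS values wired into the test
scripts:

    #!/usr/bin/env bash
    # Hypothetical placeholder names, for illustration only.
    INGRESS="my-ingress"
    NS="my-namespace"

    # SECONDS is bash's built-in seconds counter; resetting it gives a
    # ~30s polling budget.
    SECONDS=0
    while true; do
        ip=$(kubectl get ingress "${INGRESS}" -n "${NS}" \
            -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
        # Keep polling while the IP is empty and the budget isn't exhausted.
        [[ -z "${ip}" && ${SECONDS} -lt 30 ]] || break
        sleep 5
    done
    echo "Ingress public IP: ${ip:-<none>}"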
From a66aac0d77a88a382554a85e96a63029bc252840 Mon Sep 17 00:00:00 2001
From: Wainer dos Santos Moschetta
Date: Fri, 25 Apr 2025 16:57:40 -0300
Subject: [PATCH 2/4] tests/k8s: optimize nginx ingress for small AKS VMs

An AKS-managed ingress controller is used, which keeps two nginx pod
replicas, both requesting 500m of CPU. On the small VMs we've been using
on CI for running the CoCo non-TEE tests, that leaves only a small
amount of CPU for the tests. In fact, one of the pod replicas won't even
get started. So let's patch the ingress controller to keep only one
nginx replica.

Signed-off-by: Wainer dos Santos Moschetta
---
 .../kubernetes/confidential_kbs.sh           | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tests/integration/kubernetes/confidential_kbs.sh b/tests/integration/kubernetes/confidential_kbs.sh
index 03a1a15f02..bc57528deb 100644
--- a/tests/integration/kubernetes/confidential_kbs.sh
+++ b/tests/integration/kubernetes/confidential_kbs.sh
@@ -372,6 +372,10 @@ function kbs_k8s_deploy() {
     fi
     echo "::endgroup::"
 
+    echo "::group::Post deploy actions"
+    _post_deploy "${ingress}"
+    echo "::endgroup::"
+
     # By default, the KBS service is reachable within the cluster only,
     # thus the following healthy checker should run from a pod. So start a
     # debug pod where it will try to get a response from the service. The
@@ -547,6 +551,22 @@ _handle_ingress_nodeport() {
     export DEPLOYMENT_DIR=nodeport
 }
 
+# Run further actions after the KBS is deployed, usually to apply
+# additional configurations.
+#
+_post_deploy() {
+    local ingress="${1:-}"
+
+    if [[ "${ingress}" = "aks" ]]; then
+        # The AKS managed ingress controller defaults to two nginx pod
+        # replicas, both requesting 500m of CPU. On a cluster made of small
+        # VMs (e.g. 2 vCPUs), one of the pods might not even start. We need
+        # only one nginx, so patch the controller to keep a single replica.
+        echo "Patch the ingress controller to have only one replica of nginx"
+        waitForProcess "20" "5" \
+            "kubectl patch nginxingresscontroller/default -n app-routing-system --type=merge -p='{\"spec\":{\"scaling\": {\"minReplicas\": 1}}}'"
+    fi
+}
 
 # Prepare necessary resources for qemu-se runtime
 # Documentation: https://github.com/confidential-containers/trustee/tree/main/attestation-service/verifier/src/se
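Once _post_deploy() has patched the controller, the change can be read
back through the same custom resource targeted above. A hedged sketch:
the resource and namespace names come straight from this patch, while
the expected output is an assumption based on the merge patch applied:

    #!/usr/bin/env bash
    # Read back the field set by the merge patch; "1" is expected.
    kubectl get nginxingresscontroller/default -n app-routing-system \
        -o jsonpath='{.spec.scaling.minReplicas}'
    echo

    # The managed nginx pods live in the same namespace; after the patch
    # only a single replica should remain scheduled.
    kubectl get pods -n app-routing-system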
From 945482ff6e8b2b17c001ff85f00f6af28d4580e9 Mon Sep 17 00:00:00 2001
From: Wainer dos Santos Moschetta
Date: Fri, 25 Apr 2025 18:56:00 -0300
Subject: [PATCH 3/4] tests: make _print_instance_type() handle "all" host type

_print_instance_type() returns the instance type of the AKS nodes based
on the host type. Tests are grouped per host type into "small" and
"normal" sets based on their CPU requirements: "small" tests require few
CPUs and "normal" tests require more. There is a 3rd case: the "all"
host type maps to the union of the "small" and "normal" sets, and
_print_instance_type() should handle it properly. In this case, it
should return the largest instance type possible because the "normal"
tests will be executed too.

Signed-off-by: Wainer dos Santos Moschetta
---
 tests/gha-run-k8s-common.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/gha-run-k8s-common.sh b/tests/gha-run-k8s-common.sh
index d175e38c1d..e461e74e1b 100644
--- a/tests/gha-run-k8s-common.sh
+++ b/tests/gha-run-k8s-common.sh
@@ -41,7 +41,7 @@ function _print_instance_type() {
         small)
             echo "Standard_D2s_v5"
             ;;
-        normal)
+        all|normal)
             echo "Standard_D4s_v5"
             ;;
         *)

From 460c3394dd2a1b8826ecfda94fbfaccfaa96511b Mon Sep 17 00:00:00 2001
From: Wainer dos Santos Moschetta
Date: Fri, 25 Apr 2025 19:28:00 -0300
Subject: [PATCH 4/4] gha: run CoCo non-TEE tests on "all" host type

Running on the "all" host type has two consequences:

1) the "normal" tests run too (until now, only the "small" tests ran),
   increasing the coverage
2) the AKS cluster is created with larger VMs. This is a new
   requirement because the current ingress controller for the KBS
   service eats too many vCPUs, leaving only a few for the tests
   (which resulted in failures)

Signed-off-by: Wainer dos Santos Moschetta
---
 .github/workflows/run-kata-coco-tests.yaml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/run-kata-coco-tests.yaml b/.github/workflows/run-kata-coco-tests.yaml
index 70eb05030f..3f129a3eaa 100644
--- a/.github/workflows/run-kata-coco-tests.yaml
+++ b/.github/workflows/run-kata-coco-tests.yaml
@@ -222,6 +222,11 @@ jobs:
       AUTHENTICATED_IMAGE_USER: ${{ secrets.AUTHENTICATED_IMAGE_USER }}
       AUTHENTICATED_IMAGE_PASSWORD: ${{ secrets.AUTHENTICATED_IMAGE_PASSWORD }}
       SNAPSHOTTER: ${{ matrix.snapshotter }}
+      # Caution: the current ingress controller used to expose the KBS
+      # service requires many vCPUs, leaving only a few for the tests.
+      # Depending on the host type chosen, this can result in the creation
+      # of a cluster with insufficient resources.
+      K8S_TEST_HOST_TYPE: "all"
       USING_NFD: "false"
       AUTO_GENERATE_POLICY: "yes"
     steps:
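For context on the sizing choice: a Standard_D2s_v5 node has 2 vCPUs, so
the two nginx replicas requesting 500m each (see patch 2) already claim
1 vCPU, half the node, before any test pod is scheduled; a
Standard_D4s_v5 (4 vCPUs) leaves roughly 3 vCPUs free even with both
replicas running. A hedged sketch for eyeballing that budget on a live
cluster, using only standard kubectl jsonpath queries (the
app-routing-system namespace comes from patch 2; everything else is
generic):

    #!/usr/bin/env bash
    # Allocatable CPU per node: what the scheduler can actually hand out,
    # already less than the raw vCPU count after system reservations.
    kubectl get nodes \
        -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.allocatable.cpu}{"\n"}{end}'

    # CPU requests of the managed ingress pods eating into that budget.
    kubectl get pods -n app-routing-system \
        -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.containers[*].resources.requests.cpu}{"\n"}{end}'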