From 373a388844f28590958cc64132d535bbe9e9af4a Mon Sep 17 00:00:00 2001
From: Ruoqing He
Date: Wed, 22 Jan 2025 11:37:19 +0800
Subject: [PATCH] ci: Retry on failure of Create AKS cluster

The `Create AKS cluster` step in `run-k8s-tests-on-aks.yaml` is likely to
fail since we are issuing `PUT` requests to `aks` at a relatively high
frequency, while the `aks` end has its limits on `bucket-size` and
`refill-rate`, documented here [1].

Use `nick-fields/retry@v3` to retry 10 seconds after a request fails,
based on observations that AKS was requesting 7 or 8 second delays before
retry as part of its 429 responses.

[1] https://learn.microsoft.com/en-us/azure/aks/quotas-skus-regions#throttling-limits-on-aks-resource-provider-apis

Fixes: #10772

Signed-off-by: Ruoqing He
Signed-off-by: stevenhorsman
---
Note: the kata-deploy workflow keeps calling
tests/functional/kata-deploy/gha-run.sh for `create-cluster`, matching the
script used by its other steps and by the line being replaced.

 .github/workflows/run-k8s-tests-on-aks.yaml          |  9 +++++++--
 .github/workflows/run-kata-coco-stability-tests.yaml |  9 +++++++--
 .github/workflows/run-kata-coco-tests.yaml           |  9 +++++++--
 .github/workflows/run-kata-deploy-tests-on-aks.yaml  | 11 ++++++++---
 4 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/run-k8s-tests-on-aks.yaml b/.github/workflows/run-k8s-tests-on-aks.yaml
index 0677e47c7..b49b783dd 100644
--- a/.github/workflows/run-k8s-tests-on-aks.yaml
+++ b/.github/workflows/run-k8s-tests-on-aks.yaml
@@ -103,8 +103,13 @@ jobs:
           AZ_SUBSCRIPTION_ID: ${{ secrets.AZ_SUBSCRIPTION_ID }}
 
       - name: Create AKS cluster
-        timeout-minutes: 10
-        run: bash tests/integration/kubernetes/gha-run.sh create-cluster
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 15
+          max_attempts: 20
+          retry_on: error
+          retry_wait_seconds: 10
+          command: bash tests/integration/kubernetes/gha-run.sh create-cluster
 
       - name: Install `bats`
         run: bash tests/integration/kubernetes/gha-run.sh install-bats
diff --git a/.github/workflows/run-kata-coco-stability-tests.yaml b/.github/workflows/run-kata-coco-stability-tests.yaml
index 11ec21a13..eb8b2bcb8 100644
--- a/.github/workflows/run-kata-coco-stability-tests.yaml
+++ b/.github/workflows/run-kata-coco-stability-tests.yaml
@@ -87,8 +87,13 @@ jobs:
           AZ_SUBSCRIPTION_ID: ${{ secrets.AZ_SUBSCRIPTION_ID }}
 
       - name: Create AKS cluster
-        timeout-minutes: 10
-        run: bash tests/integration/kubernetes/gha-run.sh create-cluster
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 15
+          max_attempts: 20
+          retry_on: error
+          retry_wait_seconds: 10
+          command: bash tests/integration/kubernetes/gha-run.sh create-cluster
 
       - name: Install `bats`
         run: bash tests/integration/kubernetes/gha-run.sh install-bats
diff --git a/.github/workflows/run-kata-coco-tests.yaml b/.github/workflows/run-kata-coco-tests.yaml
index 8e3d784d1..384758751 100644
--- a/.github/workflows/run-kata-coco-tests.yaml
+++ b/.github/workflows/run-kata-coco-tests.yaml
@@ -333,8 +333,13 @@ jobs:
           AZ_SUBSCRIPTION_ID: ${{ secrets.AZ_SUBSCRIPTION_ID }}
 
       - name: Create AKS cluster
-        timeout-minutes: 10
-        run: bash tests/integration/kubernetes/gha-run.sh create-cluster
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 15
+          max_attempts: 20
+          retry_on: error
+          retry_wait_seconds: 10
+          command: bash tests/integration/kubernetes/gha-run.sh create-cluster
 
       - name: Install `bats`
         run: bash tests/integration/kubernetes/gha-run.sh install-bats
diff --git a/.github/workflows/run-kata-deploy-tests-on-aks.yaml b/.github/workflows/run-kata-deploy-tests-on-aks.yaml
index b397e8a87..718330994 100644
--- a/.github/workflows/run-kata-deploy-tests-on-aks.yaml
+++ b/.github/workflows/run-kata-deploy-tests-on-aks.yaml
@@ -71,8 +71,13 @@ jobs:
           AZ_SUBSCRIPTION_ID: ${{ secrets.AZ_SUBSCRIPTION_ID }}
 
       - name: Create AKS cluster
-        timeout-minutes: 10
-        run: bash tests/functional/kata-deploy/gha-run.sh create-cluster
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 15
+          max_attempts: 20
+          retry_on: error
+          retry_wait_seconds: 10
+          command: bash tests/functional/kata-deploy/gha-run.sh create-cluster
 
       - name: Install `bats`
         run: bash tests/functional/kata-deploy/gha-run.sh install-bats
@@ -85,7 +90,7 @@ jobs:
 
       - name: Run tests
         run: bash tests/functional/kata-deploy/gha-run.sh run-tests
-      
+
       - name: Delete AKS cluster
         if: always()
         run: bash tests/functional/kata-deploy/gha-run.sh delete-cluster