ci: Retry on failure of Create AKS cluster

The `Create AKS cluster` step in `run-k8s-tests-on-aks.yaml` is likely
to fail since we are issuing `PUT` requests to `aks` at a relatively
high frequency, while the `aks` end has its limits on `bucket-size` and
`refill-rate`, documented here [1].

Use `nick-fields/retry@v3` to retry 10 seconds after a request fails,
based on observations that AKS requested 7- or 8-second delays
before retrying as part of its 429 responses.

[1] https://learn.microsoft.com/en-us/azure/aks/quotas-skus-regions#throttling-limits-on-aks-resource-provider-apis

Fixes: #10772

Signed-off-by: Ruoqing He <heruoqing@iscas.ac.cn>
Signed-off-by: stevenhorsman <steven@uk.ibm.com>
This commit is contained in:
Ruoqing He 2025-01-22 11:37:19 +08:00 committed by stevenhorsman
parent e71bc1f068
commit 373a388844
4 changed files with 29 additions and 9 deletions

View File

@ -103,8 +103,13 @@ jobs:
AZ_SUBSCRIPTION_ID: ${{ secrets.AZ_SUBSCRIPTION_ID }} AZ_SUBSCRIPTION_ID: ${{ secrets.AZ_SUBSCRIPTION_ID }}
- name: Create AKS cluster - name: Create AKS cluster
timeout-minutes: 10 uses: nick-fields/retry@v3
run: bash tests/integration/kubernetes/gha-run.sh create-cluster with:
timeout_minutes: 15
max_attempts: 20
retry_on: error
retry_wait_seconds: 10
command: bash tests/integration/kubernetes/gha-run.sh create-cluster
- name: Install `bats` - name: Install `bats`
run: bash tests/integration/kubernetes/gha-run.sh install-bats run: bash tests/integration/kubernetes/gha-run.sh install-bats

View File

@ -87,8 +87,13 @@ jobs:
AZ_SUBSCRIPTION_ID: ${{ secrets.AZ_SUBSCRIPTION_ID }} AZ_SUBSCRIPTION_ID: ${{ secrets.AZ_SUBSCRIPTION_ID }}
- name: Create AKS cluster - name: Create AKS cluster
timeout-minutes: 10 uses: nick-fields/retry@v3
run: bash tests/integration/kubernetes/gha-run.sh create-cluster with:
timeout_minutes: 15
max_attempts: 20
retry_on: error
retry_wait_seconds: 10
command: bash tests/integration/kubernetes/gha-run.sh create-cluster
- name: Install `bats` - name: Install `bats`
run: bash tests/integration/kubernetes/gha-run.sh install-bats run: bash tests/integration/kubernetes/gha-run.sh install-bats

View File

@ -333,8 +333,13 @@ jobs:
AZ_SUBSCRIPTION_ID: ${{ secrets.AZ_SUBSCRIPTION_ID }} AZ_SUBSCRIPTION_ID: ${{ secrets.AZ_SUBSCRIPTION_ID }}
- name: Create AKS cluster - name: Create AKS cluster
timeout-minutes: 10 uses: nick-fields/retry@v3
run: bash tests/integration/kubernetes/gha-run.sh create-cluster with:
timeout_minutes: 15
max_attempts: 20
retry_on: error
retry_wait_seconds: 10
command: bash tests/integration/kubernetes/gha-run.sh create-cluster
- name: Install `bats` - name: Install `bats`
run: bash tests/integration/kubernetes/gha-run.sh install-bats run: bash tests/integration/kubernetes/gha-run.sh install-bats

View File

@ -71,8 +71,13 @@ jobs:
AZ_SUBSCRIPTION_ID: ${{ secrets.AZ_SUBSCRIPTION_ID }} AZ_SUBSCRIPTION_ID: ${{ secrets.AZ_SUBSCRIPTION_ID }}
- name: Create AKS cluster - name: Create AKS cluster
timeout-minutes: 10 uses: nick-fields/retry@v3
run: bash tests/functional/kata-deploy/gha-run.sh create-cluster with:
timeout_minutes: 15
max_attempts: 20
retry_on: error
retry_wait_seconds: 10
command: bash tests/integration/kubernetes/gha-run.sh create-cluster
- name: Install `bats` - name: Install `bats`
run: bash tests/functional/kata-deploy/gha-run.sh install-bats run: bash tests/functional/kata-deploy/gha-run.sh install-bats
@ -85,7 +90,7 @@ jobs:
- name: Run tests - name: Run tests
run: bash tests/functional/kata-deploy/gha-run.sh run-tests run: bash tests/functional/kata-deploy/gha-run.sh run-tests
- name: Delete AKS cluster - name: Delete AKS cluster
if: always() if: always()
run: bash tests/functional/kata-deploy/gha-run.sh delete-cluster run: bash tests/functional/kata-deploy/gha-run.sh delete-cluster