ci: Onboard another NVIDIA machine

Let's add a new NVIDIA machine, which will later be used for CC-related tests. For now, the current tests are skipped on the CC-capable machine. Signed-off-by: Manuel Huber <manuelh@nvidia.com> Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
2026-01-25 22:54:29 +00:00 · 2025-11-05 22:30:20 +01:00
parent 66e133e096
commit d8953f67c5
4 changed files with 30 additions and 12 deletions
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@@ -8,6 +8,7 @@ self-hosted-runner:
  # Labels of self-hosted runner that linter should ignore
  labels:
    - amd64-nvidia-a100
+    - amd64-nvidia-h100-snp
    - arm64-k8s
    - containerd-v1.7-overlayfs
    - containerd-v2.0-overlayfs
--- a/.github/workflows/run-k8s-tests-on-nvidia-gpu.yaml
+++ b/.github/workflows/run-k8s-tests-on-nvidia-gpu.yaml
@@ -29,22 +29,22 @@ permissions: {}

 jobs:
  run-nvidia-gpu-tests-on-amd64:
-    name: run-nvidia-gpu-tests-on-amd64
+    name: run-${{ matrix.environment.name }}-tests-on-amd64
    strategy:
      fail-fast: false
      matrix:
-        vmm:
-          - qemu-nvidia-gpu
-        k8s:
-          - kubeadm
-    runs-on: amd64-nvidia-a100
+        environment: [
+          { name: nvidia-gpu,     vmm: qemu-nvidia-gpu,     runner: amd64-nvidia-a100 },
+          { name: nvidia-gpu-snp, vmm: qemu-nvidia-gpu-snp, runner: amd64-nvidia-h100-snp },
+        ]
+    runs-on: ${{ matrix.environment.runner }}
    env:
      DOCKER_REGISTRY: ${{ inputs.registry }}
      DOCKER_REPO: ${{ inputs.repo }}
      DOCKER_TAG: ${{ inputs.tag }}
      GH_PR_NUMBER: ${{ inputs.pr-number }}
-      KATA_HYPERVISOR: ${{ matrix.vmm }}
-      KUBERNETES: ${{ matrix.k8s }}
+      KATA_HYPERVISOR: ${{ matrix.environment.vmm }}
+      KUBERNETES: kubeadm
      K8S_TEST_HOST_TYPE: all
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -66,20 +66,20 @@ jobs:
      - name: Install `bats`
        run: bash tests/integration/kubernetes/gha-run.sh install-bats

-      - name: Run tests
+      - name: Run tests ${{ matrix.environment.vmm }}
        timeout-minutes: 30
        run: bash tests/integration/kubernetes/gha-run.sh run-nv-tests
        env:
          NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
-      - name: Collect artifacts ${{ matrix.vmm }}
+      - name: Collect artifacts ${{ matrix.environment.vmm }}
        if: always()
        run: bash tests/integration/kubernetes/gha-run.sh collect-artifacts
        continue-on-error: true

-      - name: Archive artifacts ${{ matrix.vmm }}
+      - name: Archive artifacts ${{ matrix.environment.vmm }}
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
-          name: k8s-tests-${{ matrix.vmm }}-${{ matrix.k8s }}-${{ inputs.tag }}
+          name: k8s-tests-${{ matrix.environment.vmm }}-kubeadm-${{ inputs.tag }}
          path: /tmp/artifacts
          retention-days: 1

--- a/tests/integration/kubernetes/k8s-nvidia-cuda.bats
+++ b/tests/integration/kubernetes/k8s-nvidia-cuda.bats
@@ -16,6 +16,8 @@ POD_NAME_CUDA="cuda-vectoradd-kata"
 export POD_NAME_CUDA

 setup() {
+    [ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
+
    setup_common
    get_pod_config_dir

@@ -29,6 +31,8 @@ setup() {
 }

@test "CUDA Vector Addition Test" {
+    [ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
+
    # Create the CUDA pod
    kubectl apply -f "${pod_yaml}"

@@ -44,6 +48,8 @@ setup() {
 }

 teardown() {
+    [ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
+
    # Debugging information
    echo "=== CUDA vectoradd Pod Logs ==="
    kubectl logs "${pod_name}" || true
--- a/tests/integration/kubernetes/k8s-nvidia-nim.bats
+++ b/tests/integration/kubernetes/k8s-nvidia-nim.bats
@@ -79,6 +79,8 @@ create_embedqa_pod() {
 }

 setup_file() {
+    [ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
+
    setup_common

    dpkg -s jq >/dev/null 2>&1 || sudo apt -y install jq
@@ -102,6 +104,8 @@ setup_file() {
 }

@test "List of models available for inference" {
+    [ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
+
    # shellcheck disable=SC1091  # File is created by previous test
    source "${BATS_SUITE_TMPDIR}/env"
    # shellcheck disable=SC2031  # Variable is shared via file between BATS tests
@@ -122,6 +126,8 @@ setup_file() {
 }

@test "Simple OpenAI completion request" {
+    [ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
+
    # shellcheck disable=SC1091  # File is created by previous test
    source "${BATS_SUITE_TMPDIR}/env"
    # shellcheck disable=SC2031  # Variables are shared via file between BATS tests
@@ -147,6 +153,8 @@ setup_file() {


@test "LangChain NVIDIA AI Endpoints" {
+    [ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
+
    # shellcheck disable=SC1091  # File is created by previous test
    source "${BATS_SUITE_TMPDIR}/env"
    # shellcheck disable=SC2031  # Variables are shared via file between BATS tests
@@ -179,6 +187,7 @@ EOF
 }

@test "Kata Documentation RAG" {
+    [ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
    [ "${SKIP_MULTI_GPU_TESTS}" = "true" ] && skip "indicated to skip tests requiring multiple GPUs"

    # shellcheck disable=SC1091  # File is created by previous test
@@ -340,6 +349,8 @@ EOF
 }

 teardown_file() {
+    [ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
+
    # Debugging information
    echo "=== Instruct Pod Logs ==="
    kubectl logs "${POD_NAME_INSTRUCT}" || true