ci: Onboard another NVIDIA machine

Let's add a new NVIDIA machine, which will later be used for
CC-related tests.

For now, the current tests are skipped on the CC-capable machine.

Signed-off-by: Manuel Huber <manuelh@nvidia.com>
Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
This commit is contained in:
Manuel Huber
2025-11-05 22:30:20 +01:00
committed by Fabiano Fidêncio
parent 66e133e096
commit d8953f67c5
4 changed files with 30 additions and 12 deletions

View File

@@ -8,6 +8,7 @@ self-hosted-runner:
# Labels of self-hosted runner that linter should ignore
labels:
- amd64-nvidia-a100
- amd64-nvidia-h100-snp
- arm64-k8s
- containerd-v1.7-overlayfs
- containerd-v2.0-overlayfs

View File

@@ -29,22 +29,22 @@ permissions: {}
jobs:
run-nvidia-gpu-tests-on-amd64:
name: run-nvidia-gpu-tests-on-amd64
name: run-${{ matrix.environment.name }}-tests-on-amd64
strategy:
fail-fast: false
matrix:
vmm:
- qemu-nvidia-gpu
k8s:
- kubeadm
runs-on: amd64-nvidia-a100
environment: [
{ name: nvidia-gpu, vmm: qemu-nvidia-gpu, runner: amd64-nvidia-a100 },
{ name: nvidia-gpu-snp, vmm: qemu-nvidia-gpu-snp, runner: amd64-nvidia-h100-snp },
]
runs-on: ${{ matrix.environment.runner }}
env:
DOCKER_REGISTRY: ${{ inputs.registry }}
DOCKER_REPO: ${{ inputs.repo }}
DOCKER_TAG: ${{ inputs.tag }}
GH_PR_NUMBER: ${{ inputs.pr-number }}
KATA_HYPERVISOR: ${{ matrix.vmm }}
KUBERNETES: ${{ matrix.k8s }}
KATA_HYPERVISOR: ${{ matrix.environment.vmm }}
KUBERNETES: kubeadm
K8S_TEST_HOST_TYPE: all
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -66,20 +66,20 @@ jobs:
- name: Install `bats`
run: bash tests/integration/kubernetes/gha-run.sh install-bats
- name: Run tests
- name: Run tests ${{ matrix.environment.vmm }}
timeout-minutes: 30
run: bash tests/integration/kubernetes/gha-run.sh run-nv-tests
env:
NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
- name: Collect artifacts ${{ matrix.vmm }}
- name: Collect artifacts ${{ matrix.environment.vmm }}
if: always()
run: bash tests/integration/kubernetes/gha-run.sh collect-artifacts
continue-on-error: true
- name: Archive artifacts ${{ matrix.vmm }}
- name: Archive artifacts ${{ matrix.environment.vmm }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: k8s-tests-${{ matrix.vmm }}-${{ matrix.k8s }}-${{ inputs.tag }}
name: k8s-tests-${{ matrix.environment.vmm }}-kubeadm-${{ inputs.tag }}
path: /tmp/artifacts
retention-days: 1

View File

@@ -16,6 +16,8 @@ POD_NAME_CUDA="cuda-vectoradd-kata"
export POD_NAME_CUDA
setup() {
[ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
setup_common
get_pod_config_dir
@@ -29,6 +31,8 @@ setup() {
}
@test "CUDA Vector Addition Test" {
[ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
# Create the CUDA pod
kubectl apply -f "${pod_yaml}"
@@ -44,6 +48,8 @@ setup() {
}
teardown() {
[ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
# Debugging information
echo "=== CUDA vectoradd Pod Logs ==="
kubectl logs "${pod_name}" || true

View File

@@ -79,6 +79,8 @@ create_embedqa_pod() {
}
setup_file() {
[ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
setup_common
dpkg -s jq >/dev/null 2>&1 || sudo apt -y install jq
@@ -102,6 +104,8 @@ setup_file() {
}
@test "List of models available for inference" {
[ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
# shellcheck disable=SC1091 # File is created by previous test
source "${BATS_SUITE_TMPDIR}/env"
# shellcheck disable=SC2031 # Variable is shared via file between BATS tests
@@ -122,6 +126,8 @@ setup_file() {
}
@test "Simple OpenAI completion request" {
[ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
# shellcheck disable=SC1091 # File is created by previous test
source "${BATS_SUITE_TMPDIR}/env"
# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
@@ -147,6 +153,8 @@ setup_file() {
@test "LangChain NVIDIA AI Endpoints" {
[ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
# shellcheck disable=SC1091 # File is created by previous test
source "${BATS_SUITE_TMPDIR}/env"
# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
@@ -179,6 +187,7 @@ EOF
}
@test "Kata Documentation RAG" {
[ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
[ "${SKIP_MULTI_GPU_TESTS}" = "true" ] && skip "indicated to skip tests requiring multiple GPUs"
# shellcheck disable=SC1091 # File is created by previous test
@@ -340,6 +349,8 @@ EOF
}
teardown_file() {
[ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
# Debugging information
echo "=== Instruct Pod Logs ==="
kubectl logs "${POD_NAME_INSTRUCT}" || true