mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-01-25 22:54:29 +00:00
ci: Onboard another NVIDIA machine
Let's add a new NVIDIA machine, which will later be used for CC-related tests. For now, the current tests are skipped on the CC-capable machine. Signed-off-by: Manuel Huber <manuelh@nvidia.com> Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
This commit is contained in:
committed by
Fabiano Fidêncio
parent
66e133e096
commit
d8953f67c5
1
.github/actionlint.yaml
vendored
1
.github/actionlint.yaml
vendored
@@ -8,6 +8,7 @@ self-hosted-runner:
|
||||
# Labels of self-hosted runner that linter should ignore
|
||||
labels:
|
||||
- amd64-nvidia-a100
|
||||
- amd64-nvidia-h100-snp
|
||||
- arm64-k8s
|
||||
- containerd-v1.7-overlayfs
|
||||
- containerd-v2.0-overlayfs
|
||||
|
||||
@@ -29,22 +29,22 @@ permissions: {}
|
||||
|
||||
jobs:
|
||||
run-nvidia-gpu-tests-on-amd64:
|
||||
name: run-nvidia-gpu-tests-on-amd64
|
||||
name: run-${{ matrix.environment.name }}-tests-on-amd64
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
vmm:
|
||||
- qemu-nvidia-gpu
|
||||
k8s:
|
||||
- kubeadm
|
||||
runs-on: amd64-nvidia-a100
|
||||
environment: [
|
||||
{ name: nvidia-gpu, vmm: qemu-nvidia-gpu, runner: amd64-nvidia-a100 },
|
||||
{ name: nvidia-gpu-snp, vmm: qemu-nvidia-gpu-snp, runner: amd64-nvidia-h100-snp },
|
||||
]
|
||||
runs-on: ${{ matrix.environment.runner }}
|
||||
env:
|
||||
DOCKER_REGISTRY: ${{ inputs.registry }}
|
||||
DOCKER_REPO: ${{ inputs.repo }}
|
||||
DOCKER_TAG: ${{ inputs.tag }}
|
||||
GH_PR_NUMBER: ${{ inputs.pr-number }}
|
||||
KATA_HYPERVISOR: ${{ matrix.vmm }}
|
||||
KUBERNETES: ${{ matrix.k8s }}
|
||||
KATA_HYPERVISOR: ${{ matrix.environment.vmm }}
|
||||
KUBERNETES: kubeadm
|
||||
K8S_TEST_HOST_TYPE: all
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
@@ -66,20 +66,20 @@ jobs:
|
||||
- name: Install `bats`
|
||||
run: bash tests/integration/kubernetes/gha-run.sh install-bats
|
||||
|
||||
- name: Run tests
|
||||
- name: Run tests ${{ matrix.environment.vmm }}
|
||||
timeout-minutes: 30
|
||||
run: bash tests/integration/kubernetes/gha-run.sh run-nv-tests
|
||||
env:
|
||||
NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
|
||||
- name: Collect artifacts ${{ matrix.vmm }}
|
||||
- name: Collect artifacts ${{ matrix.environment.vmm }}
|
||||
if: always()
|
||||
run: bash tests/integration/kubernetes/gha-run.sh collect-artifacts
|
||||
continue-on-error: true
|
||||
|
||||
- name: Archive artifacts ${{ matrix.vmm }}
|
||||
- name: Archive artifacts ${{ matrix.environment.vmm }}
|
||||
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
||||
with:
|
||||
name: k8s-tests-${{ matrix.vmm }}-${{ matrix.k8s }}-${{ inputs.tag }}
|
||||
name: k8s-tests-${{ matrix.environment.vmm }}-kubeadm-${{ inputs.tag }}
|
||||
path: /tmp/artifacts
|
||||
retention-days: 1
|
||||
|
||||
|
||||
@@ -16,6 +16,8 @@ POD_NAME_CUDA="cuda-vectoradd-kata"
|
||||
export POD_NAME_CUDA
|
||||
|
||||
setup() {
|
||||
[ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
|
||||
|
||||
setup_common
|
||||
get_pod_config_dir
|
||||
|
||||
@@ -29,6 +31,8 @@ setup() {
|
||||
}
|
||||
|
||||
@test "CUDA Vector Addition Test" {
|
||||
[ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
|
||||
|
||||
# Create the CUDA pod
|
||||
kubectl apply -f "${pod_yaml}"
|
||||
|
||||
@@ -44,6 +48,8 @@ setup() {
|
||||
}
|
||||
|
||||
teardown() {
|
||||
[ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
|
||||
|
||||
# Debugging information
|
||||
echo "=== CUDA vectoradd Pod Logs ==="
|
||||
kubectl logs "${pod_name}" || true
|
||||
|
||||
@@ -79,6 +79,8 @@ create_embedqa_pod() {
|
||||
}
|
||||
|
||||
setup_file() {
|
||||
[ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
|
||||
|
||||
setup_common
|
||||
|
||||
dpkg -s jq >/dev/null 2>&1 || sudo apt -y install jq
|
||||
@@ -102,6 +104,8 @@ setup_file() {
|
||||
}
|
||||
|
||||
@test "List of models available for inference" {
|
||||
[ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
|
||||
|
||||
# shellcheck disable=SC1091 # File is created by previous test
|
||||
source "${BATS_SUITE_TMPDIR}/env"
|
||||
# shellcheck disable=SC2031 # Variable is shared via file between BATS tests
|
||||
@@ -122,6 +126,8 @@ setup_file() {
|
||||
}
|
||||
|
||||
@test "Simple OpenAI completion request" {
|
||||
[ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
|
||||
|
||||
# shellcheck disable=SC1091 # File is created by previous test
|
||||
source "${BATS_SUITE_TMPDIR}/env"
|
||||
# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
|
||||
@@ -147,6 +153,8 @@ setup_file() {
|
||||
|
||||
|
||||
@test "LangChain NVIDIA AI Endpoints" {
|
||||
[ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
|
||||
|
||||
# shellcheck disable=SC1091 # File is created by previous test
|
||||
source "${BATS_SUITE_TMPDIR}/env"
|
||||
# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
|
||||
@@ -179,6 +187,7 @@ EOF
|
||||
}
|
||||
|
||||
@test "Kata Documentation RAG" {
|
||||
[ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
|
||||
[ "${SKIP_MULTI_GPU_TESTS}" = "true" ] && skip "indicated to skip tests requiring multiple GPUs"
|
||||
|
||||
# shellcheck disable=SC1091 # File is created by previous test
|
||||
@@ -340,6 +349,8 @@ EOF
|
||||
}
|
||||
|
||||
teardown_file() {
|
||||
[ "${KATA_HYPERVISOR}" = "qemu-nvidia-gpu-snp" ] && skip "The CC version of the test is under development"
|
||||
|
||||
# Debugging information
|
||||
echo "=== Instruct Pod Logs ==="
|
||||
kubectl logs "${POD_NAME_INSTRUCT}" || true
|
||||
|
||||
Reference in New Issue
Block a user