Merge pull request #11236 from kata-containers/amd64-nvidia-gpu-cicd

gpu: AMD64 NVIDIA GPU CI/CD
This commit is contained in:
Fabiano Fidêncio
2025-07-31 14:52:01 +02:00
committed by GitHub
17 changed files with 465 additions and 16 deletions

View File

@@ -23,3 +23,4 @@ self-hosted-runner:
- s390x
- s390x-large
- tdx
- amd64-nvidia-a100

View File

@@ -23,6 +23,8 @@ on:
secrets:
QUAY_DEPLOYER_PASSWORD:
required: false
KBUILD_SIGN_PIN:
required: true
permissions:
contents: read
@@ -108,6 +110,7 @@ jobs:
ARTEFACT_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
TARGET_BRANCH: ${{ inputs.target-branch }}
RELEASE: ${{ inputs.stage == 'release' && 'yes' || 'no' }}
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
- name: Parse OCI image name and digest
id: parse-oci-segments
@@ -215,6 +218,7 @@ jobs:
ARTEFACT_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
TARGET_BRANCH: ${{ inputs.target-branch }}
RELEASE: ${{ inputs.stage == 'release' && 'yes' || 'no' }}
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
- name: store-artifact ${{ matrix.asset }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2

View File

@@ -31,3 +31,4 @@ jobs:
AZ_TENANT_ID: ${{ secrets.AZ_TENANT_ID }}
AZ_SUBSCRIPTION_ID: ${{ secrets.AZ_SUBSCRIPTION_ID }}
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}

View File

@@ -27,6 +27,8 @@ jobs:
CI_HKD_PATH: ${{ secrets.CI_HKD_PATH }}
ITA_KEY: ${{ secrets.ITA_KEY }}
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
build-checks:
uses: ./.github/workflows/build-checks.yaml

View File

@@ -31,3 +31,5 @@ jobs:
CI_HKD_PATH: ${{ secrets.CI_HKD_PATH }}
ITA_KEY: ${{ secrets.ITA_KEY }}
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}

View File

@@ -52,3 +52,5 @@ jobs:
CI_HKD_PATH: ${{ secrets.CI_HKD_PATH }}
ITA_KEY: ${{ secrets.ITA_KEY }}
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}

View File

@@ -27,6 +27,8 @@ on:
required: true
QUAY_DEPLOYER_PASSWORD:
required: true
KBUILD_SIGN_PIN:
required: true
permissions:
contents: read
@@ -43,6 +45,8 @@ jobs:
tarball-suffix: -${{ inputs.tag }}
commit-hash: ${{ inputs.commit-hash }}
target-branch: ${{ inputs.target-branch }}
secrets:
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
publish-kata-deploy-payload-amd64:
needs: build-kata-static-tarball-amd64

View File

@@ -35,6 +35,10 @@ on:
required: true
QUAY_DEPLOYER_PASSWORD:
required: true
NGC_API_KEY:
required: true
KBUILD_SIGN_PIN:
required: true
permissions:
contents: read
@@ -52,6 +56,8 @@ jobs:
tarball-suffix: -${{ inputs.tag }}
commit-hash: ${{ inputs.commit-hash }}
target-branch: ${{ inputs.target-branch }}
secrets:
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
publish-kata-deploy-payload-amd64:
needs: build-kata-static-tarball-amd64
@@ -323,6 +329,21 @@ jobs:
pr-number: ${{ inputs.pr-number }}
target-branch: ${{ inputs.target-branch }}
run-k8s-tests-on-nvidia-gpu:
if: ${{ inputs.skip-test != 'yes' }}
needs: publish-kata-deploy-payload-amd64
uses: ./.github/workflows/run-k8s-tests-on-nvidia-gpu.yaml
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-amd64
commit-hash: ${{ inputs.commit-hash }}
pr-number: ${{ inputs.pr-number }}
target-branch: ${{ inputs.target-branch }}
secrets:
NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
run-kata-coco-tests:
if: ${{ inputs.skip-test != 'yes' }}
needs:
@@ -383,20 +404,6 @@ jobs:
pr-number: ${{ inputs.pr-number }}
target-branch: ${{ inputs.target-branch }}
run-metrics-tests:
# Skip metrics tests whilst runner is broken
if: false
# if: ${{ inputs.skip-test != 'yes' }}
needs: build-kata-static-tarball-amd64
uses: ./.github/workflows/run-metrics.yaml
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-amd64
commit-hash: ${{ inputs.commit-hash }}
pr-number: ${{ inputs.pr-number }}
target-branch: ${{ inputs.target-branch }}
run-basic-amd64-tests:
if: ${{ inputs.skip-test != 'yes' }}
needs: build-kata-static-tarball-amd64

View File

@@ -25,6 +25,7 @@ jobs:
target-branch: ${{ github.ref_name }}
secrets:
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
build-assets-arm64:
permissions:

View File

@@ -8,6 +8,8 @@ on:
secrets:
QUAY_DEPLOYER_PASSWORD:
required: true
KBUILD_SIGN_PIN:
required: true
permissions:
contents: read
@@ -20,6 +22,7 @@ jobs:
stage: release
secrets:
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
permissions:
contents: read
packages: write

View File

@@ -35,6 +35,7 @@ jobs:
target-arch: amd64
secrets:
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
build-and-push-assets-arm64:
needs: release

View File

@@ -0,0 +1,89 @@
# Reusable workflow: deploys a kata-deploy CI payload onto an amd64 host
# with an NVIDIA A100 GPU and runs the NVIDIA-specific kubernetes tests.
# NOTE: the name previously said "arm64", but this workflow is amd64-only
# (runner `amd64-nvidia-a100`, job `run-nvidia-gpu-tests-on-amd64`).
name: CI | Run NVIDIA GPU kubernetes tests on amd64

on:
  workflow_call:
    inputs:
      registry:
        required: true
        type: string
      repo:
        required: true
        type: string
      tag:
        required: true
        type: string
      pr-number:
        required: true
        type: string
      commit-hash:
        required: false
        type: string
      target-branch:
        required: false
        type: string
        default: ""
    secrets:
      # NGC API key used by the tests to pull NIM images from nvcr.io.
      NGC_API_KEY:
        required: true

permissions: {}

jobs:
  run-nvidia-gpu-tests-on-amd64:
    strategy:
      fail-fast: false
      matrix:
        vmm:
          - qemu-nvidia-gpu
        k8s:
          - kubeadm
    runs-on: amd64-nvidia-a100
    env:
      DOCKER_REGISTRY: ${{ inputs.registry }}
      DOCKER_REPO: ${{ inputs.repo }}
      DOCKER_TAG: ${{ inputs.tag }}
      GH_PR_NUMBER: ${{ inputs.pr-number }}
      KATA_HYPERVISOR: ${{ matrix.vmm }}
      KUBERNETES: ${{ matrix.k8s }}
      USING_NFD: "false"
      K8S_TEST_HOST_TYPE: all
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: ${{ inputs.commit-hash }}
          fetch-depth: 0
          persist-credentials: false

      - name: Rebase atop of the latest target branch
        run: |
          ./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
        env:
          TARGET_BRANCH: ${{ inputs.target-branch }}

      - name: Deploy Kata
        timeout-minutes: 10
        run: bash tests/integration/kubernetes/gha-run.sh deploy-kata

      - name: Install `bats`
        run: bash tests/integration/kubernetes/gha-run.sh install-bats

      - name: Run tests
        timeout-minutes: 30
        run: bash tests/integration/kubernetes/gha-run.sh run-nv-tests
        env:
          NGC_API_KEY: ${{ secrets.NGC_API_KEY }}

      - name: Collect artifacts ${{ matrix.vmm }}
        if: always()
        run: bash tests/integration/kubernetes/gha-run.sh collect-artifacts
        continue-on-error: true

      - name: Archive artifacts ${{ matrix.vmm }}
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: k8s-tests-${{ matrix.vmm }}-${{ matrix.k8s }}-${{ inputs.tag }}
          path: /tmp/artifacts
          retention-days: 1

      - name: Delete kata-deploy
        if: always()
        timeout-minutes: 5
        run: bash tests/integration/kubernetes/gha-run.sh cleanup

View File

@@ -289,7 +289,7 @@ function run_tests() {
if [[ "${KATA_HYPERVISOR}" = "dragonball" ]] && [[ "${SNAPSHOTTER}" = "devmapper" ]]; then
echo "Skipping tests for ${KATA_HYPERVISOR} using devmapper"
else
bash run_kubernetes_tests.sh
bash "${K8STESTS}"
fi
popd
}
@@ -589,7 +589,14 @@ function main() {
deploy-kata-zvsi) deploy_kata "zvsi" ;;
deploy-snapshotter) deploy_snapshotter ;;
report-tests) report_tests ;;
run-tests) run_tests ;;
run-tests)
K8STESTS=run_kubernetes_tests.sh
run_tests
;;
run-nv-tests)
K8STESTS=run_kubernetes_nv_tests.sh
run_tests
;;
run-tests-kcli) run_tests "kcli" ;;
collect-artifacts) collect_artifacts ;;
cleanup) cleanup ;;

View File

@@ -0,0 +1,99 @@
#!/usr/bin/env bats
#
# Copyright (c) 2025 NVIDIA Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# End-to-end checks for an NVIDIA NIM inference microservice
# (Llama 3.1-8b Instruct) running under Kata with GPU passthrough.
# Each BATS @test runs in its own subprocess, so state produced by one
# test (pod IP, model name) is handed to later tests through a file in
# ${BATS_SUITE_TMPDIR}.
# shellcheck disable=SC2154 # BATS variables are not assigned in this file
load "${BATS_TEST_DIRNAME}/../../common.bash"
# shellcheck disable=SC1091
load "${BATS_TEST_DIRNAME}/tests_common.sh"
# Pod/secret names; exported so envsubst can substitute them into the
# pod YAML template rendered in setup_file().
export POD_NAME_INSTRUCT="nvidia-nim-llama-3-1-8b-instruct"
export POD_NAME_EMBEDQA="nvidia-nim-llama-3-2-nv-embedqa-1b-v2"
export POD_SECRET_INSTRUCT="ngc-secret-instruct"
# Base64-encoded .dockerconfigjson granting image-pull access to nvcr.io.
# Requires NGC_API_KEY in the environment (supplied by the CI secret);
# the inner base64 encodes the "$oauthtoken:<key>" auth pair.
DOCKER_CONFIG_JSON=$(
echo -n "{\"auths\":{\"nvcr.io\":{\"username\":\"\$oauthtoken\",\"password\":\"${NGC_API_KEY}\",\"auth\":\"$(echo -n "\$oauthtoken:${NGC_API_KEY}" | base64 -w0)\"}}}" |
base64 -w0
)
export DOCKER_CONFIG_JSON
# One-time suite setup: ensure jq is installed, initialize pyenv and a
# venv (NOTE(review): the venv is created but not activated here —
# presumably used by tooling elsewhere; confirm), then render the
# instruct pod YAML from its .in template via envsubst.
setup_file() {
dpkg -s jq >/dev/null 2>&1 || sudo apt -y install jq
export PYENV_ROOT="${HOME}/.pyenv"
[[ -d ${PYENV_ROOT}/bin ]] && export PATH="${PYENV_ROOT}/bin:${PATH}"
eval "$(pyenv init - bash)"
python3 -m venv "${HOME}"/.cicd/venv
get_pod_config_dir
pod_instruct_yaml_in="${pod_config_dir}/${POD_NAME_INSTRUCT}.yaml.in"
pod_instruct_yaml="${pod_config_dir}/${POD_NAME_INSTRUCT}.yaml"
envsubst <"${pod_instruct_yaml_in}" >"${pod_instruct_yaml}"
export POD_INSTRUCT_YAML="${pod_instruct_yaml}"
}
# Deploys the NIM pod, waits for readiness, and records its IP for the
# follow-up tests.
@test "NVIDIA NIM Llama 3.1-8b Instruct" {
kubectl apply -f "${POD_INSTRUCT_YAML}"
kubectl wait --for=condition=Ready --timeout=500s pod "${POD_NAME_INSTRUCT}"
# shellcheck disable=SC2030 # Variable is shared via file between BATS tests
POD_IP_INSTRUCT=$(kubectl get pod "${POD_NAME_INSTRUCT}" -o jsonpath='{.status.podIP}')
[[ -n "${POD_IP_INSTRUCT}" ]]
echo "POD_IP_INSTRUCT=${POD_IP_INSTRUCT}" >"${BATS_SUITE_TMPDIR}/env"
echo "# POD_IP_INSTRUCT=${POD_IP_INSTRUCT}" >&3
}
# Queries the OpenAI-compatible /v1/models endpoint and records the
# first model id for the completion test below.
@test "List of models available for inference" {
# shellcheck disable=SC1091 # File is created by previous test
source "${BATS_SUITE_TMPDIR}/env"
# shellcheck disable=SC2031 # Variable is shared via file between BATS tests
[[ -n "${POD_IP_INSTRUCT}" ]]
# shellcheck disable=SC2031 # Variable is shared via file between BATS tests
run curl -sX GET "http://${POD_IP_INSTRUCT}:8000/v1/models"
[[ "${status}" -eq 0 ]]
# shellcheck disable=SC2030 # Variable is shared via file between BATS tests
MODEL_NAME=$(echo "${output}" | jq '.data[0].id' | tr -d '"')
export MODEL_NAME
[[ -n "${MODEL_NAME}" ]]
echo "MODEL_NAME=${MODEL_NAME}" >>"${BATS_SUITE_TMPDIR}/env"
echo "# MODEL_NAME=${MODEL_NAME}" >&3
}
# Sends a minimal completion request and checks a non-empty answer is
# returned (content is not asserted, only presence).
@test "Simple OpenAI completion request" {
# shellcheck disable=SC1091 # File is created by previous test
source "${BATS_SUITE_TMPDIR}/env"
# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
[[ -n "${POD_IP_INSTRUCT}" ]]
# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
[[ -n "${MODEL_NAME}" ]]
QUESTION="What are Kata Containers?"
# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
run curl -sX 'POST' \
"http://${POD_IP_INSTRUCT}:8000/v1/completions" \
-H "accept: application/json" \
-H "Content-Type: application/json" \
-d "{\"model\": \"${MODEL_NAME}\", \"prompt\": \"${QUESTION}\", \"max_tokens\": 64}"
ANSWER=$(echo "${output}" | jq '.choices[0].text')
[[ -n "${ANSWER}" ]]
echo "# QUESTION: ${QUESTION}" >&3
echo "# ANSWER: ${ANSWER}" >&3
}
# Suite teardown: remove the pod (and secret) created from the rendered YAML.
teardown_file() {
kubectl delete -f "${POD_INSTRUCT_YAML}"
}

View File

@@ -0,0 +1,42 @@
#!/bin/bash
#
# Copyright (c) 2025 NVIDIA Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# Runs the NVIDIA-specific kubernetes BATS suites and reports an
# aggregate pass/fail result.

set -e

script_dir=$(dirname "$(readlink -f "$0")")
# shellcheck disable=SC1091 # import based on variable
source "${script_dir}/../../common.bash"

# Nothing to tear down yet; keep the trap as a hook for later additions.
cleanup() {
	true
}
trap cleanup EXIT

# Setting to "yes" enables fail fast, stopping execution at the first failed test.
K8S_TEST_FAIL_FAST="${K8S_TEST_FAIL_FAST:-no}"

# NVIDIA test suites to execute, in order.
nv_test_suites=("k8s-nvidia-nim.bats")

ensure_yq

info "Running tests with bats version: $(bats --version)"

failed_suites=()
for suite in "${nv_test_suites[@]}"; do
	# Strip any stray whitespace/control characters from the entry.
	suite=$(printf '%s' "${suite}" | tr -d '[:space:][:cntrl:]')
	# Log cluster state before each suite to aid debugging.
	info "$(kubectl get pods --all-namespaces 2>&1)"
	info "Executing ${suite}"
	if ! bats --show-output-of-passing-tests "${suite}"; then
		failed_suites+=("${suite}")
		if [[ "${K8S_TEST_FAIL_FAST}" = "yes" ]]; then
			break
		fi
	fi
done

if [[ ${#failed_suites[@]} -ne 0 ]]; then
	die "Tests FAILED from suites: ${failed_suites[*]}"
fi
info "All tests SUCCEEDED"

View File

@@ -0,0 +1,89 @@
# Copyright (c) 2025 NVIDIA Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# Template (rendered via envsubst): image-pull Secret for nvcr.io plus a
# Pod running the NVIDIA NIM Llama 3.1-8b Instruct microservice under the
# kata-qemu-nvidia-gpu runtime class.
---
apiVersion: v1
kind: Secret
metadata:
  name: ngc-secret-instruct
type: kubernetes.io/dockerconfigjson
data:
  .dockerconfigjson: ${DOCKER_CONFIG_JSON}
---
apiVersion: v1
kind: Pod
metadata:
  name: ${POD_NAME_INSTRUCT}
  labels:
    app: ${POD_NAME_INSTRUCT}
spec:
  restartPolicy: Never
  runtimeClassName: kata-qemu-nvidia-gpu
  imagePullSecrets:
    - name: ngc-secret-instruct
  securityContext:
    runAsUser: 0
    runAsGroup: 0
    fsGroup: 0
  containers:
    - name: ${POD_NAME_INSTRUCT}
      image: nvcr.io/nim/meta/llama3-8b-instruct:1.0.0
      # Ports exposed by the container:
      ports:
        - containerPort: 8000
          name: http-openai
      livenessProbe:
        httpGet:
          path: /v1/health/live
          port: http-openai
        initialDelaySeconds: 15
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 3
      readinessProbe:
        httpGet:
          path: /v1/health/ready
          port: http-openai
        initialDelaySeconds: 15
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 3
      # Generous startup window: model download/load can take many minutes
      # (failureThreshold 180 * periodSeconds 10 = 30 min ceiling).
      startupProbe:
        httpGet:
          path: /v1/health/ready
          port: http-openai
        initialDelaySeconds: 40
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 180
      # Environment variable for NGC_API_KEY. In production, use a Secret.
      env:
        - name: NGC_API_KEY
          value: "${NGC_API_KEY}"
      # GPU resource request/limit (for NVIDIA GPU)
      resources:
        requests:
          cpu: "16"
          memory: "32Gi"
        limits:
          nvidia.com/pgpu: "1"
          cpu: "16"
          memory: "32Gi"
      # Mount the local .cache directory into the container
      volumeMounts:
        - name: nim-cache
          mountPath: /opt/nim/.cache
  # Host path volume for the local .cache directory.
  # Adjust 'path' to match your $LOCAL_NIM_CACHE location.
  volumes:
    - name: nim-cache
      hostPath:
        # Fixed typo: was "/opr/nim/.cache"; must match the container
        # mountPath /opt/nim/.cache above, or the model cache is not reused.
        path: "/opt/nim/.cache"
        type: DirectoryOrCreate

View File

@@ -0,0 +1,95 @@
# Copyright (c) 2025 NVIDIA Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# Template (rendered via envsubst): image-pull Secret plus a Pod running
# the NVIDIA NIM llama-3.2-nv-embedqa-1b-v2 embedding microservice in the
# nim-embedqa namespace, under the runtime class given by
# ${RUNTIME_CLASS_NAME}.
---
apiVersion: v1
kind: Secret
metadata:
  name: ngc-secret-embedqa
  namespace: nim-embedqa
type: kubernetes.io/dockerconfigjson
data:
  .dockerconfigjson: ${DOCKER_CONFIG_JSON}
---
apiVersion: v1
kind: Pod
metadata:
  name: nvidia-nim-llama-3-2-nv-embedqa-1b-v2
  namespace: nim-embedqa
  labels:
    app: nvidia-nim-llama-3-2-nv-embedqa-1b-v2
spec:
  restartPolicy: Always
  runtimeClassName: "${RUNTIME_CLASS_NAME}"
  serviceAccountName: default
  imagePullSecrets:
    - name: ngc-secret-embedqa
  securityContext:
    fsGroup: 0
    runAsGroup: 0
    runAsUser: 0
  containers:
    - name: nvidia-nim-llama-3-2-nv-embedqa-1b-v2
      image: nvcr.io/nim/nvidia/llama-3.2-nv-embedqa-1b-v2:1.5.0
      imagePullPolicy: IfNotPresent
      # NGC_API_KEY comes from the environment at render time; in
      # production, source it from a Secret instead.
      env:
        - name: NIM_CACHE_PATH
          value: "/opt/nim/.cache"
        - name: NGC_API_KEY
          value: "${NGC_API_KEY}"
        - name: NIM_HTTP_API_PORT
          value: "8000"
        - name: NIM_JSONL_LOGGING
          value: "1"
        - name: NIM_LOG_LEVEL
          value: "INFO"
      ports:
        - containerPort: 8000
          name: http
      livenessProbe:
        httpGet:
          path: /v1/health/live
          port: 8000
        initialDelaySeconds: 15
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 3
      readinessProbe:
        httpGet:
          path: /v1/health/ready
          port: 8000
        initialDelaySeconds: 15
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 3
      # Generous startup window: model download/load can take many minutes
      # (failureThreshold 180 * periodSeconds 10 = 30 min ceiling).
      startupProbe:
        httpGet:
          path: /v1/health/ready
          port: 8000
        initialDelaySeconds: 40
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 180
      resources:
        limits:
          # Quoted for consistency with the sibling instruct manifest.
          nvidia.com/pgpu: "1"
          cpu: "16"
          memory: "32Gi"
      volumeMounts:
        - name: nim-cache
          mountPath: /opt/nim/.cache
  volumes:
    - name: nim-cache
      hostPath:
        # Fixed typo: was "/opr/nim/.cache"; must match NIM_CACHE_PATH and
        # the container mountPath /opt/nim/.cache above.
        path: "/opt/nim/.cache"
        type: DirectoryOrCreate