mirror of
https://github.com/kata-containers/kata-containers.git
synced 2025-09-05 02:40:18 +00:00
Merge pull request #11236 from kata-containers/amd64-nvidia-gpu-cicd
gpu: AMD64 NVIDIA GPU CI/CD
This commit is contained in:
1
.github/actionlint.yaml
vendored
1
.github/actionlint.yaml
vendored
@@ -23,3 +23,4 @@ self-hosted-runner:
|
|||||||
- s390x
|
- s390x
|
||||||
- s390x-large
|
- s390x-large
|
||||||
- tdx
|
- tdx
|
||||||
|
- amd64-nvidia-a100
|
||||||
|
@@ -23,6 +23,8 @@ on:
|
|||||||
secrets:
|
secrets:
|
||||||
QUAY_DEPLOYER_PASSWORD:
|
QUAY_DEPLOYER_PASSWORD:
|
||||||
required: false
|
required: false
|
||||||
|
KBUILD_SIGN_PIN:
|
||||||
|
required: true
|
||||||
|
|
||||||
permissions:
|
permissions:
|
||||||
contents: read
|
contents: read
|
||||||
@@ -108,6 +110,7 @@ jobs:
|
|||||||
ARTEFACT_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
|
ARTEFACT_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
|
||||||
TARGET_BRANCH: ${{ inputs.target-branch }}
|
TARGET_BRANCH: ${{ inputs.target-branch }}
|
||||||
RELEASE: ${{ inputs.stage == 'release' && 'yes' || 'no' }}
|
RELEASE: ${{ inputs.stage == 'release' && 'yes' || 'no' }}
|
||||||
|
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
|
||||||
|
|
||||||
- name: Parse OCI image name and digest
|
- name: Parse OCI image name and digest
|
||||||
id: parse-oci-segments
|
id: parse-oci-segments
|
||||||
@@ -215,6 +218,7 @@ jobs:
|
|||||||
ARTEFACT_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
|
ARTEFACT_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
|
||||||
TARGET_BRANCH: ${{ inputs.target-branch }}
|
TARGET_BRANCH: ${{ inputs.target-branch }}
|
||||||
RELEASE: ${{ inputs.stage == 'release' && 'yes' || 'no' }}
|
RELEASE: ${{ inputs.stage == 'release' && 'yes' || 'no' }}
|
||||||
|
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
|
||||||
|
|
||||||
- name: store-artifact ${{ matrix.asset }}
|
- name: store-artifact ${{ matrix.asset }}
|
||||||
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
||||||
|
1
.github/workflows/ci-coco-stability.yaml
vendored
1
.github/workflows/ci-coco-stability.yaml
vendored
@@ -31,3 +31,4 @@ jobs:
|
|||||||
AZ_TENANT_ID: ${{ secrets.AZ_TENANT_ID }}
|
AZ_TENANT_ID: ${{ secrets.AZ_TENANT_ID }}
|
||||||
AZ_SUBSCRIPTION_ID: ${{ secrets.AZ_SUBSCRIPTION_ID }}
|
AZ_SUBSCRIPTION_ID: ${{ secrets.AZ_SUBSCRIPTION_ID }}
|
||||||
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
|
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
|
||||||
|
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
|
||||||
|
2
.github/workflows/ci-devel.yaml
vendored
2
.github/workflows/ci-devel.yaml
vendored
@@ -27,6 +27,8 @@ jobs:
|
|||||||
CI_HKD_PATH: ${{ secrets.CI_HKD_PATH }}
|
CI_HKD_PATH: ${{ secrets.CI_HKD_PATH }}
|
||||||
ITA_KEY: ${{ secrets.ITA_KEY }}
|
ITA_KEY: ${{ secrets.ITA_KEY }}
|
||||||
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
|
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
|
||||||
|
NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
|
||||||
|
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
|
||||||
|
|
||||||
build-checks:
|
build-checks:
|
||||||
uses: ./.github/workflows/build-checks.yaml
|
uses: ./.github/workflows/build-checks.yaml
|
||||||
|
2
.github/workflows/ci-nightly.yaml
vendored
2
.github/workflows/ci-nightly.yaml
vendored
@@ -31,3 +31,5 @@ jobs:
|
|||||||
CI_HKD_PATH: ${{ secrets.CI_HKD_PATH }}
|
CI_HKD_PATH: ${{ secrets.CI_HKD_PATH }}
|
||||||
ITA_KEY: ${{ secrets.ITA_KEY }}
|
ITA_KEY: ${{ secrets.ITA_KEY }}
|
||||||
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
|
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
|
||||||
|
NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
|
||||||
|
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
|
||||||
|
2
.github/workflows/ci-on-push.yaml
vendored
2
.github/workflows/ci-on-push.yaml
vendored
@@ -52,3 +52,5 @@ jobs:
|
|||||||
CI_HKD_PATH: ${{ secrets.CI_HKD_PATH }}
|
CI_HKD_PATH: ${{ secrets.CI_HKD_PATH }}
|
||||||
ITA_KEY: ${{ secrets.ITA_KEY }}
|
ITA_KEY: ${{ secrets.ITA_KEY }}
|
||||||
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
|
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
|
||||||
|
NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
|
||||||
|
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
|
||||||
|
4
.github/workflows/ci-weekly.yaml
vendored
4
.github/workflows/ci-weekly.yaml
vendored
@@ -27,6 +27,8 @@ on:
|
|||||||
required: true
|
required: true
|
||||||
QUAY_DEPLOYER_PASSWORD:
|
QUAY_DEPLOYER_PASSWORD:
|
||||||
required: true
|
required: true
|
||||||
|
KBUILD_SIGN_PIN:
|
||||||
|
required: true
|
||||||
|
|
||||||
permissions:
|
permissions:
|
||||||
contents: read
|
contents: read
|
||||||
@@ -43,6 +45,8 @@ jobs:
|
|||||||
tarball-suffix: -${{ inputs.tag }}
|
tarball-suffix: -${{ inputs.tag }}
|
||||||
commit-hash: ${{ inputs.commit-hash }}
|
commit-hash: ${{ inputs.commit-hash }}
|
||||||
target-branch: ${{ inputs.target-branch }}
|
target-branch: ${{ inputs.target-branch }}
|
||||||
|
secrets:
|
||||||
|
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
|
||||||
|
|
||||||
publish-kata-deploy-payload-amd64:
|
publish-kata-deploy-payload-amd64:
|
||||||
needs: build-kata-static-tarball-amd64
|
needs: build-kata-static-tarball-amd64
|
||||||
|
35
.github/workflows/ci.yaml
vendored
35
.github/workflows/ci.yaml
vendored
@@ -35,6 +35,10 @@ on:
|
|||||||
required: true
|
required: true
|
||||||
QUAY_DEPLOYER_PASSWORD:
|
QUAY_DEPLOYER_PASSWORD:
|
||||||
required: true
|
required: true
|
||||||
|
NGC_API_KEY:
|
||||||
|
required: true
|
||||||
|
KBUILD_SIGN_PIN:
|
||||||
|
required: true
|
||||||
|
|
||||||
permissions:
|
permissions:
|
||||||
contents: read
|
contents: read
|
||||||
@@ -52,6 +56,8 @@ jobs:
|
|||||||
tarball-suffix: -${{ inputs.tag }}
|
tarball-suffix: -${{ inputs.tag }}
|
||||||
commit-hash: ${{ inputs.commit-hash }}
|
commit-hash: ${{ inputs.commit-hash }}
|
||||||
target-branch: ${{ inputs.target-branch }}
|
target-branch: ${{ inputs.target-branch }}
|
||||||
|
secrets:
|
||||||
|
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
|
||||||
|
|
||||||
publish-kata-deploy-payload-amd64:
|
publish-kata-deploy-payload-amd64:
|
||||||
needs: build-kata-static-tarball-amd64
|
needs: build-kata-static-tarball-amd64
|
||||||
@@ -323,6 +329,21 @@ jobs:
|
|||||||
pr-number: ${{ inputs.pr-number }}
|
pr-number: ${{ inputs.pr-number }}
|
||||||
target-branch: ${{ inputs.target-branch }}
|
target-branch: ${{ inputs.target-branch }}
|
||||||
|
|
||||||
|
run-k8s-tests-on-nvidia-gpu:
|
||||||
|
if: ${{ inputs.skip-test != 'yes' }}
|
||||||
|
needs: publish-kata-deploy-payload-amd64
|
||||||
|
uses: ./.github/workflows/run-k8s-tests-on-nvidia-gpu.yaml
|
||||||
|
with:
|
||||||
|
registry: ghcr.io
|
||||||
|
repo: ${{ github.repository_owner }}/kata-deploy-ci
|
||||||
|
tag: ${{ inputs.tag }}-amd64
|
||||||
|
commit-hash: ${{ inputs.commit-hash }}
|
||||||
|
pr-number: ${{ inputs.pr-number }}
|
||||||
|
target-branch: ${{ inputs.target-branch }}
|
||||||
|
secrets:
|
||||||
|
NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
|
||||||
|
|
||||||
|
|
||||||
run-kata-coco-tests:
|
run-kata-coco-tests:
|
||||||
if: ${{ inputs.skip-test != 'yes' }}
|
if: ${{ inputs.skip-test != 'yes' }}
|
||||||
needs:
|
needs:
|
||||||
@@ -383,20 +404,6 @@ jobs:
|
|||||||
pr-number: ${{ inputs.pr-number }}
|
pr-number: ${{ inputs.pr-number }}
|
||||||
target-branch: ${{ inputs.target-branch }}
|
target-branch: ${{ inputs.target-branch }}
|
||||||
|
|
||||||
run-metrics-tests:
|
|
||||||
# Skip metrics tests whilst runner is broken
|
|
||||||
if: false
|
|
||||||
# if: ${{ inputs.skip-test != 'yes' }}
|
|
||||||
needs: build-kata-static-tarball-amd64
|
|
||||||
uses: ./.github/workflows/run-metrics.yaml
|
|
||||||
with:
|
|
||||||
registry: ghcr.io
|
|
||||||
repo: ${{ github.repository_owner }}/kata-deploy-ci
|
|
||||||
tag: ${{ inputs.tag }}-amd64
|
|
||||||
commit-hash: ${{ inputs.commit-hash }}
|
|
||||||
pr-number: ${{ inputs.pr-number }}
|
|
||||||
target-branch: ${{ inputs.target-branch }}
|
|
||||||
|
|
||||||
run-basic-amd64-tests:
|
run-basic-amd64-tests:
|
||||||
if: ${{ inputs.skip-test != 'yes' }}
|
if: ${{ inputs.skip-test != 'yes' }}
|
||||||
needs: build-kata-static-tarball-amd64
|
needs: build-kata-static-tarball-amd64
|
||||||
|
1
.github/workflows/payload-after-push.yaml
vendored
1
.github/workflows/payload-after-push.yaml
vendored
@@ -25,6 +25,7 @@ jobs:
|
|||||||
target-branch: ${{ github.ref_name }}
|
target-branch: ${{ github.ref_name }}
|
||||||
secrets:
|
secrets:
|
||||||
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
|
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
|
||||||
|
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
|
||||||
|
|
||||||
build-assets-arm64:
|
build-assets-arm64:
|
||||||
permissions:
|
permissions:
|
||||||
|
3
.github/workflows/release-amd64.yaml
vendored
3
.github/workflows/release-amd64.yaml
vendored
@@ -8,6 +8,8 @@ on:
|
|||||||
secrets:
|
secrets:
|
||||||
QUAY_DEPLOYER_PASSWORD:
|
QUAY_DEPLOYER_PASSWORD:
|
||||||
required: true
|
required: true
|
||||||
|
KBUILD_SIGN_PIN:
|
||||||
|
required: true
|
||||||
|
|
||||||
permissions:
|
permissions:
|
||||||
contents: read
|
contents: read
|
||||||
@@ -20,6 +22,7 @@ jobs:
|
|||||||
stage: release
|
stage: release
|
||||||
secrets:
|
secrets:
|
||||||
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
|
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
|
||||||
|
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
|
||||||
permissions:
|
permissions:
|
||||||
contents: read
|
contents: read
|
||||||
packages: write
|
packages: write
|
||||||
|
1
.github/workflows/release.yaml
vendored
1
.github/workflows/release.yaml
vendored
@@ -35,6 +35,7 @@ jobs:
|
|||||||
target-arch: amd64
|
target-arch: amd64
|
||||||
secrets:
|
secrets:
|
||||||
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
|
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
|
||||||
|
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
|
||||||
|
|
||||||
build-and-push-assets-arm64:
|
build-and-push-assets-arm64:
|
||||||
needs: release
|
needs: release
|
||||||
|
89
.github/workflows/run-k8s-tests-on-nvidia-gpu.yaml
vendored
Normal file
89
.github/workflows/run-k8s-tests-on-nvidia-gpu.yaml
vendored
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
# Reusable workflow (workflow_call) that deploys Kata from an already
# published kata-deploy image and runs the NVIDIA GPU Kubernetes tests on the
# self-hosted `amd64-nvidia-a100` runner.
name: CI | Run NVIDIA GPU kubernetes tests on amd64
on:
  workflow_call:
    inputs:
      # Location of the kata-deploy image under test.
      registry:
        required: true
        type: string
      repo:
        required: true
        type: string
      tag:
        required: true
        type: string
      pr-number:
        required: true
        type: string
      # Git ref checked out before the tests run.
      commit-hash:
        required: false
        type: string
      # Branch the checkout is rebased onto by tests/git-helper.sh.
      target-branch:
        required: false
        type: string
        default: ""
    secrets:
      # Passed to the "Run tests" step only.
      NGC_API_KEY:
        required: true

# The job needs no GITHUB_TOKEN scopes.
permissions: {}

jobs:
  run-nvidia-gpu-tests-on-amd64:
    strategy:
      fail-fast: false
      matrix:
        vmm:
          - qemu-nvidia-gpu
        k8s:
          - kubeadm
    runs-on: amd64-nvidia-a100
    env:
      DOCKER_REGISTRY: ${{ inputs.registry }}
      DOCKER_REPO: ${{ inputs.repo }}
      DOCKER_TAG: ${{ inputs.tag }}
      GH_PR_NUMBER: ${{ inputs.pr-number }}
      KATA_HYPERVISOR: ${{ matrix.vmm }}
      KUBERNETES: ${{ matrix.k8s }}
      USING_NFD: "false"
      K8S_TEST_HOST_TYPE: all
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: ${{ inputs.commit-hash }}
          fetch-depth: 0
          persist-credentials: false

      - name: Rebase atop of the latest target branch
        run: |
          ./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
        env:
          TARGET_BRANCH: ${{ inputs.target-branch }}

      - name: Deploy Kata
        timeout-minutes: 10
        run: bash tests/integration/kubernetes/gha-run.sh deploy-kata

      - name: Install `bats`
        run: bash tests/integration/kubernetes/gha-run.sh install-bats

      - name: Run tests
        timeout-minutes: 30
        run: bash tests/integration/kubernetes/gha-run.sh run-nv-tests
        env:
          NGC_API_KEY: ${{ secrets.NGC_API_KEY }}

      - name: Collect artifacts ${{ matrix.vmm }}
        if: always()
        run: bash tests/integration/kubernetes/gha-run.sh collect-artifacts
        continue-on-error: true

      - name: Archive artifacts ${{ matrix.vmm }}
        # Without this the step is skipped when "Run tests" fails, which is
        # exactly when the collected artifacts are needed.
        if: always()
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: k8s-tests-${{ matrix.vmm }}-${{ matrix.k8s }}-${{ inputs.tag }}
          path: /tmp/artifacts
          retention-days: 1

      - name: Delete kata-deploy
        if: always()
        timeout-minutes: 5
        run: bash tests/integration/kubernetes/gha-run.sh cleanup
|
@@ -289,7 +289,7 @@ function run_tests() {
|
|||||||
if [[ "${KATA_HYPERVISOR}" = "dragonball" ]] && [[ "${SNAPSHOTTER}" = "devmapper" ]]; then
|
if [[ "${KATA_HYPERVISOR}" = "dragonball" ]] && [[ "${SNAPSHOTTER}" = "devmapper" ]]; then
|
||||||
echo "Skipping tests for ${KATA_HYPERVISOR} using devmapper"
|
echo "Skipping tests for ${KATA_HYPERVISOR} using devmapper"
|
||||||
else
|
else
|
||||||
bash run_kubernetes_tests.sh
|
bash "${K8STESTS}"
|
||||||
fi
|
fi
|
||||||
popd
|
popd
|
||||||
}
|
}
|
||||||
@@ -589,7 +589,14 @@ function main() {
|
|||||||
deploy-kata-zvsi) deploy_kata "zvsi" ;;
|
deploy-kata-zvsi) deploy_kata "zvsi" ;;
|
||||||
deploy-snapshotter) deploy_snapshotter ;;
|
deploy-snapshotter) deploy_snapshotter ;;
|
||||||
report-tests) report_tests ;;
|
report-tests) report_tests ;;
|
||||||
run-tests) run_tests ;;
|
run-tests)
|
||||||
|
K8STESTS=run_kubernetes_tests.sh
|
||||||
|
run_tests
|
||||||
|
;;
|
||||||
|
run-nv-tests)
|
||||||
|
K8STESTS=run_kubernetes_nv_tests.sh
|
||||||
|
run_tests
|
||||||
|
;;
|
||||||
run-tests-kcli) run_tests "kcli" ;;
|
run-tests-kcli) run_tests "kcli" ;;
|
||||||
collect-artifacts) collect_artifacts ;;
|
collect-artifacts) collect_artifacts ;;
|
||||||
cleanup) cleanup ;;
|
cleanup) cleanup ;;
|
||||||
|
99
tests/integration/kubernetes/k8s-nvidia-nim.bats
Normal file
99
tests/integration/kubernetes/k8s-nvidia-nim.bats
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
#!/usr/bin/env bats
|
||||||
|
#
|
||||||
|
# Copyright (c) 2025 NVIDIA Corporation
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
#
|
||||||
|
|
||||||
|
# shellcheck disable=SC2154 # BATS variables are not assigned in this file
|
||||||
|
load "${BATS_TEST_DIRNAME}/../../common.bash"
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
load "${BATS_TEST_DIRNAME}/tests_common.sh"
|
||||||
|
|
||||||
|
# Pod and pull-secret names, exported so they are available to the tests and
# to envsubst when the pod manifest template is rendered.
POD_NAME_INSTRUCT="nvidia-nim-llama-3-1-8b-instruct"
POD_NAME_EMBEDQA="nvidia-nim-llama-3-2-nv-embedqa-1b-v2"
POD_SECRET_INSTRUCT="ngc-secret-instruct"
export POD_NAME_INSTRUCT POD_NAME_EMBEDQA POD_SECRET_INSTRUCT

# Base64-encoded docker config for nvcr.io. The user name is the literal
# string $oauthtoken and the password is the NGC API key; the inner "auth"
# field is base64 of "user:password".
# shellcheck disable=SC2016 # $oauthtoken is a literal user name, not a variable
DOCKER_CONFIG_JSON=$(
	printf '{"auths":{"nvcr.io":{"username":"$oauthtoken","password":"%s","auth":"%s"}}}' \
		"${NGC_API_KEY}" \
		"$(printf '%s' "\$oauthtoken:${NGC_API_KEY}" | base64 -w0)" |
		base64 -w0
)
export DOCKER_CONFIG_JSON
|
||||||
|
|
||||||
|
# One-time setup for this test file: make sure jq is installed, initialise
# pyenv and a Python venv, then render the instruct pod manifest from its
# ".yaml.in" template and export the rendered path as POD_INSTRUCT_YAML.
setup_file() {
	if ! dpkg -s jq >/dev/null 2>&1; then
		sudo apt -y install jq
	fi

	export PYENV_ROOT="${HOME}/.pyenv"
	if [[ -d ${PYENV_ROOT}/bin ]]; then
		export PATH="${PYENV_ROOT}/bin:${PATH}"
	fi
	eval "$(pyenv init - bash)"

	python3 -m venv "${HOME}"/.cicd/venv

	# Presumably sets pod_config_dir (helper comes from tests_common.sh).
	get_pod_config_dir

	local rendered_manifest="${pod_config_dir}/${POD_NAME_INSTRUCT}.yaml"

	# envsubst fills in the exported variables referenced by the template.
	envsubst <"${rendered_manifest}.in" >"${rendered_manifest}"

	export POD_INSTRUCT_YAML="${rendered_manifest}"
}
|
||||||
|
|
||||||
|
@test "NVIDIA NIM Llama 3.1-8b Instruct" {
|
||||||
|
kubectl apply -f "${POD_INSTRUCT_YAML}"
|
||||||
|
kubectl wait --for=condition=Ready --timeout=500s pod "${POD_NAME_INSTRUCT}"
|
||||||
|
# shellcheck disable=SC2030 # Variable is shared via file between BATS tests
|
||||||
|
POD_IP_INSTRUCT=$(kubectl get pod "${POD_NAME_INSTRUCT}" -o jsonpath='{.status.podIP}')
|
||||||
|
[[ -n "${POD_IP_INSTRUCT}" ]]
|
||||||
|
|
||||||
|
echo "POD_IP_INSTRUCT=${POD_IP_INSTRUCT}" >"${BATS_SUITE_TMPDIR}/env"
|
||||||
|
echo "# POD_IP_INSTRUCT=${POD_IP_INSTRUCT}" >&3
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "List of models available for inference" {
|
||||||
|
# shellcheck disable=SC1091 # File is created by previous test
|
||||||
|
source "${BATS_SUITE_TMPDIR}/env"
|
||||||
|
# shellcheck disable=SC2031 # Variable is shared via file between BATS tests
|
||||||
|
[[ -n "${POD_IP_INSTRUCT}" ]]
|
||||||
|
|
||||||
|
# shellcheck disable=SC2031 # Variable is shared via file between BATS tests
|
||||||
|
run curl -sX GET "http://${POD_IP_INSTRUCT}:8000/v1/models"
|
||||||
|
[[ "${status}" -eq 0 ]]
|
||||||
|
|
||||||
|
# shellcheck disable=SC2030 # Variable is shared via file between BATS tests
|
||||||
|
MODEL_NAME=$(echo "${output}" | jq '.data[0].id' | tr -d '"')
|
||||||
|
export MODEL_NAME
|
||||||
|
[[ -n "${MODEL_NAME}" ]]
|
||||||
|
echo "MODEL_NAME=${MODEL_NAME}" >>"${BATS_SUITE_TMPDIR}/env"
|
||||||
|
echo "# MODEL_NAME=${MODEL_NAME}" >&3
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "Simple OpenAI completion request" {
|
||||||
|
# shellcheck disable=SC1091 # File is created by previous test
|
||||||
|
source "${BATS_SUITE_TMPDIR}/env"
|
||||||
|
# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
|
||||||
|
[[ -n "${POD_IP_INSTRUCT}" ]]
|
||||||
|
# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
|
||||||
|
[[ -n "${MODEL_NAME}" ]]
|
||||||
|
|
||||||
|
QUESTION="What are Kata Containers?"
|
||||||
|
|
||||||
|
# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
|
||||||
|
run curl -sX 'POST' \
|
||||||
|
"http://${POD_IP_INSTRUCT}:8000/v1/completions" \
|
||||||
|
-H "accept: application/json" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d "{\"model\": \"${MODEL_NAME}\", \"prompt\": \"${QUESTION}\", \"max_tokens\": 64}"
|
||||||
|
|
||||||
|
ANSWER=$(echo "${output}" | jq '.choices[0].text')
|
||||||
|
[[ -n "${ANSWER}" ]]
|
||||||
|
|
||||||
|
echo "# QUESTION: ${QUESTION}" >&3
|
||||||
|
echo "# ANSWER: ${ANSWER}" >&3
|
||||||
|
}
|
||||||
|
|
||||||
|
# File-level teardown: deletes every resource defined in the rendered
# instruct manifest (POD_INSTRUCT_YAML is exported by setup_file).
teardown_file() {
	kubectl delete -f "${POD_INSTRUCT_YAML}"
}
|
42
tests/integration/kubernetes/run_kubernetes_nv_tests.sh
Normal file
42
tests/integration/kubernetes/run_kubernetes_nv_tests.sh
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# Copyright (c) 2025 NVIDIA Corporation
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
#
|
||||||
|
|
||||||
|
set -o errexit

# Resolve this script's directory so common.bash is found regardless of how
# the script was invoked.
kubernetes_dir="$(dirname "$(readlink -f "$0")")"
# shellcheck disable=SC1091 # import based on variable
source "${kubernetes_dir}/../../common.bash"

# EXIT hook; currently a no-op placeholder.
cleanup() {
	:
}

trap cleanup EXIT

# Setting to "yes" enables fail fast, stopping execution at the first failed test.
: "${K8S_TEST_FAIL_FAST:=no}"
# bats suites to run; paths are relative to the current working directory.
K8S_TEST_NV=("k8s-nvidia-nim.bats")

ensure_yq

info "Running tests with bats version: $(bats --version)"

failed_suites=()
for K8S_TEST_ENTRY in "${K8S_TEST_NV[@]}"; do
	# Strip any whitespace/control characters from the entry name.
	K8S_TEST_ENTRY=$(tr -d '[:space:][:cntrl:]' <<<"${K8S_TEST_ENTRY}")
	# Log the cluster state before each suite to help debugging.
	info "$(kubectl get pods --all-namespaces 2>&1)"
	info "Executing ${K8S_TEST_ENTRY}"

	bats --show-output-of-passing-tests "${K8S_TEST_ENTRY}" && continue

	failed_suites+=("${K8S_TEST_ENTRY}")
	if [[ "${K8S_TEST_FAIL_FAST}" = "yes" ]]; then
		break
	fi
done

if ((${#failed_suites[@]} != 0)); then
	die "Tests FAILED from suites: ${failed_suites[*]}"
fi

info "All tests SUCCEEDED"
|
@@ -0,0 +1,89 @@
|
|||||||
|
|
||||||
|
# Copyright (c) 2025 NVIDIA Corporation
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
#
|
||||||
|
---
# Image pull secret for nvcr.io. The placeholder is filled in by envsubst
# with a base64-encoded docker config before the manifest is applied.
apiVersion: v1
kind: Secret
metadata:
  name: ngc-secret-instruct
type: kubernetes.io/dockerconfigjson
data:
  .dockerconfigjson: ${DOCKER_CONFIG_JSON}
---
# NIM instruct pod, run with the kata-qemu-nvidia-gpu runtime class and one
# passthrough GPU.
apiVersion: v1
kind: Pod
metadata:
  name: ${POD_NAME_INSTRUCT}
  labels:
    app: ${POD_NAME_INSTRUCT}
spec:
  restartPolicy: Never
  runtimeClassName: kata-qemu-nvidia-gpu
  imagePullSecrets:
    - name: ngc-secret-instruct
  securityContext:
    runAsUser: 0
    runAsGroup: 0
    fsGroup: 0
  containers:
    - name: ${POD_NAME_INSTRUCT}
      image: nvcr.io/nim/meta/llama3-8b-instruct:1.0.0
      # Ports exposed by the container:
      ports:
        - containerPort: 8000
          name: http-openai
      livenessProbe:
        httpGet:
          path: /v1/health/live
          port: http-openai
        initialDelaySeconds: 15
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 3
      readinessProbe:
        httpGet:
          path: /v1/health/ready
          port: http-openai
        initialDelaySeconds: 15
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 3
      # Allows up to 40s + 180 * 10s for the service to report ready before
      # the container is considered failed.
      startupProbe:
        httpGet:
          path: /v1/health/ready
          port: http-openai
        initialDelaySeconds: 40
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 180
      # Environment variable for NGC_API_KEY. In production, use a Secret.
      env:
        - name: NGC_API_KEY
          value: "${NGC_API_KEY}"
      # GPU resource request/limit (for NVIDIA GPU)
      resources:
        requests:
          cpu: "16"
          memory: "32Gi"
        limits:
          nvidia.com/pgpu: "1"
          cpu: "16"
          memory: "32Gi"
      # Mount the local .cache directory into the container
      volumeMounts:
        - name: nim-cache
          mountPath: /opt/nim/.cache

  # Host path volume for the local .cache directory.
  # Adjust 'path' to match your $LOCAL_NIM_CACHE location.
  # NOTE(review): "/opr" looks like a typo for "/opt" (the container mounts
  # the volume at /opt/nim/.cache) - confirm before changing, since moving it
  # invalidates any cache already present on the runner.
  volumes:
    - name: nim-cache
      hostPath:
        path: "/opr/nim/.cache"
        type: DirectoryOrCreate
|
||||||
|
|
@@ -0,0 +1,95 @@
|
|||||||
|
# Copyright (c) 2025 NVIDIA Corporation
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
#
|
||||||
|
---
# Image pull secret for nvcr.io in the nim-embedqa namespace. The placeholder
# is expected to be substituted with a base64-encoded docker config.
apiVersion: v1
kind: Secret
metadata:
  name: ngc-secret-embedqa
  namespace: nim-embedqa
type: kubernetes.io/dockerconfigjson
data:
  .dockerconfigjson: ${DOCKER_CONFIG_JSON}
---
# NIM embedding (embedqa) pod with one passthrough GPU. The runtime class is
# templated and must be provided when the manifest is rendered.
apiVersion: v1
kind: Pod
metadata:
  name: nvidia-nim-llama-3-2-nv-embedqa-1b-v2
  namespace: nim-embedqa
  labels:
    app: nvidia-nim-llama-3-2-nv-embedqa-1b-v2
spec:
  restartPolicy: Always
  runtimeClassName: "${RUNTIME_CLASS_NAME}"
  serviceAccountName: default
  imagePullSecrets:
    - name: ngc-secret-embedqa
  securityContext:
    fsGroup: 0
    runAsGroup: 0
    runAsUser: 0
  containers:
    - name: nvidia-nim-llama-3-2-nv-embedqa-1b-v2
      image: nvcr.io/nim/nvidia/llama-3.2-nv-embedqa-1b-v2:1.5.0
      imagePullPolicy: IfNotPresent
      env:
        - name: NIM_CACHE_PATH
          value: "/opt/nim/.cache"
        - name: NGC_API_KEY
          value: "${NGC_API_KEY}"
        - name: NIM_HTTP_API_PORT
          value: "8000"
        - name: NIM_JSONL_LOGGING
          value: "1"
        - name: NIM_LOG_LEVEL
          value: "INFO"
      ports:
        - containerPort: 8000
          name: http

      livenessProbe:
        httpGet:
          path: /v1/health/live
          port: 8000
        initialDelaySeconds: 15
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 3

      readinessProbe:
        httpGet:
          path: /v1/health/ready
          port: 8000
        initialDelaySeconds: 15
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 3

      # Allows up to 40s + 180 * 10s for the service to report ready before
      # the container is considered failed.
      startupProbe:
        httpGet:
          path: /v1/health/ready
          port: 8000
        initialDelaySeconds: 40
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 180

      resources:
        limits:
          nvidia.com/pgpu: 1
          cpu: "16"
          memory: "32Gi"

      volumeMounts:
        - name: nim-cache
          mountPath: /opt/nim/.cache

  # NOTE(review): "/opr" looks like a typo for "/opt" (the container mounts
  # the volume at /opt/nim/.cache) - confirm before changing, since moving it
  # invalidates any cache already present on the host.
  volumes:
    - name: nim-cache
      hostPath:
        path: "/opr/nim/.cache"
        type: DirectoryOrCreate
|
Reference in New Issue
Block a user