Merge pull request #11236 from kata-containers/amd64-nvidia-gpu-cicd

gpu: AMD64 NVIDIA GPU CI/CD
This commit is contained in:
Fabiano Fidêncio
2025-07-31 14:52:01 +02:00
committed by GitHub
17 changed files with 465 additions and 16 deletions

View File

@@ -23,3 +23,4 @@ self-hosted-runner:
- s390x
- s390x-large
- tdx
- amd64-nvidia-a100

View File

@@ -23,6 +23,8 @@ on:
secrets:
QUAY_DEPLOYER_PASSWORD:
required: false
KBUILD_SIGN_PIN:
required: true
permissions:
contents: read
@@ -108,6 +110,7 @@ jobs:
ARTEFACT_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
TARGET_BRANCH: ${{ inputs.target-branch }}
RELEASE: ${{ inputs.stage == 'release' && 'yes' || 'no' }}
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
- name: Parse OCI image name and digest
id: parse-oci-segments
@@ -215,6 +218,7 @@ jobs:
ARTEFACT_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
TARGET_BRANCH: ${{ inputs.target-branch }}
RELEASE: ${{ inputs.stage == 'release' && 'yes' || 'no' }}
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
- name: store-artifact ${{ matrix.asset }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2

View File

@@ -31,3 +31,4 @@ jobs:
AZ_TENANT_ID: ${{ secrets.AZ_TENANT_ID }}
AZ_SUBSCRIPTION_ID: ${{ secrets.AZ_SUBSCRIPTION_ID }}
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}

View File

@@ -27,6 +27,8 @@ jobs:
CI_HKD_PATH: ${{ secrets.CI_HKD_PATH }}
ITA_KEY: ${{ secrets.ITA_KEY }}
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
build-checks:
uses: ./.github/workflows/build-checks.yaml

View File

@@ -31,3 +31,5 @@ jobs:
CI_HKD_PATH: ${{ secrets.CI_HKD_PATH }}
ITA_KEY: ${{ secrets.ITA_KEY }}
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}

View File

@@ -52,3 +52,5 @@ jobs:
CI_HKD_PATH: ${{ secrets.CI_HKD_PATH }}
ITA_KEY: ${{ secrets.ITA_KEY }}
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}

View File

@@ -27,6 +27,8 @@ on:
required: true
QUAY_DEPLOYER_PASSWORD:
required: true
KBUILD_SIGN_PIN:
required: true
permissions:
contents: read
@@ -43,6 +45,8 @@ jobs:
tarball-suffix: -${{ inputs.tag }}
commit-hash: ${{ inputs.commit-hash }}
target-branch: ${{ inputs.target-branch }}
secrets:
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
publish-kata-deploy-payload-amd64:
needs: build-kata-static-tarball-amd64

View File

@@ -35,6 +35,10 @@ on:
required: true
QUAY_DEPLOYER_PASSWORD:
required: true
NGC_API_KEY:
required: true
KBUILD_SIGN_PIN:
required: true
permissions:
contents: read
@@ -52,6 +56,8 @@ jobs:
tarball-suffix: -${{ inputs.tag }}
commit-hash: ${{ inputs.commit-hash }}
target-branch: ${{ inputs.target-branch }}
secrets:
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
publish-kata-deploy-payload-amd64:
needs: build-kata-static-tarball-amd64
@@ -323,6 +329,21 @@ jobs:
pr-number: ${{ inputs.pr-number }}
target-branch: ${{ inputs.target-branch }}
run-k8s-tests-on-nvidia-gpu:
if: ${{ inputs.skip-test != 'yes' }}
needs: publish-kata-deploy-payload-amd64
uses: ./.github/workflows/run-k8s-tests-on-nvidia-gpu.yaml
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-amd64
commit-hash: ${{ inputs.commit-hash }}
pr-number: ${{ inputs.pr-number }}
target-branch: ${{ inputs.target-branch }}
secrets:
NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
run-kata-coco-tests:
if: ${{ inputs.skip-test != 'yes' }}
needs:
@@ -383,20 +404,6 @@ jobs:
pr-number: ${{ inputs.pr-number }}
target-branch: ${{ inputs.target-branch }}
run-metrics-tests:
# Skip metrics tests whilst runner is broken
if: false
# if: ${{ inputs.skip-test != 'yes' }}
needs: build-kata-static-tarball-amd64
uses: ./.github/workflows/run-metrics.yaml
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-amd64
commit-hash: ${{ inputs.commit-hash }}
pr-number: ${{ inputs.pr-number }}
target-branch: ${{ inputs.target-branch }}
run-basic-amd64-tests:
if: ${{ inputs.skip-test != 'yes' }}
needs: build-kata-static-tarball-amd64

View File

@@ -25,6 +25,7 @@ jobs:
target-branch: ${{ github.ref_name }}
secrets:
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
build-assets-arm64:
permissions:

View File

@@ -8,6 +8,8 @@ on:
secrets:
QUAY_DEPLOYER_PASSWORD:
required: true
KBUILD_SIGN_PIN:
required: true
permissions:
contents: read
@@ -20,6 +22,7 @@ jobs:
stage: release
secrets:
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
permissions:
contents: read
packages: write

View File

@@ -35,6 +35,7 @@ jobs:
target-arch: amd64
secrets:
QUAY_DEPLOYER_PASSWORD: ${{ secrets.QUAY_DEPLOYER_PASSWORD }}
KBUILD_SIGN_PIN: ${{ secrets.KBUILD_SIGN_PIN }}
build-and-push-assets-arm64:
needs: release

View File

@@ -0,0 +1,89 @@
# Reusable workflow: deploys a kata-deploy CI payload onto an amd64 host
# with an NVIDIA A100 GPU and runs the NVIDIA-specific kubernetes tests.
# NOTE: the name previously said "arm64", but this workflow is amd64-only
# (runner `amd64-nvidia-a100`, job `run-nvidia-gpu-tests-on-amd64`).
name: CI | Run NVIDIA GPU kubernetes tests on amd64

on:
  workflow_call:
    inputs:
      registry:
        required: true
        type: string
      repo:
        required: true
        type: string
      tag:
        required: true
        type: string
      pr-number:
        required: true
        type: string
      commit-hash:
        required: false
        type: string
      target-branch:
        required: false
        type: string
        default: ""
    secrets:
      # NGC API key used by the tests to pull NIM images from nvcr.io.
      NGC_API_KEY:
        required: true

permissions: {}

jobs:
  run-nvidia-gpu-tests-on-amd64:
    strategy:
      fail-fast: false
      matrix:
        vmm:
          - qemu-nvidia-gpu
        k8s:
          - kubeadm
    runs-on: amd64-nvidia-a100
    env:
      DOCKER_REGISTRY: ${{ inputs.registry }}
      DOCKER_REPO: ${{ inputs.repo }}
      DOCKER_TAG: ${{ inputs.tag }}
      GH_PR_NUMBER: ${{ inputs.pr-number }}
      KATA_HYPERVISOR: ${{ matrix.vmm }}
      KUBERNETES: ${{ matrix.k8s }}
      USING_NFD: "false"
      K8S_TEST_HOST_TYPE: all
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: ${{ inputs.commit-hash }}
          fetch-depth: 0
          persist-credentials: false

      - name: Rebase atop of the latest target branch
        run: |
          ./tests/git-helper.sh "rebase-atop-of-the-latest-target-branch"
        env:
          TARGET_BRANCH: ${{ inputs.target-branch }}

      - name: Deploy Kata
        timeout-minutes: 10
        run: bash tests/integration/kubernetes/gha-run.sh deploy-kata

      - name: Install `bats`
        run: bash tests/integration/kubernetes/gha-run.sh install-bats

      - name: Run tests
        timeout-minutes: 30
        run: bash tests/integration/kubernetes/gha-run.sh run-nv-tests
        env:
          NGC_API_KEY: ${{ secrets.NGC_API_KEY }}

      - name: Collect artifacts ${{ matrix.vmm }}
        if: always()
        run: bash tests/integration/kubernetes/gha-run.sh collect-artifacts
        continue-on-error: true

      - name: Archive artifacts ${{ matrix.vmm }}
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: k8s-tests-${{ matrix.vmm }}-${{ matrix.k8s }}-${{ inputs.tag }}
          path: /tmp/artifacts
          retention-days: 1

      - name: Delete kata-deploy
        if: always()
        timeout-minutes: 5
        run: bash tests/integration/kubernetes/gha-run.sh cleanup

View File

@@ -289,7 +289,7 @@ function run_tests() {
if [[ "${KATA_HYPERVISOR}" = "dragonball" ]] && [[ "${SNAPSHOTTER}" = "devmapper" ]]; then
echo "Skipping tests for ${KATA_HYPERVISOR} using devmapper"
else
bash run_kubernetes_tests.sh
bash "${K8STESTS}"
fi
popd
}
@@ -589,7 +589,14 @@ function main() {
deploy-kata-zvsi) deploy_kata "zvsi" ;;
deploy-snapshotter) deploy_snapshotter ;;
report-tests) report_tests ;;
run-tests) run_tests ;;
run-tests)
K8STESTS=run_kubernetes_tests.sh
run_tests
;;
run-nv-tests)
K8STESTS=run_kubernetes_nv_tests.sh
run_tests
;;
run-tests-kcli) run_tests "kcli" ;;
collect-artifacts) collect_artifacts ;;
cleanup) cleanup ;;

View File

@@ -0,0 +1,99 @@
#!/usr/bin/env bats
#
# Copyright (c) 2025 NVIDIA Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# End-to-end checks for an NVIDIA NIM inference microservice
# (Llama 3.1-8b Instruct) running under Kata with GPU passthrough.
# Each BATS @test runs in its own subprocess, so state produced by one
# test (pod IP, model name) is handed to later tests through a file in
# ${BATS_SUITE_TMPDIR}.
# shellcheck disable=SC2154 # BATS variables are not assigned in this file
load "${BATS_TEST_DIRNAME}/../../common.bash"
# shellcheck disable=SC1091
load "${BATS_TEST_DIRNAME}/tests_common.sh"
# Pod/secret names; exported so envsubst can substitute them into the
# pod YAML template rendered in setup_file().
export POD_NAME_INSTRUCT="nvidia-nim-llama-3-1-8b-instruct"
export POD_NAME_EMBEDQA="nvidia-nim-llama-3-2-nv-embedqa-1b-v2"
export POD_SECRET_INSTRUCT="ngc-secret-instruct"
# Base64-encoded .dockerconfigjson granting image-pull access to nvcr.io.
# Requires NGC_API_KEY in the environment (supplied by the CI secret);
# the inner base64 encodes the "$oauthtoken:<key>" auth pair.
DOCKER_CONFIG_JSON=$(
echo -n "{\"auths\":{\"nvcr.io\":{\"username\":\"\$oauthtoken\",\"password\":\"${NGC_API_KEY}\",\"auth\":\"$(echo -n "\$oauthtoken:${NGC_API_KEY}" | base64 -w0)\"}}}" |
base64 -w0
)
export DOCKER_CONFIG_JSON
# One-time suite setup: ensure jq is installed, initialize pyenv and a
# venv (NOTE(review): the venv is created but not activated here —
# presumably used by tooling elsewhere; confirm), then render the
# instruct pod YAML from its .in template via envsubst.
setup_file() {
dpkg -s jq >/dev/null 2>&1 || sudo apt -y install jq
export PYENV_ROOT="${HOME}/.pyenv"
[[ -d ${PYENV_ROOT}/bin ]] && export PATH="${PYENV_ROOT}/bin:${PATH}"
eval "$(pyenv init - bash)"
python3 -m venv "${HOME}"/.cicd/venv
get_pod_config_dir
pod_instruct_yaml_in="${pod_config_dir}/${POD_NAME_INSTRUCT}.yaml.in"
pod_instruct_yaml="${pod_config_dir}/${POD_NAME_INSTRUCT}.yaml"
envsubst <"${pod_instruct_yaml_in}" >"${pod_instruct_yaml}"
export POD_INSTRUCT_YAML="${pod_instruct_yaml}"
}
# Deploys the NIM pod, waits for readiness, and records its IP for the
# follow-up tests.
@test "NVIDIA NIM Llama 3.1-8b Instruct" {
kubectl apply -f "${POD_INSTRUCT_YAML}"
kubectl wait --for=condition=Ready --timeout=500s pod "${POD_NAME_INSTRUCT}"
# shellcheck disable=SC2030 # Variable is shared via file between BATS tests
POD_IP_INSTRUCT=$(kubectl get pod "${POD_NAME_INSTRUCT}" -o jsonpath='{.status.podIP}')
[[ -n "${POD_IP_INSTRUCT}" ]]
echo "POD_IP_INSTRUCT=${POD_IP_INSTRUCT}" >"${BATS_SUITE_TMPDIR}/env"
echo "# POD_IP_INSTRUCT=${POD_IP_INSTRUCT}" >&3
}
# Queries the OpenAI-compatible /v1/models endpoint and records the
# first model id for the completion test below.
@test "List of models available for inference" {
# shellcheck disable=SC1091 # File is created by previous test
source "${BATS_SUITE_TMPDIR}/env"
# shellcheck disable=SC2031 # Variable is shared via file between BATS tests
[[ -n "${POD_IP_INSTRUCT}" ]]
# shellcheck disable=SC2031 # Variable is shared via file between BATS tests
run curl -sX GET "http://${POD_IP_INSTRUCT}:8000/v1/models"
[[ "${status}" -eq 0 ]]
# shellcheck disable=SC2030 # Variable is shared via file between BATS tests
MODEL_NAME=$(echo "${output}" | jq '.data[0].id' | tr -d '"')
export MODEL_NAME
[[ -n "${MODEL_NAME}" ]]
echo "MODEL_NAME=${MODEL_NAME}" >>"${BATS_SUITE_TMPDIR}/env"
echo "# MODEL_NAME=${MODEL_NAME}" >&3
}
# Sends a minimal completion request and checks a non-empty answer is
# returned (content is not asserted, only presence).
@test "Simple OpenAI completion request" {
# shellcheck disable=SC1091 # File is created by previous test
source "${BATS_SUITE_TMPDIR}/env"
# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
[[ -n "${POD_IP_INSTRUCT}" ]]
# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
[[ -n "${MODEL_NAME}" ]]
QUESTION="What are Kata Containers?"
# shellcheck disable=SC2031 # Variables are shared via file between BATS tests
run curl -sX 'POST' \
"http://${POD_IP_INSTRUCT}:8000/v1/completions" \
-H "accept: application/json" \
-H "Content-Type: application/json" \
-d "{\"model\": \"${MODEL_NAME}\", \"prompt\": \"${QUESTION}\", \"max_tokens\": 64}"
ANSWER=$(echo "${output}" | jq '.choices[0].text')
[[ -n "${ANSWER}" ]]
echo "# QUESTION: ${QUESTION}" >&3
echo "# ANSWER: ${ANSWER}" >&3
}
# Suite teardown: remove the pod (and secret) created from the rendered YAML.
teardown_file() {
kubectl delete -f "${POD_INSTRUCT_YAML}"
}

View File

@@ -0,0 +1,42 @@
#!/bin/bash
#
# Copyright (c) 2025 NVIDIA Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# Runs the NVIDIA-specific kubernetes BATS suites and reports an
# aggregate pass/fail result.

set -e

script_dir=$(dirname "$(readlink -f "$0")")
# shellcheck disable=SC1091 # import based on variable
source "${script_dir}/../../common.bash"

# Nothing to tear down yet; keep the trap as a hook for later additions.
cleanup() {
	true
}
trap cleanup EXIT

# Setting to "yes" enables fail fast, stopping execution at the first failed test.
K8S_TEST_FAIL_FAST="${K8S_TEST_FAIL_FAST:-no}"

# NVIDIA test suites to execute, in order.
nv_test_suites=("k8s-nvidia-nim.bats")

ensure_yq

info "Running tests with bats version: $(bats --version)"

failed_suites=()
for suite in "${nv_test_suites[@]}"; do
	# Strip any stray whitespace/control characters from the entry.
	suite=$(printf '%s' "${suite}" | tr -d '[:space:][:cntrl:]')
	# Log cluster state before each suite to aid debugging.
	info "$(kubectl get pods --all-namespaces 2>&1)"
	info "Executing ${suite}"
	if ! bats --show-output-of-passing-tests "${suite}"; then
		failed_suites+=("${suite}")
		if [[ "${K8S_TEST_FAIL_FAST}" = "yes" ]]; then
			break
		fi
	fi
done

if [[ ${#failed_suites[@]} -ne 0 ]]; then
	die "Tests FAILED from suites: ${failed_suites[*]}"
fi
info "All tests SUCCEEDED"

View File

@@ -0,0 +1,89 @@
# Copyright (c) 2025 NVIDIA Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# Template (rendered via envsubst): image-pull Secret for nvcr.io plus a
# Pod running the NVIDIA NIM Llama 3.1-8b Instruct microservice under the
# kata-qemu-nvidia-gpu runtime class.
---
apiVersion: v1
kind: Secret
metadata:
  name: ngc-secret-instruct
type: kubernetes.io/dockerconfigjson
data:
  .dockerconfigjson: ${DOCKER_CONFIG_JSON}
---
apiVersion: v1
kind: Pod
metadata:
  name: ${POD_NAME_INSTRUCT}
  labels:
    app: ${POD_NAME_INSTRUCT}
spec:
  restartPolicy: Never
  runtimeClassName: kata-qemu-nvidia-gpu
  imagePullSecrets:
    - name: ngc-secret-instruct
  securityContext:
    runAsUser: 0
    runAsGroup: 0
    fsGroup: 0
  containers:
    - name: ${POD_NAME_INSTRUCT}
      image: nvcr.io/nim/meta/llama3-8b-instruct:1.0.0
      # Ports exposed by the container:
      ports:
        - containerPort: 8000
          name: http-openai
      livenessProbe:
        httpGet:
          path: /v1/health/live
          port: http-openai
        initialDelaySeconds: 15
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 3
      readinessProbe:
        httpGet:
          path: /v1/health/ready
          port: http-openai
        initialDelaySeconds: 15
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 3
      # Generous startup window: model download/load can take many minutes
      # (failureThreshold 180 * periodSeconds 10 = 30 min ceiling).
      startupProbe:
        httpGet:
          path: /v1/health/ready
          port: http-openai
        initialDelaySeconds: 40
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 180
      # Environment variable for NGC_API_KEY. In production, use a Secret.
      env:
        - name: NGC_API_KEY
          value: "${NGC_API_KEY}"
      # GPU resource request/limit (for NVIDIA GPU)
      resources:
        requests:
          cpu: "16"
          memory: "32Gi"
        limits:
          nvidia.com/pgpu: "1"
          cpu: "16"
          memory: "32Gi"
      # Mount the local .cache directory into the container
      volumeMounts:
        - name: nim-cache
          mountPath: /opt/nim/.cache
  # Host path volume for the local .cache directory.
  # Adjust 'path' to match your $LOCAL_NIM_CACHE location.
  volumes:
    - name: nim-cache
      hostPath:
        # Fixed typo: was "/opr/nim/.cache"; must match the container
        # mountPath /opt/nim/.cache above, or the model cache is not reused.
        path: "/opt/nim/.cache"
        type: DirectoryOrCreate

View File

@@ -0,0 +1,95 @@
# Copyright (c) 2025 NVIDIA Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# Template (rendered via envsubst): image-pull Secret plus a Pod running
# the NVIDIA NIM llama-3.2-nv-embedqa-1b-v2 embedding microservice in the
# nim-embedqa namespace, under the runtime class given by
# ${RUNTIME_CLASS_NAME}.
---
apiVersion: v1
kind: Secret
metadata:
  name: ngc-secret-embedqa
  namespace: nim-embedqa
type: kubernetes.io/dockerconfigjson
data:
  .dockerconfigjson: ${DOCKER_CONFIG_JSON}
---
apiVersion: v1
kind: Pod
metadata:
  name: nvidia-nim-llama-3-2-nv-embedqa-1b-v2
  namespace: nim-embedqa
  labels:
    app: nvidia-nim-llama-3-2-nv-embedqa-1b-v2
spec:
  restartPolicy: Always
  runtimeClassName: "${RUNTIME_CLASS_NAME}"
  serviceAccountName: default
  imagePullSecrets:
    - name: ngc-secret-embedqa
  securityContext:
    fsGroup: 0
    runAsGroup: 0
    runAsUser: 0
  containers:
    - name: nvidia-nim-llama-3-2-nv-embedqa-1b-v2
      image: nvcr.io/nim/nvidia/llama-3.2-nv-embedqa-1b-v2:1.5.0
      imagePullPolicy: IfNotPresent
      # NGC_API_KEY comes from the environment at render time; in
      # production, source it from a Secret instead.
      env:
        - name: NIM_CACHE_PATH
          value: "/opt/nim/.cache"
        - name: NGC_API_KEY
          value: "${NGC_API_KEY}"
        - name: NIM_HTTP_API_PORT
          value: "8000"
        - name: NIM_JSONL_LOGGING
          value: "1"
        - name: NIM_LOG_LEVEL
          value: "INFO"
      ports:
        - containerPort: 8000
          name: http
      livenessProbe:
        httpGet:
          path: /v1/health/live
          port: 8000
        initialDelaySeconds: 15
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 3
      readinessProbe:
        httpGet:
          path: /v1/health/ready
          port: 8000
        initialDelaySeconds: 15
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 3
      # Generous startup window: model download/load can take many minutes
      # (failureThreshold 180 * periodSeconds 10 = 30 min ceiling).
      startupProbe:
        httpGet:
          path: /v1/health/ready
          port: 8000
        initialDelaySeconds: 40
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 180
      resources:
        limits:
          # Quoted for consistency with the sibling instruct manifest.
          nvidia.com/pgpu: "1"
          cpu: "16"
          memory: "32Gi"
      volumeMounts:
        - name: nim-cache
          mountPath: /opt/nim/.cache
  volumes:
    - name: nim-cache
      hostPath:
        # Fixed typo: was "/opr/nim/.cache"; must match NIM_CACHE_PATH and
        # the container mountPath /opt/nim/.cache above.
        path: "/opt/nim/.cache"
        type: DirectoryOrCreate