diff --git a/.github/workflows/run-k8s-tests-on-nvidia-gpu.yaml b/.github/workflows/run-k8s-tests-on-nvidia-gpu.yaml index 41f1c52700..286afa6e09 100644 --- a/.github/workflows/run-k8s-tests-on-nvidia-gpu.yaml +++ b/.github/workflows/run-k8s-tests-on-nvidia-gpu.yaml @@ -41,8 +41,10 @@ jobs: fail-fast: false matrix: environment: [ - { name: nvidia-gpu, vmm: qemu-nvidia-gpu, runner: amd64-nvidia-a100 }, - { name: nvidia-gpu-snp, vmm: qemu-nvidia-gpu-snp, runner: amd64-nvidia-h100-snp }, + { name: nvidia-gpu, vmm: qemu-nvidia-gpu, runner: amd64-nvidia-a100, coco: false }, + { name: nvidia-gpu-runtime-rs, vmm: qemu-nvidia-gpu-runtime-rs, runner: amd64-nvidia-a100, coco: false }, + { name: nvidia-gpu-snp, vmm: qemu-nvidia-gpu-snp, runner: amd64-nvidia-h100-snp, coco: true }, + { name: nvidia-gpu-snp-runtime-rs, vmm: qemu-nvidia-gpu-snp-runtime-rs, runner: amd64-nvidia-h100-snp, coco: true }, ] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.event.pull_request.number || github.ref }}-${{ toJSON(matrix) }} @@ -55,9 +57,9 @@ jobs: GH_PR_NUMBER: ${{ inputs.pr-number }} KATA_HYPERVISOR: ${{ matrix.environment.vmm }} KUBERNETES: kubeadm - KBS: ${{ matrix.environment.name == 'nvidia-gpu-snp' && 'true' || 'false' }} - SNAPSHOTTER: ${{ matrix.environment.name == 'nvidia-gpu-snp' && 'nydus' || '' }} - USE_EXPERIMENTAL_SNAPSHOTTER_SETUP: ${{ matrix.environment.name == 'nvidia-gpu-snp' && 'true' || 'false' }} + KBS: ${{ matrix.environment.coco && 'true' || 'false' }} + SNAPSHOTTER: ${{ matrix.environment.coco && 'nydus' || '' }} + USE_EXPERIMENTAL_SNAPSHOTTER_SETUP: ${{ matrix.environment.coco && 'true' || 'false' }} K8S_TEST_HOST_TYPE: baremetal steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -82,12 +84,12 @@ jobs: run: bash tests/integration/kubernetes/gha-run.sh install-kata-tools kata-tools-artifacts - name: Uninstall previous `kbs-client` - if: matrix.environment.name != 'nvidia-gpu' + if: matrix.environment.coco timeout-minutes: 10 run: bash tests/integration/kubernetes/gha-run.sh uninstall-kbs-client - name: Deploy CoCo KBS - if: matrix.environment.name != 'nvidia-gpu' + if: matrix.environment.coco timeout-minutes: 10 run: bash tests/integration/kubernetes/gha-run.sh deploy-coco-kbs env: @@ -95,7 +97,7 @@ jobs: KBS_INGRESS: nodeport - name: Install `kbs-client` - if: matrix.environment.name != 'nvidia-gpu' + if: matrix.environment.coco timeout-minutes: 10 run: bash tests/integration/kubernetes/gha-run.sh install-kbs-client @@ -134,7 +136,7 @@ jobs: run: bash tests/integration/kubernetes/gha-run.sh cleanup - name: Delete CoCo KBS - if: always() && matrix.environment.name != 'nvidia-gpu' + if: always() && matrix.environment.coco timeout-minutes: 10 run: | bash tests/integration/kubernetes/gha-run.sh delete-coco-kbs diff --git a/tests/hypervisor_helpers.sh b/tests/hypervisor_helpers.sh index 5b8619b7bd..5ca610d6a9 100644 --- a/tests/hypervisor_helpers.sh +++ b/tests/hypervisor_helpers.sh @@ -8,7 +8,7 @@ SNP_HYPERVISORS=("qemu-snp" "qemu-snp-runtime-rs") TDX_HYPERVISORS=("qemu-tdx" "qemu-tdx-runtime-rs") SE_HYPERVISORS=("qemu-se" "qemu-se-runtime-rs") CCA_HYPERVISORS=("qemu-cca") -GPU_TEE_HYPERVISORS=("qemu-nvidia-gpu-snp" "qemu-nvidia-gpu-tdx") +GPU_TEE_HYPERVISORS=("qemu-nvidia-gpu-snp" "qemu-nvidia-gpu-tdx" "qemu-nvidia-gpu-snp-runtime-rs" "qemu-nvidia-gpu-tdx-runtime-rs") TEE_HYPERVISORS=("${SNP_HYPERVISORS[@]}" "${TDX_HYPERVISORS[@]}" "${SE_HYPERVISORS[@]}" "${CCA_HYPERVISORS[@]}" "${GPU_TEE_HYPERVISORS[@]}") NON_TEE_HYPERVISORS=("qemu-coco-dev" "qemu-coco-dev-runtime-rs") FIRECRACKER_HYPERVISORS=("firecracker" "fc") diff --git a/tests/integration/kubernetes/k8s-nvidia-nim.bats b/tests/integration/kubernetes/k8s-nvidia-nim.bats index d3ab909e92..76ebc1f7a1 100644 --- a/tests/integration/kubernetes/k8s-nvidia-nim.bats +++ b/tests/integration/kubernetes/k8s-nvidia-nim.bats @@ -182,6 +182,13 @@ setup_file() { export POD_EMBEDQA_YAML_IN="${pod_config_dir}/${POD_NAME_EMBEDQA}.yaml.in" export POD_EMBEDQA_YAML="${pod_config_dir}/${POD_NAME_EMBEDQA}.yaml" + # runtime-rs does not support trusted storage yet, so use alternative + # TEE templates without emptyDir/PVC volumes and higher memory. + if is_runtime_rs && [[ "${TEE}" = "true" ]]; then + export POD_INSTRUCT_YAML_IN="${pod_config_dir}/${POD_NAME_INSTRUCT}-no-trusted-storage.yaml.in" + export POD_EMBEDQA_YAML_IN="${pod_config_dir}/${POD_NAME_EMBEDQA}-no-trusted-storage.yaml.in" + fi + dpkg -s jq >/dev/null 2>&1 || sudo apt -y install jq setup_langchain_flow @@ -198,27 +205,29 @@ setup_file() { # file and BEFORE auto_generate_policy() runs. create_nim_initdata_file "${policy_settings_dir}/default-initdata.toml" - # Container image layer storage: one block device and PV/PVC per pod. - storage_config_template="${pod_config_dir}/confidential/trusted-storage.yaml.in" + if ! is_runtime_rs; then + # Container image layer storage: one block device and PV/PVC per pod. + storage_config_template="${pod_config_dir}/confidential/trusted-storage.yaml.in" - instruct_storage_mib=57344 - local_device_instruct=$(create_loop_device /tmp/trusted-image-storage-instruct.img "$instruct_storage_mib") - storage_config_instruct=$(mktemp "${BATS_FILE_TMPDIR}/$(basename "${storage_config_template}").instruct.XXX") - PV_NAME=trusted-block-pv-instruct PVC_NAME=trusted-pvc-instruct \ - PV_STORAGE_CAPACITY="${instruct_storage_mib}Mi" PVC_STORAGE_REQUEST="${instruct_storage_mib}Mi" \ - LOCAL_DEVICE="$local_device_instruct" NODE_NAME="$node" \ - envsubst < "$storage_config_template" > "$storage_config_instruct" - retry_kubectl_apply "$storage_config_instruct" + instruct_storage_mib=57344 + local_device_instruct=$(create_loop_device /tmp/trusted-image-storage-instruct.img "$instruct_storage_mib") + storage_config_instruct=$(mktemp "${BATS_FILE_TMPDIR}/$(basename "${storage_config_template}").instruct.XXX") + PV_NAME=trusted-block-pv-instruct PVC_NAME=trusted-pvc-instruct \ + PV_STORAGE_CAPACITY="${instruct_storage_mib}Mi" PVC_STORAGE_REQUEST="${instruct_storage_mib}Mi" \ + LOCAL_DEVICE="$local_device_instruct" NODE_NAME="$node" \ + envsubst < "$storage_config_template" > "$storage_config_instruct" + retry_kubectl_apply "$storage_config_instruct" - if [ "${SKIP_MULTI_GPU_TESTS}" != "true" ]; then - embedqa_storage_mib=8192 - local_device_embedqa=$(create_loop_device /tmp/trusted-image-storage-embedqa.img "$embedqa_storage_mib") - storage_config_embedqa=$(mktemp "${BATS_FILE_TMPDIR}/$(basename "${storage_config_template}").embedqa.XXX") - PV_NAME=trusted-block-pv-embedqa PVC_NAME=trusted-pvc-embedqa \ - PV_STORAGE_CAPACITY="${embedqa_storage_mib}Mi" PVC_STORAGE_REQUEST="${embedqa_storage_mib}Mi" \ - LOCAL_DEVICE="$local_device_embedqa" NODE_NAME="$node" \ - envsubst < "$storage_config_template" > "$storage_config_embedqa" - retry_kubectl_apply "$storage_config_embedqa" + if [ "${SKIP_MULTI_GPU_TESTS}" != "true" ]; then + embedqa_storage_mib=8192 + local_device_embedqa=$(create_loop_device /tmp/trusted-image-storage-embedqa.img "$embedqa_storage_mib") + storage_config_embedqa=$(mktemp "${BATS_FILE_TMPDIR}/$(basename "${storage_config_template}").embedqa.XXX") + PV_NAME=trusted-block-pv-embedqa PVC_NAME=trusted-pvc-embedqa \ + PV_STORAGE_CAPACITY="${embedqa_storage_mib}Mi" PVC_STORAGE_REQUEST="${embedqa_storage_mib}Mi" \ + LOCAL_DEVICE="$local_device_embedqa" NODE_NAME="$node" \ + envsubst < "$storage_config_template" > "$storage_config_embedqa" + retry_kubectl_apply "$storage_config_embedqa" + fi fi fi @@ -490,7 +499,7 @@ teardown_file() { [ -f "${POD_EMBEDQA_YAML}" ] && kubectl delete -f "${POD_EMBEDQA_YAML}" --ignore-not-found=true fi - if [[ "${TEE}" = "true" ]]; then + if [[ "${TEE}" = "true" ]] && ! is_runtime_rs; then kubectl delete --ignore-not-found pvc trusted-pvc-instruct trusted-pvc-embedqa kubectl delete --ignore-not-found pv trusted-block-pv-instruct trusted-block-pv-embedqa kubectl delete --ignore-not-found storageclass local-storage diff --git a/tests/integration/kubernetes/run_kubernetes_nv_tests.sh b/tests/integration/kubernetes/run_kubernetes_nv_tests.sh index 8d3bab3dac..154fc169df 100644 --- a/tests/integration/kubernetes/run_kubernetes_nv_tests.sh +++ b/tests/integration/kubernetes/run_kubernetes_nv_tests.sh @@ -14,7 +14,12 @@ source "${kubernetes_dir}/../../common.bash" # Enable NVRC trace logging for NVIDIA GPU runtime via drop-in config enable_nvrc_trace() { - local config_dir="/opt/kata/share/defaults/kata-containers/runtimes/${KATA_HYPERVISOR}/config.d" + local kata_config_base="/opt/kata/share/defaults/kata-containers" + case "${KATA_HYPERVISOR}" in + *-runtime-rs) kata_config_base="${kata_config_base}/runtime-rs" ;; + esac + + local config_dir="${kata_config_base}/runtimes/${KATA_HYPERVISOR}/config.d" local drop_in_file="${config_dir}/90-nvrc-trace.toml" local kernel_params_drop_in="${config_dir}/30-kernel-params.toml" @@ -30,7 +35,7 @@ enable_nvrc_trace() { if [[ -f "${kernel_params_drop_in}" ]]; then base_params=$(grep -E '^kernel_params\s*=' "${kernel_params_drop_in}" | sed 's/^kernel_params\s*=\s*"\(.*\)"/\1/' || true) else - local runtime_config="/opt/kata/share/defaults/kata-containers/runtimes/${KATA_HYPERVISOR}/configuration-${KATA_HYPERVISOR}.toml" + local runtime_config="${kata_config_base}/runtimes/${KATA_HYPERVISOR}/configuration-${KATA_HYPERVISOR}.toml" if [[ -f "${runtime_config}" ]]; then base_params=$(grep -E '^kernel_params\s*=' "${runtime_config}" | sed 's/^kernel_params\s*=\s*"\(.*\)"/\1/' || true) fi @@ -93,7 +98,7 @@ else "k8s-nvidia-nim-service.bats") fi -SUPPORTED_HYPERVISORS=("qemu-nvidia-gpu" "qemu-nvidia-gpu-snp" "qemu-nvidia-gpu-tdx") +SUPPORTED_HYPERVISORS=("qemu-nvidia-gpu" "qemu-nvidia-gpu-snp" "qemu-nvidia-gpu-tdx" "qemu-nvidia-gpu-runtime-rs" "qemu-nvidia-gpu-snp-runtime-rs" "qemu-nvidia-gpu-tdx-runtime-rs") export KATA_HYPERVISOR="${KATA_HYPERVISOR:-qemu-nvidia-gpu}" # shellcheck disable=SC2076 # intentionally use literal string matching if [[ ! " ${SUPPORTED_HYPERVISORS[*]} " =~ " ${KATA_HYPERVISOR} " ]]; then diff --git a/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-1-8b-instruct-tee-no-trusted-storage.yaml.in b/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-1-8b-instruct-tee-no-trusted-storage.yaml.in new file mode 100644 index 0000000000..44f9a24602 --- /dev/null +++ b/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-1-8b-instruct-tee-no-trusted-storage.yaml.in @@ -0,0 +1,98 @@ +# Copyright (c) 2026 NVIDIA Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# TEE variant without trusted storage support (e.g. for runtime-rs which +# does not yet implement block-encrypted emptyDir or trusted PVC). +# Uses higher memory to compensate for the lack of offloaded storage. +# +--- +apiVersion: v1 +kind: Pod +metadata: + name: ${POD_NAME_INSTRUCT} + labels: + app: ${POD_NAME_INSTRUCT} + annotations: + # Start CDH process and configure AA for KBS communication + # aa_kbc_params tells the Attestation Agent where KBS is located + io.katacontainers.config.hypervisor.kernel_params: "agent.guest_components_procs=confidential-data-hub agent.aa_kbc_params=cc_kbc::${CC_KBS_ADDR}" + # cc_init_data annotation will be added by genpolicy with CDH configuration + # from the custom default-initdata.toml created by create_nim_initdata_file() +spec: + # Explicit user/group/supplementary groups to support nydus guest-pull. + # See issue https://github.com/kata-containers/kata-containers/issues/11162 and + # other references to this issue in the genpolicy source folder. + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + supplementalGroups: [4, 20, 24, 25, 27, 29, 30, 44, 46] + restartPolicy: Never + runtimeClassName: kata + imagePullSecrets: + - name: ngc-secret-instruct + containers: + - name: ${POD_NAME_INSTRUCT} + image: nvcr.io/nim/meta/llama-3.1-8b-instruct:1.13.1 + # Ports exposed by the container: + ports: + - containerPort: 8000 + name: http-openai + livenessProbe: + httpGet: + path: /v1/health/live + port: http-openai + initialDelaySeconds: 15 + periodSeconds: 10 + timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /v1/health/ready + port: http-openai + initialDelaySeconds: 15 + periodSeconds: 10 + timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 3 + startupProbe: + httpGet: + path: /v1/health/ready + port: http-openai + initialDelaySeconds: 360 + periodSeconds: 10 + timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 30 + env: + - name: NGC_API_KEY + valueFrom: + secretKeyRef: + name: ngc-api-key-sealed-instruct + key: api-key + # GPU resource limit (for NVIDIA GPU) + resources: + limits: + nvidia.com/pgpu: "1" + cpu: "16" + memory: "128Gi" +--- +apiVersion: v1 +kind: Secret +metadata: + name: ngc-secret-instruct +type: kubernetes.io/dockerconfigjson +data: + .dockerconfigjson: ${DOCKER_CONFIG_JSON} +--- +apiVersion: v1 +kind: Secret +metadata: + name: ngc-api-key-sealed-instruct +type: Opaque +data: + # Sealed secret pointing to kbs:///default/ngc-api-key/instruct + # CDH will unseal this by fetching the actual key from KBS + api-key: "${NGC_API_KEY_SEALED_SECRET_INSTRUCT_BASE64}" diff --git a/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-2-nv-embedqa-1b-v2-tee-no-trusted-storage.yaml.in b/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-2-nv-embedqa-1b-v2-tee-no-trusted-storage.yaml.in new file mode 100644 index 0000000000..40fcf7d9e5 --- /dev/null +++ b/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-2-nv-embedqa-1b-v2-tee-no-trusted-storage.yaml.in @@ -0,0 +1,107 @@ +# Copyright (c) 2026 NVIDIA Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# TEE variant without trusted storage support (e.g. for runtime-rs which +# does not yet implement block-encrypted emptyDir or trusted PVC). +# Uses higher memory to compensate for the lack of offloaded storage. +# +--- +apiVersion: v1 +kind: Pod +metadata: + name: ${POD_NAME_EMBEDQA} + labels: + app: ${POD_NAME_EMBEDQA} + annotations: + # Start CDH process and configure AA for KBS communication + # aa_kbc_params tells the Attestation Agent where KBS is located + io.katacontainers.config.hypervisor.kernel_params: "agent.guest_components_procs=confidential-data-hub agent.aa_kbc_params=cc_kbc::${CC_KBS_ADDR}" + # cc_init_data annotation will be added by genpolicy with CDH configuration + # from the custom default-initdata.toml created by create_nim_initdata_file() +spec: + # Explicit user/group/supplementary groups to support nydus guest-pull. + # See issue https://github.com/kata-containers/kata-containers/issues/11162 and + # other references to this issue in the genpolicy source folder. + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + restartPolicy: Always + runtimeClassName: kata + serviceAccountName: default + imagePullSecrets: + - name: ngc-secret-embedqa + containers: + - name: ${POD_NAME_EMBEDQA} + image: nvcr.io/nim/nvidia/llama-3.2-nv-embedqa-1b-v2:1.10.1 + imagePullPolicy: IfNotPresent + env: + - name: NGC_API_KEY + valueFrom: + secretKeyRef: + name: ngc-api-key-sealed-embedqa + key: api-key + - name: NIM_HTTP_API_PORT + value: "8000" + - name: NIM_JSONL_LOGGING + value: "1" + - name: NIM_LOG_LEVEL + value: "INFO" + ports: + - containerPort: 8000 + name: http + + livenessProbe: + httpGet: + path: /v1/health/live + port: 8000 + initialDelaySeconds: 15 + periodSeconds: 10 + timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 3 + + readinessProbe: + httpGet: + path: /v1/health/ready + port: 8000 + initialDelaySeconds: 15 + periodSeconds: 10 + timeoutSeconds: 10 + successThreshold: 1 + failureThreshold: 3 + + startupProbe: + httpGet: + path: /v1/health/ready + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 180 + + resources: + limits: + nvidia.com/pgpu: "1" + cpu: "16" + memory: "48Gi" +--- +apiVersion: v1 +kind: Secret +metadata: + name: ngc-secret-embedqa +type: kubernetes.io/dockerconfigjson +data: + .dockerconfigjson: ${DOCKER_CONFIG_JSON} +--- +apiVersion: v1 +kind: Secret +metadata: + name: ngc-api-key-sealed-embedqa +type: Opaque +data: + # Sealed secret pointing to kbs:///default/ngc-api-key/embedqa + # CDH will unseal this by fetching the actual key from KBS + api-key: "${NGC_API_KEY_SEALED_SECRET_EMBEDQA_BASE64}"