tests: gpu: use container image layer storage

Use the container image layer storage feature for the
k8s-nvidia-nim.bats test pod manifests. This reduces the pods'
memory requirements.

Signed-off-by: Manuel Huber <manuelh@nvidia.com>
This commit is contained in:
Manuel Huber
2026-02-25 16:08:56 -08:00
committed by Fabiano Fidêncio
parent b6cf00a374
commit 177f5c308e
4 changed files with 49 additions and 3 deletions

View File

@@ -118,7 +118,7 @@ function is_confidential_gpu_hardware() {
# create_loop_device creates a loop device backed by a file.
# $1: loop file path (default: /tmp/trusted-image-storage.img)
# $2: size in MB (default: 2500, i.e. ~2.5Gi; use 30720 for ~30Gi)
# $2: size in MiB, i.e. dd bs=1M count=... (default: 2500, ~2.4Gi)
function create_loop_device(){
local loop_file="${1:-/tmp/trusted-image-storage.img}"
local size_mb="${2:-2500}"

View File

@@ -85,6 +85,8 @@ setup_langchain_flow() {
# generated policy.rego to it and set it as the cc_init_data annotation.
# We must overwrite the default empty file AFTER create_tmp_policy_settings_dir()
# copies it to the temp directory.
# As we use multiple vCPUs, we set `max_concurrent_layer_downloads_per_image = 1`;
# see: https://github.com/kata-containers/kata-containers/issues/12721
create_nim_initdata_file() {
local output_file="$1"
local cc_kbs_address
@@ -107,6 +109,7 @@ name = "cc_kbc"
url = "${cc_kbs_address}"
[image]
max_concurrent_layer_downloads_per_image = 1
authenticated_registry_credentials_uri = "kbs:///default/credentials/nvcr"
'''
EOF
@@ -189,12 +192,35 @@ setup_file() {
# This must happen AFTER create_tmp_policy_settings_dir() copies the empty
# file and BEFORE auto_generate_policy() runs.
create_nim_initdata_file "${policy_settings_dir}/default-initdata.toml"
# Container image layer storage: one block device and PV/PVC per pod.
storage_config_template="${pod_config_dir}/confidential/trusted-storage.yaml.in"
instruct_storage_mib=57344
local_device_instruct=$(create_loop_device /tmp/trusted-image-storage-instruct.img "$instruct_storage_mib")
storage_config_instruct=$(mktemp "${BATS_FILE_TMPDIR}/$(basename "${storage_config_template}").instruct.XXX")
PV_NAME=trusted-block-pv-instruct PVC_NAME=trusted-pvc-instruct \
PV_STORAGE_CAPACITY="${instruct_storage_mib}Mi" PVC_STORAGE_REQUEST="${instruct_storage_mib}Mi" \
LOCAL_DEVICE="$local_device_instruct" NODE_NAME="$node" \
envsubst < "$storage_config_template" > "$storage_config_instruct"
retry_kubectl_apply "$storage_config_instruct"
if [ "${SKIP_MULTI_GPU_TESTS}" != "true" ]; then
embedqa_storage_mib=8192
local_device_embedqa=$(create_loop_device /tmp/trusted-image-storage-embedqa.img "$embedqa_storage_mib")
storage_config_embedqa=$(mktemp "${BATS_FILE_TMPDIR}/$(basename "${storage_config_template}").embedqa.XXX")
PV_NAME=trusted-block-pv-embedqa PVC_NAME=trusted-pvc-embedqa \
PV_STORAGE_CAPACITY="${embedqa_storage_mib}Mi" PVC_STORAGE_REQUEST="${embedqa_storage_mib}Mi" \
LOCAL_DEVICE="$local_device_embedqa" NODE_NAME="$node" \
envsubst < "$storage_config_template" > "$storage_config_embedqa"
retry_kubectl_apply "$storage_config_embedqa"
fi
fi
create_inference_pod
if [ "${SKIP_MULTI_GPU_TESTS}" != "true" ]; then
create_embedqa_pod
create_embedqa_pod
fi
}
@@ -459,5 +485,13 @@ teardown_file() {
[ -f "${POD_EMBEDQA_YAML}" ] && kubectl delete -f "${POD_EMBEDQA_YAML}" --ignore-not-found=true
fi
if [[ "${TEE}" = "true" ]]; then
kubectl delete --ignore-not-found pvc trusted-pvc-instruct trusted-pvc-embedqa
kubectl delete --ignore-not-found pv trusted-block-pv-instruct trusted-block-pv-embedqa
kubectl delete --ignore-not-found storageclass local-storage
cleanup_loop_device /tmp/trusted-image-storage-instruct.img || true
cleanup_loop_device /tmp/trusted-image-storage-embedqa.img || true
fi
print_node_journal_since_test_start "${node}" "${node_start_time:-}" "${BATS_TEST_COMPLETED:-}" >&3
}

View File

@@ -69,14 +69,20 @@ spec:
limits:
nvidia.com/pgpu: "1"
cpu: "16"
memory: "64Gi"
memory: "48Gi"
volumeMounts:
- name: nim-trusted-cache
mountPath: /opt/nim/.cache
volumeDevices:
- devicePath: /dev/trusted_store
name: trusted-storage
volumes:
- name: nim-trusted-cache
emptyDir:
sizeLimit: 64Gi
- name: trusted-storage
persistentVolumeClaim:
claimName: trusted-pvc-instruct
---
apiVersion: v1
kind: Secret

View File

@@ -83,10 +83,16 @@ spec:
volumeMounts:
- name: nim-trusted-cache
mountPath: /opt/nim/.cache
volumeDevices:
- devicePath: /dev/trusted_store
name: trusted-storage
volumes:
- name: nim-trusted-cache
emptyDir:
sizeLimit: 40Gi
- name: trusted-storage
persistentVolumeClaim:
claimName: trusted-pvc-embedqa
---
apiVersion: v1
kind: Secret