diff --git a/tests/integration/kubernetes/confidential_common.sh b/tests/integration/kubernetes/confidential_common.sh index 73535a2ff2..0f8e5a2136 100644 --- a/tests/integration/kubernetes/confidential_common.sh +++ b/tests/integration/kubernetes/confidential_common.sh @@ -118,7 +118,7 @@ function is_confidential_gpu_hardware() { # create_loop_device creates a loop device backed by a file. # $1: loop file path (default: /tmp/trusted-image-storage.img) -# $2: size in MB (default: 2500, i.e. ~2.5Gi; use 30720 for ~30Gi) +# $2: size in MiB, i.e. dd bs=1M count=... (default: 2500, ~2.4Gi) function create_loop_device(){ local loop_file="${1:-/tmp/trusted-image-storage.img}" local size_mb="${2:-2500}" diff --git a/tests/integration/kubernetes/k8s-nvidia-nim.bats b/tests/integration/kubernetes/k8s-nvidia-nim.bats index 6c1b0c5572..b94db67ede 100644 --- a/tests/integration/kubernetes/k8s-nvidia-nim.bats +++ b/tests/integration/kubernetes/k8s-nvidia-nim.bats @@ -85,6 +85,8 @@ setup_langchain_flow() { # generated policy.rego to it and set it as the cc_init_data annotation. # We must overwrite the default empty file AFTER create_tmp_policy_settings_dir() # copies it to the temp directory. +# As we use multiple vCPUs we set `max_concurrent_layer_downloads_per_image = 1`, +# see: https://github.com/kata-containers/kata-containers/issues/12721 create_nim_initdata_file() { local output_file="$1" local cc_kbs_address @@ -107,6 +109,7 @@ name = "cc_kbc" url = "${cc_kbs_address}" [image] +max_concurrent_layer_downloads_per_image = 1 authenticated_registry_credentials_uri = "kbs:///default/credentials/nvcr" ''' EOF @@ -189,12 +192,35 @@ setup_file() { # This must happen AFTER create_tmp_policy_settings_dir() copies the empty # file and BEFORE auto_generate_policy() runs. create_nim_initdata_file "${policy_settings_dir}/default-initdata.toml" + + # Container image layer storage: one block device and PV/PVC per pod. + storage_config_template="${pod_config_dir}/confidential/trusted-storage.yaml.in" + + instruct_storage_mib=57344 + local_device_instruct=$(create_loop_device /tmp/trusted-image-storage-instruct.img "$instruct_storage_mib") + storage_config_instruct=$(mktemp "${BATS_FILE_TMPDIR}/$(basename "${storage_config_template}").instruct.XXX") + PV_NAME=trusted-block-pv-instruct PVC_NAME=trusted-pvc-instruct \ + PV_STORAGE_CAPACITY="${instruct_storage_mib}Mi" PVC_STORAGE_REQUEST="${instruct_storage_mib}Mi" \ + LOCAL_DEVICE="$local_device_instruct" NODE_NAME="$node" \ + envsubst < "$storage_config_template" > "$storage_config_instruct" + retry_kubectl_apply "$storage_config_instruct" + + if [ "${SKIP_MULTI_GPU_TESTS}" != "true" ]; then + embedqa_storage_mib=8192 + local_device_embedqa=$(create_loop_device /tmp/trusted-image-storage-embedqa.img "$embedqa_storage_mib") + storage_config_embedqa=$(mktemp "${BATS_FILE_TMPDIR}/$(basename "${storage_config_template}").embedqa.XXX") + PV_NAME=trusted-block-pv-embedqa PVC_NAME=trusted-pvc-embedqa \ + PV_STORAGE_CAPACITY="${embedqa_storage_mib}Mi" PVC_STORAGE_REQUEST="${embedqa_storage_mib}Mi" \ + LOCAL_DEVICE="$local_device_embedqa" NODE_NAME="$node" \ + envsubst < "$storage_config_template" > "$storage_config_embedqa" + retry_kubectl_apply "$storage_config_embedqa" + fi fi create_inference_pod if [ "${SKIP_MULTI_GPU_TESTS}" != "true" ]; then - create_embedqa_pod + create_embedqa_pod fi } @@ -459,5 +485,13 @@ teardown_file() { [ -f "${POD_EMBEDQA_YAML}" ] && kubectl delete -f "${POD_EMBEDQA_YAML}" --ignore-not-found=true fi + if [[ "${TEE}" = "true" ]]; then + kubectl delete --ignore-not-found pvc trusted-pvc-instruct trusted-pvc-embedqa + kubectl delete --ignore-not-found pv trusted-block-pv-instruct trusted-block-pv-embedqa + kubectl delete --ignore-not-found storageclass local-storage + cleanup_loop_device /tmp/trusted-image-storage-instruct.img || true + cleanup_loop_device /tmp/trusted-image-storage-embedqa.img || true + fi + print_node_journal_since_test_start "${node}" "${node_start_time:-}" "${BATS_TEST_COMPLETED:-}" >&3 } diff --git a/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-1-8b-instruct-tee.yaml.in b/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-1-8b-instruct-tee.yaml.in index 8207604c7e..f86969217b 100644 --- a/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-1-8b-instruct-tee.yaml.in +++ b/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-1-8b-instruct-tee.yaml.in @@ -69,14 +69,20 @@ spec: limits: nvidia.com/pgpu: "1" cpu: "16" - memory: "64Gi" + memory: "48Gi" volumeMounts: - name: nim-trusted-cache mountPath: /opt/nim/.cache + volumeDevices: + - devicePath: /dev/trusted_store + name: trusted-storage volumes: - name: nim-trusted-cache emptyDir: sizeLimit: 64Gi + - name: trusted-storage + persistentVolumeClaim: + claimName: trusted-pvc-instruct --- apiVersion: v1 kind: Secret diff --git a/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-2-nv-embedqa-1b-v2-tee.yaml.in b/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-2-nv-embedqa-1b-v2-tee.yaml.in index 7bc15daf97..7685afdc5c 100644 --- a/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-2-nv-embedqa-1b-v2-tee.yaml.in +++ b/tests/integration/kubernetes/runtimeclass_workloads/nvidia-nim-llama-3-2-nv-embedqa-1b-v2-tee.yaml.in @@ -83,10 +83,16 @@ spec: volumeMounts: - name: nim-trusted-cache mountPath: /opt/nim/.cache + volumeDevices: + - devicePath: /dev/trusted_store + name: trusted-storage volumes: - name: nim-trusted-cache emptyDir: sizeLimit: 40Gi + - name: trusted-storage + persistentVolumeClaim: + claimName: trusted-pvc-embedqa --- apiVersion: v1 kind: Secret