mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-04-02 18:13:57 +00:00
tests: gpu: use container image layer storage
Use the container image layer storage feature for the k8s-nvidia-nim.bats test pod manifests. This reduces the pods' memory requirements. Signed-off-by: Manuel Huber <manuelh@nvidia.com>
This commit is contained in:
committed by
Fabiano Fidêncio
parent
b6cf00a374
commit
177f5c308e
@@ -118,7 +118,7 @@ function is_confidential_gpu_hardware() {
|
||||
|
||||
# create_loop_device creates a loop device backed by a file.
|
||||
# $1: loop file path (default: /tmp/trusted-image-storage.img)
|
||||
# $2: size in MB (default: 2500, i.e. ~2.5Gi; use 30720 for ~30Gi)
|
||||
# $2: size in MiB, i.e. dd bs=1M count=... (default: 2500, ~2.4Gi)
|
||||
function create_loop_device(){
|
||||
local loop_file="${1:-/tmp/trusted-image-storage.img}"
|
||||
local size_mb="${2:-2500}"
|
||||
|
||||
@@ -85,6 +85,8 @@ setup_langchain_flow() {
|
||||
# generated policy.rego to it and set it as the cc_init_data annotation.
|
||||
# We must overwrite the default empty file AFTER create_tmp_policy_settings_dir()
|
||||
# copies it to the temp directory.
|
||||
# As we use multiple vCPUs we set `max_concurrent_layer_downloads_per_image = 1`,
|
||||
# see: https://github.com/kata-containers/kata-containers/issues/12721
|
||||
create_nim_initdata_file() {
|
||||
local output_file="$1"
|
||||
local cc_kbs_address
|
||||
@@ -107,6 +109,7 @@ name = "cc_kbc"
|
||||
url = "${cc_kbs_address}"
|
||||
|
||||
[image]
|
||||
max_concurrent_layer_downloads_per_image = 1
|
||||
authenticated_registry_credentials_uri = "kbs:///default/credentials/nvcr"
|
||||
'''
|
||||
EOF
|
||||
@@ -189,12 +192,35 @@ setup_file() {
|
||||
# This must happen AFTER create_tmp_policy_settings_dir() copies the empty
|
||||
# file and BEFORE auto_generate_policy() runs.
|
||||
create_nim_initdata_file "${policy_settings_dir}/default-initdata.toml"
|
||||
|
||||
# Container image layer storage: one block device and PV/PVC per pod.
|
||||
storage_config_template="${pod_config_dir}/confidential/trusted-storage.yaml.in"
|
||||
|
||||
instruct_storage_mib=57344
|
||||
local_device_instruct=$(create_loop_device /tmp/trusted-image-storage-instruct.img "$instruct_storage_mib")
|
||||
storage_config_instruct=$(mktemp "${BATS_FILE_TMPDIR}/$(basename "${storage_config_template}").instruct.XXX")
|
||||
PV_NAME=trusted-block-pv-instruct PVC_NAME=trusted-pvc-instruct \
|
||||
PV_STORAGE_CAPACITY="${instruct_storage_mib}Mi" PVC_STORAGE_REQUEST="${instruct_storage_mib}Mi" \
|
||||
LOCAL_DEVICE="$local_device_instruct" NODE_NAME="$node" \
|
||||
envsubst < "$storage_config_template" > "$storage_config_instruct"
|
||||
retry_kubectl_apply "$storage_config_instruct"
|
||||
|
||||
if [ "${SKIP_MULTI_GPU_TESTS}" != "true" ]; then
|
||||
embedqa_storage_mib=8192
|
||||
local_device_embedqa=$(create_loop_device /tmp/trusted-image-storage-embedqa.img "$embedqa_storage_mib")
|
||||
storage_config_embedqa=$(mktemp "${BATS_FILE_TMPDIR}/$(basename "${storage_config_template}").embedqa.XXX")
|
||||
PV_NAME=trusted-block-pv-embedqa PVC_NAME=trusted-pvc-embedqa \
|
||||
PV_STORAGE_CAPACITY="${embedqa_storage_mib}Mi" PVC_STORAGE_REQUEST="${embedqa_storage_mib}Mi" \
|
||||
LOCAL_DEVICE="$local_device_embedqa" NODE_NAME="$node" \
|
||||
envsubst < "$storage_config_template" > "$storage_config_embedqa"
|
||||
retry_kubectl_apply "$storage_config_embedqa"
|
||||
fi
|
||||
fi
|
||||
|
||||
create_inference_pod
|
||||
|
||||
if [ "${SKIP_MULTI_GPU_TESTS}" != "true" ]; then
|
||||
create_embedqa_pod
|
||||
create_embedqa_pod
|
||||
fi
|
||||
}
|
||||
|
||||
@@ -459,5 +485,13 @@ teardown_file() {
|
||||
[ -f "${POD_EMBEDQA_YAML}" ] && kubectl delete -f "${POD_EMBEDQA_YAML}" --ignore-not-found=true
|
||||
fi
|
||||
|
||||
if [[ "${TEE}" = "true" ]]; then
|
||||
kubectl delete --ignore-not-found pvc trusted-pvc-instruct trusted-pvc-embedqa
|
||||
kubectl delete --ignore-not-found pv trusted-block-pv-instruct trusted-block-pv-embedqa
|
||||
kubectl delete --ignore-not-found storageclass local-storage
|
||||
cleanup_loop_device /tmp/trusted-image-storage-instruct.img || true
|
||||
cleanup_loop_device /tmp/trusted-image-storage-embedqa.img || true
|
||||
fi
|
||||
|
||||
print_node_journal_since_test_start "${node}" "${node_start_time:-}" "${BATS_TEST_COMPLETED:-}" >&3
|
||||
}
|
||||
|
||||
@@ -69,14 +69,20 @@ spec:
|
||||
limits:
|
||||
nvidia.com/pgpu: "1"
|
||||
cpu: "16"
|
||||
memory: "64Gi"
|
||||
memory: "48Gi"
|
||||
volumeMounts:
|
||||
- name: nim-trusted-cache
|
||||
mountPath: /opt/nim/.cache
|
||||
volumeDevices:
|
||||
- devicePath: /dev/trusted_store
|
||||
name: trusted-storage
|
||||
volumes:
|
||||
- name: nim-trusted-cache
|
||||
emptyDir:
|
||||
sizeLimit: 64Gi
|
||||
- name: trusted-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: trusted-pvc-instruct
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
|
||||
@@ -83,10 +83,16 @@ spec:
|
||||
volumeMounts:
|
||||
- name: nim-trusted-cache
|
||||
mountPath: /opt/nim/.cache
|
||||
volumeDevices:
|
||||
- devicePath: /dev/trusted_store
|
||||
name: trusted-storage
|
||||
volumes:
|
||||
- name: nim-trusted-cache
|
||||
emptyDir:
|
||||
sizeLimit: 40Gi
|
||||
- name: trusted-storage
|
||||
persistentVolumeClaim:
|
||||
claimName: trusted-pvc-embedqa
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
|
||||
Reference in New Issue
Block a user