From f153229865dd757f292ecc3e9cc58ed74abee569 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 16 Jan 2025 23:31:20 +0000 Subject: [PATCH 1/5] gpu: Add driver version selection Besides latest and lts options add an option to specify the exact driver version. Signed-off-by: Zvonko Kaiser --- .../rootfs-builder/nvidia/nvidia_chroot.sh | 69 ++++++++++++++++--- .../rootfs-builder/nvidia/nvidia_rootfs.sh | 15 ++-- .../local-build/kata-deploy-binaries.sh | 5 +- 3 files changed, 71 insertions(+), 18 deletions(-) diff --git a/tools/osbuilder/rootfs-builder/nvidia/nvidia_chroot.sh b/tools/osbuilder/rootfs-builder/nvidia/nvidia_chroot.sh index 8193bba47..6488a8956 100644 --- a/tools/osbuilder/rootfs-builder/nvidia/nvidia_chroot.sh +++ b/tools/osbuilder/rootfs-builder/nvidia/nvidia_chroot.sh @@ -13,14 +13,53 @@ shopt -s extglob run_file_name=$2 run_fm_file_name=$3 arch_target=$4 -driver_version="$5" -driver_type="open" +nvidia_gpu_stack="$5" +driver_version="" +driver_type="-open" supported_gpu_devids="/supported-gpu.devids" APT_INSTALL="apt -o Dpkg::Options::='--force-confdef' -o Dpkg::Options::='--force-confold' -yqq --no-install-recommends install" export DEBIAN_FRONTEND=noninteractive +is_feature_enabled() { + local feature="$1" + # Check if feature is in the comma-separated list + if [[ ",$nvidia_gpu_stack," == *",$feature,"* ]]; then + return 0 + else + return 1 + fi +} + +set_driver_version_type() { + echo "chroot: Setting the correct driver version" + + if [[ ",$nvidia_gpu_stack," == *",latest,"* ]]; then + driver_version="latest" + elif [[ ",$nvidia_gpu_stack," == *",lts,"* ]]; then + driver_version="lts" + elif [[ "$nvidia_gpu_stack" =~ version=([^,]+) ]]; then + driver_version="${BASH_REMATCH[1]}" + else + echo "No known driver spec found. Please specify \"latest\", \"lts\", or \"version=\"." 
+ exit 1 + fi + + echo "chroot: driver_version: ${driver_version}" + + echo "chroot: Setting the correct driver type" + + # driver -> enable open or closed drivers + if [[ "$nvidia_gpu_stack" =~ (^|,)driver=open($|,) ]]; then + driver_type="-open" + elif [[ "$nvidia_gpu_stack" =~ (^|,)driver=closed($|,) ]]; then + driver_type="" + fi + + echo "chroot: driver_type: ${driver_type}" +} + install_nvidia_ctk() { echo "chroot: Installing NVIDIA GPU container runtime" apt list nvidia-container-toolkit-base -a @@ -29,6 +68,10 @@ install_nvidia_ctk() { } install_nvidia_fabricmanager() { + is_feature_enabled "nvswitch" || { + echo "chroot: Skipping NVIDIA fabricmanager installation" + return + } # if run_fm_file_name exists run it if [ -f /"${run_fm_file_name}" ]; then install_nvidia_fabricmanager_from_run_file @@ -52,6 +95,11 @@ install_nvidia_fabricmanager_from_distribution() { } build_nvidia_drivers() { + is_feature_enabled "compute" || { + echo "chroot: Skipping NVIDIA drivers build" + return + } + echo "chroot: Build NVIDIA drivers" pushd "${driver_source_files}" >> /dev/null @@ -129,7 +177,7 @@ prepare_distribution_drivers() { fi echo "chroot: Prepare NVIDIA distribution drivers" - eval "${APT_INSTALL}" nvidia-headless-no-dkms-"${driver_version}-${driver_type}" \ + eval "${APT_INSTALL}" nvidia-headless-no-dkms-"${driver_version}${driver_type}" \ libnvidia-cfg1-"${driver_version}" \ nvidia-compute-utils-"${driver_version}" \ nvidia-utils-"${driver_version}" \ @@ -152,7 +200,7 @@ prepare_nvidia_drivers() { for source_dir in /NVIDIA-*; do if [ -d "${source_dir}" ]; then - driver_source_files="${source_dir}"/kernel-${driver_type} + driver_source_files="${source_dir}"/kernel${driver_type} driver_source_dir="${source_dir}" break fi @@ -245,6 +293,11 @@ export_driver_version() { install_nvidia_dcgm() { + is_feature_enabled "dcgm" || { + echo "chroot: Skipping NVIDIA DCGM installation" + return + } + curl -O 
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb dpkg -i cuda-keyring_1.0-1_all.deb && rm -f cuda-keyring_1.0-1_all.deb @@ -292,11 +345,11 @@ cleanup_rootfs() { apt purge -yqq jq make gcc wget libc6-dev git xz-utils curl gpg \ python3-pip software-properties-common ca-certificates \ - linux-libc-dev nuitka python3-minimal cuda-keyring + linux-libc-dev nuitka python3-minimal if [ -n "${driver_version}" ]; then - apt purge -yqq nvidia-headless-no-dkms-"${driver_version}-${driver_type}" \ - nvidia-kernel-source-"${driver_version}-${driver_type}" -yqq + apt purge -yqq nvidia-headless-no-dkms-"${driver_version}${driver_type}" \ + nvidia-kernel-source-"${driver_version}${driver_type}" -yqq fi apt autoremove -yqq @@ -325,7 +378,7 @@ cleanup_rootfs() { # Start of script echo "chroot: Setup NVIDIA GPU rootfs stage one" - +set_driver_version_type setup_apt_repositories install_kernel_dependencies install_build_dependencies diff --git a/tools/osbuilder/rootfs-builder/nvidia/nvidia_rootfs.sh b/tools/osbuilder/rootfs-builder/nvidia/nvidia_rootfs.sh index 6e46c1dc0..d401d3ba7 100644 --- a/tools/osbuilder/rootfs-builder/nvidia/nvidia_rootfs.sh +++ b/tools/osbuilder/rootfs-builder/nvidia/nvidia_rootfs.sh @@ -17,6 +17,7 @@ DEBUG="" setup_nvidia-nvrc() { local TARGET="nvidia-nvrc" + local TARGET_VERSION="main" local PROJECT="nvrc" local TARGET_BUILD_DIR="${BUILD_DIR}/${TARGET}/builddir" local TARGET_DEST_DIR="${BUILD_DIR}/${TARGET}/destdir" @@ -45,6 +46,7 @@ setup_nvidia-nvrc() { setup_nvidia-gpu-admin-tools() { local TARGET="nvidia-gpu-admin-tools" + local TARGET_VERSION="v2024.12.06" local TARGET_GIT="https://github.com/NVIDIA/gpu-admin-tools" local TARGET_BUILD_DIR="${BUILD_DIR}/${TARGET}/builddir" local TARGET_DEST_DIR="${BUILD_DIR}/${TARGET}/destdir" @@ -72,6 +74,7 @@ setup_nvidia-gpu-admin-tools() { setup_nvidia-dcgm-exporter() { local TARGET="nvidia-dcgm-exporter" + local TARGET_VERSION="3.3.9-3.6.1" local 
TARGET_BUILD_DIR="${BUILD_DIR}/${TARGET}/builddir" local TARGET_DEST_DIR="${BUILD_DIR}/${TARGET}/destdir" local TARBALL="${BUILD_DIR}/kata-static-${TARGET}.tar.zst" @@ -85,7 +88,7 @@ setup_nvidia-dcgm-exporter() { local dex="dcgm-exporter" rm -rf "${dex}" - git clone https://github.com/NVIDIA/${dex} + git clone --branch "${TARGET_VERSION}" https://github.com/NVIDIA/${dex} make -C ${dex} binary mkdir -p ../destdir/bin @@ -151,14 +154,8 @@ setup_nvidia_gpu_rootfs_stage_one() { mount --make-rslave ./dev mount -t proc /proc ./proc - local driver_version="latest" - if echo "$NVIDIA_GPU_STACK" | grep -q '\'; then - driver_version="latest" - elif echo "$NVIDIA_GPU_STACK" | grep -q '\'; then - driver_version="lts" - fi - - chroot . /bin/bash -c "/nvidia_chroot.sh $(uname -r) ${run_file_name} ${run_fm_file_name} ${ARCH} ${driver_version}" + chroot . /bin/bash -c "/nvidia_chroot.sh $(uname -r) ${run_file_name} \ + ${run_fm_file_name} ${ARCH} ${NVIDIA_GPU_STACK}" umount -R ./dev umount ./proc diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh index eccfcb9e8..8380efe4c 100755 --- a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh +++ b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh @@ -475,7 +475,10 @@ install_initrd_confidential() { # For all nvidia_gpu targets we can customize the stack that is enbled # in the VM by setting the NVIDIA_GPU_STACK= environment variable # -# latest | lts -> use the latest and greatest driver or lts release +# latest | lts | version +# -> use the latest and greatest driver, +# lts release or e.g. 
version=550.127.1 +# driver -> enable open or closed drivers # debug -> enable debugging support # compute -> enable the compute GPU stack, includes utility # graphics -> enable the graphics GPU stack, includes compute From 98e0dc16766de835c43849016d94e78d3924b01b Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 23 Jan 2025 16:06:33 +0000 Subject: [PATCH 2/5] gpu: Add set -u to scripts Make the scripts more robust by failing on unset variables Signed-off-by: Zvonko Kaiser --- tools/osbuilder/rootfs-builder/nvidia/nvidia_chroot.sh | 2 +- tools/osbuilder/rootfs-builder/nvidia/nvidia_rootfs.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/osbuilder/rootfs-builder/nvidia/nvidia_chroot.sh b/tools/osbuilder/rootfs-builder/nvidia/nvidia_chroot.sh index 6488a8956..efc6a6437 100644 --- a/tools/osbuilder/rootfs-builder/nvidia/nvidia_chroot.sh +++ b/tools/osbuilder/rootfs-builder/nvidia/nvidia_chroot.sh @@ -5,7 +5,7 @@ # SPDX-License-Identifier: Apache-2.0 #!/bin/bash -set -xe +set -xeuo pipefail shopt -s nullglob shopt -s extglob diff --git a/tools/osbuilder/rootfs-builder/nvidia/nvidia_rootfs.sh b/tools/osbuilder/rootfs-builder/nvidia/nvidia_rootfs.sh index d401d3ba7..224d59a5d 100644 --- a/tools/osbuilder/rootfs-builder/nvidia/nvidia_rootfs.sh +++ b/tools/osbuilder/rootfs-builder/nvidia/nvidia_rootfs.sh @@ -4,7 +4,7 @@ # # SPDX-License-Identifier: Apache-2.0 -set -e +set -euo pipefail [ -n "$DEBUG" ] && set -x readonly BUILD_DIR="/kata-containers/tools/packaging/kata-deploy/local-build/build/" From 10974b7bec1529d488cd663a671bce3297b11cf0 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 23 Jan 2025 16:09:00 +0000 Subject: [PATCH 3/5] gpu: AGENT_INIT=no We're setting globally for each initrd and image AGENT_INIT=no Signed-off-by: Zvonko Kaiser --- .../kata-deploy/local-build/kata-deploy-binaries.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh index 8380efe4c..73b3b7652 100755 --- a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh +++ b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh @@ -494,7 +494,7 @@ install_initrd_confidential() { # Install NVIDIA GPU image install_image_nvidia_gpu() { export AGENT_POLICY="yes" - export AGENT_INIT="yes" + export AGENT_INIT="no" export EXTRA_PKGS="apt" NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"latest,compute,dcgm"} install_image "nvidia-gpu" @@ -503,7 +503,7 @@ install_image_nvidia_gpu() { # Install NVIDIA GPU initrd install_initrd_nvidia_gpu() { export AGENT_POLICY="yes" - export AGENT_INIT="yes" + export AGENT_INIT="no" export EXTRA_PKGS="apt" NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"latest,compute,dcgm"} install_initrd "nvidia-gpu" @@ -512,7 +512,7 @@ install_initrd_nvidia_gpu() { # Instal NVIDIA GPU confidential image install_image_nvidia_gpu_confidential() { export AGENT_POLICY="yes" - export AGENT_INIT="yes" + export AGENT_INIT="no" export EXTRA_PKGS="apt" # TODO: export MEASURED_ROOTFS=yes NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"latest,compute"} @@ -522,7 +522,7 @@ install_image_nvidia_gpu_confidential() { # Install NVIDIA GPU confidential initrd install_initrd_nvidia_gpu_confidential() { export AGENT_POLICY="yes" - export AGENT_INIT="yes" + export AGENT_INIT="no" export EXTRA_PKGS="apt" # TODO: export MEASURED_ROOTFS=yes NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"latest,compute"} From cd7001612a15c5d9661778b988e8990bebc7b675 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 23 Jan 2025 19:48:18 +0000 Subject: [PATCH 4/5] gpu: rootfs adjust for AGENT_INIT=no Since we're defaulting to AGENT_INIT=no for all the initrd/images adapt the NV build to properly get kata-agent installed. 
Signed-off-by: Zvonko Kaiser --- tools/osbuilder/rootfs-builder/nvidia/nvidia_rootfs.sh | 6 +++--- .../kata-deploy/local-build/kata-deploy-binaries.sh | 4 ---- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/tools/osbuilder/rootfs-builder/nvidia/nvidia_rootfs.sh b/tools/osbuilder/rootfs-builder/nvidia/nvidia_rootfs.sh index 224d59a5d..e0b06736a 100644 --- a/tools/osbuilder/rootfs-builder/nvidia/nvidia_rootfs.sh +++ b/tools/osbuilder/rootfs-builder/nvidia/nvidia_rootfs.sh @@ -253,15 +253,15 @@ chisseled_init() { tar xvf "${BUILD_DIR}"/kata-static-busybox.tar.xz -C . mkdir -p dev etc proc run/cdi sys tmp usr var lib/modules lib/firmware \ - usr/share/nvidia lib/x86_64-linux-gnu lib64 + usr/share/nvidia lib/x86_64-linux-gnu lib64 usr/bin ln -sf ../run var/run tar xvf "${BUILD_DIR}"/kata-static-nvidia-nvrc.tar.zst -C . - ln -sf /bin/NVRC init + ln -sf /bin/NVRC sbin/init - cp -a "${stage_one}"/sbin/init sbin/. + cp -a "${stage_one}"/usr/bin/kata-agent usr/bin/. cp -a "${stage_one}"/etc/kata-opa etc/. cp -a "${stage_one}"/etc/resolv.conf etc/. cp -a "${stage_one}"/supported-gpu.devids . 
diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh index 73b3b7652..ece0911c3 100755 --- a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh +++ b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh @@ -494,7 +494,6 @@ install_initrd_confidential() { # Install NVIDIA GPU image install_image_nvidia_gpu() { export AGENT_POLICY="yes" - export AGENT_INIT="no" export EXTRA_PKGS="apt" NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"latest,compute,dcgm"} install_image "nvidia-gpu" @@ -503,7 +502,6 @@ install_image_nvidia_gpu() { # Install NVIDIA GPU initrd install_initrd_nvidia_gpu() { export AGENT_POLICY="yes" - export AGENT_INIT="no" export EXTRA_PKGS="apt" NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"latest,compute,dcgm"} install_initrd "nvidia-gpu" @@ -512,7 +510,6 @@ install_initrd_nvidia_gpu() { # Instal NVIDIA GPU confidential image install_image_nvidia_gpu_confidential() { export AGENT_POLICY="yes" - export AGENT_INIT="no" export EXTRA_PKGS="apt" # TODO: export MEASURED_ROOTFS=yes NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"latest,compute"} @@ -522,7 +519,6 @@ install_image_nvidia_gpu_confidential() { # Install NVIDIA GPU confidential initrd install_initrd_nvidia_gpu_confidential() { export AGENT_POLICY="yes" - export AGENT_INIT="no" export EXTRA_PKGS="apt" # TODO: export MEASURED_ROOTFS=yes NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"latest,compute"} From d2528ef84f2c297e47818815aad91175b8104fcd Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 23 Jan 2025 20:13:48 +0000 Subject: [PATCH 5/5] gpu: Initialize unbound variables rootfs.sh Since we're importing some build scripts for NVIDIA and setting set -u, we have some unbound variables in rootfs.sh; add initialization for those. 
Signed-off-by: Zvonko Kaiser --- tools/osbuilder/rootfs-builder/rootfs.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/osbuilder/rootfs-builder/rootfs.sh b/tools/osbuilder/rootfs-builder/rootfs.sh index e377e25f7..0a840b131 100755 --- a/tools/osbuilder/rootfs-builder/rootfs.sh +++ b/tools/osbuilder/rootfs-builder/rootfs.sh @@ -43,6 +43,16 @@ if [[ "${AGENT_POLICY}" == "yes" ]]; then agent_policy_file="$(readlink -f -v "${AGENT_POLICY_FILE:-"${script_dir}/../../../src/kata-opa/allow-all.rego"}")" fi +INSIDE_CONTAINER=${INSIDE_CONTAINER:-""} +IMAGE_REGISTRY=${IMAGE_REGISTRY:-""} +http_proxy=${http_proxy:-""} +https_proxy=${https_proxy:-""} +AGENT_POLICY_FILE=${AGENT_POLICY_FILE:-""} +GRACEFUL_EXIT=${GRACEFUL_EXIT:-""} +USE_DOCKER=${USE_DOCKER:-""} +USE_PODMAN=${USE_PODMAN:-""} +EXTRA_PKGS=${EXTRA_PKGS:-""} + NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-""} nvidia_rootfs="${script_dir}/nvidia/nvidia_rootfs.sh" source "$nvidia_rootfs"