diff --git a/tools/osbuilder/rootfs-builder/nvidia/nvidia_chroot.sh b/tools/osbuilder/rootfs-builder/nvidia/nvidia_chroot.sh index 8193bba47..efc6a6437 100644 --- a/tools/osbuilder/rootfs-builder/nvidia/nvidia_chroot.sh +++ b/tools/osbuilder/rootfs-builder/nvidia/nvidia_chroot.sh @@ -5,7 +5,7 @@ # SPDX-License-Identifier: Apache-2.0 #!/bin/bash -set -xe +set -xeuo pipefail shopt -s nullglob shopt -s extglob @@ -13,14 +13,53 @@ shopt -s extglob run_file_name=$2 run_fm_file_name=$3 arch_target=$4 -driver_version="$5" -driver_type="open" +nvidia_gpu_stack="$5" +driver_version="" +driver_type="-open" supported_gpu_devids="/supported-gpu.devids" APT_INSTALL="apt -o Dpkg::Options::='--force-confdef' -o Dpkg::Options::='--force-confold' -yqq --no-install-recommends install" export DEBIAN_FRONTEND=noninteractive +is_feature_enabled() { + local feature="$1" + # Check if feature is in the comma-separated list + if [[ ",$nvidia_gpu_stack," == *",$feature,"* ]]; then + return 0 + else + return 1 + fi +} + +set_driver_version_type() { + echo "chroot: Setting the correct driver version" + + if [[ ",$nvidia_gpu_stack," == *",latest,"* ]]; then + driver_version="latest" + elif [[ ",$nvidia_gpu_stack," == *",lts,"* ]]; then + driver_version="lts" + elif [[ "$nvidia_gpu_stack" =~ version=([^,]+) ]]; then + driver_version="${BASH_REMATCH[1]}" + else + echo "No known driver spec found. Please specify \"latest\", \"lts\", or \"version=\"." + exit 1 + fi + + echo "chroot: driver_version: ${driver_version}" + + echo "chroot: Setting the correct driver type" + + # driver -> enable open or closed drivers + if [[ "$nvidia_gpu_stack" =~ (^|,)driver=open($|,) ]]; then + driver_type="-open" + elif [[ "$nvidia_gpu_stack" =~ (^|,)driver=closed($|,) ]]; then + driver_type="" + fi + + echo "chroot: driver_type: ${driver_type}" +} + install_nvidia_ctk() { echo "chroot: Installing NVIDIA GPU container runtime" apt list nvidia-container-toolkit-base -a @@ -29,6 +68,10 @@ install_nvidia_ctk() { } install_nvidia_fabricmanager() { + is_feature_enabled "nvswitch" || { + echo "chroot: Skipping NVIDIA fabricmanager installation" + return + } # if run_fm_file_name exists run it if [ -f /"${run_fm_file_name}" ]; then install_nvidia_fabricmanager_from_run_file @@ -52,6 +95,11 @@ install_nvidia_fabricmanager_from_distribution() { } build_nvidia_drivers() { + is_feature_enabled "compute" || { + echo "chroot: Skipping NVIDIA drivers build" + return + } + echo "chroot: Build NVIDIA drivers" pushd "${driver_source_files}" >> /dev/null @@ -129,7 +177,7 @@ prepare_distribution_drivers() { fi echo "chroot: Prepare NVIDIA distribution drivers" - eval "${APT_INSTALL}" nvidia-headless-no-dkms-"${driver_version}-${driver_type}" \ + eval "${APT_INSTALL}" nvidia-headless-no-dkms-"${driver_version}${driver_type}" \ libnvidia-cfg1-"${driver_version}" \ nvidia-compute-utils-"${driver_version}" \ nvidia-utils-"${driver_version}" \ @@ -152,7 +200,7 @@ prepare_nvidia_drivers() { for source_dir in /NVIDIA-*; do if [ -d "${source_dir}" ]; then - driver_source_files="${source_dir}"/kernel-${driver_type} + driver_source_files="${source_dir}"/kernel${driver_type} driver_source_dir="${source_dir}" break fi @@ -245,6 +293,11 @@ export_driver_version() { install_nvidia_dcgm() { + is_feature_enabled "dcgm" || { + echo "chroot: Skipping NVIDIA DCGM installation" + return + } + curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb dpkg -i cuda-keyring_1.0-1_all.deb && rm -f cuda-keyring_1.0-1_all.deb @@ -292,11 +345,11 @@ cleanup_rootfs() { apt purge -yqq jq make gcc wget libc6-dev git xz-utils curl gpg \ python3-pip software-properties-common ca-certificates \ - linux-libc-dev nuitka python3-minimal cuda-keyring + linux-libc-dev nuitka python3-minimal if [ -n "${driver_version}" ]; then - apt purge -yqq nvidia-headless-no-dkms-"${driver_version}-${driver_type}" \ - nvidia-kernel-source-"${driver_version}-${driver_type}" -yqq + apt purge -yqq nvidia-headless-no-dkms-"${driver_version}${driver_type}" \ + nvidia-kernel-source-"${driver_version}${driver_type}" -yqq fi apt autoremove -yqq @@ -325,7 +378,7 @@ cleanup_rootfs() { # Start of script echo "chroot: Setup NVIDIA GPU rootfs stage one" - +set_driver_version_type setup_apt_repositories install_kernel_dependencies install_build_dependencies diff --git a/tools/osbuilder/rootfs-builder/nvidia/nvidia_rootfs.sh b/tools/osbuilder/rootfs-builder/nvidia/nvidia_rootfs.sh index 6e46c1dc0..e0b06736a 100644 --- a/tools/osbuilder/rootfs-builder/nvidia/nvidia_rootfs.sh +++ b/tools/osbuilder/rootfs-builder/nvidia/nvidia_rootfs.sh @@ -4,7 +4,7 @@ # # SPDX-License-Identifier: Apache-2.0 -set -e +set -euo pipefail [ -n "$DEBUG" ] && set -x readonly BUILD_DIR="/kata-containers/tools/packaging/kata-deploy/local-build/build/" @@ -17,6 +17,7 @@ DEBUG="" setup_nvidia-nvrc() { local TARGET="nvidia-nvrc" + local TARGET_VERSION="main" local PROJECT="nvrc" local TARGET_BUILD_DIR="${BUILD_DIR}/${TARGET}/builddir" local TARGET_DEST_DIR="${BUILD_DIR}/${TARGET}/destdir" @@ -45,6 +46,7 @@ setup_nvidia-nvrc() { setup_nvidia-gpu-admin-tools() { local TARGET="nvidia-gpu-admin-tools" + local TARGET_VERSION="v2024.12.06" local TARGET_GIT="https://github.com/NVIDIA/gpu-admin-tools" local TARGET_BUILD_DIR="${BUILD_DIR}/${TARGET}/builddir" local TARGET_DEST_DIR="${BUILD_DIR}/${TARGET}/destdir" @@ -72,6 +74,7 @@ setup_nvidia-gpu-admin-tools() { setup_nvidia-dcgm-exporter() { local TARGET="nvidia-dcgm-exporter" + local TARGET_VERSION="3.3.9-3.6.1" local TARGET_BUILD_DIR="${BUILD_DIR}/${TARGET}/builddir" local TARGET_DEST_DIR="${BUILD_DIR}/${TARGET}/destdir" local TARBALL="${BUILD_DIR}/kata-static-${TARGET}.tar.zst" @@ -85,7 +88,7 @@ setup_nvidia-dcgm-exporter() { local dex="dcgm-exporter" rm -rf "${dex}" - git clone https://github.com/NVIDIA/${dex} + git clone --branch "${TARGET_VERSION}" https://github.com/NVIDIA/${dex} make -C ${dex} binary mkdir -p ../destdir/bin @@ -151,14 +154,8 @@ setup_nvidia_gpu_rootfs_stage_one() { mount --make-rslave ./dev mount -t proc /proc ./proc - local driver_version="latest" - if echo "$NVIDIA_GPU_STACK" | grep -q '\'; then - driver_version="latest" - elif echo "$NVIDIA_GPU_STACK" | grep -q '\'; then - driver_version="lts" - fi - - chroot . /bin/bash -c "/nvidia_chroot.sh $(uname -r) ${run_file_name} ${run_fm_file_name} ${ARCH} ${driver_version}" + chroot . /bin/bash -c "/nvidia_chroot.sh $(uname -r) ${run_file_name} \ + ${run_fm_file_name} ${ARCH} ${NVIDIA_GPU_STACK}" umount -R ./dev umount ./proc @@ -256,15 +253,15 @@ chisseled_init() { tar xvf "${BUILD_DIR}"/kata-static-busybox.tar.xz -C . mkdir -p dev etc proc run/cdi sys tmp usr var lib/modules lib/firmware \ - usr/share/nvidia lib/x86_64-linux-gnu lib64 + usr/share/nvidia lib/x86_64-linux-gnu lib64 usr/bin ln -sf ../run var/run tar xvf "${BUILD_DIR}"/kata-static-nvidia-nvrc.tar.zst -C . - ln -sf /bin/NVRC init + ln -sf /bin/NVRC sbin/init - cp -a "${stage_one}"/sbin/init sbin/. + cp -a "${stage_one}"/usr/bin/kata-agent usr/bin/. cp -a "${stage_one}"/etc/kata-opa etc/. cp -a "${stage_one}"/etc/resolv.conf etc/. cp -a "${stage_one}"/supported-gpu.devids . diff --git a/tools/osbuilder/rootfs-builder/rootfs.sh b/tools/osbuilder/rootfs-builder/rootfs.sh index e377e25f7..0a840b131 100755 --- a/tools/osbuilder/rootfs-builder/rootfs.sh +++ b/tools/osbuilder/rootfs-builder/rootfs.sh @@ -43,6 +43,16 @@ if [[ "${AGENT_POLICY}" == "yes" ]]; then agent_policy_file="$(readlink -f -v "${AGENT_POLICY_FILE:-"${script_dir}/../../../src/kata-opa/allow-all.rego"}")" fi +INSIDE_CONTAINER=${INSIDE_CONTAINER:-""} +IMAGE_REGISTRY=${IMAGE_REGISTRY:-""} +http_proxy=${http_proxy:-""} +https_proxy=${https_proxy:-""} +AGENT_POLICY_FILE=${AGENT_POLICY_FILE:-""} +GRACEFUL_EXIT=${GRACEFUL_EXIT:-""} +USE_DOCKER=${USE_DOCKER:-""} +USE_PODMAN=${USE_PODMAN:-""} +EXTRA_PKGS=${EXTRA_PKGS:-""} + NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-""} nvidia_rootfs="${script_dir}/nvidia/nvidia_rootfs.sh" source "$nvidia_rootfs" diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh index eccfcb9e8..ece0911c3 100755 --- a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh +++ b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh @@ -475,7 +475,10 @@ install_initrd_confidential() { # For all nvidia_gpu targets we can customize the stack that is enbled # in the VM by setting the NVIDIA_GPU_STACK= environment variable # -# latest | lts -> use the latest and greatest driver or lts release +# latest | lts | version +# -> use the latest and greatest driver, +# lts release or e.g. version=550.127.1 +# driver -> enable open or closed drivers # debug -> enable debugging support # compute -> enable the compute GPU stack, includes utility # graphics -> enable the graphics GPU stack, includes compute @@ -491,7 +494,6 @@ install_initrd_confidential() { # Install NVIDIA GPU image install_image_nvidia_gpu() { export AGENT_POLICY="yes" - export AGENT_INIT="yes" export EXTRA_PKGS="apt" NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"latest,compute,dcgm"} install_image "nvidia-gpu" @@ -500,7 +502,6 @@ install_image_nvidia_gpu() { # Install NVIDIA GPU initrd install_initrd_nvidia_gpu() { export AGENT_POLICY="yes" - export AGENT_INIT="yes" export EXTRA_PKGS="apt" NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"latest,compute,dcgm"} install_initrd "nvidia-gpu" @@ -509,7 +510,6 @@ install_initrd_nvidia_gpu() { # Instal NVIDIA GPU confidential image install_image_nvidia_gpu_confidential() { export AGENT_POLICY="yes" - export AGENT_INIT="yes" export EXTRA_PKGS="apt" # TODO: export MEASURED_ROOTFS=yes NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"latest,compute"} @@ -519,7 +519,6 @@ install_image_nvidia_gpu_confidential() { # Install NVIDIA GPU confidential initrd install_initrd_nvidia_gpu_confidential() { export AGENT_POLICY="yes" - export AGENT_INIT="yes" export EXTRA_PKGS="apt" # TODO: export MEASURED_ROOTFS=yes NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"latest,compute"}