gpu: Add driver version selection

Besides the latest and lts options, add an option to specify
the exact driver version.

Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
This commit is contained in:
Zvonko Kaiser 2025-01-16 23:31:20 +00:00
parent 311c3638c6
commit f153229865
3 changed files with 71 additions and 18 deletions

View File

@ -13,14 +13,53 @@ shopt -s extglob
run_file_name=$2
run_fm_file_name=$3
arch_target=$4
driver_version="$5"
driver_type="open"
nvidia_gpu_stack="$5"
driver_version=""
driver_type="-open"
supported_gpu_devids="/supported-gpu.devids"
APT_INSTALL="apt -o Dpkg::Options::='--force-confdef' -o Dpkg::Options::='--force-confold' -yqq --no-install-recommends install"
export DEBIAN_FRONTEND=noninteractive
# Report whether a feature is listed in the comma-separated
# nvidia_gpu_stack option list.
#
# Globals:   nvidia_gpu_stack (read)
# Arguments: $1 - feature name to look for
# Returns:   0 if the feature is a whole entry of the list, 1 otherwise
is_feature_enabled() {
	local feature="$1"

	# Both sides are wrapped in commas so that only complete list entries
	# match; the quoted pattern part is compared literally.
	case ",$nvidia_gpu_stack," in
		*",$feature,"*) return 0 ;;
		*) return 1 ;;
	esac
}
# Derive driver_version and driver_type from the comma-separated
# nvidia_gpu_stack option list.
#
# Globals:
#   nvidia_gpu_stack (read)    - option list, e.g. "latest,compute,driver=open"
#   driver_version   (written) - "latest", "lts", or the value given as
#                                version=<VERSION>
#   driver_type      (written) - "-open" for driver=open, "" for driver=closed;
#                                left untouched when neither is listed
# Outputs: progress messages on stdout, the fatal diagnostic on stderr
# Exits:   with status 1 when no driver version spec is present
set_driver_version_type() {
	echo "chroot: Setting the correct driver version"

	# Precedence is latest, then lts, then an explicit version=<VERSION>.
	if [[ ",$nvidia_gpu_stack," == *",latest,"* ]]; then
		driver_version="latest"
	elif [[ ",$nvidia_gpu_stack," == *",lts,"* ]]; then
		driver_version="lts"
	# Anchor the key to the start of a list entry, like the driver= checks
	# below, so that an entry such as "xversion=1" is not mistaken for a
	# version spec. Group 1 is the anchor, group 2 the version value.
	elif [[ "$nvidia_gpu_stack" =~ (^|,)version=([^,]+) ]]; then
		driver_version="${BASH_REMATCH[2]}"
	else
		echo "No known driver spec found. Please specify \"latest\", \"lts\", or \"version=<VERSION>\"." >&2
		exit 1
	fi

	echo "chroot: driver_version: ${driver_version}"

	echo "chroot: Setting the correct driver type"

	# driver -> enable open or closed drivers
	if [[ "$nvidia_gpu_stack" =~ (^|,)driver=open($|,) ]]; then
		driver_type="-open"
	elif [[ "$nvidia_gpu_stack" =~ (^|,)driver=closed($|,) ]]; then
		driver_type=""
	fi

	echo "chroot: driver_type: ${driver_type}"
}
install_nvidia_ctk() {
echo "chroot: Installing NVIDIA GPU container runtime"
apt list nvidia-container-toolkit-base -a
@ -29,6 +68,10 @@ install_nvidia_ctk() {
}
install_nvidia_fabricmanager() {
is_feature_enabled "nvswitch" || {
echo "chroot: Skipping NVIDIA fabricmanager installation"
return
}
# if run_fm_file_name exists run it
if [ -f /"${run_fm_file_name}" ]; then
install_nvidia_fabricmanager_from_run_file
@ -52,6 +95,11 @@ install_nvidia_fabricmanager_from_distribution() {
}
build_nvidia_drivers() {
is_feature_enabled "compute" || {
echo "chroot: Skipping NVIDIA drivers build"
return
}
echo "chroot: Build NVIDIA drivers"
pushd "${driver_source_files}" >> /dev/null
@ -129,7 +177,7 @@ prepare_distribution_drivers() {
fi
echo "chroot: Prepare NVIDIA distribution drivers"
eval "${APT_INSTALL}" nvidia-headless-no-dkms-"${driver_version}-${driver_type}" \
eval "${APT_INSTALL}" nvidia-headless-no-dkms-"${driver_version}${driver_type}" \
libnvidia-cfg1-"${driver_version}" \
nvidia-compute-utils-"${driver_version}" \
nvidia-utils-"${driver_version}" \
@ -152,7 +200,7 @@ prepare_nvidia_drivers() {
for source_dir in /NVIDIA-*; do
if [ -d "${source_dir}" ]; then
driver_source_files="${source_dir}"/kernel-${driver_type}
driver_source_files="${source_dir}"/kernel${driver_type}
driver_source_dir="${source_dir}"
break
fi
@ -245,6 +293,11 @@ export_driver_version() {
install_nvidia_dcgm() {
is_feature_enabled "dcgm" || {
echo "chroot: Skipping NVIDIA DCGM installation"
return
}
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
dpkg -i cuda-keyring_1.0-1_all.deb && rm -f cuda-keyring_1.0-1_all.deb
@ -292,11 +345,11 @@ cleanup_rootfs() {
apt purge -yqq jq make gcc wget libc6-dev git xz-utils curl gpg \
python3-pip software-properties-common ca-certificates \
linux-libc-dev nuitka python3-minimal cuda-keyring
linux-libc-dev nuitka python3-minimal
if [ -n "${driver_version}" ]; then
apt purge -yqq nvidia-headless-no-dkms-"${driver_version}-${driver_type}" \
nvidia-kernel-source-"${driver_version}-${driver_type}" -yqq
apt purge -yqq nvidia-headless-no-dkms-"${driver_version}${driver_type}" \
nvidia-kernel-source-"${driver_version}${driver_type}" -yqq
fi
apt autoremove -yqq
@ -325,7 +378,7 @@ cleanup_rootfs() {
# Start of script
echo "chroot: Setup NVIDIA GPU rootfs stage one"
set_driver_version_type
setup_apt_repositories
install_kernel_dependencies
install_build_dependencies

View File

@ -17,6 +17,7 @@ DEBUG=""
setup_nvidia-nvrc() {
local TARGET="nvidia-nvrc"
local TARGET_VERSION="main"
local PROJECT="nvrc"
local TARGET_BUILD_DIR="${BUILD_DIR}/${TARGET}/builddir"
local TARGET_DEST_DIR="${BUILD_DIR}/${TARGET}/destdir"
@ -45,6 +46,7 @@ setup_nvidia-nvrc() {
setup_nvidia-gpu-admin-tools() {
local TARGET="nvidia-gpu-admin-tools"
local TARGET_VERSION="v2024.12.06"
local TARGET_GIT="https://github.com/NVIDIA/gpu-admin-tools"
local TARGET_BUILD_DIR="${BUILD_DIR}/${TARGET}/builddir"
local TARGET_DEST_DIR="${BUILD_DIR}/${TARGET}/destdir"
@ -72,6 +74,7 @@ setup_nvidia-gpu-admin-tools() {
setup_nvidia-dcgm-exporter() {
local TARGET="nvidia-dcgm-exporter"
local TARGET_VERSION="3.3.9-3.6.1"
local TARGET_BUILD_DIR="${BUILD_DIR}/${TARGET}/builddir"
local TARGET_DEST_DIR="${BUILD_DIR}/${TARGET}/destdir"
local TARBALL="${BUILD_DIR}/kata-static-${TARGET}.tar.zst"
@ -85,7 +88,7 @@ setup_nvidia-dcgm-exporter() {
local dex="dcgm-exporter"
rm -rf "${dex}"
git clone https://github.com/NVIDIA/${dex}
git clone --branch "${TARGET_VERSION}" https://github.com/NVIDIA/${dex}
make -C ${dex} binary
mkdir -p ../destdir/bin
@ -151,14 +154,8 @@ setup_nvidia_gpu_rootfs_stage_one() {
mount --make-rslave ./dev
mount -t proc /proc ./proc
local driver_version="latest"
if echo "$NVIDIA_GPU_STACK" | grep -q '\<latest\>'; then
driver_version="latest"
elif echo "$NVIDIA_GPU_STACK" | grep -q '\<lts\>'; then
driver_version="lts"
fi
chroot . /bin/bash -c "/nvidia_chroot.sh $(uname -r) ${run_file_name} ${run_fm_file_name} ${ARCH} ${driver_version}"
chroot . /bin/bash -c "/nvidia_chroot.sh $(uname -r) ${run_file_name} \
${run_fm_file_name} ${ARCH} ${NVIDIA_GPU_STACK}"
umount -R ./dev
umount ./proc

View File

@ -475,7 +475,10 @@ install_initrd_confidential() {
# For all nvidia_gpu targets we can customize the stack that is enabled
# in the VM by setting the NVIDIA_GPU_STACK= environment variable
#
# latest | lts -> use the latest and greatest driver or lts release
# latest | lts | version
# -> use the latest and greatest driver,
# lts release or e.g. version=550.127.1
# driver -> enable open or closed drivers
# debug -> enable debugging support
# compute -> enable the compute GPU stack, includes utility
# graphics -> enable the graphics GPU stack, includes compute