mirror of
https://github.com/kata-containers/kata-containers.git
synced 2025-04-27 19:35:32 +00:00
Besides latest and lts options add an option to specify the exact driver version. Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
393 lines
12 KiB
Bash
393 lines
12 KiB
Bash
#!/usr/bin/env bash
#
# Copyright (c) 2024 NVIDIA Corporation
#
# SPDX-License-Identifier: Apache-2.0

# -x traces every command, -e aborts on the first unhandled failure.
set -xe

# nullglob: a pattern without matches expands to nothing, not to itself.
shopt -s nullglob
# extglob: enable extended pattern matching operators.
shopt -s extglob

# Positional arguments; $1 is not read by this script.
run_file_name=$2      # driver run file, looked up as /<name>
run_fm_file_name=$3   # fabricmanager run file, looked up as /<name>
arch_target=$4        # compared against "aarch64" and "x86_64"
# Comma-separated list carrying the driver spec ("latest", "lts" or
# "version=<VERSION>"), an optional "driver=open|closed" and the feature
# names queried through is_feature_enabled (compute, nvswitch, dcgm).
nvidia_gpu_stack="$5"

# Filled in by set_driver_version_type and the prepare_* functions.
driver_version=""
# Package/directory suffix: "-open" for the open kernel modules, "" for closed.
driver_type="-open"
# File that receives the list of supported GPU device ids.
supported_gpu_devids="/supported-gpu.devids"

# Kept as a string and run through eval because the dpkg options carry their
# own quoting.
APT_INSTALL="apt -o Dpkg::Options::='--force-confdef' -o Dpkg::Options::='--force-confold' -yqq --no-install-recommends install"

export DEBIAN_FRONTEND=noninteractive

# Report whether a feature is listed in the comma-separated nvidia_gpu_stack.
# Globals:   nvidia_gpu_stack (read)
# Arguments: $1 - feature name; must equal a whole list entry
# Returns:   0 if listed, 1 otherwise (an empty name is never enabled)
is_feature_enabled() {
	local feature="${1:-}"

	# Without this guard an empty name matches an empty list, because
	# ",," contains ",,".
	[[ -n "${feature}" ]] || return 1

	# Surrounding both sides with commas turns the substring test into a
	# whole-entry comparison.
	[[ ",${nvidia_gpu_stack}," == *",${feature},"* ]]
}

# Derive driver_version and driver_type from nvidia_gpu_stack.
# Globals:   nvidia_gpu_stack (read), driver_version (written),
#            driver_type (written only when a driver=open|closed entry exists)
# Outputs:   progress messages on stdout, the usage error on stderr
# Exits:     1 when no "latest", "lts" or "version=<VERSION>" entry is present
set_driver_version_type() {
	echo "chroot: Setting the correct driver version"

	# Precedence: latest, then lts, then an explicit version.
	if [[ ",${nvidia_gpu_stack}," == *",latest,"* ]]; then
		driver_version="latest"
	elif [[ ",${nvidia_gpu_stack}," == *",lts,"* ]]; then
		driver_version="lts"
	# Anchored at an entry boundary so that keys merely ending in "version="
	# are not mistaken for the driver version.
	elif [[ "${nvidia_gpu_stack}" =~ (^|,)version=([^,]+) ]]; then
		driver_version="${BASH_REMATCH[2]}"
	else
		echo "No known driver spec found. Please specify \"latest\", \"lts\", or \"version=<VERSION>\"." >&2
		exit 1
	fi

	echo "chroot: driver_version: ${driver_version}"

	echo "chroot: Setting the correct driver type"

	# driver -> enable open or closed drivers; without such an entry the
	# current value of driver_type is kept.
	if [[ "${nvidia_gpu_stack}" =~ (^|,)driver=open($|,) ]]; then
		driver_type="-open"
	elif [[ "${nvidia_gpu_stack}" =~ (^|,)driver=closed($|,) ]]; then
		driver_type=""
	fi

	echo "chroot: driver_type: ${driver_type}"
}

# Install the NVIDIA container toolkit base package through APT_INSTALL.
# Globals: APT_INSTALL (read)
install_nvidia_ctk() {
	local -r ctk_package="nvidia-container-toolkit-base"

	printf '%s\n' "chroot: Installing NVIDIA GPU container runtime"
	# List all available versions of the package in the build log.
	apt list "${ctk_package}" -a
	# Base gives a nvidia-ctk and the nvidia-container-runtime
	eval "${APT_INSTALL}" "${ctk_package}"
}

# Install fabricmanager when the "nvswitch" feature is enabled, preferring the
# run file at /<run_fm_file_name> and falling back to distribution packages.
# Globals: run_fm_file_name (read)
install_nvidia_fabricmanager() {
	if ! is_feature_enabled "nvswitch"; then
		echo "chroot: Skipping NVIDIA fabricmanager installation"
		return
	fi

	# No run file present -> use the distribution packages instead.
	if [ ! -f /"${run_fm_file_name}" ]; then
		install_nvidia_fabricmanager_from_distribution
	else
		install_nvidia_fabricmanager_from_run_file
	fi
}

# Make the fabricmanager run file executable and run it from / with --nox11.
# Globals: run_fm_file_name (read)
install_nvidia_fabricmanager_from_run_file() {
	local -r fm_installer="${run_fm_file_name}"

	echo "chroot: Install NVIDIA fabricmanager from run file"
	# The run file is addressed relative to the filesystem root.
	pushd / > /dev/null
	chmod +x "${fm_installer}"
	./"${fm_installer}" --nox11
	popd > /dev/null
}

# Install and hold the distribution fabricmanager and NSCQ packages that match
# driver_version.
# Globals: APT_INSTALL (read), driver_version (read)
# Returns: 1 when driver_version is empty, otherwise the status of apt-mark
install_nvidia_fabricmanager_from_distribution() {
	echo "chroot: Install NVIDIA fabricmanager from distribution"

	# prepare_run_file_drivers clears driver_version for "latest"/"lts"; the
	# package names below would then end in a bare "-".
	if [[ -z "${driver_version}" ]]; then
		echo "chroot: Cannot install fabricmanager from distribution: driver_version is empty" >&2
		return 1
	fi

	local -a fm_packages=(
		"nvidia-fabricmanager-${driver_version}"
		"libnvidia-nscq-${driver_version}"
	)

	eval "${APT_INSTALL}" "${fm_packages[@]}"
	apt-mark hold "${fm_packages[@]}"
}

# Build and install the NVIDIA kernel modules for every kernel found under
# /lib/modules, then archive /lib/modules for restoration in cleanup_rootfs.
# Skipped unless the "compute" feature is enabled.
# Globals: driver_source_files (read; set by prepare_nvidia_drivers),
#          arch_target (read)
# Returns: 0 when skipped, 1 when the driver sources are missing
build_nvidia_drivers() {
	if ! is_feature_enabled "compute"; then
		echo "chroot: Skipping NVIDIA drivers build"
		return 0
	fi

	echo "chroot: Build NVIDIA drivers"

	# Fail with a clear message instead of letting pushd trip over an empty
	# or missing path.
	if [[ -z "${driver_source_files:-}" || ! -d "${driver_source_files}" ]]; then
		echo "chroot: NVIDIA driver sources not found: '${driver_source_files:-}'" >&2
		return 1
	fi

	pushd "${driver_source_files}" >> /dev/null

	local jobs
	jobs=$(nproc)

	local module_dir kernel_version kernel_build
	for module_dir in /lib/modules/*; do
		kernel_version=$(basename "${module_dir}")
		kernel_build=/lib/modules/${kernel_version}/build
		echo "chroot: Building GPU modules for: ${kernel_version}"
		cp /boot/System.map-"${kernel_version}" "${kernel_build}"/System.map

		# Alias the kernel arch directory under the name used by arch_target.
		if [ "${arch_target}" == "aarch64" ]; then
			ln -sf "${kernel_build}"/arch/arm64 "${kernel_build}"/arch/aarch64
		fi

		if [ "${arch_target}" == "x86_64" ]; then
			ln -sf "${kernel_build}"/arch/x86 "${kernel_build}"/arch/amd64
		fi

		make -j "${jobs}" CC=gcc SYSSRC="${kernel_build}" > /dev/null
		make INSTALL_MOD_STRIP=1 -j "${jobs}" CC=gcc SYSSRC="${kernel_build}" modules_install
		make -j "${jobs}" CC=gcc SYSSRC="${kernel_build}" clean > /dev/null
	done

	# Save the modules for later so that a linux-image purge does not remove it
	tar cvfa /lib/modules.save_from_purge.tar.zst /lib/modules
	popd >> /dev/null
}

# Run nvidia-installer from the extracted /NVIDIA-* directory to install the
# userspace components (no kernel modules). Skipped when /<run_file_name> is
# not a regular file.
# Globals: run_file_name (read), arch_target (read)
# Returns: 0 when skipped, 1 when no extracted directory exists
install_userspace_components() {
	if [ ! -f /"${run_file_name}" ]; then
		echo "chroot: Skipping NVIDIA userspace runfile components installation"
		return 0
	fi

	# The trailing slash restricts the glob to directories; take the first
	# one so that pushd never sees zero or several arguments.
	local installer_dir="" candidate
	for candidate in /NVIDIA-*/; do
		if [[ -d "${candidate}" ]]; then
			installer_dir="${candidate}"
			break
		fi
	done
	if [[ -z "${installer_dir}" ]]; then
		echo "chroot: No extracted /NVIDIA-* directory found" >&2
		return 1
	fi

	local -a installer_args=(
		--no-kernel-modules --no-systemd --no-nvidia-modprobe -s --x-prefix=/root
	)
	# if aarch64 we need to remove --no-install-compat32-libs
	if [ "${arch_target}" != "aarch64" ]; then
		installer_args+=(--no-install-compat32-libs)
	fi

	pushd "${installer_dir}" >> /dev/null
	./nvidia-installer "${installer_args[@]}"
	popd >> /dev/null
}

# Extract the driver run file at / and copy the GH100 RIM file, when the
# extracted tree ships one, to /usr/share/nvidia/rim/.
# Globals: driver_version (cleared for "latest"/"lts"), run_file_name (read)
prepare_run_file_drivers() {
	case "${driver_version}" in
	latest | lts)
		driver_version=""
		echo "chroot: Resetting driver version not supported with run-file"
		;;
	esac

	echo "chroot: Prepare NVIDIA run file drivers"
	pushd / >> /dev/null
	chmod +x "${run_file_name}"
	./"${run_file_name}" -x

	mkdir -p /usr/share/nvidia/rim/

	# Sooner or later RIM files will be only available remotely
	# Iterate the glob directly instead of parsing ls, which lists the
	# current directory when nullglob removes an unmatched pattern.
	local rim_file
	for rim_file in NVIDIA-*/RIM_GH100PROD.swidtag; do
		if [ -e "${rim_file}" ]; then
			cp "${rim_file}" /usr/share/nvidia/rim/.
		fi
	done
	popd >> /dev/null
}

# Resolve driver_version to a concrete package suffix and install the
# distribution driver packages.
# Globals: driver_version (read/written), driver_type (read),
#          APT_INSTALL (read)
# Returns: 1 when no driver version could be determined
prepare_distribution_drivers() {
	if [ "${driver_version}" == "latest" ]; then
		# Take the fifth dash-separated field (the version) of the last
		# matching package name reported by apt-cache.
		driver_version=$(apt-cache search --names-only 'nvidia-headless-no-dkms-.?.?.?-open' | awk '{ print $1 }' | tail -n 1 | cut -d'-' -f5)
	elif [ "${driver_version}" == "lts" ]; then
		driver_version="550"
	fi

	# An empty lookup result would produce package names ending in "-".
	if [[ -z "${driver_version}" ]]; then
		echo "chroot: Could not determine NVIDIA driver version" >&2
		return 1
	fi

	echo "chroot: Prepare NVIDIA distribution drivers"

	# Only the headless meta package carries the driver_type suffix.
	local -a driver_packages=("nvidia-headless-no-dkms-${driver_version}${driver_type}")
	local component
	for component in \
		libnvidia-cfg1 \
		nvidia-compute-utils \
		nvidia-utils \
		nvidia-kernel-common \
		nvidia-imex \
		libnvidia-compute \
		libnvidia-gl \
		libnvidia-extra \
		libnvidia-decode \
		libnvidia-fbc1 \
		libnvidia-encode; do
		driver_packages+=("${component}-${driver_version}")
	done

	eval "${APT_INSTALL}" "${driver_packages[@]}"
}

# Prepare the driver sources, from the run file when /<run_file_name> exists
# and from distribution packages otherwise, then record the supported GPUs.
# Globals: run_file_name (read), driver_type (read),
#          driver_source_files (written; read by build_nvidia_drivers)
prepare_nvidia_drivers() {
	local driver_source_dir=""
	# Kept local so the loop variable does not leak into the script's globals.
	local candidate

	if [ -f /"${run_file_name}" ]; then
		prepare_run_file_drivers

		# First directory matching /NVIDIA-*; the kernel sources live in
		# its kernel<driver_type> subdirectory.
		for candidate in /NVIDIA-*; do
			if [ -d "${candidate}" ]; then
				driver_source_files="${candidate}"/kernel${driver_type}
				driver_source_dir="${candidate}"
				break
			fi
		done
		get_supported_gpus_from_run_file "${driver_source_dir}"
	else
		prepare_distribution_drivers

		# First directory matching /usr/src/nvidia*.
		for candidate in /usr/src/nvidia*; do
			if [ -d "${candidate}" ]; then
				driver_source_files="${candidate}"
				driver_source_dir="${candidate}"
				break
			fi
		done
		get_supported_gpus_from_distro_drivers "${driver_source_dir}"
	fi
}

# Install the packages needed to build the NVIDIA drivers.
# Globals: APT_INSTALL (read)
install_build_dependencies() {
	local -a build_deps=(make gcc gawk kmod libvulkan1 pciutils jq zstd linuxptp)

	echo "chroot: Install NVIDIA drivers build dependencies"
	eval "${APT_INSTALL}" "${build_deps[@]}"
}

# Create the apt/dpkg state directories, write the Ubuntu jammy sources for
# the target architecture and add the NVIDIA container toolkit repository.
# Globals: arch_target (read), APT_INSTALL (read)
setup_apt_repositories() {
	echo "chroot: Setup APT repositories"

	mkdir -p \
		/var/cache/apt/archives/partial \
		/var/log/apt \
		/var/lib/dpkg/info \
		/var/lib/dpkg/updates \
		/var/lib/dpkg/alternatives \
		/var/lib/dpkg/triggers \
		/var/lib/dpkg/parts
	touch /var/lib/dpkg/status
	rm -f /etc/apt/sources.list.d/*

	# aarch64 uses the ports mirror, everything else the main archive.
	local ubuntu_mirror
	if [ "${arch_target}" == "aarch64" ]; then
		ubuntu_mirror="http://ports.ubuntu.com/ubuntu-ports/"
	else
		ubuntu_mirror="http://us.archive.ubuntu.com/ubuntu/"
	fi

	# Both architectures use jammy suites, so the list file is named after
	# jammy in both cases.
	local suite
	for suite in jammy jammy-updates jammy-security jammy-backports; do
		printf 'deb %s %s main restricted universe multiverse\n' "${ubuntu_mirror}" "${suite}"
	done > /etc/apt/sources.list.d/jammy.list

	apt update

	eval "${APT_INSTALL}" curl gpg ca-certificates

	curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg

	# Download first so that a failed request aborts under "set -e" instead
	# of writing an empty or error response into the sources list.
	local ctk_list
	ctk_list=$(curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list)
	printf '%s\n' "${ctk_list}" |
		sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' |
		tee /etc/apt/sources.list.d/nvidia-container-toolkit.list

	apt update
}

# Install every /linux-*deb package with dpkg.
install_kernel_dependencies() {
	local -a kernel_debs=(/linux-*deb)

	dpkg -i "${kernel_debs[@]}"
}

# Write the "devid" values found in the run file's supported-gpus.json to the
# file named by supported_gpu_devids, one per line.
# Globals:   supported_gpu_devids (read; output path)
# Arguments: $1 - extracted run file directory
get_supported_gpus_from_run_file() {
	local source_dir="$1"
	local supported_gpus_json="${source_dir}"/supported-gpus/supported-gpus.json

	# "jq ." pretty-prints the JSON; the awk filter then takes the second
	# field of every line containing "devid" and strips commas and quotes.
	jq . < "${supported_gpus_json}" |
		awk '/"devid"/ { gsub(/[,"]/, "", $2); print $2 }' > "${supported_gpu_devids}"
}

# Write the "devid" values found in the supported-gpus.json shipped under
# /usr/share/doc/nvidia-kernel-common-<driver_version>/ to the file named by
# supported_gpu_devids, one per line.
# Globals:   driver_version (read), supported_gpu_devids (read; output path)
# Arguments: none are read; the directory passed by prepare_nvidia_drivers is
#            ignored.
get_supported_gpus_from_distro_drivers() {
	local supported_gpus_json=/usr/share/doc/nvidia-kernel-common-"${driver_version}"/supported-gpus.json

	# "jq ." pretty-prints the JSON; the awk filter then takes the second
	# field of every line containing "devid" and strips commas and quotes.
	jq . < "${supported_gpus_json}" |
		awk '/"devid"/ { gsub(/[,"]/, "", $2); print $2 }' > "${supported_gpu_devids}"
}

# Write the version reported by modinfo for nvidia.ko, taken from the first
# directory under /lib/modules, to /nvidia_driver_version.
export_driver_version() {
	# Kept local so the loop variable does not leak into the script's globals.
	local modules_dir
	for modules_dir in /lib/modules/*; do
		modinfo "${modules_dir}"/kernel/drivers/video/nvidia.ko |
			awk '/^version/ { print $2 }' > /nvidia_driver_version
		# Only the first modules directory is inspected.
		break
	done
}

# Install NVIDIA DCGM (datacenter-gpu-manager) from the CUDA repository.
# Skipped unless the "dcgm" feature is enabled.
# Globals: arch_target (read), APT_INSTALL (read)
# Returns: 0 when skipped
install_nvidia_dcgm() {
	if ! is_feature_enabled "dcgm"; then
		echo "chroot: Skipping NVIDIA DCGM installation"
		return 0
	fi

	local -r cuda_repos="https://developer.download.nvidia.com/compute/cuda/repos"
	local -r keyring_deb="cuda-keyring_1.0-1_all.deb"

	# -f makes an HTTP error fail the download instead of saving the error
	# page under the package name.
	curl -fsSLO "${cuda_repos}/ubuntu2004/x86_64/${keyring_deb}"
	# Separate statements: inside "dpkg && rm" a dpkg failure would not
	# trigger "set -e".
	dpkg -i "${keyring_deb}"
	rm -f "${keyring_deb}"

	local cuda_arch="x86_64"
	if [ "${arch_target}" == "aarch64" ]; then
		cuda_arch="arm64"
	fi

	printf 'deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] %s/ubuntu2204/%s/ /\n' \
		"${cuda_repos}" "${cuda_arch}" > /etc/apt/sources.list.d/cuda.list

	apt update
	eval "${APT_INSTALL}" datacenter-gpu-manager
}

# Shrink the rootfs: hold the runtime packages, purge kernel and build
# packages, clean apt state, refresh module dependencies and the ld cache, and
# restore the kernel modules saved by build_nvidia_drivers.
# Globals: driver_version (read), driver_type (read)
cleanup_rootfs() {
	echo "chroot: Cleanup NVIDIA GPU rootfs"

	apt-mark hold libstdc++6 libzstd1 libgnutls30 pciutils
	# noble=libgnutls30t64

	if [ -n "${driver_version}" ]; then
		local -a held_packages=()
		local component
		for component in \
			libnvidia-cfg1 \
			nvidia-compute-utils \
			nvidia-utils \
			nvidia-kernel-common \
			nvidia-imex \
			libnvidia-compute \
			libnvidia-gl \
			libnvidia-extra \
			libnvidia-decode \
			libnvidia-fbc1 \
			libnvidia-encode \
			libnvidia-nscq; do
			held_packages+=("${component}-${driver_version}")
		done
		apt-mark hold "${held_packages[@]}" linuxptp libnftnl11
	fi

	# Collect linux-headers packages first, then linux-image packages.
	# Matching in the shell keeps an empty result from aborting the script,
	# which a non-matching grep would do under "set -e".
	local -a installed_packages=() kernel_packages=()
	mapfile -t installed_packages < <(dpkg --get-selections | cut -f1)
	local pattern package
	for pattern in linux-headers linux-image; do
		for package in "${installed_packages[@]}"; do
			if [[ "${package}" == *"${pattern}"* ]]; then
				kernel_packages+=("${package}")
			fi
		done
	done
	for package in "${kernel_packages[@]}"; do
		apt purge -yqq "${package}"
	done

	apt purge -yqq jq make gcc wget libc6-dev git xz-utils curl gpg \
		python3-pip software-properties-common ca-certificates \
		linux-libc-dev nuitka python3-minimal

	if [ -n "${driver_version}" ]; then
		apt purge -yqq nvidia-headless-no-dkms-"${driver_version}${driver_type}" \
			nvidia-kernel-source-"${driver_version}${driver_type}"
	fi

	apt autoremove -yqq

	apt clean
	apt autoclean

	# NOTE(review): the symlink named after "uname -r" presumably lets
	# "depmod -a" find the built modules under the running kernel's
	# release - confirm.
	local modules_dir
	for modules_dir in /lib/modules/*; do
		ln -sf "${modules_dir}" /lib/modules/"$(uname -r)"
		touch "${modules_dir}"/modules.order
		touch "${modules_dir}"/modules.builtin
		depmod -a
	done

	rm -rf /etc/apt/sources.list* /var/lib/apt /var/log/apt /var/cache/debconf
	rm -f /usr/bin/nvidia-ngx-updater /usr/bin/nvidia-container-runtime
	rm -f /var/log/{nvidia-installer.log,dpkg.log,alternatives.log}

	# Clear and regenerate the ld cache
	rm -f /etc/ld.so.cache
	ldconfig

	# build_nvidia_drivers returns before creating this archive when the
	# "compute" feature is disabled, so only restore it when present.
	local -r saved_modules=/lib/modules.save_from_purge.tar.zst
	if [ -f "${saved_modules}" ]; then
		tar xvf "${saved_modules}" -C /
	fi
}

# Start of script
# Runs every stage of the rootfs setup in order; under "set -e" the first
# failing stage aborts the script.
main() {
	echo "chroot: Setup NVIDIA GPU rootfs stage one"

	set_driver_version_type
	setup_apt_repositories
	install_kernel_dependencies
	install_build_dependencies
	prepare_nvidia_drivers
	build_nvidia_drivers
	install_userspace_components
	install_nvidia_fabricmanager
	install_nvidia_ctk
	export_driver_version
	install_nvidia_dcgm
	cleanup_rootfs
}

main "$@"