Compare commits

...

10 Commits

Author SHA1 Message Date
Fabiano Fidêncio
4f24b4fc9e versions: nvidia: Bump kernel to the latest LTS
Now that we have decoupled the rootfs / kernel builds, doing the bump
becomes trivial.

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
2026-01-14 14:55:20 +01:00
Fabiano Fidêncio
6b2a3fab8e workflows: nvidia: Adjust to kernel / rootfs build decouple
We don't need to store the kernel headers anymore; we need to store
the kernel modules instead.

Signed-off-by: Fabiano Fidêncio <ffidencio@nvidia.com>
2026-01-14 14:55:20 +01:00
Zvonko Kaiser
451dcb289a kernel: bump kata_config_version
We have kernel build changes, so bump the config version.

Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
2026-01-14 14:55:20 +01:00
Zvonko Kaiser
34cde2637d gpu: build_image.sh use versions.yaml
We previously relied on some fragile file-based driver determination;
now, with versions.yaml, there is a single source of truth.

Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
2026-01-14 14:55:20 +01:00
Zvonko Kaiser
664a3af02b gpu: nvidia_chroot.sh update decoupling
Remove all the driver build instructions,
since those are now done in the kernel target.

Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
2026-01-14 14:55:20 +01:00
Zvonko Kaiser
e9bb43ef01 gpu: deploy modules for kernel build
We need to package the built modules so that the rootfs build
can consume them. We package the whole
/lib/modules/$(uname -r) directory with strip=2.

Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
2026-01-14 14:55:20 +01:00
Zvonko Kaiser
d4962bafac gpu: Add NVIDIA modules to build-kernel.sh
Check out and build the kernel modules along
with the kernel to avoid the kernel/rootfs dependency.

Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
2026-01-14 14:30:31 +01:00
Zvonko Kaiser
c42f7501fd gpu: Remove building of Headers
Since we build the modules along with the kernel, we no longer need
to carry the headers over to the rootfs build.

Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
2026-01-14 14:30:31 +01:00
Zvonko Kaiser
a00ebab8ad gpu: versions.yaml nvidia driver pinning
We want deterministic behaviour, with only one valid driver
version accepted, pinned via versions.yaml.

Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
2026-01-14 14:30:31 +01:00
Zvonko Kaiser
1f6cfb11b0 kernel: bugfix install yq
We actually never installed yq in the kernel build; there are some
paths that use yq, but they were never hit. For the GPU use-case we
need to read values from versions.yaml.

Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
2026-01-14 14:30:31 +01:00
12 changed files with 274 additions and 412 deletions

View File

@@ -148,8 +148,8 @@ jobs:
if: ${{ startsWith(matrix.asset, 'kernel-nvidia-gpu') }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: kata-artifacts-amd64-${{ matrix.asset }}-headers${{ inputs.tarball-suffix }}
path: kata-build/kata-static-${{ matrix.asset }}-headers.tar.zst
name: kata-artifacts-amd64-${{ matrix.asset }}-modules${{ inputs.tarball-suffix }}
path: kata-build/kata-static-${{ matrix.asset }}-modules.tar.zst
retention-days: 15
if-no-files-found: error
@@ -237,8 +237,8 @@ jobs:
asset:
- busybox
- coco-guest-components
- kernel-nvidia-gpu-headers
- kernel-nvidia-gpu-confidential-headers
- kernel-nvidia-gpu-modules
- kernel-nvidia-gpu-confidential-modules
- pause-image
steps:
- uses: geekyeggo/delete-artifact@f275313e70c08f6120db482d7a6b98377786765b # v5.1.0

View File

@@ -134,8 +134,8 @@ jobs:
if: ${{ startsWith(matrix.asset, 'kernel-nvidia-gpu') }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: kata-artifacts-arm64-${{ matrix.asset }}-headers${{ inputs.tarball-suffix }}
path: kata-build/kata-static-${{ matrix.asset }}-headers.tar.zst
name: kata-artifacts-arm64-${{ matrix.asset }}-modules${{ inputs.tarball-suffix }}
path: kata-build/kata-static-${{ matrix.asset }}-modules.tar.zst
retention-days: 15
if-no-files-found: error
@@ -216,7 +216,7 @@ jobs:
matrix:
asset:
- busybox
- kernel-nvidia-gpu-headers
- kernel-nvidia-gpu-modules
steps:
- uses: geekyeggo/delete-artifact@f275313e70c08f6120db482d7a6b98377786765b # v5.1.0
with:

src/agent/Cargo.lock generated
View File

@@ -4305,6 +4305,7 @@ checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683"
name = "test-utils"
version = "0.1.0"
dependencies = [
"libc",
"nix 0.26.4",
]

View File

@@ -23,57 +23,44 @@ run_fm_file_name=$3
arch_target=$4
nvidia_gpu_stack="$5"
driver_version=""
driver_type="-open"
supported_gpu_devids="/supported-gpu.devids"
base_os="noble"
APT_INSTALL="apt -o Dpkg::Options::='--force-confdef' -o Dpkg::Options::='--force-confold' -yqq --no-install-recommends install"
export KBUILD_SIGN_PIN="${6:-}"
export DEBIAN_FRONTEND=noninteractive
is_feature_enabled() {
local feature="$1"
# Check if feature is in the comma-separated list
if [[ ",${nvidia_gpu_stack}," == *",${feature},"* ]]; then
return 0
else
return 1
fi
[[ ",${nvidia_gpu_stack}," == *",${feature},"* ]]
}
set_driver_version_type() {
echo "chroot: Setting the correct driver version"
if [[ ",${nvidia_gpu_stack}," == *",latest,"* ]]; then
driver_version="latest"
elif [[ ",${nvidia_gpu_stack}," == *",lts,"* ]]; then
driver_version="lts"
elif [[ "${nvidia_gpu_stack}" =~ version=([^,]+) ]]; then
set_driver_version() {
# Extract the driver=XXX part first, then get the value
if [[ "$nvidia_gpu_stack" =~ driver=([^,]+) ]]; then
driver_version="${BASH_REMATCH[1]}"
else
echo "No known driver spec found. Please specify \"latest\", \"lts\", or \"version=<VERSION>\"."
exit 1
fi
echo "chroot: driver_version: ${driver_version}"
echo "chroot: Setting the correct driver type"
# driver -> enable open or closed drivers
if [[ "${nvidia_gpu_stack}" =~ (^|,)driver=open($|,) ]]; then
driver_type="-open"
elif [[ "${nvidia_gpu_stack}" =~ (^|,)driver=closed($|,) ]]; then
driver_type=""
fi
echo "chroot: driver_type: ${driver_type}"
echo "chroot: TODO remove with new NVRC"
cat <<-CHROOT_EOF > "/supported-gpu.devids"
0x230E
0x2321
0x2322
0x2324
0x2329
0x232C
0x2330
0x2331
0x2335
0x2339
0x233A
0x233B
0x2342
0x2348
CHROOT_EOF
}
install_nvidia_ctk() {
echo "chroot: Installing NVIDIA GPU container runtime"
apt list nvidia-container-toolkit-base -a
# Base gives a nvidia-ctk and the nvidia-container-runtime
eval "${APT_INSTALL}" nvidia-container-toolkit-base=1.17.6-1
}
@@ -83,222 +70,54 @@ install_nvidia_fabricmanager() {
echo "chroot: Skipping NVIDIA fabricmanager installation"
return
}
# if run_fm_file_name exists run it
if [[ -f /"${run_fm_file_name}" ]]; then
install_nvidia_fabricmanager_from_run_file
else
install_nvidia_fabricmanager_from_distribution
fi
}
install_nvidia_fabricmanager_from_run_file() {
echo "chroot: Install NVIDIA fabricmanager from run file"
pushd / >> /dev/null
chmod +x "${run_fm_file_name}"
./"${run_fm_file_name}" --nox11
popd >> /dev/null
}
install_nvidia_fabricmanager_from_distribution() {
echo "chroot: Install NVIDIA fabricmanager from distribution"
eval "${APT_INSTALL}" nvidia-fabricmanager-"${driver_version}" libnvidia-nscq-"${driver_version}"
apt-mark hold nvidia-fabricmanager-"${driver_version}" libnvidia-nscq-"${driver_version}"
}
check_kernel_sig_config() {
[[ -n ${kernel_version} ]] || die "kernel_version is not set"
[[ -e /lib/modules/"${kernel_version}"/build/scripts/config ]] || die "Cannot find /lib/modules/${kernel_version}/build/scripts/config"
# make sure the used kernel has the proper CONFIG(s) set
readonly scripts_config=/lib/modules/"${kernel_version}"/build/scripts/config
[[ "$("${scripts_config}" --file "/boot/config-${kernel_version}" --state CONFIG_MODULE_SIG)" == "y" ]] || die "Kernel config CONFIG_MODULE_SIG must be =Y"
[[ "$("${scripts_config}" --file "/boot/config-${kernel_version}" --state CONFIG_MODULE_SIG_FORCE)" == "y" ]] || die "Kernel config CONFIG_MODULE_SIG_FORCE must be =Y"
[[ "$("${scripts_config}" --file "/boot/config-${kernel_version}" --state CONFIG_MODULE_SIG_ALL)" == "y" ]] || die "Kernel config CONFIG_MODULE_SIG_ALL must be =Y"
[[ "$("${scripts_config}" --file "/boot/config-${kernel_version}" --state CONFIG_MODULE_SIG_SHA512)" == "y" ]] || die "Kernel config CONFIG_MODULE_SIG_SHA512 must be =Y"
[[ "$("${scripts_config}" --file "/boot/config-${kernel_version}" --state CONFIG_SYSTEM_TRUSTED_KEYS)" == "" ]] || die "Kernel config CONFIG_SYSTEM_TRUSTED_KEYS must be =\"\""
[[ "$("${scripts_config}" --file "/boot/config-${kernel_version}" --state CONFIG_SYSTEM_TRUSTED_KEYRING)" == "y" ]] || die "Kernel config CONFIG_SYSTEM_TRUSTED_KEYRING must be =Y"
}
build_nvidia_drivers() {
is_feature_enabled "compute" || {
echo "chroot: Skipping NVIDIA drivers build"
return
}
echo "chroot: Build NVIDIA drivers"
pushd "${driver_source_files}" >> /dev/null
local certs_dir
local kernel_version
local ARCH
for version in /lib/modules/*; do
kernel_version=$(basename "${version}")
certs_dir=/lib/modules/"${kernel_version}"/build/certs
signing_key=${certs_dir}/signing_key.pem
echo "chroot: Building GPU modules for: ${kernel_version}"
cp /boot/System.map-"${kernel_version}" /lib/modules/"${kernel_version}"/build/System.map
if [[ "${arch_target}" == "aarch64" ]]; then
ln -sf /lib/modules/"${kernel_version}"/build/arch/arm64 /lib/modules/"${kernel_version}"/build/arch/aarch64
ARCH=arm64
fi
if [[ "${arch_target}" == "x86_64" ]]; then
ln -sf /lib/modules/"${kernel_version}"/build/arch/x86 /lib/modules/"${kernel_version}"/build/arch/amd64
ARCH=x86_64
fi
echo "chroot: Building GPU modules for: ${kernel_version} ${ARCH}"
make -j "$(nproc)" CC=gcc SYSSRC=/lib/modules/"${kernel_version}"/build > /dev/null
if [[ -n "${KBUILD_SIGN_PIN}" ]]; then
mkdir -p "${certs_dir}" && mv /signing_key.* "${certs_dir}"/.
check_kernel_sig_config
fi
make INSTALL_MOD_STRIP=1 -j "$(nproc)" CC=gcc SYSSRC=/lib/modules/"${kernel_version}"/build modules_install
make -j "$(nproc)" CC=gcc SYSSRC=/lib/modules/"${kernel_version}"/build clean > /dev/null
# The make clean above should also clear the certs directory, but just in case something
# went wrong make sure the signing_key.pem is removed
[[ -e "${signing_key}" ]] && rm -f "${signing_key}"
done
# Save the modules for later so that a linux-image purge does not remove them
tar cvfa /lib/modules.save_from_purge.tar.zst /lib/modules
popd >> /dev/null
echo "chroot: Install NVIDIA fabricmanager"
eval "${APT_INSTALL}" nvidia-fabricmanager libnvidia-nscq
apt-mark hold nvidia-fabricmanager libnvidia-nscq
}
install_userspace_components() {
if [[ ! -f /"${run_file_name}" ]]; then
echo "chroot: Skipping NVIDIA userspace runfile components installation"
return
fi
eval "${APT_INSTALL}" nvidia-driver-pinning-"${driver_version}"
eval "${APT_INSTALL}" nvidia-imex nvidia-firmware \
libnvidia-cfg1 libnvidia-gl libnvidia-extra \
libnvidia-decode libnvidia-fbc1 libnvidia-encode \
libnvidia-nscq
pushd /NVIDIA-* >> /dev/null
# if aarch64 we need to remove --no-install-compat32-libs
if [[ "${arch_target}" == "aarch64" ]]; then
./nvidia-installer --no-kernel-modules --no-systemd --no-nvidia-modprobe -s --x-prefix=/root
else
./nvidia-installer --no-kernel-modules --no-systemd --no-nvidia-modprobe -s --x-prefix=/root --no-install-compat32-libs
fi
popd >> /dev/null
}
prepare_run_file_drivers() {
if [[ "${driver_version}" == "latest" ]]; then
driver_version=""
echo "chroot: Resetting driver version not supported with run-file"
elif [[ "${driver_version}" == "lts" ]]; then
driver_version=""
echo "chroot: Resetting driver version not supported with run-file"
fi
echo "chroot: Prepare NVIDIA run file drivers"
pushd / >> /dev/null
chmod +x "${run_file_name}"
./"${run_file_name}" -x
mkdir -p /usr/share/nvidia/rim/
# Sooner or later RIM files will be only available remotely
RIMFILE=$(ls NVIDIA-*/RIM_GH100PROD.swidtag)
if [[ -e "${RIMFILE}" ]]; then
cp NVIDIA-*/RIM_GH100PROD.swidtag /usr/share/nvidia/rim/.
fi
popd >> /dev/null
}
prepare_distribution_drivers() {
if [[ "${driver_version}" == "latest" ]]; then
driver_version=$(apt-cache search --names-only 'nvidia-headless-no-dkms-.?.?.?-server-open' | sort | awk '{ print $1 }' | tail -n 1 | cut -d'-' -f5)
elif [[ "${driver_version}" == "lts" ]]; then
driver_version="580"
fi
echo "chroot: Prepare NVIDIA distribution drivers"
eval "${APT_INSTALL}" nvidia-headless-no-dkms-"${driver_version}-server${driver_type}" \
nvidia-kernel-common-"${driver_version}"-server \
nvidia-imex-"${driver_version}" \
nvidia-utils-"${driver_version}"-server \
libnvidia-cfg1-"${driver_version}"-server \
libnvidia-gl-"${driver_version}"-server \
libnvidia-extra-"${driver_version}"-server \
libnvidia-decode-"${driver_version}"-server \
libnvidia-fbc1-"${driver_version}"-server \
libnvidia-encode-"${driver_version}"-server \
libnvidia-nscq-"${driver_version}"
}
prepare_nvidia_drivers() {
local driver_source_dir=""
if [[ -f /"${run_file_name}" ]]; then
prepare_run_file_drivers
for source_dir in /NVIDIA-*; do
if [[ -d "${source_dir}" ]]; then
driver_source_files="${source_dir}"/kernel${driver_type}
driver_source_dir="${source_dir}"
break
fi
done
get_supported_gpus_from_run_file "${driver_source_dir}"
else
prepare_distribution_drivers
for source_dir in /usr/src/nvidia*; do
if [[ -d "${source_dir}" ]]; then
driver_source_files="${source_dir}"
driver_source_dir="${source_dir}"
break
fi
done
get_supported_gpus_from_distro_drivers "${driver_source_dir}"
fi
}
install_build_dependencies() {
echo "chroot: Install NVIDIA drivers build dependencies"
eval "${APT_INSTALL}" make gcc gawk kmod libvulkan1 pciutils jq zstd linuxptp xz-utils
apt-mark hold nvidia-imex nvidia-firmware \
libnvidia-cfg1 libnvidia-gl libnvidia-extra \
libnvidia-decode libnvidia-fbc1 libnvidia-encode \
libnvidia-nscq
}
setup_apt_repositories() {
echo "chroot: Setup APT repositories"
mkdir -p /var/cache/apt/archives/partial
mkdir -p /var/log/apt
mkdir -p /var/lib/dpkg/info
mkdir -p /var/lib/dpkg/updates
mkdir -p /var/lib/dpkg/alternatives
mkdir -p /var/lib/dpkg/triggers
mkdir -p /var/lib/dpkg/parts
# Architecture to mirror mapping
declare -A arch_to_mirror=(
["x86_64"]="us.archive.ubuntu.com/ubuntu"
["aarch64"]="ports.ubuntu.com/ubuntu-ports"
)
local mirror="${arch_to_mirror[$arch_target]}"
[[ -z "$mirror" ]] && die "Unknown arch_target: ${arch_target}"
local deb_arch="amd64"
[[ "$arch_target" == "aarch64" ]] && deb_arch="arm64"
mkdir -p /var/cache/apt/archives/partial /var/log/apt \
/var/lib/dpkg/{info,updates,alternatives,triggers,parts}
touch /var/lib/dpkg/status
rm -f /etc/apt/sources.list.d/*
if [[ "${arch_target}" == "x86_64" ]]; then
cat <<-CHROOT_EOF > /etc/apt/sources.list.d/"${base_os}".list
deb [arch=amd64 signed-by=/usr/share/keyrings/ubuntu-archive-keyring.gpg] http://us.archive.ubuntu.com/ubuntu ${base_os} main restricted universe multiverse
deb [arch=amd64 signed-by=/usr/share/keyrings/ubuntu-archive-keyring.gpg] http://us.archive.ubuntu.com/ubuntu ${base_os}-updates main restricted universe multiverse
deb [arch=amd64 signed-by=/usr/share/keyrings/ubuntu-archive-keyring.gpg] http://us.archive.ubuntu.com/ubuntu ${base_os}-security main restricted universe multiverse
deb [arch=amd64 signed-by=/usr/share/keyrings/ubuntu-archive-keyring.gpg] http://us.archive.ubuntu.com/ubuntu ${base_os}-backports main restricted universe multiverse
CHROOT_EOF
fi
key="/usr/share/keyrings/ubuntu-archive-keyring.gpg"
if [[ "${arch_target}" == "aarch64" ]]; then
cat <<-CHROOT_EOF > /etc/apt/sources.list.d/"${base_os}".list
deb [arch=arm64 signed-by=/usr/share/keyrings/ubuntu-archive-keyring.gpg] http://ports.ubuntu.com/ubuntu-ports ${base_os} main restricted universe multiverse
deb [arch=arm64 signed-by=/usr/share/keyrings/ubuntu-archive-keyring.gpg] http://ports.ubuntu.com/ubuntu-ports ${base_os}-updates main restricted universe multiverse
deb [arch=arm64 signed-by=/usr/share/keyrings/ubuntu-archive-keyring.gpg] http://ports.ubuntu.com/ubuntu-ports ${base_os}-security main restricted universe multiverse
deb [arch=arm64 signed-by=/usr/share/keyrings/ubuntu-archive-keyring.gpg] http://ports.ubuntu.com/ubuntu-ports ${base_os}-backports main restricted universe multiverse
CHROOT_EOF
fi
cat <<-CHROOT_EOF > /etc/apt/sources.list.d/"${base_os}".list
deb [arch=${deb_arch} signed-by=${key}] http://${mirror} ${base_os} main restricted universe multiverse
deb [arch=${deb_arch} signed-by=${key}] http://${mirror} ${base_os}-updates main restricted universe multiverse
deb [arch=${deb_arch} signed-by=${key}] http://${mirror} ${base_os}-security main restricted universe multiverse
deb [arch=${deb_arch} signed-by=${key}] http://${mirror} ${base_os}-backports main restricted universe multiverse
CHROOT_EOF
local arch="${arch_target}"
[[ ${arch_target} == "aarch64" ]] && arch="sbsa"
@@ -312,58 +131,22 @@ setup_apt_repositories() {
# Set priorities: Ubuntu repos highest, NVIDIA Container Toolkit next, CUDA repo blocked for driver packages
cat <<-CHROOT_EOF > /etc/apt/preferences.d/nvidia-priority
# Prioritize Ubuntu repositories (highest priority)
Package: *
Pin: origin us.archive.ubuntu.com
Pin-Priority: 1000
Pin: $(dirname ${mirror})
Pin-Priority: 400
Package: *
Pin: origin ports.ubuntu.com
Pin-Priority: 1000
# NVIDIA Container Toolkit (medium priority for toolkit only)
Package: nvidia-container-toolkit* libnvidia-container*
Pin: origin nvidia.github.io
Pin-Priority: 500
# Block all nvidia and libnvidia packages from CUDA repository
Package: nvidia-* libnvidia-*
Pin: origin developer.download.nvidia.com
Pin: $(dirname ${mirror})
Pin-Priority: -1
# Allow non-driver CUDA packages from CUDA repository (low priority)
Package: *
Pin: origin developer.download.nvidia.com
Pin-Priority: 100
Pin-Priority: 800
CHROOT_EOF
apt update
}
install_kernel_dependencies() {
dpkg -i /linux-*deb
}
get_supported_gpus_from_run_file() {
local source_dir="$1"
local supported_gpus_json="${source_dir}"/supported-gpus/supported-gpus.json
jq . < "${supported_gpus_json}" | grep '"devid"' | awk '{ print $2 }' | tr -d ',"' > "${supported_gpu_devids}"
}
get_supported_gpus_from_distro_drivers() {
local supported_gpus_json="./usr/share/doc/nvidia-kernel-common-${driver_version}-server/supported-gpus.json"
jq . < "${supported_gpus_json}" | grep '"devid"' | awk '{ print $2 }' | tr -d ',"' > "${supported_gpu_devids}"
}
export_driver_version() {
for modules_version in /lib/modules/*; do
modinfo "${modules_version}"/kernel/drivers/video/nvidia.ko | grep ^version | awk '{ print $2 }' > /nvidia_driver_version
break
done
}
install_nvidia_dcgm() {
is_feature_enabled "dcgm" || {
echo "chroot: Skipping NVIDIA DCGM installation"
@@ -379,49 +162,12 @@ install_nvidia_dcgm() {
cleanup_rootfs() {
echo "chroot: Cleanup NVIDIA GPU rootfs"
apt-mark hold libstdc++6 libzstd1 libgnutls30t64 pciutils
if [[ -n "${driver_version}" ]]; then
apt-mark hold libnvidia-cfg1-"${driver_version}"-server \
nvidia-utils-"${driver_version}"-server \
nvidia-kernel-common-"${driver_version}"-server \
nvidia-imex-"${driver_version}" \
nvidia-compute-utils-"${driver_version}"-server \
libnvidia-compute-"${driver_version}"-server \
libnvidia-gl-"${driver_version}"-server \
libnvidia-extra-"${driver_version}"-server \
libnvidia-decode-"${driver_version}"-server \
libnvidia-fbc1-"${driver_version}"-server \
libnvidia-encode-"${driver_version}"-server \
libnvidia-nscq-"${driver_version}" \
linuxptp libnftnl11
fi
kernel_headers=$(dpkg --get-selections | cut -f1 | grep linux-headers)
linux_images=$(dpkg --get-selections | cut -f1 | grep linux-image)
for i in ${kernel_headers} ${linux_images}; do
apt purge -yqq "${i}"
done
apt purge -yqq jq make gcc xz-utils linux-libc-dev
if [[ -n "${driver_version}" ]]; then
apt purge -yqq nvidia-headless-no-dkms-"${driver_version}"-server"${driver_type}" \
nvidia-kernel-source-"${driver_version}"-server"${driver_type}"
fi
apt-mark hold libstdc++6 libzstd1 libgnutls30t64 pciutils linuxptp libnftnl11
apt autoremove -yqq
apt clean
apt autoclean
for modules_version in /lib/modules/*; do
ln -sf "${modules_version}" /lib/modules/"$(uname -r)"
touch "${modules_version}"/modules.order
touch "${modules_version}"/modules.builtin
depmod -a
done
rm -rf /var/lib/apt/lists/* /var/cache/apt/* /var/log/apt /var/cache/debconf
rm -f /etc/apt/sources.list
rm -f /usr/bin/nvidia-ngx-updater /usr/bin/nvidia-container-runtime
@@ -430,23 +176,15 @@ cleanup_rootfs() {
# Clear and regenerate the ld cache
rm -f /etc/ld.so.cache
ldconfig
tar xvf /lib/modules.save_from_purge.tar.zst -C /
rm -f /lib/modules.save_from_purge.tar.zst
}
# Start of script
echo "chroot: Setup NVIDIA GPU rootfs stage one"
set_driver_version_type
set_driver_version
setup_apt_repositories
install_kernel_dependencies
install_build_dependencies
prepare_nvidia_drivers
build_nvidia_drivers
install_userspace_components
install_nvidia_fabricmanager
install_nvidia_ctk
export_driver_version
install_nvidia_dcgm
cleanup_rootfs
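
For reference, a minimal sketch (values hypothetical) of how the comma-separated NVIDIA_GPU_STACK string is matched by is_feature_enabled and set_driver_version above:

nvidia_gpu_stack="driver=590.48.01,compute,dcgm"
# feature check: look for ",compute," inside ",driver=590.48.01,compute,dcgm,"
[[ ",${nvidia_gpu_stack}," == *",compute,"* ]] && echo "compute enabled"
# driver version: capture whatever follows "driver=" up to the next comma
[[ "${nvidia_gpu_stack}" =~ driver=([^,]+) ]] && echo "driver_version=${BASH_REMATCH[1]}"
# -> compute enabled
# -> driver_version=590.48.01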

View File

@@ -93,9 +93,9 @@ setup_nvidia_gpu_rootfs_stage_one() {
appendix="-dragonball-experimental"
fi
# We need the kernel packages for building the drivers cleanly; they will be
# deinstalled and removed from the rootfs once the build finishes.
tar --zstd -xvf "${BUILD_DIR}"/kata-static-kernel-nvidia-gpu"${appendix}"-headers.tar.zst -C .
# Install the precompiled kernel modules shipped with the kernel
mkdir -p ./lib/modules/
tar --zstd -xvf "${BUILD_DIR}"/kata-static-kernel-nvidia-gpu"${appendix}"-modules.tar.zst -C ./lib/modules/
# If we find a local downloaded run file build the kernel modules
# with it, otherwise use the distribution packages. Run files may have
@@ -115,13 +115,12 @@ setup_nvidia_gpu_rootfs_stage_one() {
mount -t proc /proc ./proc
chroot . /bin/bash -c "/nvidia_chroot.sh $(uname -r) ${run_file_name} \
${run_fm_file_name} ${machine_arch} ${NVIDIA_GPU_STACK} ${KBUILD_SIGN_PIN}"
${run_fm_file_name} ${machine_arch} ${NVIDIA_GPU_STACK}"
umount -R ./dev
umount ./proc
rm ./nvidia_chroot.sh
rm ./*.deb
tar cfa "${stage_one}.tar.zst" --remove-files -- *
@@ -183,7 +182,6 @@ chisseled_dcgm() {
chisseled_compute() {
echo "nvidia: chisseling GPU"
cp -a "${stage_one}"/nvidia_driver_version .
cp -a "${stage_one}"/lib/modules/* lib/modules/.
libdir="lib/${machine_arch}-linux-gnu"
@@ -194,6 +192,15 @@ chisseled_compute() {
cp -a "${stage_one}/${libdir}"/libc.so.6* "${libdir}"/.
cp -a "${stage_one}/${libdir}"/libm.so.6* "${libdir}"/.
cp -a "${stage_one}/${libdir}"/librt.so.1* "${libdir}"/.
# nvidia-persistenced dependencies for CUDA repo and >= 590
cp -a "${stage_one}/${libdir}"/libtirpc.so.3* "${libdir}"/.
cp -a "${stage_one}/${libdir}"/libgssapi_krb5.so.2* "${libdir}"/.
cp -a "${stage_one}/${libdir}"/libkrb5.so.3* "${libdir}"/.
cp -a "${stage_one}/${libdir}"/libkrb5support.so.0* "${libdir}"/.
cp -a "${stage_one}/${libdir}"/libk5crypto.so.3* "${libdir}"/.
cp -a "${stage_one}/${libdir}"/libcom_err.so.2* "${libdir}"/.
cp -a "${stage_one}/${libdir}"/libkeyutils.so.1* "${libdir}"/.
cp -a "${stage_one}/etc/netconfig" etc/.
[[ "${type}" == "confidential" ]] && cp -a "${stage_one}/${libdir}"/libnvidia-pkcs11* "${libdir}"/.

View File

@@ -52,7 +52,7 @@ build_initrd() {
GUEST_HOOKS_TARBALL="${GUEST_HOOKS_TARBALL}"
if [[ "${image_initrd_suffix}" == "nvidia-gpu"* ]]; then
nvidia_driver_version=$(cat "${builddir}"/initrd-image/*/nvidia_driver_version)
nvidia_driver_version=$(get_from_kata_deps .externals.nvidia.driver.version)
artifact_name=${artifact_name/.initrd/"-${nvidia_driver_version}".initrd}
fi
@@ -81,7 +81,7 @@ build_image() {
GUEST_HOOKS_TARBALL="${GUEST_HOOKS_TARBALL}"
if [[ "${image_initrd_suffix}" == "nvidia-gpu"* ]]; then
nvidia_driver_version=$(cat "${builddir}"/rootfs-image/*/nvidia_driver_version)
nvidia_driver_version=$(get_from_kata_deps .externals.nvidia.driver.version)
artifact_name=${artifact_name/.image/"-${nvidia_driver_version}".image}
fi
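
For illustration, the bash substitution above splices the pinned driver version in front of the artifact suffix (artifact name hypothetical):

artifact_name="kata-containers-nvidia-gpu.initrd"
nvidia_driver_version="590.48.01"
echo "${artifact_name/.initrd/"-${nvidia_driver_version}".initrd}"
# -> kata-containers-nvidia-gpu-590.48.01.initrd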

View File

@@ -56,7 +56,6 @@ REPO_COMPONENTS="${REPO_COMPONENTS:-}"
AGENT_POLICY="${AGENT_POLICY:-yes}"
TARGET_BRANCH="${TARGET_BRANCH:-main}"
PUSH_TO_REGISTRY="${PUSH_TO_REGISTRY:-}"
KERNEL_HEADERS_PKG_TYPE="${KERNEL_HEADERS_PKG_TYPE:-deb}"
RELEASE="${RELEASE:-"no"}"
KBUILD_SIGN_PIN="${KBUILD_SIGN_PIN:-}"
RUNTIME_CHOICE="${RUNTIME_CHOICE:-both}"
@@ -145,15 +144,6 @@ EOF
exit "${return_code}"
}
get_kernel_headers_dir() {
local kernel_name="${1:-}"
[ -z "${kernel_name}" ] && die "kernel name is a required argument"
local kernel_headers_dir="${repo_root_dir}/tools/packaging/kata-deploy/local-build/build/${kernel_name}/builddir"
echo "${kernel_headers_dir}"
}
get_kernel_modules_dir() {
local kernel_version="${1:-}"
local kernel_kata_config_version="${2:-}"
@@ -607,10 +597,8 @@ install_initrd_confidential() {
# For all nvidia_gpu targets we can customize the stack that is enabled
# in the VM by setting the NVIDIA_GPU_STACK= environment variable
#
# latest | lts | version
# -> use the latest and greatest driver,
# lts release or e.g. version=550.127.1
# driver -> enable open or closed drivers
# driver -> driver version is set via versions.yaml making sure kernel
# and rootfs builds are using the same version
# compute -> enable the compute GPU stack, includes utility
# graphics -> enable the graphics GPU stack, includes compute
# dcgm -> enable the DCGM stack + DCGM exporter
@@ -624,39 +612,43 @@ install_initrd_confidential() {
#
# The full stack can be enabled by setting all the options like:
#
# NVIDIA_GPU_STACK="latest,compute,dcgm,nvswitch,gpudirect"
# NVIDIA_GPU_STACK="compute,dcgm,nvswitch,gpudirect"
#
# Install NVIDIA GPU image
install_image_nvidia_gpu() {
export AGENT_POLICY
local version=$(get_from_kata_deps .externals.nvidia.driver.version)
EXTRA_PKGS="apt curl ${EXTRA_PKGS}"
NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"latest,compute,dcgm"}
NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"driver=${version},compute,dcgm"}
install_image "nvidia-gpu"
}
# Install NVIDIA GPU initrd
install_initrd_nvidia_gpu() {
export AGENT_POLICY
local version=$(get_from_kata_deps .externals.nvidia.driver.version)
EXTRA_PKGS="apt curl ${EXTRA_PKGS}"
NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"latest,compute,dcgm"}
NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"driver=${version},compute,dcgm"}
install_initrd "nvidia-gpu"
}
# Install NVIDIA GPU confidential image
install_image_nvidia_gpu_confidential() {
export AGENT_POLICY
local version=$(get_from_kata_deps .externals.nvidia.driver.version)
EXTRA_PKGS="apt curl ${EXTRA_PKGS}"
# TODO: export MEASURED_ROOTFS=yes
NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"latest,compute,dcgm"}
NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"driver=${version},compute,dcgm"}
install_image "nvidia-gpu-confidential"
}
# Install NVIDIA GPU confidential initrd
install_initrd_nvidia_gpu_confidential() {
export AGENT_POLICY
local version=$(get_from_kata_deps .externals.nvidia.driver.version)
EXTRA_PKGS="apt curl ${EXTRA_PKGS}"
# TODO: export MEASURED_ROOTFS=yes
NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"latest,compute,dcgm"}
NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"driver=${version},compute,dcgm"}
install_initrd "nvidia-gpu-confidential"
}
@@ -685,10 +677,10 @@ install_cached_kernel_tarball_component() {
case ${kernel_name} in
"kernel-nvidia-gpu"*"")
local kernel_headers_dir=$(get_kernel_headers_dir "${kernel_name}")
mkdir -p ${kernel_headers_dir} || true
tar --zstd -xvf ${workdir}/${kernel_name}/builddir/kata-static-${kernel_name}-headers.tar.zst -C "${kernel_headers_dir}" || return 1
;;& # fallthrough in the confidential case we need the modules.tar.zst and for every kernel-nvidia-gpu we need the headers
local modules_dir=$(get_kernel_modules_dir ${kernel_version} ${kernel_kata_config_version} ${build_target})
mkdir -p "${modules_dir}" || true
tar --strip-components=1 --zstd -xvf "${workdir}/kata-static-${kernel_name}-modules.tar.zst" -C "${modules_dir}" || return 1
;;
"kernel"*"-confidential")
local modules_dir=$(get_kernel_modules_dir ${kernel_version} ${kernel_kata_config_version} ${build_target})
mkdir -p "${modules_dir}" || true
@@ -726,17 +718,13 @@ install_kernel_helper() {
kernel_url="$(get_from_kata_deps .assets.kernel.nvidia-confidential.url)"
fi
if [[ "${kernel_name}" == "kernel"*"-confidential" ]]; then
local kernel_modules_tarball_name="kata-static-${kernel_name}-modules.tar.zst"
local kernel_modules_tarball_path="${workdir}/${kernel_modules_tarball_name}"
extra_tarballs="${kernel_modules_tarball_name}:${kernel_modules_tarball_path}"
fi
if [[ "${kernel_name}" == "kernel-nvidia-gpu*" ]]; then
local kernel_headers_tarball_name="kata-static-${kernel_name}-headers.tar.zst"
local kernel_headers_tarball_path="${workdir}/${kernel_headers_tarball_name}"
extra_tarballs+=" ${kernel_headers_tarball_name}:${kernel_headers_tarball_path}"
fi
case ${kernel_name} in
kernel-nvidia-gpu*|kernel*-confidential)
local kernel_modules_tarball_name="kata-static-${kernel_name}-modules.tar.zst"
local kernel_modules_tarball_path="${workdir}/${kernel_modules_tarball_name}"
extra_tarballs="${kernel_modules_tarball_name}:${kernel_modules_tarball_path}"
;;
esac
default_patches_dir="${repo_root_dir}/tools/packaging/kernel/patches"
@@ -791,7 +779,7 @@ install_kernel_nvidia_gpu_dragonball_experimental() {
install_kernel_helper \
"assets.kernel-dragonball-experimental" \
"kernel-dragonball-experimental" \
"-e -t dragonball -g nvidia -H deb"
"-e -t dragonball -g nvidia"
}
#Install GPU enabled kernel asset
@@ -799,7 +787,7 @@ install_kernel_nvidia_gpu() {
install_kernel_helper \
"assets.kernel.nvidia" \
"kernel-nvidia-gpu" \
"-g nvidia -H deb"
"-g nvidia"
}
#Install GPU and TEE enabled kernel asset
@@ -807,7 +795,7 @@ install_kernel_nvidia_gpu_confidential() {
install_kernel_helper \
"assets.kernel.nvidia-confidential" \
"kernel-nvidia-gpu-confidential" \
"-x -g nvidia -H deb"
"-x -g nvidia"
}
install_qemu_helper() {
@@ -1458,33 +1446,22 @@ handle_build() {
case ${build_target} in
kernel-nvidia-gpu*)
local kernel_headers_final_tarball_path="${workdir}/kata-static-${build_target}-headers.tar.zst"
if [ ! -f "${kernel_headers_final_tarball_path}" ]; then
local kernel_headers_dir
kernel_headers_dir=$(get_kernel_headers_dir "${build_target}")
local modules_final_tarball_path="${workdir}/kata-static-${build_target}-modules.tar.zst"
if [ ! -f "${modules_final_tarball_path}" ]; then
local modules_dir=$(get_kernel_modules_dir ${kernel_version} ${kernel_kata_config_version} ${build_target})
pushd "${kernel_headers_dir}"
find . -type f -name "*.${KERNEL_HEADERS_PKG_TYPE}" -exec tar -rvf kernel-headers.tar {} +
if [ -n "${KBUILD_SIGN_PIN}" ]; then
# For those 2 we can simply do a `|| true` as the signing_key.{pem,x509} are either:
# * already in ., as we're using a cached tarball
# * will be moved here, in case we had built the kernel
mv kata-linux-*/certs/signing_key.pem . || true
mv kata-linux-*/certs/signing_key.x509 . || true
parent_dir=$(dirname "${modules_dir}")
parent_dir_basename=$(basename "${parent_dir}")
# Then we can check for the key on ., as it should always be here on both cases
# (cached or built kernel).
head -n1 "signing_key.pem" | grep -q "ENCRYPTED PRIVATE KEY" || die "signing_key.pem is not encrypted"
pushd "${parent_dir}"
rm -f ${parent_dir_basename}/build
tar --zstd -cvf "${modules_final_tarball_path}" "."
tar -rvf kernel-headers.tar signing_key.pem signing_key.x509 --remove-files
fi
zstd -T0 kernel-headers.tar -o kernel-headers.tar.zst
mv kernel-headers.tar.zst "${kernel_headers_final_tarball_path}"
popd
fi
tar --zstd -tvf "${kernel_headers_final_tarball_path}"
;;& # fallthrough in the confidential case we need the modules.tar.zst and for every kernel-nvidia-gpu we need the headers
tar --zstd -tvf "${modules_final_tarball_path}"
;;
kernel*-confidential)
local modules_final_tarball_path="${workdir}/kata-static-${build_target}-modules.tar.zst"
if [ ! -f "${modules_final_tarball_path}" ]; then
@@ -1551,18 +1528,7 @@ handle_build() {
)
oci_image="${ARTEFACT_REGISTRY}/${ARTEFACT_REPOSITORY}/cached-artefacts/${build_target}:${normalized_tags}"
case ${build_target} in
kernel-nvidia-gpu)
files_to_push+=(
"kata-static-${build_target}-headers.tar.zst"
)
;;
kernel-nvidia-gpu-confidential)
files_to_push+=(
"kata-static-${build_target}-modules.tar.zst"
"kata-static-${build_target}-headers.tar.zst"
)
;;
kernel*-confidential)
kernel-nvidia-gpu*|kernel*-confidential)
files_to_push+=(
"kata-static-${build_target}-modules.tar.zst"
)
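
A minimal sketch (paths hypothetical) of the modules tarball round trip implied by the changes above: handle_build packages the parent of the per-version modules directory, and install_cached_kernel_tarball_component unpacks it with the leading path component stripped:

# build side: parent_dir contains the <kernel_version> modules directory
pushd "${parent_dir}"
tar --zstd -cvf "${workdir}/kata-static-kernel-nvidia-gpu-modules.tar.zst" .
popd

# cached-artifact side: strip the leading archive component when extracting into modules_dir
mkdir -p "${modules_dir}"
tar --strip-components=1 --zstd -xvf "${workdir}/kata-static-kernel-nvidia-gpu-modules.tar.zst" -C "${modules_dir}"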

View File

@@ -514,6 +514,18 @@ setup_kernel() {
cp "${kernel_config_path}" ./.config
ARCH=${arch_target} make oldconfig ${CROSS_BUILD_ARG}
)
info "Fetching NVIDIA driver source code"
if [[ "${gpu_vendor}" == "${VENDOR_NVIDIA}" ]]; then
driver_version=$(get_from_kata_deps .externals.nvidia.driver.version)
driver_url=$(get_from_kata_deps .externals.nvidia.driver.url)
driver_src="open-gpu-kernel-modules-${driver_version}"
info "Downloading NVIDIA driver source code from: ${driver_url}${driver_version}.tar.gz"
[[ -d "${driver_src}" ]] && rm -rf "${driver_src}"
curl -L -o "${driver_version}.tar.gz" "${driver_url}${driver_version}.tar.gz"
tar -xvf "${driver_version}.tar.gz" --transform "s|open-gpu-kernel-modules-${driver_version}|open-gpu-kernel-modules|"
fi
}
build_kernel() {
@@ -531,6 +543,13 @@ build_kernel() {
[ -e "vmlinux" ]
([ "${hypervisor_target}" == "firecracker" ] || [ "${hypervisor_target}" == "cloud-hypervisor" ]) && [ "${arch_target}" == "arm64" ] && [ -e "arch/${arch_target}/boot/Image" ]
popd >>/dev/null
if [[ "${gpu_vendor}" == "${VENDOR_NVIDIA}" ]]; then
pushd open-gpu-kernel-modules
make -j "$(nproc)" CC=gcc SYSSRC="${kernel_path}" > /dev/null
make INSTALL_MOD_STRIP=1 INSTALL_MOD_PATH=${kernel_path} -j "$(nproc)" CC=gcc SYSSRC="${kernel_path}" modules_install
make -j "$(nproc)" CC=gcc SYSSRC="${kernel_path}" clean > /dev/null
fi
}
build_kernel_headers() {

View File

@@ -1 +1 @@
174
175

View File

@@ -4,9 +4,12 @@
FROM ubuntu:22.04
ENV DEBIAN_FRONTEND=noninteractive
ENV INSTALL_IN_GOPATH=false
ARG ARCH
COPY install_yq.sh /usr/bin/install_yq.sh
# kernel deps
RUN apt-get update && \
apt-get install -y --no-install-recommends \
@@ -30,3 +33,5 @@ RUN apt-get update && \
python3 && \
if [ "${ARCH}" != "$(uname -m)" ]; then apt-get install --no-install-recommends -y gcc-"${ARCH}"-linux-gnu binutils-"${ARCH}"-linux-gnu; fi && \
apt-get clean && apt-get autoclean && rm -rf /var/lib/apt/lists/*
RUN install_yq.sh

View File

@@ -0,0 +1,118 @@
#!/usr/bin/env bash
#
# Copyright (c) 2019 IBM
#
# SPDX-License-Identifier: Apache-2.0
#
[[ -n "${DEBUG}" ]] && set -o xtrace
# If we fail for any reason a message will be displayed
die() {
msg="$*"
echo "ERROR: ${msg}" >&2
exit 1
}
function verify_yq_exists() {
local yq_path=$1
local yq_version=$2
local expected="yq (https://github.com/mikefarah/yq/) version ${yq_version}"
if [[ -x "${yq_path}" ]] && [[ "$(${yq_path} --version)"X == "${expected}"X ]]; then
return 0
else
return 1
fi
}
# Install the yq yaml query package from the mikefarah github repo
# Install via binary download, as we may not have golang installed at this point
function install_yq() {
local yq_pkg="github.com/mikefarah/yq"
local yq_version=v4.44.5
local precmd=""
local yq_path=""
INSTALL_IN_GOPATH=${INSTALL_IN_GOPATH:-true}
if [[ "${INSTALL_IN_GOPATH}" == "true" ]]; then
GOPATH=${GOPATH:-${HOME}/go}
mkdir -p "${GOPATH}/bin"
yq_path="${GOPATH}/bin/yq"
else
yq_path="/usr/local/bin/yq"
fi
if verify_yq_exists "${yq_path}" "${yq_version}"; then
echo "yq is already installed in correct version"
return
fi
if [[ "${yq_path}" == "/usr/local/bin/yq" ]]; then
# Check if we need sudo to install yq
if [[ ! -w "/usr/local/bin" ]]; then
# Check if we have sudo privileges
if ! sudo -n true 2>/dev/null; then
die "Please provide sudo privileges to install yq"
else
precmd="sudo"
fi
fi
fi
read -r -a sysInfo <<< "$(uname -sm)"
case "${sysInfo[0]}" in
"Linux" | "Darwin")
goos="${sysInfo[0],}"
;;
"*")
die "OS ${sysInfo[0]} not supported"
;;
esac
case "${sysInfo[1]}" in
"aarch64")
goarch=arm64
;;
"arm64")
# If we're on an apple silicon machine, just assign amd64.
# The version of yq we use doesn't have a darwin arm build,
# but Rosetta can come to the rescue here.
if [[ ${goos} == "Darwin" ]]; then
goarch=amd64
else
goarch=arm64
fi
;;
"riscv64")
goarch=riscv64
;;
"ppc64le")
goarch=ppc64le
;;
"x86_64")
goarch=amd64
;;
"s390x")
goarch=s390x
;;
"*")
die "Arch ${sysInfo[1]} not supported"
;;
esac
# Check curl
if ! command -v "curl" >/dev/null; then
die "Please install curl"
fi
## NOTE: ${var,,} => gives lowercase value of var
local yq_url="https://${yq_pkg}/releases/download/${yq_version}/yq_${goos}_${goarch}"
${precmd} curl -o "${yq_path}" -LSsf "${yq_url}" || die "Download ${yq_url} failed"
${precmd} chmod +x "${yq_path}"
if ! command -v "${yq_path}" >/dev/null; then
die "Cannot not get ${yq_path} executable"
fi
}
install_yq
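
Usage follows the Dockerfile change above; with INSTALL_IN_GOPATH=false the binary lands in /usr/local/bin, otherwise in ${GOPATH}/bin (a sketch, not part of the diff):

INSTALL_IN_GOPATH=false ./install_yq.sh
yq --version   # yq (https://github.com/mikefarah/yq/) version v4.44.5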

View File

@@ -215,11 +215,11 @@ assets:
nvidia:
description: "Linux kernel optimised for virtual machines"
url: "https://cdn.kernel.org/pub/linux/kernel/v6.x/"
version: "v6.12.64"
version: "v6.18.5"
nvidia-confidential:
description: "Linux kernel with x86_64 TEEs (SNP and TDX) support"
url: "https://cdn.kernel.org/pub/linux/kernel/v6.x/"
version: "v6.16.7"
version: "v6.18.5"
kernel-arm-experimental:
description: "Linux kernel with cpu/mem hotplug support on arm64"
@@ -245,6 +245,14 @@ externals:
version: "v0.0.1"
url: "https://github.com/NVIDIA/nvrc/releases/download/"
nvidia:
desc: "NVIDIA driver version"
driver:
version: "590.48.01"
url: "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/"
cuda:
url: "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/"
busybox:
desc: "The Swiss Army Knife of Embedded Linux"
version: "1.36.1"