gpu: introduce devkit build flag

Introduce a new devkit parameter which will produce a rootfs
without chisseling. This results in a larger rootfs with various
packages and binaries being included, for instance, enabling the
use of the debug console.

Signed-off-by: Manuel Huber <manuelh@nvidia.com>
This commit is contained in:
Manuel Huber
2025-11-06 23:40:30 +00:00
committed by Fabiano Fidêncio
parent 2c9e0f9f4f
commit 3966864376
4 changed files with 130 additions and 41 deletions

View File

@@ -4,7 +4,6 @@
#
# SPDX-License-Identifier: Apache-2.0
#!/bin/bash
set -euo pipefail
[[ -n "${DEBUG}" ]] && set -x
@@ -164,6 +163,10 @@ build_nvidia_drivers() {
# went wrong make sure the signing_key.pem is removed
[[ -e "${signing_key}" ]] && rm -f "${signing_key}"
done
# Save the modules for later so that a linux-image purge does not remove them
tar cvfa /lib/modules.save_from_purge.tar.zst /lib/modules
popd >> /dev/null
}
@@ -373,6 +376,65 @@ install_nvidia_dcgm() {
datacenter-gpu-manager-exporter
}
# Shrink the stage-one rootfs after the NVIDIA components are installed:
# pin the runtime packages that must survive, purge build-time packages
# (kernel headers/images, toolchain), refresh module metadata and the ld
# cache, and finally restore the kernel modules saved before the purge.
# Globals (read): driver_version, driver_type
cleanup_rootfs() {
echo "chroot: Cleanup NVIDIA GPU rootfs"
# Hold core runtime libraries so the `apt autoremove` below cannot drop them.
apt-mark hold libstdc++6 libzstd1 libgnutls30t64 pciutils
if [[ -n "${driver_version}" ]]; then
# Pin the versioned NVIDIA userspace stack for the selected driver release.
apt-mark hold libnvidia-cfg1-"${driver_version}"-server \
nvidia-utils-"${driver_version}"-server \
nvidia-kernel-common-"${driver_version}"-server \
nvidia-imex-"${driver_version}" \
nvidia-compute-utils-"${driver_version}"-server \
libnvidia-compute-"${driver_version}"-server \
libnvidia-gl-"${driver_version}"-server \
libnvidia-extra-"${driver_version}"-server \
libnvidia-decode-"${driver_version}"-server \
libnvidia-fbc1-"${driver_version}"-server \
libnvidia-encode-"${driver_version}"-server \
libnvidia-nscq-"${driver_version}" \
linuxptp libnftnl11
fi
# NOTE(review): under `set -euo pipefail` these greps abort the script when
# no matching package is installed — presumably headers/images are always
# present at this stage; confirm, or append `|| true`.
kernel_headers=$(dpkg --get-selections | cut -f1 | grep linux-headers)
linux_images=$(dpkg --get-selections | cut -f1 | grep linux-image)
for i in ${kernel_headers} ${linux_images}; do
apt purge -yqq "${i}"
done
# Drop build-only tooling; the modules were tar'ed away earlier and are
# restored at the end of this function.
apt purge -yqq jq make gcc xz-utils linux-libc-dev
if [[ -n "${driver_version}" ]]; then
apt purge -yqq nvidia-headless-no-dkms-"${driver_version}"-server"${driver_type}" \
nvidia-kernel-source-"${driver_version}"-server"${driver_type}"
fi
apt autoremove -yqq
apt clean
apt autoclean
# Make the installed modules resolvable under the build host's kernel
# version (uname -r) and rebuild modules.dep/modules.alias.
# NOTE(review): with more than one directory in /lib/modules the symlink is
# overwritten each iteration (last one wins), and `depmod -a` inside the
# loop repeats work — confirm a single-version layout is guaranteed here.
for modules_version in /lib/modules/*; do
ln -sf "${modules_version}" /lib/modules/"$(uname -r)"
touch "${modules_version}"/modules.order
touch "${modules_version}"/modules.builtin
depmod -a
done
# Remove apt/dpkg caches, logs and leftover NVIDIA host tools to slim the image.
rm -rf /var/lib/apt/lists/* /var/cache/apt/* /var/log/apt /var/cache/debconf
rm -f /etc/apt/sources.list
rm -f /usr/bin/nvidia-ngx-updater /usr/bin/nvidia-container-runtime
rm -f /var/log/{nvidia-installer.log,dpkg.log,alternatives.log}
# Clear and regenerate the ld cache
rm -f /etc/ld.so.cache
ldconfig
# Restore the modules saved in build_nvidia_drivers() before the
# linux-image purge above could remove them.
tar xvf /lib/modules.save_from_purge.tar.zst -C /
rm -f /lib/modules.save_from_purge.tar.zst
}
# Start of script
echo "chroot: Setup NVIDIA GPU rootfs stage one"
@@ -387,3 +449,4 @@ install_nvidia_fabricmanager
install_nvidia_ctk
export_driver_version
install_nvidia_dcgm
cleanup_rootfs

View File

@@ -207,7 +207,7 @@ chisseled_compute() {
cp -a "${stage_one}/${libdir}"/libcuda.so.* lib/"${machine_arch}"-linux-gnu/.
cp -a "${stage_one}/${libdir}"/libnvidia-cfg.so.* lib/"${machine_arch}"-linux-gnu/.
# basich GPU admin tools
# basic GPU admin tools
cp -a "${stage_one}"/usr/bin/nvidia-persistenced bin/.
cp -a "${stage_one}"/usr/bin/nvidia-smi bin/.
cp -a "${stage_one}"/usr/bin/nvidia-ctk bin/.
@@ -221,6 +221,17 @@ chisseled_gpudirect() {
exit 1
}
setup_nvrc_init_symlinks() {
    # Point both init entry points at the NVRC binary so NVRC becomes the
    # init process for the initrd (/init) and the image (/sbin/init) case.
    # $1 (optional): rootfs type; non-empty values select "NVRC-<type>".
    # Reads global: machine_arch.
    local rootfs_type=${1:-""}
    local suffix=""
    [[ -n "${rootfs_type}" ]] && suffix="-${rootfs_type}"
    local triple="${machine_arch}-unknown-linux-musl"
    local nvrc_path="/bin/NVRC${suffix}-${triple}"
    local link
    for link in init sbin/init; do
        ln -sf "${nvrc_path}" "${link}"
    done
}
chisseled_init() {
local rootfs_type=${1:-""}
@@ -244,9 +255,7 @@ chisseled_init() {
cp -a "${stage_one}/bin/${bin}-${target}".cert bin/.
cp -a "${stage_one}/bin/${bin}-${target}".sig bin/.
# make sure NVRC is the init process for the initrd and image case
ln -sf /bin/"${bin}-${target}" init
ln -sf /bin/"${bin}-${target}" sbin/init
setup_nvrc_init_symlinks "${rootfs_type}"
cp -a "${stage_one}"/usr/bin/kata-agent usr/bin/.
if [[ "${AGENT_POLICY}" == "yes" ]]; then
@@ -270,11 +279,24 @@ compress_rootfs() {
# For some unobvious reason libc has executable bit set
# clean this up otherwise the find -executable will not work correctly
find . -type f -name "*.so.*" | while IFS= read -r file; do
if ! file "${file}" | grep -q ELF; then
echo "nvidia: skip stripping file: ${file} ($(file -b "${file}"))"
continue
fi
chmod -x "${file}"
strip "${file}"
done
find . -type f -executable | while IFS= read -r file; do
# Skip files with setuid/setgid bits (UPX refuses to pack them)
if [ -u "${file}" ] || [ -g "${file}" ]; then
echo "nvidia: skip compressing executable (special permissions): ${file} ($(file -b "${file}"))"
continue
fi
if ! file "${file}" | grep -q ELF; then
echo "nvidia: skip compressing executable (not ELF): ${file} ($(file -b "${file}"))"
continue
fi
strip "${file}"
"${BUILD_DIR}"/upx-4.2.4-"${distro_arch}"_linux/upx --best --lzma "${file}"
done
@@ -287,7 +309,6 @@ compress_rootfs() {
[[ ${machine_arch} == "x86_64" ]] && libdir="lib64"
chmod +x "${libdir}"/ld-linux-*
}
coco_guest_components() {
@@ -315,56 +336,57 @@ coco_guest_components() {
info "TODO: nvidia: luks-encrypt-storage is a bash script, we do not have a shell!"
}
toggle_debug() {
    # Export DEBUG=true when the NVIDIA_GPU_STACK component list contains
    # the standalone word "debug" (e.g. "compute,debug"); substrings such
    # as "debugging" do not match (word-boundary match, same as \<debug\>).
    # Uses a here-string instead of an `echo | grep` pipeline and defaults
    # to empty so the check is safe under `set -u` when the variable is
    # unset.
    if grep -qw "debug" <<< "${NVIDIA_GPU_STACK:-}"; then
        export DEBUG="true"
    fi
}
# Stage two of the NVIDIA GPU rootfs build: either reuse the full
# stage-one rootfs verbatim (devkit mode, no chiseling — larger image)
# or chisel out only the requested stack components, then strip and
# compress the result.
# $1 (optional): rootfs type, forwarded to the NVRC init symlink setup.
# Globals (read): ROOTFS_DIR, NVIDIA_GPU_STACK, stage_one, plus the
# chisseled_* helpers defined elsewhere in this file.
# NOTE(review): several statements below appear twice (the echo/pushd/tar
# preamble, chisseled_init/chisseled_iptables, the component loop, and
# coco_guest_components) — this looks like an old/new diff-render
# interleave rather than intended control flow; verify against the
# actual file before relying on this listing.
setup_nvidia_gpu_rootfs_stage_two() {
readonly stage_two="${ROOTFS_DIR:?}"
readonly stack="${NVIDIA_GPU_STACK:?}"
readonly type=${1:-""}
echo "nvidia: chisseling the following stack components: ${stack}"
# If devkit flag is set, skip chisseling, use stage_one
if echo "${stack}" | grep -q '\<devkit\>'; then
echo "nvidia: devkit mode enabled - skip chisseling"
# Unpack the complete stage-one rootfs into stage two unchanged.
tar -C "${stage_two}" -xf "${stage_one}".tar.zst
[[ -e "${stage_one}" ]] && rm -rf "${stage_one}"
[[ ! -e "${stage_one}" ]] && mkdir -p "${stage_one}"
pushd "${stage_two}" >> /dev/null
tar -C "${stage_one}" -xf "${stage_one}".tar.zst
# Only step needed from stage_two (see chisseled_init)
setup_nvrc_init_symlinks "${type}"
else
echo "nvidia: chisseling the following stack components: ${stack}"
# Re-extract a pristine stage-one tree to cherry-pick files from.
[[ -e "${stage_one}" ]] && rm -rf "${stage_one}"
[[ ! -e "${stage_one}" ]] && mkdir -p "${stage_one}"
pushd "${stage_two}" >> /dev/null
tar -C "${stage_one}" -xf "${stage_one}".tar.zst
toggle_debug
chisseled_init "${type}"
chisseled_iptables
pushd "${stage_two}" >> /dev/null
# Split the comma-separated stack spec into its individual components.
IFS=',' read -r -a stack_components <<< "${NVIDIA_GPU_STACK}"
chisseled_init "${type}"
chisseled_iptables
for component in "${stack_components[@]}"; do
if [[ "${component}" = "compute" ]]; then
echo "nvidia: processing \"compute\" component"
chisseled_compute
elif [[ "${component}" = "dcgm" ]]; then
echo "nvidia: processing DCGM component"
chisseled_dcgm
elif [[ "${component}" = "nvswitch" ]]; then
echo "nvidia: processing NVSwitch component"
chisseled_nvswitch
elif [[ "${component}" = "gpudirect" ]]; then
echo "nvidia: processing GPUDirect component"
chisseled_gpudirect
fi
done
IFS=',' read -r -a stack_components <<< "${NVIDIA_GPU_STACK}"
coco_guest_components
for component in "${stack_components[@]}"; do
if [[ "${component}" = "compute" ]]; then
echo "nvidia: processing \"compute\" component"
chisseled_compute
elif [[ "${component}" = "dcgm" ]]; then
echo "nvidia: processing DCGM component"
chisseled_dcgm
elif [[ "${component}" = "nvswitch" ]]; then
echo "nvidia: processing NVSwitch component"
chisseled_nvswitch
elif [[ "${component}" = "gpudirect" ]]; then
echo "nvidia: processing GPUDirect component"
chisseled_gpudirect
fi
done
coco_guest_components
fi
# Common tail for both paths: strip/UPX-compress and refresh ld cache.
compress_rootfs
chroot . ldconfig
popd >> /dev/null
popd >> /dev/null
}

View File

@@ -51,6 +51,7 @@ RUN apt-get update && \
pip \
python3-dev \
libclang-dev \
file \
zstd && \
apt-get clean && rm -rf /var/lib/apt/lists/&& \
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain ${RUST_TOOLCHAIN}

View File

@@ -570,13 +570,16 @@ install_initrd_confidential() {
# -> use the latest and greatest driver,
# lts release or e.g. version=550.127.1
# driver -> enable open or closed drivers
# debug -> enable debugging support
# compute -> enable the compute GPU stack, includes utility
# graphics -> enable the graphics GPU stack, includes compute
# dcgm -> enable the DCGM stack + DCGM exporter
# nvswitch -> enable DGX like systems
# gpudirect -> enable use-cases like GPUDirect RDMA, GPUDirect GDS
# dragonball -> enable dragonball support
# devkit -> builds a developer kit image, resulting in a larger
# rootfs size. May require incrementing the
# default_memory allocation and with this, potentially
# podOverhead. Experimental. Not for use in production
#
# The full stack can be enabled by setting all the options like:
#