Merge pull request #10464 from zvonkok/nvidia-gpu-rootfs

gpu: NVIDIA GPU initrd/image build
Zvonko Kaiser 2024-11-25 16:16:42 -05:00 committed by GitHub
commit c3d1b3c5e3
7 changed files with 755 additions and 10 deletions


@ -0,0 +1,339 @@
#!/usr/bin/env bash
#
# Copyright (c) 2024 NVIDIA Corporation
#
# SPDX-License-Identifier: Apache-2.0
set -xe
shopt -s nullglob
shopt -s extglob
run_file_name=$2
run_fm_file_name=$3
arch_target=$4
driver_version="$5"
driver_type="open"
supported_gpu_devids="/supported-gpu.devids"
APT_INSTALL="apt -o Dpkg::Options::='--force-confdef' -o Dpkg::Options::='--force-confold' -yqq --no-install-recommends install"
export DEBIAN_FRONTEND=noninteractive
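# Note: APT_INSTALL is expanded with eval throughout this script so that the
# quoted dpkg options above are re-split into separate arguments, e.g.
# (package name illustrative):
#   eval "${APT_INSTALL}" curl
# runs: apt -o Dpkg::Options::='--force-confdef' -o Dpkg::Options::='--force-confold' \
#       -yqq --no-install-recommends install curl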
install_nvidia_ctk() {
echo "chroot: Installing NVIDIA GPU container runtime"
apt list nvidia-container-toolkit-base -a
# The base package provides nvidia-ctk and the nvidia-container-runtime
eval "${APT_INSTALL}" nvidia-container-toolkit-base
}
install_nvidia_fabricmanager() {
# If the fabricmanager run file exists install from it, otherwise use the distribution packages
if [ -f /"${run_fm_file_name}" ]; then
install_nvidia_fabricmanager_from_run_file
else
install_nvidia_fabricmanager_from_distribution
fi
}
install_nvidia_fabricmanager_from_run_file() {
echo "chroot: Install NVIDIA fabricmanager from run file"
pushd / >> /dev/null
chmod +x "${run_fm_file_name}"
./"${run_fm_file_name}" --nox11
popd >> /dev/null
}
install_nvidia_fabricmanager_from_distribution() {
echo "chroot: Install NVIDIA fabricmanager from distribution"
eval "${APT_INSTALL}" nvidia-fabricmanager-"${driver_version}" libnvidia-nscq-"${driver_version}"
apt-mark hold nvidia-fabricmanager-"${driver_version}" libnvidia-nscq-"${driver_version}"
}
build_nvidia_drivers() {
echo "chroot: Build NVIDIA drivers"
pushd "${driver_source_files}" >> /dev/null
local kernel_version
for version in /lib/modules/*; do
kernel_version=$(basename "${version}")
echo "chroot: Building GPU modules for: ${kernel_version}"
cp /boot/System.map-"${kernel_version}" /lib/modules/"${kernel_version}"/build/System.map
if [ "${arch_target}" == "aarch64" ]; then
ln -sf /lib/modules/"${kernel_version}"/build/arch/arm64 /lib/modules/"${kernel_version}"/build/arch/aarch64
fi
if [ "${arch_target}" == "x86_64" ]; then
ln -sf /lib/modules/"${kernel_version}"/build/arch/x86 /lib/modules/"${kernel_version}"/build/arch/amd64
fi
make -j "$(nproc)" CC=gcc SYSSRC=/lib/modules/"${kernel_version}"/build > /dev/null
make INSTALL_MOD_STRIP=1 -j "$(nproc)" CC=gcc SYSSRC=/lib/modules/"${kernel_version}"/build modules_install
make -j "$(nproc)" CC=gcc SYSSRC=/lib/modules/"${kernel_version}"/build clean > /dev/null
done
# Save the modules for later so that a linux-image purge does not remove them
tar cvfa /lib/modules.save_from_purge.tar.zst /lib/modules
popd >> /dev/null
}
install_userspace_components() {
if [ ! -f /"${run_file_name}" ]; then
echo "chroot: Skipping NVIDIA userspace runfile components installation"
return
fi
pushd /NVIDIA-* >> /dev/null
# On aarch64 we need to omit --no-install-compat32-libs
if [ "${arch_target}" == "aarch64" ]; then
./nvidia-installer --no-kernel-modules --no-systemd --no-nvidia-modprobe -s --x-prefix=/root
else
./nvidia-installer --no-kernel-modules --no-systemd --no-nvidia-modprobe -s --x-prefix=/root --no-install-compat32-libs
fi
popd >> /dev/null
}
prepare_run_file_drivers() {
if [ "${driver_version}" == "latest" ]; then
driver_version=""
echo "chroot: Resetting driver version not supported with run-file"
elif [ "${driver_version}" == "lts" ]; then
driver_version=""
echo "chroot: Resetting driver version not supported with run-file"
fi
echo "chroot: Prepare NVIDIA run file drivers"
pushd / >> /dev/null
chmod +x "${run_file_name}"
./"${run_file_name}" -x
mkdir -p /usr/share/nvidia/rim/
# Sooner or later RIM files will be only available remotely
RIMFILE=$(ls NVIDIA-*/RIM_GH100PROD.swidtag)
if [ -e "${RIMFILE}" ]; then
cp NVIDIA-*/RIM_GH100PROD.swidtag /usr/share/nvidia/rim/.
fi
popd >> /dev/null
}
prepare_distribution_drivers() {
if [ "${driver_version}" == "latest" ]; then
driver_version=$(apt-cache search --names-only 'nvidia-headless-no-dkms-.?.?.?-open' | awk '{ print $1 }' | tail -n 1 | cut -d'-' -f5)
elif [ "${driver_version}" == "lts" ]; then
driver_version="550"
fi
echo "chroot: Prepare NVIDIA distribution drivers"
eval "${APT_INSTALL}" nvidia-headless-no-dkms-"${driver_version}-${driver_type}" \
libnvidia-cfg1-"${driver_version}" \
nvidia-compute-utils-"${driver_version}" \
nvidia-utils-"${driver_version}" \
nvidia-kernel-common-"${driver_version}" \
nvidia-imex-"${driver_version}" \
libnvidia-compute-"${driver_version}" \
libnvidia-compute-"${driver_version}" \
libnvidia-gl-"${driver_version}" \
libnvidia-extra-"${driver_version}" \
libnvidia-decode-"${driver_version}" \
libnvidia-fbc1-"${driver_version}" \
libnvidia-encode-"${driver_version}"
}
prepare_nvidia_drivers() {
local driver_source_dir=""
if [ -f /"${run_file_name}" ]; then
prepare_run_file_drivers
for source_dir in /NVIDIA-*; do
if [ -d "${source_dir}" ]; then
driver_source_files="${source_dir}"/kernel-${driver_type}
driver_source_dir="${source_dir}"
break
fi
done
get_supported_gpus_from_run_file "${driver_source_dir}"
else
prepare_distribution_drivers
for source_dir in /usr/src/nvidia*; do
if [ -d "${source_dir}" ]; then
driver_source_files="${source_dir}"
driver_source_dir="${source_dir}"
break
fi
done
get_supported_gpus_from_distro_drivers "${driver_source_dir}"
fi
}
install_build_dependencies() {
echo "chroot: Install NVIDIA drivers build dependencies"
eval "${APT_INSTALL}" make gcc gawk kmod libvulkan1 pciutils jq zstd linuxptp
}
setup_apt_repositories() {
echo "chroot: Setup APT repositories"
mkdir -p /var/cache/apt/archives/partial
mkdir -p /var/log/apt
mkdir -p /var/lib/dpkg/info
mkdir -p /var/lib/dpkg/updates
mkdir -p /var/lib/dpkg/alternatives
mkdir -p /var/lib/dpkg/triggers
mkdir -p /var/lib/dpkg/parts
touch /var/lib/dpkg/status
rm -f /etc/apt/sources.list.d/*
if [ "${arch_target}" == "aarch64" ]; then
cat <<-'CHROOT_EOF' > /etc/apt/sources.list.d/jammy.list
deb http://ports.ubuntu.com/ubuntu-ports/ jammy main restricted universe multiverse
deb http://ports.ubuntu.com/ubuntu-ports/ jammy-updates main restricted universe multiverse
deb http://ports.ubuntu.com/ubuntu-ports/ jammy-security main restricted universe multiverse
deb http://ports.ubuntu.com/ubuntu-ports/ jammy-backports main restricted universe multiverse
CHROOT_EOF
else
cat <<-'CHROOT_EOF' > /etc/apt/sources.list.d/noble.list
deb http://us.archive.ubuntu.com/ubuntu/ jammy main restricted universe multiverse
deb http://us.archive.ubuntu.com/ubuntu/ jammy-updates main restricted universe multiverse
deb http://us.archive.ubuntu.com/ubuntu/ jammy-security main restricted universe multiverse
deb http://us.archive.ubuntu.com/ubuntu/ jammy-backports main restricted universe multiverse
CHROOT_EOF
fi
apt update
eval "${APT_INSTALL}" curl gpg ca-certificates
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list |
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' |
tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
apt update
}
install_kernel_dependencies() {
dpkg -i /linux-*deb
}
get_supported_gpus_from_run_file() {
local source_dir="$1"
local supported_gpus_json="${source_dir}"/supported-gpus/supported-gpus.json
jq . < "${supported_gpus_json}" | grep '"devid"' | awk '{ print $2 }' | tr -d ',"' > ${supported_gpu_devids}
}
get_supported_gpus_from_distro_drivers() {
local supported_gpus_json=/usr/share/doc/nvidia-kernel-common-"${driver_version}"/supported-gpus.json
jq . < "${supported_gpus_json}" | grep '"devid"' | awk '{ print $2 }' | tr -d ',"' > ${supported_gpu_devids}
}
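# Both helpers above reduce supported-gpus.json to one PCI device ID per line
# in /supported-gpu.devids. An illustrative (assumed) entry in that JSON looks
# roughly like:
#   { "devid": "0x2330", "name": "NVIDIA H100 80GB HBM3", ... }
# which yields the line 0x2330 in the output file.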
export_driver_version() {
for modules_version in /lib/modules/*; do
modinfo "${modules_version}"/kernel/drivers/video/nvidia.ko | grep ^version | awk '{ print $2 }' > /nvidia_driver_version
break
done
}
install_nvidia_dcgm() {
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
dpkg -i cuda-keyring_1.0-1_all.deb && rm -f cuda-keyring_1.0-1_all.deb
if [ "${arch_target}" == "aarch64" ]; then
cat <<-'CHROOT_EOF' > /etc/apt/sources.list.d/cuda.list
deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/ /
CHROOT_EOF
else
cat <<-'CHROOT_EOF' > /etc/apt/sources.list.d/cuda.list
deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /
CHROOT_EOF
fi
apt update
eval "${APT_INSTALL}" datacenter-gpu-manager
}
cleanup_rootfs() {
echo "chroot: Cleanup NVIDIA GPU rootfs"
apt-mark hold libstdc++6 libzstd1 libgnutls30 pciutils
# On noble the libgnutls30 package is named libgnutls30t64
if [ -n "${driver_version}" ]; then
apt-mark hold libnvidia-cfg1-"${driver_version}" \
nvidia-compute-utils-"${driver_version}" \
nvidia-utils-"${driver_version}" \
nvidia-kernel-common-"${driver_version}" \
nvidia-imex-"${driver_version}" \
libnvidia-compute-"${driver_version}" \
libnvidia-compute-"${driver_version}" \
libnvidia-gl-"${driver_version}" \
libnvidia-extra-"${driver_version}" \
libnvidia-decode-"${driver_version}" \
libnvidia-fbc1-"${driver_version}" \
libnvidia-encode-"${driver_version}" \
libnvidia-nscq-"${driver_version}" \
linuxptp libnftnl11
fi
kernel_headers=$(dpkg --get-selections | cut -f1 | grep linux-headers)
linux_images=$(dpkg --get-selections | cut -f1 | grep linux-image)
for i in ${kernel_headers} ${linux_images}; do
apt purge -yqq "${i}"
done
apt purge -yqq jq make gcc wget libc6-dev git xz-utils curl gpg \
python3-pip software-properties-common ca-certificates \
linux-libc-dev nuitka python3-minimal cuda-keyring
if [ -n "${driver_version}" ]; then
apt purge -yqq nvidia-headless-no-dkms-"${driver_version}-${driver_type}" \
nvidia-kernel-source-"${driver_version}-${driver_type}"
fi
apt autoremove -yqq
apt clean
apt autoclean
for modules_version in /lib/modules/*; do
ln -sf "${modules_version}" /lib/modules/"$(uname -r)"
touch "${modules_version}"/modules.order
touch "${modules_version}"/modules.builtin
depmod -a
done
rm -rf /etc/apt/sources.list* /var/lib/apt /var/log/apt /var/cache/debconf
rm -f /usr/bin/nvidia-ngx-updater /usr/bin/nvidia-container-runtime
rm -f /var/log/{nvidia-installer.log,dpkg.log,alternatives.log}
# Clear and regenerate the ld cache
rm -f /etc/ld.so.cache
ldconfig
tar xvf /lib/modules.save_from_purge.tar.zst -C /
}
# Start of script
echo "chroot: Setup NVIDIA GPU rootfs stage one"
setup_apt_repositories
install_kernel_dependencies
install_build_dependencies
prepare_nvidia_drivers
build_nvidia_drivers
install_userspace_components
install_nvidia_fabricmanager
install_nvidia_ctk
export_driver_version
install_nvidia_dcgm
cleanup_rootfs


@ -0,0 +1,348 @@
#!/usr/bin/env bash
#
# Copyright (c) 2024 NVIDIA Corporation
#
# SPDX-License-Identifier: Apache-2.0
set -e
set -x
readonly BUILD_DIR="/kata-containers/tools/packaging/kata-deploy/local-build/build/"
# Assign through an intermediate variable so a failure of the command substitution is not masked by the readonly declaration
script_dir="$(dirname "$(readlink -f "$0")")"
readonly SCRIPT_DIR="${script_dir}/nvidia"
# This controls how much output the initrd/image build will produce
DEBUG=""
setup_nvidia-nvrc() {
local TARGET="nvidia-nvrc"
local PROJECT="nvrc"
local TARGET_BUILD_DIR="${BUILD_DIR}/${TARGET}/builddir"
local TARGET_DEST_DIR="${BUILD_DIR}/${TARGET}/destdir"
local TARBALL="${BUILD_DIR}/kata-static-${TARGET}.tar.zst"
mkdir -p "${TARGET_BUILD_DIR}"
mkdir -p "${TARGET_DEST_DIR}/bin"
pushd "${TARGET_BUILD_DIR}" > /dev/null || exit 1
rm -rf "${PROJECT}"
git clone https://github.com/NVIDIA/${PROJECT}.git
pushd "${PROJECT}" > /dev/null || exit 1
cargo build --release --target=x86_64-unknown-linux-musl
cp target/x86_64-unknown-linux-musl/release/NVRC ../../destdir/bin/.
popd > /dev/null || exit 1
tar cvfa "${TARBALL}" -C ../destdir .
tar tvf "${TARBALL}"
popd > /dev/null || exit 1
}
setup_nvidia-gpu-admin-tools() {
local TARGET="nvidia-gpu-admin-tools"
local TARGET_GIT="https://github.com/NVIDIA/gpu-admin-tools"
local TARGET_BUILD_DIR="${BUILD_DIR}/${TARGET}/builddir"
local TARGET_DEST_DIR="${BUILD_DIR}/${TARGET}/destdir"
local TARBALL="${BUILD_DIR}/kata-static-${TARGET}.tar.zst"
mkdir -p "${TARGET_BUILD_DIR}"
mkdir -p "${TARGET_DEST_DIR}/sbin"
pushd "${TARGET_BUILD_DIR}" > /dev/null || exit 1
rm -rf "$(basename ${TARGET_GIT})"
git clone ${TARGET_GIT}
rm -rf dist
# Installed via pipx local python environment
"${HOME}"/local/bin/pyinstaller -s -F gpu-admin-tools/nvidia_gpu_tools.py
cp dist/nvidia_gpu_tools ../destdir/sbin/.
tar cvfa "${TARBALL}" -C ../destdir .
tar tvf "${TARBALL}"
popd > /dev/null || exit 1
}
setup_nvidia-dcgm-exporter() {
local TARGET="nvidia-dcgm-exporter"
local TARGET_BUILD_DIR="${BUILD_DIR}/${TARGET}/builddir"
local TARGET_DEST_DIR="${BUILD_DIR}/${TARGET}/destdir"
local TARBALL="${BUILD_DIR}/kata-static-${TARGET}.tar.zst"
mkdir -p "${TARGET_BUILD_DIR}"
mkdir -p "${TARGET_DEST_DIR}/bin"
mkdir -p "${TARGET_DEST_DIR}/etc"
pushd "${TARGET_BUILD_DIR}" > /dev/null || exit 1
local dex="dcgm-exporter"
rm -rf "${dex}"
git clone https://github.com/NVIDIA/${dex}
make -C ${dex} binary
mkdir -p ../destdir/bin
mkdir -p ../destdir/etc/${dex}
cp ${dex}/cmd/${dex}/${dex} ../destdir/bin/.
cp ${dex}/etc/*.csv ../destdir/etc/${dex}/.
tar cvfa "${TARBALL}" -C ../destdir .
tar tvf "${TARBALL}"
popd > /dev/null || exit 1
}
setup_nvidia_gpu_rootfs_stage_one() {
if [ -e "${BUILD_DIR}/kata-static-nvidia-gpu-rootfs-stage-one.tar.zst" ]; then
info "nvidia: GPU rootfs stage one already exists"
return
fi
pushd "${ROOTFS_DIR:?}" >> /dev/null
local rootfs_type=${1:-""}
info "nvidia: Setup GPU rootfs type=$rootfs_type"
for component in "nvidia-gpu-admin-tools" "nvidia-dcgm-exporter" "nvidia-nvrc"; do
if [ ! -e "${BUILD_DIR}/kata-static-${component}.tar.zst" ]; then
setup_${component}
fi
done
cp "${SCRIPT_DIR}/nvidia_chroot.sh" ./nvidia_chroot.sh
chmod +x ./nvidia_chroot.sh
local appendix=""
if [ "$rootfs_type" == "confidential" ]; then
appendix="-${rootfs_type}"
fi
if echo "$NVIDIA_GPU_STACK" | grep -q '\<dragonball\>'; then
appendix="-dragonball-experimental"
fi
# We need the kernel packages to build the drivers cleanly; they will be
# purged and removed from the rootfs once the build finishes.
tar -xvf ${BUILD_DIR}/kata-static-kernel-nvidia-gpu"${appendix}"-headers.tar.xz -C .
# If we find a locally downloaded run file, build the kernel modules
# with it; otherwise use the distribution packages. Run files may have
# more recent drivers available than the distribution packages.
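# A hypothetical way to stage such a run file before the build (the driver
# file name below is illustrative):
#   cp NVIDIA-Linux-x86_64-550.90.07.run "${BUILD_DIR}/nvidia-driver.run"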
local run_file_name="nvidia-driver.run"
if [ -f ${BUILD_DIR}/${run_file_name} ]; then
cp -L ${BUILD_DIR}/${run_file_name} ./${run_file_name}
fi
local run_fm_file_name="nvidia-fabricmanager.run"
if [ -f ${BUILD_DIR}/${run_fm_file_name} ]; then
cp -L ${BUILD_DIR}/${run_fm_file_name} ./${run_fm_file_name}
fi
mount --rbind /dev ./dev
mount --make-rslave ./dev
mount -t proc /proc ./proc
local driver_version="latest"
if echo "$NVIDIA_GPU_STACK" | grep -q '\<latest\>'; then
driver_version="latest"
elif echo "$NVIDIA_GPU_STACK" | grep -q '\<lts\>'; then
driver_version="lts"
fi
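# The arguments below map onto the positional parameters read at the top of
# nvidia_chroot.sh: $2 driver run file, $3 fabricmanager run file,
# $4 target architecture, $5 driver version; the kernel version is passed
# first as $1.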
chroot . /bin/bash -c "/nvidia_chroot.sh $(uname -r) ${run_file_name} ${run_fm_file_name} ${ARCH} ${driver_version}"
umount -R ./dev
umount ./proc
rm ./nvidia_chroot.sh
rm ./*.deb
tar cfa "${BUILD_DIR}"/kata-static-rootfs-nvidia-gpu-stage-one.tar.zst --remove-files -- *
popd >> /dev/null
pushd "${BUILD_DIR}" >> /dev/null
curl -LO https://github.com/upx/upx/releases/download/v4.2.4/upx-4.2.4-amd64_linux.tar.xz
tar xvf upx-4.2.4-amd64_linux.tar.xz
popd >> /dev/null
}
chisseled_iptables() {
echo "nvidia: chisseling iptables"
cp -a "${stage_one}"/usr/sbin/xtables-nft-multi sbin/.
ln -s ../sbin/xtables-nft-multi sbin/iptables-restore
ln -s ../sbin/xtables-nft-multi sbin/iptables-save
libdir="lib/x86_64-linux-gnu"
cp -a "${stage_one}"/${libdir}/libmnl.so.0* lib/.
libdir="usr/lib/x86_64-linux-gnu"
cp -a "${stage_one}"/${libdir}/libnftnl.so.11* lib/.
cp -a "${stage_one}"/${libdir}/libxtables.so.12* lib/.
}
chisseled_nvswitch() {
echo "nvidia: chisseling NVSwitch"
echo "nvidia: not implemented yet"
exit 1
}
chisseled_dcgm() {
echo "nvidia: chisseling DCGM"
mkdir -p etc/dcgm-exporter
libdir="lib/x86_64-linux-gnu"
cp -a "${stage_one}"/usr/${libdir}/libdcgm.* ${libdir}/.
cp -a "${stage_one}"/${libdir}/libgcc_s.so.1* ${libdir}/.
cp -a "${stage_one}"/usr/bin/nv-hostengine bin/.
tar xvf "${BUILD_DIR}"/kata-static-nvidia-dcgm-exporter.tar.zst -C .
}
# compute always includes utility by default
chisseled_compute() {
echo "nvidia: chisseling GPU"
cp -a "${stage_one}"/nvidia_driver_version .
tar xvf "${BUILD_DIR}"/kata-static-nvidia-gpu-admin-tools.tar.zst -C .
cp -a "${stage_one}"/lib/modules/* lib/modules/.
libdir="lib/x86_64-linux-gnu"
cp -a "${stage_one}"/${libdir}/libdl.so.2* lib/x86_64-linux-gnu/.
cp -a "${stage_one}"/${libdir}/libz.so.1* lib/x86_64-linux-gnu/.
cp -a "${stage_one}"/${libdir}/libpthread.so.0* lib/x86_64-linux-gnu/.
cp -a "${stage_one}"/${libdir}/libresolv.so.2* lib/x86_64-linux-gnu/.
cp -a "${stage_one}"/${libdir}/libc.so.6* lib/x86_64-linux-gnu/.
cp -a "${stage_one}"/${libdir}/libm.so.6* lib/x86_64-linux-gnu/.
cp -a "${stage_one}"/${libdir}/librt.so.1* lib/x86_64-linux-gnu/.
libdir="lib64"
cp -aL "${stage_one}"/${libdir}/ld-linux-x86-64.so.* lib64/.
libdir="usr/lib/x86_64-linux-gnu"
cp -a "${stage_one}"/${libdir}/libnvidia-ml.so.* lib/x86_64-linux-gnu/.
cp -a "${stage_one}"/${libdir}/libcuda.so.* lib/x86_64-linux-gnu/.
cp -a "${stage_one}"/${libdir}/libnvidia-cfg.so.* lib/x86_64-linux-gnu/.
# basic GPU admin tools
cp -a "${stage_one}"/usr/bin/nvidia-persistenced bin/.
cp -a "${stage_one}"/usr/bin/nvidia-smi bin/.
cp -a "${stage_one}"/usr/bin/nvidia-ctk bin/.
cp -a "${stage_one}"/usr/bin/nvidia-cdi-hook bin/.
ln -s ../bin usr/bin
}
chisseled_gpudirect() {
echo "nvidia: chisseling GPUDirect"
echo "nvidia: not implemented yet"
exit 1
}
chisseled_init() {
echo "nvidia: chisseling init"
tar xvf "${BUILD_DIR}"/kata-static-busybox.tar.xz -C .
mkdir -p dev etc proc run/cdi sys tmp usr var lib/modules lib/firmware \
usr/share/nvidia lib/x86_64-linux-gnu lib64
ln -sf ../run var/run
tar xvf "${BUILD_DIR}"/kata-static-nvidia-nvrc.tar.zst -C .
ln -sf /bin/NVRC init
cp -a "${stage_one}"/sbin/init sbin/.
cp -a "${stage_one}"/etc/kata-opa etc/.
cp -a "${stage_one}"/etc/resolv.conf etc/.
cp -a "${stage_one}"/supported-gpu.devids .
cp -a "${stage_one}"/lib/firmware/nvidia lib/firmware/.
cp -a "${stage_one}"/sbin/ldconfig.real sbin/ldconfig
}
compress_rootfs() {
echo "nvidia: compressing rootfs"
# For some non-obvious reason libc has the executable bit set.
# Clear it, otherwise the find -executable below will not work correctly.
find . -type f -name "*.so.*" | while IFS= read -r file; do
chmod -x "${file}"
strip "${file}"
done
find . -type f -executable | while IFS= read -r file; do
strip "${file}"
${BUILD_DIR}/upx-4.2.4-amd64_linux/upx --best --lzma "${file}"
done
# The compression pass strips the executable flag from
# /lib64/ld-linux-x86-64.so.2. Since this is the program interpreter,
# it must stay executable, so restore the bit.
chmod +x lib64/ld-linux-x86-64.so.2
}
toggle_debug() {
if echo "$NVIDIA_GPU_STACK" | grep -q '\<debug\>'; then
export DEBUG="true"
fi
}
setup_nvidia_gpu_rootfs_stage_two() {
readonly stage_one="${BUILD_DIR:?}/rootfs-${VARIANT}-stage-one"
readonly stage_two="${ROOTFS_DIR:?}"
readonly stack="${NVIDIA_GPU_STACK:?}"
echo "nvidia: chisseling the following stack components: $stack"
[ -e "${stage_one}" ] && rm -rf "${stage_one}"
[ ! -e "${stage_one}" ] && mkdir -p "${stage_one}"
tar -C "${stage_one}" -xf ${BUILD_DIR}/kata-static-rootfs-nvidia-gpu-stage-one.tar.zst
pushd "${stage_two}" >> /dev/null
toggle_debug
chisseled_init
chisseled_iptables
IFS=',' read -r -a stack_components <<< "$NVIDIA_GPU_STACK"
for component in "${stack_components[@]}"; do
if [ "$component" = "compute" ]; then
echo "nvidia: processing \"compute\" component"
chisseled_compute
elif [ "$component" = "dcgm" ]; then
echo "nvidia: processing DCGM component"
chisseled_dcgm
elif [ "$component" = "nvswitch" ]; then
echo "nvidia: processing NVSwitch component"
chisseled_nvswitch
elif [ "$component" = "gpudirect" ]; then
echo "nvidia: processing GPUDirect component"
chisseled_gpudirect
fi
done
compress_rootfs
chroot . ldconfig
popd >> /dev/null
}


@ -43,6 +43,10 @@ if [[ "${AGENT_POLICY}" == "yes" ]]; then
agent_policy_file="$(readlink -f -v "${AGENT_POLICY_FILE:-"${script_dir}/../../../src/kata-opa/allow-all.rego"}")"
fi
NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-""}
nvidia_rootfs="${script_dir}/nvidia/nvidia_rootfs.sh"
source "$nvidia_rootfs"
#For cross build
CROSS_BUILD=${CROSS_BUILD:-false}
BUILDX=""
@ -516,6 +520,7 @@ build_rootfs_distro()
--env EXTRA_PKGS="${EXTRA_PKGS}" \
--env OSBUILDER_VERSION="${OSBUILDER_VERSION}" \
--env OS_VERSION="${OS_VERSION}" \
--env VARIANT="${VARIANT}" \
--env INSIDE_CONTAINER=1 \
--env SECCOMP="${SECCOMP}" \
--env SELINUX="${SELINUX}" \
@ -525,6 +530,7 @@ build_rootfs_distro()
--env HOME="/root" \
--env AGENT_POLICY="${AGENT_POLICY}" \
--env CONFIDENTIAL_GUEST="${CONFIDENTIAL_GUEST}" \
--env NVIDIA_GPU_STACK="${NVIDIA_GPU_STACK}" \
-v "${repo_dir}":"/kata-containers" \
-v "${ROOTFS_DIR}":"/rootfs" \
-v "${script_dir}/../scripts":"/scripts" \
@ -819,6 +825,18 @@ main()
init="${ROOTFS_DIR}/sbin/init"
setup_rootfs
if [ "${VARIANT}" = "nvidia-gpu" ]; then
setup_nvidia_gpu_rootfs_stage_one
setup_nvidia_gpu_rootfs_stage_two
return $?
fi
if [ "${VARIANT}" = "nvidia-gpu-confidential" ]; then
setup_nvidia_gpu_rootfs_stage_one "confidential"
setup_nvidia_gpu_rootfs_stage_two "confidential"
return $?
fi
}
main $*


@ -80,8 +80,9 @@ agent-tarball: copy-scripts-for-the-agent-build
agent-ctl-tarball: copy-scripts-for-the-tools-build
${MAKE} $@-build
BUSYBOX_CONF_FILE ?= busybox.nvidia.conf
busybox-tarball:
${MAKE} $@-build
${MAKE} BUSYBOX_CONF_FILE=${BUSYBOX_CONF_FILE} $@-build
coco-guest-components-tarball:
${MAKE} $@-build
@ -163,6 +164,17 @@ rootfs-initrd-tarball: agent-tarball
runk-tarball: copy-scripts-for-the-tools-build
${MAKE} $@-build
rootfs-nvidia-gpu-image-tarball: agent-tarball busybox-tarball
${MAKE} $@-build
rootfs-nvidia-gpu-initrd-tarball: agent-tarball busybox-tarball
${MAKE} $@-build
rootfs-nvidia-gpu-confidential-image-tarball: agent-tarball busybox-tarball pause-image-tarball coco-guest-components-tarball kernel-nvidia-gpu-confidential-tarball
${MAKE} $@-build
rootfs-nvidia-gpu-confidential-initrd-tarball: agent-tarball busybox-tarball pause-image-tarball coco-guest-components-tarball kernel-nvidia-gpu-confidential-tarball
${MAKE} $@-build
shim-v2-tarball:
${MAKE} $@-build


@ -57,6 +57,9 @@ RUN apt-get update && \
cpio \
gcc \
unzip \
git \
make \
wget \
xz-utils && \
if [ "${ARCH}" != "$(uname -m)" ] && [ "${ARCH}" == "s390x" ]; then \
apt-get install -y --no-install-recommends \


@ -102,6 +102,7 @@ MEASURED_ROOTFS="${MEASURED_ROOTFS:-}"
PULL_TYPE="${PULL_TYPE:-default}"
USE_CACHE="${USE_CACHE:-}"
BUSYBOX_CONF_FILE=${BUSYBOX_CONF_FILE:-}
NVIDIA_GPU_STACK="${NVIDIA_GPU_STACK:-}"
docker run \
-v $HOME/.docker:/root/.docker \
@ -131,6 +132,7 @@ docker run \
--env PULL_TYPE="${PULL_TYPE}" \
--env USE_CACHE="${USE_CACHE}" \
--env BUSYBOX_CONF_FILE="${BUSYBOX_CONF_FILE}" \
--env NVIDIA_GPU_STACK="${NVIDIA_GPU_STACK}" \
--env AA_KBC="${AA_KBC:-}" \
--env HKD_PATH="$(realpath "${HKD_PATH:-}" 2> /dev/null || true)" \
--env SE_KERNEL_PARAMS="${SE_KERNEL_PARAMS:-}" \


@ -448,7 +448,7 @@ install_initrd() {
os_name="$(get_from_kata_deps ".assets.initrd.architecture.${ARCH}.${variant}.name")"
os_version="$(get_from_kata_deps ".assets.initrd.architecture.${ARCH}.${variant}.version")"
if [ "${variant}" == "confidential" ]; then
if [[ "${variant}" == *-confidential ]]; then
export COCO_GUEST_COMPONENTS_TARBALL="$(get_coco_guest_components_tarball_path)"
export PAUSE_IMAGE_TARBALL="$(get_pause_image_tarball_path)"
fi
@ -470,35 +470,57 @@ install_initrd_confidential() {
install_initrd "confidential"
}
#Instal NVIDIA GPU image
# For all nvidia_gpu targets we can customize the stack that is enabled
# in the VM by setting the NVIDIA_GPU_STACK= environment variable
#
# latest | lts -> use the latest and greatest driver or lts release
# debug -> enable debugging support
# compute -> enable the compute GPU stack, includes utility
# graphics -> enable the graphics GPU stack, includes compute
# dcgm -> enable the DCGM stack + DCGM exporter
# nvswitch -> enable DGX-like systems
# gpudirect -> enable use-cases like GPUDirect RDMA, GPUDirect GDS
# dragonball -> enable dragonball support
#
# The full stack can be enabled by setting all the options like:
#
# NVIDIA_GPU_STACK="latest,compute,dcgm,nvswitch,gpudirect"
#
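#
# A hypothetical invocation via the local-build Makefile targets added in
# this change (the stack value below is illustrative):
#
#   NVIDIA_GPU_STACK="lts,compute,dcgm" make rootfs-nvidia-gpu-initrd-tarball
#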
# Install NVIDIA GPU image
install_image_nvidia_gpu() {
export AGENT_POLICY="yes"
export AGENT_INIT="yes"
export EXTRA_PKGS="apt udev"
export EXTRA_PKGS="apt"
NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"latest,compute,dcgm"}
install_image "nvidia-gpu"
}
#Install NVIDIA GPU initrd
# Install NVIDIA GPU initrd
install_initrd_nvidia_gpu() {
export AGENT_POLICY="yes"
export AGENT_INIT="yes"
export EXTRA_PKGS="apt udev"
export EXTRA_PKGS="apt"
NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"latest,compute,dcgm"}
install_initrd "nvidia-gpu"
}
#Instal NVIDIA GPU confidential image
# Install NVIDIA GPU confidential image
install_image_nvidia_gpu_confidential() {
export AGENT_POLICY="yes"
export AGENT_INIT="yes"
export EXTRA_PKGS="apt udev"
export EXTRA_PKGS="apt"
# TODO: export MEASURED_ROOTFS=yes
NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"latest,compute"}
install_image "nvidia-gpu-confidential"
}
#Install NVIDIA GPU confidential initrd
# Install NVIDIA GPU confidential initrd
install_initrd_nvidia_gpu_confidential() {
export AGENT_POLICY="yes"
export AGENT_INIT="yes"
export EXTRA_PKGS="apt udev"
export EXTRA_PKGS="apt"
# TODO: export MEASURED_ROOTFS=yes
NVIDIA_GPU_STACK=${NVIDIA_GPU_STACK:-"latest,compute"}
install_initrd "nvidia-gpu-confidential"
}
@ -1122,6 +1144,7 @@ handle_build() {
kernel-confidential) install_kernel_confidential ;;
kernel-dragonball-experimental) install_kernel_dragonball_experimental ;;
kernel-nvidia-gpu-dragonball-experimental) install_kernel_nvidia_gpu_dragonball_experimental ;;
kernel-nvidia-gpu) install_kernel_nvidia_gpu ;;