mirror of
https://github.com/kata-containers/kata-containers.git
synced 2026-01-25 06:26:41 +00:00
gpu: introduce devkit build flag
Introduce a new devkit parameter which will produce a rootfs without chisselling. This results in a larger rootfs with various packages and binaries being included, for instance, enabling the use of the debug console. Signed-off-by: Manuel Huber <manuelh@nvidia.com>
This commit is contained in:
committed by
Fabiano Fidêncio
parent
2c9e0f9f4f
commit
3966864376
@@ -4,7 +4,6 @@
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
[[ -n "${DEBUG}" ]] && set -x
|
||||
|
||||
@@ -164,6 +163,10 @@ build_nvidia_drivers() {
|
||||
# went wrong, make sure the signing_key.pem is removed
|
||||
[[ -e "${signing_key}" ]] && rm -f "${signing_key}"
|
||||
done
|
||||
|
||||
# Save the modules for later so that a linux-image purge does not remove them
|
||||
tar cvfa /lib/modules.save_from_purge.tar.zst /lib/modules
|
||||
|
||||
popd >> /dev/null
|
||||
}
|
||||
|
||||
@@ -373,6 +376,65 @@ install_nvidia_dcgm() {
|
||||
datacenter-gpu-manager-exporter
|
||||
}
|
||||
|
||||
#######################################
# Strip build-time packages and caches from the NVIDIA GPU rootfs.
#
# Runs inside the chroot. Packages that must survive the purge are put on
# hold first, then kernel header/image packages, the build toolchain and the
# driver build packages are purged. Finally the apt/log caches are removed,
# the ld cache is regenerated and the kernel modules saved in
# /lib/modules.save_from_purge.tar.zst are unpacked again.
#
# Globals:   driver_version (read), driver_type (read)
# Arguments: none
# Outputs:   progress message and tool output on stdout/stderr
# Returns:   non-zero (script abort under `set -e`) if any step fails
#######################################
cleanup_rootfs() {
	echo "chroot: Cleanup NVIDIA GPU rootfs"

	local kernel_headers linux_images pkg
	local modules_version running_kernel

	# Held packages are protected from the purge/autoremove steps below
	apt-mark hold libstdc++6 libzstd1 libgnutls30t64 pciutils

	if [[ -n "${driver_version}" ]]; then
		apt-mark hold libnvidia-cfg1-"${driver_version}"-server \
			nvidia-utils-"${driver_version}"-server \
			nvidia-kernel-common-"${driver_version}"-server \
			nvidia-imex-"${driver_version}" \
			nvidia-compute-utils-"${driver_version}"-server \
			libnvidia-compute-"${driver_version}"-server \
			libnvidia-gl-"${driver_version}"-server \
			libnvidia-extra-"${driver_version}"-server \
			libnvidia-decode-"${driver_version}"-server \
			libnvidia-fbc1-"${driver_version}"-server \
			libnvidia-encode-"${driver_version}"-server \
			libnvidia-nscq-"${driver_version}" \
			linuxptp libnftnl11
	fi

	# grep exits 1 when nothing matches, which would abort the script under
	# `set -o pipefail`. "Nothing to purge" is fine, so accept exit status 1
	# (and only 1) from grep; dpkg/cut failures still propagate.
	kernel_headers=$(dpkg --get-selections | cut -f1 \
		| { grep linux-headers || [[ $? -eq 1 ]]; })
	linux_images=$(dpkg --get-selections | cut -f1 \
		| { grep linux-image || [[ $? -eq 1 ]]; })

	# Word splitting is intended: one package name per line, no whitespace
	for pkg in ${kernel_headers} ${linux_images}; do
		apt purge -yqq "${pkg}"
	done

	apt purge -yqq jq make gcc xz-utils linux-libc-dev

	if [[ -n "${driver_version}" ]]; then
		apt purge -yqq nvidia-headless-no-dkms-"${driver_version}"-server"${driver_type}" \
			nvidia-kernel-source-"${driver_version}"-server"${driver_type}"
	fi

	apt autoremove -yqq

	apt clean
	apt autoclean

	running_kernel=$(uname -r)
	for modules_version in /lib/modules/*; do
		# -n: if the link already exists and points to a directory, replace
		# it instead of creating a new link inside that directory
		ln -sfn "${modules_version}" /lib/modules/"${running_kernel}"
		touch "${modules_version}"/modules.order
		touch "${modules_version}"/modules.builtin
		depmod -a
	done

	rm -rf /var/lib/apt/lists/* /var/cache/apt/* /var/log/apt /var/cache/debconf
	rm -f /etc/apt/sources.list
	rm -f /usr/bin/nvidia-ngx-updater /usr/bin/nvidia-container-runtime
	rm -f /var/log/{nvidia-installer.log,dpkg.log,alternatives.log}

	# Clear and regenerate the ld cache
	rm -f /etc/ld.so.cache
	ldconfig

	# Bring back the modules that were archived before the kernel packages
	# were purged, then drop the archive
	tar xvf /lib/modules.save_from_purge.tar.zst -C /
	rm -f /lib/modules.save_from_purge.tar.zst
}
|
||||
|
||||
# Start of script
|
||||
echo "chroot: Setup NVIDIA GPU rootfs stage one"
|
||||
|
||||
@@ -387,3 +449,4 @@ install_nvidia_fabricmanager
|
||||
install_nvidia_ctk
|
||||
export_driver_version
|
||||
install_nvidia_dcgm
|
||||
cleanup_rootfs
|
||||
|
||||
@@ -207,7 +207,7 @@ chisseled_compute() {
|
||||
cp -a "${stage_one}/${libdir}"/libcuda.so.* lib/"${machine_arch}"-linux-gnu/.
|
||||
cp -a "${stage_one}/${libdir}"/libnvidia-cfg.so.* lib/"${machine_arch}"-linux-gnu/.
|
||||
|
||||
# basich GPU admin tools
|
||||
# basic GPU admin tools
|
||||
cp -a "${stage_one}"/usr/bin/nvidia-persistenced bin/.
|
||||
cp -a "${stage_one}"/usr/bin/nvidia-smi bin/.
|
||||
cp -a "${stage_one}"/usr/bin/nvidia-ctk bin/.
|
||||
@@ -221,6 +221,17 @@ chisseled_gpudirect() {
|
||||
exit 1
|
||||
}
|
||||
|
||||
#######################################
# Make NVRC the init process of the rootfs (initrd and image case).
#
# Creates the symlinks "init" and "sbin/init" relative to the current
# directory, both pointing at /bin/NVRC[-<type>]-<arch>-unknown-linux-musl.
#
# Globals:   machine_arch (read)
# Arguments: $1 - optional rootfs type; when non-empty the binary name
#                 becomes "NVRC-<type>", otherwise plain "NVRC"
# Returns:   non-zero if a link cannot be created
#######################################
setup_nvrc_init_symlinks() {
	local rootfs_type=${1:-""}

	local bin="NVRC${rootfs_type:+-${rootfs_type}}"
	local target="${machine_arch}-unknown-linux-musl"
	local init_path="/bin/${bin}-${target}"
	local link

	# make sure NVRC is the init process for the initrd and image case
	for link in init sbin/init; do
		# -n: an existing link that points to a directory is replaced
		# rather than followed (which would create the new link inside
		# that directory and leave the old init in place)
		ln -sfn "${init_path}" "${link}"
	done
}
|
||||
|
||||
chisseled_init() {
|
||||
local rootfs_type=${1:-""}
|
||||
|
||||
@@ -244,9 +255,7 @@ chisseled_init() {
|
||||
cp -a "${stage_one}/bin/${bin}-${target}".cert bin/.
|
||||
cp -a "${stage_one}/bin/${bin}-${target}".sig bin/.
|
||||
|
||||
# make sure NVRC is the init process for the initrd and image case
|
||||
ln -sf /bin/"${bin}-${target}" init
|
||||
ln -sf /bin/"${bin}-${target}" sbin/init
|
||||
setup_nvrc_init_symlinks "${rootfs_type}"
|
||||
|
||||
cp -a "${stage_one}"/usr/bin/kata-agent usr/bin/.
|
||||
if [[ "${AGENT_POLICY}" == "yes" ]]; then
|
||||
@@ -270,11 +279,24 @@ compress_rootfs() {
|
||||
# For some unobvious reason libc has executable bit set
|
||||
# clean this up otherwise the find -executable will not work correctly
|
||||
find . -type f -name "*.so.*" | while IFS= read -r file; do
|
||||
if ! file "${file}" | grep -q ELF; then
|
||||
echo "nvidia: skip stripping file: ${file} ($(file -b "${file}"))"
|
||||
continue
|
||||
fi
|
||||
chmod -x "${file}"
|
||||
strip "${file}"
|
||||
done
|
||||
|
||||
find . -type f -executable | while IFS= read -r file; do
|
||||
# Skip files with setuid/setgid bits (UPX refuses to pack them)
|
||||
if [ -u "${file}" ] || [ -g "${file}" ]; then
|
||||
echo "nvidia: skip compressing executable (special permissions): ${file} ($(file -b "${file}"))"
|
||||
continue
|
||||
fi
|
||||
if ! file "${file}" | grep -q ELF; then
|
||||
echo "nvidia: skip compressing executable (not ELF): ${file} ($(file -b "${file}"))"
|
||||
continue
|
||||
fi
|
||||
strip "${file}"
|
||||
"${BUILD_DIR}"/upx-4.2.4-"${distro_arch}"_linux/upx --best --lzma "${file}"
|
||||
done
|
||||
@@ -287,7 +309,6 @@ compress_rootfs() {
|
||||
[[ ${machine_arch} == "x86_64" ]] && libdir="lib64"
|
||||
|
||||
chmod +x "${libdir}"/ld-linux-*
|
||||
|
||||
}
|
||||
|
||||
coco_guest_components() {
|
||||
@@ -315,56 +336,57 @@ coco_guest_components() {
|
||||
info "TODO: nvidia: luks-encrypt-storage is a bash script, we do not have a shell!"
|
||||
}
|
||||
|
||||
#######################################
# Export DEBUG="true" when the GPU stack selection contains "debug".
#
# Globals:   NVIDIA_GPU_STACK (read) - matched for "debug" as a whole word
#            DEBUG (exported as "true" on a match, otherwise untouched)
# Arguments: none
# Returns:   0 whether or not "debug" was requested
#######################################
toggle_debug() {
	# Nothing to do unless "debug" shows up as a whole word in the stack
	grep -q '\<debug\>' <<< "${NVIDIA_GPU_STACK}" || return 0

	export DEBUG="true"
}
|
||||
|
||||
setup_nvidia_gpu_rootfs_stage_two() {
|
||||
readonly stage_two="${ROOTFS_DIR:?}"
|
||||
readonly stack="${NVIDIA_GPU_STACK:?}"
|
||||
|
||||
readonly type=${1:-""}
|
||||
|
||||
echo "nvidia: chisseling the following stack components: ${stack}"
|
||||
# If devkit flag is set, skip chisseling, use stage_one
|
||||
if echo "${stack}" | grep -q '\<devkit\>'; then
|
||||
echo "nvidia: devkit mode enabled - skip chisseling"
|
||||
|
||||
tar -C "${stage_two}" -xf "${stage_one}".tar.zst
|
||||
|
||||
[[ -e "${stage_one}" ]] && rm -rf "${stage_one}"
|
||||
[[ ! -e "${stage_one}" ]] && mkdir -p "${stage_one}"
|
||||
pushd "${stage_two}" >> /dev/null
|
||||
|
||||
tar -C "${stage_one}" -xf "${stage_one}".tar.zst
|
||||
# Only step needed from stage_two (see chisseled_init)
|
||||
setup_nvrc_init_symlinks "${type}"
|
||||
else
|
||||
echo "nvidia: chisseling the following stack components: ${stack}"
|
||||
|
||||
[[ -e "${stage_one}" ]] && rm -rf "${stage_one}"
|
||||
[[ ! -e "${stage_one}" ]] && mkdir -p "${stage_one}"
|
||||
|
||||
pushd "${stage_two}" >> /dev/null
|
||||
tar -C "${stage_one}" -xf "${stage_one}".tar.zst
|
||||
|
||||
toggle_debug
|
||||
chisseled_init "${type}"
|
||||
chisseled_iptables
|
||||
pushd "${stage_two}" >> /dev/null
|
||||
|
||||
IFS=',' read -r -a stack_components <<< "${NVIDIA_GPU_STACK}"
|
||||
chisseled_init "${type}"
|
||||
chisseled_iptables
|
||||
|
||||
for component in "${stack_components[@]}"; do
|
||||
if [[ "${component}" = "compute" ]]; then
|
||||
echo "nvidia: processing \"compute\" component"
|
||||
chisseled_compute
|
||||
elif [[ "${component}" = "dcgm" ]]; then
|
||||
echo "nvidia: processing DCGM component"
|
||||
chisseled_dcgm
|
||||
elif [[ "${component}" = "nvswitch" ]]; then
|
||||
echo "nvidia: processing NVSwitch component"
|
||||
chisseled_nvswitch
|
||||
elif [[ "${component}" = "gpudirect" ]]; then
|
||||
echo "nvidia: processing GPUDirect component"
|
||||
chisseled_gpudirect
|
||||
fi
|
||||
done
|
||||
IFS=',' read -r -a stack_components <<< "${NVIDIA_GPU_STACK}"
|
||||
|
||||
coco_guest_components
|
||||
for component in "${stack_components[@]}"; do
|
||||
if [[ "${component}" = "compute" ]]; then
|
||||
echo "nvidia: processing \"compute\" component"
|
||||
chisseled_compute
|
||||
elif [[ "${component}" = "dcgm" ]]; then
|
||||
echo "nvidia: processing DCGM component"
|
||||
chisseled_dcgm
|
||||
elif [[ "${component}" = "nvswitch" ]]; then
|
||||
echo "nvidia: processing NVSwitch component"
|
||||
chisseled_nvswitch
|
||||
elif [[ "${component}" = "gpudirect" ]]; then
|
||||
echo "nvidia: processing GPUDirect component"
|
||||
chisseled_gpudirect
|
||||
fi
|
||||
done
|
||||
|
||||
coco_guest_components
|
||||
fi
|
||||
|
||||
compress_rootfs
|
||||
|
||||
chroot . ldconfig
|
||||
|
||||
popd >> /dev/null
|
||||
popd >> /dev/null
|
||||
}
|
||||
|
||||
@@ -51,6 +51,7 @@ RUN apt-get update && \
|
||||
pip \
|
||||
python3-dev \
|
||||
libclang-dev \
|
||||
file \
|
||||
zstd && \
|
||||
apt-get clean && rm -rf /var/lib/apt/lists/&& \
|
||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain ${RUST_TOOLCHAIN}
|
||||
|
||||
@@ -570,13 +570,16 @@ install_initrd_confidential() {
|
||||
# -> use the latest and greatest driver,
|
||||
# lts release or e.g. version=550.127.1
|
||||
# driver -> enable open or closed drivers
|
||||
# debug -> enable debugging support
|
||||
# compute -> enable the compute GPU stack, includes utility
|
||||
# graphics -> enable the graphics GPU stack, includes compute
|
||||
# dcgm -> enable the DCGM stack + DCGM exporter
|
||||
# nvswitch -> enable DGX like systems
|
||||
# gpudirect -> enable use-cases like GPUDirect RDMA, GPUDirect GDS
|
||||
# dragonball -> enable dragonball support
|
||||
# devkit -> builds a developer kit image, resulting in a larger
|
||||
# rootfs size. May require incrementing the
|
||||
# default_memory allocation and with this, potentially
|
||||
# podOverhead. Experimental. Not for use in production
|
||||
#
|
||||
# The full stack can be enabled by setting all the options like:
|
||||
#
|
||||
|
||||
Reference in New Issue
Block a user