From e1a951afe5ccb3bfd2f37bf1cffe601dd8c37bc4 Mon Sep 17 00:00:00 2001 From: David Porter Date: Thu, 28 Oct 2021 17:49:50 -0700 Subject: [PATCH] Fix COS GPU driver installation * Rely on the built-in GPU driver installer in COS as recommended in public docs - https://cloud.google.com/container-optimized-os/docs/how-to/run-gpus * Run `nvidia-smi` after installation to verify installation --- test/e2e_node/jenkins/gci-init-gpu.yaml | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/test/e2e_node/jenkins/gci-init-gpu.yaml b/test/e2e_node/jenkins/gci-init-gpu.yaml index c8553395b62..6817a3bcd23 100644 --- a/test/e2e_node/jenkins/gci-init-gpu.yaml +++ b/test/e2e_node/jenkins/gci-init-gpu.yaml @@ -2,15 +2,12 @@ runcmd: - modprobe configs - # Setup the installation target at make it executable - - mkdir -p /home/kubernetes/bin/nvidia - - mount --bind /home/kubernetes/bin/nvidia /home/kubernetes/bin/nvidia - - mount -o remount,exec /home/kubernetes/bin/nvidia - # Compile and install the nvidia driver (precompiled driver installation currently fails) - - docker run --net=host --pid=host -v /dev:/dev -v /:/root -v /home/kubernetes/bin/nvidia:/usr/local/nvidia -e NVIDIA_INSTALL_DIR_HOST=/home/kubernetes/bin/nvidia -e NVIDIA_INSTALL_DIR_CONTAINER=/usr/local/nvidia -e NVIDIA_DRIVER_VERSION=460.91.03 --privileged gcr.io/cos-cloud/cos-gpu-installer:latest - # Run the installer again, as on the first try it doesn't detect the libnvidia-ml.so - # on the second attempt we detect it and update the ld cache. 
- - docker run --net=host --pid=host -v /dev:/dev -v /:/root -v /home/kubernetes/bin/nvidia:/usr/local/nvidia -e NVIDIA_INSTALL_DIR_HOST=/home/kubernetes/bin/nvidia -e NVIDIA_INSTALL_DIR_CONTAINER=/usr/local/nvidia -e NVIDIA_DRIVER_VERSION=460.91.03 --privileged gcr.io/cos-cloud/cos-gpu-installer:latest + # Install GPU drivers - https://cloud.google.com/container-optimized-os/docs/how-to/run-gpus + - cos-extensions install gpu + - mount --bind /var/lib/nvidia /var/lib/nvidia + - mount -o remount,exec /var/lib/nvidia /var/lib/nvidia + # Run nvidia-smi to verify installation + - /var/lib/nvidia/bin/nvidia-smi # Remove build containers. They're very large. - docker rm -f $(docker ps -aq) # Standard installation proceeds