diff --git a/cluster/gce/util.sh b/cluster/gce/util.sh index d1afca171b6..80cf1b2748e 100755 --- a/cluster/gce/util.sh +++ b/cluster/gce/util.sh @@ -767,7 +767,7 @@ function construct-linux-kubelet-flags { # Keep in sync with the mkdir command in configure-helper.sh (until the TODO is resolved) flags+=" --cert-dir=/var/lib/kubelet/pki/" - # If ENABLE_AUTH_PROVIDER_GCP is set to true, kubelet is enabled to use out-of-tree auth + # If ENABLE_AUTH_PROVIDER_GCP is set to true, kubelet is enabled to use out-of-tree auth # credential provider instead of in-tree auth credential provider. # https://kubernetes.io/docs/tasks/kubelet-credential-provider/kubelet-credential-provider if [[ "${ENABLE_AUTH_PROVIDER_GCP:-true}" == "true" ]]; then diff --git a/test/e2e/testing-manifests/scheduling/nvidia-driver-installer.yaml b/test/e2e/testing-manifests/scheduling/nvidia-driver-installer.yaml index f8d3a723ec4..c23670087c3 100644 --- a/test/e2e/testing-manifests/scheduling/nvidia-driver-installer.yaml +++ b/test/e2e/testing-manifests/scheduling/nvidia-driver-installer.yaml @@ -1,5 +1,5 @@ # This DaemonSet was originally referenced from -# https://github.com/GoogleCloudPlatform/container-engine-accelerators/blob/master/daemonset.yaml +# https://github.com/GoogleCloudPlatform/container-engine-accelerators/blob/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml # The Dockerfile and other source for this daemonset are in # https://github.com/GoogleCloudPlatform/cos-gpu-installer @@ -47,17 +47,43 @@ spec: - name: root-mount hostPath: path: / + - name: cos-tools + hostPath: + path: /var/lib/cos-tools + - name: nvidia-config + hostPath: + path: /etc/nvidia initContainers: + - image: "ubuntu@sha256:3f85b7caad41a95462cf5b787d8a04604c8262cdcdf9a472b8c52ef83375fe15" + name: bind-mount-install-dir + securityContext: + privileged: true + command: + - nsenter + - -at + - '1' + - -- + - sh + - -c + - | + if mountpoint -q /var/lib/nvidia; then + echo "The mountpoint /var/lib/nvidia exists." + else + echo "The mountpoint /var/lib/nvidia does not exist. Creating directories /home/kubernetes/bin/nvidia and /var/lib/nvidia and bind mount." + mkdir -p /var/lib/nvidia /home/kubernetes/bin/nvidia + mount --bind /home/kubernetes/bin/nvidia /var/lib/nvidia + echo "Done creating bind mounts" + fi # The COS GPU installer image version may be dependent on the version of COS being used. # Refer to details about the installer in https://cos.googlesource.com/cos/tools/+/refs/heads/master/src/cmd/cos_gpu_installer/ # and the COS release notes (https://cloud.google.com/container-optimized-os/docs/release-notes) to determine version COS GPU installer for a given version of COS. # Maps to gcr.io/cos-cloud/cos-gpu-installer:v2.1.10 - suitable for COS M109 as per https://cloud.google.com/container-optimized-os/docs/release-notes - - image: gcr.io/cos-cloud/cos-gpu-installer:v2.1.10 + - image: "gcr.io/cos-cloud/cos-gpu-installer:v2.1.10" name: nvidia-driver-installer resources: requests: - cpu: 0.15 + cpu: 150m securityContext: privileged: true env: @@ -71,6 +97,10 @@ spec: value: /etc/vulkan/icd.d - name: ROOT_MOUNT_DIR value: /root + - name: COS_TOOLS_DIR_HOST + value: /var/lib/cos-tools + - name: COS_TOOLS_DIR_CONTAINER + value: /build/cos-tools volumeMounts: - name: nvidia-install-dir-host mountPath: /usr/local/nvidia @@ -80,6 +110,37 @@ spec: mountPath: /dev - name: root-mount mountPath: /root + - name: cos-tools + mountPath: /build/cos-tools + command: + - bash + - -c + - | + echo "Checking for existing GPU driver modules" + if lsmod | grep nvidia; then + echo "GPU driver is already installed, the installed version may or may not be the driver version being tried to install, skipping installation" + exit 0 + else + echo "No GPU driver module detected, installing now" + /cos-gpu-installer install + fi + - image: "gcr.io/gke-release/nvidia-partition-gpu@sha256:e226275da6c45816959fe43cde907ee9a85c6a2aa8a429418a4cadef8ecdb86a" + name: partition-gpus + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + resources: + requests: + cpu: 150m + securityContext: + privileged: true + volumeMounts: + - name: nvidia-install-dir-host + mountPath: /usr/local/nvidia + - name: dev + mountPath: /dev + - name: nvidia-config + mountPath: /etc/nvidia containers: - image: "registry.k8s.io/pause:3.10" name: pause