Merge pull request #93207 from hasheddan/nvidia-gpu-installer

Use local daemonset manifest for installing Nvidia drivers
This commit is contained in:
Kubernetes Prow Robot 2020-07-20 09:02:51 -07:00 committed by GitHub
commit 5feab0aa1e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 95 additions and 8 deletions

View File

@ -55,6 +55,7 @@ go_library(
"//test/e2e/framework/resource:go_default_library",
"//test/e2e/framework/service:go_default_library",
"//test/e2e/framework/skipper:go_default_library",
"//test/e2e/framework/testfiles:go_default_library",
"//test/utils:go_default_library",
"//test/utils/image:go_default_library",
"//vendor/github.com/onsi/ginkgo:go_default_library",

View File

@ -22,6 +22,7 @@ import (
"regexp"
"time"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@ -36,6 +37,7 @@ import (
"k8s.io/kubernetes/test/e2e/framework/providers/gce"
e2eresource "k8s.io/kubernetes/test/e2e/framework/resource"
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
e2etestfiles "k8s.io/kubernetes/test/e2e/framework/testfiles"
imageutils "k8s.io/kubernetes/test/utils/image"
"github.com/onsi/ginkgo"
@ -50,7 +52,6 @@ const (
// Package-level state shared by the NVIDIA GPU e2e helpers in this file.
var (
// gpuResourceName is the resource name used for NVIDIA GPUs; it is assigned
// from e2egpu.NVIDIAGPUResourceName inside SetupNVIDIAGPUNode.
gpuResourceName v1.ResourceName
// dsYamlURL is the URL of the driver-installer DaemonSet manifest. It is
// assigned in SetupNVIDIAGPUNode, either from the
// NVIDIA_DRIVER_INSTALLER_DAEMONSET environment variable or from a
// hard-coded default.
dsYamlURL string
)
func makeCudaAdditionDevicePluginTestPod() *v1.Pod {
@ -128,18 +129,23 @@ func getGPUsAvailable(f *framework.Framework) int64 {
func SetupNVIDIAGPUNode(f *framework.Framework, setupResourceGatherer bool) *framework.ContainerResourceGatherer {
logOSImages(f)
var err error
var ds *appsv1.DaemonSet
dsYamlURLFromEnv := os.Getenv("NVIDIA_DRIVER_INSTALLER_DAEMONSET")
if dsYamlURLFromEnv != "" {
dsYamlURL = dsYamlURLFromEnv
// Using DaemonSet from remote URL
framework.Logf("Using remote nvidia-driver-installer daemonset manifest from %v", dsYamlURLFromEnv)
ds, err = e2emanifest.DaemonSetFromURL(dsYamlURLFromEnv)
framework.ExpectNoError(err, "failed get remote")
} else {
dsYamlURL = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/daemonset.yaml"
// Using default local DaemonSet
framework.Logf("Using default local nvidia-driver-installer daemonset manifest.")
data, err := e2etestfiles.Read("test/e2e/testing-manifests/scheduling/nvidia-driver-installer.yaml")
framework.ExpectNoError(err, "failed to read local manifest for nvidia-driver-installer daemonset")
ds, err = e2emanifest.DaemonSetFromData(data)
framework.ExpectNoError(err, "failed to parse local manifest for nvidia-driver-installer daemonset")
}
gpuResourceName = e2egpu.NVIDIAGPUResourceName
framework.Logf("Using %v", dsYamlURL)
// Creates the DaemonSet that installs Nvidia Drivers.
ds, err := e2emanifest.DaemonSetFromURL(dsYamlURL)
framework.ExpectNoError(err)
ds.Namespace = f.Namespace.Name
_, err = f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Create(context.TODO(), ds, metav1.CreateOptions{})
framework.ExpectNoError(err, "failed to create nvidia-driver-installer daemonset")

View File

@ -0,0 +1,80 @@
# This DaemonSet was originally referenced from
# https://github.com/GoogleCloudPlatform/container-engine-accelerators/blob/master/daemonset.yaml
# The Dockerfile and other source for this daemonset are in
# https://github.com/GoogleCloudPlatform/cos-gpu-installer
#
# Summary: runs a privileged driver-installer init container on every node that
# carries the cloud.google.com/gke-accelerator label, then leaves a pause
# container running in the pod.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-driver-installer
  namespace: kube-system
  labels:
    k8s-app: nvidia-driver-installer
spec:
  # The selector must match the pod template labels below.
  selector:
    matchLabels:
      k8s-app: nvidia-driver-installer
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-driver-installer
        k8s-app: nvidia-driver-installer
    spec:
      # Only schedule onto nodes that have the accelerator label (any value).
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: cloud.google.com/gke-accelerator
                operator: Exists
      # A toleration with operator Exists and no key tolerates every taint.
      tolerations:
      - operator: "Exists"
      # The pod shares the host's network and PID namespaces.
      hostNetwork: true
      hostPID: true
      # Host directories exposed to the installer container.
      volumes:
      - name: dev
        hostPath:
          path: /dev
      - name: vulkan-icd-mount
        hostPath:
          path: /home/kubernetes/bin/nvidia/vulkan/icd.d
      - name: nvidia-install-dir-host
        hostPath:
          path: /home/kubernetes/bin/nvidia
      - name: root-mount
        hostPath:
          path: /
      # The installer runs as an init container, so it must finish before the
      # regular container below is started.
      initContainers:
      - image: gcr.io/cos-cloud/cos-gpu-installer:v20200701
        name: nvidia-driver-installer
        resources:
          requests:
            cpu: 0.15
        securityContext:
          privileged: true
        # Each *_HOST / *_CONTAINER pair mirrors a hostPath volume above and
        # the mountPath it is given in volumeMounts below.
        env:
        - name: NVIDIA_INSTALL_DIR_HOST
          value: /home/kubernetes/bin/nvidia
        - name: NVIDIA_INSTALL_DIR_CONTAINER
          value: /usr/local/nvidia
        - name: VULKAN_ICD_DIR_HOST
          value: /home/kubernetes/bin/nvidia/vulkan/icd.d
        - name: VULKAN_ICD_DIR_CONTAINER
          value: /etc/vulkan/icd.d
        - name: ROOT_MOUNT_DIR
          value: /root
        volumeMounts:
        - name: nvidia-install-dir-host
          mountPath: /usr/local/nvidia
        - name: vulkan-icd-mount
          mountPath: /etc/vulkan/icd.d
        - name: dev
          mountPath: /dev
        - name: root-mount
          mountPath: /root
      # The only regular container is a pause image; presumably it just keeps
      # the pod alive once the installer has completed (confirm against the
      # upstream cos-gpu-installer documentation).
      containers:
      - image: "gcr.io/google-containers/pause:3.2"
        name: pause