mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-23 11:50:44 +00:00
Use local daemonset manifest for installing Nvidia drivers
Updates sig-scheduling e2e Nvidia GPU tests to install drivers using a local manifest by default. Currently the DaemonSet is fetched from the GoogleCloudPlatform/container-engine-accelerators repo by default. Using a local manifest allows for manually specifying the cos-gpu-installer image rather than always using latest. A remote manifest can still be fetched by setting the NVIDIA_DRIVER_INSTALLER_DAEMONSET env var. Signed-off-by: hasheddan <georgedanielmangum@gmail.com>
This commit is contained in:
parent
3a0b683c01
commit
e990698d5f
@ -55,6 +55,7 @@ go_library(
|
||||
"//test/e2e/framework/resource:go_default_library",
|
||||
"//test/e2e/framework/service:go_default_library",
|
||||
"//test/e2e/framework/skipper:go_default_library",
|
||||
"//test/e2e/framework/testfiles:go_default_library",
|
||||
"//test/utils:go_default_library",
|
||||
"//test/utils/image:go_default_library",
|
||||
"//vendor/github.com/onsi/ginkgo:go_default_library",
|
||||
|
@ -22,6 +22,7 @@ import (
|
||||
"regexp"
|
||||
"time"
|
||||
|
||||
appsv1 "k8s.io/api/apps/v1"
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
@ -36,6 +37,7 @@ import (
|
||||
"k8s.io/kubernetes/test/e2e/framework/providers/gce"
|
||||
e2eresource "k8s.io/kubernetes/test/e2e/framework/resource"
|
||||
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
|
||||
e2etestfiles "k8s.io/kubernetes/test/e2e/framework/testfiles"
|
||||
imageutils "k8s.io/kubernetes/test/utils/image"
|
||||
|
||||
"github.com/onsi/ginkgo"
|
||||
@ -50,7 +52,6 @@ const (
|
||||
|
||||
var (
|
||||
gpuResourceName v1.ResourceName
|
||||
dsYamlURL string
|
||||
)
|
||||
|
||||
func makeCudaAdditionDevicePluginTestPod() *v1.Pod {
|
||||
@ -128,18 +129,23 @@ func getGPUsAvailable(f *framework.Framework) int64 {
|
||||
func SetupNVIDIAGPUNode(f *framework.Framework, setupResourceGatherer bool) *framework.ContainerResourceGatherer {
|
||||
logOSImages(f)
|
||||
|
||||
var err error
|
||||
var ds *appsv1.DaemonSet
|
||||
dsYamlURLFromEnv := os.Getenv("NVIDIA_DRIVER_INSTALLER_DAEMONSET")
|
||||
if dsYamlURLFromEnv != "" {
|
||||
dsYamlURL = dsYamlURLFromEnv
|
||||
// Using DaemonSet from remote URL
|
||||
framework.Logf("Using remote nvidia-driver-installer daemonset manifest from %v", dsYamlURLFromEnv)
|
||||
ds, err = e2emanifest.DaemonSetFromURL(dsYamlURLFromEnv)
|
||||
framework.ExpectNoError(err, "failed get remote")
|
||||
} else {
|
||||
dsYamlURL = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/daemonset.yaml"
|
||||
// Using default local DaemonSet
|
||||
framework.Logf("Using default local nvidia-driver-installer daemonset manifest.")
|
||||
data, err := e2etestfiles.Read("test/e2e/testing-manifests/scheduling/nvidia-driver-installer.yaml")
|
||||
framework.ExpectNoError(err, "failed to read local manifest for nvidia-driver-installer daemonset")
|
||||
ds, err = e2emanifest.DaemonSetFromData(data)
|
||||
framework.ExpectNoError(err, "failed to parse local manifest for nvidia-driver-installer daemonset")
|
||||
}
|
||||
gpuResourceName = e2egpu.NVIDIAGPUResourceName
|
||||
|
||||
framework.Logf("Using %v", dsYamlURL)
|
||||
// Creates the DaemonSet that installs Nvidia Drivers.
|
||||
ds, err := e2emanifest.DaemonSetFromURL(dsYamlURL)
|
||||
framework.ExpectNoError(err)
|
||||
ds.Namespace = f.Namespace.Name
|
||||
_, err = f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Create(context.TODO(), ds, metav1.CreateOptions{})
|
||||
framework.ExpectNoError(err, "failed to create nvidia-driver-installer daemonset")
|
||||
|
@ -0,0 +1,80 @@
|
||||
# This DaemonSet was originally referenced from
|
||||
# https://github.com/GoogleCloudPlatform/container-engine-accelerators/blob/master/daemonset.yaml
|
||||
|
||||
# The Dockerfile and other source for this daemonset are in
|
||||
# https://github.com/GoogleCloudPlatform/cos-gpu-installer
|
||||
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: nvidia-driver-installer
|
||||
namespace: kube-system
|
||||
labels:
|
||||
k8s-app: nvidia-driver-installer
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
k8s-app: nvidia-driver-installer
|
||||
updateStrategy:
|
||||
type: RollingUpdate
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
name: nvidia-driver-installer
|
||||
k8s-app: nvidia-driver-installer
|
||||
spec:
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: cloud.google.com/gke-accelerator
|
||||
operator: Exists
|
||||
tolerations:
|
||||
- operator: "Exists"
|
||||
hostNetwork: true
|
||||
hostPID: true
|
||||
volumes:
|
||||
- name: dev
|
||||
hostPath:
|
||||
path: /dev
|
||||
- name: vulkan-icd-mount
|
||||
hostPath:
|
||||
path: /home/kubernetes/bin/nvidia/vulkan/icd.d
|
||||
- name: nvidia-install-dir-host
|
||||
hostPath:
|
||||
path: /home/kubernetes/bin/nvidia
|
||||
- name: root-mount
|
||||
hostPath:
|
||||
path: /
|
||||
initContainers:
|
||||
- image: gcr.io/cos-cloud/cos-gpu-installer:v20200701
|
||||
name: nvidia-driver-installer
|
||||
resources:
|
||||
requests:
|
||||
cpu: 0.15
|
||||
securityContext:
|
||||
privileged: true
|
||||
env:
|
||||
- name: NVIDIA_INSTALL_DIR_HOST
|
||||
value: /home/kubernetes/bin/nvidia
|
||||
- name: NVIDIA_INSTALL_DIR_CONTAINER
|
||||
value: /usr/local/nvidia
|
||||
- name: VULKAN_ICD_DIR_HOST
|
||||
value: /home/kubernetes/bin/nvidia/vulkan/icd.d
|
||||
- name: VULKAN_ICD_DIR_CONTAINER
|
||||
value: /etc/vulkan/icd.d
|
||||
- name: ROOT_MOUNT_DIR
|
||||
value: /root
|
||||
volumeMounts:
|
||||
- name: nvidia-install-dir-host
|
||||
mountPath: /usr/local/nvidia
|
||||
- name: vulkan-icd-mount
|
||||
mountPath: /etc/vulkan/icd.d
|
||||
- name: dev
|
||||
mountPath: /dev
|
||||
- name: root-mount
|
||||
mountPath: /root
|
||||
containers:
|
||||
- image: "gcr.io/google-containers/pause:3.2"
|
||||
name: pause
|
Loading…
Reference in New Issue
Block a user