mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-23 11:50:44 +00:00
Use local daemonset manifest for installing Nvidia drivers
Updates sig-scheduling e2e Nvidia GPU tests to install drivers using a local manifest by default. Currently the DaemonSet is fetched from the GoogleCloudPlatform/container-engine-accelerators repo by default. Using a local manifest allows for manually specifying the cos-gpu-installer image rather than always using latest. A remote manifest can still be fetched by setting the NVIDIA_DRIVER_INSTALLER_DAEMONSET env var. Signed-off-by: hasheddan <georgedanielmangum@gmail.com>
This commit is contained in:
parent
3a0b683c01
commit
e990698d5f
@ -55,6 +55,7 @@ go_library(
|
|||||||
"//test/e2e/framework/resource:go_default_library",
|
"//test/e2e/framework/resource:go_default_library",
|
||||||
"//test/e2e/framework/service:go_default_library",
|
"//test/e2e/framework/service:go_default_library",
|
||||||
"//test/e2e/framework/skipper:go_default_library",
|
"//test/e2e/framework/skipper:go_default_library",
|
||||||
|
"//test/e2e/framework/testfiles:go_default_library",
|
||||||
"//test/utils:go_default_library",
|
"//test/utils:go_default_library",
|
||||||
"//test/utils/image:go_default_library",
|
"//test/utils/image:go_default_library",
|
||||||
"//vendor/github.com/onsi/ginkgo:go_default_library",
|
"//vendor/github.com/onsi/ginkgo:go_default_library",
|
||||||
|
@ -22,6 +22,7 @@ import (
|
|||||||
"regexp"
|
"regexp"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
appsv1 "k8s.io/api/apps/v1"
|
||||||
v1 "k8s.io/api/core/v1"
|
v1 "k8s.io/api/core/v1"
|
||||||
"k8s.io/apimachinery/pkg/api/resource"
|
"k8s.io/apimachinery/pkg/api/resource"
|
||||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||||
@ -36,6 +37,7 @@ import (
|
|||||||
"k8s.io/kubernetes/test/e2e/framework/providers/gce"
|
"k8s.io/kubernetes/test/e2e/framework/providers/gce"
|
||||||
e2eresource "k8s.io/kubernetes/test/e2e/framework/resource"
|
e2eresource "k8s.io/kubernetes/test/e2e/framework/resource"
|
||||||
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
|
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
|
||||||
|
e2etestfiles "k8s.io/kubernetes/test/e2e/framework/testfiles"
|
||||||
imageutils "k8s.io/kubernetes/test/utils/image"
|
imageutils "k8s.io/kubernetes/test/utils/image"
|
||||||
|
|
||||||
"github.com/onsi/ginkgo"
|
"github.com/onsi/ginkgo"
|
||||||
@ -50,7 +52,6 @@ const (
|
|||||||
|
|
||||||
var (
|
var (
|
||||||
gpuResourceName v1.ResourceName
|
gpuResourceName v1.ResourceName
|
||||||
dsYamlURL string
|
|
||||||
)
|
)
|
||||||
|
|
||||||
func makeCudaAdditionDevicePluginTestPod() *v1.Pod {
|
func makeCudaAdditionDevicePluginTestPod() *v1.Pod {
|
||||||
@ -128,18 +129,23 @@ func getGPUsAvailable(f *framework.Framework) int64 {
|
|||||||
func SetupNVIDIAGPUNode(f *framework.Framework, setupResourceGatherer bool) *framework.ContainerResourceGatherer {
|
func SetupNVIDIAGPUNode(f *framework.Framework, setupResourceGatherer bool) *framework.ContainerResourceGatherer {
|
||||||
logOSImages(f)
|
logOSImages(f)
|
||||||
|
|
||||||
|
var err error
|
||||||
|
var ds *appsv1.DaemonSet
|
||||||
dsYamlURLFromEnv := os.Getenv("NVIDIA_DRIVER_INSTALLER_DAEMONSET")
|
dsYamlURLFromEnv := os.Getenv("NVIDIA_DRIVER_INSTALLER_DAEMONSET")
|
||||||
if dsYamlURLFromEnv != "" {
|
if dsYamlURLFromEnv != "" {
|
||||||
dsYamlURL = dsYamlURLFromEnv
|
// Using DaemonSet from remote URL
|
||||||
|
framework.Logf("Using remote nvidia-driver-installer daemonset manifest from %v", dsYamlURLFromEnv)
|
||||||
|
ds, err = e2emanifest.DaemonSetFromURL(dsYamlURLFromEnv)
|
||||||
|
framework.ExpectNoError(err, "failed get remote")
|
||||||
} else {
|
} else {
|
||||||
dsYamlURL = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/daemonset.yaml"
|
// Using default local DaemonSet
|
||||||
|
framework.Logf("Using default local nvidia-driver-installer daemonset manifest.")
|
||||||
|
data, err := e2etestfiles.Read("test/e2e/testing-manifests/scheduling/nvidia-driver-installer.yaml")
|
||||||
|
framework.ExpectNoError(err, "failed to read local manifest for nvidia-driver-installer daemonset")
|
||||||
|
ds, err = e2emanifest.DaemonSetFromData(data)
|
||||||
|
framework.ExpectNoError(err, "failed to parse local manifest for nvidia-driver-installer daemonset")
|
||||||
}
|
}
|
||||||
gpuResourceName = e2egpu.NVIDIAGPUResourceName
|
gpuResourceName = e2egpu.NVIDIAGPUResourceName
|
||||||
|
|
||||||
framework.Logf("Using %v", dsYamlURL)
|
|
||||||
// Creates the DaemonSet that installs Nvidia Drivers.
|
|
||||||
ds, err := e2emanifest.DaemonSetFromURL(dsYamlURL)
|
|
||||||
framework.ExpectNoError(err)
|
|
||||||
ds.Namespace = f.Namespace.Name
|
ds.Namespace = f.Namespace.Name
|
||||||
_, err = f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Create(context.TODO(), ds, metav1.CreateOptions{})
|
_, err = f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Create(context.TODO(), ds, metav1.CreateOptions{})
|
||||||
framework.ExpectNoError(err, "failed to create nvidia-driver-installer daemonset")
|
framework.ExpectNoError(err, "failed to create nvidia-driver-installer daemonset")
|
||||||
|
@ -0,0 +1,80 @@
|
|||||||
|
# This DaemonSet was originally referenced from
|
||||||
|
# https://github.com/GoogleCloudPlatform/container-engine-accelerators/blob/master/daemonset.yaml
|
||||||
|
|
||||||
|
# The Dockerfile and other source for this daemonset are in
|
||||||
|
# https://github.com/GoogleCloudPlatform/cos-gpu-installer
|
||||||
|
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: DaemonSet
|
||||||
|
metadata:
|
||||||
|
name: nvidia-driver-installer
|
||||||
|
namespace: kube-system
|
||||||
|
labels:
|
||||||
|
k8s-app: nvidia-driver-installer
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
k8s-app: nvidia-driver-installer
|
||||||
|
updateStrategy:
|
||||||
|
type: RollingUpdate
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
name: nvidia-driver-installer
|
||||||
|
k8s-app: nvidia-driver-installer
|
||||||
|
spec:
|
||||||
|
affinity:
|
||||||
|
nodeAffinity:
|
||||||
|
requiredDuringSchedulingIgnoredDuringExecution:
|
||||||
|
nodeSelectorTerms:
|
||||||
|
- matchExpressions:
|
||||||
|
- key: cloud.google.com/gke-accelerator
|
||||||
|
operator: Exists
|
||||||
|
tolerations:
|
||||||
|
- operator: "Exists"
|
||||||
|
hostNetwork: true
|
||||||
|
hostPID: true
|
||||||
|
volumes:
|
||||||
|
- name: dev
|
||||||
|
hostPath:
|
||||||
|
path: /dev
|
||||||
|
- name: vulkan-icd-mount
|
||||||
|
hostPath:
|
||||||
|
path: /home/kubernetes/bin/nvidia/vulkan/icd.d
|
||||||
|
- name: nvidia-install-dir-host
|
||||||
|
hostPath:
|
||||||
|
path: /home/kubernetes/bin/nvidia
|
||||||
|
- name: root-mount
|
||||||
|
hostPath:
|
||||||
|
path: /
|
||||||
|
initContainers:
|
||||||
|
- image: gcr.io/cos-cloud/cos-gpu-installer:v20200701
|
||||||
|
name: nvidia-driver-installer
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 0.15
|
||||||
|
securityContext:
|
||||||
|
privileged: true
|
||||||
|
env:
|
||||||
|
- name: NVIDIA_INSTALL_DIR_HOST
|
||||||
|
value: /home/kubernetes/bin/nvidia
|
||||||
|
- name: NVIDIA_INSTALL_DIR_CONTAINER
|
||||||
|
value: /usr/local/nvidia
|
||||||
|
- name: VULKAN_ICD_DIR_HOST
|
||||||
|
value: /home/kubernetes/bin/nvidia/vulkan/icd.d
|
||||||
|
- name: VULKAN_ICD_DIR_CONTAINER
|
||||||
|
value: /etc/vulkan/icd.d
|
||||||
|
- name: ROOT_MOUNT_DIR
|
||||||
|
value: /root
|
||||||
|
volumeMounts:
|
||||||
|
- name: nvidia-install-dir-host
|
||||||
|
mountPath: /usr/local/nvidia
|
||||||
|
- name: vulkan-icd-mount
|
||||||
|
mountPath: /etc/vulkan/icd.d
|
||||||
|
- name: dev
|
||||||
|
mountPath: /dev
|
||||||
|
- name: root-mount
|
||||||
|
mountPath: /root
|
||||||
|
containers:
|
||||||
|
- image: "gcr.io/google-containers/pause:3.2"
|
||||||
|
name: pause
|
Loading…
Reference in New Issue
Block a user