mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-27 05:27:21 +00:00
Extend test/e2e/scheduling/nvidia-gpus.go to include a device plugin based nvidia gpu e2e test.
This commit is contained in:
parent
775f5d232d
commit
01b49b4165
@ -42,8 +42,14 @@ const (
|
|||||||
cosOSImage = "Container-Optimized OS from Google"
|
cosOSImage = "Container-Optimized OS from Google"
|
||||||
// Nvidia driver installation can take upwards of 5 minutes.
|
// Nvidia driver installation can take upwards of 5 minutes.
|
||||||
driverInstallTimeout = 10 * time.Minute
|
driverInstallTimeout = 10 * time.Minute
|
||||||
// Nvidia COS driver installer daemonset.
|
)
|
||||||
cosNvidiaDriverInstallerUrl = "https://raw.githubusercontent.com/ContainerEngine/accelerators/stable/cos-nvidia-gpu-installer/daemonset.yaml"
|
|
||||||
|
type podCreationFuncType func() *v1.Pod
|
||||||
|
|
||||||
|
var (
|
||||||
|
gpuResourceName v1.ResourceName
|
||||||
|
dsYamlUrl string
|
||||||
|
podCreationFunc podCreationFuncType
|
||||||
)
|
)
|
||||||
|
|
||||||
func makeCudaAdditionTestPod() *v1.Pod {
|
func makeCudaAdditionTestPod() *v1.Pod {
|
||||||
@ -60,7 +66,7 @@ func makeCudaAdditionTestPod() *v1.Pod {
|
|||||||
Image: imageutils.GetE2EImage(imageutils.CudaVectorAdd),
|
Image: imageutils.GetE2EImage(imageutils.CudaVectorAdd),
|
||||||
Resources: v1.ResourceRequirements{
|
Resources: v1.ResourceRequirements{
|
||||||
Limits: v1.ResourceList{
|
Limits: v1.ResourceList{
|
||||||
v1.ResourceNvidiaGPU: *resource.NewQuantity(1, resource.DecimalSI),
|
gpuResourceName: *resource.NewQuantity(1, resource.DecimalSI),
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
VolumeMounts: []v1.VolumeMount{
|
VolumeMounts: []v1.VolumeMount{
|
||||||
@ -86,6 +92,30 @@ func makeCudaAdditionTestPod() *v1.Pod {
|
|||||||
return testPod
|
return testPod
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func makeCudaAdditionDevicePluginTestPod() *v1.Pod {
|
||||||
|
podName := testPodNamePrefix + string(uuid.NewUUID())
|
||||||
|
testPod := &v1.Pod{
|
||||||
|
ObjectMeta: metav1.ObjectMeta{
|
||||||
|
Name: podName,
|
||||||
|
},
|
||||||
|
Spec: v1.PodSpec{
|
||||||
|
RestartPolicy: v1.RestartPolicyNever,
|
||||||
|
Containers: []v1.Container{
|
||||||
|
{
|
||||||
|
Name: "vector-addition",
|
||||||
|
Image: imageutils.GetE2EImage(imageutils.CudaVectorAdd),
|
||||||
|
Resources: v1.ResourceRequirements{
|
||||||
|
Limits: v1.ResourceList{
|
||||||
|
gpuResourceName: *resource.NewQuantity(1, resource.DecimalSI),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return testPod
|
||||||
|
}
|
||||||
|
|
||||||
func isClusterRunningCOS(f *framework.Framework) bool {
|
func isClusterRunningCOS(f *framework.Framework) bool {
|
||||||
nodeList, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{})
|
nodeList, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{})
|
||||||
framework.ExpectNoError(err, "getting node list")
|
framework.ExpectNoError(err, "getting node list")
|
||||||
@ -105,7 +135,8 @@ func areGPUsAvailableOnAllSchedulableNodes(f *framework.Framework) bool {
|
|||||||
if node.Spec.Unschedulable {
|
if node.Spec.Unschedulable {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if node.Status.Capacity.NvidiaGPU().Value() == 0 {
|
framework.Logf("gpuResourceName %s", gpuResourceName)
|
||||||
|
if val, ok := node.Status.Capacity[gpuResourceName]; !ok || val.Value() == 0 {
|
||||||
framework.Logf("Nvidia GPUs not available on Node: %q", node.Name)
|
framework.Logf("Nvidia GPUs not available on Node: %q", node.Name)
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
@ -119,7 +150,9 @@ func getGPUsAvailable(f *framework.Framework) int64 {
|
|||||||
framework.ExpectNoError(err, "getting node list")
|
framework.ExpectNoError(err, "getting node list")
|
||||||
var gpusAvailable int64
|
var gpusAvailable int64
|
||||||
for _, node := range nodeList.Items {
|
for _, node := range nodeList.Items {
|
||||||
gpusAvailable += node.Status.Capacity.NvidiaGPU().Value()
|
if val, ok := node.Status.Capacity[gpuResourceName]; ok {
|
||||||
|
gpusAvailable += (&val).Value()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return gpusAvailable
|
return gpusAvailable
|
||||||
}
|
}
|
||||||
@ -133,10 +166,21 @@ func testNvidiaGPUsOnCOS(f *framework.Framework) {
|
|||||||
Skip("Nvidia GPU tests are supproted only on Container Optimized OS image currently")
|
Skip("Nvidia GPU tests are supproted only on Container Optimized OS image currently")
|
||||||
}
|
}
|
||||||
framework.Logf("Cluster is running on COS. Proceeding with test")
|
framework.Logf("Cluster is running on COS. Proceeding with test")
|
||||||
|
|
||||||
|
if f.BaseName == "device-plugin-gpus" {
|
||||||
|
dsYamlUrl = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/device-plugin-daemonset.yaml"
|
||||||
|
gpuResourceName = "nvidia.com/gpu"
|
||||||
|
podCreationFunc = makeCudaAdditionDevicePluginTestPod
|
||||||
|
} else {
|
||||||
|
dsYamlUrl = "https://raw.githubusercontent.com/ContainerEngine/accelerators/master/cos-nvidia-gpu-installer/daemonset.yaml"
|
||||||
|
gpuResourceName = v1.ResourceNvidiaGPU
|
||||||
|
podCreationFunc = makeCudaAdditionTestPod
|
||||||
|
}
|
||||||
|
|
||||||
// GPU drivers might have already been installed.
|
// GPU drivers might have already been installed.
|
||||||
if !areGPUsAvailableOnAllSchedulableNodes(f) {
|
if !areGPUsAvailableOnAllSchedulableNodes(f) {
|
||||||
// Install Nvidia Drivers.
|
// Install Nvidia Drivers.
|
||||||
ds := dsFromManifest(cosNvidiaDriverInstallerUrl)
|
ds := dsFromManifest(dsYamlUrl)
|
||||||
ds.Namespace = f.Namespace.Name
|
ds.Namespace = f.Namespace.Name
|
||||||
_, err := f.ClientSet.Extensions().DaemonSets(f.Namespace.Name).Create(ds)
|
_, err := f.ClientSet.Extensions().DaemonSets(f.Namespace.Name).Create(ds)
|
||||||
framework.ExpectNoError(err, "failed to create daemonset")
|
framework.ExpectNoError(err, "failed to create daemonset")
|
||||||
@ -149,7 +193,7 @@ func testNvidiaGPUsOnCOS(f *framework.Framework) {
|
|||||||
framework.Logf("Creating as many pods as there are Nvidia GPUs and have the pods run a CUDA app")
|
framework.Logf("Creating as many pods as there are Nvidia GPUs and have the pods run a CUDA app")
|
||||||
podList := []*v1.Pod{}
|
podList := []*v1.Pod{}
|
||||||
for i := int64(0); i < getGPUsAvailable(f); i++ {
|
for i := int64(0); i < getGPUsAvailable(f); i++ {
|
||||||
podList = append(podList, f.PodClient().Create(makeCudaAdditionTestPod()))
|
podList = append(podList, f.PodClient().Create(podCreationFunc()))
|
||||||
}
|
}
|
||||||
framework.Logf("Wait for all test pods to succeed")
|
framework.Logf("Wait for all test pods to succeed")
|
||||||
// Wait for all pods to succeed
|
// Wait for all pods to succeed
|
||||||
@ -192,3 +236,10 @@ var _ = SIGDescribe("[Feature:GPU]", func() {
|
|||||||
testNvidiaGPUsOnCOS(f)
|
testNvidiaGPUsOnCOS(f)
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
|
var _ = SIGDescribe("[Feature:GPUDevicePlugin]", func() {
|
||||||
|
f := framework.NewDefaultFramework("device-plugin-gpus")
|
||||||
|
It("run Nvidia GPU Device Plugin tests on Container Optimized OS only", func() {
|
||||||
|
testNvidiaGPUsOnCOS(f)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
Loading…
Reference in New Issue
Block a user