Merge pull request #127509 from dims/test-more-gpu-stuff

Test MOAR GPU stuff (add the cuda demo suite!)
2025-09-14 05:36:12 +00:00 · 2024-09-20 23:53:58 +01:00
parent 7a58803c84 e516e003c5
commit 52095a8b7b
1 changed files with 17 additions and 3 deletions
--- a/test/e2e/node/gpu.go
+++ b/test/e2e/node/gpu.go
@@ -61,10 +61,9 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Sanity tes
 		podClient = e2epod.NewPodClient(f)
 	})

-	f.It("should run nvidia-smi cli", func(ctx context.Context) {
+	f.It("should run nvidia-smi and cuda-demo-suite", func(ctx context.Context) {
 		SetupEnvironmentAndSkipIfNeeded(ctx, f, f.ClientSet)
 		pod := testNvidiaCLIPod()
-		pod.Spec.Containers[0].Command = []string{"nvidia-smi"}

 		ginkgo.By("Creating a pod that runs nvidia-smi")
 		createAndValidatePod(ctx, f, podClient, pod)
@@ -103,6 +102,8 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Test using
 		framework.ExpectNoError(err)

 		ginkgo.By("Checking output from nvidia-smi")
+		framework.Logf("Got container logs for %s:\n%v", pod.Spec.Containers[0].Name, log)
+
 		gomega.Expect(log).To(gomega.ContainSubstring("TensorFlow version"))
 		gomega.Expect(log).To(gomega.ContainSubstring("Matrix multiplication result:"))
 		gomega.Expect(log).To(gomega.ContainSubstring("Time taken for 5000x5000 matrix multiplication"))
@@ -178,7 +179,20 @@ func testNvidiaCLIPod() *v1.Pod {
 			Containers: []v1.Container{
 				{
 					Name:  "nvidia-smi",
-					Image: "nvidia/cuda:12.3.2-runtime-ubuntu22.04",
+					Image: "nvidia/cuda:12.5.0-devel-ubuntu22.04",
+					Command: []string{
+						"bash",
+						"-c",
+						`
+nvidia-smi
+apt-get update -y && \
+	DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated cuda-demo-suite-12-5
+/usr/local/cuda/extras/demo_suite/deviceQuery
+/usr/local/cuda/extras/demo_suite/vectorAdd
+/usr/local/cuda/extras/demo_suite/bandwidthTest --device=all --csv
+/usr/local/cuda/extras/demo_suite/busGrind -a
+`,
+					},
 					Resources: v1.ResourceRequirements{
 						Limits: v1.ResourceList{
 							"nvidia.com/gpu": resource.MustParse("1"),