From 349c7136c9ffe2f0889526c8ae8c0e634127c82d Mon Sep 17 00:00:00 2001
From: Davanum Srinivas
Date: Tue, 24 Sep 2024 09:11:18 -0400
Subject: [PATCH 1/2] Wait for GPUs even for AWS kubetest2 ec2 harness

Signed-off-by: Davanum Srinivas
---
 test/e2e/node/gpu.go | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/test/e2e/node/gpu.go b/test/e2e/node/gpu.go
index 3e0a0376ab3..ca84cd1008b 100644
--- a/test/e2e/node/gpu.go
+++ b/test/e2e/node/gpu.go
@@ -263,7 +263,11 @@ print(f"Time taken for {n}x{n} matrix multiplication: {end_time - start_time:.2f
 func SetupEnvironmentAndSkipIfNeeded(ctx context.Context, f *framework.Framework, clientSet clientset.Interface) {
 	if framework.ProviderIs("gce") {
 		SetupNVIDIAGPUNode(ctx, f)
+	} else if framework.ProviderIs("aws") {
+		// see nvidia-device-plugin.yml in https://github.com/NVIDIA/k8s-device-plugin/tree/main/deployments/static
+		waitForGPUs(ctx, f, "kube-system", "nvidia-device-plugin-daemonset")
 	}
+
 	nodes, err := e2enode.GetReadySchedulableNodes(ctx, clientSet)
 	framework.ExpectNoError(err)
 	capacity := 0
@@ -281,10 +285,10 @@ func SetupEnvironmentAndSkipIfNeeded(ctx context.Context, f *framework.Framework
 		allocatable += int(val.Value())
 	}
 	if capacity == 0 {
-		e2eskipper.Skipf("%d ready nodes do not have any Nvidia GPU(s). Skipping...", len(nodes.Items))
+		framework.Failf("%d ready nodes do not have any Nvidia GPU(s). Bailing out...", len(nodes.Items))
 	}
 	if allocatable == 0 {
-		e2eskipper.Skipf("%d ready nodes do not have any allocatable Nvidia GPU(s). Skipping...", len(nodes.Items))
+		framework.Failf("%d ready nodes do not have any allocatable Nvidia GPU(s). Bailing out...", len(nodes.Items))
 	}
 }
 
@@ -351,7 +355,11 @@ func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) {
 	framework.ExpectNoError(err, "failed to create nvidia-driver-installer daemonset")
 	framework.Logf("Successfully created daemonset to install Nvidia drivers.")
 
-	pods, err := e2eresource.WaitForControlledPods(ctx, f.ClientSet, ds.Namespace, ds.Name, extensionsinternal.Kind("DaemonSet"))
+	waitForGPUs(ctx, f, ds.Namespace, ds.Name)
+}
+
+func waitForGPUs(ctx context.Context, f *framework.Framework, namespace, name string) {
+	pods, err := e2eresource.WaitForControlledPods(ctx, f.ClientSet, namespace, name, extensionsinternal.Kind("DaemonSet"))
 	framework.ExpectNoError(err, "failed to get pods controlled by the nvidia-driver-installer daemonset")
 
 	devicepluginPods, err := e2eresource.WaitForControlledPods(ctx, f.ClientSet, "kube-system", "nvidia-gpu-device-plugin", extensionsinternal.Kind("DaemonSet"))

From 472ca3b27991e165d4be9f058798d59badab1bdb Mon Sep 17 00:00:00 2001
From: Davanum Srinivas
Date: Tue, 24 Sep 2024 10:09:25 -0400
Subject: [PATCH 2/2] skip control plane nodes, they may not have GPUs

Signed-off-by: Davanum Srinivas
---
 test/e2e/node/gpu.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/e2e/node/gpu.go b/test/e2e/node/gpu.go
index ca84cd1008b..acac83ea21f 100644
--- a/test/e2e/node/gpu.go
+++ b/test/e2e/node/gpu.go
@@ -300,6 +300,9 @@ func areGPUsAvailableOnAllSchedulableNodes(ctx context.Context, clientSet client
 		if node.Spec.Unschedulable {
 			continue
 		}
+		if _, ok := node.Labels[framework.ControlPlaneLabel]; ok {
+			continue
+		}
 		framework.Logf("gpuResourceName %s", e2egpu.NVIDIAGPUResourceName)
 		if val, ok := node.Status.Capacity[e2egpu.NVIDIAGPUResourceName]; !ok || val.Value() == 0 {
 			framework.Logf("Nvidia GPUs not available on Node: %q", node.Name)
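
A minimal, self-contained sketch of the node filter that [PATCH 2/2] adds, for readers who want to exercise the logic outside the e2e framework. It assumes framework.ControlPlaneLabel resolves to the conventional "node-role.kubernetes.io/control-plane" node-role label; the real change uses the framework constant directly inside areGPUsAvailableOnAllSchedulableNodes, and the names gpuCandidates and controlPlaneLabel below are illustrative only, not part of the patch.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Assumed value of framework.ControlPlaneLabel.
const controlPlaneLabel = "node-role.kubernetes.io/control-plane"

// gpuCandidates drops unschedulable and control-plane nodes, mirroring the
// loop in areGPUsAvailableOnAllSchedulableNodes: a control plane node can be
// schedulable yet carry no GPUs, so it must not fail the GPU capacity check.
func gpuCandidates(nodes []v1.Node) []v1.Node {
	var out []v1.Node
	for _, node := range nodes {
		if node.Spec.Unschedulable {
			continue
		}
		if _, ok := node.Labels[controlPlaneLabel]; ok {
			continue // skip control plane nodes, they may not have GPUs
		}
		out = append(out, node)
	}
	return out
}

func main() {
	nodes := []v1.Node{
		{ObjectMeta: metav1.ObjectMeta{
			Name:   "control-plane-1",
			Labels: map[string]string{controlPlaneLabel: ""},
		}},
		{ObjectMeta: metav1.ObjectMeta{Name: "gpu-worker-1"}},
	}
	for _, node := range gpuCandidates(nodes) {
		fmt.Println(node.Name) // prints only gpu-worker-1
	}
}

Note that node-role labels conventionally carry an empty value, which is why the filter tests for the label's presence with the comma-ok form rather than comparing its value.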