From a26f4d855d5f4fdd391068f30720c7413e4f6e28 Mon Sep 17 00:00:00 2001
From: Swati Sehgal
Date: Wed, 26 Apr 2023 16:59:57 +0100
Subject: [PATCH] node: device-plugin: e2e: Capture pod admission failure

This test captures the scenario where, after a kubelet restart, an
application pod comes up while the device plugin pod hasn't re-registered
itself, and the pod fails with an admission error. It is worth noting that
once the device plugin pod has registered itself, another application pod
requesting devices ends up running successfully.

For the test case where the kubelet is restarted and the device plugin
re-registers without involving a pod restart, the pod created after the
kubelet restart ends up with an admission error, so we cannot be certain
which device the second pod (pod2) would get. As long as it gets a device,
we consider the test to pass.

Signed-off-by: Swati Sehgal
---
 test/e2e_node/device_plugin_test.go | 48 +++++++++++++++++------------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/test/e2e_node/device_plugin_test.go b/test/e2e_node/device_plugin_test.go
index 5ac7b2c1192..a436fa45f43 100644
--- a/test/e2e_node/device_plugin_test.go
+++ b/test/e2e_node/device_plugin_test.go
@@ -313,8 +313,12 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
 					CountSampleDeviceAllocatable(node) == expectedSampleDevsAmount
 			}, 30*time.Second, framework.Poll).Should(gomega.BeTrue())

-			err = e2epod.WaitTimeoutForPodRunningInNamespace(ctx, f.ClientSet, pod1.Name, f.Namespace.Name, 1*time.Minute)
-			framework.ExpectNoError(err)
+			ginkgo.By("Waiting for the pod to fail with admission error as device plugin hasn't re-registered yet")
+			gomega.Eventually(ctx, getPod).
+				WithArguments(f, pod1.Name).
+				WithTimeout(time.Minute).
+				Should(HaveFailedWithAdmissionError(),
+					"the pod succeeded to start, when it should fail with the admission error")

 			// crosscheck from the device assignment is preserved and stable from perspective of the kubelet.
 			// note we don't check again the logs of the container: the check is done at startup, the container
@@ -351,6 +355,26 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
 			ginkgo.By("Wait for node to be ready again")
 			e2enode.WaitForAllNodesSchedulable(ctx, f.ClientSet, 5*time.Minute)

+			ginkgo.By("Waiting for the pod to fail with admission error as device plugin hasn't re-registered yet")
+			gomega.Eventually(ctx, getPod).
+				WithArguments(f, pod1.Name).
+				WithTimeout(time.Minute).
+				Should(HaveFailedWithAdmissionError(),
+					"the pod succeeded to start, when it should fail with the admission error")
+
+			// crosscheck from the device assignment is preserved and stable from perspective of the kubelet.
+			// note we don't check again the logs of the container: the check is done at startup, the container
+			// never restarted (runs "forever" from this test timescale perspective) hence re-doing this check
+			// is useless.
+ ginkgo.By("Verifying the device assignment after kubelet restart using podresources API") + gomega.Eventually(ctx, func() error { + v1PodResources, err = getV1NodeDevices(ctx) + return err + }, 30*time.Second, framework.Poll).ShouldNot(gomega.HaveOccurred(), "cannot fetch the compute resource assignment after kubelet restart") + + err = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1}) + framework.ExpectNoError(err, "inconsistent device assignment after pod restart") + ginkgo.By("Re-Register resources by deleting the plugin pod") gp := int64(0) deleteOptions := metav1.DeleteOptions{ @@ -370,36 +394,20 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { CountSampleDeviceAllocatable(node) == expectedSampleDevsAmount }, 30*time.Second, framework.Poll).Should(gomega.BeTrue()) - // crosscheck from the device assignment is preserved and stable from perspective of the kubelet. - // note we don't check again the logs of the container: the check is done at startup, the container - // never restarted (runs "forever" from this test timescale perspective) hence re-doing this check - // is useless. - ginkgo.By("Verifying the device assignment after kubelet and device plugin restart using podresources API") - gomega.Eventually(ctx, func() error { - v1PodResources, err = getV1NodeDevices(ctx) - return err - }, 30*time.Second, framework.Poll).ShouldNot(gomega.HaveOccurred(), "cannot fetch the compute resource assignment after kubelet and device plugin restart") - - err = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1}) - framework.ExpectNoError(err, "inconsistent device assignment after pod restart") - ginkgo.By("Creating another pod") pod2 := e2epod.NewPodClient(f).CreateSync(ctx, makeBusyboxPod(SampleDeviceResourceName, podRECMD)) - ginkgo.By("Checking that pod got a different fake device") + ginkgo.By("Checking that pod got a fake device") devID2, err := parseLog(ctx, f, pod2.Name, pod2.Name, deviceIDRE) framework.ExpectNoError(err, "getting logs for pod %q", pod2.Name) - gomega.Expect(devID1).To(gomega.Not(gomega.Equal(devID2)), "pod2 requested a device but started successfully without") - ginkgo.By("Verifying the device assignment after kubelet restart and device plugin re-registration using podresources API") // note we don't use eventually: the kubelet is supposed to be running and stable by now, so the call should just succeed v1PodResources, err = getV1NodeDevices(ctx) if err != nil { framework.ExpectNoError(err, "getting pod resources assignment after pod restart") } - err = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1}) - framework.ExpectNoError(err, "inconsistent device assignment after extra container restart - pod1") + err = checkPodResourcesAssignment(v1PodResources, pod2.Namespace, pod2.Name, pod2.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID2}) framework.ExpectNoError(err, "inconsistent device assignment after extra container restart - pod2") })