From 29d26297a1f2acafafd050e377f6977daa2757be Mon Sep 17 00:00:00 2001
From: Francesco Romani
Date: Wed, 27 Nov 2024 17:20:16 +0100
Subject: [PATCH] e2e: node: fix misleading device plugin test

We have an e2e test which tries to ensure device plugin assignments to
pods are kept across node reboots. This test has been permafailing for
many weeks at the time of writing (xref: #128443).

Closer inspection reveals the test was well intentioned, but puzzling:
the test runs a pod, then restarts the kubelet, then _expects the pod
to end up in admission failure_ and yet _ensures the device assignment
is kept_!
https://github.com/kubernetes/kubernetes/blob/v1.32.0-rc.0/test/e2e_node/device_plugin_test.go#L97

A reader can legitimately wonder whether this means the device will be
kept busy forever. Luckily, this is not the case. The test, however,
embodied the kubelet behavior of the time, in turn caused by #103979:
the device manager used to record the last pod which attempted
admission and forcibly add it to the list of active pods. The retention
logic had space for exactly one pod, the last one which attempted
admission. This retention prevented the cleanup code (see:
https://github.com/kubernetes/kubernetes/blob/v1.32.0-rc.0/pkg/kubelet/cm/devicemanager/manager.go#L549
compare to:
https://github.com/kubernetes/kubernetes/blob/v1.31.0-rc.0/pkg/kubelet/cm/devicemanager/manager.go#L549)
from clearing the registration, so the device was still (mis)reported
as allocated to the failed pod.

This fact was in turn leveraged by the test in question: the test uses
the podresources API to learn about the device assignment, and because
of the chain of events above the pod failed admission yet was still
reported as owning the device.

What happened, however, was that the next pod attempting admission
would replace the previous pod in the device manager data, so the
previous pod was no longer forced into the active list and its
assignment was correctly cleared once the cleanup code ran. The
cleanup code runs, among other things, every time the device manager
is asked to allocate devices and every time the podresources API
queries the device assignment.

Later, in PR https://github.com/kubernetes/kubernetes/pull/120661, the
forced retention logic was removed from all the resource managers, thus
also from the device manager, and this is what caused the permafailure.

Because of all of the above, it should be evident that the e2e test was
actually enforcing a very specific and not really working-as-intended
behavior, which was also overall quite puzzling for users. The best we
can do is to fix the test to record and ensure that pods which did fail
admission _do not_ retain their device assignment.

Unfortunately, we _cannot_ guarantee the desirable property that pods
which go running retain their device assignment across node reboots. In
the kubelet restart flow, all pods race to be admitted, and no order is
enforced between device plugin pods and application pods. An
application pod keeps its assignment only if it is lucky enough to
_lose_ the race with both the device plugin (which has to go running
before the app pod does) and the kubelet (which needs to set devices
healthy before the pod tries admission).
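For illustration, here is a minimal sketch of the single-slot retention
that PR 120661 removed. Names and structure are simplified for the
sketch and are not the actual kubelet code; see the manager.go links
above for the real implementation:

    package sketch

    import (
        v1 "k8s.io/api/core/v1"
        "k8s.io/apimachinery/pkg/util/sets"
    )

    // devManager keeps one slot for the last pod which attempted admission.
    type devManager struct {
        pendingAdmissionPod *v1.Pod                     // space for exactly one pod
        allocated           map[string]sets.Set[string] // pod UID -> device IDs
    }

    // admit records the pod attempting admission, overwriting the previous
    // occupant of the slot, whether or not admission succeeds.
    func (m *devManager) admit(pod *v1.Pod) {
        m.pendingAdmissionPod = pod
    }

    // cleanup reclaims the devices of pods which are no longer active; it
    // runs, among other things, on every device allocation and on every
    // podresources query.
    func (m *devManager) cleanup(activePods []*v1.Pod) {
        keep := sets.New[string]()
        for _, p := range activePods {
            keep.Insert(string(p.UID))
        }
        if m.pendingAdmissionPod != nil {
            // The forced retention: the last pod to attempt admission stays
            // "active" even if admission failed, so its devices keep being
            // reported as allocated until another pod attempts admission.
            keep.Insert(string(m.pendingAdmissionPod.UID))
        }
        for uid := range m.allocated {
            if !keep.Has(uid) {
                delete(m.allocated, uid) // devices become available again
            }
        }
    }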
Signed-off-by: Francesco Romani
---
 test/e2e_node/device_plugin_test.go | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/test/e2e_node/device_plugin_test.go b/test/e2e_node/device_plugin_test.go
index 2db84ce2b80..744f1edb7b6 100644
--- a/test/e2e_node/device_plugin_test.go
+++ b/test/e2e_node/device_plugin_test.go
@@ -934,7 +934,7 @@ func testDevicePluginNodeReboot(f *framework.Framework, pluginSockDir string) {
 		// simulate node reboot scenario by removing pods using CRI before kubelet is started. In addition to that,
 		// intentionally a scenario is created where after node reboot, application pods requesting devices appear before the device plugin pod
 		// exposing those devices as resource has restarted. The expected behavior is that the application pod fails at admission time.
-		framework.It("Keeps device plugin assignments across node reboots (no pod restart, no device plugin re-registration)", framework.WithFlaky(), func(ctx context.Context) {
+		framework.It("Does not keep device plugin assignments across node reboots if fails admission (no pod restart, no device plugin re-registration)", framework.WithFlaky(), func(ctx context.Context) {
 			podRECMD := fmt.Sprintf("devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep %s", sleepIntervalForever)
 			pod1 := e2epod.NewPodClient(f).CreateSync(ctx, makeBusyboxPod(SampleDeviceResourceName, podRECMD))
 			deviceIDRE := "stub devices: (Dev-[0-9]+)"
@@ -985,9 +985,17 @@ func testDevicePluginNodeReboot(f *framework.Framework, pluginSockDir string) {
 				return err
 			}, 30*time.Second, framework.Poll).ShouldNot(gomega.HaveOccurred(), "cannot fetch the compute resource assignment after kubelet restart")
 
-			err, _ = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1})
-			framework.ExpectNoError(err, "inconsistent device assignment after node reboot")
-
+			// if we got this far, podresources API will now report 2 entries:
+			// - sample device plugin pod, running and doing fine
+			// - our test pod, in failed state. Pods in terminal state will still be reported, see https://github.com/kubernetes/kubernetes/issues/119423
+			// so we care about our test pod, and it will be present in the returned list till 119423 is fixed, but since it failed admission it must not have
+			// any device allocated to it, hence we check for empty device set in the podresources response. So, we check that
+			// A. our test pod must be present in the list response *and*
+			// B. it has no devices assigned to it.
+			// anything else is unexpected and thus makes the test fail. Once 119423 is fixed, a better, simpler and more intuitive check will be for the
+			// test pod to not be present in the podresources list response, but till that time we're stuck with this approach.
+			_, found := checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{})
+			gomega.Expect(found).To(gomega.BeTrueBecause("%s/%s/%s failed admission, should not have devices registered", pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name))
 		})
 	})
 }
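For reference, the property the new assertion encodes can be verified
directly against the kubelet podresources API (the gRPC endpoint lives
at /var/lib/kubelet/pod-resources/kubelet.sock on a default node
setup). A minimal sketch, assuming an already-connected client; the
helper name and return shape are illustrative and do not match the
test's actual checkPodResourcesAssignment signature:

    package sketch

    import (
        "context"
        "fmt"

        podresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1"
    )

    // podDeviceStatus reports whether the pod appears in the podresources
    // List response at all (pods in terminal state still do, see issue
    // 119423) and, if so, whether it holds zero devices.
    func podDeviceStatus(ctx context.Context, cli podresourcesv1.PodResourcesListerClient, ns, name string) (present, empty bool, err error) {
        resp, err := cli.List(ctx, &podresourcesv1.ListPodResourcesRequest{})
        if err != nil {
            return false, false, fmt.Errorf("podresources List failed: %w", err)
        }
        for _, pr := range resp.GetPodResources() {
            if pr.GetNamespace() != ns || pr.GetName() != name {
                continue
            }
            for _, cnt := range pr.GetContainers() {
                if len(cnt.GetDevices()) > 0 {
                    // present but still owning devices: for a pod which
                    // failed admission, this is exactly the bug scenario
                    return true, false, nil
                }
            }
            // present with an empty device set: the expected outcome for a
            // pod which failed admission, until 119423 is fixed
            return true, true, nil
        }
        // not present at all: the ideal outcome once 119423 is fixed
        return false, false, nil
    }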