From a26f4d855d5f4fdd391068f30720c7413e4f6e28 Mon Sep 17 00:00:00 2001
From: Swati Sehgal
Date: Wed, 26 Apr 2023 16:59:57 +0100
Subject: [PATCH] node: device-plugin: e2e: Capture pod admission failure

This test captures the scenario where, after a kubelet restart, an
application pod comes up while the device plugin pod hasn't re-registered
itself, and the pod fails with an admission error. It is worth noting that
once the device plugin pod has registered itself, another application pod
requesting devices ends up running successfully.

For the test case where the kubelet is restarted and the device plugin
re-registers without involving a pod restart, the pod created after the
kubelet restart ends up with an admission error, so we cannot be certain
which device the second pod (pod2) would get. As long as it gets a device,
we consider the test to pass.

Signed-off-by: Swati Sehgal
---
 test/e2e_node/device_plugin_test.go | 48 +++++++++++++++++------------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/test/e2e_node/device_plugin_test.go b/test/e2e_node/device_plugin_test.go
index 5ac7b2c1192..a436fa45f43 100644
--- a/test/e2e_node/device_plugin_test.go
+++ b/test/e2e_node/device_plugin_test.go
@@ -313,8 +313,12 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
 					CountSampleDeviceAllocatable(node) == expectedSampleDevsAmount
 			}, 30*time.Second, framework.Poll).Should(gomega.BeTrue())

-			err = e2epod.WaitTimeoutForPodRunningInNamespace(ctx, f.ClientSet, pod1.Name, f.Namespace.Name, 1*time.Minute)
-			framework.ExpectNoError(err)
+			ginkgo.By("Waiting for the pod to fail with admission error as device plugin hasn't re-registered yet")
+			gomega.Eventually(ctx, getPod).
+				WithArguments(f, pod1.Name).
+				WithTimeout(time.Minute).
+				Should(HaveFailedWithAdmissionError(),
+					"the pod succeeded to start, when it should fail with the admission error")

 			// crosscheck from the device assignment is preserved and stable from perspective of the kubelet.
 			// note we don't check again the logs of the container: the check is done at startup, the container
@@ -351,6 +355,26 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
 			ginkgo.By("Wait for node to be ready again")
 			e2enode.WaitForAllNodesSchedulable(ctx, f.ClientSet, 5*time.Minute)

+			ginkgo.By("Waiting for the pod to fail with admission error as device plugin hasn't re-registered yet")
+			gomega.Eventually(ctx, getPod).
+				WithArguments(f, pod1.Name).
+				WithTimeout(time.Minute).
+				Should(HaveFailedWithAdmissionError(),
+					"the pod succeeded to start, when it should fail with the admission error")
+
+			// crosscheck from the device assignment is preserved and stable from perspective of the kubelet.
+			// note we don't check again the logs of the container: the check is done at startup, the container
+			// never restarted (runs "forever" from this test timescale perspective) hence re-doing this check
+			// is useless.
+ ginkgo.By("Verifying the device assignment after kubelet restart using podresources API") + gomega.Eventually(ctx, func() error { + v1PodResources, err = getV1NodeDevices(ctx) + return err + }, 30*time.Second, framework.Poll).ShouldNot(gomega.HaveOccurred(), "cannot fetch the compute resource assignment after kubelet restart") + + err = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1}) + framework.ExpectNoError(err, "inconsistent device assignment after pod restart") + ginkgo.By("Re-Register resources by deleting the plugin pod") gp := int64(0) deleteOptions := metav1.DeleteOptions{ @@ -370,36 +394,20 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { CountSampleDeviceAllocatable(node) == expectedSampleDevsAmount }, 30*time.Second, framework.Poll).Should(gomega.BeTrue()) - // crosscheck from the device assignment is preserved and stable from perspective of the kubelet. - // note we don't check again the logs of the container: the check is done at startup, the container - // never restarted (runs "forever" from this test timescale perspective) hence re-doing this check - // is useless. - ginkgo.By("Verifying the device assignment after kubelet and device plugin restart using podresources API") - gomega.Eventually(ctx, func() error { - v1PodResources, err = getV1NodeDevices(ctx) - return err - }, 30*time.Second, framework.Poll).ShouldNot(gomega.HaveOccurred(), "cannot fetch the compute resource assignment after kubelet and device plugin restart") - - err = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1}) - framework.ExpectNoError(err, "inconsistent device assignment after pod restart") - ginkgo.By("Creating another pod") pod2 := e2epod.NewPodClient(f).CreateSync(ctx, makeBusyboxPod(SampleDeviceResourceName, podRECMD)) - ginkgo.By("Checking that pod got a different fake device") + ginkgo.By("Checking that pod got a fake device") devID2, err := parseLog(ctx, f, pod2.Name, pod2.Name, deviceIDRE) framework.ExpectNoError(err, "getting logs for pod %q", pod2.Name) - gomega.Expect(devID1).To(gomega.Not(gomega.Equal(devID2)), "pod2 requested a device but started successfully without") - ginkgo.By("Verifying the device assignment after kubelet restart and device plugin re-registration using podresources API") // note we don't use eventually: the kubelet is supposed to be running and stable by now, so the call should just succeed v1PodResources, err = getV1NodeDevices(ctx) if err != nil { framework.ExpectNoError(err, "getting pod resources assignment after pod restart") } - err = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1}) - framework.ExpectNoError(err, "inconsistent device assignment after extra container restart - pod1") + err = checkPodResourcesAssignment(v1PodResources, pod2.Namespace, pod2.Name, pod2.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID2}) framework.ExpectNoError(err, "inconsistent device assignment after extra container restart - pod2") })