From 225658884bc122f9012ecefbef2122f23cafbce8 Mon Sep 17 00:00:00 2001 From: David Porter Date: Tue, 31 Jan 2023 17:28:45 -0800 Subject: [PATCH] test: Fix node e2e device plugin flake The device plugin test expects that no other pods are running prior to the test starting. However, it has been observed that in some cases some resources may still be around from previous tests. This is because the deletion of resources from other tests is handled by deleting that test's framework's namespace, which is done asynchronously without waiting for the other test's namespace to be deleted. As a result, when the node e2e device plugin test starts, there may still be other pods in the process of termination. To work around this, add a retry to the device plugin test to account for the time it takes to delete the resources from the prior test. Signed-off-by: David Porter --- test/e2e_node/device_plugin_test.go | 31 +++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/test/e2e_node/device_plugin_test.go b/test/e2e_node/device_plugin_test.go index 9e6da4f48d1..5cdf50e2ba8 100644 --- a/test/e2e_node/device_plugin_test.go +++ b/test/e2e_node/device_plugin_test.go @@ -18,6 +18,7 @@ package e2enode import ( "context" + "fmt" "path/filepath" "regexp" "time" @@ -105,17 +106,31 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { return nodes == 1 }, time.Minute, time.Second).Should(gomega.BeTrue()) - v1alphaPodResources, err = getV1alpha1NodeDevices(ctx) - framework.ExpectNoError(err, "should get node local podresources by accessing the (v1alpha) podresources API endpoint") - - v1PodResources, err = getV1NodeDevices(ctx) - framework.ExpectNoError(err, "should get node local podresources by accessing the (v1) podresources API endpoint") - // Before we run the device plugin test, we need to ensure // that the cluster is in a clean state and there are no // pods running on this node. 
- gomega.Expect(v1alphaPodResources.PodResources).To(gomega.BeEmpty(), "should have no pod resources") - gomega.Expect(v1PodResources.PodResources).To(gomega.BeEmpty(), "should have no pod resources") + // This is done in a gomega.Eventually with retries since a prior test in a different test suite could've run and the deletion of its resources may still be in progress. + // xref: https://issue.k8s.io/115381 + gomega.Eventually(ctx, func(ctx context.Context) error { + v1alphaPodResources, err = getV1alpha1NodeDevices(ctx) + if err != nil { + return fmt.Errorf("failed to get node local podresources by accessing the (v1alpha) podresources API endpoint: %v", err) + } + + v1PodResources, err = getV1NodeDevices(ctx) + if err != nil { + return fmt.Errorf("failed to get node local podresources by accessing the (v1) podresources API endpoint: %v", err) + } + + if len(v1alphaPodResources.PodResources) > 0 { + return fmt.Errorf("expected v1alpha pod resources to be empty, but got non-empty resources: %+v", v1alphaPodResources.PodResources) + } + + if len(v1PodResources.PodResources) > 0 { + return fmt.Errorf("expected v1 pod resources to be empty, but got non-empty resources: %+v", v1PodResources.PodResources) + } + return nil + }, f.Timeouts.PodDelete, f.Timeouts.Poll).Should(gomega.Succeed()) ginkgo.By("Scheduling a sample device plugin pod") data, err := e2etestfiles.Read(SampleDevicePluginDSYAML)