test: Fix node e2e device plugin flake

The device plugin test expects that no other pods are running prior to
the test starting. However, it has been observed that in some cases
some resources may still be around from previous tests. This is because
resources from other tests are cleaned up by deleting each test
framework's namespace, which happens asynchronously: nothing waits for
the previous test's namespace to be fully deleted.

As a result, when the node e2e device plugin test starts, there may still
be other pods in the process of termination. To work around this, add a retry
to the device plugin test to account for the time it takes to delete the
resources from the prior test.

Signed-off-by: David Porter <david@porter.me>
This commit is contained in:
David Porter 2023-01-31 17:28:45 -08:00
parent f58f70bd57
commit 225658884b

View File

@ -18,6 +18,7 @@ package e2enode
import (
"context"
"fmt"
"path/filepath"
"regexp"
"time"
@ -105,17 +106,31 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
return nodes == 1
}, time.Minute, time.Second).Should(gomega.BeTrue())
v1alphaPodResources, err = getV1alpha1NodeDevices(ctx)
framework.ExpectNoError(err, "should get node local podresources by accessing the (v1alpha) podresources API endpoint")
v1PodResources, err = getV1NodeDevices(ctx)
framework.ExpectNoError(err, "should get node local podresources by accessing the (v1) podresources API endpoint")
// Before we run the device plugin test, we need to ensure
// that the cluster is in a clean state and there are no
// pods running on this node.
gomega.Expect(v1alphaPodResources.PodResources).To(gomega.BeEmpty(), "should have no pod resources")
gomega.Expect(v1PodResources.PodResources).To(gomega.BeEmpty(), "should have no pod resources")
// This is done in a gomega.Eventually with retries since a prior test in a different test suite could've run and the deletion of its resources may still be in progress.
// xref: https://issue.k8s.io/115381
gomega.Eventually(ctx, func(ctx context.Context) error {
v1alphaPodResources, err = getV1alpha1NodeDevices(ctx)
if err != nil {
return fmt.Errorf("failed to get node local podresources by accessing the (v1alpha) podresources API endpoint: %v", err)
}
v1PodResources, err = getV1NodeDevices(ctx)
if err != nil {
return fmt.Errorf("failed to get node local podresources by accessing the (v1) podresources API endpoint: %v", err)
}
if len(v1alphaPodResources.PodResources) > 0 {
return fmt.Errorf("expected v1alpha pod resources to be empty, but got non-empty resources: %+v", v1alphaPodResources.PodResources)
}
if len(v1PodResources.PodResources) > 0 {
return fmt.Errorf("expected v1 pod resources to be empty, but got non-empty resources: %+v", v1PodResources.PodResources)
}
return nil
}, f.Timeouts.PodDelete, f.Timeouts.Poll).Should(gomega.Succeed())
ginkgo.By("Scheduling a sample device plugin pod")
data, err := e2etestfiles.Read(SampleDevicePluginDSYAML)