From 23147ff4b37df4f49b9ac60e5ac2ad70bb927aed Mon Sep 17 00:00:00 2001
From: Francesco Romani
Date: Sun, 15 May 2022 09:44:30 +0200
Subject: [PATCH] e2e: node: devplugin: tolerate node readiness flip

In the AfterEach check of the e2e node device plugin tests, the tests
try hard to clean up after themselves:
- delete the sample device plugin
- restart the kubelet
- ensure that, after the restart, no stale sample devices (provided by
  the sample device plugin) are reported anymore.

We observed that in the AfterEach block of these e2e tests the kubelet
readiness state quite reliably flip/flops, possibly related to a race
with, or a slow, runtime/PLEG check. The readiness state is true, goes
false for a short interval, then goes true again and stays stable after
that (observed by adding more logs to the check loop).

The key factor here is that `getLocalNode` aborts the test (via
`framework.ExpectNoError`) if the node is not ready. So any occurrence
of this scenario, even a transient one, causes a test failure, which
makes the e2e test unnecessarily fragile without making it more
correct.

For the purpose of the test we can tolerate this kind of glitch, with
the kubelet flip/flopping its ready state, provided that we eventually
reach the final desired condition: the node reports ready AND reports
no sample devices present - which is the condition the code was trying
to check all along.

So we add `getLocalTestNode`, a variant of `getLocalNode` which just
fetches the node object the e2e_node framework created, together with a
flag reporting the node readiness. The new helper does not implicitly
abort the test if the node is not ready; it just bubbles this
information up to the caller.

Signed-off-by: Francesco Romani
---
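Reviewer note, illustrative only and not part of the change: the sketch
below shows the polling pattern this patch moves to, reusing names from
the hunks that follow (getLocalTestNode, numberOfSampleResources,
framework.Poll, the framework object f). The wrapper function name is
made up for the example, and the import paths are the ones the e2e_node
test package used at the time (adjust to ginkgo/v2 on newer trees).
Node readiness is folded into the Eventually condition, so a transient
NotReady flip just means "poll again" instead of aborting the test.

    package e2enode

    import (
    	"time"

    	"github.com/onsi/ginkgo"
    	"github.com/onsi/gomega"

    	"k8s.io/kubernetes/test/e2e/framework"
    )

    // waitForNoSampleResources is a hypothetical wrapper showing how a
    // caller consumes getLocalTestNode: keep polling until the node is
    // ready AND reports no sample devices, tolerating transient
    // readiness flips along the way.
    func waitForNoSampleResources(f *framework.Framework) {
    	ginkgo.By("Waiting for devices to become unavailable on the local node")
    	gomega.Eventually(func() bool {
    		node, ready := getLocalTestNode(f)
    		// A not-ready node is not a failure here; it only means the
    		// desired condition has not been reached yet.
    		return ready && numberOfSampleResources(node) <= 0
    	}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())
    }
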
ginkgo.It("Can schedule a pod that requires a device", func() { @@ -284,8 +289,9 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { ginkgo.By("Waiting for resource to become available on the local node after re-registration") gomega.Eventually(func() bool { - node := getLocalNode(f) - return numberOfDevicesCapacity(node, resourceName) == devsLen && + node, ready := getLocalTestNode(f) + return ready && + numberOfDevicesCapacity(node, resourceName) == devsLen && numberOfDevicesAllocatable(node, resourceName) == devsLen }, 30*time.Second, framework.Poll).Should(gomega.BeTrue()) diff --git a/test/e2e_node/util.go b/test/e2e_node/util.go index 25185255c94..220fc3b15d6 100644 --- a/test/e2e_node/util.go +++ b/test/e2e_node/util.go @@ -254,6 +254,19 @@ func getLocalNode(f *framework.Framework) *v1.Node { return &nodeList.Items[0] } +// getLocalTestNode fetches the node object describing the local worker node set up by the e2e_node infra, alongside with its ready state. +// getLocalTestNode is a variant of `getLocalNode` which reports but does not set any requirement about the node readiness state, letting +// the caller decide. The check is intentionally done like `getLocalNode` does. +// Note `getLocalNode` aborts (as in ginkgo.Expect) the test implicitly if the worker node is not ready. +func getLocalTestNode(f *framework.Framework) (*v1.Node, bool) { + node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{}) + framework.ExpectNoError(err) + ready := e2enode.IsNodeReady(node) + schedulable := e2enode.IsNodeSchedulable(node) + framework.Logf("node %q ready=%v schedulable=%v", node.Name, ready, schedulable) + return node, ready && schedulable +} + // logKubeletLatencyMetrics logs KubeletLatencyMetrics computed from the Prometheus // metrics exposed on the current node and identified by the metricNames. // The Kubelet subsystem prefix is automatically prepended to these metric names.