e2e: node: devplugin: tolerate node readiness flip
In the AfterEach block of the e2e node device plugin tests, the tests try hard to clean up after themselves:
- delete the sample device plugin
- restart the kubelet
- ensure that, after the restart, no stale sample devices (provided by the sample device plugin) are reported anymore

We observed that in this AfterEach block the kubelet readiness state quite reliably flips, possibly because of a race with a slow runtime/PLEG check. The kubelet readiness state is true, goes false for a short interval, then goes true again and stays stable after that (observed by adding more logs to the check loop).

The key factor here is that `getLocalNode` aborts the test (via `framework.ExpectNoError`) if the node is not ready, so any occurrence of this scenario, even a transient one, causes a test failure. I believe this makes the e2e test unnecessarily fragile without making it more correct. For the purpose of the test we can tolerate these glitches, with the kubelet flip/flopping its ready state, provided that we eventually reach the final desired condition: the node reports ready AND reports no sample devices present, which is the condition the code was trying to check in the first place.

So we add a variant of `getLocalNode` which just fetches the node object the e2e_node framework created, alongside a flag reporting the node readiness. The new helper does not implicitly abort the test if the node is not ready; it just bubbles this information up to the caller.

Signed-off-by: Francesco Romani <fromani@redhat.com>
This commit is contained in:
parent 56c539bff0
commit 23147ff4b3
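In short, the cleanup no longer asserts node readiness on every poll; readiness becomes part of the polled condition. A minimal sketch of the pattern the diff below implements (getLocalTestNode, numberOfSampleResources, and framework.Poll are the helpers used in the diff; the surrounding test scaffolding is omitted):

    // Tolerate transient readiness flips: a not-ready observation just means
    // "poll again"; only the eventual stable state decides the assertion.
    gomega.Eventually(func() bool {
        node, ready := getLocalTestNode(f)
        // Desired final condition: node ready AND no sample devices reported.
        return ready && numberOfSampleResources(node) <= 0
    }, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())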
@@ -130,14 +130,16 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
 
 		ginkgo.By("Waiting for devices to become available on the local node")
 		gomega.Eventually(func() bool {
-			return numberOfSampleResources(getLocalNode(f)) > 0
+			node, ready := getLocalTestNode(f)
+			return ready && numberOfSampleResources(node) > 0
 		}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())
 		framework.Logf("Successfully created device plugin pod")
 
 		ginkgo.By("Waiting for the resource exported by the sample device plugin to become available on the local node")
 		gomega.Eventually(func() bool {
-			node := getLocalNode(f)
-			return numberOfDevicesCapacity(node, resourceName) == devsLen &&
+			node, ready := getLocalTestNode(f)
+			return ready &&
+				numberOfDevicesCapacity(node, resourceName) == devsLen &&
 				numberOfDevicesAllocatable(node, resourceName) == devsLen
 		}, 30*time.Second, framework.Poll).Should(gomega.BeTrue())
 	})
@@ -162,8 +164,11 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
 
 		ginkgo.By("Waiting for devices to become unavailable on the local node")
 		gomega.Eventually(func() bool {
-			return numberOfSampleResources(getLocalNode(f)) <= 0
+			node, ready := getLocalTestNode(f)
+			return ready && numberOfSampleResources(node) <= 0
 		}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())
+
+		ginkgo.By("devices now unavailable on the local node")
 	})
 
 	ginkgo.It("Can schedule a pod that requires a device", func() {
@@ -284,8 +289,9 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
 
 		ginkgo.By("Waiting for resource to become available on the local node after re-registration")
 		gomega.Eventually(func() bool {
-			node := getLocalNode(f)
-			return numberOfDevicesCapacity(node, resourceName) == devsLen &&
+			node, ready := getLocalTestNode(f)
+			return ready &&
+				numberOfDevicesCapacity(node, resourceName) == devsLen &&
 				numberOfDevicesAllocatable(node, resourceName) == devsLen
 		}, 30*time.Second, framework.Poll).Should(gomega.BeTrue())
 
@@ -254,6 +254,19 @@ func getLocalNode(f *framework.Framework) *v1.Node {
 	return &nodeList.Items[0]
 }
 
+// getLocalTestNode fetches the node object describing the local worker node set up by the e2e_node infra, alongside with its ready state.
+// getLocalTestNode is a variant of `getLocalNode` which reports but does not set any requirement about the node readiness state, letting
+// the caller decide. The check is intentionally done like `getLocalNode` does.
+// Note `getLocalNode` aborts (as in ginkgo.Expect) the test implicitly if the worker node is not ready.
+func getLocalTestNode(f *framework.Framework) (*v1.Node, bool) {
+	node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{})
+	framework.ExpectNoError(err)
+	ready := e2enode.IsNodeReady(node)
+	schedulable := e2enode.IsNodeSchedulable(node)
+	framework.Logf("node %q ready=%v schedulable=%v", node.Name, ready, schedulable)
+	return node, ready && schedulable
+}
+
 // logKubeletLatencyMetrics logs KubeletLatencyMetrics computed from the Prometheus
 // metrics exposed on the current node and identified by the metricNames.
 // The Kubelet subsystem prefix is automatically prepended to these metric names.
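Worth noting as a design choice: per its doc comment, getLocalTestNode intentionally performs the same check getLocalNode does (here IsNodeReady && IsNodeSchedulable, logged on every call); the only difference is that the outcome is returned to the caller rather than aborting the test. The test hunks above then fold that flag into the gomega.Eventually condition, so a transient not-ready flip merely costs one more poll instead of failing the run.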