e2e: node: expose the running flag

Each e2e test knows whether it wants to restart a running kubelet or
a non-running kubelet. In the vast majority of cases we want to
restart a running kubelet (e.g. to change its config or to check that
some properties hold across kubelet crashes/restarts), but sometimes
we stop the kubelet, perform some actions and only then restart it.

To accommodate both use cases, we simply expose the `running` boolean
flag to the e2e tests.
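
A quick sketch of the two call patterns (illustrative only; the stop
step mentioned in the comment is not part of this change):

    // common case: the kubelet is up and we want to bounce it,
    // e.g. after changing its configuration
    restartKubelet(true)

    // less common case: the kubelet was stopped earlier in the test
    // (e.g. via systemctl), some work was done while it was down,
    // and only now do we start it again
    restartKubelet(false)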

Having `restartKubelet` explicitly restart a running kubelet helps us
troubleshoot e2e failures in which the kubelet was supposed to be
running but was not; attempting a restart in such cases only muddied
the waters further, making the troubleshooting and the eventual fix
harder.

In the happy path, there is no expected change in behaviour.

Signed-off-by: Francesco Romani <fromani@redhat.com>

@@ -220,7 +220,7 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
restartTime := time.Now()
ginkgo.By("Restarting Kubelet")
-restartKubelet()
+restartKubelet(true)
// We need to wait for node to be ready before re-registering stub device plugin.
// Otherwise, Kubelet DeviceManager may remove the re-registered sockets after it starts.

@@ -103,7 +103,7 @@ var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeat
f.PodClient().DeleteSync(p.Name, metav1.DeleteOptions{}, 2*time.Minute)
}
-restartKubelet()
+restartKubelet(true)
ginkgo.By("Waiting for GPUs to become unavailable on the local node")
gomega.Eventually(func() bool {
@@ -142,7 +142,7 @@ var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeat
framework.ExpectEqual(devIDRestart1, devID1)
ginkgo.By("Restarting Kubelet")
-restartKubelet()
+restartKubelet(true)
framework.WaitForAllNodesSchedulable(f.ClientSet, 30*time.Minute)
ginkgo.By("Checking that scheduled pods can continue to run even after we delete device plugin and restart Kubelet.")
@@ -172,7 +172,7 @@ var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeat
}
ginkgo.By("Restarting Kubelet")
-restartKubelet()
+restartKubelet(true)
ginkgo.By("Confirming that after a kubelet and pod restart, GPU assignment is kept")
ensurePodContainerRestart(f, p1.Name, p1.Name)
@@ -181,7 +181,7 @@ var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeat
ginkgo.By("Restarting Kubelet and creating another pod")
-restartKubelet()
+restartKubelet(true)
framework.WaitForAllNodesSchedulable(f.ClientSet, 30*time.Minute)
ensurePodContainerRestart(f, p1.Name, p1.Name)

@@ -207,7 +207,7 @@ var _ = SIGDescribe("HugePages [Serial] [Feature:HugePages][NodeSpecialFeature:H
framework.ExpectEqual(value.String(), "9Mi", "huge pages with size 3Mi should be supported")
ginkgo.By("restarting the node and verifying that huge pages with size 3Mi are not supported")
-restartKubelet()
+restartKubelet(true)
ginkgo.By("verifying that the hugepages-3Mi resource no longer is present")
gomega.Eventually(func() bool {

@@ -348,7 +348,7 @@ var _ = SIGDescribe("Memory Manager [Serial] [Feature:MemoryManager]", func() {
return kubeletHealthCheck(kubeletHealthCheckURL)
}, time.Minute, time.Second).Should(gomega.BeFalse())
-restartKubelet()
+restartKubelet(false)
// wait until the kubelet health check will pass
gomega.Eventually(func() bool {

@@ -731,7 +731,7 @@ var _ = SIGDescribe("POD Resources [Serial] [Feature:PodResources][NodeFeature:P
expectPodResources(1, cli, []podDesc{desc})
ginkgo.By("Restarting Kubelet")
-restartKubelet()
+restartKubelet(true)
framework.WaitForAllNodesSchedulable(f.ClientSet, framework.TestContext.NodeSchedulableTimeout)
expectPodResources(1, cli, []podDesc{desc})
tpd.deletePodsForTest(f)

@@ -414,8 +414,8 @@ func findKubletServiceName(running bool) string {
return kubeletServiceName
}
-func restartKubelet() {
-kubeletServiceName := findKubletServiceName(false)
+func restartKubelet(running bool) {
+kubeletServiceName := findKubletServiceName(running)
// reset the kubelet service start-limit-hit
stdout, err := exec.Command("sudo", "systemctl", "reset-failed", kubeletServiceName).CombinedOutput()
framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %s", err, string(stdout))
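
For reference, a minimal sketch of how the helper presumably continues
after the reset-failed step; the exact restart command and error
message below are assumptions, not part of this diff:

    // restart the kubelet unit resolved above; with running=true this is
    // the currently active unit, with running=false it is whatever kubelet
    // unit systemd knows about, even if it is currently stopped
    stdout, err = exec.Command("sudo", "systemctl", "restart", kubeletServiceName).CombinedOutput()
    framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %s", err, string(stdout))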