e2e: node: expose the running flag

Each e2e test knows whether it wants to restart a running kubelet or a
non-running kubelet. In the vast majority of cases we want to
restart a running kubelet (e.g. to change its config or to check
that some properties hold across kubelet crashes/restarts), but
sometimes we stop the kubelet, perform some actions and only then
restart it.

To accommodate both use cases, we expose the `running` boolean
flag to the e2e tests.

Having `restartKubelet` explicitly restart a running kubelet
helps us troubleshoot e2e failures in which the kubelet
was supposed to be running but was not; attempting a restart
in such cases only muddied the waters further, making the
troubleshooting and the eventual fix harder.

In the happy path, there is no expected change in behaviour.
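
As a rough sketch, the two call patterns now look like this. The
`stopKubelet` step below is a placeholder for whatever mechanism a
test uses to bring the kubelet down, not a helper taken from this
change; the Memory Manager hunk below is the one call site passing
`false`:

    // Common case: restart a kubelet that is expected to be running,
    // e.g. to apply a config change or to exercise a crash/restart.
    restartKubelet(true)

    // Rarer case: the kubelet was stopped earlier in the test; do
    // some work while it is down, then start it again explicitly.
    stopKubelet() // placeholder for the test-specific stop step
    // ... actions with the kubelet down ...
    restartKubelet(false)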

Signed-off-by: Francesco Romani <fromani@redhat.com>
Francesco Romani 2021-10-06 17:45:22 +02:00
parent e878c20ac7
commit d15bff2839
6 changed files with 10 additions and 10 deletions


@@ -220,7 +220,7 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
 	restartTime := time.Now()
 	ginkgo.By("Restarting Kubelet")
-	restartKubelet()
+	restartKubelet(true)
 	// We need to wait for node to be ready before re-registering stub device plugin.
 	// Otherwise, Kubelet DeviceManager may remove the re-registered sockets after it starts.


@@ -103,7 +103,7 @@ var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeat
 		f.PodClient().DeleteSync(p.Name, metav1.DeleteOptions{}, 2*time.Minute)
 	}
-	restartKubelet()
+	restartKubelet(true)
 	ginkgo.By("Waiting for GPUs to become unavailable on the local node")
 	gomega.Eventually(func() bool {
@@ -142,7 +142,7 @@ var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeat
 	framework.ExpectEqual(devIDRestart1, devID1)
 	ginkgo.By("Restarting Kubelet")
-	restartKubelet()
+	restartKubelet(true)
 	framework.WaitForAllNodesSchedulable(f.ClientSet, 30*time.Minute)
 	ginkgo.By("Checking that scheduled pods can continue to run even after we delete device plugin and restart Kubelet.")
@@ -172,7 +172,7 @@ var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeat
 	}
 	ginkgo.By("Restarting Kubelet")
-	restartKubelet()
+	restartKubelet(true)
 	ginkgo.By("Confirming that after a kubelet and pod restart, GPU assignment is kept")
 	ensurePodContainerRestart(f, p1.Name, p1.Name)
@@ -181,7 +181,7 @@ var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeat
 	ginkgo.By("Restarting Kubelet and creating another pod")
-	restartKubelet()
+	restartKubelet(true)
 	framework.WaitForAllNodesSchedulable(f.ClientSet, 30*time.Minute)
 	ensurePodContainerRestart(f, p1.Name, p1.Name)


@@ -207,7 +207,7 @@ var _ = SIGDescribe("HugePages [Serial] [Feature:HugePages][NodeSpecialFeature:H
 	framework.ExpectEqual(value.String(), "9Mi", "huge pages with size 3Mi should be supported")
 	ginkgo.By("restarting the node and verifying that huge pages with size 3Mi are not supported")
-	restartKubelet()
+	restartKubelet(true)
 	ginkgo.By("verifying that the hugepages-3Mi resource no longer is present")
 	gomega.Eventually(func() bool {


@@ -348,7 +348,7 @@ var _ = SIGDescribe("Memory Manager [Serial] [Feature:MemoryManager]", func() {
 		return kubeletHealthCheck(kubeletHealthCheckURL)
 	}, time.Minute, time.Second).Should(gomega.BeFalse())
-	restartKubelet()
+	restartKubelet(false)
 	// wait until the kubelet health check will pass
 	gomega.Eventually(func() bool {


@@ -731,7 +731,7 @@ var _ = SIGDescribe("POD Resources [Serial] [Feature:PodResources][NodeFeature:P
 	expectPodResources(1, cli, []podDesc{desc})
 	ginkgo.By("Restarting Kubelet")
-	restartKubelet()
+	restartKubelet(true)
 	framework.WaitForAllNodesSchedulable(f.ClientSet, framework.TestContext.NodeSchedulableTimeout)
 	expectPodResources(1, cli, []podDesc{desc})
 	tpd.deletePodsForTest(f)


@@ -414,8 +414,8 @@ func findKubletServiceName(running bool) string {
 	return kubeletServiceName
 }
 
-func restartKubelet() {
-	kubeletServiceName := findKubletServiceName(false)
+func restartKubelet(running bool) {
+	kubeletServiceName := findKubletServiceName(running)
 	// reset the kubelet service start-limit-hit
 	stdout, err := exec.Command("sudo", "systemctl", "reset-failed", kubeletServiceName).CombinedOutput()
 	framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %s", err, string(stdout))
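
For context, the resulting helper plausibly reads as follows. The hunk
above ends at the reset-failed step, so the trailing systemctl restart
call is an assumption based on the function's name and the usual
reset-then-restart pattern, not part of the diff shown:

    func restartKubelet(running bool) {
    	// With running=true we look up the service unit of a kubelet
    	// that is expected to be active; with running=false, one that
    	// was stopped earlier in the test.
    	kubeletServiceName := findKubletServiceName(running)

    	// Reset start-limit-hit so systemd does not refuse the restart
    	// after several restarts in quick succession.
    	stdout, err := exec.Command("sudo", "systemctl", "reset-failed", kubeletServiceName).CombinedOutput()
    	framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %s", err, string(stdout))

    	// Assumed continuation: actually restart the unit.
    	stdout, err = exec.Command("sudo", "systemctl", "restart", kubeletServiceName).CombinedOutput()
    	framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %s", err, string(stdout))
    }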