From a6b4868b8d79882cc65f94fcb7355d25076d3c09 Mon Sep 17 00:00:00 2001 From: Artyom Lukianov Date: Mon, 1 Mar 2021 14:15:37 +0200 Subject: [PATCH 1/2] e2e node: stop kubelet service instead of restarting it The server service monitors the kubelet service and restarts it once the service is down. To avoid restarting the kubelet twice, we stop the kubelet service and wait until the kubelet has been restarted and the node is ready. Signed-off-by: Artyom Lukianov --- test/e2e_node/hugepages_test.go | 6 ++++-- test/e2e_node/util.go | 8 +++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/test/e2e_node/hugepages_test.go b/test/e2e_node/hugepages_test.go index da6c2b1e2c4..80e901f655a 100644 --- a/test/e2e_node/hugepages_test.go +++ b/test/e2e_node/hugepages_test.go @@ -337,7 +337,8 @@ var _ = SIGDescribe("HugePages [Serial] [Feature:HugePages][NodeSpecialFeature:H setHugepages() ginkgo.By("restarting kubelet to pick up pre-allocated hugepages") - restartKubelet() + // stop the kubelet and wait until the server will restart it automatically + stopKubelet() waitForHugepages() @@ -352,7 +353,8 @@ var _ = SIGDescribe("HugePages [Serial] [Feature:HugePages][NodeSpecialFeature:H releaseHugepages() ginkgo.By("restarting kubelet to pick up pre-allocated hugepages") - restartKubelet() + // stop the kubelet and wait until the server will restart it automatically + stopKubelet() waitForHugepages() }) diff --git a/test/e2e_node/util.go b/test/e2e_node/util.go index fd6273eac55..6d06019e825 100644 --- a/test/e2e_node/util.go +++ b/test/e2e_node/util.go @@ -418,8 +418,14 @@ func restartKubelet() { // stopKubelet will kill the running kubelet, and returns a func that will restart the process again func stopKubelet() func() { kubeletServiceName := findRunningKubletServiceName() - stdout, err := exec.Command("sudo", "systemctl", "kill", kubeletServiceName).CombinedOutput() + + // reset the kubelet service start-limit-hit + stdout, err := exec.Command("sudo", 
"systemctl", "reset-failed", kubeletServiceName).CombinedOutput() + framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %v", err, stdout) + + stdout, err = exec.Command("sudo", "systemctl", "kill", kubeletServiceName).CombinedOutput() framework.ExpectNoError(err, "Failed to stop kubelet with systemctl: %v, %v", err, stdout) + return func() { stdout, err := exec.Command("sudo", "systemctl", "start", kubeletServiceName).CombinedOutput() framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %v", err, stdout) From ef3e0fd02fdbe031a107903760d4ba12a62d647e Mon Sep 17 00:00:00 2001 From: Artyom Lukianov Date: Tue, 2 Mar 2021 14:07:30 +0200 Subject: [PATCH 2/2] e2e node: wait for kubelet health check to pass after kubelet restart Signed-off-by: Artyom Lukianov --- test/e2e_node/hugepages_test.go | 8 ++++++++ test/e2e_node/util.go | 24 ++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/test/e2e_node/hugepages_test.go b/test/e2e_node/hugepages_test.go index 80e901f655a..d3c54c457dd 100644 --- a/test/e2e_node/hugepages_test.go +++ b/test/e2e_node/hugepages_test.go @@ -339,6 +339,14 @@ var _ = SIGDescribe("HugePages [Serial] [Feature:HugePages][NodeSpecialFeature:H ginkgo.By("restarting kubelet to pick up pre-allocated hugepages") // stop the kubelet and wait until the server will restart it automatically stopKubelet() + // wait until the kubelet health check will fail + gomega.Eventually(func() bool { + return kubeletHealthCheck(kubeletHealthCheckURL) + }, time.Minute, time.Second).Should(gomega.BeFalse()) + // wait until the kubelet health check will pass + gomega.Eventually(func() bool { + return kubeletHealthCheck(kubeletHealthCheckURL) + }, 2*time.Minute, 10*time.Second).Should(gomega.BeTrue()) waitForHugepages() diff --git a/test/e2e_node/util.go b/test/e2e_node/util.go index 6d06019e825..4300a993a9c 100644 --- a/test/e2e_node/util.go +++ b/test/e2e_node/util.go @@ -18,6 +18,7 @@ package 
e2enode import ( "context" + "crypto/tls" "encoding/json" "flag" "fmt" @@ -70,6 +71,8 @@ const ( defaultPodResourcesPath = "/var/lib/kubelet/pod-resources" defaultPodResourcesTimeout = 10 * time.Second defaultPodResourcesMaxSize = 1024 * 1024 * 16 // 16 Mb + kubeletReadOnlyPort = "10255" + kubeletHealthCheckURL = "http://127.0.0.1:" + kubeletReadOnlyPort + "/healthz" ) func getNodeSummary() (*stats.Summary, error) { @@ -432,6 +435,27 @@ func stopKubelet() func() { } } +func kubeletHealthCheck(url string) bool { + insecureTransport := http.DefaultTransport.(*http.Transport).Clone() + insecureTransport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true} + insecureHTTPClient := &http.Client{ + Transport: insecureTransport, + } + + req, err := http.NewRequest("HEAD", url, nil) + if err != nil { + return false + } + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", framework.TestContext.BearerToken)) + resp, err := insecureHTTPClient.Do(req) + if err != nil { + klog.Warningf("Health check on %q failed, error=%v", url, err) + } else if resp.StatusCode != http.StatusOK { + klog.Warningf("Health check on %q failed, status=%d", url, resp.StatusCode) + } + return err == nil && resp.StatusCode == http.StatusOK +} + func toCgroupFsName(cgroupName cm.CgroupName) string { if framework.TestContext.KubeletConfig.CgroupDriver == "systemd" { return cgroupName.ToSystemd()