diff --git a/test/e2e_node/hugepages_test.go b/test/e2e_node/hugepages_test.go
index da6c2b1e2c4..d3c54c457dd 100644
--- a/test/e2e_node/hugepages_test.go
+++ b/test/e2e_node/hugepages_test.go
@@ -337,7 +337,16 @@ var _ = SIGDescribe("HugePages [Serial] [Feature:HugePages][NodeSpecialFeature:H
 			setHugepages()
 
 			ginkgo.By("restarting kubelet to pick up pre-allocated hugepages")
-			restartKubelet()
+			// stop the kubelet; the server is expected to restart it automatically
+			stopKubelet()
+			// wait until the kubelet health check fails, i.e. the kubelet is down
+			gomega.Eventually(func() bool {
+				return kubeletHealthCheck(kubeletHealthCheckURL)
+			}, time.Minute, time.Second).Should(gomega.BeFalse())
+			// wait until the kubelet health check passes again
+			gomega.Eventually(func() bool {
+				return kubeletHealthCheck(kubeletHealthCheckURL)
+			}, 2*time.Minute, 10*time.Second).Should(gomega.BeTrue())
 
 			waitForHugepages()
 
@@ -352,7 +361,8 @@ var _ = SIGDescribe("HugePages [Serial] [Feature:HugePages][NodeSpecialFeature:H
 			releaseHugepages()
 
 			ginkgo.By("restarting kubelet to pick up pre-allocated hugepages")
-			restartKubelet()
+			// stop the kubelet; the server is expected to restart it automatically
+			stopKubelet()
 
 			waitForHugepages()
 		})
diff --git a/test/e2e_node/util.go b/test/e2e_node/util.go
index fd6273eac55..4300a993a9c 100644
--- a/test/e2e_node/util.go
+++ b/test/e2e_node/util.go
@@ -18,6 +18,7 @@ package e2enode
 
 import (
 	"context"
+	"crypto/tls"
 	"encoding/json"
 	"flag"
 	"fmt"
@@ -70,6 +71,8 @@ const (
 	defaultPodResourcesPath    = "/var/lib/kubelet/pod-resources"
 	defaultPodResourcesTimeout = 10 * time.Second
 	defaultPodResourcesMaxSize = 1024 * 1024 * 16 // 16 Mb
+	kubeletReadOnlyPort        = "10255"
+	kubeletHealthCheckURL      = "http://127.0.0.1:" + kubeletReadOnlyPort + "/healthz"
 )
 
 func getNodeSummary() (*stats.Summary, error) {
@@ -418,14 +421,58 @@ func restartKubelet() {
 // stopKubelet will kill the running kubelet, and returns a func that will restart the process again
 func stopKubelet() func() {
 	kubeletServiceName := findRunningKubletServiceName()
-	stdout, err := exec.Command("sudo", "systemctl", "kill", kubeletServiceName).CombinedOutput()
+
+	// reset the failed state of the kubelet service (start-limit-hit) before
+	// killing it; stdout is a []byte, so it is formatted with %s to stay readable
+	stdout, err := exec.Command("sudo", "systemctl", "reset-failed", kubeletServiceName).CombinedOutput()
+	framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %s", err, stdout)
+
+	stdout, err = exec.Command("sudo", "systemctl", "kill", kubeletServiceName).CombinedOutput()
 	framework.ExpectNoError(err, "Failed to stop kubelet with systemctl: %v, %v", err, stdout)
+
 	return func() {
 		stdout, err := exec.Command("sudo", "systemctl", "start", kubeletServiceName).CombinedOutput()
 		framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %v", err, stdout)
 	}
 }
 
+// kubeletHealthCheck issues a HEAD request against url and reports whether the
+// endpoint answered with 200 OK. Request errors and non-200 statuses are logged
+// as warnings and reported as false.
+func kubeletHealthCheck(url string) bool {
+	insecureTransport := http.DefaultTransport.(*http.Transport).Clone()
+	insecureTransport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true}
+	// The transport is private to this call; close its pooled connections on
+	// return so that repeated polling does not accumulate them.
+	defer insecureTransport.CloseIdleConnections()
+
+	// Bound every probe so that an unresponsive kubelet cannot block the caller.
+	insecureHTTPClient := &http.Client{
+		Transport: insecureTransport,
+		Timeout:   10 * time.Second,
+	}
+
+	req, err := http.NewRequest(http.MethodHead, url, nil)
+	if err != nil {
+		klog.Warningf("Health check on %q failed, error=%v", url, err)
+		return false
+	}
+	req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", framework.TestContext.BearerToken))
+
+	resp, err := insecureHTTPClient.Do(req)
+	if err != nil {
+		klog.Warningf("Health check on %q failed, error=%v", url, err)
+		return false
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		klog.Warningf("Health check on %q failed, status=%d", url, resp.StatusCode)
+		return false
+	}
+	return true
+}
+
 func toCgroupFsName(cgroupName cm.CgroupName) string {
 	if framework.TestContext.KubeletConfig.CgroupDriver == "systemd" {
 		return cgroupName.ToSystemd()