From 4eb2c570f781e86b4c2b7775c47d0a964f50956e Mon Sep 17 00:00:00 2001 From: Sergey Kanzhelev Date: Thu, 28 Apr 2022 22:05:42 +0000 Subject: [PATCH] check for the test duraiton to make NodeProblemDetector test reliable in long running environments --- test/e2e/node/node_problem_detector.go | 73 ++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 10 deletions(-) diff --git a/test/e2e/node/node_problem_detector.go b/test/e2e/node/node_problem_detector.go index a61a894d5b6..f00e35eec21 100644 --- a/test/e2e/node/node_problem_detector.go +++ b/test/e2e/node/node_problem_detector.go @@ -60,8 +60,6 @@ var _ = SIGDescribe("NodeProblemDetector", func() { }) ginkgo.It("should run without error", func() { - e2eskipper.SkipUnlessSSHKeyPresent() - ginkgo.By("Getting all nodes and their SSH-able IP addresses") readyNodes, err := e2enode.GetReadySchedulableNodes(f.ClientSet) framework.ExpectNoError(err) @@ -98,6 +96,13 @@ var _ = SIGDescribe("NodeProblemDetector", func() { rssStats := make(map[string][]float64) workingSetStats := make(map[string][]float64) + // Some tests suites running for days. + // This test is not marked as Disruptive or Serial so we do not want to + // restart the kubelet during the test to check for KubeletStart event + // detection. We use heuristic here to check if we need to validate for the + // KubeletStart event since there is no easy way to check when test has actually started. + checkForKubeletStart := false + for _, host := range hosts { cpuUsageStats[host] = []float64{} uptimeStats[host] = []float64{} @@ -119,12 +124,56 @@ var _ = SIGDescribe("NodeProblemDetector", func() { gomega.Expect(result.Stdout).To(gomega.ContainSubstring("node-problem-detector")) ginkgo.By(fmt.Sprintf("Check node-problem-detector is running fine on node %q", host)) - journalctlCmd := "sudo journalctl -u node-problem-detector" + journalctlCmd := "sudo journalctl -r -u node-problem-detector" result, err = e2essh.SSH(journalctlCmd, host, framework.TestContext.Provider) framework.ExpectNoError(err) framework.ExpectEqual(result.Code, 0) gomega.Expect(result.Stdout).NotTo(gomega.ContainSubstring("node-problem-detector.service: Failed")) + // Let's assume that node problem detector has started the same time as kubelet + // We only will check for the KubeletStart if parsing of date here succeed + // This is an optimization to not ssh one more time to get kubelet's start time. + // Also we assume specific datetime format to simplify the logic + output := result.Stdout + + // searching for the line "Apr 14 04:47:42 gke-cluster-1-default-pool-b1565719-eqht systemd[1]: Started Kubernetes node problem detector." and fallback to + idx := strings.Index(output, "Started Kubernetes node problem detector") + if idx != -1 { + output = output[:idx] + idx = strings.LastIndex(output, "\n") + + if idx != -1 { + output = output[0:15] + } + + st, err := time.Parse("Jan 02 15:04:05", output) + st = st.AddDate(time.Now().Year(), 0, 0) + + if err == nil { + checkForKubeletStart = time.Since(st) < time.Hour + } + } else { + // fallback to searching the line: + // -- Logs begin at Thu 2022-04-28 17:32:39 UTC, end at Thu 2022-04-28 17:40:05 UTC. -- + idx := strings.Index(output, ", end at ") + + output = output[:idx] + idx = strings.LastIndex(output, "-- Logs begin at ") + if idx != -1 { + output = output[17:] + } + + st, err := time.Parse("Mon 2006-01-02 15:04:05 MST", output) + + if err == nil { + checkForKubeletStart = time.Since(st) < time.Hour + } + } + + if !checkForKubeletStart { + ginkgo.By("KubeletStart event will NOT be checked") + } + if isStandaloneMode[host] { cpuUsage, uptime := getCPUStat(f, host) cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage) @@ -152,13 +201,17 @@ var _ = SIGDescribe("NodeProblemDetector", func() { return verifyEvents(f, eventListOptions, 1, "DockerHung", node.Name) }, pollTimeout, pollInterval).Should(gomega.Succeed()) - // Node problem detector reports kubelet start events automatically starting from NPD v0.7.0+. - // Since Kubelet may be restarted for a few times after node is booted. We just check the event - // is detected, but do not check how many times Kubelet is started. - ginkgo.By(fmt.Sprintf("Check node-problem-detector posted KubeletStart event on node %q", node.Name)) - gomega.Eventually(func() error { - return verifyEventExists(f, eventListOptions, "KubeletStart", node.Name) - }, pollTimeout, pollInterval).Should(gomega.Succeed()) + if checkForKubeletStart { + // Node problem detector reports kubelet start events automatically starting from NPD v0.7.0+. + // Since Kubelet may be restarted for a few times after node is booted. We just check the event + // is detected, but do not check how many times Kubelet is started. + // + // Some test suites run for hours and KubeletStart event will already be cleaned up + ginkgo.By(fmt.Sprintf("Check node-problem-detector posted KubeletStart event on node %q", node.Name)) + gomega.Eventually(func() error { + return verifyEventExists(f, eventListOptions, "KubeletStart", node.Name) + }, pollTimeout, pollInterval).Should(gomega.Succeed()) + } } ginkgo.By("Gather node-problem-detector cpu and memory stats")