Merge pull request #41240 from Random-Liu/update-npd-test

Automatic merge from submit-queue (batch tested with PRs 41844, 41803, 39116, 41129, 41240) NPD: Update NPD test. For https://github.com/kubernetes/node-problem-detector/issues/58. Update NPD e2e test based on the new behavior. Note that before merging this PR, we need to merge all pending PRs in npd, and release the v0.3.0-alpha.1 version of NPD. /cc @dchen1107 @kubernetes/node-problem-detector-reviewers
2025-09-14 13:45:06 +00:00 · 2017-02-22 05:48:45 -08:00
parent e1b174842a bb8e2530c5
commit eef16cf141
1 changed files with 106 additions and 54 deletions
--- a/test/e2e/node_problem_detector.go
+++ b/test/e2e/node_problem_detector.go
@@ -19,6 +19,7 @@ package e2e
 import (
 	"fmt"
 	"path/filepath"
+	"strconv"
 	"strings"
 	"time"

@@ -45,13 +46,13 @@ var _ = framework.KubeDescribe("NodeProblemDetector", func() {
 		pollInterval   = 1 * time.Second
 		pollConsistent = 5 * time.Second
 		pollTimeout    = 1 * time.Minute
-		image          = "gcr.io/google_containers/node-problem-detector:v0.2"
+		image          = "gcr.io/google_containers/node-problem-detector:v0.3.0-alpha.1"
 	)
 	f := framework.NewDefaultFramework("node-problem-detector")
 	var c clientset.Interface
 	var uid string
 	var ns, name, configName, eventNamespace string
-	var nodeTime time.Time
+	var bootTime, nodeTime time.Time
 	BeforeEach(func() {
 		c = f.ClientSet
 		ns = f.Namespace.Name
@@ -72,14 +73,13 @@ var _ = framework.KubeDescribe("NodeProblemDetector", func() {
 		framework.ExpectNoError(err)
 	})

-	// Test kernel monitor. We may add other tests if we have more problem daemons in the future.
-	framework.KubeDescribe("KernelMonitor", func() {
+	// Test system log monitor. We may add other tests if we have more problem daemons in the future.
+	framework.KubeDescribe("SystemLogMonitor", func() {
 		const (
 			// Use test condition to avoid conflict with real node problem detector
 			// TODO(random-liu): Now node condition could be arbitrary string, consider wether we need to
 			// add TestCondition when switching to predefined condition list.
 			condition    = v1.NodeConditionType("TestCondition")
-			lookback     = time.Hour // Assume the test won't take more than 1 hour, in fact it usually only takes 90 seconds.
 			startPattern = "test reboot"

 			// File paths used in the test.
@@ -99,10 +99,13 @@ var _ = framework.KubeDescribe("NodeProblemDetector", func() {
 			defaultMessage = "default message"
 			tempReason     = "Temporary"
 			tempMessage    = "temporary error"
-			permReason     = "Permanent"
-			permMessage    = "permanent error"
+			permReason1    = "Permanent1"
+			permMessage1   = "permanent error 1"
+			permReason2    = "Permanent2"
+			permMessage2   = "permanent error 2"
 		)
 		var source, config, tmpDir string
+		var lookback time.Duration
 		var node *v1.Node
 		var eventListOptions metav1.ListOptions
 		injectCommand := func(timestamp time.Time, log string, num int) string {
@@ -116,13 +119,36 @@ var _ = framework.KubeDescribe("NodeProblemDetector", func() {

 		BeforeEach(func() {
 			framework.SkipUnlessProviderIs(framework.ProvidersWithSSH...)
+			By("Get a non master node to run the pod")
+			nodes, err := c.Core().Nodes().List(metav1.ListOptions{})
+			Expect(err).NotTo(HaveOccurred())
+			node = nil
+			for _, n := range nodes.Items {
+				if !system.IsMasterNode(n.Name) {
+					node = &n
+					break
+				}
+			}
+			Expect(node).NotTo(BeNil())
+			By("Calculate Lookback duration")
+			nodeTime, bootTime, err = getNodeTime(node)
+			Expect(err).To(BeNil())
+			// Set lookback duration longer than node up time.
+			// Assume the test won't take more than 1 hour, in fact it usually only takes 90 seconds.
+			lookback = nodeTime.Sub(bootTime) + time.Hour
+
 			// Randomize the source name to avoid conflict with real node problem detector
 			source = "kernel-monitor-" + uid
 			config = `
 			{
+				"plugin": "filelog",
+				"pluginConfig": {
+					"timestamp": "^.{15}",
+					"message": "kernel: \\[.*\\] (.*)",
+					"timestampFormat": "` + time.Stamp + `"
+				},
 				"logPath": "` + filepath.Join(logDir, logFile) + `",
 				"lookback": "` + lookback.String() + `",
-				"startPattern": "` + startPattern + `",
 				"bufferSize": 10,
 				"source": "` + source + `",
 				"conditions": [
@@ -141,22 +167,17 @@ var _ = framework.KubeDescribe("NodeProblemDetector", func() {
 				{
 					"type": "permanent",
 					"condition": "` + string(condition) + `",
-					"reason": "` + permReason + `",
-					"pattern": "` + permMessage + `"
+					"reason": "` + permReason1 + `",
+					"pattern": "` + permMessage1 + ".*" + `"
+				},
+				{
+					"type": "permanent",
+					"condition": "` + string(condition) + `",
+					"reason": "` + permReason2 + `",
+					"pattern": "` + permMessage2 + ".*" + `"
 				}
 				]
 			}`
-			By("Get a non master node to run the pod")
-			nodes, err := c.Core().Nodes().List(metav1.ListOptions{})
-			Expect(err).NotTo(HaveOccurred())
-			node = nil
-			for _, n := range nodes.Items {
-				if !system.IsMasterNode(n.Name) {
-					node = &n
-					break
-				}
-			}
-			Expect(node).NotTo(BeNil())
 			By("Generate event list options")
 			selector := fields.Set{
 				"involvedObject.kind":      "Node",
@@ -184,7 +205,6 @@ var _ = framework.KubeDescribe("NodeProblemDetector", func() {
 				},
 				Spec: v1.PodSpec{
 					NodeName:        node.Name,
-					HostNetwork:     true,
 					SecurityContext: &v1.PodSecurityContext{},
 					Volumes: []v1.Volume{
 						{
@@ -212,7 +232,7 @@ var _ = framework.KubeDescribe("NodeProblemDetector", func() {
 						{
 							Name:            name,
 							Image:           image,
-							Command:         []string{"/node-problem-detector", "--kernel-monitor=" + filepath.Join(configDir, configFile)},
+							Command:         []string{"/node-problem-detector", "--system-log-monitors=" + filepath.Join(configDir, configFile), "--logtostderr"},
 							ImagePullPolicy: v1.PullAlways,
 							Env: []v1.EnvVar{
 								{
@@ -246,13 +266,6 @@ var _ = framework.KubeDescribe("NodeProblemDetector", func() {
 			Expect(err).NotTo(HaveOccurred())
 			By("Wait for node problem detector running")
 			Expect(f.WaitForPodRunning(name)).To(Succeed())
-			// Get the node time
-			nodeIP := framework.GetNodeExternalIP(node)
-			result, err := framework.SSH("date '+%FT%T.%N%:z'", nodeIP, framework.TestContext.Provider)
-			Expect(err).ShouldNot(HaveOccurred())
-			Expect(result.Code).Should(BeZero())
-			nodeTime, err = time.Parse(time.RFC3339, strings.TrimSpace(result.Stdout))
-			Expect(err).ShouldNot(HaveOccurred())
 		})

 		It("should generate node condition and events for corresponding errors", func() {
@@ -274,7 +287,7 @@ var _ = framework.KubeDescribe("NodeProblemDetector", func() {
 				},
 				{
 					description:      "should not generate events for too old log",
-					timestamp:        nodeTime.Add(-3 * lookback), // Assume 3*lookback is old enough
+					timestamp:        bootTime.Add(-1 * time.Minute),
 					message:          tempMessage,
 					messageNum:       3,
 					conditionReason:  defaultReason,
@@ -283,8 +296,8 @@ var _ = framework.KubeDescribe("NodeProblemDetector", func() {
 				},
 				{
 					description:      "should not change node condition for too old log",
-					timestamp:        nodeTime.Add(-3 * lookback), // Assume 3*lookback is old enough
-					message:          permMessage,
+					timestamp:        bootTime.Add(-1 * time.Minute),
+					message:          permMessage1,
 					messageNum:       1,
 					conditionReason:  defaultReason,
 					conditionMessage: defaultMessage,
@@ -292,7 +305,7 @@ var _ = framework.KubeDescribe("NodeProblemDetector", func() {
 				},
 				{
 					description:      "should generate event for old log within lookback duration",
-					timestamp:        nodeTime.Add(-1 * time.Minute),
+					timestamp:        nodeTime,
 					message:          tempMessage,
 					messageNum:       3,
 					events:           3,
@@ -302,23 +315,13 @@ var _ = framework.KubeDescribe("NodeProblemDetector", func() {
 				},
 				{
 					description:      "should change node condition for old log within lookback duration",
-					timestamp:        nodeTime.Add(-1 * time.Minute),
-					message:          permMessage,
-					messageNum:       1,
-					events:           3, // event number should not change
-					conditionReason:  permReason,
-					conditionMessage: permMessage,
-					conditionType:    v1.ConditionTrue,
-				},
-				{
-					description:      "should reset node condition if the node is reboot",
 					timestamp:        nodeTime,
-					message:          startPattern,
+					message:          permMessage1,
 					messageNum:       1,
 					events:           3, // event number should not change
-					conditionReason:  defaultReason,
-					conditionMessage: defaultMessage,
-					conditionType:    v1.ConditionFalse,
+					conditionReason:  permReason1,
+					conditionMessage: permMessage1,
+					conditionType:    v1.ConditionTrue,
 				},
 				{
 					description:      "should generate event for new log",
@@ -326,18 +329,28 @@ var _ = framework.KubeDescribe("NodeProblemDetector", func() {
 					message:          tempMessage,
 					messageNum:       3,
 					events:           6,
-					conditionReason:  defaultReason,
-					conditionMessage: defaultMessage,
-					conditionType:    v1.ConditionFalse,
+					conditionReason:  permReason1,
+					conditionMessage: permMessage1,
+					conditionType:    v1.ConditionTrue,
+				},
+				{
+					description:      "should not update node condition with the same reason",
+					timestamp:        nodeTime.Add(5 * time.Minute),
+					message:          permMessage1 + "different message",
+					messageNum:       1,
+					events:           6, // event number should not change
+					conditionReason:  permReason1,
+					conditionMessage: permMessage1,
+					conditionType:    v1.ConditionTrue,
 				},
 				{
 					description:      "should change node condition for new log",
 					timestamp:        nodeTime.Add(5 * time.Minute),
-					message:          permMessage,
+					message:          permMessage2,
 					messageNum:       1,
 					events:           6, // event number should not change
-					conditionReason:  permReason,
-					conditionMessage: permMessage,
+					conditionReason:  permReason2,
+					conditionMessage: permMessage2,
 					conditionType:    v1.ConditionTrue,
 				},
 			} {
@@ -392,6 +405,45 @@ var _ = framework.KubeDescribe("NodeProblemDetector", func() {
 	})
 })

+// getNodeTime gets node boot time and current time by running ssh command on the node.
+func getNodeTime(node *v1.Node) (time.Time, time.Time, error) {
+	nodeIP := framework.GetNodeExternalIP(node)
+
+	// Get node current time.
+	result, err := framework.SSH("date '+%FT%T.%N%:z'", nodeIP, framework.TestContext.Provider)
+	if err != nil {
+		return time.Time{}, time.Time{}, fmt.Errorf("failed to run ssh command to get node time: %v", err)
+	}
+	if result.Code != 0 {
+		return time.Time{}, time.Time{}, fmt.Errorf("failed to run ssh command with error code: %d", result.Code)
+	}
+	timestamp := strings.TrimSpace(result.Stdout)
+	nodeTime, err := time.Parse(time.RFC3339, timestamp)
+	if err != nil {
+		return time.Time{}, time.Time{}, fmt.Errorf("failed to parse node time %q: %v", timestamp, err)
+	}
+
+	// Get system uptime.
+	result, err = framework.SSH(`cat /proc/uptime | cut -d " " -f 1`, nodeIP, framework.TestContext.Provider)
+	if err != nil {
+		return time.Time{}, time.Time{}, fmt.Errorf("failed to run ssh command to get node boot time: %v", err)
+	}
+	if result.Code != 0 {
+		return time.Time{}, time.Time{}, fmt.Errorf("failed to run ssh command with error code: %d, stdout: %q, stderr: %q",
+			result.Code, result.Stdout, result.Stderr)
+	}
+	uptime, err := strconv.ParseFloat(strings.TrimSpace(result.Stdout), 64)
+	if err != nil {
+		return time.Time{}, time.Time{}, fmt.Errorf("failed to parse node uptime %q: %v", result.Stdout, err)
+	}
+
+	// Get node boot time. NOTE that because we get node current time before uptime, the boot time
+	// calculated will be a little earlier than the real boot time. This won't affect the correctness
+	// of the test result.
+	bootTime := nodeTime.Add(-time.Duration(uptime * float64(time.Second)))
+	return nodeTime, bootTime, nil
+}
+
 // verifyEvents verifies there are num specific events generated
 func verifyEvents(e coreclientset.EventInterface, options metav1.ListOptions, num int, reason, message string) error {
 	events, err := e.List(options)