Fix NPD e2e test on Ubuntu node and update NPD container version

This commit is contained in:
Zhen Wang 2019-03-06 13:12:18 -08:00
parent a8492d74ec
commit f4d9e7d992
5 changed files with 87 additions and 31 deletions

View File

@ -1,6 +1,7 @@
# Maintainers # Maintainers
Random-Liu <lantaol@google.com> Random-Liu <lantaol@google.com>
wangzhen127 <zhenw@google.com>
[![Analytics](https://kubernetes-site.appspot.com/UA-36037335-10/GitHub/cluster/addons/node-problem-detector/MAINTAINERS.md?pixel)]() [![Analytics](https://kubernetes-site.appspot.com/UA-36037335-10/GitHub/cluster/addons/node-problem-detector/MAINTAINERS.md?pixel)]()

View File

@ -0,0 +1,8 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- Random-Liu
- wangzhen127
reviewers:
- Random-Liu
- wangzhen127

View File

@ -26,28 +26,28 @@ subjects:
apiVersion: apps/v1 apiVersion: apps/v1
kind: DaemonSet kind: DaemonSet
metadata: metadata:
name: npd-v0.4.1 name: npd-v0.6.2
namespace: kube-system namespace: kube-system
labels: labels:
k8s-app: node-problem-detector k8s-app: node-problem-detector
version: v0.4.1 version: v0.6.2
kubernetes.io/cluster-service: "true" kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile addonmanager.kubernetes.io/mode: Reconcile
spec: spec:
selector: selector:
matchLabels: matchLabels:
k8s-app: node-problem-detector k8s-app: node-problem-detector
version: v0.4.1 version: v0.6.2
template: template:
metadata: metadata:
labels: labels:
k8s-app: node-problem-detector k8s-app: node-problem-detector
version: v0.4.1 version: v0.6.2
kubernetes.io/cluster-service: "true" kubernetes.io/cluster-service: "true"
spec: spec:
containers: containers:
- name: node-problem-detector - name: node-problem-detector
image: k8s.gcr.io/node-problem-detector:v0.4.1 image: k8s.gcr.io/node-problem-detector:v0.6.2
command: command:
- "/bin/sh" - "/bin/sh"
- "-c" - "-c"

View File

@ -281,8 +281,8 @@ func HighLatencyKubeletOperations(c clientset.Interface, threshold time.Duration
return badMetrics, nil return badMetrics, nil
} }
// getStatsSummary contacts kubelet for the container information. // GetStatsSummary contacts kubelet for the container information.
func getStatsSummary(c clientset.Interface, nodeName string) (*stats.Summary, error) { func GetStatsSummary(c clientset.Interface, nodeName string) (*stats.Summary, error) {
ctx, cancel := context.WithTimeout(context.Background(), SingleCallTimeout) ctx, cancel := context.WithTimeout(context.Background(), SingleCallTimeout)
defer cancel() defer cancel()
@ -348,7 +348,7 @@ func getOneTimeResourceUsageOnNode(
return nil, fmt.Errorf("numStats needs to be > 1 and < %d", maxNumStatsToRequest) return nil, fmt.Errorf("numStats needs to be > 1 and < %d", maxNumStatsToRequest)
} }
// Get information of all containers on the node. // Get information of all containers on the node.
summary, err := getStatsSummary(c, nodeName) summary, err := GetStatsSummary(c, nodeName)
if err != nil { if err != nil {
return nil, err return nil, err
} }

View File

@ -18,6 +18,7 @@ package node
import ( import (
"fmt" "fmt"
"net"
"sort" "sort"
"strconv" "strconv"
"strings" "strings"
@ -51,13 +52,21 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
}) })
It("should run without error", func() { It("should run without error", func() {
By("Getting all nodes' SSH-able IP addresses") By("Getting all nodes and their SSH-able IP addresses")
hosts, err := framework.NodeSSHHosts(f.ClientSet) nodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
if err != nil { Expect(len(nodes.Items)).NotTo(BeZero())
framework.Failf("Error getting node hostnames: %v", err) hosts := []string{}
for _, node := range nodes.Items {
for _, addr := range node.Status.Addresses {
if addr.Type == v1.NodeExternalIP {
hosts = append(hosts, net.JoinHostPort(addr.Address, "22"))
break
}
}
} }
Expect(len(hosts)).NotTo(BeZero()) Expect(len(hosts)).To(Equal(len(nodes.Items)))
isStandaloneMode := make(map[string]bool)
cpuUsageStats := make(map[string][]float64) cpuUsageStats := make(map[string][]float64)
uptimeStats := make(map[string][]float64) uptimeStats := make(map[string][]float64)
rssStats := make(map[string][]float64) rssStats := make(map[string][]float64)
@ -69,12 +78,16 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
rssStats[host] = []float64{} rssStats[host] = []float64{}
workingSetStats[host] = []float64{} workingSetStats[host] = []float64{}
cmd := "systemctl status node-problem-detector.service"
result, err := framework.SSH(cmd, host, framework.TestContext.Provider)
isStandaloneMode[host] = (err == nil && result.Code == 0)
By(fmt.Sprintf("Check node %q has node-problem-detector process", host)) By(fmt.Sprintf("Check node %q has node-problem-detector process", host))
// Using brackets "[n]" is a trick to prevent grep command itself from // Using brackets "[n]" is a trick to prevent grep command itself from
// showing up, because string text "[n]ode-problem-detector" does not // showing up, because string text "[n]ode-problem-detector" does not
// match regular expression "[n]ode-problem-detector". // match regular expression "[n]ode-problem-detector".
psCmd := "ps aux | grep [n]ode-problem-detector" psCmd := "ps aux | grep [n]ode-problem-detector"
result, err := framework.SSH(psCmd, host, framework.TestContext.Provider) result, err = framework.SSH(psCmd, host, framework.TestContext.Provider)
framework.ExpectNoError(err) framework.ExpectNoError(err)
Expect(result.Code).To(BeZero()) Expect(result.Code).To(BeZero())
Expect(result.Stdout).To(ContainSubstring("node-problem-detector")) Expect(result.Stdout).To(ContainSubstring("node-problem-detector"))
@ -86,9 +99,11 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
Expect(result.Code).To(BeZero()) Expect(result.Code).To(BeZero())
Expect(result.Stdout).NotTo(ContainSubstring("node-problem-detector.service: Failed")) Expect(result.Stdout).NotTo(ContainSubstring("node-problem-detector.service: Failed"))
cpuUsage, uptime := getCpuStat(f, host) if isStandaloneMode[host] {
cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage) cpuUsage, uptime := getCpuStat(f, host)
uptimeStats[host] = append(uptimeStats[host], uptime) cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
uptimeStats[host] = append(uptimeStats[host], uptime)
}
By(fmt.Sprintf("Inject log to trigger AUFSUmountHung on node %q", host)) By(fmt.Sprintf("Inject log to trigger AUFSUmountHung on node %q", host))
log := "INFO: task umount.aufs:21568 blocked for more than 120 seconds." log := "INFO: task umount.aufs:21568 blocked for more than 120 seconds."
@ -99,8 +114,6 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
} }
By("Check node-problem-detector can post conditions and events to API server") By("Check node-problem-detector can post conditions and events to API server")
nodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
Expect(len(nodes.Items)).To(Equal(len(hosts)))
for _, node := range nodes.Items { for _, node := range nodes.Items {
By(fmt.Sprintf("Check node-problem-detector posted KernelDeadlock condition on node %q", node.Name)) By(fmt.Sprintf("Check node-problem-detector posted KernelDeadlock condition on node %q", node.Name))
Eventually(func() error { Eventually(func() error {
@ -117,14 +130,21 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
By("Gather node-problem-detector cpu and memory stats") By("Gather node-problem-detector cpu and memory stats")
numIterations := 60 numIterations := 60
for i := 1; i <= numIterations; i++ { for i := 1; i <= numIterations; i++ {
for _, host := range hosts { for j, host := range hosts {
rss, workingSet := getMemoryStat(f, host) if isStandaloneMode[host] {
rssStats[host] = append(rssStats[host], rss) rss, workingSet := getMemoryStat(f, host)
workingSetStats[host] = append(workingSetStats[host], workingSet) rssStats[host] = append(rssStats[host], rss)
if i == numIterations { workingSetStats[host] = append(workingSetStats[host], workingSet)
cpuUsage, uptime := getCpuStat(f, host) if i == numIterations {
cpuUsage, uptime := getCpuStat(f, host)
cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
uptimeStats[host] = append(uptimeStats[host], uptime)
}
} else {
cpuUsage, rss, workingSet := getNpdPodStat(f, nodes.Items[j].Name)
cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage) cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
uptimeStats[host] = append(uptimeStats[host], uptime) rssStats[host] = append(rssStats[host], rss)
workingSetStats[host] = append(workingSetStats[host], workingSet)
} }
} }
time.Sleep(time.Second) time.Sleep(time.Second)
@ -134,16 +154,24 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
rssStatsMsg := "RSS (MB):" rssStatsMsg := "RSS (MB):"
workingSetStatsMsg := "WorkingSet (MB):" workingSetStatsMsg := "WorkingSet (MB):"
for i, host := range hosts { for i, host := range hosts {
cpuUsage := cpuUsageStats[host][1] - cpuUsageStats[host][0] if isStandaloneMode[host] {
totaltime := uptimeStats[host][1] - uptimeStats[host][0] // When in standalone mode, NPD is running as systemd service. We
cpuStatsMsg += fmt.Sprintf(" Node%d[%.3f];", i, cpuUsage/totaltime) // calculate its cpu usage from cgroup cpuacct value differences.
cpuUsage := cpuUsageStats[host][1] - cpuUsageStats[host][0]
totaltime := uptimeStats[host][1] - uptimeStats[host][0]
cpuStatsMsg += fmt.Sprintf(" %s[%.3f];", nodes.Items[i].Name, cpuUsage/totaltime)
} else {
sort.Float64s(cpuUsageStats[host])
cpuStatsMsg += fmt.Sprintf(" %s[%.3f|%.3f|%.3f];", nodes.Items[i].Name,
cpuUsageStats[host][0], cpuUsageStats[host][len(cpuUsageStats[host])/2], cpuUsageStats[host][len(cpuUsageStats[host])-1])
}
sort.Float64s(rssStats[host]) sort.Float64s(rssStats[host])
rssStatsMsg += fmt.Sprintf(" Node%d[%.1f|%.1f|%.1f];", i, rssStatsMsg += fmt.Sprintf(" %s[%.1f|%.1f|%.1f];", nodes.Items[i].Name,
rssStats[host][0], rssStats[host][len(rssStats[host])/2], rssStats[host][len(rssStats[host])-1]) rssStats[host][0], rssStats[host][len(rssStats[host])/2], rssStats[host][len(rssStats[host])-1])
sort.Float64s(workingSetStats[host]) sort.Float64s(workingSetStats[host])
workingSetStatsMsg += fmt.Sprintf(" Node%d[%.1f|%.1f|%.1f];", i, workingSetStatsMsg += fmt.Sprintf(" %s[%.1f|%.1f|%.1f];", nodes.Items[i].Name,
workingSetStats[host][0], workingSetStats[host][len(workingSetStats[host])/2], workingSetStats[host][len(workingSetStats[host])-1]) workingSetStats[host][0], workingSetStats[host][len(workingSetStats[host])/2], workingSetStats[host][len(workingSetStats[host])-1])
} }
framework.Logf("Node-Problem-Detector CPU and Memory Stats:\n\t%s\n\t%s\n\t%s", cpuStatsMsg, rssStatsMsg, workingSetStatsMsg) framework.Logf("Node-Problem-Detector CPU and Memory Stats:\n\t%s\n\t%s\n\t%s", cpuStatsMsg, rssStatsMsg, workingSetStatsMsg)
@ -233,3 +261,22 @@ func getCpuStat(f *framework.Framework, host string) (usage, uptime float64) {
usage *= 1e-9 usage *= 1e-9
return return
} }
// getNpdPodStat queries the kubelet stats summary on the given node and
// returns the node-problem-detector pod's CPU usage (in cores), RSS and
// working set memory (in MB). Used when NPD runs as a pod rather than a
// systemd service. Fails the test if no pod named "npd*" is found in the
// summary.
func getNpdPodStat(f *framework.Framework, nodeName string) (cpuUsage, rss, workingSet float64) {
	summary, err := framework.GetStatsSummary(f.ClientSet, nodeName)
	framework.ExpectNoError(err)

	found := false
	for _, pod := range summary.Pods {
		if strings.HasPrefix(pod.PodRef.Name, "npd") {
			// UsageNanoCores is reported in nanocores; convert to cores.
			cpuUsage = float64(*pod.CPU.UsageNanoCores) * 1e-9
			// Memory stats are reported in bytes; convert to MB.
			rss = float64(*pod.Memory.RSSBytes) / 1024 / 1024
			workingSet = float64(*pod.Memory.WorkingSetBytes) / 1024 / 1024
			found = true
			break
		}
	}
	Expect(found).To(BeTrue())
	return
}