diff --git a/cluster/addons/node-problem-detector/MAINTAINERS.md b/cluster/addons/node-problem-detector/MAINTAINERS.md index 010c9f69d5a..9e21ef6f77c 100644 --- a/cluster/addons/node-problem-detector/MAINTAINERS.md +++ b/cluster/addons/node-problem-detector/MAINTAINERS.md @@ -1,6 +1,7 @@ # Maintainers Random-Liu +wangzhen127 [![Analytics](https://kubernetes-site.appspot.com/UA-36037335-10/GitHub/cluster/addons/node-problem-detector/MAINTAINERS.md?pixel)]() diff --git a/cluster/addons/node-problem-detector/OWNERS b/cluster/addons/node-problem-detector/OWNERS new file mode 100644 index 00000000000..a8b9d5b26f9 --- /dev/null +++ b/cluster/addons/node-problem-detector/OWNERS @@ -0,0 +1,8 @@ +# See the OWNERS docs at https://go.k8s.io/owners + +approvers: +- Random-Liu +- wangzhen127 +reviewers: +- Random-Liu +- wangzhen127 diff --git a/cluster/addons/node-problem-detector/npd.yaml b/cluster/addons/node-problem-detector/npd.yaml index ee598636be6..31d8b8c351a 100644 --- a/cluster/addons/node-problem-detector/npd.yaml +++ b/cluster/addons/node-problem-detector/npd.yaml @@ -26,28 +26,28 @@ subjects: apiVersion: apps/v1 kind: DaemonSet metadata: - name: npd-v0.4.1 + name: npd-v0.6.2 namespace: kube-system labels: k8s-app: node-problem-detector - version: v0.4.1 + version: v0.6.2 kubernetes.io/cluster-service: "true" addonmanager.kubernetes.io/mode: Reconcile spec: selector: matchLabels: k8s-app: node-problem-detector - version: v0.4.1 + version: v0.6.2 template: metadata: labels: k8s-app: node-problem-detector - version: v0.4.1 + version: v0.6.2 kubernetes.io/cluster-service: "true" spec: containers: - name: node-problem-detector - image: k8s.gcr.io/node-problem-detector:v0.4.1 + image: k8s.gcr.io/node-problem-detector:v0.6.2 command: - "/bin/sh" - "-c" diff --git a/test/e2e/framework/kubelet_stats.go b/test/e2e/framework/kubelet_stats.go index 9b8f4c9cb72..7cbdb2e4edb 100644 --- a/test/e2e/framework/kubelet_stats.go +++ b/test/e2e/framework/kubelet_stats.go @@ -281,8 
+281,8 @@ func HighLatencyKubeletOperations(c clientset.Interface, threshold time.Duration return badMetrics, nil } -// getStatsSummary contacts kubelet for the container information. -func getStatsSummary(c clientset.Interface, nodeName string) (*stats.Summary, error) { +// GetStatsSummary contacts kubelet for the container information. +func GetStatsSummary(c clientset.Interface, nodeName string) (*stats.Summary, error) { ctx, cancel := context.WithTimeout(context.Background(), SingleCallTimeout) defer cancel() @@ -348,7 +348,7 @@ func getOneTimeResourceUsageOnNode( return nil, fmt.Errorf("numStats needs to be > 1 and < %d", maxNumStatsToRequest) } // Get information of all containers on the node. - summary, err := getStatsSummary(c, nodeName) + summary, err := GetStatsSummary(c, nodeName) if err != nil { return nil, err } diff --git a/test/e2e/node/node_problem_detector.go b/test/e2e/node/node_problem_detector.go index b4d2434c098..284aeaf5004 100644 --- a/test/e2e/node/node_problem_detector.go +++ b/test/e2e/node/node_problem_detector.go @@ -18,6 +18,7 @@ package node import ( "fmt" + "net" "sort" "strconv" "strings" @@ -51,13 +52,21 @@ var _ = SIGDescribe("NodeProblemDetector", func() { }) It("should run without error", func() { - By("Getting all nodes' SSH-able IP addresses") - hosts, err := framework.NodeSSHHosts(f.ClientSet) - if err != nil { - framework.Failf("Error getting node hostnames: %v", err) + By("Getting all nodes and their SSH-able IP addresses") + nodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet) + Expect(len(nodes.Items)).NotTo(BeZero()) + hosts := []string{} + for _, node := range nodes.Items { + for _, addr := range node.Status.Addresses { + if addr.Type == v1.NodeExternalIP { + hosts = append(hosts, net.JoinHostPort(addr.Address, "22")) + break + } + } } - Expect(len(hosts)).NotTo(BeZero()) + Expect(len(hosts)).To(Equal(len(nodes.Items))) + isStandaloneMode := make(map[string]bool) cpuUsageStats := make(map[string][]float64) 
uptimeStats := make(map[string][]float64) rssStats := make(map[string][]float64) @@ -69,12 +78,16 @@ var _ = SIGDescribe("NodeProblemDetector", func() { rssStats[host] = []float64{} workingSetStats[host] = []float64{} + cmd := "systemctl status node-problem-detector.service" + result, err := framework.SSH(cmd, host, framework.TestContext.Provider) + isStandaloneMode[host] = (err == nil && result.Code == 0) + By(fmt.Sprintf("Check node %q has node-problem-detector process", host)) // Using brackets "[n]" is a trick to prevent grep command itself from // showing up, because string text "[n]ode-problem-detector" does not // match regular expression "[n]ode-problem-detector". psCmd := "ps aux | grep [n]ode-problem-detector" - result, err := framework.SSH(psCmd, host, framework.TestContext.Provider) + result, err = framework.SSH(psCmd, host, framework.TestContext.Provider) framework.ExpectNoError(err) Expect(result.Code).To(BeZero()) Expect(result.Stdout).To(ContainSubstring("node-problem-detector")) @@ -86,9 +99,11 @@ var _ = SIGDescribe("NodeProblemDetector", func() { Expect(result.Code).To(BeZero()) Expect(result.Stdout).NotTo(ContainSubstring("node-problem-detector.service: Failed")) - cpuUsage, uptime := getCpuStat(f, host) - cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage) - uptimeStats[host] = append(uptimeStats[host], uptime) + if isStandaloneMode[host] { + cpuUsage, uptime := getCpuStat(f, host) + cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage) + uptimeStats[host] = append(uptimeStats[host], uptime) + } By(fmt.Sprintf("Inject log to trigger AUFSUmountHung on node %q", host)) log := "INFO: task umount.aufs:21568 blocked for more than 120 seconds." 
@@ -99,8 +114,6 @@ var _ = SIGDescribe("NodeProblemDetector", func() { } By("Check node-problem-detector can post conditions and events to API server") - nodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet) - Expect(len(nodes.Items)).To(Equal(len(hosts))) for _, node := range nodes.Items { By(fmt.Sprintf("Check node-problem-detector posted KernelDeadlock condition on node %q", node.Name)) Eventually(func() error { @@ -117,14 +130,21 @@ var _ = SIGDescribe("NodeProblemDetector", func() { By("Gather node-problem-detector cpu and memory stats") numIterations := 60 for i := 1; i <= numIterations; i++ { - for _, host := range hosts { - rss, workingSet := getMemoryStat(f, host) - rssStats[host] = append(rssStats[host], rss) - workingSetStats[host] = append(workingSetStats[host], workingSet) - if i == numIterations { - cpuUsage, uptime := getCpuStat(f, host) + for j, host := range hosts { + if isStandaloneMode[host] { + rss, workingSet := getMemoryStat(f, host) + rssStats[host] = append(rssStats[host], rss) + workingSetStats[host] = append(workingSetStats[host], workingSet) + if i == numIterations { + cpuUsage, uptime := getCpuStat(f, host) + cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage) + uptimeStats[host] = append(uptimeStats[host], uptime) + } + } else { + cpuUsage, rss, workingSet := getNpdPodStat(f, nodes.Items[j].Name) cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage) - uptimeStats[host] = append(uptimeStats[host], uptime) + rssStats[host] = append(rssStats[host], rss) + workingSetStats[host] = append(workingSetStats[host], workingSet) } } time.Sleep(time.Second) @@ -134,16 +154,24 @@ var _ = SIGDescribe("NodeProblemDetector", func() { rssStatsMsg := "RSS (MB):" workingSetStatsMsg := "WorkingSet (MB):" for i, host := range hosts { - cpuUsage := cpuUsageStats[host][1] - cpuUsageStats[host][0] - totaltime := uptimeStats[host][1] - uptimeStats[host][0] - cpuStatsMsg += fmt.Sprintf(" Node%d[%.3f];", i, cpuUsage/totaltime) + if 
isStandaloneMode[host] { + // When in standalone mode, NPD is running as systemd service. We + // calculate its cpu usage from cgroup cpuacct value differences. + cpuUsage := cpuUsageStats[host][1] - cpuUsageStats[host][0] + totaltime := uptimeStats[host][1] - uptimeStats[host][0] + cpuStatsMsg += fmt.Sprintf(" %s[%.3f];", nodes.Items[i].Name, cpuUsage/totaltime) + } else { + sort.Float64s(cpuUsageStats[host]) + cpuStatsMsg += fmt.Sprintf(" %s[%.3f|%.3f|%.3f];", nodes.Items[i].Name, + cpuUsageStats[host][0], cpuUsageStats[host][len(cpuUsageStats[host])/2], cpuUsageStats[host][len(cpuUsageStats[host])-1]) + } sort.Float64s(rssStats[host]) - rssStatsMsg += fmt.Sprintf(" Node%d[%.1f|%.1f|%.1f];", i, + rssStatsMsg += fmt.Sprintf(" %s[%.1f|%.1f|%.1f];", nodes.Items[i].Name, rssStats[host][0], rssStats[host][len(rssStats[host])/2], rssStats[host][len(rssStats[host])-1]) sort.Float64s(workingSetStats[host]) - workingSetStatsMsg += fmt.Sprintf(" Node%d[%.1f|%.1f|%.1f];", i, + workingSetStatsMsg += fmt.Sprintf(" %s[%.1f|%.1f|%.1f];", nodes.Items[i].Name, workingSetStats[host][0], workingSetStats[host][len(workingSetStats[host])/2], workingSetStats[host][len(workingSetStats[host])-1]) } framework.Logf("Node-Problem-Detector CPU and Memory Stats:\n\t%s\n\t%s\n\t%s", cpuStatsMsg, rssStatsMsg, workingSetStatsMsg)
@@ -233,3 +261,37 @@ func getCpuStat(f *framework.Framework, host string) (usage, uptime float64) {
 	usage *= 1e-9
 	return
 }
+
+// getNpdPodStat reports the resource usage of the node-problem-detector pod on
+// node nodeName, read from the kubelet stats summary: CPU usage in cores, and
+// RSS and working set in megabytes (1024*1024 bytes). The first pod whose name
+// starts with "npd" is used. The test fails if the summary cannot be fetched,
+// if no such pod is listed, or if that pod's CPU or memory stats are missing.
+func getNpdPodStat(f *framework.Framework, nodeName string) (cpuUsage, rss, workingSet float64) {
+	summary, err := framework.GetStatsSummary(f.ClientSet, nodeName)
+	framework.ExpectNoError(err)
+
+	hasNpdPod := false
+	for _, pod := range summary.Pods {
+		if !strings.HasPrefix(pod.PodRef.Name, "npd") {
+			continue
+		}
+		// The stats are read through pointers that may be nil when no value
+		// was reported; fail with a readable message instead of panicking on
+		// a nil dereference.
+		if pod.CPU == nil || pod.CPU.UsageNanoCores == nil {
+			framework.Failf("Pod %q on node %q has no CPU usage stats", pod.PodRef.Name, nodeName)
+		}
+		if pod.Memory == nil || pod.Memory.RSSBytes == nil || pod.Memory.WorkingSetBytes == nil {
+			framework.Failf("Pod %q on node %q has no memory stats", pod.PodRef.Name, nodeName)
+		}
+		// Scale nanocores to cores and bytes to megabytes.
+		cpuUsage = float64(*pod.CPU.UsageNanoCores) * 1e-9
+		rss = float64(*pod.Memory.RSSBytes) / 1024 / 1024
+		workingSet = float64(*pod.Memory.WorkingSetBytes) / 1024 / 1024
+		hasNpdPod = true
+		break
+	}
+	Expect(hasNpdPod).To(BeTrue())
+	return cpuUsage, rss, workingSet
+}