Fix NPD e2e test on Ubuntu node and update NPD container version

This commit is contained in:
Zhen Wang 2019-03-06 13:12:18 -08:00
parent a8492d74ec
commit f4d9e7d992
5 changed files with 87 additions and 31 deletions

View File

@ -1,6 +1,7 @@
# Maintainers # Maintainers
Random-Liu <lantaol@google.com> Random-Liu <lantaol@google.com>
wangzhen127 <zhenw@google.com>
[![Analytics](https://kubernetes-site.appspot.com/UA-36037335-10/GitHub/cluster/addons/node-problem-detector/MAINTAINERS.md?pixel)]() [![Analytics](https://kubernetes-site.appspot.com/UA-36037335-10/GitHub/cluster/addons/node-problem-detector/MAINTAINERS.md?pixel)]()

View File

@ -0,0 +1,8 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- Random-Liu
- wangzhen127
reviewers:
- Random-Liu
- wangzhen127

View File

@ -26,28 +26,28 @@ subjects:
apiVersion: apps/v1 apiVersion: apps/v1
kind: DaemonSet kind: DaemonSet
metadata: metadata:
name: npd-v0.4.1 name: npd-v0.6.2
namespace: kube-system namespace: kube-system
labels: labels:
k8s-app: node-problem-detector k8s-app: node-problem-detector
version: v0.4.1 version: v0.6.2
kubernetes.io/cluster-service: "true" kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile addonmanager.kubernetes.io/mode: Reconcile
spec: spec:
selector: selector:
matchLabels: matchLabels:
k8s-app: node-problem-detector k8s-app: node-problem-detector
version: v0.4.1 version: v0.6.2
template: template:
metadata: metadata:
labels: labels:
k8s-app: node-problem-detector k8s-app: node-problem-detector
version: v0.4.1 version: v0.6.2
kubernetes.io/cluster-service: "true" kubernetes.io/cluster-service: "true"
spec: spec:
containers: containers:
- name: node-problem-detector - name: node-problem-detector
image: k8s.gcr.io/node-problem-detector:v0.4.1 image: k8s.gcr.io/node-problem-detector:v0.6.2
command: command:
- "/bin/sh" - "/bin/sh"
- "-c" - "-c"

View File

@ -281,8 +281,8 @@ func HighLatencyKubeletOperations(c clientset.Interface, threshold time.Duration
return badMetrics, nil return badMetrics, nil
} }
// getStatsSummary contacts kubelet for the container information. // GetStatsSummary contacts kubelet for the container information.
func getStatsSummary(c clientset.Interface, nodeName string) (*stats.Summary, error) { func GetStatsSummary(c clientset.Interface, nodeName string) (*stats.Summary, error) {
ctx, cancel := context.WithTimeout(context.Background(), SingleCallTimeout) ctx, cancel := context.WithTimeout(context.Background(), SingleCallTimeout)
defer cancel() defer cancel()
@ -348,7 +348,7 @@ func getOneTimeResourceUsageOnNode(
return nil, fmt.Errorf("numStats needs to be > 1 and < %d", maxNumStatsToRequest) return nil, fmt.Errorf("numStats needs to be > 1 and < %d", maxNumStatsToRequest)
} }
// Get information of all containers on the node. // Get information of all containers on the node.
summary, err := getStatsSummary(c, nodeName) summary, err := GetStatsSummary(c, nodeName)
if err != nil { if err != nil {
return nil, err return nil, err
} }

View File

@ -18,6 +18,7 @@ package node
import ( import (
"fmt" "fmt"
"net"
"sort" "sort"
"strconv" "strconv"
"strings" "strings"
@ -51,13 +52,21 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
}) })
It("should run without error", func() { It("should run without error", func() {
By("Getting all nodes' SSH-able IP addresses") By("Getting all nodes and their SSH-able IP addresses")
hosts, err := framework.NodeSSHHosts(f.ClientSet) nodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
if err != nil { Expect(len(nodes.Items)).NotTo(BeZero())
framework.Failf("Error getting node hostnames: %v", err) hosts := []string{}
for _, node := range nodes.Items {
for _, addr := range node.Status.Addresses {
if addr.Type == v1.NodeExternalIP {
hosts = append(hosts, net.JoinHostPort(addr.Address, "22"))
break
}
}
} }
Expect(len(hosts)).NotTo(BeZero()) Expect(len(hosts)).To(Equal(len(nodes.Items)))
isStandaloneMode := make(map[string]bool)
cpuUsageStats := make(map[string][]float64) cpuUsageStats := make(map[string][]float64)
uptimeStats := make(map[string][]float64) uptimeStats := make(map[string][]float64)
rssStats := make(map[string][]float64) rssStats := make(map[string][]float64)
@ -69,12 +78,16 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
rssStats[host] = []float64{} rssStats[host] = []float64{}
workingSetStats[host] = []float64{} workingSetStats[host] = []float64{}
cmd := "systemctl status node-problem-detector.service"
result, err := framework.SSH(cmd, host, framework.TestContext.Provider)
isStandaloneMode[host] = (err == nil && result.Code == 0)
By(fmt.Sprintf("Check node %q has node-problem-detector process", host)) By(fmt.Sprintf("Check node %q has node-problem-detector process", host))
// Using brackets "[n]" is a trick to prevent grep command itself from // Using brackets "[n]" is a trick to prevent grep command itself from
// showing up, because string text "[n]ode-problem-detector" does not // showing up, because string text "[n]ode-problem-detector" does not
// match regular expression "[n]ode-problem-detector". // match regular expression "[n]ode-problem-detector".
psCmd := "ps aux | grep [n]ode-problem-detector" psCmd := "ps aux | grep [n]ode-problem-detector"
result, err := framework.SSH(psCmd, host, framework.TestContext.Provider) result, err = framework.SSH(psCmd, host, framework.TestContext.Provider)
framework.ExpectNoError(err) framework.ExpectNoError(err)
Expect(result.Code).To(BeZero()) Expect(result.Code).To(BeZero())
Expect(result.Stdout).To(ContainSubstring("node-problem-detector")) Expect(result.Stdout).To(ContainSubstring("node-problem-detector"))
@ -86,9 +99,11 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
Expect(result.Code).To(BeZero()) Expect(result.Code).To(BeZero())
Expect(result.Stdout).NotTo(ContainSubstring("node-problem-detector.service: Failed")) Expect(result.Stdout).NotTo(ContainSubstring("node-problem-detector.service: Failed"))
cpuUsage, uptime := getCpuStat(f, host) if isStandaloneMode[host] {
cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage) cpuUsage, uptime := getCpuStat(f, host)
uptimeStats[host] = append(uptimeStats[host], uptime) cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
uptimeStats[host] = append(uptimeStats[host], uptime)
}
By(fmt.Sprintf("Inject log to trigger AUFSUmountHung on node %q", host)) By(fmt.Sprintf("Inject log to trigger AUFSUmountHung on node %q", host))
log := "INFO: task umount.aufs:21568 blocked for more than 120 seconds." log := "INFO: task umount.aufs:21568 blocked for more than 120 seconds."
@ -99,8 +114,6 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
} }
By("Check node-problem-detector can post conditions and events to API server") By("Check node-problem-detector can post conditions and events to API server")
nodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
Expect(len(nodes.Items)).To(Equal(len(hosts)))
for _, node := range nodes.Items { for _, node := range nodes.Items {
By(fmt.Sprintf("Check node-problem-detector posted KernelDeadlock condition on node %q", node.Name)) By(fmt.Sprintf("Check node-problem-detector posted KernelDeadlock condition on node %q", node.Name))
Eventually(func() error { Eventually(func() error {
@ -117,14 +130,21 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
By("Gather node-problem-detector cpu and memory stats") By("Gather node-problem-detector cpu and memory stats")
numIterations := 60 numIterations := 60
for i := 1; i <= numIterations; i++ { for i := 1; i <= numIterations; i++ {
for _, host := range hosts { for j, host := range hosts {
rss, workingSet := getMemoryStat(f, host) if isStandaloneMode[host] {
rssStats[host] = append(rssStats[host], rss) rss, workingSet := getMemoryStat(f, host)
workingSetStats[host] = append(workingSetStats[host], workingSet) rssStats[host] = append(rssStats[host], rss)
if i == numIterations { workingSetStats[host] = append(workingSetStats[host], workingSet)
cpuUsage, uptime := getCpuStat(f, host) if i == numIterations {
cpuUsage, uptime := getCpuStat(f, host)
cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
uptimeStats[host] = append(uptimeStats[host], uptime)
}
} else {
cpuUsage, rss, workingSet := getNpdPodStat(f, nodes.Items[j].Name)
cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage) cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
uptimeStats[host] = append(uptimeStats[host], uptime) rssStats[host] = append(rssStats[host], rss)
workingSetStats[host] = append(workingSetStats[host], workingSet)
} }
} }
time.Sleep(time.Second) time.Sleep(time.Second)
@ -134,16 +154,24 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
rssStatsMsg := "RSS (MB):" rssStatsMsg := "RSS (MB):"
workingSetStatsMsg := "WorkingSet (MB):" workingSetStatsMsg := "WorkingSet (MB):"
for i, host := range hosts { for i, host := range hosts {
cpuUsage := cpuUsageStats[host][1] - cpuUsageStats[host][0] if isStandaloneMode[host] {
totaltime := uptimeStats[host][1] - uptimeStats[host][0] // When in standalone mode, NPD is running as systemd service. We
cpuStatsMsg += fmt.Sprintf(" Node%d[%.3f];", i, cpuUsage/totaltime) // calculate its cpu usage from cgroup cpuacct value differences.
cpuUsage := cpuUsageStats[host][1] - cpuUsageStats[host][0]
totaltime := uptimeStats[host][1] - uptimeStats[host][0]
cpuStatsMsg += fmt.Sprintf(" %s[%.3f];", nodes.Items[i].Name, cpuUsage/totaltime)
} else {
sort.Float64s(cpuUsageStats[host])
cpuStatsMsg += fmt.Sprintf(" %s[%.3f|%.3f|%.3f];", nodes.Items[i].Name,
cpuUsageStats[host][0], cpuUsageStats[host][len(cpuUsageStats[host])/2], cpuUsageStats[host][len(cpuUsageStats[host])-1])
}
sort.Float64s(rssStats[host]) sort.Float64s(rssStats[host])
rssStatsMsg += fmt.Sprintf(" Node%d[%.1f|%.1f|%.1f];", i, rssStatsMsg += fmt.Sprintf(" %s[%.1f|%.1f|%.1f];", nodes.Items[i].Name,
rssStats[host][0], rssStats[host][len(rssStats[host])/2], rssStats[host][len(rssStats[host])-1]) rssStats[host][0], rssStats[host][len(rssStats[host])/2], rssStats[host][len(rssStats[host])-1])
sort.Float64s(workingSetStats[host]) sort.Float64s(workingSetStats[host])
workingSetStatsMsg += fmt.Sprintf(" Node%d[%.1f|%.1f|%.1f];", i, workingSetStatsMsg += fmt.Sprintf(" %s[%.1f|%.1f|%.1f];", nodes.Items[i].Name,
workingSetStats[host][0], workingSetStats[host][len(workingSetStats[host])/2], workingSetStats[host][len(workingSetStats[host])-1]) workingSetStats[host][0], workingSetStats[host][len(workingSetStats[host])/2], workingSetStats[host][len(workingSetStats[host])-1])
} }
framework.Logf("Node-Problem-Detector CPU and Memory Stats:\n\t%s\n\t%s\n\t%s", cpuStatsMsg, rssStatsMsg, workingSetStatsMsg) framework.Logf("Node-Problem-Detector CPU and Memory Stats:\n\t%s\n\t%s\n\t%s", cpuStatsMsg, rssStatsMsg, workingSetStatsMsg)
@ -233,3 +261,22 @@ func getCpuStat(f *framework.Framework, host string) (usage, uptime float64) {
usage *= 1e-9 usage *= 1e-9
return return
} }
// getNpdPodStat queries the kubelet stats summary on the given node and
// returns the node-problem-detector pod's CPU usage (in cores), RSS and
// working set memory (in MB). Used when NPD runs as a pod rather than a
// systemd service. Fails the test if no pod named "npd*" is found in the
// summary.
func getNpdPodStat(f *framework.Framework, nodeName string) (cpuUsage, rss, workingSet float64) {
	summary, err := framework.GetStatsSummary(f.ClientSet, nodeName)
	framework.ExpectNoError(err)

	found := false
	for _, pod := range summary.Pods {
		if strings.HasPrefix(pod.PodRef.Name, "npd") {
			// UsageNanoCores is reported in nanocores; convert to cores.
			cpuUsage = float64(*pod.CPU.UsageNanoCores) * 1e-9
			// Memory stats are reported in bytes; convert to MB.
			rss = float64(*pod.Memory.RSSBytes) / 1024 / 1024
			workingSet = float64(*pod.Memory.WorkingSetBytes) / 1024 / 1024
			found = true
			break
		}
	}
	Expect(found).To(BeTrue())
	return
}