From d874dbfcb1529077e2889f54279b559e09a9790d Mon Sep 17 00:00:00 2001 From: Zhen Wang Date: Mon, 29 Jul 2019 14:22:41 -0700 Subject: [PATCH] Bump NPD version to v0.7 for GCI --- cluster/gce/gci/configure-helper.sh | 10 +++++++--- cluster/gce/gci/configure.sh | 4 ++-- test/e2e/node/node_problem_detector.go | 21 +++++++++++++++++++++ 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/cluster/gce/gci/configure-helper.sh b/cluster/gce/gci/configure-helper.sh index b3ff4e86fb0..b64da4b6255 100644 --- a/cluster/gce/gci/configure-helper.sh +++ b/cluster/gce/gci/configure-helper.sh @@ -1396,13 +1396,17 @@ function start-node-problem-detector { local -r km_config="${KUBE_HOME}/node-problem-detector/config/kernel-monitor.json" # TODO(random-liu): Handle this for alternative container runtime. local -r dm_config="${KUBE_HOME}/node-problem-detector/config/docker-monitor.json" + local -r sm_config="${KUBE_HOME}/node-problem-detector/config/systemd-monitor.json" + local -r ssm_config="${KUBE_HOME}/node-problem-detector/config/system-stats-monitor.json" + local -r custom_km_config="${KUBE_HOME}/node-problem-detector/config/kernel-monitor-counter.json" - local -r custom_dm_config="${KUBE_HOME}/node-problem-detector/config/docker-monitor-counter.json" local -r custom_sm_config="${KUBE_HOME}/node-problem-detector/config/systemd-monitor-counter.json" + flags="${NPD_TEST_LOG_LEVEL:-"--v=2"} ${NPD_TEST_ARGS:-}" flags+=" --logtostderr" - flags+=" --system-log-monitors=${km_config},${dm_config}" - flags+=" --custom-plugin-monitors=${custom_km_config},${custom_dm_config},${custom_sm_config}" + flags+=" --config.system-log-monitor=${km_config},${dm_config},${sm_config}" + flags+=" --config.system-stats-monitor=${ssm_config}" + flags+=" --config.custom-plugin-monitor=${custom_km_config},${custom_sm_config}" local -r npd_port=${NODE_PROBLEM_DETECTOR_PORT:-20256} flags+=" --port=${npd_port}" if [[ -n "${EXTRA_NPD_ARGS:-}" ]]; then diff --git a/cluster/gce/gci/configure.sh 
b/cluster/gce/gci/configure.sh index ad5d6d6eae0..77b558e6fdd 100644 --- a/cluster/gce/gci/configure.sh +++ b/cluster/gce/gci/configure.sh @@ -26,8 +26,8 @@ set -o pipefail ### Hardcoded constants DEFAULT_CNI_VERSION="v0.7.5" DEFAULT_CNI_SHA1="52e9d2de8a5f927307d9397308735658ee44ab8d" -DEFAULT_NPD_VERSION="v0.6.3" -DEFAULT_NPD_SHA1="3a6ac56be6c121f1b94450bfd1a81ad28d532369" +DEFAULT_NPD_VERSION="v0.7.1" +DEFAULT_NPD_SHA1="a9cae965973d586bf5206ad4fe5aae07e6bfd154" DEFAULT_CRICTL_VERSION="v1.14.0" DEFAULT_CRICTL_SHA1="1f93c6183d0a4e186708efe7899da7a7bce9c736" DEFAULT_MOUNTER_TAR_SHA="8003b798cf33c7f91320cd6ee5cec4fa22244571" diff --git a/test/e2e/node/node_problem_detector.go b/test/e2e/node/node_problem_detector.go index 871018b56e6..11c26a2b79c 100644 --- a/test/e2e/node/node_problem_detector.go +++ b/test/e2e/node/node_problem_detector.go @@ -131,6 +131,14 @@ var _ = SIGDescribe("NodeProblemDetector [DisabledForLargeClusters]", func() { gomega.Eventually(func() error { return verifyEvents(f, eventListOptions, 1, "AUFSUmountHung", node.Name) }, pollTimeout, pollInterval).Should(gomega.Succeed()) + + // Node problem detector reports kubelet start events automatically starting from NPD v0.7.0. + // Since the kubelet may be restarted a few times after the node boots, we only check that the event + // is detected, not how many times the kubelet was started. 
+ ginkgo.By(fmt.Sprintf("Check node-problem-detector posted KubeletStart event on node %q", node.Name)) + gomega.Eventually(func() error { + return verifyEventExists(f, eventListOptions, "KubeletStart", node.Name) + }, pollTimeout, pollInterval).Should(gomega.Succeed()) } ginkgo.By("Gather node-problem-detector cpu and memory stats") @@ -202,6 +210,19 @@ func verifyEvents(f *framework.Framework, options metav1.ListOptions, num int, r return nil } +func verifyEventExists(f *framework.Framework, options metav1.ListOptions, reason, nodeName string) error { + events, err := f.ClientSet.CoreV1().Events(metav1.NamespaceDefault).List(options) + if err != nil { + return err + } + for _, event := range events.Items { + if event.Reason == reason && event.Source.Host == nodeName && event.Count > 0 { + return nil + } + } + return fmt.Errorf("Event %s does not exist: %v", reason, events.Items) +} + func verifyNodeCondition(f *framework.Framework, condition v1.NodeConditionType, status v1.ConditionStatus, reason, nodeName string) error { node, err := f.ClientSet.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{}) if err != nil {