Merge pull request #73288 from wangzhen127/npd-config

Decouple node-problem-detector release from kubernetes
2025-07-26 05:03:09 +00:00 · 2019-02-28 00:27:25 -08:00 · 2019-02-28 00:27:25 -08:00 · 02b8056efb
commit 02b8056efb
parent 4b1282d925 6df207bdaa
22 changed files with 352 additions and 54 deletions
--- a/cluster/gce/config-default.sh
+++ b/cluster/gce/config-default.sh
@ -296,6 +296,8 @@ else
 fi
 NODE_PROBLEM_DETECTOR_VERSION="${NODE_PROBLEM_DETECTOR_VERSION:-}"
 NODE_PROBLEM_DETECTOR_TAR_HASH="${NODE_PROBLEM_DETECTOR_TAR_HASH:-}"
 NODE_PROBLEM_DETECTOR_RELEASE_PATH="${NODE_PROBLEM_DETECTOR_RELEASE_PATH:-}"
 NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS="${NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS:-}"
 # Optional: Create autoscaler for cluster's nodes.
 ENABLE_CLUSTER_AUTOSCALER="${KUBE_ENABLE_CLUSTER_AUTOSCALER:-false}"
--- a/cluster/gce/config-test.sh
+++ b/cluster/gce/config-test.sh
@ -308,6 +308,8 @@ else
 fi
 NODE_PROBLEM_DETECTOR_VERSION="${NODE_PROBLEM_DETECTOR_VERSION:-}"
 NODE_PROBLEM_DETECTOR_TAR_HASH="${NODE_PROBLEM_DETECTOR_TAR_HASH:-}"
 NODE_PROBLEM_DETECTOR_RELEASE_PATH="${NODE_PROBLEM_DETECTOR_RELEASE_PATH:-}"
 NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS="${NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS:-}"
 # Optional: Create autoscaler for cluster's nodes.
 ENABLE_CLUSTER_AUTOSCALER="${KUBE_ENABLE_CLUSTER_AUTOSCALER:-false}"
--- a/cluster/gce/gci/configure-helper.sh
+++ b/cluster/gce/gci/configure-helper.sh
@ -1257,21 +1257,25 @@ EOF
 function start-node-problem-detector {
  echo "Start node problem detector"
  local -r npd_bin="${KUBE_HOME}/bin/node-problem-detector"
  local -r km_config="${KUBE_HOME}/node-problem-detector/config/kernel-monitor.json"
  # TODO(random-liu): Handle this for alternative container runtime.
  local -r dm_config="${KUBE_HOME}/node-problem-detector/config/docker-monitor.json"
  local -r custom_km_config="${KUBE_HOME}/node-problem-detector/config/kernel-monitor-counter.json,${KUBE_HOME}/node-problem-detector/config/systemd-monitor-counter.json,${KUBE_HOME}/node-problem-detector/config/docker-monitor-counter.json"
  echo "Using node problem detector binary at ${npd_bin}"
-  local flags="${NPD_TEST_LOG_LEVEL:-"--v=2"} ${NPD_TEST_ARGS:-}"
+
-  flags+=" --logtostderr"
+  local flags="${NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS:-}"
-  flags+=" --system-log-monitors=${km_config},${dm_config}"
+  if [[ -z "${flags}" ]]; then
-  flags+=" --custom-plugin-monitors=${custom_km_config}"
+    local -r km_config="${KUBE_HOME}/node-problem-detector/config/kernel-monitor.json"
-  flags+=" --apiserver-override=https://${KUBERNETES_MASTER_NAME}?inClusterConfig=false&auth=/var/lib/node-problem-detector/kubeconfig"
+    # TODO(random-liu): Handle this for alternative container runtime.
-  local -r npd_port=${NODE_PROBLEM_DETECTOR_PORT:-20256}
+    local -r dm_config="${KUBE_HOME}/node-problem-detector/config/docker-monitor.json"
-  flags+=" --port=${npd_port}"
+    local -r custom_km_config="${KUBE_HOME}/node-problem-detector/config/kernel-monitor-counter.json,${KUBE_HOME}/node-problem-detector/config/systemd-monitor-counter.json,${KUBE_HOME}/node-problem-detector/config/docker-monitor-counter.json"
-  if [[ -n "${EXTRA_NPD_ARGS:-}" ]]; then
+    flags="${NPD_TEST_LOG_LEVEL:-"--v=2"} ${NPD_TEST_ARGS:-}"
-    flags+=" ${EXTRA_NPD_ARGS}"
+    flags+=" --logtostderr"
    flags+=" --system-log-monitors=${km_config},${dm_config}"
    flags+=" --custom-plugin-monitors=${custom_km_config}"
    local -r npd_port=${NODE_PROBLEM_DETECTOR_PORT:-20256}
    flags+=" --port=${npd_port}"
    if [[ -n "${EXTRA_NPD_ARGS:-}" ]]; then
      flags+=" ${EXTRA_NPD_ARGS}"
    fi
  fi
  flags+=" --apiserver-override=https://${KUBERNETES_MASTER_NAME}?inClusterConfig=false&auth=/var/lib/node-problem-detector/kubeconfig"
  # Write the systemd service file for node problem detector.
  cat <<EOF >/etc/systemd/system/node-problem-detector.service
--- a/cluster/gce/gci/configure.sh
+++ b/cluster/gce/gci/configure.sh
@ -213,12 +213,12 @@ function install-node-problem-detector {
  local -r npd_tar="node-problem-detector-${npd_version}.tar.gz"
  if is-preloaded "${npd_tar}" "${npd_sha1}"; then
-    echo "node-problem-detector is preloaded."
+    echo "${npd_tar} is preloaded."
    return
  fi
-  echo "Downloading node problem detector."
+  echo "Downloading ${npd_tar}."
-  local -r npd_release_path="https://storage.googleapis.com/kubernetes-release"
+  local -r npd_release_path="${NODE_PROBLEM_DETECTOR_RELEASE_PATH:-https://storage.googleapis.com/kubernetes-release}"
  download-or-bust "${npd_sha1}" "${npd_release_path}/node-problem-detector/${npd_tar}"
  local -r npd_dir="${KUBE_HOME}/node-problem-detector"
  mkdir -p "${npd_dir}"
--- a/cluster/gce/util.sh
+++ b/cluster/gce/util.sh
@ -1077,6 +1077,8 @@ ENABLE_CLUSTER_UI: $(yaml-quote ${ENABLE_CLUSTER_UI:-false})
 ENABLE_NODE_PROBLEM_DETECTOR: $(yaml-quote ${ENABLE_NODE_PROBLEM_DETECTOR:-none})
 NODE_PROBLEM_DETECTOR_VERSION: $(yaml-quote ${NODE_PROBLEM_DETECTOR_VERSION:-})
 NODE_PROBLEM_DETECTOR_TAR_HASH: $(yaml-quote ${NODE_PROBLEM_DETECTOR_TAR_HASH:-})
 NODE_PROBLEM_DETECTOR_RELEASE_PATH: $(yaml-quote ${NODE_PROBLEM_DETECTOR_RELEASE_PATH:-})
 NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS: $(yaml-quote ${NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS:-})
 ENABLE_NODE_LOGGING: $(yaml-quote ${ENABLE_NODE_LOGGING:-false})
 LOGGING_DESTINATION: $(yaml-quote ${LOGGING_DESTINATION:-})
 ELASTICSEARCH_LOGGING_REPLICAS: $(yaml-quote ${ELASTICSEARCH_LOGGING_REPLICAS:-})
--- a/hack/make-rules/test-e2e-node.sh
+++ b/hack/make-rules/test-e2e-node.sh
@ -34,6 +34,7 @@ image_service_endpoint=${IMAGE_SERVICE_ENDPOINT:-""}
 run_until_failure=${RUN_UNTIL_FAILURE:-"false"}
 test_args=${TEST_ARGS:-""}
 system_spec_name=${SYSTEM_SPEC_NAME:-}
 extra_envs=${EXTRA_ENVS:-}
 # Parse the flags to pass to ginkgo
 ginkgoflags=""
@ -148,7 +149,7 @@ if [ ${remote} = true ] ; then
    --image-project="${image_project}" --instance-name-prefix="${instance_prefix}" \
    --delete-instances="${delete_instances}" --test_args="${test_args}" --instance-metadata="${metadata}" \
    --image-config-file="${image_config_file}" --system-spec-name="${system_spec_name}" \
-    --test-suite="${test_suite}" \
+    --extra-envs="${extra_envs}" --test-suite="${test_suite}" \
    2>&1 | tee -i "${artifacts}/build-log.txt"
  exit $?
@ -169,8 +170,8 @@ else
  # Test using the host the script was run on
  # Provided for backwards compatibility
  go run test/e2e_node/runner/local/run_local.go \
-    --system-spec-name="${system_spec_name}" --ginkgo-flags="${ginkgoflags}" \
+    --system-spec-name="${system_spec_name}" --extra-envs="${extra_envs}" \
-    --test-flags="--container-runtime=${runtime} \
+    --ginkgo-flags="${ginkgoflags}" --test-flags="--container-runtime=${runtime} \
    --alsologtostderr --v 4 --report-dir=${artifacts} --node-name $(hostname) \
    ${test_args}" --build-dependencies=true 2>&1 | tee -i "${artifacts}/build-log.txt"
  exit $?
--- a/test/e2e/framework/test_context.go
+++ b/test/e2e/framework/test_context.go
@ -193,6 +193,8 @@ type NodeTestContextType struct {
 	// the node e2e test. If empty, the default one (system.DefaultSpec) is
 	// used. The system specs are in test/e2e_node/system/specs/.
 	SystemSpecName string
 	// ExtraEnvs is a map of environment names to values.
 	ExtraEnvs map[string]string
 }
 type CloudConfig struct {
@ -332,6 +334,7 @@ func RegisterNodeFlags() {
 	flag.BoolVar(&TestContext.PrepullImages, "prepull-images", true, "If true, prepull images so image pull failures do not cause test failures.")
 	flag.StringVar(&TestContext.ImageDescription, "image-description", "", "The description of the image which the test will be running on.")
 	flag.StringVar(&TestContext.SystemSpecName, "system-spec-name", "", "The name of the system spec (e.g., gke) that's used in the node e2e test. The system specs are in test/e2e_node/system/specs/. This is used by the test framework to determine which tests to run for validating the system requirements.")
 	flag.Var(cliflag.NewMapStringString(&TestContext.ExtraEnvs), "extra-envs", "The extra environment variables needed for node e2e tests. Format: a list of key=value pairs, e.g., env1=val1,env2=val2")
 }
 // HandleFlags sets up all flags and parses the command line.
--- a/test/e2e/node/BUILD
+++ b/test/e2e/node/BUILD
@ -10,6 +10,7 @@ go_library(
        "kubelet.go",
        "kubelet_perf.go",
        "mount_propagation.go",
        "node_problem_detector.go",
        "pod_gc.go",
        "pods.go",
        "pre_stop.go",
--- a/test/e2e/node/node_problem_detector.go
+++ b/test/e2e/node/node_problem_detector.go
@ -0,0 +1,232 @@
 /*
 Copyright 2019 The Kubernetes Authors.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 package node
 import (
 	"fmt"
 	"sort"
 	"strconv"
 	"strings"
 	"time"
 	"k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/fields"
 	"k8s.io/kubernetes/test/e2e/framework"
 	testutils "k8s.io/kubernetes/test/utils"
 	. "github.com/onsi/ginkgo"
 	. "github.com/onsi/gomega"
 )
 // This test checks if node-problem-detector (NPD) runs fine without error on
 // the nodes in the cluster. NPD's functionality is tested in e2e_node tests.
 var _ = SIGDescribe("NodeProblemDetector", func() {
 	const (
 		// Polling cadence and deadline for the Eventually checks below.
 		pollInterval = 1 * time.Second
 		pollTimeout  = 1 * time.Minute
 	)
 	f := framework.NewDefaultFramework("node-problem-detector")
 	BeforeEach(func() {
 		// Every step of the test runs commands on the nodes over SSH.
 		framework.SkipUnlessSSHKeyPresent()
 		framework.SkipUnlessProviderIs(framework.ProvidersWithSSH...)
 		framework.SkipUnlessProviderIs("gce", "gke")
 		framework.SkipUnlessNodeOSDistroIs("gci", "ubuntu")
 		framework.WaitForAllNodesHealthy(f.ClientSet, time.Minute)
 	})
 	It("should run without error", func() {
 		By("Getting all nodes' SSH-able IP addresses")
 		hosts, err := framework.NodeSSHHosts(f.ClientSet)
 		if err != nil {
 			framework.Failf("Error getting node hostnames: %v", err)
 		}
 		Expect(len(hosts)).NotTo(BeZero())
 		// Per-host samples, keyed by SSH host.
 		cpuUsageStats := make(map[string][]float64)
 		uptimeStats := make(map[string][]float64)
 		rssStats := make(map[string][]float64)
 		workingSetStats := make(map[string][]float64)
 		for _, host := range hosts {
 			cpuUsageStats[host] = []float64{}
 			uptimeStats[host] = []float64{}
 			rssStats[host] = []float64{}
 			workingSetStats[host] = []float64{}
 			By(fmt.Sprintf("Check node %q has node-problem-detector process", host))
 			psCmd := "ps aux | grep node-problem-detector"
 			result, err := framework.SSH(psCmd, host, framework.TestContext.Provider)
 			framework.ExpectNoError(err)
 			Expect(result.Code).To(BeZero())
 			Expect(result.Stdout).To(ContainSubstring("/home/kubernetes/bin/node-problem-detector"))
 			By(fmt.Sprintf("Check node-problem-detector is running fine on node %q", host))
 			journalctlCmd := "sudo journalctl -u node-problem-detector"
 			result, err = framework.SSH(journalctlCmd, host, framework.TestContext.Provider)
 			framework.ExpectNoError(err)
 			Expect(result.Code).To(BeZero())
 			Expect(result.Stdout).NotTo(ContainSubstring("node-problem-detector.service: Failed"))
 			// First CPU sample; the second one is taken on the last
 			// iteration of the stats-gathering loop below.
 			cpuUsage, uptime := getCpuStat(f, host)
 			cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
 			uptimeStats[host] = append(uptimeStats[host], uptime)
 			By(fmt.Sprintf("Inject log to trigger AUFSUmountHung on node %q", host))
 			log := "INFO: task umount.aufs:21568 blocked for more than 120 seconds."
 			injectLogCmd := "sudo sh -c \"echo 'kernel: " + log + "' >> /dev/kmsg\""
 			// Keep the result of this command so that the exit-code check
 			// below validates the injection itself rather than the earlier
 			// journalctl invocation.
 			result, err = framework.SSH(injectLogCmd, host, framework.TestContext.Provider)
 			framework.ExpectNoError(err)
 			Expect(result.Code).To(BeZero())
 		}
 		By("Check node-problem-detector can post conditions and events to API server")
 		nodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
 		Expect(len(nodes.Items)).To(Equal(len(hosts)))
 		for _, node := range nodes.Items {
 			By(fmt.Sprintf("Check node-problem-detector posted KernelDeadlock condition on node %q", node.Name))
 			Eventually(func() error {
 				return verifyNodeCondition(f, "KernelDeadlock", v1.ConditionTrue, "AUFSUmountHung", node.Name)
 			}, pollTimeout, pollInterval).Should(Succeed())
 			By(fmt.Sprintf("Check node-problem-detector posted AUFSUmountHung event on node %q", node.Name))
 			eventListOptions := metav1.ListOptions{FieldSelector: fields.Set{"involvedObject.kind": "Node"}.AsSelector().String()}
 			Eventually(func() error {
 				return verifyEvents(f, eventListOptions, 1, "AUFSUmountHung", node.Name)
 			}, pollTimeout, pollInterval).Should(Succeed())
 		}
 		By("Gather node-problem-detector cpu and memory stats")
 		// One memory sample per host per iteration, one second apart.
 		numIterations := 60
 		for i := 1; i <= numIterations; i++ {
 			for _, host := range hosts {
 				rss, workingSet := getMemoryStat(f, host)
 				rssStats[host] = append(rssStats[host], rss)
 				workingSetStats[host] = append(workingSetStats[host], workingSet)
 				if i == numIterations {
 					// Second (final) CPU sample for this host.
 					cpuUsage, uptime := getCpuStat(f, host)
 					cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
 					uptimeStats[host] = append(uptimeStats[host], uptime)
 				}
 			}
 			time.Sleep(time.Second)
 		}
 		cpuStatsMsg := "CPU (core):"
 		rssStatsMsg := "RSS (MB):"
 		workingSetStatsMsg := "WorkingSet (MB):"
 		for i, host := range hosts {
 			// CPU seconds consumed between the two samples divided by the
 			// uptime elapsed between them.
 			cpuUsage := cpuUsageStats[host][1] - cpuUsageStats[host][0]
 			totaltime := uptimeStats[host][1] - uptimeStats[host][0]
 			cpuStatsMsg += fmt.Sprintf(" Node%d[%.3f];", i, cpuUsage/totaltime)
 			// After sorting, report the first, middle and last samples.
 			sort.Float64s(rssStats[host])
 			rssStatsMsg += fmt.Sprintf(" Node%d[%.1f|%.1f|%.1f];", i,
 				rssStats[host][0], rssStats[host][len(rssStats[host])/2], rssStats[host][len(rssStats[host])-1])
 			sort.Float64s(workingSetStats[host])
 			workingSetStatsMsg += fmt.Sprintf(" Node%d[%.1f|%.1f|%.1f];", i,
 				workingSetStats[host][0], workingSetStats[host][len(workingSetStats[host])/2], workingSetStats[host][len(workingSetStats[host])-1])
 		}
 		framework.Logf("Node-Problem-Detector CPU and Memory Stats:\n\t%s\n\t%s\n\t%s", cpuStatsMsg, rssStatsMsg, workingSetStatsMsg)
 	})
 })
 // verifyEvents lists events in the metav1.NamespaceDefault namespace with the
 // given options and sums the Count of every event whose Reason equals reason
 // and whose Source.Host equals nodeName. It returns an error when listing
 // fails or when that sum differs from num.
 func verifyEvents(f *framework.Framework, options metav1.ListOptions, num int, reason, nodeName string) error {
 	eventList, err := f.ClientSet.CoreV1().Events(metav1.NamespaceDefault).List(options)
 	if err != nil {
 		return err
 	}
 	total := 0
 	for i := range eventList.Items {
 		ev := &eventList.Items[i]
 		if ev.Reason == reason && ev.Source.Host == nodeName {
 			total += int(ev.Count)
 		}
 	}
 	if total == num {
 		return nil
 	}
 	return fmt.Errorf("expect event number %d, got %d: %v", num, total, eventList.Items)
 }
 // verifyNodeCondition fetches the node called nodeName and checks that its
 // status carries the given condition with the expected status and reason.
 // It returns an error when the node cannot be fetched, the condition is
 // absent, or its Status/Reason do not match.
 func verifyNodeCondition(f *framework.Framework, condition v1.NodeConditionType, status v1.ConditionStatus, reason, nodeName string) error {
 	node, err := f.ClientSet.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
 	if err != nil {
 		return err
 	}
 	_, found := testutils.GetNodeCondition(&node.Status, condition)
 	switch {
 	case found == nil:
 		return fmt.Errorf("node condition %q not found", condition)
 	case found.Status != status, found.Reason != reason:
 		return fmt.Errorf("unexpected node condition %q: %+v", condition, found)
 	}
 	return nil
 }
 // getMemoryStat reads memory.usage_in_bytes and memory.stat of the
 // node-problem-detector.service cgroup on host over SSH.
 // It returns, both in MB, the total_rss value and the working set, computed
 // as usage minus total_inactive_file and floored at zero.
 func getMemoryStat(f *framework.Framework, host string) (rss, workingSet float64) {
 	memCmd := "cat /sys/fs/cgroup/memory/system.slice/node-problem-detector.service/memory.usage_in_bytes && cat /sys/fs/cgroup/memory/system.slice/node-problem-detector.service/memory.stat"
 	result, err := framework.SSH(memCmd, host, framework.TestContext.Provider)
 	framework.ExpectNoError(err)
 	Expect(result.Code).To(BeZero())
 	// The first output line is the usage counter; the rest is memory.stat.
 	output := strings.Split(result.Stdout, "\n")
 	usageBytes, err := strconv.ParseFloat(output[0], 64)
 	Expect(err).To(BeNil())
 	var inactiveFileBytes float64
 	for _, statLine := range output[1:] {
 		fields := strings.Split(statLine, " ")
 		switch fields[0] {
 		case "total_rss":
 			rss, err = strconv.ParseFloat(fields[1], 64)
 			Expect(err).To(BeNil())
 		case "total_inactive_file":
 			inactiveFileBytes, err = strconv.ParseFloat(fields[1], 64)
 			Expect(err).To(BeNil())
 		}
 	}
 	workingSet = usageBytes - inactiveFileBytes
 	if usageBytes < inactiveFileBytes {
 		workingSet = 0
 	}
 	// Convert to MB
 	rss = rss / 1024 / 1024
 	workingSet = workingSet / 1024 / 1024
 	return
 }
 // getCpuStat reads cpuacct.usage of the node-problem-detector.service cgroup
 // and the first field of /proc/uptime on host over SSH.
 // It returns the CPU usage converted from nanoseconds to seconds and the
 // uptime value as printed. The test fails if the command output does not
 // contain two parseable numbers.
 func getCpuStat(f *framework.Framework, host string) (usage, uptime float64) {
 	cpuCmd := "cat /sys/fs/cgroup/cpu/system.slice/node-problem-detector.service/cpuacct.usage && cat /proc/uptime | awk '{print $1}'"
 	result, err := framework.SSH(cpuCmd, host, framework.TestContext.Provider)
 	framework.ExpectNoError(err)
 	Expect(result.Code).To(BeZero())
 	lines := strings.Split(result.Stdout, "\n")
 	// Guard the indexing below: one line for the usage, one for the uptime.
 	if len(lines) < 2 {
 		framework.Failf("unexpected output from %q on host %q: %q", cpuCmd, host, result.Stdout)
 	}
 	// Check each parse error separately so a malformed value is reported
 	// instead of silently yielding a zero sample.
 	usage, err = strconv.ParseFloat(lines[0], 64)
 	Expect(err).To(BeNil())
 	uptime, err = strconv.ParseFloat(lines[1], 64)
 	Expect(err).To(BeNil())
 	// Convert from nanoseconds to seconds
 	usage *= 1e-9
 	return
 }
--- a/test/e2e_node/conformance/build/Dockerfile
+++ b/test/e2e_node/conformance/build/Dockerfile
@ -27,12 +27,14 @@ COPY_SYSTEM_SPEC_FILE
 # REPORT_PATH is the path in the container to save test result and logs.
 # FLAKE_ATTEMPTS is the number of times to retry when there is a test failure. By default 2.
 # TEST_ARGS is the test arguments passed into the test.
 # EXTRA_ENVS is the extra environment variables needed for node e2e tests.
 ENV FOCUS="\[Conformance\]" \
 	   SKIP="\[Flaky\]|\[Serial\]" \
 	   PARALLELISM=8 \
 	   REPORT_PATH="/var/result" \
 	   FLAKE_ATTEMPTS=2 \
-	   TEST_ARGS=""
+	   TEST_ARGS="" \
 	   EXTRA_ENVS=""
 ENTRYPOINT ginkgo --focus="$FOCUS" \
 	--skip="$SKIP" \
@ -46,4 +48,5 @@ ENTRYPOINT ginkgo --focus="$FOCUS" \
 	--system-spec-name=SYSTEM_SPEC_NAME \
 	# This is a placeholder that will be substituted in the Makefile.
 	--system-spec-file=SYSTEM_SPEC_FILE_PATH \
 	--extra-envs=$EXTRA_ENVS \
 	$TEST_ARGS
--- a/test/e2e_node/e2e_node_suite_test.go
+++ b/test/e2e_node/e2e_node_suite_test.go
@ -77,6 +77,7 @@ func TestMain(m *testing.M) {
 	rand.Seed(time.Now().UnixNano())
 	pflag.Parse()
 	framework.AfterReadingAllFlags(&framework.TestContext)
 	setExtraEnvs()
 	os.Exit(m.Run())
 }
@ -146,6 +147,7 @@ var _ = SynchronizedBeforeSuite(func() []byte {
 	// This helps with debugging test flakes since it is hard to tell when a test failure is due to image pulling.
 	if framework.TestContext.PrepullImages {
 		klog.Infof("Pre-pulling images so that they are cached for the tests.")
 		updateImageWhiteList()
 		err := PrePullAllImages()
 		Expect(err).ShouldNot(HaveOccurred())
 	}
@ -244,6 +246,9 @@ func waitForNodeReady() {
 // TODO(random-liu): Use the dynamic kubelet configuration feature to
 // update test context with node configuration.
 func updateTestContext() error {
 	setExtraEnvs()
 	updateImageWhiteList()
 	client, err := getAPIServerClient()
 	if err != nil {
 		return fmt.Errorf("failed to get apiserver client: %v", err)
@ -261,7 +266,7 @@ func updateTestContext() error {
 	if err != nil {
 		return fmt.Errorf("failed to get kubelet configuration: %v", err)
 	}
-	framework.TestContext.KubeletConfig = *kubeletCfg // Set kubelet config.
+	framework.TestContext.KubeletConfig = *kubeletCfg // Set kubelet config
 	return nil
 }
@ -319,3 +324,9 @@ func isNodeReady(node *v1.Node) bool {
 	}
 	return false
 }
 // setExtraEnvs exports every name/value pair in
 // framework.TestContext.ExtraEnvs into the environment of the current
 // process. A pair that cannot be set is reported on stderr and skipped so
 // that the remaining pairs are still applied.
 func setExtraEnvs() {
 	for name, value := range framework.TestContext.ExtraEnvs {
 		if err := os.Setenv(name, value); err != nil {
 			fmt.Fprintf(os.Stderr, "failed to set extra env %q: %v\n", name, err)
 		}
 	}
 }
--- a/test/e2e_node/image_list.go
+++ b/test/e2e_node/image_list.go
@ -18,6 +18,7 @@ package e2e_node
 import (
 	"fmt"
 	"os"
 	"os/exec"
 	"os/user"
 	"time"
@ -46,7 +47,6 @@ var NodeImageWhiteList = sets.NewString(
 	"k8s.gcr.io/stress:v1",
 	busyboxImage,
 	"k8s.gcr.io/busybox@sha256:4bdd623e848417d96127e16037743f0cd8b528c026e9175e22a84f639eca58ff",
 	"k8s.gcr.io/node-problem-detector:v0.4.1",
 	imageutils.GetE2EImage(imageutils.Nginx),
 	imageutils.GetE2EImage(imageutils.ServeHostname),
 	imageutils.GetE2EImage(imageutils.Netexec),
@ -58,9 +58,24 @@ var NodeImageWhiteList = sets.NewString(
 	"gcr.io/kubernetes-e2e-test-images/node-perf/tf-wide-deep-amd64:1.0",
 )
-func init() {
+// updateImageWhiteList updates the framework.ImageWhiteList with
 // 1. the hard coded lists
 // 2. the ones passed in from framework.TestContext.ExtraEnvs
 // So this function needs to be called after the extra envs are applied.
 func updateImageWhiteList() {
 	// Rebuild the framework image white list from scratch as the union of
 	// NodeImageWhiteList and commontest.CommonImageWhiteList.
 	framework.ImageWhiteList = NodeImageWhiteList.Union(commontest.CommonImageWhiteList)
 	// Add the node-problem-detector image, which getNodeProblemDetectorImage
 	// resolves from the NODE_PROBLEM_DETECTOR_IMAGE environment variable or
 	// its built-in default.
 	framework.ImageWhiteList.Insert(getNodeProblemDetectorImage())
 }
 // getNodeProblemDetectorImage returns the image named by the
 // NODE_PROBLEM_DETECTOR_IMAGE environment variable, or the built-in default
 // image when that variable is unset or empty.
 func getNodeProblemDetectorImage() string {
 	if override := os.Getenv("NODE_PROBLEM_DETECTOR_IMAGE"); override != "" {
 		return override
 	}
 	return "k8s.gcr.io/node-problem-detector:v0.6.2"
 }
 // puller represents a generic image puller
--- a/test/e2e_node/jenkins/conformance/conformance-jenkins.sh
+++ b/test/e2e_node/jenkins/conformance/conformance-jenkins.sh
@ -40,4 +40,5 @@ go run test/e2e_node/runner/remote/run_remote.go  --test-suite=conformance \
  --results-dir="$ARTIFACTS" --test-timeout="$TIMEOUT" \
  --test_args="--kubelet-flags=\"$KUBELET_ARGS\"" \
  --instance-metadata="$GCE_INSTANCE_METADATA" \
-  --system-spec-name="$SYSTEM_SPEC_NAME"
+  --system-spec-name="$SYSTEM_SPEC_NAME" \
  --extra-envs="$EXTRA_ENVS"
--- a/test/e2e_node/jenkins/e2e-node-jenkins.sh
+++ b/test/e2e_node/jenkins/e2e-node-jenkins.sh
@ -47,4 +47,5 @@ go run test/e2e_node/runner/remote/run_remote.go  --logtostderr --vmodule=*=4 \
  --image-config-file="$GCE_IMAGE_CONFIG_PATH" --cleanup="$CLEANUP" \
  --results-dir="$ARTIFACTS" --ginkgo-flags="--nodes=$PARALLELISM $GINKGO_FLAGS" \
  --test-timeout="$TIMEOUT" --test_args="$TEST_ARGS --kubelet-flags=\"$KUBELET_ARGS\"" \
-  --instance-metadata="$GCE_INSTANCE_METADATA" --system-spec-name="$SYSTEM_SPEC_NAME"
+  --instance-metadata="$GCE_INSTANCE_METADATA" --system-spec-name="$SYSTEM_SPEC_NAME" \
  --extra-envs="$EXTRA_ENVS"
--- a/test/e2e_node/node_problem_detector_linux.go
+++ b/test/e2e_node/node_problem_detector_linux.go
@ -45,13 +45,14 @@ var _ = framework.KubeDescribe("NodeProblemDetector [NodeFeature:NodeProblemDete
 		pollInterval   = 1 * time.Second
 		pollConsistent = 5 * time.Second
 		pollTimeout    = 1 * time.Minute
 		image          = "k8s.gcr.io/node-problem-detector:v0.4.1"
 	)
 	f := framework.NewDefaultFramework("node-problem-detector")
 	var c clientset.Interface
 	var uid string
 	var ns, name, configName, eventNamespace string
 	var bootTime, nodeTime time.Time
 	var image string
 	BeforeEach(func() {
 		c = f.ClientSet
 		ns = f.Namespace.Name
@ -60,6 +61,8 @@ var _ = framework.KubeDescribe("NodeProblemDetector [NodeFeature:NodeProblemDete
 		configName = "node-problem-detector-config-" + uid
 		// There is no namespace for Node, event recorder will set default namespace for node events.
 		eventNamespace = metav1.NamespaceDefault
 		image = getNodeProblemDetectorImage()
 		By(fmt.Sprintf("Using node-problem-detector image: %s", image))
 	})
 	// Test system log monitor. We may add other tests if we have more problem daemons in the future.
@ -245,7 +248,8 @@ var _ = framework.KubeDescribe("NodeProblemDetector [NodeFeature:NodeProblemDete
 				timestamp        time.Time
 				message          string
 				messageNum       int
-				events           int
+				tempEvents       int // Events for temp errors
 				totalEvents      int // Events for both temp errors and condition changes
 				conditionReason  string
 				conditionMessage string
 				conditionType    v1.ConditionStatus
@ -279,7 +283,8 @@ var _ = framework.KubeDescribe("NodeProblemDetector [NodeFeature:NodeProblemDete
 					timestamp:        nodeTime,
 					message:          tempMessage,
 					messageNum:       3,
-					events:           3,
+					tempEvents:       3,
 					totalEvents:      3,
 					conditionReason:  defaultReason,
 					conditionMessage: defaultMessage,
 					conditionType:    v1.ConditionFalse,
@ -289,7 +294,8 @@ var _ = framework.KubeDescribe("NodeProblemDetector [NodeFeature:NodeProblemDete
 					timestamp:        nodeTime,
 					message:          permMessage1,
 					messageNum:       1,
-					events:           3, // event number should not change
+					tempEvents:       3, // event number for temp errors should not change
 					totalEvents:      4, // add 1 event for condition change
 					conditionReason:  permReason1,
 					conditionMessage: permMessage1,
 					conditionType:    v1.ConditionTrue,
@ -299,7 +305,8 @@ var _ = framework.KubeDescribe("NodeProblemDetector [NodeFeature:NodeProblemDete
 					timestamp:        nodeTime.Add(5 * time.Minute),
 					message:          tempMessage,
 					messageNum:       3,
-					events:           6,
+					tempEvents:       6, // add 3 events for temp errors
 					totalEvents:      7, // add 3 events for temp errors
 					conditionReason:  permReason1,
 					conditionMessage: permMessage1,
 					conditionType:    v1.ConditionTrue,
@ -309,7 +316,8 @@ var _ = framework.KubeDescribe("NodeProblemDetector [NodeFeature:NodeProblemDete
 					timestamp:        nodeTime.Add(5 * time.Minute),
 					message:          permMessage1 + "different message",
 					messageNum:       1,
-					events:           6, // event number should not change
+					tempEvents:       6, // event number should not change
 					totalEvents:      7, // event number should not change
 					conditionReason:  permReason1,
 					conditionMessage: permMessage1,
 					conditionType:    v1.ConditionTrue,
@ -319,7 +327,8 @@ var _ = framework.KubeDescribe("NodeProblemDetector [NodeFeature:NodeProblemDete
 					timestamp:        nodeTime.Add(5 * time.Minute),
 					message:          permMessage2,
 					messageNum:       1,
-					events:           6, // event number should not change
+					tempEvents:       6, // event number for temp errors should not change
 					totalEvents:      8, // add 1 event for condition change
 					conditionReason:  permReason2,
 					conditionMessage: permMessage2,
 					conditionType:    v1.ConditionTrue,
@ -332,13 +341,17 @@ var _ = framework.KubeDescribe("NodeProblemDetector [NodeFeature:NodeProblemDete
 					Expect(err).NotTo(HaveOccurred())
 				}
-				By(fmt.Sprintf("Wait for %d events generated", test.events))
+				By(fmt.Sprintf("Wait for %d temp events generated", test.tempEvents))
 				Eventually(func() error {
-					return verifyEvents(c.CoreV1().Events(eventNamespace), eventListOptions, test.events, tempReason, tempMessage)
+					return verifyEvents(c.CoreV1().Events(eventNamespace), eventListOptions, test.tempEvents, tempReason, tempMessage)
 				}, pollTimeout, pollInterval).Should(Succeed())
-				By(fmt.Sprintf("Make sure only %d events generated", test.events))
+				By(fmt.Sprintf("Wait for %d total events generated", test.totalEvents))
 				Eventually(func() error {
 					return verifyTotalEvents(c.CoreV1().Events(eventNamespace), eventListOptions, test.totalEvents)
 				}, pollTimeout, pollInterval).Should(Succeed())
 				By(fmt.Sprintf("Make sure only %d total events generated", test.totalEvents))
 				Consistently(func() error {
-					return verifyEvents(c.CoreV1().Events(eventNamespace), eventListOptions, test.events, tempReason, tempMessage)
+					return verifyTotalEvents(c.CoreV1().Events(eventNamespace), eventListOptions, test.totalEvents)
 				}, pollConsistent, pollInterval).Should(Succeed())
 				By(fmt.Sprintf("Make sure node condition %q is set", condition))
@ -390,7 +403,7 @@ func injectLog(file string, timestamp time.Time, log string, num int) error {
 	return nil
 }
-// verifyEvents verifies there are num specific events generated
+// verifyEvents verifies there are num specific events generated with given reason and message.
 func verifyEvents(e coreclientset.EventInterface, options metav1.ListOptions, num int, reason, message string) error {
 	events, err := e.List(options)
 	if err != nil {
@ -399,7 +412,7 @@ func verifyEvents(e coreclientset.EventInterface, options metav1.ListOptions, nu
 	count := 0
 	for _, event := range events.Items {
 		if event.Reason != reason || event.Message != message {
-			return fmt.Errorf("unexpected event: %v", event)
+			continue
 		}
 		count += int(event.Count)
 	}
@ -409,14 +422,18 @@ func verifyEvents(e coreclientset.EventInterface, options metav1.ListOptions, nu
 	return nil
 }
-// verifyNoEvents verifies there is no event generated
+// verifyTotalEvents verifies there are num events in total.
-func verifyNoEvents(e coreclientset.EventInterface, options metav1.ListOptions) error {
+func verifyTotalEvents(e coreclientset.EventInterface, options metav1.ListOptions, num int) error {
 	events, err := e.List(options)
 	if err != nil {
 		return err
 	}
-	if len(events.Items) != 0 {
+	count := 0
-		return fmt.Errorf("unexpected events: %v", events.Items)
+	for _, event := range events.Items {
 		count += int(event.Count)
 	}
 	if count != num {
 		return fmt.Errorf("expect event number %d, got %d: %v", num, count, events.Items)
 	}
 	return nil
 }
--- a/test/e2e_node/remote/cadvisor_e2e.go
+++ b/test/e2e_node/remote/cadvisor_e2e.go
@ -63,7 +63,7 @@ func runCommand(command string, args ...string) error {
 }
 // RunTest implements TestSuite.RunTest
-func (n *CAdvisorE2ERemote) RunTest(host, workspace, results, imageDesc, junitFilePrefix, testArgs, ginkgoArgs, systemSpecName string, timeout time.Duration) (string, error) {
+func (n *CAdvisorE2ERemote) RunTest(host, workspace, results, imageDesc, junitFilePrefix, testArgs, ginkgoArgs, systemSpecName, extraEnvs string, timeout time.Duration) (string, error) {
 	// Kill any running node processes
 	cleanupNodeProcesses(host)
--- a/test/e2e_node/remote/node_conformance.go
+++ b/test/e2e_node/remote/node_conformance.go
@ -259,7 +259,7 @@ func stopKubelet(host, workspace string) error {
 }
 // RunTest runs test on the node.
-func (c *ConformanceRemote) RunTest(host, workspace, results, imageDesc, junitFilePrefix, testArgs, _, systemSpecName string, timeout time.Duration) (string, error) {
+func (c *ConformanceRemote) RunTest(host, workspace, results, imageDesc, junitFilePrefix, testArgs, _, systemSpecName, extraEnvs string, timeout time.Duration) (string, error) {
 	// Install the cni plugins and add a basic CNI configuration.
 	if err := setupCNI(host, workspace); err != nil {
 		return "", err
@ -293,8 +293,8 @@ func (c *ConformanceRemote) RunTest(host, workspace, results, imageDesc, junitFi
 	// Run the tests
 	klog.V(2).Infof("Starting tests on %q", host)
 	podManifestPath := getPodPath(workspace)
-	cmd := fmt.Sprintf("'timeout -k 30s %fs docker run --rm --privileged=true --net=host -v /:/rootfs -v %s:%s -v %s:/var/result -e TEST_ARGS=--report-prefix=%s %s'",
+	cmd := fmt.Sprintf("'timeout -k 30s %fs docker run --rm --privileged=true --net=host -v /:/rootfs -v %s:%s -v %s:/var/result -e TEST_ARGS=--report-prefix=%s -e EXTRA_ENVS=%s %s'",
-		timeout.Seconds(), podManifestPath, podManifestPath, results, junitFilePrefix, getConformanceTestImageName(systemSpecName))
+		timeout.Seconds(), podManifestPath, podManifestPath, results, junitFilePrefix, extraEnvs, getConformanceTestImageName(systemSpecName))
 	testOutput, err := SSH(host, "sh", "-c", cmd)
 	if err != nil {
 		return testOutput, err
--- a/test/e2e_node/remote/node_e2e.go
+++ b/test/e2e_node/remote/node_e2e.go
@ -135,7 +135,7 @@ func updateOSSpecificKubeletFlags(args, host, workspace string) (string, error)
 }
 // RunTest runs test on the node.
-func (n *NodeE2ERemote) RunTest(host, workspace, results, imageDesc, junitFilePrefix, testArgs, ginkgoArgs, systemSpecName string, timeout time.Duration) (string, error) {
+func (n *NodeE2ERemote) RunTest(host, workspace, results, imageDesc, junitFilePrefix, testArgs, ginkgoArgs, systemSpecName, extraEnvs string, timeout time.Duration) (string, error) {
 	// Install the cni plugins and add a basic CNI configuration.
 	// TODO(random-liu): Do this in cloud init after we remove containervm test.
 	if err := setupCNI(host, workspace); err != nil {
@ -164,8 +164,8 @@ func (n *NodeE2ERemote) RunTest(host, workspace, results, imageDesc, junitFilePr
 	klog.V(2).Infof("Starting tests on %q", host)
 	cmd := getSSHCommand(" && ",
 		fmt.Sprintf("cd %s", workspace),
-		fmt.Sprintf("timeout -k 30s %fs ./ginkgo %s ./e2e_node.test -- --system-spec-name=%s --system-spec-file=%s --logtostderr --v 4 --node-name=%s --report-dir=%s --report-prefix=%s --image-description=\"%s\" %s",
+		fmt.Sprintf("timeout -k 30s %fs ./ginkgo %s ./e2e_node.test -- --system-spec-name=%s --system-spec-file=%s --extra-envs=%s --logtostderr --v 4 --node-name=%s --report-dir=%s --report-prefix=%s --image-description=\"%s\" %s",
-			timeout.Seconds(), ginkgoArgs, systemSpecName, systemSpecFile, host, results, junitFilePrefix, imageDesc, testArgs),
+			timeout.Seconds(), ginkgoArgs, systemSpecName, systemSpecFile, extraEnvs, host, results, junitFilePrefix, imageDesc, testArgs),
 	)
 	return SSH(host, "sh", "-c", cmd)
 }
--- a/test/e2e_node/remote/remote.go
+++ b/test/e2e_node/remote/remote.go
@ -65,7 +65,7 @@ func CreateTestArchive(suite TestSuite, systemSpecName string) (string, error) {
 // Returns the command output, whether the exit was ok, and any errors
 // TODO(random-liu): junitFilePrefix is not prefix actually, the file name is junit-junitFilePrefix.xml. Change the variable name.
-func RunRemote(suite TestSuite, archive string, host string, cleanup bool, imageDesc, junitFilePrefix string, testArgs string, ginkgoArgs string, systemSpecName string) (string, bool, error) {
+func RunRemote(suite TestSuite, archive string, host string, cleanup bool, imageDesc, junitFilePrefix string, testArgs string, ginkgoArgs string, systemSpecName string, extraEnvs string) (string, bool, error) {
 	// Create the temp staging directory
 	klog.V(2).Infof("Staging test binaries on %q", host)
 	workspace := newWorkspaceDir()
@ -110,7 +110,7 @@ func RunRemote(suite TestSuite, archive string, host string, cleanup bool, image
 	}
 	klog.V(2).Infof("Running test on %q", host)
-	output, err := suite.RunTest(host, workspace, resultDir, imageDesc, junitFilePrefix, testArgs, ginkgoArgs, systemSpecName, *testTimeoutSeconds)
+	output, err := suite.RunTest(host, workspace, resultDir, imageDesc, junitFilePrefix, testArgs, ginkgoArgs, systemSpecName, extraEnvs, *testTimeoutSeconds)
 	aggErrs := []error{}
 	// Do not log the output here, let the caller deal with the test output.
--- a/test/e2e_node/remote/types.go
+++ b/test/e2e_node/remote/types.go
@ -46,6 +46,7 @@ type TestSuite interface {
 	// * ginkgoArgs is the arguments passed to ginkgo.
 	// * systemSpecName is the name of the system spec used for validating the
 	//   image on which the test runs.
 	// * extraEnvs is the extra environment variables needed for node e2e tests.
 	// * timeout is the test timeout.
-	RunTest(host, workspace, results, imageDesc, junitFilePrefix, testArgs, ginkgoArgs, systemSpecName string, timeout time.Duration) (string, error)
+	RunTest(host, workspace, results, imageDesc, junitFilePrefix, testArgs, ginkgoArgs, systemSpecName, extraEnvs string, timeout time.Duration) (string, error)
 }
--- a/test/e2e_node/runner/local/run_local.go
+++ b/test/e2e_node/runner/local/run_local.go
@ -35,6 +35,7 @@ var buildDependencies = flag.Bool("build-dependencies", true, "If true, build al
 var ginkgoFlags = flag.String("ginkgo-flags", "", "Space-separated list of arguments to pass to Ginkgo test runner.")
 var testFlags = flag.String("test-flags", "", "Space-separated list of arguments to pass to node e2e test.")
 var systemSpecName = flag.String("system-spec-name", "", fmt.Sprintf("The name of the system spec used for validating the image in the node conformance test. The specs are at %s. If unspecified, the default built-in spec (system.DefaultSpec) will be used.", system.SystemSpecPath))
 var extraEnvs = flag.String("extra-envs", "", "The extra environment variables needed for node e2e tests. Format: a list of key=value pairs, e.g., env1=val1,env2=val2")
 func main() {
 	klog.InitFlags(nil)
@ -63,7 +64,7 @@ func main() {
 			klog.Fatalf("Failed to get k8s root directory: %v", err)
 		}
 		systemSpecFile := filepath.Join(rootDir, system.SystemSpecPath, *systemSpecName+".yaml")
-		args = append(args, fmt.Sprintf("--system-spec-name=%s --system-spec-file=%s", *systemSpecName, systemSpecFile))
+		args = append(args, fmt.Sprintf("--system-spec-name=%s --system-spec-file=%s --extra-envs=%s", *systemSpecName, systemSpecFile, *extraEnvs))
 	}
 	if err := runCommand(ginkgo, args...); err != nil {
 		klog.Exitf("Test failed: %v", err)
--- a/test/e2e_node/runner/remote/run_remote.go
+++ b/test/e2e_node/runner/remote/run_remote.go
@ -63,6 +63,7 @@ var instanceMetadata = flag.String("instance-metadata", "", "key/value metadata
 var gubernator = flag.Bool("gubernator", false, "If true, output Gubernator link to view logs")
 var ginkgoFlags = flag.String("ginkgo-flags", "", "Passed to ginkgo to specify additional flags such as --skip=.")
 var systemSpecName = flag.String("system-spec-name", "", fmt.Sprintf("The name of the system spec used for validating the image in the node conformance test. The specs are at %s. If unspecified, the default built-in spec (system.DefaultSpec) will be used.", system.SystemSpecPath))
 var extraEnvs = flag.String("extra-envs", "", "The extra environment variables needed for node e2e tests. Format: a list of key=value pairs, e.g., env1=val1,env2=val2")
 // envs is the type used to collect all node envs. The key is the env name,
 // and the value is the env value
@ -442,7 +443,7 @@ func testHost(host string, deleteFiles bool, imageDesc, junitFilePrefix, ginkgoF
 		}
 	}
-	output, exitOk, err := remote.RunRemote(suite, path, host, deleteFiles, imageDesc, junitFilePrefix, *testArgs, ginkgoFlagsStr, *systemSpecName)
+	output, exitOk, err := remote.RunRemote(suite, path, host, deleteFiles, imageDesc, junitFilePrefix, *testArgs, ginkgoFlagsStr, *systemSpecName, *extraEnvs)
 	return &TestResult{
 		output: output,
 		err:    err,