Merge pull request #29092 from Random-Liu/make-node-e2e-parallel

Automatic merge from submit-queue

Node E2E: Make node e2e parallel

For https://github.com/kubernetes/kubernetes/issues/29081.
Fix https://github.com/kubernetes/kubernetes/issues/26215.
Based on https://github.com/kubernetes/kubernetes/pull/28807, https://github.com/kubernetes/kubernetes/pull/29020, will rebase after they are merged.

**Only the last commit is new.**

We are going to move more tests into the node e2e suite. However, currently the node e2e tests only run sequentially, so the test duration will increase quickly as we add more tests.

This PR makes the node e2e test run in parallel so as to shorten test duration, so that we can add more test to improve the test coverage.

* If you run the test locally with `make test-e2e-node`, it will use `-p` ginkgo flag, which uses `(cores-1)` parallel test nodes by default.
* If you run the test remotely or in Jenkins, the parallelism will be controlled by the environment variable `PARALLELISM`. The default value is `8`, which is reasonable for our test node (n1-standard-1).

Before this PR, it took **833.592s** to run all tests on my desktop.
With this PR, it only takes **234.058s** to run.

The pull request node e2e run with this PR takes **232.327s**.
The pull request node e2e run for other PRs takes **673.810s**.

/cc @kubernetes/sig-node
This commit is contained in:
k8s-merge-robot 2016-07-29 18:38:29 -07:00 committed by GitHub
commit fe3dda1aeb
7 changed files with 105 additions and 46 deletions

View File

@ -206,6 +206,18 @@ less useful for catching flakes related creating the instance from an image.**
make test-e2e-node REMOTE=true RUN_UNTIL_FAILURE=true
```
## Run tests in parallel
Running tests in parallel can usually shorten the test duration. By default the node
e2e test runs with `--nodes=8` (see ginkgo flag
[--nodes](https://onsi.github.io/ginkgo/#parallel-specs)). You can use the
`PARALLELISM` option to change the parallelism.
```sh
make test-e2e-node PARALLELISM=4 # run test with 4 parallel nodes
make test-e2e-node PARALLELISM=1 # run test sequentially
```
## Run tests with kubenet network plugin
[kubenet](http://kubernetes.io/docs/admin/network-plugins/#kubenet) is

View File

@ -19,6 +19,13 @@ source "${KUBE_ROOT}/hack/lib/init.sh"
focus=${FOCUS:-""}
skip=${SKIP:-""}
# The number of tests that can run in parallel depends on what tests
# are running and on the size of the node. Too many, and tests will
# fail due to resource contention. 8 is a reasonable default for a
# n1-standard-1 node.
# Currently, parallelism only takes effect when REMOTE=true. For local tests,
# ginkgo default parallelism (cores - 1) is used.
parallelism=${PARALLELISM:-8}
report=${REPORT:-"/tmp/"}
artifacts=${ARTIFACTS:-"/tmp/_artifacts"}
remote=${REMOTE:-"false"}
@ -47,6 +54,25 @@ if [[ -z "${ginkgo}" ]]; then
exit 1
fi
# Parse the flags to pass to ginkgo
ginkgoflags=""
if [[ $parallelism > 1 ]]; then
ginkgoflags="$ginkgoflags -nodes=$parallelism "
fi
if [[ $focus != "" ]]; then
ginkgoflags="$ginkgoflags -focus=$focus "
fi
if [[ $skip != "" ]]; then
ginkgoflags="$ginkgoflags -skip=$skip "
fi
if [[ $run_until_failure != "" ]]; then
ginkgoflags="$ginkgoflags -untilItFails=$run_until_failure "
fi
if [ $remote = true ] ; then
# Setup the directory to copy test artifacts (logs, junit.xml, etc) from remote host to local host
if [ ! -d "${artifacts}" ]; then
@ -90,20 +116,6 @@ if [ $remote = true ] ; then
done
fi
# Parse the flags to pass to ginkgo
ginkgoflags=""
if [[ $focus != "" ]]; then
ginkgoflags="$ginkgoflags -focus=$focus "
fi
if [[ $skip != "" ]]; then
ginkgoflags="$ginkgoflags -skip=$skip "
fi
if [[ $run_until_failure != "" ]]; then
ginkgoflags="$ginkgoflags -untilItFails=$run_until_failure "
fi
# Output the configuration we will try to run
echo "Running tests remotely using"
echo "Project: $project"
@ -149,7 +161,7 @@ else
# Test using the host the script was run on
# Provided for backwards compatibility
"${ginkgo}" --focus=$focus --skip=$skip "${KUBE_ROOT}/test/e2e_node/" --report-dir=${report} \
"${ginkgo}" $ginkgoflags "${KUBE_ROOT}/test/e2e_node/" --report-dir=${report} \
-- --alsologtostderr --v 2 --node-name $(hostname) --build-services=true \
--start-services=true --stop-services=true $test_args
exit $?

View File

@ -20,6 +20,7 @@ package e2e_node
import (
"bytes"
"encoding/json"
"flag"
"fmt"
"io/ioutil"
@ -27,7 +28,6 @@ import (
"os"
"os/exec"
"path"
"strings"
"testing"
"time"
@ -48,6 +48,27 @@ import (
var e2es *e2eService
// context is the test context shared by all parallel nodes.
// Originally we set up the test environment and initialized global variables
// in BeforeSuite, and then used the global variables in the tests.
// However, after making the tests parallel, ginkgo will run all tests
// in several parallel test nodes. And for each test node, the BeforeSuite
// and AfterSuite will be run.
// We don't want to start services (kubelet, apiserver and etcd) for all
// parallel nodes, but we do want to set some globally shared variables which
// can be used in the tests.
// We have to use SynchronizedBeforeSuite to achieve that. The first
// function of SynchronizedBeforeSuite is only called once, and the second
// function is called in each parallel test node. The result returned by
// the first function will be the parameter of the second function.
// So we'll start all services and initialize the shared context in the first
// function, and propagate the context to all parallel test nodes in the
// second function.
// Notice no lock is needed for shared context, because context should only be
// initialized in the first function in SynchronizedBeforeSuite. After that
// it should never be modified.
var context SharedContext
var prePullImages = flag.Bool("prepull-images", true, "If true, prepull images so image pull failures do not cause test failures.")
var junitFileNumber = flag.Int("junit-file-number", 1, "Used to create junit filename - e.g. junit_01.xml.")
@ -77,17 +98,10 @@ func TestE2eNode(t *testing.T) {
}
// Setup the kubelet on the node
var _ = BeforeSuite(func() {
var _ = SynchronizedBeforeSuite(func() []byte {
if *buildServices {
buildGo()
}
if framework.TestContext.NodeName == "" {
output, err := exec.Command("hostname").CombinedOutput()
if err != nil {
glog.Fatalf("Could not get node name from hostname %v. Output:\n%s", err, output)
}
framework.TestContext.NodeName = strings.TrimSpace(fmt.Sprintf("%s", output))
}
// Pre-pull the images tests depend on so we can fail immediately if there is an image pull issue
// This helps with debugging test flakes since it is hard to tell when a test failure is due to image pulling.
@ -102,8 +116,9 @@ var _ = BeforeSuite(func() {
// We should mask locksmithd when provisioning the machine.
maskLocksmithdOnCoreos()
shared := &SharedContext{}
if *startServices {
e2es = newE2eService(framework.TestContext.NodeName, framework.TestContext.CgroupsPerQOS)
e2es = newE2eService(framework.TestContext.NodeName, framework.TestContext.CgroupsPerQOS, shared)
if err := e2es.start(); err != nil {
Fail(fmt.Sprintf("Unable to start node services.\n%v", err))
}
@ -117,10 +132,28 @@ var _ = BeforeSuite(func() {
// Reference common test to make the import valid.
commontest.CurrentSuite = commontest.NodeE2E
data, err := json.Marshal(shared)
Expect(err).NotTo(HaveOccurred())
return data
}, func(data []byte) {
// Set the shared context got from the synchronized initialize function
shared := &SharedContext{}
Expect(json.Unmarshal(data, shared)).To(Succeed())
context = *shared
if framework.TestContext.NodeName == "" {
hostname, err := os.Hostname()
if err != nil {
glog.Fatalf("Could not get node name: %v", err)
}
framework.TestContext.NodeName = hostname
}
})
// Tear down the kubelet on the node
var _ = AfterSuite(func() {
var _ = SynchronizedAfterSuite(func() {}, func() {
if e2es != nil {
e2es.getLogFiles()
if *startServices && *stopServices {

View File

@ -42,11 +42,11 @@ type e2eService struct {
killCmds []*killCmd
rmDirs []string
etcdDataDir string
kubeletStaticPodDir string
nodeName string
logFiles map[string]logFileData
cgroupsPerQOS bool
context *SharedContext
etcdDataDir string
nodeName string
logFiles map[string]logFileData
cgroupsPerQOS bool
}
type logFileData struct {
@ -61,7 +61,7 @@ const (
defaultEtcdPath = "/tmp/etcd"
)
func newE2eService(nodeName string, cgroupsPerQOS bool) *e2eService {
func newE2eService(nodeName string, cgroupsPerQOS bool, context *SharedContext) *e2eService {
// Special log files that need to be collected for additional debugging.
var logFiles = map[string]logFileData{
"kern.log": {[]string{"/var/log/kern.log"}, []string{"-k"}},
@ -69,6 +69,7 @@ func newE2eService(nodeName string, cgroupsPerQOS bool) *e2eService {
}
return &e2eService{
context: context,
nodeName: nodeName,
logFiles: logFiles,
cgroupsPerQOS: cgroupsPerQOS,
@ -101,7 +102,7 @@ func (es *e2eService) start() error {
return err
}
es.killCmds = append(es.killCmds, cmd)
es.rmDirs = append(es.rmDirs, es.kubeletStaticPodDir)
es.rmDirs = append(es.rmDirs, es.context.PodConfigPath)
return nil
}
@ -222,7 +223,7 @@ func (es *e2eService) startKubeletServer() (*killCmd, error) {
if err != nil {
return nil, err
}
es.kubeletStaticPodDir = dataDir
es.context.PodConfigPath = dataDir
var killOverride *exec.Cmd
cmdArgs := []string{}
if systemdRun, err := exec.LookPath("systemd-run"); err == nil {
@ -247,7 +248,7 @@ func (es *e2eService) startKubeletServer() (*killCmd, error) {
"--volume-stats-agg-period", "10s", // Aggregate volumes frequently so tests don't need to wait as long
"--allow-privileged", "true",
"--serialize-image-pulls", "false",
"--config", es.kubeletStaticPodDir,
"--config", es.context.PodConfigPath,
"--file-check-frequency", "10s", // Check file frequently so tests won't wait too long
"--v", LOG_VERBOSITY_LEVEL, "--logtostderr",
"--pod-cidr=10.180.0.0/24", // Assign a fixed CIDR to the node because there is no node controller.

View File

@ -40,6 +40,7 @@ make generated_files
go generate test/e2e/framework/gobindata_util.go
go build test/e2e_node/environment/conformance.go
PARALLELISM=${PARALLELISM:-8}
WORKSPACE=${WORKSPACE:-"/tmp/"}
ARTIFACTS=${WORKSPACE}/_artifacts
@ -48,5 +49,5 @@ go run test/e2e_node/runner/run_e2e.go --logtostderr --vmodule=*=2 --ssh-env="g
--zone="$GCE_ZONE" --project="$GCE_PROJECT" --hosts="$GCE_HOSTS" \
--images="$GCE_IMAGES" --image-project="$GCE_IMAGE_PROJECT" \
--image-config-file="$GCE_IMAGE_CONFIG_PATH" --cleanup="$CLEANUP" \
--results-dir="$ARTIFACTS" --ginkgo-flags="$GINKGO_FLAGS" \
--results-dir="$ARTIFACTS" --ginkgo-flags="--nodes=$PARALLELISM $GINKGO_FLAGS" \
--setup-node="$SETUP_NODE" --test_args="$TEST_ARGS" --instance-metadata="$GCE_INSTANCE_METADATA"

View File

@ -37,14 +37,14 @@ import (
var _ = framework.KubeDescribe("MirrorPod", func() {
f := framework.NewDefaultFramework("mirror-pod")
Context("when create a mirror pod ", func() {
var staticPodName, mirrorPodName string
var ns, staticPodName, mirrorPodName string
BeforeEach(func() {
ns := f.Namespace.Name
ns = f.Namespace.Name
staticPodName = "static-pod-" + string(util.NewUUID())
mirrorPodName = staticPodName + "-" + e2es.nodeName
mirrorPodName = staticPodName + "-" + framework.TestContext.NodeName
By("create the static pod")
err := createStaticPod(e2es.kubeletStaticPodDir, staticPodName, ns, ImageRegistry[nginxImage], api.RestartPolicyAlways)
err := createStaticPod(context.PodConfigPath, staticPodName, ns, ImageRegistry[nginxImage], api.RestartPolicyAlways)
Expect(err).ShouldNot(HaveOccurred())
By("wait for the mirror pod to be running")
@ -53,7 +53,6 @@ var _ = framework.KubeDescribe("MirrorPod", func() {
}, 2*time.Minute, time.Second*4).Should(BeNil())
})
It("should be updated when static pod updated", func() {
ns := f.Namespace.Name
By("get mirror pod uid")
pod, err := f.Client.Pods(ns).Get(mirrorPodName)
Expect(err).ShouldNot(HaveOccurred())
@ -61,7 +60,7 @@ var _ = framework.KubeDescribe("MirrorPod", func() {
By("update the static pod container image")
image := ImageRegistry[pauseImage]
err = createStaticPod(e2es.kubeletStaticPodDir, staticPodName, ns, image, api.RestartPolicyAlways)
err = createStaticPod(context.PodConfigPath, staticPodName, ns, image, api.RestartPolicyAlways)
Expect(err).ShouldNot(HaveOccurred())
By("wait for the mirror pod to be updated")
@ -76,7 +75,6 @@ var _ = framework.KubeDescribe("MirrorPod", func() {
Expect(pod.Spec.Containers[0].Image).Should(Equal(image))
})
It("should be recreated when mirror pod gracefully deleted", func() {
ns := f.Namespace.Name
By("get mirror pod uid")
pod, err := f.Client.Pods(ns).Get(mirrorPodName)
Expect(err).ShouldNot(HaveOccurred())
@ -92,7 +90,6 @@ var _ = framework.KubeDescribe("MirrorPod", func() {
}, 2*time.Minute, time.Second*4).Should(BeNil())
})
It("should be recreated when mirror pod forcibly deleted", func() {
ns := f.Namespace.Name
By("get mirror pod uid")
pod, err := f.Client.Pods(ns).Get(mirrorPodName)
Expect(err).ShouldNot(HaveOccurred())
@ -108,9 +105,8 @@ var _ = framework.KubeDescribe("MirrorPod", func() {
}, 2*time.Minute, time.Second*4).Should(BeNil())
})
AfterEach(func() {
ns := f.Namespace.Name
By("delete the static pod")
err := deleteStaticPod(e2es.kubeletStaticPodDir, staticPodName, ns)
err := deleteStaticPod(context.PodConfigPath, staticPodName, ns)
Expect(err).ShouldNot(HaveOccurred())
By("wait for the mirror pod to disappear")

View File

@ -26,3 +26,7 @@ var disableKubenet = flag.Bool("disable-kubenet", false, "If true, start kubelet
var buildServices = flag.Bool("build-services", true, "If true, build local executables")
var startServices = flag.Bool("start-services", true, "If true, start local node services")
var stopServices = flag.Bool("stop-services", true, "If true, stop local node services after running tests")
// SharedContext holds state shared by all parallel ginkgo test nodes.
// It is initialized once (in the first function of SynchronizedBeforeSuite,
// serialized as JSON) and propagated to every parallel node; it must not be
// modified after initialization, so no locking is needed.
type SharedContext struct {
// PodConfigPath is the static pod manifest directory watched by the
// kubelet (passed to the kubelet via its --config flag).
PodConfigPath string
}