Make the node e2e test run in parallel.

This commit is contained in:
Random-Liu 2016-07-18 00:52:39 -07:00
parent 17e31bacbc
commit 9d48c76361
7 changed files with 105 additions and 46 deletions

View File

@ -205,6 +205,18 @@ less useful for catching flakes related to creating the instance from an image.**
make test-e2e-node REMOTE=true RUN_UNTIL_FAILURE=true
```
## Run tests in parallel
Running tests in parallel can usually shorten the test duration. By default, the node
e2e test runs with `--nodes=8` (see the ginkgo flag
[--nodes](https://onsi.github.io/ginkgo/#parallel-specs)). You can use the
`PARALLELISM` option to change the parallelism.
```sh
make test-e2e-node PARALLELISM=4 # run tests with 4 parallel nodes
make test-e2e-node PARALLELISM=1 # run tests sequentially
```
## Run tests with kubenet network plugin
[kubenet](http://kubernetes.io/docs/admin/network-plugins/#kubenet) is

View File

@ -19,6 +19,13 @@ source "${KUBE_ROOT}/hack/lib/init.sh"
focus=${FOCUS:-""}
skip=${SKIP:-""}
# The number of tests that can run in parallel depends on which tests
# are running and on the size of the node. Too much parallelism, and tests
# will fail due to resource contention. 8 is a reasonable default for an
# n1-standard-1 node.
# Currently, parallelism only takes effect when REMOTE=true. For local
# tests, ginkgo's default parallelism (number of cores - 1) is used.
parallelism=${PARALLELISM:-8}
report=${REPORT:-"/tmp/"}
artifacts=${ARTIFACTS:-"/tmp/_artifacts"}
remote=${REMOTE:-"false"}
@ -46,6 +53,25 @@ if [[ -z "${ginkgo}" ]]; then
exit 1
fi
# Parse the flags to pass to ginkgo
ginkgoflags=""
if [[ $parallelism -gt 1 ]]; then
ginkgoflags="$ginkgoflags -nodes=$parallelism "
fi
if [[ $focus != "" ]]; then
ginkgoflags="$ginkgoflags -focus=$focus "
fi
if [[ $skip != "" ]]; then
ginkgoflags="$ginkgoflags -skip=$skip "
fi
if [[ $run_until_failure != "" ]]; then
ginkgoflags="$ginkgoflags -untilItFails=$run_until_failure "
fi
if [ $remote = true ] ; then
# Set up the directory into which test artifacts (logs, junit.xml, etc.) are copied from the remote host
if [ ! -d "${artifacts}" ]; then
@ -89,20 +115,6 @@ if [ $remote = true ] ; then
done
fi
# Parse the flags to pass to ginkgo
ginkgoflags=""
if [[ $focus != "" ]]; then
ginkgoflags="$ginkgoflags -focus=$focus "
fi
if [[ $skip != "" ]]; then
ginkgoflags="$ginkgoflags -skip=$skip "
fi
if [[ $run_until_failure != "" ]]; then
ginkgoflags="$ginkgoflags -untilItFails=$run_until_failure "
fi
# Output the configuration we will try to run
echo "Running tests remotely using"
echo "Project: $project"
@ -133,7 +145,7 @@ else
fi
# Test using the host the script was run on
# Provided for backwards compatibility
"${ginkgo}" --focus=$focus --skip=$skip "${KUBE_ROOT}/test/e2e_node/" --report-dir=${report} \
"${ginkgo}" $ginkgoflags "${KUBE_ROOT}/test/e2e_node/" --report-dir=${report} \
-- --alsologtostderr --v 2 --node-name $(hostname) --build-services=true \
--start-services=true --stop-services=true $test_args
exit $?

View File

@ -20,6 +20,7 @@ package e2e_node
import (
"bytes"
"encoding/json"
"flag"
"fmt"
"io/ioutil"
@ -27,7 +28,6 @@ import (
"os"
"os/exec"
"path"
"strings"
"testing"
"time"
@ -48,6 +48,27 @@ import (
var e2es *e2eService
// context is the test context shared by all parallel nodes.
// Originally we set up the test environment and initialized global variables
// in BeforeSuite, then used those globals in the tests.
// However, once the tests run in parallel, ginkgo distributes them across
// several parallel test nodes, and BeforeSuite and AfterSuite run on each
// node.
// We don't want to start the services (kubelet, apiserver and etcd) on every
// parallel node, but we do want to set some globally shared variables that
// the tests can use.
// SynchronizedBeforeSuite achieves exactly that: its first function is
// called only once, its second function is called on each parallel test
// node, and the result returned by the first function is passed to the
// second as its parameter.
// So we start all services and initialize the shared context in the first
// function, and propagate the context to all parallel test nodes in the
// second function.
// Note that no lock is needed for the shared context, because it is only
// initialized in the first function of SynchronizedBeforeSuite and never
// modified afterwards.
var context SharedContext
var prePullImages = flag.Bool("prepull-images", true, "If true, prepull images so image pull failures do not cause test failures.")
var junitFileNumber = flag.Int("junit-file-number", 1, "Used to create junit filename - e.g. junit_01.xml.")
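
The comment above describes ginkgo's standard mechanism for one-time setup in parallel suites. Below is a minimal, self-contained sketch of that pattern, using a hypothetical `Config` type in place of the suite's `SharedContext`:

```go
package example

import (
	"encoding/json"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"
)

// Config is a hypothetical stand-in for the suite's shared state.
type Config struct {
	ServiceAddr string
}

var cfg Config

var _ = SynchronizedBeforeSuite(func() []byte {
	// Runs exactly once, on parallel node 1: start shared services here
	// and serialize whatever the other nodes need to know.
	data, err := json.Marshal(Config{ServiceAddr: "127.0.0.1:8080"})
	Expect(err).NotTo(HaveOccurred())
	return data
}, func(data []byte) {
	// Runs on every parallel node, receiving node 1's serialized result.
	Expect(json.Unmarshal(data, &cfg)).To(Succeed())
})

var _ = SynchronizedAfterSuite(func() {
	// Runs on every node after its specs finish.
}, func() {
	// Runs once, on node 1, after all other nodes are done: stop services.
})
```

Specs on every node can then read `cfg` safely, since it is written once during suite setup and never mutated afterwards.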
@ -77,17 +98,10 @@ func TestE2eNode(t *testing.T) {
}
// Set up the kubelet on the node
var _ = BeforeSuite(func() {
var _ = SynchronizedBeforeSuite(func() []byte {
if *buildServices {
buildGo()
}
if framework.TestContext.NodeName == "" {
output, err := exec.Command("hostname").CombinedOutput()
if err != nil {
glog.Fatalf("Could not get node name from hostname %v. Output:\n%s", err, output)
}
framework.TestContext.NodeName = strings.TrimSpace(fmt.Sprintf("%s", output))
}
// Pre-pull the images the tests depend on so we can fail immediately if there is an image pull issue.
// This helps with debugging test flakes, since it is hard to tell whether a test failure is due to image pulling.
@ -102,8 +116,9 @@ var _ = BeforeSuite(func() {
// We should mask locksmithd when provisioning the machine.
maskLocksmithdOnCoreos()
shared := &SharedContext{}
if *startServices {
e2es = newE2eService(framework.TestContext.NodeName, framework.TestContext.CgroupsPerQOS)
e2es = newE2eService(framework.TestContext.NodeName, framework.TestContext.CgroupsPerQOS, shared)
if err := e2es.start(); err != nil {
Fail(fmt.Sprintf("Unable to start node services.\n%v", err))
}
@ -117,10 +132,28 @@ var _ = BeforeSuite(func() {
// Reference common test to make the import valid.
commontest.CurrentSuite = commontest.NodeE2E
data, err := json.Marshal(shared)
Expect(err).NotTo(HaveOccurred())
return data
}, func(data []byte) {
// Set the shared context received from the synchronized initialization function
shared := &SharedContext{}
Expect(json.Unmarshal(data, shared)).To(Succeed())
context = *shared
if framework.TestContext.NodeName == "" {
hostname, err := os.Hostname()
if err != nil {
glog.Fatalf("Could not get node name: %v", err)
}
framework.TestContext.NodeName = hostname
}
})
// Tear down the kubelet on the node
var _ = AfterSuite(func() {
var _ = SynchronizedAfterSuite(func() {}, func() {
if e2es != nil {
e2es.getLogFiles()
if *startServices && *stopServices {

View File

@ -42,11 +42,11 @@ type e2eService struct {
killCmds []*killCmd
rmDirs []string
etcdDataDir string
kubeletStaticPodDir string
nodeName string
logFiles map[string]logFileData
cgroupsPerQOS bool
context *SharedContext
etcdDataDir string
nodeName string
logFiles map[string]logFileData
cgroupsPerQOS bool
}
type logFileData struct {
@ -59,7 +59,7 @@ const (
LOG_VERBOSITY_LEVEL = "4"
)
func newE2eService(nodeName string, cgroupsPerQOS bool) *e2eService {
func newE2eService(nodeName string, cgroupsPerQOS bool, context *SharedContext) *e2eService {
// Special log files that need to be collected for additional debugging.
var logFiles = map[string]logFileData{
"kern.log": {[]string{"/var/log/kern.log"}, []string{"-k"}},
@ -67,6 +67,7 @@ func newE2eService(nodeName string, cgroupsPerQOS bool) *e2eService {
}
return &e2eService{
context: context,
nodeName: nodeName,
logFiles: logFiles,
cgroupsPerQOS: cgroupsPerQOS,
@ -99,7 +100,7 @@ func (es *e2eService) start() error {
return err
}
es.killCmds = append(es.killCmds, cmd)
es.rmDirs = append(es.rmDirs, es.kubeletStaticPodDir)
es.rmDirs = append(es.rmDirs, es.context.PodConfigPath)
return nil
}
@ -211,7 +212,7 @@ func (es *e2eService) startKubeletServer() (*killCmd, error) {
if err != nil {
return nil, err
}
es.kubeletStaticPodDir = dataDir
es.context.PodConfigPath = dataDir
var killOverride *exec.Cmd
cmdArgs := []string{}
if systemdRun, err := exec.LookPath("systemd-run"); err == nil {
@ -236,7 +237,7 @@ func (es *e2eService) startKubeletServer() (*killCmd, error) {
"--volume-stats-agg-period", "10s", // Aggregate volumes frequently so tests don't need to wait as long
"--allow-privileged", "true",
"--serialize-image-pulls", "false",
"--config", es.kubeletStaticPodDir,
"--config", es.context.PodConfigPath,
"--file-check-frequency", "10s", // Check file frequently so tests won't wait too long
"--v", LOG_VERBOSITY_LEVEL, "--logtostderr",
"--pod-cidr=10.180.0.0/24", // Assign a fixed CIDR to the node because there is no node controller.

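
The `SharedContext` threading above is worth spelling out: `newE2eService` keeps a pointer to a context owned by the test suite, `startKubeletServer` publishes the kubelet's pod-config directory through that pointer, and the suite then serializes the populated struct for the other ginkgo nodes. A rough, self-contained sketch of that hand-off (the `service` type and the path are illustrative, not the suite's actual code):

```go
package example

import (
	"encoding/json"
	"fmt"
	"io/ioutil"
)

// SharedContext mirrors the struct added in this commit: exported fields
// only, so it survives the JSON round-trip between ginkgo nodes.
type SharedContext struct {
	PodConfigPath string
}

// service stands in for e2eService: it fills in fields on a context it
// does not own, so the caller sees the values after start() returns.
type service struct {
	context *SharedContext
}

func (s *service) start() error {
	dir, err := ioutil.TempDir("", "static-pods")
	if err != nil {
		return err
	}
	s.context.PodConfigPath = dir // discovered at startup, published via the pointer
	return nil
}

func main() {
	shared := &SharedContext{}
	svc := &service{context: shared}
	if err := svc.start(); err != nil {
		panic(err)
	}
	// Node 1 serializes the populated context for the other parallel nodes.
	data, err := json.Marshal(shared)
	if err != nil {
		panic(err)
	}
	fmt.Printf("propagated to parallel nodes: %s\n", data)
}
```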
View File

@ -31,6 +31,7 @@ set -x
make generated_files
go build test/e2e_node/environment/conformance.go
PARALLELISM=${PARALLELISM:-8}
WORKSPACE=${WORKSPACE:-"/tmp/"}
ARTIFACTS=${WORKSPACE}/_artifacts
@ -39,5 +40,5 @@ go run test/e2e_node/runner/run_e2e.go --logtostderr --vmodule=*=2 --ssh-env="g
--zone="$GCE_ZONE" --project="$GCE_PROJECT" --hosts="$GCE_HOSTS" \
--images="$GCE_IMAGES" --image-project="$GCE_IMAGE_PROJECT" \
--image-config-file="$GCE_IMAGE_CONFIG_PATH" --cleanup="$CLEANUP" \
--results-dir="$ARTIFACTS" --ginkgo-flags="$GINKGO_FLAGS" \
--results-dir="$ARTIFACTS" --ginkgo-flags="--nodes=$PARALLELISM $GINKGO_FLAGS" \
--setup-node="$SETUP_NODE" --test_args="$TEST_ARGS" --instance-metadata="$GCE_INSTANCE_METADATA"

View File

@ -37,14 +37,14 @@ import (
var _ = framework.KubeDescribe("MirrorPod", func() {
f := framework.NewDefaultFramework("mirror-pod")
Context("when create a mirror pod ", func() {
var staticPodName, mirrorPodName string
var ns, staticPodName, mirrorPodName string
BeforeEach(func() {
ns := f.Namespace.Name
ns = f.Namespace.Name
staticPodName = "static-pod-" + string(util.NewUUID())
mirrorPodName = staticPodName + "-" + e2es.nodeName
mirrorPodName = staticPodName + "-" + framework.TestContext.NodeName
By("create the static pod")
err := createStaticPod(e2es.kubeletStaticPodDir, staticPodName, ns, ImageRegistry[nginxImage], api.RestartPolicyAlways)
err := createStaticPod(context.PodConfigPath, staticPodName, ns, ImageRegistry[nginxImage], api.RestartPolicyAlways)
Expect(err).ShouldNot(HaveOccurred())
By("wait for the mirror pod to be running")
@ -53,7 +53,6 @@ var _ = framework.KubeDescribe("MirrorPod", func() {
}, 2*time.Minute, time.Second*4).Should(BeNil())
})
It("should be updated when static pod updated", func() {
ns := f.Namespace.Name
By("get mirror pod uid")
pod, err := f.Client.Pods(ns).Get(mirrorPodName)
Expect(err).ShouldNot(HaveOccurred())
@ -61,7 +60,7 @@ var _ = framework.KubeDescribe("MirrorPod", func() {
By("update the static pod container image")
image := ImageRegistry[pauseImage]
err = createStaticPod(e2es.kubeletStaticPodDir, staticPodName, ns, image, api.RestartPolicyAlways)
err = createStaticPod(context.PodConfigPath, staticPodName, ns, image, api.RestartPolicyAlways)
Expect(err).ShouldNot(HaveOccurred())
By("wait for the mirror pod to be updated")
@ -76,7 +75,6 @@ var _ = framework.KubeDescribe("MirrorPod", func() {
Expect(pod.Spec.Containers[0].Image).Should(Equal(image))
})
It("should be recreated when mirror pod gracefully deleted", func() {
ns := f.Namespace.Name
By("get mirror pod uid")
pod, err := f.Client.Pods(ns).Get(mirrorPodName)
Expect(err).ShouldNot(HaveOccurred())
@ -92,7 +90,6 @@ var _ = framework.KubeDescribe("MirrorPod", func() {
}, 2*time.Minute, time.Second*4).Should(BeNil())
})
It("should be recreated when mirror pod forcibly deleted", func() {
ns := f.Namespace.Name
By("get mirror pod uid")
pod, err := f.Client.Pods(ns).Get(mirrorPodName)
Expect(err).ShouldNot(HaveOccurred())
@ -108,9 +105,8 @@ var _ = framework.KubeDescribe("MirrorPod", func() {
}, 2*time.Minute, time.Second*4).Should(BeNil())
})
AfterEach(func() {
ns := f.Namespace.Name
By("delete the static pod")
err := deleteStaticPod(e2es.kubeletStaticPodDir, staticPodName, ns)
err := deleteStaticPod(context.PodConfigPath, staticPodName, ns)
Expect(err).ShouldNot(HaveOccurred())
By("wait for the mirror pod to disappear")

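The mirror-pod test changes follow from the same split: `e2es` is only non-nil on the ginkgo node that started the services, so the tests now read the pod-config directory from the propagated `context` and the node name from `framework.TestContext` instead. The `createStaticPod`/`deleteStaticPod` helpers are not shown in this diff; below is a rough sketch of the file-based mechanism they rely on (the manifest layout here is illustrative):

```go
package example

import (
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
)

// writeStaticPod is an illustrative analogue of createStaticPod: the
// kubelet watches its --config directory and starts any pod manifest
// dropped into it, then reports a "mirror pod" named
// <pod-name>-<node-name> to the apiserver.
func writeStaticPod(dir, name, ns, image string) error {
	manifest := fmt.Sprintf(`{
  "apiVersion": "v1",
  "kind": "Pod",
  "metadata": {"name": %q, "namespace": %q},
  "spec": {"containers": [{"name": "test", "image": %q}]}
}`, name, ns, image)
	return ioutil.WriteFile(filepath.Join(dir, name+".json"), []byte(manifest), 0644)
}

// removeStaticPod is the analogue of deleteStaticPod: removing the file
// makes the kubelet stop the pod. While the file is still present, a
// deleted mirror pod is recreated, which is what these tests verify.
func removeStaticPod(dir, name string) error {
	return os.Remove(filepath.Join(dir, name+".json"))
}
```
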
View File

@ -26,3 +26,7 @@ var disableKubenet = flag.Bool("disable-kubenet", false, "If true, start kubelet
var buildServices = flag.Bool("build-services", true, "If true, build local executables")
var startServices = flag.Bool("start-services", true, "If true, start local node services")
var stopServices = flag.Bool("stop-services", true, "If true, stop local node services after running tests")
type SharedContext struct {
PodConfigPath string
}