Merge pull request #29092 from Random-Liu/make-node-e2e-parallel

Automatic merge from submit-queue

Node E2E: Make node e2e parallel

For https://github.com/kubernetes/kubernetes/issues/29081.
Fix https://github.com/kubernetes/kubernetes/issues/26215.
Based on https://github.com/kubernetes/kubernetes/pull/28807, https://github.com/kubernetes/kubernetes/pull/29020, will rebase after they are merged.

**Only the last commit is new.**

We are going to move more tests into the node e2e suite. However, currently the node e2e tests only run sequentially, so the test duration will increase quickly as we add more tests.

This PR makes the node e2e test run in parallel so as to shorten test duration, so that we can add more test to improve the test coverage.

* If you run the test locally with `make test-e2e-node`, it will use `-p` ginkgo flag, which uses `(cores-1)` parallel test nodes by default.
* If you run the test remotely or in Jenkins, the parallelism will be controlled by the environment variable `PARALLELISM`. The default value is `8`, which is reasonable for our test node (n1-standard-1).

Before this PR, it took **833.592s** to run all tests on my desktop.
With this PR, it only takes **234.058s** to run.

The pull request node e2e run with this PR takes **232.327s**.
The pull request node e2e run for other PRs takes **673.810s**.

/cc @kubernetes/sig-node
This commit is contained in:
k8s-merge-robot 2016-07-29 18:38:29 -07:00 committed by GitHub
commit fe3dda1aeb
7 changed files with 105 additions and 46 deletions

View File

@ -206,6 +206,18 @@ less useful for catching flakes related creating the instance from an image.**
make test-e2e-node REMOTE=true RUN_UNTIL_FAILURE=true
```
## Run tests in parallel
Running tests in parallel can usually shorten the test duration. By default the node
e2e test runs with `--nodes=8` (see ginkgo flag
[--nodes](https://onsi.github.io/ginkgo/#parallel-specs)). You can use the
`PARALLELISM` option to change the parallelism.
```sh
make test-e2e-node PARALLELISM=4 # run test with 4 parallel nodes
make test-e2e-node PARALLELISM=1 # run test sequentially
```
## Run tests with kubenet network plugin
[kubenet](http://kubernetes.io/docs/admin/network-plugins/#kubenet) is

View File

@ -19,6 +19,13 @@ source "${KUBE_ROOT}/hack/lib/init.sh"
focus=${FOCUS:-""}
skip=${SKIP:-""}
# The number of tests that can run in parallel depends on what tests
# are running and on the size of the node. Too many, and tests will
# fail due to resource contention. 8 is a reasonable default for a
# n1-standard-1 node.
# Currently, parallelism only takes effect when REMOTE=true. For local tests,
# ginkgo default parallelism (cores - 1) is used.
parallelism=${PARALLELISM:-8}
report=${REPORT:-"/tmp/"}
artifacts=${ARTIFACTS:-"/tmp/_artifacts"}
remote=${REMOTE:-"false"}
@ -47,6 +54,25 @@ if [[ -z "${ginkgo}" ]]; then
exit 1
fi
# Parse the flags to pass to ginkgo
ginkgoflags=""
if [[ $parallelism > 1 ]]; then
ginkgoflags="$ginkgoflags -nodes=$parallelism "
fi
if [[ $focus != "" ]]; then
ginkgoflags="$ginkgoflags -focus=$focus "
fi
if [[ $skip != "" ]]; then
ginkgoflags="$ginkgoflags -skip=$skip "
fi
if [[ $run_until_failure != "" ]]; then
ginkgoflags="$ginkgoflags -untilItFails=$run_until_failure "
fi
if [ $remote = true ] ; then
# Setup the directory to copy test artifacts (logs, junit.xml, etc) from remote host to local host
if [ ! -d "${artifacts}" ]; then
@ -90,20 +116,6 @@ if [ $remote = true ] ; then
done
fi
# Parse the flags to pass to ginkgo
ginkgoflags=""
if [[ $focus != "" ]]; then
ginkgoflags="$ginkgoflags -focus=$focus "
fi
if [[ $skip != "" ]]; then
ginkgoflags="$ginkgoflags -skip=$skip "
fi
if [[ $run_until_failure != "" ]]; then
ginkgoflags="$ginkgoflags -untilItFails=$run_until_failure "
fi
# Output the configuration we will try to run
echo "Running tests remotely using"
echo "Project: $project"
@ -149,7 +161,7 @@ else
# Test using the host the script was run on
# Provided for backwards compatibility
"${ginkgo}" --focus=$focus --skip=$skip "${KUBE_ROOT}/test/e2e_node/" --report-dir=${report} \
"${ginkgo}" $ginkgoflags "${KUBE_ROOT}/test/e2e_node/" --report-dir=${report} \
-- --alsologtostderr --v 2 --node-name $(hostname) --build-services=true \
--start-services=true --stop-services=true $test_args
exit $?

View File

@ -20,6 +20,7 @@ package e2e_node
import (
"bytes"
"encoding/json"
"flag"
"fmt"
"io/ioutil"
@ -27,7 +28,6 @@ import (
"os"
"os/exec"
"path"
"strings"
"testing"
"time"
@ -48,6 +48,27 @@ import (
var e2es *e2eService
// context is the test context shared by all parallel nodes.
// Originally we set up the test environment and initialized global variables
// in BeforeSuite, and then used the global variables in the tests.
// However, after making the tests parallel, ginkgo will run all tests
// in several parallel test nodes. And for each test node, the BeforeSuite
// and AfterSuite will be run.
// We don't want to start services (kubelet, apiserver and etcd) for all
// parallel nodes, but we do want to set some globally shared variables which
// can be used in the tests.
// We have to use SynchronizedBeforeSuite to achieve that. The first
// function of SynchronizedBeforeSuite is only called once, and the second
// function is called in each parallel test node. The result returned by
// the first function will be the parameter of the second function.
// So we'll start all services and initialize the shared context in the first
// function, and propagate the context to all parallel test nodes in the
// second function.
// Notice no lock is needed for shared context, because context should only be
// initialized in the first function in SynchronizedBeforeSuite. After that
// it should never be modified.
var context SharedContext
var prePullImages = flag.Bool("prepull-images", true, "If true, prepull images so image pull failures do not cause test failures.")
var junitFileNumber = flag.Int("junit-file-number", 1, "Used to create junit filename - e.g. junit_01.xml.")
@ -77,17 +98,10 @@ func TestE2eNode(t *testing.T) {
}
// Setup the kubelet on the node
var _ = BeforeSuite(func() {
var _ = SynchronizedBeforeSuite(func() []byte {
if *buildServices {
buildGo()
}
if framework.TestContext.NodeName == "" {
output, err := exec.Command("hostname").CombinedOutput()
if err != nil {
glog.Fatalf("Could not get node name from hostname %v. Output:\n%s", err, output)
}
framework.TestContext.NodeName = strings.TrimSpace(fmt.Sprintf("%s", output))
}
// Pre-pull the images tests depend on so we can fail immediately if there is an image pull issue
// This helps with debugging test flakes since it is hard to tell when a test failure is due to image pulling.
@ -102,8 +116,9 @@ var _ = BeforeSuite(func() {
// We should mask locksmithd when provisioning the machine.
maskLocksmithdOnCoreos()
shared := &SharedContext{}
if *startServices {
e2es = newE2eService(framework.TestContext.NodeName, framework.TestContext.CgroupsPerQOS)
e2es = newE2eService(framework.TestContext.NodeName, framework.TestContext.CgroupsPerQOS, shared)
if err := e2es.start(); err != nil {
Fail(fmt.Sprintf("Unable to start node services.\n%v", err))
}
@ -117,10 +132,28 @@ var _ = BeforeSuite(func() {
// Reference common test to make the import valid.
commontest.CurrentSuite = commontest.NodeE2E
data, err := json.Marshal(shared)
Expect(err).NotTo(HaveOccurred())
return data
}, func(data []byte) {
// Set the shared context got from the synchronized initialize function
shared := &SharedContext{}
Expect(json.Unmarshal(data, shared)).To(Succeed())
context = *shared
if framework.TestContext.NodeName == "" {
hostname, err := os.Hostname()
if err != nil {
glog.Fatalf("Could not get node name: %v", err)
}
framework.TestContext.NodeName = hostname
}
})
// Tear down the kubelet on the node
var _ = AfterSuite(func() {
var _ = SynchronizedAfterSuite(func() {}, func() {
if e2es != nil {
e2es.getLogFiles()
if *startServices && *stopServices {

View File

@ -42,11 +42,11 @@ type e2eService struct {
killCmds []*killCmd
rmDirs []string
etcdDataDir string
kubeletStaticPodDir string
nodeName string
logFiles map[string]logFileData
cgroupsPerQOS bool
context *SharedContext
etcdDataDir string
nodeName string
logFiles map[string]logFileData
cgroupsPerQOS bool
}
type logFileData struct {
@ -61,7 +61,7 @@ const (
defaultEtcdPath = "/tmp/etcd"
)
func newE2eService(nodeName string, cgroupsPerQOS bool) *e2eService {
func newE2eService(nodeName string, cgroupsPerQOS bool, context *SharedContext) *e2eService {
// Special log files that need to be collected for additional debugging.
var logFiles = map[string]logFileData{
"kern.log": {[]string{"/var/log/kern.log"}, []string{"-k"}},
@ -69,6 +69,7 @@ func newE2eService(nodeName string, cgroupsPerQOS bool) *e2eService {
}
return &e2eService{
context: context,
nodeName: nodeName,
logFiles: logFiles,
cgroupsPerQOS: cgroupsPerQOS,
@ -101,7 +102,7 @@ func (es *e2eService) start() error {
return err
}
es.killCmds = append(es.killCmds, cmd)
es.rmDirs = append(es.rmDirs, es.kubeletStaticPodDir)
es.rmDirs = append(es.rmDirs, es.context.PodConfigPath)
return nil
}
@ -222,7 +223,7 @@ func (es *e2eService) startKubeletServer() (*killCmd, error) {
if err != nil {
return nil, err
}
es.kubeletStaticPodDir = dataDir
es.context.PodConfigPath = dataDir
var killOverride *exec.Cmd
cmdArgs := []string{}
if systemdRun, err := exec.LookPath("systemd-run"); err == nil {
@ -247,7 +248,7 @@ func (es *e2eService) startKubeletServer() (*killCmd, error) {
"--volume-stats-agg-period", "10s", // Aggregate volumes frequently so tests don't need to wait as long
"--allow-privileged", "true",
"--serialize-image-pulls", "false",
"--config", es.kubeletStaticPodDir,
"--config", es.context.PodConfigPath,
"--file-check-frequency", "10s", // Check file frequently so tests won't wait too long
"--v", LOG_VERBOSITY_LEVEL, "--logtostderr",
"--pod-cidr=10.180.0.0/24", // Assign a fixed CIDR to the node because there is no node controller.

View File

@ -40,6 +40,7 @@ make generated_files
go generate test/e2e/framework/gobindata_util.go
go build test/e2e_node/environment/conformance.go
PARALLELISM=${PARALLELISM:-8}
WORKSPACE=${WORKSPACE:-"/tmp/"}
ARTIFACTS=${WORKSPACE}/_artifacts
@ -48,5 +49,5 @@ go run test/e2e_node/runner/run_e2e.go --logtostderr --vmodule=*=2 --ssh-env="g
--zone="$GCE_ZONE" --project="$GCE_PROJECT" --hosts="$GCE_HOSTS" \
--images="$GCE_IMAGES" --image-project="$GCE_IMAGE_PROJECT" \
--image-config-file="$GCE_IMAGE_CONFIG_PATH" --cleanup="$CLEANUP" \
--results-dir="$ARTIFACTS" --ginkgo-flags="$GINKGO_FLAGS" \
--results-dir="$ARTIFACTS" --ginkgo-flags="--nodes=$PARALLELISM $GINKGO_FLAGS" \
--setup-node="$SETUP_NODE" --test_args="$TEST_ARGS" --instance-metadata="$GCE_INSTANCE_METADATA"

View File

@ -37,14 +37,14 @@ import (
var _ = framework.KubeDescribe("MirrorPod", func() {
f := framework.NewDefaultFramework("mirror-pod")
Context("when create a mirror pod ", func() {
var staticPodName, mirrorPodName string
var ns, staticPodName, mirrorPodName string
BeforeEach(func() {
ns := f.Namespace.Name
ns = f.Namespace.Name
staticPodName = "static-pod-" + string(util.NewUUID())
mirrorPodName = staticPodName + "-" + e2es.nodeName
mirrorPodName = staticPodName + "-" + framework.TestContext.NodeName
By("create the static pod")
err := createStaticPod(e2es.kubeletStaticPodDir, staticPodName, ns, ImageRegistry[nginxImage], api.RestartPolicyAlways)
err := createStaticPod(context.PodConfigPath, staticPodName, ns, ImageRegistry[nginxImage], api.RestartPolicyAlways)
Expect(err).ShouldNot(HaveOccurred())
By("wait for the mirror pod to be running")
@ -53,7 +53,6 @@ var _ = framework.KubeDescribe("MirrorPod", func() {
}, 2*time.Minute, time.Second*4).Should(BeNil())
})
It("should be updated when static pod updated", func() {
ns := f.Namespace.Name
By("get mirror pod uid")
pod, err := f.Client.Pods(ns).Get(mirrorPodName)
Expect(err).ShouldNot(HaveOccurred())
@ -61,7 +60,7 @@ var _ = framework.KubeDescribe("MirrorPod", func() {
By("update the static pod container image")
image := ImageRegistry[pauseImage]
err = createStaticPod(e2es.kubeletStaticPodDir, staticPodName, ns, image, api.RestartPolicyAlways)
err = createStaticPod(context.PodConfigPath, staticPodName, ns, image, api.RestartPolicyAlways)
Expect(err).ShouldNot(HaveOccurred())
By("wait for the mirror pod to be updated")
@ -76,7 +75,6 @@ var _ = framework.KubeDescribe("MirrorPod", func() {
Expect(pod.Spec.Containers[0].Image).Should(Equal(image))
})
It("should be recreated when mirror pod gracefully deleted", func() {
ns := f.Namespace.Name
By("get mirror pod uid")
pod, err := f.Client.Pods(ns).Get(mirrorPodName)
Expect(err).ShouldNot(HaveOccurred())
@ -92,7 +90,6 @@ var _ = framework.KubeDescribe("MirrorPod", func() {
}, 2*time.Minute, time.Second*4).Should(BeNil())
})
It("should be recreated when mirror pod forcibly deleted", func() {
ns := f.Namespace.Name
By("get mirror pod uid")
pod, err := f.Client.Pods(ns).Get(mirrorPodName)
Expect(err).ShouldNot(HaveOccurred())
@ -108,9 +105,8 @@ var _ = framework.KubeDescribe("MirrorPod", func() {
}, 2*time.Minute, time.Second*4).Should(BeNil())
})
AfterEach(func() {
ns := f.Namespace.Name
By("delete the static pod")
err := deleteStaticPod(e2es.kubeletStaticPodDir, staticPodName, ns)
err := deleteStaticPod(context.PodConfigPath, staticPodName, ns)
Expect(err).ShouldNot(HaveOccurred())
By("wait for the mirror pod to disappear")

View File

@ -26,3 +26,7 @@ var disableKubenet = flag.Bool("disable-kubenet", false, "If true, start kubelet
var buildServices = flag.Bool("build-services", true, "If true, build local executables")
var startServices = flag.Bool("start-services", true, "If true, start local node services")
var stopServices = flag.Bool("stop-services", true, "If true, stop local node services after running tests")
// SharedContext holds state shared by all parallel ginkgo test nodes.
// It is initialized once (in the first function of SynchronizedBeforeSuite,
// serialized as JSON) and propagated to every parallel node; it must not be
// modified after initialization, so no locking is needed.
type SharedContext struct {
// PodConfigPath is the static pod manifest directory watched by the
// kubelet (passed to the kubelet via its --config flag).
PodConfigPath string
}