From 40a33dcc7ea957bd539ffbe78b6fa41120537d40 Mon Sep 17 00:00:00 2001 From: Eric Tune Date: Sat, 24 Oct 2015 22:51:02 -0700 Subject: [PATCH 1/2] Fix e2e test flakes. Makes number of failures per pod fixed at 1, for the RestartOnFailure case, which prevents Kubelet restart backoff, which causes test timeout. For RestartNever tests, it keeps using the random success/failure. Fixes #15389. Re-enables previously flaky e2e. --- hack/jenkins/e2e.sh | 1 - test/e2e/job.go | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/hack/jenkins/e2e.sh b/hack/jenkins/e2e.sh index c3945f25b07..46ed1c6ee95 100755 --- a/hack/jenkins/e2e.sh +++ b/hack/jenkins/e2e.sh @@ -135,7 +135,6 @@ DISRUPTIVE_TESTS=( GCE_FLAKY_TESTS=( "DaemonRestart\sController\sManager" "Daemon\sset\sshould\srun\sand\sstop\scomplex\sdaemon" - "Jobs\sare\slocally\srestarted" "Resource\susage\sof\ssystem\scontainers" "allows\sscheduling\sof\spods\son\sa\sminion\safter\sit\srejoins\sthe\scluster" # file: resize_nodes.go, issue: #13258 "pod\sw/two\sRW\sPDs\sboth\smounted\sto\sone\scontainer,\swrite\sto\sPD" # file: pd.go, issue: #15382 diff --git a/test/e2e/job.go b/test/e2e/job.go index 04e602de717..39ec0169151 100644 --- a/test/e2e/job.go +++ b/test/e2e/job.go @@ -64,8 +64,14 @@ var _ = Describe("Job", func() { It("should run a job to completion when tasks sometimes fail and are locally restarted", func() { SkipIfProviderIs("gke") By("Creating a job") - // 50% chance of container success, local restarts. - job := newTestJob("randomlySucceedOrFail", "rand-local", api.RestartPolicyOnFailure, parallelism, completions) + // One failure, then a success, local restarts. 
+ // We can't use the random failure approach used by the + // non-local test below, because kubelet will throttle + // frequently failing containers in a given pod, ramping + // up to 5 minutes between restarts, making test timeouts + // due to successive failures too likely with a reasonable + // test timeout. + job := newTestJob("failOnce", "fail-once-local", api.RestartPolicyOnFailure, parallelism, completions) job, err := createJob(f.Client, f.Namespace.Name, job) Expect(err).NotTo(HaveOccurred()) @@ -79,6 +85,11 @@ var _ = Describe("Job", func() { SkipIfProviderIs("gke") By("Creating a job") // 50% chance of container success, local restarts. + // Can't use the failOnce approach because that relies + // on an emptyDir, which is not preserved across new pods. + // Worst case analysis: 15 failures, each taking 1 minute to + // run doe to some slowness, 1 in 2^15 chance of happening, + // causing test flake. Should be very rare. job := newTestJob("randomlySucceedOrFail", "rand-non-local", api.RestartPolicyNever, parallelism, completions) job, err := createJob(f.Client, f.Namespace.Name, job) Expect(err).NotTo(HaveOccurred()) @@ -197,11 +208,25 @@ func newTestJob(behavior, name string, rPol api.RestartPolicy, parallelism, comp }, Spec: api.PodSpec{ RestartPolicy: rPol, + Volumes: []api.Volume{ + { + Name: "data", + VolumeSource: api.VolumeSource{ + EmptyDir: &api.EmptyDirVolumeSource{}, + }, + }, + }, Containers: []api.Container{ { Name: "c", Image: "gcr.io/google_containers/busybox", Command: []string{}, + VolumeMounts: []api.VolumeMount{ + { + MountPath: "/data", + Name: "data", + }, + }, }, }, }, @@ -219,6 +244,13 @@ func newTestJob(behavior, name string, rPol api.RestartPolicy, parallelism, comp // Bash's $RANDOM generates pseudorandom int in range 0 - 32767. // Dividing by 16384 gives roughly 50/50 chance of success. 
job.Spec.Template.Spec.Containers[0].Command = []string{"/bin/sh", "-c", "exit $(( $RANDOM / 16384 ))"} + case "failOnce": + // Fail the first time the container of the pod is run, and + // succeed the second time. Checks for file on emptydir. + // If present, succeed. If not, create but fail. + // Note that this cannot be used with RestartNever because + // it always fails the first time for a pod. + job.Spec.Template.Spec.Containers[0].Command = []string{"/bin/sh", "-c", "if [[ -r /data/foo ]] ; then exit 0 ; else touch /data/foo ; exit 1 ; fi"} } return job } From c2f6768cbc08a2722821df9e12995ee47a4404b9 Mon Sep 17 00:00:00 2001 From: Eric Tune Date: Mon, 26 Oct 2015 16:36:01 -0700 Subject: [PATCH 2/2] Fix. --- test/e2e/job.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/job.go b/test/e2e/job.go index 39ec0169151..6b0d3e2aac6 100644 --- a/test/e2e/job.go +++ b/test/e2e/job.go @@ -88,7 +88,7 @@ var _ = Describe("Job", func() { // Can't use the failOnce approach because that relies // on an emptyDir, which is not preserved across new pods. // Worst case analysis: 15 failures, each taking 1 minute to - // run doe to some slowness, 1 in 2^15 chance of happening, + // run due to some slowness, 1 in 2^15 chance of happening, // causing test flake. Should be very rare. job := newTestJob("randomlySucceedOrFail", "rand-non-local", api.RestartPolicyNever, parallelism, completions) job, err := createJob(f.Client, f.Namespace.Name, job)