Merge pull request #16196 from erictune/job-e2e-fix

Fix e2e test flakes.
Daniel Smith 2015-10-27 15:04:57 -07:00
commit e1901bf891
2 changed files with 34 additions and 3 deletions


@@ -140,7 +140,6 @@ DISRUPTIVE_TESTS=(
GCE_FLAKY_TESTS=(
"DaemonRestart\sController\sManager"
"Daemon\sset\sshould\srun\sand\sstop\scomplex\sdaemon"
"Jobs\sare\slocally\srestarted"
"Resource\susage\sof\ssystem\scontainers"
"allows\sscheduling\sof\spods\son\sa\sminion\safter\sit\srejoins\sthe\scluster" # file: resize_nodes.go, issue: #13258
"deployment.*\sin\sthe\sright\sorder" # issue: #15369


@@ -64,8 +64,14 @@ var _ = Describe("Job", func() {
It("should run a job to completion when tasks sometimes fail and are locally restarted", func() {
SkipIfProviderIs("gke")
By("Creating a job")
-// 50% chance of container success, local restarts.
-job := newTestJob("randomlySucceedOrFail", "rand-local", api.RestartPolicyOnFailure, parallelism, completions)
+// One failure, then a success, local restarts.
+// We can't use the random failure approach used by the
+// non-local test below, because kubelet will throttle
+// frequently failing containers in a given pod, ramping
+// up to 5 minutes between restarts, making test timeouts
+// due to successive failures too likely with a reasonable
+// test timeout.
+job := newTestJob("failOnce", "fail-once-local", api.RestartPolicyOnFailure, parallelism, completions)
job, err := createJob(f.Client, f.Namespace.Name, job)
Expect(err).NotTo(HaveOccurred())
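
A rough aside on the new comment above (not part of the change): assuming the kubelet's crash backoff starts around 10 seconds and doubles up to the 5-minute cap the comment mentions, the waits between in-pod restarts add up quickly, which is why a container that fails randomly half the time cannot be relied on to finish inside the test timeout. A minimal Go sketch of that arithmetic, with the initial delay as an assumption:

// Cumulative wait after N consecutive in-pod failures under a doubling,
// capped restart backoff. The 10s starting delay is an assumption; the
// 5 minute cap is the figure quoted in the comment above.
package main

import (
	"fmt"
	"time"
)

func main() {
	delay := 10 * time.Second   // assumed initial backoff
	maxDelay := 5 * time.Minute // cap mentioned in the comment
	total := time.Duration(0)
	for failures := 1; failures <= 8; failures++ {
		total += delay
		fmt.Printf("failure %d: next restart after %v, total waited %v\n", failures, delay, total)
		delay *= 2
		if delay > maxDelay {
			delay = maxDelay
		}
	}
}

After only a handful of consecutive failures the accumulated delay already runs to many minutes, hence the switch to a container that fails exactly once.
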
@@ -79,6 +85,11 @@ var _ = Describe("Job", func() {
SkipIfProviderIs("gke")
By("Creating a job")
// 50% chance of container success, local restarts.
+// Can't use the failOnce approach because that relies
+// on an emptyDir, which is not preserved across new pods.
+// Worst case analysis: 15 failures, each taking 1 minute to
+// run due to some slowness, 1 in 2^15 chance of happening,
+// causing test flake. Should be very rare.
job := newTestJob("randomlySucceedOrFail", "rand-non-local", api.RestartPolicyNever, parallelism, completions)
job, err := createJob(f.Client, f.Namespace.Name, job)
Expect(err).NotTo(HaveOccurred())
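
For reference, the worst-case estimate in the comment above works out to roughly one flake in 32768 runs. A small sketch of that calculation (the constants mirror the comment; this is not code from the change):

// Flake estimate from the comment: each attempt fails with probability 1/2
// and about 15 attempts fit in the test timeout, so the chance that a pod
// never succeeds in time is about 1 in 2^15.
package main

import (
	"fmt"
	"math"
)

func main() {
	pFail := 0.5     // per-attempt failure chance of randomlySucceedOrFail
	attempts := 15.0 // attempts assumed to fit within the test timeout
	pFlake := math.Pow(pFail, attempts)
	fmt.Printf("P(all %.0f attempts fail) = %.6f (about 1 in %.0f)\n", attempts, pFlake, 1/pFlake)
}
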
@@ -197,11 +208,25 @@ func newTestJob(behavior, name string, rPol api.RestartPolicy, parallelism, comp
},
Spec: api.PodSpec{
RestartPolicy: rPol,
+Volumes: []api.Volume{
+{
+Name: "data",
+VolumeSource: api.VolumeSource{
+EmptyDir: &api.EmptyDirVolumeSource{},
+},
+},
+},
Containers: []api.Container{
{
Name: "c",
Image: "gcr.io/google_containers/busybox",
Command: []string{},
+VolumeMounts: []api.VolumeMount{
+{
+MountPath: "/data",
+Name: "data",
+},
+},
},
},
},
@@ -219,6 +244,13 @@ func newTestJob(behavior, name string, rPol api.RestartPolicy, parallelism, comp
// Bash's $RANDOM generates pseudorandom int in range 0 - 32767.
// Dividing by 16384 gives roughly 50/50 chance of success.
job.Spec.Template.Spec.Containers[0].Command = []string{"/bin/sh", "-c", "exit $(( $RANDOM / 16384 ))"}
case "failOnce":
// Fail the first the container of the pod is run, and
// succeed the second time. Checks for file on emptydir.
// If present, succeed. If not, create but fail.
// Note that this cannot be used with RestartNever because
// it always fails the first time for a pod.
job.Spec.Template.Spec.Containers[0].Command = []string{"/bin/sh", "-c", "if [[ -r /data/foo ]] ; then exit 0 ; else touch /data/foo ; exit 1 ; fi"}
}
return job
}
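
To close, a purely hypothetical Go rendering of the failOnce one-liner above, included only to make the restart-policy trade-off explicit: the marker file sits on the pod's emptyDir, so it survives in-place container restarts under RestartPolicyOnFailure but not the fresh pods created under RestartPolicyNever, which is why the non-local test keeps the random-exit behavior.

// Stand-in for: if [[ -r /data/foo ]] ; then exit 0 ; else touch /data/foo ; exit 1 ; fi
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// failOnce fails the first time it runs against a data directory and
// succeeds on any later attempt that finds the marker file it left behind.
func failOnce(dataDir string) int {
	marker := filepath.Join(dataDir, "foo")
	if _, err := os.Stat(marker); err == nil {
		return 0 // marker present: an earlier attempt already ran here
	}
	if err := os.WriteFile(marker, nil, 0644); err != nil {
		return 1 // treat write errors as a failed attempt
	}
	return 1 // first attempt: leave the marker behind and fail
}

func main() {
	dir, err := os.MkdirTemp("", "emptydir") // stands in for the pod's emptyDir
	if err != nil {
		panic(err)
	}
	defer os.RemoveAll(dir)
	fmt.Println("attempt 1 exits", failOnce(dir)) // prints 1
	fmt.Println("attempt 2 exits", failOnce(dir)) // prints 0
}
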