Merge pull request #113927 from alculquicondor/job-wait-finish

Add e2e test to ignore failures with 137 exit code
2026-01-05 07:27:21 +00:00 · 2022-11-15 12:32:48 -08:00
parent 591fc0d8ab f40debc8c5
commit e39a0af5ce
1 changed files with 62 additions and 46 deletions
--- a/test/e2e/apps/job.go
+++ b/test/e2e/apps/job.go
@@ -185,22 +185,57 @@ var _ = SIGDescribe("Job", func() {
 	// 4. Make sure the 0-indexed pod is running
 	// 5. Evict the 0-indexed pod
 	// 6. Await for the job to successfully complete
-	ginkgo.It("should allow to use the pod failure policy to not count pod disruption towards the backoffLimit", func() {
-		mode := batchv1.IndexedCompletion
+	ginkgo.DescribeTable("Using a pod failure policy to not count some failures towards the backoffLimit",
+		func(policy *batchv1.PodFailurePolicy) {
+			mode := batchv1.IndexedCompletion

-		// We set the backoffLimit to 0 so that any pod failure would trigger
-		// job failure if not for the pod failure policy to ignore the failed
-		// pods from counting them towards the backoffLimit.
-		backoffLimit := int32(0)
+			// We set the backoffLimit to 0 so that any pod failure would trigger
+			// job failure if not for the pod failure policy to ignore the failed
+			// pods from counting them towards the backoffLimit.
+			backoffLimit := int32(0)

-		ginkgo.By("Looking for a node to schedule job pods")
-		node, err := e2enode.GetRandomReadySchedulableNode(f.ClientSet)
-		framework.ExpectNoError(err)
+			ginkgo.By("Looking for a node to schedule job pods")
+			node, err := e2enode.GetRandomReadySchedulableNode(f.ClientSet)
+			framework.ExpectNoError(err)

-		ginkgo.By("Creating a job")
-		job := e2ejob.NewTestJobOnNode("notTerminateOnce", "pod-disruption-failure-ignore", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
-		job.Spec.CompletionMode = &mode
-		job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
+			ginkgo.By("Creating a job")
+			job := e2ejob.NewTestJobOnNode("notTerminateOnce", "pod-disruption-failure-ignore", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
+			job.Spec.CompletionMode = &mode
+			job.Spec.PodFailurePolicy = policy
+			job, err = e2ejob.CreateJob(f.ClientSet, f.Namespace.Name, job)
+			framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
+
+			ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
+			err = e2ejob.WaitForJobPodsSucceeded(f.ClientSet, f.Namespace.Name, job.Name, completions-1)
+			framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)
+
+			ginkgo.By("Awaiting for the 0-indexed pod to be running")
+			err = e2ejob.WaitForJobPodsRunning(f.ClientSet, f.Namespace.Name, job.Name, 1)
+			framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)
+
+			pods, err := e2ejob.GetAllRunningJobPods(f.ClientSet, f.Namespace.Name, job.Name)
+			framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
+			framework.ExpectEqual(len(pods), 1, "Exactly one running pod is expected")
+			pod := pods[0]
+			ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
+			evictTarget := &policyv1.Eviction{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      pod.Name,
+					Namespace: pod.Namespace,
+				},
+			}
+			err = f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(context.TODO(), evictTarget)
+			framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
+
+			ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
+			err = e2epod.WaitForPodNotFoundInNamespace(f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
+			framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)
+
+			ginkgo.By("Ensuring job reaches completions")
+			err = e2ejob.WaitForJobComplete(f.ClientSet, f.Namespace.Name, job.Name, completions)
+			framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
+		},
+		ginkgo.Entry("Ignore DisruptionTarget condition", &batchv1.PodFailurePolicy{
 			Rules: []batchv1.PodFailurePolicyRule{
 				{
 					// Ignore failures of the non 0-indexed pods which fail until the marker file is created
@@ -221,40 +256,21 @@ var _ = SIGDescribe("Job", func() {
 					},
 				},
 			},
-		}
-		job, err = e2ejob.CreateJob(f.ClientSet, f.Namespace.Name, job)
-		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
-
-		ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
-		err = e2ejob.WaitForJobPodsSucceeded(f.ClientSet, f.Namespace.Name, job.Name, completions-1)
-		framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)
-
-		ginkgo.By("Awaiting for the 0-indexed pod to be running")
-		err = e2ejob.WaitForJobPodsRunning(f.ClientSet, f.Namespace.Name, job.Name, 1)
-		framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)
-
-		pods, err := e2ejob.GetAllRunningJobPods(f.ClientSet, f.Namespace.Name, job.Name)
-		framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
-		framework.ExpectEqual(len(pods), 1, "Exactly one running pod is expected")
-		pod := pods[0]
-		ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
-		evictTarget := &policyv1.Eviction{
-			ObjectMeta: metav1.ObjectMeta{
-				Name:      pod.Name,
-				Namespace: pod.Namespace,
+		}),
+		ginkgo.Entry("Ignore exit code 137", &batchv1.PodFailurePolicy{
+			Rules: []batchv1.PodFailurePolicyRule{
+				{
+					// Ignore failures of the non 0-indexed pods which fail until the marker file is created
+					// And the 137 in the 0-indexed pod due to eviction.
+					Action: batchv1.PodFailurePolicyActionIgnore,
+					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
+						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
+						Values:   []int32{1, 137},
+					},
+				},
 			},
-		}
-		f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(context.TODO(), evictTarget)
-		framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
-
-		ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
-		err = e2epod.WaitForPodNotFoundInNamespace(f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
-		framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)
-
-		ginkgo.By("Ensuring job reaches completions")
-		err = e2ejob.WaitForJobComplete(f.ClientSet, f.Namespace.Name, job.Name, completions)
-		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
-	})
+		}),
+	)

 	ginkgo.It("should not create pods when created in suspend state", func() {
 		ginkgo.By("Creating a job with suspend=true")