Merge pull request #125485 from mimowo/refactor-job-e2e-for-conformance

Split Job e2e tests to make them possible targets for conformance promotion
Kubernetes Prow Robot 2024-06-19 10:06:41 -07:00 committed by GitHub
commit 4e25953c8b

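The diff below replaces a single ginkgo.DescribeTable (two ginkgo.Entry cases sharing one body) with two standalone ginkgo.It specs, each carrying its own Testname/Description block. As a rough sketch of why that shape matters, and assuming the usual Kubernetes conformance conventions (the promotion itself is not part of this commit), a documented spec of this form can later be promoted by switching ginkgo.It to framework.ConformanceIt and adding a Release line:

/*
	Release: v1.NN (filled in at promotion time)
	Testname: <as in the spec's comment>
	Description: <as in the spec's comment>
*/
framework.ConformanceIt("should allow to use a pod failure policy to ignore failure for an evicted pod; matching on the exit code", func(ctx context.Context) {
	// test body unchanged
})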

@@ -173,74 +173,120 @@ var _ = SIGDescribe("Job", func() {
framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
})
/*
Testname: Ensure pod failure policy allows to ignore failure for an evicted pod; matching on the exit code
Description: This test uses an indexed job. The pod corresponding to the 0th index
creates a marker file on the host and runs 'forever' until evicted. We use
the non-0-indexed pods to determine if the marker file is already
created by the 0th indexed pod - the non-0-indexed pods fail and restart
until the marker file is created (their potential failures are ignored
based on the exit code). Once the marker file is created the 0th indexed
pod is evicted (DisruptionTarget condition is added in the process);
after restart it runs to successful completion.
Steps:
1. Select a node to run all Job's pods to ensure the host marker file is accessible by all pods
2. Create the indexed job with a pod failure policy which ignores failed pods with the 137 exit code
3. Await for all non-0-indexed pods to succeed to ensure the marker file is created by the 0-indexed pod
4. Make sure the 0-indexed pod is running
5. Evict the 0-indexed pod; the failure is ignored as it matches the pod failure policy
6. Await for the job to successfully complete
*/
ginkgo.It("should allow to use a pod failure policy to ignore failure for an evicted pod; matching on the exit code", func(ctx context.Context) {
// We set the backoffLimit to 0 so that any pod failure would trigger
// job failure if not for the pod failure policy to ignore the failed
// pods from counting them towards the backoffLimit.
parallelism := int32(2)
completions := int32(4)
backoffLimit := int32(0)
ginkgo.By("Looking for a node to schedule job pods")
node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
framework.ExpectNoError(err)
ginkgo.By("Looking for a node to schedule job pods")
node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
framework.ExpectNoError(err)
ginkgo.By("Creating a job")
job := e2ejob.NewTestJobOnNode("notTerminateOnce", "pod-disruption-failure-ignore", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
job.Spec.CompletionMode = &mode
job.Spec.PodFailurePolicy = policy
job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
err = e2ejob.WaitForJobPodsSucceeded(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions-1)
framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)
ginkgo.By("Awaiting for the 0-indexed pod to be running")
err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)
pods, err := e2ejob.GetAllRunningJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
gomega.Expect(pods).To(gomega.HaveLen(1), "Exactly one running pod is expected")
pod := pods[0]
ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
evictTarget := &policyv1.Eviction{
ObjectMeta: metav1.ObjectMeta{
Name: pod.Name,
Namespace: pod.Namespace,
ginkgo.By("Creating a job")
job := e2ejob.NewTestJobOnNode("notTerminateOnce", "evicted-pod-ignore-on-exit-code", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
job.Spec.CompletionMode = ptr.To(batchv1.IndexedCompletion)
job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
Rules: []batchv1.PodFailurePolicyRule{
{
// Ignore failures of the non 0-indexed pods which fail until the marker file is created
// And the 137 in the 0-indexed pod due to eviction.
Action: batchv1.PodFailurePolicyActionIgnore,
OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 137},
},
},
}
f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(context.TODO(), evictTarget)
framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
},
}
job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)
ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
err = e2ejob.WaitForJobPodsSucceeded(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions-1)
framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)
ginkgo.By("Ensuring job reaches completions")
err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
},
ginkgo.Entry("Ignore DisruptionTarget condition", &batchv1.PodFailurePolicy{
ginkgo.By("Awaiting for the 0-indexed pod to be running")
err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)
pods, err := e2ejob.GetAllRunningJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
gomega.Expect(pods).To(gomega.HaveLen(1), "Exactly one running pod is expected")
pod := pods[0]
ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
evictTarget := &policyv1.Eviction{
ObjectMeta: metav1.ObjectMeta{
Name: pod.Name,
Namespace: pod.Namespace,
},
}
err = f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(ctx, evictTarget)
framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)
ginkgo.By("Ensuring job reaches completions")
err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
})
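The marker-file handshake described in the comments above is provided by the "notTerminateOnce" fixture passed to e2ejob.NewTestJobOnNode. A minimal sketch of the per-pod command such a fixture would configure, assuming a hypothetical host-mounted marker path (/data/marker) and the JOB_COMPLETION_INDEX variable injected for Indexed Jobs; this is an illustration, not the actual fixture code:

// Illustrative approximation of the pod command behind the "notTerminateOnce" fixture.
func markerHandshakeCommand() []string {
	return []string{"/bin/sh", "-c",
		// Index 0, first run: create the marker and run "forever" until evicted.
		// Non-0 indexes: exit 1 (ignored by the pod failure policy) until the marker exists.
		// Every other case, including index 0 after its restart, exits 0 and succeeds.
		`if [ "$JOB_COMPLETION_INDEX" = "0" ] && [ ! -e /data/marker ]; then
touch /data/marker && sleep 1000000
elif [ ! -e /data/marker ]; then
exit 1
fi`,
	}
}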
/*
Testname: Ensure pod failure policy allows to ignore failure for an evicted pod; matching on the DisruptionTarget condition
Description: This test uses an indexed job. The pod corresponding to the 0th index
creates a marker file on the host and runs 'forever' until evicted. We use
the non-0-indexed pods to determine if the marker file is already
created by the 0th indexed pod - the non-0-indexed pods fail and restart
until the marker file is created (their potential failures are ignored
based on the exit code). Once the marker file is created the 0th indexed
pod is evicted (DisruptionTarget condition is added in the process);
after restart it runs to successful completion.
Steps:
1. Select a node to run all Job's pods to ensure the host marker file is accessible by all pods
2. Create the indexed job with a pod failure policy which ignores failed pods with the DisruptionTarget condition
3. Await for all non-0-indexed pods to succeed to ensure the marker file is created by the 0-indexed pod
4. Make sure the 0-indexed pod is running
5. Evict the 0-indexed pod; the failure is ignored as it matches the pod failure policy
6. Await for the job to successfully complete
*/
ginkgo.It("should allow to use a pod failure policy to ignore failure for an evicted pod; matching on the DisruptionTarget condition", func(ctx context.Context) {
// We set the backoffLimit to 0 so that any pod failure would trigger
// job failure if not for the pod failure policy to ignore the failed
// pods from counting them towards the backoffLimit.
parallelism := int32(2)
completions := int32(4)
backoffLimit := int32(0)
ginkgo.By("Looking for a node to schedule job pods")
node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
framework.ExpectNoError(err)
ginkgo.By("Creating a job")
job := e2ejob.NewTestJobOnNode("notTerminateOnce", "evicted-pod-ignore-on-disruption-condition", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
job.Spec.CompletionMode = ptr.To(batchv1.IndexedCompletion)
job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
Rules: []batchv1.PodFailurePolicyRule{
{
// Ignore failures of the non 0-indexed pods which fail until the marker file is created
@@ -261,21 +307,40 @@ var _ = SIGDescribe("Job", func() {
},
},
},
}
job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
err = e2ejob.WaitForJobPodsSucceeded(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions-1)
framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)
ginkgo.By("Awaiting for the 0-indexed pod to be running")
err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)
pods, err := e2ejob.GetAllRunningJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
gomega.Expect(pods).To(gomega.HaveLen(1), "Exactly one running pod is expected")
pod := pods[0]
ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
evictTarget := &policyv1.Eviction{
ObjectMeta: metav1.ObjectMeta{
Name: pod.Name,
Namespace: pod.Namespace,
},
}
err = f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(ctx, evictTarget)
framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)
ginkgo.By("Ensuring job reaches completions")
err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
})
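The pod failure policy used by this second test sits mostly in the part of the file between the two hunks shown above, so it is not reproduced here. Purely as an illustration of the batch/v1 API it relies on (using the batchv1 and v1 aliases this file already imports; the helper name is hypothetical), an Ignore rule that matches the DisruptionTarget condition added during eviction looks roughly like this:

// Sketch of the OnPodConditions form of a pod failure policy rule.
func disruptionTargetIgnoreRule() batchv1.PodFailurePolicyRule {
	return batchv1.PodFailurePolicyRule{
		Action: batchv1.PodFailurePolicyActionIgnore,
		OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
			{
				Type:   v1.DisruptionTarget, // condition set on a pod that is being evicted/disrupted
				Status: v1.ConditionTrue,
			},
		},
	}
}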
ginkgo.It("should not create pods when created in suspend state", func(ctx context.Context) {
parallelism := int32(2)