Merge pull request #125485 from mimowo/refactor-job-e2e-for-conformance

Split Job e2e tests to make them possible targets for conformance promotion
Commit 4e25953c8b, authored by Kubernetes Prow Robot on 2024-06-19 10:06:41 -07:00, committed via GitHub.

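For context, the sketch below is illustrative only and is not code from this PR: it shows the general shape of the refactoring, going from one table-driven Ginkgo spec to individually named specs, so that each spec can carry its own conformance-style doc comment and be promoted on its own. The package name, suite name, spec texts, and the trivial assertions are all hypothetical.

```go
// Hypothetical, self-contained example of splitting a DescribeTable into It specs.
package jobsplit

import (
	"testing"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
)

// TestJobSplitExample bootstraps the Ginkgo suite for this example file.
func TestJobSplitExample(t *testing.T) {
	gomega.RegisterFailHandler(ginkgo.Fail)
	ginkgo.RunSpecs(t, "Job split example suite")
}

// Before: a single DescribeTable generates one spec per Entry. The spec text comes
// from the table, so a per-test /* Testname: ... */ comment cannot be attached to
// each individual case.
var _ = ginkgo.DescribeTable("ignoring pod failures that match the policy",
	func(ignoredExitCodes []int32) {
		gomega.Expect(ignoredExitCodes).To(gomega.ContainElement(int32(1)))
	},
	ginkgo.Entry("Ignore DisruptionTarget condition", []int32{1}),
	ginkgo.Entry("Ignore exit code 137", []int32{1, 137}),
)

// After: each variant becomes its own It, so every spec has a stable, individually
// promotable name and can be preceded by its own conformance doc comment.
var _ = ginkgo.Describe("Job pod failure policy (split)", func() {
	ginkgo.It("should ignore failure matching on the DisruptionTarget condition", func() {
		gomega.Expect([]int32{1}).To(gomega.ContainElement(int32(1)))
	})

	ginkgo.It("should ignore failure matching on the exit code", func() {
		gomega.Expect([]int32{1, 137}).To(gomega.ContainElement(int32(137)))
	})
})
```

The diff below applies this pattern to the Job e2e tests: the table with two entries is removed, and each entry becomes a standalone `ginkgo.It` with a `/* Testname: ... */` block.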

@@ -173,74 +173,120 @@ var _ = SIGDescribe("Job", func() {
 		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
 	})
-	// This test is using an indexed job. The pod corresponding to the 0th index
-	// creates a marker file on the host and runs 'forever' until evicted. We use
-	// the non-0-indexed pods to determine if the marker file is already
-	// created by the 0th indexed pod - the non-0-indexed pods fail and restart
-	// until the marker file is created (their potential failures are ignored
-	// based on the exit code). Once the marker file is created the 0th indexed
-	// pod is evicted (DisruptionTarget condition is added in the process),
-	// after restart it runs to successful completion.
-	// Steps:
-	// 1. Select a node to run all Job's pods to ensure the host marker file is accessible by all pods
-	// 2. Create the indexed job
-	// 3. Await for all non-0-indexed pods to succeed to ensure the marker file is created by the 0-indexed pod
-	// 4. Make sure the 0-indexed pod is running
-	// 5. Evict the 0-indexed pod
-	// 6. Await for the job to successfully complete
-	ginkgo.DescribeTable("Using a pod failure policy to not count some failures towards the backoffLimit",
-		func(ctx context.Context, policy *batchv1.PodFailurePolicy) {
-			mode := batchv1.IndexedCompletion
-
-			// We set the backoffLimit to 0 so that any pod failure would trigger
-			// job failure if not for the pod failure policy to ignore the failed
-			// pods from counting them towards the backoffLimit.
-			parallelism := int32(2)
-			completions := int32(4)
-			backoffLimit := int32(0)
-
-			ginkgo.By("Looking for a node to schedule job pods")
-			node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
-			framework.ExpectNoError(err)
-
-			ginkgo.By("Creating a job")
-			job := e2ejob.NewTestJobOnNode("notTerminateOnce", "pod-disruption-failure-ignore", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
-			job.Spec.CompletionMode = &mode
-			job.Spec.PodFailurePolicy = policy
-			job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
-			framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
-
-			ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
-			err = e2ejob.WaitForJobPodsSucceeded(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions-1)
-			framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)
-
-			ginkgo.By("Awaiting for the 0-indexed pod to be running")
-			err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
-			framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)
-
-			pods, err := e2ejob.GetAllRunningJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
-			framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
-			gomega.Expect(pods).To(gomega.HaveLen(1), "Exactly one running pod is expected")
-			pod := pods[0]
-			ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
-			evictTarget := &policyv1.Eviction{
-				ObjectMeta: metav1.ObjectMeta{
-					Name:      pod.Name,
-					Namespace: pod.Namespace,
-				},
-			}
-			f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(context.TODO(), evictTarget)
-			framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
-
-			ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
-			err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
-			framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)
-
-			ginkgo.By("Ensuring job reaches completions")
-			err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
-			framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
-		},
-		ginkgo.Entry("Ignore DisruptionTarget condition", &batchv1.PodFailurePolicy{
+	/*
+		Testname: Ensure pod failure policy allows to ignore failure for an evicted pod; matching on the exit code
+		Description: This test is using an indexed job. The pod corresponding to the 0th index
+		creates a marker file on the host and runs 'forever' until evicted. We use
+		the non-0-indexed pods to determine if the marker file is already
+		created by the 0th indexed pod - the non-0-indexed pods fail and restart
+		until the marker file is created (their potential failures are ignored
+		based on the exit code). Once the marker file is created the 0th indexed
+		pod is evicted (DisruptionTarget condition is added in the process),
+		after restart it runs to successful completion.
+		Steps:
+		1. Select a node to run all Job's pods to ensure the host marker file is accessible by all pods
+		2. Create the indexed job with pod failure policy which ignores failed pods with 137 exit code
+		3. Await for all non-0-indexed pods to succeed to ensure the marker file is created by the 0-indexed pod
+		4. Make sure the 0-indexed pod is running
+		5. Evict the 0-indexed pod, the failure is ignored as it matches the pod failure policy
+		6. Await for the job to successfully complete
+	*/
+	ginkgo.It("should allow to use a pod failure policy to ignore failure for an evicted pod; matching on the exit code", func(ctx context.Context) {
+		// We set the backoffLimit to 0 so that any pod failure would trigger
+		// job failure if not for the pod failure policy to ignore the failed
+		// pods from counting them towards the backoffLimit.
+		parallelism := int32(2)
+		completions := int32(4)
+		backoffLimit := int32(0)
+
+		ginkgo.By("Looking for a node to schedule job pods")
+		node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
+		framework.ExpectNoError(err)
+
+		ginkgo.By("Creating a job")
+		job := e2ejob.NewTestJobOnNode("notTerminateOnce", "evicted-pod-ignore-on-exit-code", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
+		job.Spec.CompletionMode = ptr.To(batchv1.IndexedCompletion)
+		job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
+			Rules: []batchv1.PodFailurePolicyRule{
+				{
+					// Ignore failures of the non 0-indexed pods which fail until the marker file is created
+					// And the 137 in the 0-indexed pod due to eviction.
+					Action: batchv1.PodFailurePolicyActionIgnore,
+					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
+						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
+						Values:   []int32{1, 137},
+					},
+				},
+			},
+		}
+		job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
+		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
+
+		ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
+		err = e2ejob.WaitForJobPodsSucceeded(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions-1)
+		framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)
+
+		ginkgo.By("Awaiting for the 0-indexed pod to be running")
+		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
+		framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)
+
+		pods, err := e2ejob.GetAllRunningJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
+		framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
+		gomega.Expect(pods).To(gomega.HaveLen(1), "Exactly one running pod is expected")
+		pod := pods[0]
+		ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
+		evictTarget := &policyv1.Eviction{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      pod.Name,
+				Namespace: pod.Namespace,
+			},
+		}
+		err = f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(ctx, evictTarget)
+		framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
+
+		ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
+		err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
+		framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)
+
+		ginkgo.By("Ensuring job reaches completions")
+		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
+		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
+	})
+
+	/*
+		Testname: Ensure pod failure policy allows to ignore failure for an evicted pod; matching on the DisruptionTarget condition
+		Description: This test is using an indexed job. The pod corresponding to the 0th index
+		creates a marker file on the host and runs 'forever' until evicted. We use
+		the non-0-indexed pods to determine if the marker file is already
+		created by the 0th indexed pod - the non-0-indexed pods fail and restart
+		until the marker file is created (their potential failures are ignored
+		based on the exit code). Once the marker file is created the 0th indexed
+		pod is evicted (DisruptionTarget condition is added in the process),
+		after restart it runs to successful completion.
+		Steps:
+		1. Select a node to run all Job's pods to ensure the host marker file is accessible by all pods
+		2. Create the indexed job with pod failure policy which ignores failed pods with DisruptionTarget condition
+		3. Await for all non-0-indexed pods to succeed to ensure the marker file is created by the 0-indexed pod
+		4. Make sure the 0-indexed pod is running
+		5. Evict the 0-indexed pod, the failure is ignored as it matches the pod failure policy
+		6. Await for the job to successfully complete
+	*/
+	ginkgo.It("should allow to use a pod failure policy to ignore failure for an evicted pod; matching on the DisruptionTarget condition", func(ctx context.Context) {
+		// We set the backoffLimit to 0 so that any pod failure would trigger
+		// job failure if not for the pod failure policy to ignore the failed
+		// pods from counting them towards the backoffLimit.
+		parallelism := int32(2)
+		completions := int32(4)
+		backoffLimit := int32(0)
+
+		ginkgo.By("Looking for a node to schedule job pods")
+		node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
+		framework.ExpectNoError(err)
+
+		ginkgo.By("Creating a job")
+		job := e2ejob.NewTestJobOnNode("notTerminateOnce", "evicted-pod-ignore-on-disruption-condition", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
+		job.Spec.CompletionMode = ptr.To(batchv1.IndexedCompletion)
+		job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
 			Rules: []batchv1.PodFailurePolicyRule{
 				{
 					// Ignore failures of the non 0-indexed pods which fail until the marker file is created
@@ -261,21 +307,40 @@ var _ = SIGDescribe("Job", func() {
 				},
 			},
 		},
-		}),
-		ginkgo.Entry("Ignore exit code 137", &batchv1.PodFailurePolicy{
-			Rules: []batchv1.PodFailurePolicyRule{
-				{
-					// Ignore failures of the non 0-indexed pods which fail until the marker file is created
-					// And the 137 in the 0-indexed pod due to eviction.
-					Action: batchv1.PodFailurePolicyActionIgnore,
-					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
-						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
-						Values:   []int32{1, 137},
-					},
-				},
-			},
-		}),
-	)
+		}
+		job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
+		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
+
+		ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
+		err = e2ejob.WaitForJobPodsSucceeded(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions-1)
+		framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)
+
+		ginkgo.By("Awaiting for the 0-indexed pod to be running")
+		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
+		framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)
+
+		pods, err := e2ejob.GetAllRunningJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
+		framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
+		gomega.Expect(pods).To(gomega.HaveLen(1), "Exactly one running pod is expected")
+		pod := pods[0]
+		ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
+		evictTarget := &policyv1.Eviction{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      pod.Name,
+				Namespace: pod.Namespace,
+			},
+		}
+		err = f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(ctx, evictTarget)
+		framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
+
+		ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
+		err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
+		framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)
+
+		ginkgo.By("Ensuring job reaches completions")
+		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
+		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
+	})

 	ginkgo.It("should not create pods when created in suspend state", func(ctx context.Context) {
 		parallelism := int32(2)