diff --git a/test/e2e/apps/job.go b/test/e2e/apps/job.go
index 288bf6e4020..ea6abb968d6 100644
--- a/test/e2e/apps/job.go
+++ b/test/e2e/apps/job.go
@@ -173,74 +173,120 @@ var _ = SIGDescribe("Job", func() {
 		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
 	})
 
-	// This test is using an indexed job. The pod corresponding to the 0th index
-	// creates a marker file on the host and runs 'forever' until evicted. We use
-	// the non-0-indexed pods to determine if the marker file is already
-	// created by the 0th indexed pod - the non-0-indexed pods fail and restart
-	// until the marker file is created (their potential failures are ignored
-	// based on the exit code). Once the marker file is created the 0th indexed
-	// pod is evicted (DisruptionTarget condition is added in the process),
-	// after restart it runs to successful completion.
-	// Steps:
-	// 1. Select a node to run all Job's pods to ensure the host marker file is accessible by all pods
-	// 2. Create the indexed job
-	// 3. Await for all non-0-indexed pods to succeed to ensure the marker file is created by the 0-indexed pod
-	// 4. Make sure the 0-indexed pod is running
-	// 5. Evict the 0-indexed pod
-	// 6. Await for the job to successfully complete
-	ginkgo.DescribeTable("Using a pod failure policy to not count some failures towards the backoffLimit",
-		func(ctx context.Context, policy *batchv1.PodFailurePolicy) {
-			mode := batchv1.IndexedCompletion
+	/*
+		Testname: Ensure pod failure policy allows to ignore failure for an evicted pod; matching on the exit code
+		Description: This test is using an indexed job. The pod corresponding to the 0th index
+		creates a marker file on the host and runs 'forever' until evicted. We use
+		the non-0-indexed pods to determine if the marker file is already
+		created by the 0th indexed pod - the non-0-indexed pods fail and restart
+		until the marker file is created (their potential failures are ignored
+		based on the exit code). Once the marker file is created the 0th indexed
+		pod is evicted (DisruptionTarget condition is added in the process),
+		after restart it runs to successful completion.
+		Steps:
+		1. Select a node to run all Job's pods to ensure the host marker file is accessible by all pods
+		2. Create the indexed job with pod failure policy which ignores failed pods with 137 exit code
+		3. Await for all non-0-indexed pods to succeed to ensure the marker file is created by the 0-indexed pod
+		4. Make sure the 0-indexed pod is running
+		5. Evict the 0-indexed pod, the failure is ignored as it matches the pod failure policy
+		6. Await for the job to successfully complete
+	*/
+	ginkgo.It("should allow to use a pod failure policy to ignore failure for an evicted pod; matching on the exit code", func(ctx context.Context) {
+		// We set the backoffLimit to 0 so that any pod failure would trigger
+		// job failure if not for the pod failure policy to ignore the failed
+		// pods from counting them towards the backoffLimit.
+		parallelism := int32(2)
+		completions := int32(4)
+		backoffLimit := int32(0)
 
-			// We set the backoffLimit to 0 so that any pod failure would trigger
-			// job failure if not for the pod failure policy to ignore the failed
-			// pods from counting them towards the backoffLimit.
-			parallelism := int32(2)
-			completions := int32(4)
-			backoffLimit := int32(0)
+		ginkgo.By("Looking for a node to schedule job pods")
+		node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
+		framework.ExpectNoError(err)
 
-			ginkgo.By("Looking for a node to schedule job pods")
-			node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
-			framework.ExpectNoError(err)
-
-			ginkgo.By("Creating a job")
-			job := e2ejob.NewTestJobOnNode("notTerminateOnce", "pod-disruption-failure-ignore", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
-			job.Spec.CompletionMode = &mode
-			job.Spec.PodFailurePolicy = policy
-			job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
-			framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
-
-			ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
-			err = e2ejob.WaitForJobPodsSucceeded(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions-1)
-			framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)
-
-			ginkgo.By("Awaiting for the 0-indexed pod to be running")
-			err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
-			framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)
-
-			pods, err := e2ejob.GetAllRunningJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
-			framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
-			gomega.Expect(pods).To(gomega.HaveLen(1), "Exactly one running pod is expected")
-			pod := pods[0]
-			ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
-			evictTarget := &policyv1.Eviction{
-				ObjectMeta: metav1.ObjectMeta{
-					Name:      pod.Name,
-					Namespace: pod.Namespace,
+		ginkgo.By("Creating a job")
+		job := e2ejob.NewTestJobOnNode("notTerminateOnce", "evicted-pod-ignore-on-exit-code", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
+		job.Spec.CompletionMode = ptr.To(batchv1.IndexedCompletion)
+		job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
+			Rules: []batchv1.PodFailurePolicyRule{
+				{
+					// Ignore failures of the non 0-indexed pods which fail until the marker file is created
+					// And the 137 in the 0-indexed pod due to eviction.
+					Action: batchv1.PodFailurePolicyActionIgnore,
+					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
+						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
+						Values:   []int32{1, 137},
+					},
 				},
-			}
-			f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(context.TODO(), evictTarget)
-			framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
+			},
+		}
+		job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
+		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
 
-			ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
-			err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
-			framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)
+		ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
+		err = e2ejob.WaitForJobPodsSucceeded(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions-1)
+		framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)
 
-			ginkgo.By("Ensuring job reaches completions")
-			err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
-			framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
-		},
-		ginkgo.Entry("Ignore DisruptionTarget condition", &batchv1.PodFailurePolicy{
+		ginkgo.By("Awaiting for the 0-indexed pod to be running")
+		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
+		framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)
+
+		pods, err := e2ejob.GetAllRunningJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
+		framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
+		gomega.Expect(pods).To(gomega.HaveLen(1), "Exactly one running pod is expected")
+		pod := pods[0]
+		ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
+		evictTarget := &policyv1.Eviction{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      pod.Name,
+				Namespace: pod.Namespace,
+			},
+		}
+		err = f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(ctx, evictTarget)
+		framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
+
+		ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
+		err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
+		framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)
+
+		ginkgo.By("Ensuring job reaches completions")
+		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
+		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
+	})
+
+	/*
+		Testname: Ensure pod failure policy allows to ignore failure for an evicted pod; matching on the DisruptionTarget condition
+		Description: This test is using an indexed job. The pod corresponding to the 0th index
+		creates a marker file on the host and runs 'forever' until evicted. We use
+		the non-0-indexed pods to determine if the marker file is already
+		created by the 0th indexed pod - the non-0-indexed pods fail and restart
+		until the marker file is created (their potential failures are ignored
+		based on the exit code). Once the marker file is created the 0th indexed
+		pod is evicted (DisruptionTarget condition is added in the process),
+		after restart it runs to successful completion.
+		Steps:
+		1. Select a node to run all Job's pods to ensure the host marker file is accessible by all pods
+		2. Create the indexed job with pod failure policy which ignores failed pods with DisruptionTarget condition
+		3. Await for all non-0-indexed pods to succeed to ensure the marker file is created by the 0-indexed pod
+		4. Make sure the 0-indexed pod is running
+		5. Evict the 0-indexed pod, the failure is ignored as it matches the pod failure policy
+		6. Await for the job to successfully complete
+	*/
+	ginkgo.It("should allow to use a pod failure policy to ignore failure for an evicted pod; matching on the DisruptionTarget condition", func(ctx context.Context) {
+		// We set the backoffLimit to 0 so that any pod failure would trigger
+		// job failure if not for the pod failure policy to ignore the failed
+		// pods from counting them towards the backoffLimit.
+		parallelism := int32(2)
+		completions := int32(4)
+		backoffLimit := int32(0)
+
+		ginkgo.By("Looking for a node to schedule job pods")
+		node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
+		framework.ExpectNoError(err)
+
+		ginkgo.By("Creating a job")
+		job := e2ejob.NewTestJobOnNode("notTerminateOnce", "evicted-pod-ignore-on-disruption-condition", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
+		job.Spec.CompletionMode = ptr.To(batchv1.IndexedCompletion)
+		job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
 			Rules: []batchv1.PodFailurePolicyRule{
 				{
 					// Ignore failures of the non 0-indexed pods which fail until the marker file is created
@@ -261,21 +307,40 @@ var _ = SIGDescribe("Job", func() {
 					},
 				},
 			},
-		}),
-		ginkgo.Entry("Ignore exit code 137", &batchv1.PodFailurePolicy{
-			Rules: []batchv1.PodFailurePolicyRule{
-				{
-					// Ignore failures of the non 0-indexed pods which fail until the marker file is created
-					// And the 137 in the 0-indexed pod due to eviction.
-					Action: batchv1.PodFailurePolicyActionIgnore,
-					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
-						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
-						Values:   []int32{1, 137},
-					},
-				},
+		}
+		job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
+		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
+
+		ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
+		err = e2ejob.WaitForJobPodsSucceeded(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions-1)
+		framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)
+
+		ginkgo.By("Awaiting for the 0-indexed pod to be running")
+		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
+		framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)
+
+		pods, err := e2ejob.GetAllRunningJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
+		framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
+		gomega.Expect(pods).To(gomega.HaveLen(1), "Exactly one running pod is expected")
+		pod := pods[0]
+		ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
+		evictTarget := &policyv1.Eviction{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      pod.Name,
+				Namespace: pod.Namespace,
 			},
-		}),
-	)
+		}
+		err = f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(ctx, evictTarget)
+		framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
+
+		ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
+		err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
+		framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)
+
+		ginkgo.By("Ensuring job reaches completions")
+		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
+		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
+	})
 
 	ginkgo.It("should not create pods when created in suspend state", func(ctx context.Context) {
 		parallelism := int32(2)