Merge pull request #125485 from mimowo/refactor-job-e2e-for-conformance

Split Job e2e tests to make them possible targets for conformance promotion
Kubernetes Prow Robot 2024-06-19 10:06:41 -07:00 committed by GitHub
commit 4e25953c8b

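The diff below replaces a single ginkgo.DescribeTable (two ginkgo.Entry cases sharing one body) with two standalone ginkgo.It specs, each carrying its own Testname/Description block. As a rough sketch of why that shape matters, and assuming the usual Kubernetes conformance conventions (the promotion itself is not part of this commit), a documented spec of this form can later be promoted by switching ginkgo.It to framework.ConformanceIt and adding a Release line:

/*
	Release: v1.NN (filled in at promotion time)
	Testname: <as in the spec's comment>
	Description: <as in the spec's comment>
*/
framework.ConformanceIt("should allow to use a pod failure policy to ignore failure for an evicted pod; matching on the exit code", func(ctx context.Context) {
	// test body unchanged
})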

@@ -173,74 +173,120 @@ var _ = SIGDescribe("Job", func() {
framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
})
/*
Testname: Ensure pod failure policy allows to ignore failure for an evicted pod; matching on the exit code
Description: This test uses an indexed job. The pod corresponding to the 0th index
creates a marker file on the host and runs 'forever' until evicted. We use
the non-0-indexed pods to determine if the marker file is already
created by the 0th indexed pod - the non-0-indexed pods fail and restart
until the marker file is created (their potential failures are ignored
based on the exit code). Once the marker file is created the 0th indexed
pod is evicted (DisruptionTarget condition is added in the process);
after restart it runs to successful completion.
Steps:
1. Select a node to run all Job's pods to ensure the host marker file is accessible by all pods
2. Create the indexed job with a pod failure policy which ignores failed pods with the 137 exit code
3. Await for all non-0-indexed pods to succeed to ensure the marker file is created by the 0-indexed pod
4. Make sure the 0-indexed pod is running
5. Evict the 0-indexed pod; the failure is ignored as it matches the pod failure policy
6. Await for the job to successfully complete
*/
ginkgo.It("should allow to use a pod failure policy to ignore failure for an evicted pod; matching on the exit code", func(ctx context.Context) {
// We set the backoffLimit to 0 so that any pod failure would trigger
// job failure if not for the pod failure policy to ignore the failed
// pods from counting them towards the backoffLimit.
parallelism := int32(2)
completions := int32(4)
backoffLimit := int32(0)
ginkgo.By("Looking for a node to schedule job pods")
node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
framework.ExpectNoError(err)
ginkgo.By("Looking for a node to schedule job pods")
node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
framework.ExpectNoError(err)
ginkgo.By("Creating a job")
job := e2ejob.NewTestJobOnNode("notTerminateOnce", "pod-disruption-failure-ignore", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
job.Spec.CompletionMode = &mode
job.Spec.PodFailurePolicy = policy
job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
err = e2ejob.WaitForJobPodsSucceeded(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions-1)
framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)
ginkgo.By("Awaiting for the 0-indexed pod to be running")
err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)
pods, err := e2ejob.GetAllRunningJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
gomega.Expect(pods).To(gomega.HaveLen(1), "Exactly one running pod is expected")
pod := pods[0]
ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
evictTarget := &policyv1.Eviction{
ObjectMeta: metav1.ObjectMeta{
Name: pod.Name,
Namespace: pod.Namespace,
ginkgo.By("Creating a job")
job := e2ejob.NewTestJobOnNode("notTerminateOnce", "evicted-pod-ignore-on-exit-code", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
job.Spec.CompletionMode = ptr.To(batchv1.IndexedCompletion)
job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
Rules: []batchv1.PodFailurePolicyRule{
{
// Ignore failures of the non 0-indexed pods which fail until the marker file is created
// And the 137 in the 0-indexed pod due to eviction.
Action: batchv1.PodFailurePolicyActionIgnore,
OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
Values: []int32{1, 137},
},
},
}
f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(context.TODO(), evictTarget)
framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
},
}
job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)
ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
err = e2ejob.WaitForJobPodsSucceeded(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions-1)
framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)
ginkgo.By("Ensuring job reaches completions")
err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
},
ginkgo.Entry("Ignore DisruptionTarget condition", &batchv1.PodFailurePolicy{
ginkgo.By("Awaiting for the 0-indexed pod to be running")
err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)
pods, err := e2ejob.GetAllRunningJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
gomega.Expect(pods).To(gomega.HaveLen(1), "Exactly one running pod is expected")
pod := pods[0]
ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
evictTarget := &policyv1.Eviction{
ObjectMeta: metav1.ObjectMeta{
Name: pod.Name,
Namespace: pod.Namespace,
},
}
err = f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(ctx, evictTarget)
framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)
ginkgo.By("Ensuring job reaches completions")
err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
})
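The marker-file handshake described in the comments above is provided by the "notTerminateOnce" fixture passed to e2ejob.NewTestJobOnNode. A minimal sketch of the per-pod command such a fixture would configure, assuming a hypothetical host-mounted marker path (/data/marker) and the JOB_COMPLETION_INDEX variable injected for Indexed Jobs; this is an illustration, not the actual fixture code:

// Illustrative approximation of the pod command behind the "notTerminateOnce" fixture.
func markerHandshakeCommand() []string {
	return []string{"/bin/sh", "-c",
		// Index 0, first run: create the marker and run "forever" until evicted.
		// Non-0 indexes: exit 1 (ignored by the pod failure policy) until the marker exists.
		// Every other case, including index 0 after its restart, exits 0 and succeeds.
		`if [ "$JOB_COMPLETION_INDEX" = "0" ] && [ ! -e /data/marker ]; then
touch /data/marker && sleep 1000000
elif [ ! -e /data/marker ]; then
exit 1
fi`,
	}
}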
/*
Testname: Ensure pod failure policy allows to ignore failure for an evicted pod; matching on the DisruptionTarget condition
Description: This test uses an indexed job. The pod corresponding to the 0th index
creates a marker file on the host and runs 'forever' until evicted. We use
the non-0-indexed pods to determine if the marker file is already
created by the 0th indexed pod - the non-0-indexed pods fail and restart
until the marker file is created (their potential failures are ignored
based on the exit code). Once the marker file is created the 0th indexed
pod is evicted (DisruptionTarget condition is added in the process);
after restart it runs to successful completion.
Steps:
1. Select a node to run all Job's pods to ensure the host marker file is accessible by all pods
2. Create the indexed job with a pod failure policy which ignores failed pods with the DisruptionTarget condition
3. Await for all non-0-indexed pods to succeed to ensure the marker file is created by the 0-indexed pod
4. Make sure the 0-indexed pod is running
5. Evict the 0-indexed pod; the failure is ignored as it matches the pod failure policy
6. Await for the job to successfully complete
*/
ginkgo.It("should allow to use a pod failure policy to ignore failure for an evicted pod; matching on the DisruptionTarget condition", func(ctx context.Context) {
// We set the backoffLimit to 0 so that any pod failure would trigger
// job failure if not for the pod failure policy to ignore the failed
// pods from counting them towards the backoffLimit.
parallelism := int32(2)
completions := int32(4)
backoffLimit := int32(0)
ginkgo.By("Looking for a node to schedule job pods")
node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
framework.ExpectNoError(err)
ginkgo.By("Creating a job")
job := e2ejob.NewTestJobOnNode("notTerminateOnce", "evicted-pod-ignore-on-disruption-condition", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
job.Spec.CompletionMode = ptr.To(batchv1.IndexedCompletion)
job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
Rules: []batchv1.PodFailurePolicyRule{
{
// Ignore failures of the non 0-indexed pods which fail until the marker file is created
@@ -261,21 +307,40 @@ var _ = SIGDescribe("Job", func() {
},
},
},
}
job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
err = e2ejob.WaitForJobPodsSucceeded(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions-1)
framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)
ginkgo.By("Awaiting for the 0-indexed pod to be running")
err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)
pods, err := e2ejob.GetAllRunningJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
gomega.Expect(pods).To(gomega.HaveLen(1), "Exactly one running pod is expected")
pod := pods[0]
ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
evictTarget := &policyv1.Eviction{
ObjectMeta: metav1.ObjectMeta{
Name: pod.Name,
Namespace: pod.Namespace,
},
}
err = f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(ctx, evictTarget)
framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)
ginkgo.By("Ensuring job reaches completions")
err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
})
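The pod failure policy used by this second test sits mostly in the part of the file between the two hunks shown above, so it is not reproduced here. Purely as an illustration of the batch/v1 API it relies on (using the batchv1 and v1 aliases this file already imports; the helper name is hypothetical), an Ignore rule that matches the DisruptionTarget condition added during eviction looks roughly like this:

// Sketch of the OnPodConditions form of a pod failure policy rule.
func disruptionTargetIgnoreRule() batchv1.PodFailurePolicyRule {
	return batchv1.PodFailurePolicyRule{
		Action: batchv1.PodFailurePolicyActionIgnore,
		OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
			{
				Type:   v1.DisruptionTarget, // condition set on a pod that is being evicted/disrupted
				Status: v1.ConditionTrue,
			},
		},
	}
}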
ginkgo.It("should not create pods when created in suspend state", func(ctx context.Context) {
parallelism := int32(2)