Merge pull request #125485 from mimowo/refactor-job-e2e-for-conformance

Split Job e2e tests to make them possible targets for conformance promotion
Commit 4e25953c8b, authored by Kubernetes Prow Robot on 2024-06-19 10:06:41 -07:00, committed via GitHub.

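For context, the sketch below is illustrative only and is not code from this PR: it shows the general shape of the refactoring, going from one table-driven Ginkgo spec to individually named specs, so that each spec can carry its own conformance-style doc comment and be promoted on its own. The package name, suite name, spec texts, and the trivial assertions are all hypothetical.

```go
// Hypothetical, self-contained example of splitting a DescribeTable into It specs.
package jobsplit

import (
	"testing"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
)

// TestJobSplitExample bootstraps the Ginkgo suite for this example file.
func TestJobSplitExample(t *testing.T) {
	gomega.RegisterFailHandler(ginkgo.Fail)
	ginkgo.RunSpecs(t, "Job split example suite")
}

// Before: a single DescribeTable generates one spec per Entry. The spec text comes
// from the table, so a per-test /* Testname: ... */ comment cannot be attached to
// each individual case.
var _ = ginkgo.DescribeTable("ignoring pod failures that match the policy",
	func(ignoredExitCodes []int32) {
		gomega.Expect(ignoredExitCodes).To(gomega.ContainElement(int32(1)))
	},
	ginkgo.Entry("Ignore DisruptionTarget condition", []int32{1}),
	ginkgo.Entry("Ignore exit code 137", []int32{1, 137}),
)

// After: each variant becomes its own It, so every spec has a stable, individually
// promotable name and can be preceded by its own conformance doc comment.
var _ = ginkgo.Describe("Job pod failure policy (split)", func() {
	ginkgo.It("should ignore failure matching on the DisruptionTarget condition", func() {
		gomega.Expect([]int32{1}).To(gomega.ContainElement(int32(1)))
	})

	ginkgo.It("should ignore failure matching on the exit code", func() {
		gomega.Expect([]int32{1, 137}).To(gomega.ContainElement(int32(137)))
	})
})
```

The diff below applies this pattern to the Job e2e tests: the table with two entries is removed, and each entry becomes a standalone `ginkgo.It` with a `/* Testname: ... */` block.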

@@ -173,74 +173,120 @@ var _ = SIGDescribe("Job", func() {
 		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
 	})
-	// This test is using an indexed job. The pod corresponding to the 0th index
-	// creates a marker file on the host and runs 'forever' until evicted. We use
-	// the non-0-indexed pods to determine if the marker file is already
-	// created by the 0th indexed pod - the non-0-indexed pods fail and restart
-	// until the marker file is created (their potential failures are ignored
-	// based on the exit code). Once the marker file is created the 0th indexed
-	// pod is evicted (DisruptionTarget condition is added in the process),
-	// after restart it runs to successful completion.
-	// Steps:
-	// 1. Select a node to run all Job's pods to ensure the host marker file is accessible by all pods
-	// 2. Create the indexed job
-	// 3. Await for all non-0-indexed pods to succeed to ensure the marker file is created by the 0-indexed pod
-	// 4. Make sure the 0-indexed pod is running
-	// 5. Evict the 0-indexed pod
-	// 6. Await for the job to successfully complete
-	ginkgo.DescribeTable("Using a pod failure policy to not count some failures towards the backoffLimit",
-		func(ctx context.Context, policy *batchv1.PodFailurePolicy) {
-			mode := batchv1.IndexedCompletion
-
-			// We set the backoffLimit to 0 so that any pod failure would trigger
-			// job failure if not for the pod failure policy to ignore the failed
-			// pods from counting them towards the backoffLimit.
-			parallelism := int32(2)
-			completions := int32(4)
-			backoffLimit := int32(0)
-
-			ginkgo.By("Looking for a node to schedule job pods")
-			node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
-			framework.ExpectNoError(err)
-
-			ginkgo.By("Creating a job")
-			job := e2ejob.NewTestJobOnNode("notTerminateOnce", "pod-disruption-failure-ignore", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
-			job.Spec.CompletionMode = &mode
-			job.Spec.PodFailurePolicy = policy
-			job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
-			framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
-
-			ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
-			err = e2ejob.WaitForJobPodsSucceeded(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions-1)
-			framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)
-
-			ginkgo.By("Awaiting for the 0-indexed pod to be running")
-			err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
-			framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)
-
-			pods, err := e2ejob.GetAllRunningJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
-			framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
-			gomega.Expect(pods).To(gomega.HaveLen(1), "Exactly one running pod is expected")
-			pod := pods[0]
-			ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
-			evictTarget := &policyv1.Eviction{
-				ObjectMeta: metav1.ObjectMeta{
-					Name:      pod.Name,
-					Namespace: pod.Namespace,
-				},
-			}
-			f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(context.TODO(), evictTarget)
-			framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
-
-			ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
-			err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
-			framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)
-
-			ginkgo.By("Ensuring job reaches completions")
-			err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
-			framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
-		},
-		ginkgo.Entry("Ignore DisruptionTarget condition", &batchv1.PodFailurePolicy{
+	/*
+		Testname: Ensure pod failure policy allows to ignore failure for an evicted pod; matching on the exit code
+		Description: This test is using an indexed job. The pod corresponding to the 0th index
+		creates a marker file on the host and runs 'forever' until evicted. We use
+		the non-0-indexed pods to determine if the marker file is already
+		created by the 0th indexed pod - the non-0-indexed pods fail and restart
+		until the marker file is created (their potential failures are ignored
+		based on the exit code). Once the marker file is created the 0th indexed
+		pod is evicted (DisruptionTarget condition is added in the process),
+		after restart it runs to successful completion.
+		Steps:
+		1. Select a node to run all Job's pods to ensure the host marker file is accessible by all pods
+		2. Create the indexed job with pod failure policy which ignores failed pods with 137 exit code
+		3. Await for all non-0-indexed pods to succeed to ensure the marker file is created by the 0-indexed pod
+		4. Make sure the 0-indexed pod is running
+		5. Evict the 0-indexed pod, the failure is ignored as it matches the pod failure policy
+		6. Await for the job to successfully complete
+	*/
+	ginkgo.It("should allow to use a pod failure policy to ignore failure for an evicted pod; matching on the exit code", func(ctx context.Context) {
+		// We set the backoffLimit to 0 so that any pod failure would trigger
+		// job failure if not for the pod failure policy to ignore the failed
+		// pods from counting them towards the backoffLimit.
+		parallelism := int32(2)
+		completions := int32(4)
+		backoffLimit := int32(0)
+
+		ginkgo.By("Looking for a node to schedule job pods")
+		node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
+		framework.ExpectNoError(err)
+
+		ginkgo.By("Creating a job")
+		job := e2ejob.NewTestJobOnNode("notTerminateOnce", "evicted-pod-ignore-on-exit-code", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
+		job.Spec.CompletionMode = ptr.To(batchv1.IndexedCompletion)
+		job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
+			Rules: []batchv1.PodFailurePolicyRule{
+				{
+					// Ignore failures of the non 0-indexed pods which fail until the marker file is created
+					// And the 137 in the 0-indexed pod due to eviction.
+					Action: batchv1.PodFailurePolicyActionIgnore,
+					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
+						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
+						Values:   []int32{1, 137},
+					},
+				},
+			},
+		}
+		job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
+		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
+
+		ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
+		err = e2ejob.WaitForJobPodsSucceeded(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions-1)
+		framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)
+
+		ginkgo.By("Awaiting for the 0-indexed pod to be running")
+		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
+		framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)
+
+		pods, err := e2ejob.GetAllRunningJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
+		framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
+		gomega.Expect(pods).To(gomega.HaveLen(1), "Exactly one running pod is expected")
+		pod := pods[0]
+		ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
+		evictTarget := &policyv1.Eviction{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      pod.Name,
+				Namespace: pod.Namespace,
+			},
+		}
+		err = f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(ctx, evictTarget)
+		framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
+
+		ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
+		err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
+		framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)
+
+		ginkgo.By("Ensuring job reaches completions")
+		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
+		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
+	})
+
+	/*
+		Testname: Ensure pod failure policy allows to ignore failure for an evicted pod; matching on the DisruptionTarget condition
+		Description: This test is using an indexed job. The pod corresponding to the 0th index
+		creates a marker file on the host and runs 'forever' until evicted. We use
+		the non-0-indexed pods to determine if the marker file is already
+		created by the 0th indexed pod - the non-0-indexed pods fail and restart
+		until the marker file is created (their potential failures are ignored
+		based on the exit code). Once the marker file is created the 0th indexed
+		pod is evicted (DisruptionTarget condition is added in the process),
+		after restart it runs to successful completion.
+		Steps:
+		1. Select a node to run all Job's pods to ensure the host marker file is accessible by all pods
+		2. Create the indexed job with pod failure policy which ignores failed pods with DisruptionTarget condition
+		3. Await for all non-0-indexed pods to succeed to ensure the marker file is created by the 0-indexed pod
+		4. Make sure the 0-indexed pod is running
+		5. Evict the 0-indexed pod, the failure is ignored as it matches the pod failure policy
+		6. Await for the job to successfully complete
+	*/
+	ginkgo.It("should allow to use a pod failure policy to ignore failure for an evicted pod; matching on the DisruptionTarget condition", func(ctx context.Context) {
+		// We set the backoffLimit to 0 so that any pod failure would trigger
+		// job failure if not for the pod failure policy to ignore the failed
+		// pods from counting them towards the backoffLimit.
+		parallelism := int32(2)
+		completions := int32(4)
+		backoffLimit := int32(0)
+
+		ginkgo.By("Looking for a node to schedule job pods")
+		node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
+		framework.ExpectNoError(err)
+
+		ginkgo.By("Creating a job")
+		job := e2ejob.NewTestJobOnNode("notTerminateOnce", "evicted-pod-ignore-on-disruption-condition", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
+		job.Spec.CompletionMode = ptr.To(batchv1.IndexedCompletion)
+		job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
 			Rules: []batchv1.PodFailurePolicyRule{
 				{
 					// Ignore failures of the non 0-indexed pods which fail until the marker file is created
@@ -261,21 +307,40 @@ var _ = SIGDescribe("Job", func() {
 				},
 			},
 		},
-		}),
-		ginkgo.Entry("Ignore exit code 137", &batchv1.PodFailurePolicy{
-			Rules: []batchv1.PodFailurePolicyRule{
-				{
-					// Ignore failures of the non 0-indexed pods which fail until the marker file is created
-					// And the 137 in the 0-indexed pod due to eviction.
-					Action: batchv1.PodFailurePolicyActionIgnore,
-					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
-						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
-						Values:   []int32{1, 137},
-					},
-				},
-			},
-		}),
-	)
+		}
+		job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
+		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
+
+		ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
+		err = e2ejob.WaitForJobPodsSucceeded(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions-1)
+		framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)
+
+		ginkgo.By("Awaiting for the 0-indexed pod to be running")
+		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
+		framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)
+
+		pods, err := e2ejob.GetAllRunningJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
+		framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
+		gomega.Expect(pods).To(gomega.HaveLen(1), "Exactly one running pod is expected")
+		pod := pods[0]
+		ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
+		evictTarget := &policyv1.Eviction{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      pod.Name,
+				Namespace: pod.Namespace,
+			},
+		}
+		err = f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(ctx, evictTarget)
+		framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
+
+		ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
+		err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
+		framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)
+
+		ginkgo.By("Ensuring job reaches completions")
+		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
+		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
+	})

 	ginkgo.It("should not create pods when created in suspend state", func(ctx context.Context) {
 		parallelism := int32(2)