diff --git a/test/e2e/apps/job.go b/test/e2e/apps/job.go
index 0f08c885ec0..9fd7111f5b7 100644
--- a/test/e2e/apps/job.go
+++ b/test/e2e/apps/job.go
@@ -29,6 +29,7 @@ import (
 	batchinternal "k8s.io/kubernetes/pkg/apis/batch"
 	"k8s.io/kubernetes/test/e2e/framework"
 	jobutil "k8s.io/kubernetes/test/e2e/framework/job"
+	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
 
 	"github.com/onsi/ginkgo"
@@ -97,12 +98,10 @@ var _ = SIGDescribe("Job", func() {
 	framework.ConformanceIt("should run a job to completion when tasks sometimes fail and are locally restarted", func() {
 		ginkgo.By("Creating a job")
 		// One failure, then a success, local restarts.
-		// We can't use the random failure approach used by the
-		// non-local test below, because kubelet will throttle
-		// frequently failing containers in a given pod, ramping
-		// up to 5 minutes between restarts, making test timeouts
-		// due to successive failures too likely with a reasonable
-		// test timeout.
+		// We can't use the random failure approach, because kubelet will
+		// throttle frequently failing containers in a given pod, ramping
+		// up to 5 minutes between restarts, making test timeouts due to
+		// successive failures too likely with a reasonable test timeout.
 		job := jobutil.NewTestJob("failOnce", "fail-once-local", v1.RestartPolicyOnFailure, parallelism, completions, nil, backoffLimit)
 		job, err := jobutil.CreateJob(f.ClientSet, f.Namespace.Name, job)
 		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
@@ -114,18 +113,20 @@ var _ = SIGDescribe("Job", func() {
 
 	// Pods sometimes fail, but eventually succeed, after pod restarts
 	ginkgo.It("should run a job to completion when tasks sometimes fail and are not locally restarted", func() {
+		// One failure, then a success, no local restarts.
+		// We can't use the random failure approach, because JobController
+		// will throttle frequently failing Pods of a given Job, ramping
+		// up to 6 minutes between restarts, making test timeouts due to
+		// successive failures too likely.
+		// Instead, we force the Job's Pods to be scheduled to a single Node
+		// and use a hostPath volume to persist data across new Pods.
+		ginkgo.By("Looking for a node to schedule job pod")
+		node, err := e2enode.GetRandomReadySchedulableNode(f.ClientSet)
+		framework.ExpectNoError(err)
+
 		ginkgo.By("Creating a job")
-		// 50% chance of container success, local restarts.
-		// Can't use the failOnce approach because that relies
-		// on an emptyDir, which is not preserved across new pods.
-		// Worst case analysis: 15 failures, each taking 1 minute to
-		// run due to some slowness, 1 in 2^15 chance of happening,
-		// causing test flake. Should be very rare.
-		// With the introduction of backoff limit and high failure rate this
-		// is hitting its timeout, the 3 is a reasonable that should make this
-		// test less flaky, for now.
-		job := jobutil.NewTestJob("randomlySucceedOrFail", "rand-non-local", v1.RestartPolicyNever, parallelism, 3, nil, 999)
-		job, err := jobutil.CreateJob(f.ClientSet, f.Namespace.Name, job)
+		job := jobutil.NewTestJobOnNode("failOnce", "fail-once-non-local", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
+		job, err = jobutil.CreateJob(f.ClientSet, f.Namespace.Name, job)
 		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
 
 		ginkgo.By("Ensuring job reaches completions")
diff --git a/test/e2e/framework/job/BUILD b/test/e2e/framework/job/BUILD
index e799bd082a8..e635a7ee0d6 100644
--- a/test/e2e/framework/job/BUILD
+++ b/test/e2e/framework/job/BUILD
@@ -17,6 +17,7 @@ go_library(
         "//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library",
+        "//staging/src/k8s.io/apimachinery/pkg/util/rand:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
         "//staging/src/k8s.io/client-go/kubernetes:go_default_library",
         "//test/e2e/framework:go_default_library",
diff --git a/test/e2e/framework/job/fixtures.go b/test/e2e/framework/job/fixtures.go
index e6f13cc56bf..0ee29bd1d05 100644
--- a/test/e2e/framework/job/fixtures.go
+++ b/test/e2e/framework/job/fixtures.go
@@ -20,6 +20,7 @@ import (
 	batchv1 "k8s.io/api/batch/v1"
 	"k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/rand"
 	"k8s.io/kubernetes/test/e2e/framework"
 )
 
@@ -30,6 +31,13 @@ import (
 // policy of the containers in which the Pod is running. Parallelism is the Job's parallelism, and completions is the
 // Job's required number of completions.
 func NewTestJob(behavior, name string, rPol v1.RestartPolicy, parallelism, completions int32, activeDeadlineSeconds *int64, backoffLimit int32) *batchv1.Job {
+	anyNode := ""
+	return NewTestJobOnNode(behavior, name, rPol, parallelism, completions, activeDeadlineSeconds, backoffLimit, anyNode)
+}
+
+// NewTestJobOnNode is similar to NewTestJob but supports specifying a Node on which the Job's Pods will run.
+// Empty nodeName means no node selection constraints.
+func NewTestJobOnNode(behavior, name string, rPol v1.RestartPolicy, parallelism, completions int32, activeDeadlineSeconds *int64, backoffLimit int32, nodeName string) *batchv1.Job {
 	manualSelector := false
 	job := &batchv1.Job{
 		ObjectMeta: metav1.ObjectMeta{
@@ -72,6 +80,7 @@ func NewTestJob(behavior, name string, rPol v1.RestartPolicy, parallelism, compl
 						SecurityContext: &v1.SecurityContext{},
 					},
 				},
+				NodeName: nodeName,
 			},
 		},
 	},
@@ -89,10 +98,21 @@ func NewTestJob(behavior, name string, rPol v1.RestartPolicy, parallelism, compl
 		job.Spec.Template.Spec.Containers[0].Command = []string{"/bin/sh", "-c", "exit $(( $RANDOM / 16384 ))"}
 	case "failOnce":
 		// Fail the first the container of the pod is run, and
-		// succeed the second time. Checks for file on emptydir.
+		// succeed the second time. Checks for file on a data volume.
 		// If present, succeed. If not, create but fail.
-		// Note that this cannot be used with RestartNever because
-		// it always fails the first time for a pod.
+		// If RestartPolicy is Never, the nodeName should be set to
+		// ensure all job pods run on a single node and the volume
+		// will be mounted from a hostPath instead.
+		if len(nodeName) > 0 {
+			randomDir := "/tmp/job-e2e/" + rand.String(10)
+			hostPathType := v1.HostPathDirectoryOrCreate
+			job.Spec.Template.Spec.Volumes[0].VolumeSource = v1.VolumeSource{HostPath: &v1.HostPathVolumeSource{Path: randomDir, Type: &hostPathType}}
+			// Tests involving r/w operations on a hostPath volume need to run in
+			// privileged mode on SELinux-enabled distros, while the Windows platform
+			// neither supports nor needs privileged mode.
+			privileged := !framework.NodeOSDistroIs("windows")
+			job.Spec.Template.Spec.Containers[0].SecurityContext.Privileged = &privileged
+		}
 		job.Spec.Template.Spec.Containers[0].Command = []string{"/bin/sh", "-c", "if [[ -r /data/foo ]] ; then exit 0 ; else touch /data/foo ; exit 1 ; fi"}
 	}
 	return job
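The mechanism behind the patch is worth spelling out: with RestartPolicy=Never, every retry is a brand-new Pod, so a marker file on an emptyDir vanishes between attempts. Pinning all of the Job's Pods to one node and backing the data volume with a per-test hostPath directory lets the "fail once, then succeed" marker survive Pod recreation. The following is a minimal, self-contained sketch of that wiring, under stated assumptions: failOncePodSpec and the busybox image are illustrative stand-ins, not part of this patch, and real e2e tests should go through the framework's NewTestJobOnNode helper instead.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/rand"
)

// failOncePodSpec is an illustrative stand-in (not part of this patch): it
// shows the scheduling and volume wiring that NewTestJobOnNode applies for
// the "failOnce" behavior when a nodeName is given.
func failOncePodSpec(nodeName string) v1.PodSpec {
	// A per-test random directory keeps concurrent runs from sharing
	// marker files on the node's filesystem.
	randomDir := "/tmp/job-e2e/" + rand.String(10)
	hostPathType := v1.HostPathDirectoryOrCreate
	// The patch derives this from framework.NodeOSDistroIs("windows");
	// hard-coded here to keep the sketch self-contained.
	privileged := true
	return v1.PodSpec{
		// Retries are new Pods, not container restarts; they must land on
		// the same node or they will not see the previous attempt's marker.
		RestartPolicy: v1.RestartPolicyNever,
		NodeName:      nodeName,
		Volumes: []v1.Volume{{
			Name: "data",
			VolumeSource: v1.VolumeSource{
				// A hostPath, unlike an emptyDir, outlives the Pod.
				HostPath: &v1.HostPathVolumeSource{Path: randomDir, Type: &hostPathType},
			},
		}},
		Containers: []v1.Container{{
			Name:  "c",
			Image: "busybox", // illustrative image
			// First attempt: marker absent, create it and exit 1.
			// Retry Pod: marker present on the hostPath, exit 0.
			Command:         []string{"/bin/sh", "-c", "if [ -r /data/foo ]; then exit 0; else touch /data/foo; exit 1; fi"},
			VolumeMounts:    []v1.VolumeMount{{MountPath: "/data", Name: "data"}},
			SecurityContext: &v1.SecurityContext{Privileged: &privileged},
		}},
	}
}

func main() {
	spec := failOncePodSpec("some-node")
	fmt.Printf("failOnce data volume backed by %s\n", spec.Volumes[0].HostPath.Path)
}
```

One design point to note: HostPathDirectoryOrCreate lets the kubelet create the directory on first mount, so the test needs no setup step on the node, and the randomized path keeps parallel test runs isolated from one another.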