Merge pull request #31476 from janetkuo/fix-sj-finished-job-warning

Automatic merge from submit-queue

Fix the bug where the ScheduledJob (SJ) controller treats finished jobs as unexpected

<!--  Thanks for sending a pull request!  Here are some tips for you:
1. If this is your first time, read our contributor guidelines https://github.com/kubernetes/kubernetes/blob/master/CONTRIBUTING.md and developer guide https://github.com/kubernetes/kubernetes/blob/master/docs/devel/development.md
2. If you want *faster* PR reviews, read how: https://github.com/kubernetes/kubernetes/blob/master/docs/devel/faster_reviews.md
3. Follow the instructions for writing a release note: https://github.com/kubernetes/kubernetes/blob/master/docs/devel/pull-requests.md#release-notes
-->

**What this PR does / why we need it**:

**Which issue this PR fixes** *(optional, in `fixes #<issue number>(, #<issue_number>, ...)` format, will close that issue when PR gets merged)*: fixes #31472 

**Special notes for your reviewer**:

**Release note**:
<!--  Steps to write your release note:
1. Use the release-note-* labels to set the release note state (if you have access) 
2. Enter your extended release note in the below block; leaving it blank means using the PR title as the release note. If no release note is required, just write `NONE`. 
-->
```release-note
NONE
```

cc @soltysh @erictune
This commit is contained in:
Kubernetes Submit Queue 2016-08-26 20:20:21 -07:00 committed by GitHub
commit 4c92a7f52f
2 changed files with 89 additions and 18 deletions

View File

@ -127,11 +127,12 @@ func (jm *ScheduledJobController) SyncAll() {
func SyncOne(sj batch.ScheduledJob, js []batch.Job, now time.Time, jc jobControlInterface, sjc sjControlInterface, pc podControlInterface, recorder record.EventRecorder) {
nameForLog := fmt.Sprintf("%s/%s", sj.Namespace, sj.Name)
for _, j := range js {
for i := range js {
j := js[i]
found := inActiveList(sj, j.ObjectMeta.UID)
if !found {
if !found && !job.IsJobFinished(&j) {
recorder.Eventf(&sj, api.EventTypeWarning, "UnexpectedJob", "Saw a job that the controller did not create or forgot: %v", j.Name)
// We found a job object that has us as the parent, but it is not in our Active list.
// We found an unfinished job that has us as the parent, but it is not in our Active list.
// This could happen if we crashed right after creating the Job and before updating the status,
// or if our jobs list is newer than our sj status after a relist, or if someone intentionally created
// a job that they wanted us to adopt.
@ -141,12 +142,10 @@ func SyncOne(sj batch.ScheduledJob, js []batch.Job, now time.Time, jc jobControl
// user has permission to create a job within a namespace, then they have permission to make any scheduledJob
// in the same namespace "adopt" that job. ReplicaSets and their Pods work the same way.
// TBS: how to update sj.Status.LastScheduleTime if the adopted job is newer than any we knew about?
} else {
if job.IsJobFinished(&j) {
deleteFromActiveList(&sj, j.ObjectMeta.UID)
// TODO: event to call out failure vs success.
recorder.Eventf(&sj, api.EventTypeNormal, "SawCompletedJob", "Saw completed job: %v", j.Name)
}
} else if found && job.IsJobFinished(&j) {
deleteFromActiveList(&sj, j.ObjectMeta.UID)
// TODO: event to call out failure vs success.
recorder.Eventf(&sj, api.EventTypeNormal, "SawCompletedJob", "Saw completed job: %v", j.Name)
}
}
updatedSJ, err := sjc.UpdateStatus(&sj)

View File

@ -28,6 +28,7 @@ import (
"k8s.io/kubernetes/pkg/api/unversioned"
"k8s.io/kubernetes/pkg/apis/batch"
client "k8s.io/kubernetes/pkg/client/unversioned"
"k8s.io/kubernetes/pkg/controller/job"
"k8s.io/kubernetes/pkg/util/wait"
"k8s.io/kubernetes/test/e2e/framework"
)
@ -56,7 +57,7 @@ var _ = framework.KubeDescribe("ScheduledJob", func() {
// multiple jobs running at once
It("should schedule multiple jobs concurrently", func() {
By("Creating a scheduledjob")
scheduledJob := newTestScheduledJob("concurrent", "*/1 * * * ?", batch.AllowConcurrent)
scheduledJob := newTestScheduledJob("concurrent", "*/1 * * * ?", batch.AllowConcurrent, true)
scheduledJob, err := createScheduledJob(f.Client, f.Namespace.Name, scheduledJob)
Expect(err).NotTo(HaveOccurred())
@ -77,7 +78,7 @@ var _ = framework.KubeDescribe("ScheduledJob", func() {
// suspended should not schedule jobs
It("should not schedule jobs when suspended", func() {
By("Creating a suspended scheduledjob")
scheduledJob := newTestScheduledJob("suspended", "*/1 * * * ?", batch.AllowConcurrent)
scheduledJob := newTestScheduledJob("suspended", "*/1 * * * ?", batch.AllowConcurrent, true)
scheduledJob.Spec.Suspend = newBool(true)
scheduledJob, err := createScheduledJob(f.Client, f.Namespace.Name, scheduledJob)
Expect(err).NotTo(HaveOccurred())
@ -99,7 +100,7 @@ var _ = framework.KubeDescribe("ScheduledJob", func() {
// only single active job is allowed for ForbidConcurrent
It("should not schedule new jobs when ForbidConcurrent", func() {
By("Creating a ForbidConcurrent scheduledjob")
scheduledJob := newTestScheduledJob("forbid", "*/1 * * * ?", batch.ForbidConcurrent)
scheduledJob := newTestScheduledJob("forbid", "*/1 * * * ?", batch.ForbidConcurrent, true)
scheduledJob, err := createScheduledJob(f.Client, f.Namespace.Name, scheduledJob)
Expect(err).NotTo(HaveOccurred())
@ -129,7 +130,7 @@ var _ = framework.KubeDescribe("ScheduledJob", func() {
// only single active job is allowed for ReplaceConcurrent
It("should replace jobs when ReplaceConcurrent", func() {
By("Creating a ReplaceConcurrent scheduledjob")
scheduledJob := newTestScheduledJob("replace", "*/1 * * * ?", batch.ReplaceConcurrent)
scheduledJob := newTestScheduledJob("replace", "*/1 * * * ?", batch.ReplaceConcurrent, true)
scheduledJob, err := createScheduledJob(f.Client, f.Namespace.Name, scheduledJob)
Expect(err).NotTo(HaveOccurred())
@ -155,13 +156,35 @@ var _ = framework.KubeDescribe("ScheduledJob", func() {
err = deleteScheduledJob(f.Client, f.Namespace.Name, scheduledJob.Name)
Expect(err).NotTo(HaveOccurred())
})
// shouldn't give us unexpected warnings
It("should not emit unexpected warnings", func() {
By("Creating a scheduledjob")
scheduledJob := newTestScheduledJob("concurrent", "*/1 * * * ?", batch.AllowConcurrent, false)
scheduledJob, err := createScheduledJob(f.Client, f.Namespace.Name, scheduledJob)
Expect(err).NotTo(HaveOccurred())
By("Ensuring at least two jobs and at least one finished job exists by listing jobs explicitly")
err = waitForJobsAtLeast(f.Client, f.Namespace.Name, 2)
Expect(err).NotTo(HaveOccurred())
err = waitForAnyFinishedJob(f.Client, f.Namespace.Name)
Expect(err).NotTo(HaveOccurred())
By("Ensuring no unexpected event has happened")
err = checkNoUnexpectedEvents(f.Client, f.Namespace.Name, scheduledJob.Name)
Expect(err).NotTo(HaveOccurred())
By("Removing scheduledjob")
err = deleteScheduledJob(f.Client, f.Namespace.Name, scheduledJob.Name)
Expect(err).NotTo(HaveOccurred())
})
})
// newTestScheduledJob returns a scheduledjob which does one of several testing behaviors.
func newTestScheduledJob(name, schedule string, concurrencyPolicy batch.ConcurrencyPolicy) *batch.ScheduledJob {
func newTestScheduledJob(name, schedule string, concurrencyPolicy batch.ConcurrencyPolicy, sleep bool) *batch.ScheduledJob {
parallelism := int32(1)
completions := int32(1)
return &batch.ScheduledJob{
sj := &batch.ScheduledJob{
ObjectMeta: api.ObjectMeta{
Name: name,
},
@ -185,9 +208,8 @@ func newTestScheduledJob(name, schedule string, concurrencyPolicy batch.Concurre
},
Containers: []api.Container{
{
Name: "c",
Image: "gcr.io/google_containers/busybox:1.24",
Command: []string{"sleep", "300"},
Name: "c",
Image: "gcr.io/google_containers/busybox:1.24",
VolumeMounts: []api.VolumeMount{
{
MountPath: "/data",
@ -202,6 +224,10 @@ func newTestScheduledJob(name, schedule string, concurrencyPolicy batch.Concurre
},
},
}
if sleep {
sj.Spec.JobTemplate.Spec.Template.Spec.Containers[0].Command = []string{"sleep", "300"}
}
return sj
}
func createScheduledJob(c *client.Client, ns string, scheduledJob *batch.ScheduledJob) (*batch.ScheduledJob, error) {
@ -252,3 +278,49 @@ func waitForJobReplaced(c *client.Client, ns, previousJobName string) error {
return jobs.Items[0].Name != previousJobName, nil
})
}
// waitForJobsAtLeast blocks until the namespace ns contains at least atLeast
// jobs. The job list is re-fetched on every poll via wait.Poll, using
// framework.Poll and scheduledJobTimeout as the polling arguments. A failure
// to list jobs is returned from the condition instead of being retried here.
func waitForJobsAtLeast(c *client.Client, ns string, atLeast int) error {
	enoughJobs := func() (bool, error) {
		list, listErr := c.Batch().Jobs(ns).List(api.ListOptions{})
		if listErr != nil {
			return false, listErr
		}
		if len(list.Items) < atLeast {
			// Not enough jobs yet; keep polling.
			return false, nil
		}
		return true, nil
	}
	return wait.Poll(framework.Poll, scheduledJobTimeout, enoughJobs)
}
// waitForAnyFinishedJob blocks until at least one job in namespace ns is
// reported as finished by job.IsJobFinished. The job list is re-fetched on
// every poll via wait.Poll, using framework.Poll and scheduledJobTimeout as
// the polling arguments. A failure to list jobs is returned from the
// condition instead of being retried here.
func waitForAnyFinishedJob(c *client.Client, ns string) error {
	anyFinished := func() (bool, error) {
		list, listErr := c.Batch().Jobs(ns).List(api.ListOptions{})
		if listErr != nil {
			return false, listErr
		}
		// Scan until the first finished job is seen; index into the slice so
		// IsJobFinished receives a pointer to the list element itself.
		done := false
		for i := 0; i < len(list.Items) && !done; i++ {
			done = job.IsJobFinished(&list.Items[i])
		}
		return done, nil
	}
	return wait.Poll(framework.Poll, scheduledJobTimeout, anyFinished)
}
// checkNoUnexpectedEvents verifies that no unexpected event was recorded for
// the scheduledjob named scheduledJobName in namespace ns.
// At present the only event reason treated as unexpected is "UnexpectedJob".
//
// It returns an error if the scheduledjob cannot be fetched, if its events
// cannot be searched, or if an offending event is found; otherwise nil.
func checkNoUnexpectedEvents(c *client.Client, ns, scheduledJobName string) error {
	sj, getErr := c.Batch().ScheduledJobs(ns).Get(scheduledJobName)
	if getErr != nil {
		return fmt.Errorf("error in getting scheduledjob %s/%s: %v", ns, scheduledJobName, getErr)
	}

	// Look up the events associated with the fetched scheduledjob object.
	events, searchErr := c.Events(ns).Search(sj)
	if searchErr != nil {
		return fmt.Errorf("error in listing events: %s", searchErr)
	}

	for i := range events.Items {
		if events.Items[i].Reason != "UnexpectedJob" {
			continue
		}
		// Report the first offending event in full so the failure is debuggable.
		return fmt.Errorf("found unexpected event: %#v", events.Items[i])
	}
	return nil
}