Merge pull request #52091 from clamoriniere1A/bugfix/jobcontroller_backoff
Automatic merge from submit-queue (batch tested with PRs 52091, 52071)

Bugfix: Improve how JobController uses the queue for backoff

**What this PR does / why we need it**:
In some cases, the backoff delay for a given Job is reset unnecessarily. This PR improves how the JobController uses the queue for backoff:
- Centralize the key "forget" and "re-queue" handling in a single method.
- Change the signature of the `syncJob` method so that it returns whether the backoff delay for the given key should be forgotten.

**Which issue this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close that issue when PR gets merged)*: fixes #
Links to #51153

**Special notes for your reviewer**:

**Release note**:
```release-note
```
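The crux of the change is how the controller interacts with the workqueue's per-key rate limiter: `Forget(key)` resets a key's accumulated backoff, while `AddRateLimited(key)` re-queues the key after the current delay, so forgetting on every successful sync wipes out the backoff that repeated pod failures should keep growing. The sketch below is a minimal, self-contained illustration of the worker pattern this PR adopts, not the controller's actual code; it uses the `k8s.io/client-go/util/workqueue` API, and the `syncFunc` type, the failing handler, and the 10ms/1s rate-limiter delays are made-up stand-ins for `JobController.syncJob` and `DefaultJobBackOff`/`MaxJobBackOff`.

```go
package main

import (
	"fmt"
	"time"

	"k8s.io/client-go/util/workqueue"
)

// syncFunc mirrors the new syncHandler contract: it reports whether the
// per-key backoff should be forgotten, and an error triggers a rate-limited
// requeue. (Hypothetical stand-in for JobController.syncJob.)
type syncFunc func(key string) (forget bool, err error)

// processNextWorkItem centralizes the "forget" and "re-queue" decisions in
// one place: the rate limiter's delay for a key is reset only when the sync
// handler says it is safe to do so.
func processNextWorkItem(queue workqueue.RateLimitingInterface, sync syncFunc) bool {
	key, quit := queue.Get()
	if quit {
		return false
	}
	defer queue.Done(key)

	forget, err := sync(key.(string))
	if err == nil {
		if forget {
			queue.Forget(key) // reset the accumulated backoff for this key
		}
		return true
	}

	// On error, keep the accumulated backoff and retry after the current delay.
	fmt.Printf("error syncing %v (requeue #%d): %v\n", key, queue.NumRequeues(key), err)
	queue.AddRateLimited(key)
	return true
}

func main() {
	// Exponential per-item backoff; the delays here are illustrative only.
	queue := workqueue.NewRateLimitingQueue(
		workqueue.NewItemExponentialFailureRateLimiter(10*time.Millisecond, time.Second))
	defer queue.ShutDown()

	queue.Add("default/my-job")

	// A handler that reports a new pod failure: forget=false keeps the backoff growing.
	failing := func(key string) (bool, error) {
		return false, fmt.Errorf("failed pod(s) detected for job key %q", key)
	}
	processNextWorkItem(queue, failing)
}
```

Because the handler returns `forget == false` whenever it sees new failures, the key's delay keeps doubling toward the cap instead of snapping back to the base value after every pass through the worker.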
commit 26d72847d0
@@ -51,7 +51,7 @@ import (
 // controllerKind contains the schema.GroupVersionKind for this controller type.
 var controllerKind = batch.SchemeGroupVersion.WithKind("Job")
 
-const (
+var (
 	// DefaultJobBackOff is the max backoff period, exported for the e2e test
 	DefaultJobBackOff = 10 * time.Second
 	// MaxJobBackOff is the max backoff period, exported for the e2e test
@@ -64,7 +64,7 @@ type JobController struct {
 
 	// To allow injection of updateJobStatus for testing.
 	updateHandler func(job *batch.Job) error
-	syncHandler   func(jobKey string) error
+	syncHandler   func(jobKey string) (bool, error)
 	// podStoreSynced returns true if the pod store has been synced at least once.
 	// Added as a member to the struct to allow injection for testing.
 	podStoreSynced cache.InformerSynced
@@ -375,9 +375,11 @@ func (jm *JobController) processNextWorkItem() bool {
 	}
 	defer jm.queue.Done(key)
 
-	err := jm.syncHandler(key.(string))
+	forget, err := jm.syncHandler(key.(string))
 	if err == nil {
-		jm.queue.Forget(key)
+		if forget {
+			jm.queue.Forget(key)
+		}
 		return true
 	}
 
@@ -420,7 +422,7 @@ func (jm *JobController) getPodsForJob(j *batch.Job) ([]*v1.Pod, error) {
 // syncJob will sync the job with the given key if it has had its expectations fulfilled, meaning
 // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked
 // concurrently with the same key.
-func (jm *JobController) syncJob(key string) error {
+func (jm *JobController) syncJob(key string) (bool, error) {
 	startTime := time.Now()
 	defer func() {
 		glog.V(4).Infof("Finished syncing job %q (%v)", key, time.Now().Sub(startTime))
@@ -428,26 +430,25 @@ func (jm *JobController) syncJob(key string) error {
 
 	ns, name, err := cache.SplitMetaNamespaceKey(key)
 	if err != nil {
-		return err
+		return false, err
 	}
 	if len(ns) == 0 || len(name) == 0 {
-		return fmt.Errorf("invalid job key %q: either namespace or name is missing", key)
+		return false, fmt.Errorf("invalid job key %q: either namespace or name is missing", key)
 	}
 	sharedJob, err := jm.jobLister.Jobs(ns).Get(name)
 	if err != nil {
 		if errors.IsNotFound(err) {
 			glog.V(4).Infof("Job has been deleted: %v", key)
 			jm.expectations.DeleteExpectations(key)
-			return nil
+			return true, nil
 		}
-		return err
+		return false, err
 	}
 	job := *sharedJob
 
 	// if job was finished previously, we don't want to redo the termination
 	if IsJobFinished(&job) {
-		jm.queue.Forget(key)
-		return nil
+		return true, nil
 	}
 
 	// retrieve the previous number of retry
@@ -460,7 +461,7 @@ func (jm *JobController) syncJob(key string) error {
 
 	pods, err := jm.getPodsForJob(&job)
 	if err != nil {
-		return err
+		return false, err
 	}
 
 	activePods := controller.FilterActivePods(pods)
@@ -551,6 +552,7 @@ func (jm *JobController) syncJob(key string) error {
 		}
 	}
 
+	forget := false
 	// no need to update the job if the status hasn't changed since last time
 	if job.Status.Active != active || job.Status.Succeeded != succeeded || job.Status.Failed != failed || len(job.Status.Conditions) != conditions {
 		job.Status.Active = active
@@ -558,19 +560,18 @@ func (jm *JobController) syncJob(key string) error {
 		job.Status.Failed = failed
 
 		if err := jm.updateHandler(&job); err != nil {
-			return err
+			return false, err
 		}
+
+		if jobHaveNewFailure && !IsJobFinished(&job) {
+			// returning an error will re-enqueue Job after the backoff period
+			return false, fmt.Errorf("failed pod(s) detected for job key %q", key)
+		}
+
+		forget = true
 	}
 
-	if jobHaveNewFailure {
-		// re-enqueue Job after the backoff period
-		jm.queue.AddRateLimited(key)
-	} else {
-		// if no new Failure the job backoff period can be reset
-		jm.queue.Forget(key)
-	}
-
-	return manageJobErr
+	return forget, manageJobErr
 }
 
 func (jm *JobController) deleteJobPods(job *batch.Job, pods []*v1.Pod, errCh chan<- error) {
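As an aside on why the unconditional `jm.queue.Forget(key)` calls removed above reset the backoff unnecessarily: the exponential per-item rate limiter behind the controller's queue only grows a key's delay while the key is still being tracked, and `Forget` drops that tracking. The standalone snippet below (not part of this diff) demonstrates the behaviour with `k8s.io/client-go/util/workqueue`; the 10ms/1s delays are made-up values, whereas the job controller itself uses `DefaultJobBackOff` and `MaxJobBackOff`.

```go
package main

import (
	"fmt"
	"time"

	"k8s.io/client-go/util/workqueue"
)

func main() {
	// Exponential per-item backoff: baseDelay 10ms, capped at 1s (illustrative values).
	rl := workqueue.NewItemExponentialFailureRateLimiter(10*time.Millisecond, time.Second)
	key := "default/my-job"

	fmt.Println(rl.When(key)) // 10ms: the first failure starts at the base delay
	fmt.Println(rl.When(key)) // 20ms: the delay doubles while failures accumulate

	rl.Forget(key)            // what an eager Forget on every sync effectively does
	fmt.Println(rl.When(key)) // 10ms again: the accumulated backoff has been lost
}
```

Returning `forget` from `syncJob` and acting on it in `processNextWorkItem` keeps that accumulated delay intact until a sync completes without new pod failures.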
@@ -134,6 +134,7 @@ func TestControllerSyncJob(t *testing.T) {
 
 		// pod setup
 		podControllerError error
+		jobKeyForget       bool
 		pendingPods        int32
 		activePods         int32
 		succeededPods      int32
@@ -150,102 +151,102 @@ func TestControllerSyncJob(t *testing.T) {
 	}{
 		"job start": {
 			2, 5, 6, false, 0,
-			nil, 0, 0, 0, 0,
+			nil, true, 0, 0, 0, 0,
 			2, 0, 2, 0, 0, nil, "",
 		},
 		"WQ job start": {
 			2, -1, 6, false, 0,
-			nil, 0, 0, 0, 0,
+			nil, true, 0, 0, 0, 0,
 			2, 0, 2, 0, 0, nil, "",
 		},
 		"pending pods": {
 			2, 5, 6, false, 0,
-			nil, 2, 0, 0, 0,
+			nil, true, 2, 0, 0, 0,
 			0, 0, 2, 0, 0, nil, "",
 		},
 		"correct # of pods": {
 			2, 5, 6, false, 0,
-			nil, 0, 2, 0, 0,
+			nil, true, 0, 2, 0, 0,
 			0, 0, 2, 0, 0, nil, "",
 		},
 		"WQ job: correct # of pods": {
 			2, -1, 6, false, 0,
-			nil, 0, 2, 0, 0,
+			nil, true, 0, 2, 0, 0,
 			0, 0, 2, 0, 0, nil, "",
 		},
 		"too few active pods": {
 			2, 5, 6, false, 0,
-			nil, 0, 1, 1, 0,
+			nil, true, 0, 1, 1, 0,
 			1, 0, 2, 1, 0, nil, "",
 		},
 		"too few active pods with a dynamic job": {
 			2, -1, 6, false, 0,
-			nil, 0, 1, 0, 0,
+			nil, true, 0, 1, 0, 0,
 			1, 0, 2, 0, 0, nil, "",
 		},
 		"too few active pods, with controller error": {
 			2, 5, 6, false, 0,
-			fmt.Errorf("Fake error"), 0, 1, 1, 0,
+			fmt.Errorf("Fake error"), true, 0, 1, 1, 0,
 			1, 0, 1, 1, 0, nil, "",
 		},
 		"too many active pods": {
 			2, 5, 6, false, 0,
-			nil, 0, 3, 0, 0,
+			nil, true, 0, 3, 0, 0,
 			0, 1, 2, 0, 0, nil, "",
 		},
 		"too many active pods, with controller error": {
 			2, 5, 6, false, 0,
-			fmt.Errorf("Fake error"), 0, 3, 0, 0,
+			fmt.Errorf("Fake error"), true, 0, 3, 0, 0,
 			0, 1, 3, 0, 0, nil, "",
 		},
 		"failed pod": {
 			2, 5, 6, false, 0,
-			nil, 0, 1, 1, 1,
-			1, 0, 2, 1, 1, nil, "",
+			fmt.Errorf("Fake error"), false, 0, 1, 1, 1,
+			1, 0, 1, 1, 1, nil, "",
 		},
 		"job finish": {
 			2, 5, 6, false, 0,
-			nil, 0, 0, 5, 0,
+			nil, true, 0, 0, 5, 0,
 			0, 0, 0, 5, 0, nil, "",
 		},
 		"WQ job finishing": {
 			2, -1, 6, false, 0,
-			nil, 0, 1, 1, 0,
+			nil, true, 0, 1, 1, 0,
 			0, 0, 1, 1, 0, nil, "",
 		},
 		"WQ job all finished": {
 			2, -1, 6, false, 0,
-			nil, 0, 0, 2, 0,
+			nil, true, 0, 0, 2, 0,
 			0, 0, 0, 2, 0, &jobConditionComplete, "",
 		},
 		"WQ job all finished despite one failure": {
 			2, -1, 6, false, 0,
-			nil, 0, 0, 1, 1,
+			nil, true, 0, 0, 1, 1,
 			0, 0, 0, 1, 1, &jobConditionComplete, "",
 		},
 		"more active pods than completions": {
 			2, 5, 6, false, 0,
-			nil, 0, 10, 0, 0,
+			nil, true, 0, 10, 0, 0,
 			0, 8, 2, 0, 0, nil, "",
 		},
 		"status change": {
 			2, 5, 6, false, 0,
-			nil, 0, 2, 2, 0,
+			nil, true, 0, 2, 2, 0,
 			0, 0, 2, 2, 0, nil, "",
 		},
 		"deleting job": {
 			2, 5, 6, true, 0,
-			nil, 1, 1, 1, 0,
+			nil, true, 1, 1, 1, 0,
 			0, 0, 2, 1, 0, nil, "",
 		},
 		"limited pods": {
 			100, 200, 6, false, 10,
-			nil, 0, 0, 0, 0,
+			nil, true, 0, 0, 0, 0,
 			10, 0, 10, 0, 0, nil, "",
 		},
 		"to many job sync failure": {
 			2, 5, 0, true, 0,
-			nil, 0, 0, 0, 1,
+			nil, true, 0, 0, 0, 1,
 			0, 0, 0, 0, 1, &jobConditionFailed, "BackoffLimitExceeded",
 		},
 	}
@@ -286,7 +287,7 @@ func TestControllerSyncJob(t *testing.T) {
 		}
 
 		// run
-		err := manager.syncJob(getKey(job, t))
+		forget, err := manager.syncJob(getKey(job, t))
 
 		// We need requeue syncJob task if podController error
 		if tc.podControllerError != nil {
@@ -298,7 +299,9 @@ func TestControllerSyncJob(t *testing.T) {
 				t.Errorf("%s: unexpected error when syncing jobs %v", name, err)
 			}
 		}
-
+		if forget != tc.jobKeyForget {
+			t.Errorf("%s: unexpected forget value. Expected %v, saw %v\n", name, tc.jobKeyForget, forget)
+		}
 		// validate created/deleted pods
 		if int32(len(fakePodControl.Templates)) != tc.expectedCreations {
 			t.Errorf("%s: unexpected number of creates. Expected %d, saw %d\n", name, tc.expectedCreations, len(fakePodControl.Templates))
@@ -371,6 +374,7 @@ func TestSyncJobPastDeadline(t *testing.T) {
 		failedPods    int32
 
 		// expectations
+		expectedForgetKey bool
 		expectedDeletions int32
 		expectedActive    int32
 		expectedSucceeded int32
@@ -380,22 +384,22 @@ func TestSyncJobPastDeadline(t *testing.T) {
 		"activeDeadlineSeconds less than single pod execution": {
 			1, 1, 10, 15, 6,
 			1, 0, 0,
-			1, 0, 0, 1, "DeadlineExceeded",
+			true, 1, 0, 0, 1, "DeadlineExceeded",
 		},
 		"activeDeadlineSeconds bigger than single pod execution": {
 			1, 2, 10, 15, 6,
 			1, 1, 0,
-			1, 0, 1, 1, "DeadlineExceeded",
+			true, 1, 0, 1, 1, "DeadlineExceeded",
 		},
 		"activeDeadlineSeconds times-out before any pod starts": {
 			1, 1, 10, 10, 6,
 			0, 0, 0,
-			0, 0, 0, 0, "DeadlineExceeded",
+			true, 0, 0, 0, 0, "DeadlineExceeded",
 		},
 		"activeDeadlineSeconds with backofflimit reach": {
 			1, 1, 1, 10, 0,
 			1, 0, 2,
-			1, 0, 0, 3, "BackoffLimitExceeded",
+			true, 1, 0, 0, 3, "BackoffLimitExceeded",
 		},
 	}
 
@@ -431,11 +435,13 @@ func TestSyncJobPastDeadline(t *testing.T) {
 		}
 
 		// run
-		err := manager.syncJob(getKey(job, t))
+		forget, err := manager.syncJob(getKey(job, t))
 		if err != nil {
 			t.Errorf("%s: unexpected error when syncing jobs %v", name, err)
 		}
-
+		if forget != tc.expectedForgetKey {
+			t.Errorf("%s: unexpected forget value. Expected %v, saw %v\n", name, tc.expectedForgetKey, forget)
+		}
 		// validate created/deleted pods
 		if int32(len(fakePodControl.Templates)) != 0 {
 			t.Errorf("%s: unexpected number of creates. Expected 0, saw %d\n", name, len(fakePodControl.Templates))
@@ -492,10 +498,13 @@ func TestSyncPastDeadlineJobFinished(t *testing.T) {
 	job.Status.StartTime = &start
 	job.Status.Conditions = append(job.Status.Conditions, newCondition(batch.JobFailed, "DeadlineExceeded", "Job was active longer than specified deadline"))
 	sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job)
-	err := manager.syncJob(getKey(job, t))
+	forget, err := manager.syncJob(getKey(job, t))
 	if err != nil {
 		t.Errorf("Unexpected error when syncing jobs %v", err)
 	}
+	if !forget {
+		t.Errorf("Unexpected forget value. Expected %v, saw %v\n", true, forget)
+	}
 	if len(fakePodControl.Templates) != 0 {
 		t.Errorf("Unexpected number of creates. Expected %d, saw %d\n", 0, len(fakePodControl.Templates))
 	}
@@ -518,10 +527,13 @@ func TestSyncJobComplete(t *testing.T) {
 	job := newJob(1, 1, 6)
 	job.Status.Conditions = append(job.Status.Conditions, newCondition(batch.JobComplete, "", ""))
 	sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job)
-	err := manager.syncJob(getKey(job, t))
+	forget, err := manager.syncJob(getKey(job, t))
 	if err != nil {
 		t.Fatalf("Unexpected error when syncing jobs %v", err)
 	}
+	if !forget {
+		t.Errorf("Unexpected forget value. Expected %v, saw %v\n", true, forget)
+	}
 	actual, err := manager.jobLister.Jobs(job.Namespace).Get(job.Name)
 	if err != nil {
 		t.Fatalf("Unexpected error when trying to get job from the store: %v", err)
@@ -541,10 +553,13 @@ func TestSyncJobDeleted(t *testing.T) {
 	manager.jobStoreSynced = alwaysReady
 	manager.updateHandler = func(job *batch.Job) error { return nil }
 	job := newJob(2, 2, 6)
-	err := manager.syncJob(getKey(job, t))
+	forget, err := manager.syncJob(getKey(job, t))
 	if err != nil {
 		t.Errorf("Unexpected error when syncing jobs %v", err)
 	}
+	if !forget {
+		t.Errorf("Unexpected forget value. Expected %v, saw %v\n", true, forget)
+	}
 	if len(fakePodControl.Templates) != 0 {
 		t.Errorf("Unexpected number of creates. Expected %d, saw %d\n", 0, len(fakePodControl.Templates))
 	}
@@ -555,6 +570,7 @@ func TestSyncJobDeleted(t *testing.T) {
 
 func TestSyncJobUpdateRequeue(t *testing.T) {
 	clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &api.Registry.GroupOrDie(v1.GroupName).GroupVersion}})
+	DefaultJobBackOff = time.Duration(0) // overwrite the default value for testing
 	manager, sharedInformerFactory := newJobControllerFromClient(clientset, controller.NoResyncPeriodFunc)
 	fakePodControl := controller.FakePodControl{}
 	manager.podControl = &fakePodControl
@@ -567,10 +583,13 @@ func TestSyncJobUpdateRequeue(t *testing.T) {
 	}
 	job := newJob(2, 2, 6)
 	sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job)
-	err := manager.syncJob(getKey(job, t))
+	forget, err := manager.syncJob(getKey(job, t))
 	if err == nil || err != updateError {
 		t.Errorf("Expected error %v when syncing jobs, got %v", updateError, err)
 	}
+	if forget != false {
+		t.Errorf("Unexpected forget value. Expected %v, saw %v\n", false, forget)
+	}
 	t.Log("Waiting for a job in the queue")
 	key, _ := manager.queue.Get()
 	expectedKey := getKey(job, t)
@@ -1168,7 +1187,7 @@ func TestWatchJobs(t *testing.T) {
 
 	// The update sent through the fakeWatcher should make its way into the workqueue,
 	// and eventually into the syncHandler.
-	manager.syncHandler = func(key string) error {
+	manager.syncHandler = func(key string) (bool, error) {
 		defer close(received)
 		ns, name, err := cache.SplitMetaNamespaceKey(key)
 		if err != nil {
@@ -1177,12 +1196,12 @@ func TestWatchJobs(t *testing.T) {
 		job, err := manager.jobLister.Jobs(ns).Get(name)
 		if err != nil || job == nil {
 			t.Errorf("Expected to find job under key %v: %v", key, err)
-			return nil
+			return true, nil
 		}
 		if !apiequality.Semantic.DeepDerivative(*job, testJob) {
 			t.Errorf("Expected %#v, but got %#v", testJob, *job)
 		}
-		return nil
+		return true, nil
 	}
 	// Start only the job watcher and the workqueue, send a watch event,
 	// and make sure it hits the sync method.
@@ -1213,7 +1232,7 @@ func TestWatchPods(t *testing.T) {
 	received := make(chan struct{})
 	// The pod update sent through the fakeWatcher should figure out the managing job and
 	// send it into the syncHandler.
-	manager.syncHandler = func(key string) error {
+	manager.syncHandler = func(key string) (bool, error) {
 		ns, name, err := cache.SplitMetaNamespaceKey(key)
 		if err != nil {
 			t.Errorf("Error getting namespace/name from key %v: %v", key, err)
@@ -1225,10 +1244,10 @@ func TestWatchPods(t *testing.T) {
 		if !apiequality.Semantic.DeepDerivative(job, testJob) {
 			t.Errorf("\nExpected %#v,\nbut got %#v", testJob, job)
 			close(received)
-			return nil
+			return true, nil
 		}
 		close(received)
-		return nil
+		return true, nil
 	}
 	// Start only the pod watcher and the workqueue, send a watch event,
 	// and make sure it hits the sync method for the right job.