Merge pull request #110294 from harshanarayana/bug/git-110239/fix-activedeadline-enforcement

GIT-110239: fix activeDeadlineSeconds enforcement bug
Kubernetes Prow Robot 2022-06-14 08:03:33 -07:00 committed by GitHub
commit 03b18bf138
2 changed files with 67 additions and 39 deletions

pkg/controller/job/job_controller.go

@@ -746,17 +746,11 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (forget bool, rEr
 	if feature.DefaultFeatureGate.Enabled(features.JobReadyPods) {
 		ready = pointer.Int32(countReadyPods(activePods))
 	}
-	// Job first start. Set StartTime and start the ActiveDeadlineSeconds timer
-	// only if the job is not in the suspended state.
+	// Job first start. Set StartTime only if the job is not in the suspended state.
 	if job.Status.StartTime == nil && !jobSuspended(&job) {
 		now := metav1.Now()
 		job.Status.StartTime = &now
-		// enqueue a sync to check if job past ActiveDeadlineSeconds
-		if job.Spec.ActiveDeadlineSeconds != nil {
-			klog.V(4).Infof("Job %s has ActiveDeadlineSeconds will sync after %d seconds",
-				key, *job.Spec.ActiveDeadlineSeconds)
-			jm.queue.AddAfter(key, time.Duration(*job.Spec.ActiveDeadlineSeconds)*time.Second)
-		}
 	}

 	var manageJobErr error
@@ -775,6 +769,10 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (forget bool, rEr
 		finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "BackoffLimitExceeded", "Job has reached the specified backoff limit")
 	} else if pastActiveDeadline(&job) {
 		finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "DeadlineExceeded", "Job was active longer than specified deadline")
+	} else if job.Spec.ActiveDeadlineSeconds != nil && !jobSuspended(&job) {
+		syncDuration := time.Duration(*job.Spec.ActiveDeadlineSeconds)*time.Second - time.Since(job.Status.StartTime.Time)
+		klog.V(2).InfoS("Job has activeDeadlineSeconds configuration. Will sync this job again", "job", key, "nextSyncIn", syncDuration)
+		jm.queue.AddAfter(key, syncDuration)
 	}

 	var prevSucceededIndexes, succeededIndexes orderedIntervals
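The second hunk is the core of the fix: instead of arming a one-shot requeue only at the moment StartTime is first set, every sync of a non-suspended job with activeDeadlineSeconds configured re-enqueues the job for the time still left on its deadline. A minimal standalone sketch of that remaining-time arithmetic, with a hypothetical helper name and example values that are not part of the patch:

package main

import (
	"fmt"
	"time"
)

// remainingDeadline mirrors the syncDuration computation added above:
// re-enqueue after (activeDeadlineSeconds - time already elapsed since
// StartTime), so a re-synced job is checked when its deadline actually
// expires instead of a full activeDeadlineSeconds later.
// Illustrative helper only; not part of the Kubernetes code base.
func remainingDeadline(activeDeadlineSeconds int64, startTime, now time.Time) time.Duration {
	return time.Duration(activeDeadlineSeconds)*time.Second - now.Sub(startTime)
}

func main() {
	start := time.Now().Add(-40 * time.Second) // job has already been running for 40s
	d := remainingDeadline(60, start, time.Now())
	fmt.Printf("re-sync in roughly %v\n", d.Round(time.Second)) // ~20s, not another 60s
}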

pkg/controller/job/job_controller_test.go

@@ -59,11 +59,11 @@ import (
 var alwaysReady = func() bool { return true }

-func newJob(parallelism, completions, backoffLimit int32, completionMode batch.CompletionMode) *batch.Job {
+func newJobWithName(name string, parallelism, completions, backoffLimit int32, completionMode batch.CompletionMode) *batch.Job {
 	j := &batch.Job{
 		TypeMeta: metav1.TypeMeta{Kind: "Job"},
 		ObjectMeta: metav1.ObjectMeta{
-			Name:      "foobar",
+			Name:      name,
 			UID:       uuid.NewUUID(),
 			Namespace: metav1.NamespaceDefault,
 		},
@@ -105,6 +105,10 @@ func newJob(parallelism, completions, backoffLimit int32, completionMode batch.C
 	return j
 }

+func newJob(parallelism, completions, backoffLimit int32, completionMode batch.CompletionMode) *batch.Job {
+	return newJobWithName("foobar", parallelism, completions, backoffLimit, completionMode)
+}
+
 func newControllerFromClient(kubeClient clientset.Interface, resyncPeriod controller.ResyncPeriodFunc) (*Controller, informers.SharedInformerFactory) {
 	sharedInformers := informers.NewSharedInformerFactory(kubeClient, resyncPeriod())
 	jm := NewController(sharedInformers.Core().V1().Pods(), sharedInformers.Batch().V1().Jobs(), kubeClient)
@@ -1828,40 +1832,66 @@ func hasTrueCondition(job *batch.Job) *batch.JobConditionType {
 }

 func TestSyncPastDeadlineJobFinished(t *testing.T) {
-	clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
+	clientset := fake.NewSimpleClientset()
 	manager, sharedInformerFactory := newControllerFromClient(clientset, controller.NoResyncPeriodFunc)
-	fakePodControl := controller.FakePodControl{}
-	manager.podControl = &fakePodControl
 	manager.podStoreSynced = alwaysReady
 	manager.jobStoreSynced = alwaysReady
-	var actual *batch.Job
-	manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) {
-		actual = job
-		return job, nil
-	}
-	job := newJob(1, 1, 6, batch.NonIndexedCompletion)
-	activeDeadlineSeconds := int64(10)
-	job.Spec.ActiveDeadlineSeconds = &activeDeadlineSeconds
-	start := metav1.Unix(metav1.Now().Time.Unix()-15, 0)
-	job.Status.StartTime = &start
-	job.Status.Conditions = append(job.Status.Conditions, *newCondition(batch.JobFailed, v1.ConditionTrue, "DeadlineExceeded", "Job was active longer than specified deadline"))
-	sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job)
-	forget, err := manager.syncJob(context.TODO(), testutil.GetKey(job, t))
-	if err != nil {
-		t.Errorf("Unexpected error when syncing jobs %v", err)
-	}
-	if !forget {
-		t.Errorf("Unexpected forget value. Expected %v, saw %v\n", true, forget)
-	}
-	if len(fakePodControl.Templates) != 0 {
-		t.Errorf("Unexpected number of creates. Expected %d, saw %d\n", 0, len(fakePodControl.Templates))
-	}
-	if len(fakePodControl.DeletePodName) != 0 {
-		t.Errorf("Unexpected number of deletes. Expected %d, saw %d\n", 0, len(fakePodControl.DeletePodName))
-	}
-	if actual != nil {
-		t.Error("Unexpected job modification")
-	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	sharedInformerFactory.Start(ctx.Done())
+
+	go manager.Run(ctx, 1)
+
+	tests := []struct {
+		name         string
+		setStartTime bool
+		jobName      string
+	}{
+		{
+			name:         "New job created without start time being set",
+			setStartTime: false,
+			jobName:      "job1",
+		},
+		{
+			name:         "New job created with start time being set",
+			setStartTime: true,
+			jobName:      "job2",
+		},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			job := newJobWithName(tc.jobName, 1, 1, 6, batch.NonIndexedCompletion)
+			activeDeadlineSeconds := int64(1)
+			job.Spec.ActiveDeadlineSeconds = &activeDeadlineSeconds
+			if tc.setStartTime {
+				start := metav1.Unix(metav1.Now().Time.Unix()-1, 0)
+				job.Status.StartTime = &start
+			}
+
+			_, err := clientset.BatchV1().Jobs(job.GetNamespace()).Create(ctx, job, metav1.CreateOptions{})
+			if err != nil {
+				t.Errorf("Could not create Job: %v", err)
+			}
+			if err := sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job); err != nil {
+				t.Fatalf("Failed to insert job in index: %v", err)
+			}
+			var j *batch.Job
+			err = wait.Poll(200*time.Millisecond, 3*time.Second, func() (done bool, err error) {
+				j, err = clientset.BatchV1().Jobs(metav1.NamespaceDefault).Get(ctx, job.GetName(), metav1.GetOptions{})
+				if err != nil {
+					return false, nil
+				}
+				if len(j.Status.Conditions) == 1 && j.Status.Conditions[0].Reason == "DeadlineExceeded" {
+					return true, nil
+				}
+				return false, nil
+			})
+			if err != nil {
+				t.Errorf("Job failed to enforce activeDeadlineSeconds configuration. Expected condition with Reason 'DeadlineExceeded' was not found in %v", j.Status)
+			}
+		})
+	}
 }
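The rewritten test exercises the behavior end to end: it runs the controller against a fake clientset, creates Jobs with a one-second activeDeadlineSeconds (with and without a pre-set StartTime), and polls until the DeadlineExceeded condition appears. The polling idiom, reduced to dependency-free Go as a rough stand-in for the wait.Poll call above (pollUntil and the timings below are illustrative, not the real helper):

package main

import (
	"errors"
	"fmt"
	"time"
)

// pollUntil calls condition every interval until it reports true or the
// timeout elapses, roughly what wait.Poll does in the test above.
// Stand-in for illustration only.
func pollUntil(interval, timeout time.Duration, condition func() bool) error {
	deadline := time.Now().Add(timeout)
	for time.Now().Before(deadline) {
		if condition() {
			return nil
		}
		time.Sleep(interval)
	}
	return errors.New("timed out waiting for condition")
}

func main() {
	// Pretend the controller marks the job failed ~700ms after creation.
	failAt := time.Now().Add(700 * time.Millisecond)
	err := pollUntil(200*time.Millisecond, 3*time.Second, func() bool {
		return time.Now().After(failAt) // stands in for "job has a DeadlineExceeded condition"
	})
	fmt.Println("poll result:", err)
}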